mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 09:07:20 +08:00
Deploying to gh-pages from @ microsoft/graphrag@0e7d22bfb0 🚀
This commit is contained in:
parent
6dd47d7de0
commit
92161583f1
@ -1434,6 +1434,7 @@
|
||||
<li><code>poetry run poe test_unit</code> - This will execute unit tests.</li>
|
||||
<li><code>poetry run poe test_integration</code> - This will execute integration tests.</li>
|
||||
<li><code>poetry run poe test_smoke</code> - This will execute smoke tests.</li>
|
||||
<li><code>poetry run poe test_verbs</code> - This will execute tests of the basic workflows.</li>
|
||||
<li><code>poetry run poe check</code> - This will perform a suite of static checks across the package, including:</li>
|
||||
<li>formatting</li>
|
||||
<li>documentation formatting</li>
|
||||
|
||||
@ -2248,7 +2248,7 @@ report_df.head()</div>
|
||||
<pre>
|
||||
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
|
||||
<span class="ansi-red-fg">AttributeError</span> Traceback (most recent call last)
|
||||
<span class="ansi-green-fg">/tmp/ipykernel_2138/1512985616.py</span> in <span class="ansi-cyan-fg">?</span><span class="ansi-blue-fg">()</span>
|
||||
<span class="ansi-green-fg">/tmp/ipykernel_2065/1512985616.py</span> in <span class="ansi-cyan-fg">?</span><span class="ansi-blue-fg">()</span>
|
||||
<span class="ansi-green-intense-fg ansi-bold"> 2</span> entity_df <span class="ansi-blue-fg">=</span> pd<span class="ansi-blue-fg">.</span>read_parquet<span class="ansi-blue-fg">(</span><span class="ansi-blue-fg">f"{INPUT_DIR}/{ENTITY_TABLE}.parquet"</span><span class="ansi-blue-fg">)</span>
|
||||
<span class="ansi-green-intense-fg ansi-bold"> 3</span> report_df <span class="ansi-blue-fg">=</span> pd<span class="ansi-blue-fg">.</span>read_parquet<span class="ansi-blue-fg">(</span><span class="ansi-blue-fg">f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet"</span><span class="ansi-blue-fg">)</span>
|
||||
<span class="ansi-green-intense-fg ansi-bold"> 4</span> entity_embedding_df <span class="ansi-blue-fg">=</span> pd<span class="ansi-blue-fg">.</span>read_parquet<span class="ansi-blue-fg">(</span><span class="ansi-blue-fg">f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet"</span><span class="ansi-blue-fg">)</span>
|
||||
|
||||
@ -2156,7 +2156,7 @@ report_df.head()</div>
|
||||
<pre>
|
||||
<span class="ansi-red-fg">---------------------------------------------------------------------------</span>
|
||||
<span class="ansi-red-fg">AttributeError</span> Traceback (most recent call last)
|
||||
<span class="ansi-green-fg">/tmp/ipykernel_2168/2760368953.py</span> in <span class="ansi-cyan-fg">?</span><span class="ansi-blue-fg">()</span>
|
||||
<span class="ansi-green-fg">/tmp/ipykernel_2098/2760368953.py</span> in <span class="ansi-cyan-fg">?</span><span class="ansi-blue-fg">()</span>
|
||||
<span class="ansi-green-intense-fg ansi-bold"> 2</span> entity_df <span class="ansi-blue-fg">=</span> pd<span class="ansi-blue-fg">.</span>read_parquet<span class="ansi-blue-fg">(</span><span class="ansi-blue-fg">f"{INPUT_DIR}/{ENTITY_TABLE}.parquet"</span><span class="ansi-blue-fg">)</span>
|
||||
<span class="ansi-green-intense-fg ansi-bold"> 3</span> report_df <span class="ansi-blue-fg">=</span> pd<span class="ansi-blue-fg">.</span>read_parquet<span class="ansi-blue-fg">(</span><span class="ansi-blue-fg">f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet"</span><span class="ansi-blue-fg">)</span>
|
||||
<span class="ansi-green-intense-fg ansi-bold"> 4</span> entity_embedding_df <span class="ansi-blue-fg">=</span> pd<span class="ansi-blue-fg">.</span>read_parquet<span class="ansi-blue-fg">(</span><span class="ansi-blue-fg">f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet"</span><span class="ansi-blue-fg">)</span>
|
||||
|
||||
@ -685,9 +685,9 @@
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#claim-extraction-emission" class="md-nav__link">
|
||||
<a href="#claim-extraction-optional" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Claim Extraction & Emission
|
||||
Claim Extraction (optional)
|
||||
</span>
|
||||
</a>
|
||||
|
||||
@ -718,18 +718,9 @@
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#graph-embedding" class="md-nav__link">
|
||||
<a href="#graph-tables" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Graph Embedding
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#graph-tables-emission" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Graph Tables Emission
|
||||
Graph Tables
|
||||
</span>
|
||||
</a>
|
||||
|
||||
@ -769,18 +760,9 @@
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#community-embedding" class="md-nav__link">
|
||||
<a href="#community-reports-table" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Community Embedding
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#community-tables-emission" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Community Tables Emission
|
||||
Community Reports Table
|
||||
</span>
|
||||
</a>
|
||||
|
||||
@ -820,18 +802,9 @@
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#document-embedding" class="md-nav__link">
|
||||
<a href="#documents-table" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Document Embedding
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#documents-table-emission" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Documents Table Emission
|
||||
Documents Table
|
||||
</span>
|
||||
</a>
|
||||
|
||||
@ -843,9 +816,42 @@
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#phase-6-network-visualization" class="md-nav__link">
|
||||
<a href="#phase-6-network-visualization-optional" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Phase 6: Network Visualization
|
||||
Phase 6: Network Visualization (optional)
|
||||
</span>
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="Phase 6: Network Visualization (optional)">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#graph-embedding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Graph Embedding
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#dimensionality-reduction" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Dimensionality Reduction
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#phase-7-text-embedding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Phase 7: Text Embedding
|
||||
</span>
|
||||
</a>
|
||||
|
||||
@ -1682,9 +1688,9 @@
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#claim-extraction-emission" class="md-nav__link">
|
||||
<a href="#claim-extraction-optional" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Claim Extraction & Emission
|
||||
Claim Extraction (optional)
|
||||
</span>
|
||||
</a>
|
||||
|
||||
@ -1715,18 +1721,9 @@
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#graph-embedding" class="md-nav__link">
|
||||
<a href="#graph-tables" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Graph Embedding
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#graph-tables-emission" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Graph Tables Emission
|
||||
Graph Tables
|
||||
</span>
|
||||
</a>
|
||||
|
||||
@ -1766,18 +1763,9 @@
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#community-embedding" class="md-nav__link">
|
||||
<a href="#community-reports-table" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Community Embedding
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#community-tables-emission" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Community Tables Emission
|
||||
Community Reports Table
|
||||
</span>
|
||||
</a>
|
||||
|
||||
@ -1817,18 +1805,9 @@
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#document-embedding" class="md-nav__link">
|
||||
<a href="#documents-table" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Document Embedding
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#documents-table-emission" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Documents Table Emission
|
||||
Documents Table
|
||||
</span>
|
||||
</a>
|
||||
|
||||
@ -1840,9 +1819,42 @@
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#phase-6-network-visualization" class="md-nav__link">
|
||||
<a href="#phase-6-network-visualization-optional" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Phase 6: Network Visualization
|
||||
Phase 6: Network Visualization (optional)
|
||||
</span>
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="Phase 6: Network Visualization (optional)">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#graph-embedding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Graph Embedding
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#dimensionality-reduction" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Dimensionality Reduction
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#phase-7-text-embedding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
Phase 7: Text Embedding
|
||||
</span>
|
||||
</a>
|
||||
|
||||
@ -1873,7 +1885,7 @@
|
||||
<li><code>Document</code> - An input document into the system. These represent either individual rows in a CSV or individual .txt files.</li>
|
||||
<li><code>TextUnit</code> - A chunk of text to analyze. The size of these chunks, their overlap, and whether they adhere to any data boundaries may be configured below. A common use case is to set <code>CHUNK_BY_COLUMNS</code> to <code>id</code> so that there is a 1-to-many relationship between documents and TextUnits instead of a many-to-many.</li>
|
||||
<li><code>Entity</code> - An entity extracted from a TextUnit. These represent people, places, events, or some other entity-model that you provide.</li>
|
||||
<li><code>Relationship</code> - A relationship between two entities. These are generated from the covariates.</li>
|
||||
<li><code>Relationship</code> - A relationship between two entities.</li>
|
||||
<li><code>Covariate</code> - Extracted claim information, which contains statements about entities which may be time-bound.</li>
|
||||
<li><code>Community</code> - Once the graph of entities and relationships is built, we perform hierarchical community detection on them to create a clustering structure.</li>
|
||||
<li><code>Community Report</code> - The contents of each community are summarized into a generated report, useful for human reading and downstream search.</li>
|
||||
@ -1887,7 +1899,7 @@ title: Dataflow Overview
|
||||
flowchart TB
|
||||
subgraph phase1[Phase 1: Compose TextUnits]
|
||||
documents[Documents] --> chunk[Chunk]
|
||||
chunk --> embed[Embed] --> textUnits[Text Units]
|
||||
chunk --> textUnits[Text Units]
|
||||
end
|
||||
subgraph phase2[Phase 2: Graph Extraction]
|
||||
textUnits --> graph_extract[Entity & Relationship Extraction]
|
||||
@ -1897,32 +1909,31 @@ flowchart TB
|
||||
end
|
||||
subgraph phase3[Phase 3: Graph Augmentation]
|
||||
graph_outputs --> community_detect[Community Detection]
|
||||
community_detect --> graph_embed[Graph Embedding]
|
||||
graph_embed --> augmented_graph[Augmented Graph Tables]
|
||||
community_detect --> community_outputs[Communities Table]
|
||||
end
|
||||
subgraph phase4[Phase 4: Community Summarization]
|
||||
augmented_graph --> summarized_communities[Community Summarization]
|
||||
summarized_communities --> embed_communities[Community Embedding]
|
||||
embed_communities --> community_outputs[Community Tables]
|
||||
community_outputs --> summarized_communities[Community Summarization]
|
||||
summarized_communities --> community_report_outputs[Community Reports Table]
|
||||
end
|
||||
subgraph phase5[Phase 5: Document Processing]
|
||||
documents --> link_to_text_units[Link to TextUnits]
|
||||
textUnits --> link_to_text_units
|
||||
link_to_text_units --> embed_documents[Document Embedding]
|
||||
embed_documents --> document_graph[Document Graph Creation]
|
||||
document_graph --> document_outputs[Document Tables]
|
||||
link_to_text_units --> document_outputs[Documents Table]
|
||||
end
|
||||
subgraph phase6[Phase 6: Network Visualization]
|
||||
document_outputs --> umap_docs[Umap Documents]
|
||||
augmented_graph --> umap_entities[Umap Entities]
|
||||
umap_docs --> combine_nodes[Nodes Table]
|
||||
umap_entities --> combine_nodes
|
||||
graph_outputs --> graph_embed[Graph Embedding]
|
||||
graph_embed --> umap_entities[Umap Entities]
|
||||
umap_entities --> combine_nodes[Final Nodes]
|
||||
end
|
||||
subgraph phase7[Phase 7: Text Embeddings]
|
||||
textUnits --> text_embed[Text Embedding]
|
||||
graph_outputs --> description_embed[Description Embedding]
|
||||
community_report_outputs --> content_embed[Content Embedding]
|
||||
end</code></pre>
|
||||
<h2 id="phase-1-compose-textunits">Phase 1: Compose TextUnits</h2>
|
||||
<p>The first phase of the default-configuration workflow is to transform input documents into <em>TextUnits</em>. A <em>TextUnit</em> is a chunk of text that is used for our graph extraction techniques. They are also used as source-references by extracted knowledge items in order to empower breadcrumbs and provenance by concepts back to their original source tex.</p>
|
||||
<p>The first phase of the default-configuration workflow is to transform input documents into <em>TextUnits</em>. A <em>TextUnit</em> is a chunk of text that is used for our graph extraction techniques. They are also used as source-references by extracted knowledge items in order to empower breadcrumbs and provenance by concepts back to their original source text.</p>
|
||||
<p>The chunk size (counted in tokens) is user-configurable. By default this is set to 300 tokens, although we've had positive experience with 1200-token chunks using a single "glean" step. (A "glean" step is a follow-on extraction.) Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time.</p>
|
||||
<p>The group-by configuration is also user-configurable. By default, we align our chunks to document boundaries, meaning that there is a strict 1-to-many relationship between Documents and TextUnits. In rare cases, this can be turned into a many-to-many relationship. This is useful when the documents are very short and we need several of them to compose a meaningful analysis unit (e.g. Tweets or a chat log).</p>
|
||||
<p>Each of these text-units is text-embedded and passed into the next phase of the pipeline.</p>
|
||||
<pre class="mermaid"><code>---
|
||||
title: Documents into Text Chunks
|
||||
---
|
||||
@ -1942,65 +1953,71 @@ flowchart LR
|
||||
tu[TextUnit] --> ge[Graph Extraction] --> gs[Graph Summarization]
|
||||
tu --> ce[Claim Extraction]</code></pre>
|
||||
<h3 id="entity-relationship-extraction">Entity & Relationship Extraction</h3>
|
||||
<p>In this first step of graph extraction, we process each text-unit in order to extract entities and relationships out of the raw text using the LLM. The output of this step is a subgraph-per-TextUnit containing a list of <strong>entities</strong> with a <em>name</em>, <em>type</em>, and <em>description</em>, and a list of <strong>relationships</strong> with a <em>source</em>, <em>target</em>, and <em>description</em>.</p>
|
||||
<p>These subgraphs are merged together - any entities with the same <em>name</em> and <em>type</em> are merged by creating an array of their descriptions. Similarly, any relationships with the same <em>source</em> and <em>target</em> are merged by creating an array of their descriptions.</p>
|
||||
<p>In this first step of graph extraction, we process each text-unit in order to extract entities and relationships out of the raw text using the LLM. The output of this step is a subgraph-per-TextUnit containing a list of <strong>entities</strong> with a <em>title</em>, <em>type</em>, and <em>description</em>, and a list of <strong>relationships</strong> with a <em>source</em>, <em>target</em>, and <em>description</em>.</p>
|
||||
<p>These subgraphs are merged together - any entities with the same <em>title</em> and <em>type</em> are merged by creating an array of their descriptions. Similarly, any relationships with the same <em>source</em> and <em>target</em> are merged by creating an array of their descriptions.</p>
|
||||
<h3 id="entity-relationship-summarization">Entity & Relationship Summarization</h3>
|
||||
<p>Now that we have a graph of entities and relationships, each with a list of descriptions, we can summarize these lists into a single description per entity and relationship. This is done by asking the LLM for a short summary that captures all of the distinct information from each description. This allows all of our entities and relationships to have a single concise description.</p>
|
||||
<h3 id="claim-extraction-emission">Claim Extraction & Emission</h3>
|
||||
<h3 id="claim-extraction-optional">Claim Extraction (optional)</h3>
|
||||
<p>Finally, as an independent workflow, we extract claims from the source TextUnits. These claims represent positive factual statements with an evaluated status and time-bounds. These get exported as a primary artifact called <strong>Covariates</strong>.</p>
|
||||
<p>Note: claim extraction is <em>optional</em> and turned off by default. This is because claim extraction generally requires prompt tuning to be useful.</p>
|
||||
<h2 id="phase-3-graph-augmentation">Phase 3: Graph Augmentation</h2>
|
||||
<p>Now that we have a usable graph of entities and relationships, we want to understand their community structure and augment the graph with additional information. This is done in two steps: <em>Community Detection</em> and <em>Graph Embedding</em>. These give us explicit (communities) and implicit (embeddings) ways of understanding the topological structure of our graph.</p>
|
||||
<p>Now that we have a usable graph of entities and relationships, we want to understand their community structure. Communities give us an explicit way of understanding the topological structure of our graph.</p>
|
||||
<pre class="mermaid"><code>---
|
||||
title: Graph Augmentation
|
||||
---
|
||||
flowchart LR
|
||||
cd[Leiden Hierarchical Community Detection] --> ge[Node2Vec Graph Embedding] --> ag[Graph Table Emission]</code></pre>
|
||||
cd[Leiden Hierarchical Community Detection] --> ag[Graph Tables]</code></pre>
|
||||
<h3 id="community-detection">Community Detection</h3>
|
||||
<p>In this step, we generate a hierarchy of entity communities using the Hierarchical Leiden Algorithm. This method will apply a recursive community-clustering to our graph until we reach a community-size threshold. This will allow us to understand the community structure of our graph and provide a way to navigate and summarize the graph at different levels of granularity.</p>
|
||||
<h3 id="graph-embedding">Graph Embedding</h3>
|
||||
<p>In this step, we generate a vector representation of our graph using the Node2Vec algorithm. This will allow us to understand the implicit structure of our graph and provide an additional vector-space in which to search for related concepts during our query phase.</p>
|
||||
<h3 id="graph-tables-emission">Graph Tables Emission</h3>
|
||||
<p>Once our graph augmentation steps are complete, the final <strong>Entities</strong> and <strong>Relationships</strong> tables are exported after their text fields are text-embedded.</p>
|
||||
<h3 id="graph-tables">Graph Tables</h3>
|
||||
<p>Once our graph augmentation steps are complete, the final <strong>Entities</strong>, <strong>Relationships</strong>, and <strong>Communities</strong> tables are exported.</p>
|
||||
<h2 id="phase-4-community-summarization">Phase 4: Community Summarization</h2>
|
||||
<pre class="mermaid"><code>---
|
||||
title: Community Summarization
|
||||
---
|
||||
flowchart LR
|
||||
sc[Generate Community Reports] --> ss[Summarize Community Reports] --> ce[Community Embedding] --> co[Community Tables Emission]</code></pre>
|
||||
<p>At this point, we have a functional graph of entities and relationships, a hierarchy of communities for the entities, as well as node2vec embeddings.</p>
|
||||
sc[Generate Community Reports] --> ss[Summarize Community Reports] --> co[Community Reports Table]</code></pre>
|
||||
<p>At this point, we have a functional graph of entities and relationships and a hierarchy of communities for the entities.</p>
|
||||
<p>Now we want to build on the communities data and generate reports for each community. This gives us a high-level understanding of the graph at several points of graph granularity. For example, if community A is the top-level community, we'll get a report about the entire graph. If the community is lower-level, we'll get a report about a local cluster.</p>
|
||||
<h3 id="generate-community-reports">Generate Community Reports</h3>
|
||||
<p>In this step, we generate a summary of each community using the LLM. This will allow us to understand the distinct information contained within each community and provide a scoped understanding of the graph, from either a high-level or a low-level perspective. These reports contain an executive overview and reference the key entities, relationships, and claims within the community sub-structure.</p>
|
||||
<h3 id="summarize-community-reports">Summarize Community Reports</h3>
|
||||
<p>In this step, each <em>community report</em> is then summarized via the LLM for shorthand use.</p>
|
||||
<h3 id="community-embedding">Community Embedding</h3>
|
||||
<p>In this step, we generate a vector representation of our communities by generating text embeddings of the community report, the community report summary, and the title of the community report.</p>
|
||||
<h3 id="community-tables-emission">Community Tables Emission</h3>
|
||||
<p>At this point, some bookkeeping work is performed and we export the <strong>Communities</strong> and <strong>CommunityReports</strong> tables.</p>
|
||||
<h3 id="community-reports-table">Community Reports Table</h3>
|
||||
<p>At this point, some bookkeeping work is performed and we export the <strong>Community Reports</strong> table.</p>
|
||||
<h2 id="phase-5-document-processing">Phase 5: Document Processing</h2>
|
||||
<p>In this phase of the workflow, we create the <em>Documents</em> table for the knowledge model.</p>
|
||||
<pre class="mermaid"><code>---
|
||||
title: Document Processing
|
||||
---
|
||||
flowchart LR
|
||||
aug[Augment] --> dp[Link to TextUnits] --> de[Avg. Embedding] --> dg[Document Table Emission]</code></pre>
|
||||
aug[Augment] --> dp[Link to TextUnits] --> dg[Documents Table]</code></pre>
|
||||
<h3 id="augment-with-columns-csv-only">Augment with Columns (CSV Only)</h3>
|
||||
<p>If the workflow is operating on CSV data, you may configure your workflow to add additional fields to the Documents output. These fields should exist on the incoming CSV tables. Details about configuring this can be found in the <a href="../../config/overview/">configuration documentation</a>.</p>
|
||||
<h3 id="link-to-textunits">Link to TextUnits</h3>
|
||||
<p>In this step, we link each document to the text-units that were created in the first phase. This allows us to understand which documents are related to which text-units and vice-versa.</p>
|
||||
<h3 id="document-embedding">Document Embedding</h3>
|
||||
<p>In this step, we generate a vector representation of our documents using an average embedding of document slices. We re-chunk documents without overlapping chunks, and then generate an embedding for each chunk. We create an average of these chunks weighted by token-count and use this as the document embedding. This will allow us to understand the implicit relationship between documents, and will help us generate a network representation of our documents.</p>
|
||||
<h3 id="documents-table-emission">Documents Table Emission</h3>
|
||||
<h3 id="documents-table">Documents Table</h3>
|
||||
<p>At this point, we can export the <strong>Documents</strong> table into the knowledge model.</p>
|
||||
<h2 id="phase-6-network-visualization">Phase 6: Network Visualization</h2>
|
||||
<h2 id="phase-6-network-visualization-optional">Phase 6: Network Visualization (optional)</h2>
|
||||
<p>In this phase of the workflow, we perform some steps to support network visualization of our high-dimensional vector spaces within our existing graphs. At this point there are two logical graphs at play: the <em>Entity-Relationship</em> graph and the <em>Document</em> graph.</p>
|
||||
<pre class="mermaid"><code>---
|
||||
title: Network Visualization Workflows
|
||||
---
|
||||
flowchart LR
|
||||
nv[Umap Documents] --> ne[Umap Entities] --> ng[Nodes Table Emission]</code></pre>
|
||||
<p>For each of the logical graphs, we perform a UMAP dimensionality reduction to generate a 2D representation of the graph. This will allow us to visualize the graph in a 2D space and understand the relationships between the nodes in the graph. The UMAP embeddings are then exported as a table of <em>Nodes</em>. The rows of this table include a discriminator indicating whether the node is a document or an entity, and the UMAP coordinates.</p>
|
||||
ag[Graph Table] --> ge[Node2Vec Graph Embedding] --> ne[Umap Entities] --> ng[Nodes Table]</code></pre>
|
||||
<h3 id="graph-embedding">Graph Embedding</h3>
|
||||
<p>In this step, we generate a vector representation of our graph using the Node2Vec algorithm. This will allow us to understand the implicit structure of our graph and provide an additional vector-space in which to search for related concepts during our query phase.</p>
|
||||
<h3 id="dimensionality-reduction">Dimensionality Reduction</h3>
|
||||
<p>For each of the logical graphs, we perform a UMAP dimensionality reduction to generate a 2D representation of the graph. This will allow us to visualize the graph in a 2D space and understand the relationships between the nodes in the graph. The UMAP embeddings are then exported as a table of <em>Nodes</em>. The rows of this table include the UMAP dimensions as x/y coordinates.</p>
|
||||
<h2 id="phase-7-text-embedding">Phase 7: Text Embedding</h2>
|
||||
<p>For all artifacts that require downstream vector search, we generate text embeddings as a final step. These embeddings are written directly to a configured vector store. By default we embed entity descriptions, text unit text, and community report text.</p>
|
||||
<pre class="mermaid"><code>---
|
||||
title: Text Embedding Workflows
|
||||
---
|
||||
flowchart LR
|
||||
textUnits[Text Units] --> text_embed[Text Embedding]
|
||||
graph_outputs[Graph Tables] --> description_embed[Description Embedding]
|
||||
community_report_outputs[Community Reports] --> content_embed[Content Embedding]</code></pre>
|
||||
|
||||
|
||||
|
||||
|
||||
@ -1564,22 +1564,16 @@
|
||||
<li>embed entities into a graph vector space</li>
|
||||
<li>embed text chunks into a textual vector space</li>
|
||||
</ul>
|
||||
<p>The outputs of the pipeline can be stored in a variety of formats, including JSON and Parquet - or they can be handled manually via the Python API.</p>
|
||||
<p>The outputs of the pipeline are stored as Parquet tables by default, and embeddings are written to your configured vector store.</p>
|
||||
<h2 id="getting-started">Getting Started</h2>
|
||||
<h3 id="requirements">Requirements</h3>
|
||||
<p>See the <a href="../../developing/#requirements">requirements</a> section in <a href="../../get_started/">Get Started</a> for details on setting up a development environment.</p>
|
||||
<p>The Indexing Engine can be used in either a default configuration mode or with a custom pipeline.
|
||||
To configure GraphRAG, see the <a href="../../config/overview/">configuration</a> documentation.
|
||||
<p>To configure GraphRAG, see the <a href="../../config/overview/">configuration</a> documentation.
|
||||
After you have a config file you can run the pipeline using the CLI or the Python API.</p>
|
||||
<h2 id="usage">Usage</h2>
|
||||
<h3 id="cli">CLI</h3>
|
||||
<div class="highlight"><pre><span></span><code><a id="__codelineno-0-1" name="__codelineno-0-1" href="#__codelineno-0-1"></a><span class="c1"># Via Poetry</span>
|
||||
<a id="__codelineno-0-2" name="__codelineno-0-2" href="#__codelineno-0-2"></a>poetry<span class="w"> </span>run<span class="w"> </span>poe<span class="w"> </span>cli<span class="w"> </span>--root<span class="w"> </span><data_root><span class="w"> </span><span class="c1"># default config mode</span>
|
||||
<a id="__codelineno-0-3" name="__codelineno-0-3" href="#__codelineno-0-3"></a>poetry<span class="w"> </span>run<span class="w"> </span>poe<span class="w"> </span>cli<span class="w"> </span>--config<span class="w"> </span>your_pipeline.yml<span class="w"> </span><span class="c1"># custom config mode</span>
|
||||
<a id="__codelineno-0-4" name="__codelineno-0-4" href="#__codelineno-0-4"></a>
|
||||
<a id="__codelineno-0-5" name="__codelineno-0-5" href="#__codelineno-0-5"></a><span class="c1"># Via Node</span>
|
||||
<a id="__codelineno-0-6" name="__codelineno-0-6" href="#__codelineno-0-6"></a>yarn<span class="w"> </span>run:index<span class="w"> </span>--root<span class="w"> </span><data_root><span class="w"> </span><span class="c1"># default config mode</span>
|
||||
<a id="__codelineno-0-7" name="__codelineno-0-7" href="#__codelineno-0-7"></a>yarn<span class="w"> </span>run:index<span class="w"> </span>--config<span class="w"> </span>your_pipeline.yml<span class="w"> </span><span class="c1"># custom config mode</span>
|
||||
</code></pre></div>
|
||||
<h3 id="python-api">Python API</h3>
|
||||
<p>Please see the <a href="https://github.com/microsoft/graphrag/blob/main/examples/README.md">examples folder</a> for a handful of functional pipelines illustrating how to create and run them via a custom settings.yml or through custom Python scripts.</p>
|
||||
|
||||
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user