From 7896e8e8cc8d0736ef7df213f20f01653434992c Mon Sep 17 00:00:00 2001
From: natoverse
Date: Fri, 7 Feb 2025 19:12:53 +0000
Subject: [PATCH] Deploying to gh-pages from @
 microsoft/graphrag@c02ab0984afee86789f22311185cc0b841b74ac6 🚀
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples_notebooks/api_overview/index.html  |   65 +-
 examples_notebooks/drift_search/index.html  |  491 ++--
 examples_notebooks/global_search/index.html |  814 ++----
 .../index.html                              |  413 ++-
 .../index.html                              |  127 +-
 .../index_migration_to_v2/index.html        | 2227 +++++++++++++++++
 examples_notebooks/local_search/index.html  | 1225 ++++-----
 sitemap.xml.gz                              |  Bin 127 -> 127 bytes
 8 files changed, 3448 insertions(+), 1914 deletions(-)
 rename examples_notebooks/{index_migration => index_migration_to_v1}/index.html (98%)
 create mode 100644 examples_notebooks/index_migration_to_v2/index.html

diff --git a/examples_notebooks/api_overview/index.html b/examples_notebooks/api_overview/index.html
index 81252a52..24a72c08 100644
--- a/examples_notebooks/api_overview/index.html
+++ b/examples_notebooks/api_overview/index.html
@@ -1992,11 +1992,13 @@ from graphrag.index.typing import PipelineRunResult
import yaml
 
-settings = yaml.safe_load(open("<project_directory>/settings.yaml"))  # noqa: PTH123, SIM115
+PROJECT_DIRECTORY = "<project_directory>"
+settings = yaml.safe_load(open(f"{PROJECT_DIRECTORY}/settings.yaml"))  # noqa: PTH123, SIM115
 
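
The hunk above replaces the hard-coded "<project_directory>" placeholder with a PROJECT_DIRECTORY constant reused by the later cells. For reference, a minimal standalone sketch of the new loading pattern; the pathlib variant is an alternative that the noqa: PTH123, SIM115 suppressions allude to, not what the notebook itself ships:

from pathlib import Path

import yaml

PROJECT_DIRECTORY = "<project_directory>"  # placeholder: set to your project root

# read_text() opens and closes the file, avoiding the unclosed handle
# that the SIM115 suppression in the notebook works around
settings = yaml.safe_load(Path(PROJECT_DIRECTORY, "settings.yaml").read_text())
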
@@ -2011,9 +2013,10 @@ settings = yaml.safe_load(open("<project_directory>/settings.yaml"))  # noqa: PT
 ---------------------------------------------------------------------------
 FileNotFoundError                         Traceback (most recent call last)
-Cell In[3], line 3
+Cell In[3], line 4
       1 import yaml
-----> 3 settings = yaml.safe_load(open("<project_directory>/settings.yaml"))  # noqa: PTH123, SIM115
+      3 PROJECT_DIRECTORY = "<project_directory>"
+----> 4 settings = yaml.safe_load(open(f"{PROJECT_DIRECTORY}/settings.yaml"))  # noqa: PTH123, SIM115
 
 File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/IPython/core/interactiveshell.py:324, in _modified_open(file, *args, **kwargs)
     317 if file in {0, 1, 2}:
@@ -2073,15 +2076,11 @@ File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7
 
 
from graphrag.config.create_graphrag_config import create_graphrag_config
 
-graphrag_config = create_graphrag_config(
-    values=settings, root_dir="<project_directory>"
-)
+graphrag_config = create_graphrag_config(values=settings, root_dir=PROJECT_DIRECTORY)
 
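
Together with the previous cell, the migrated code reduces to the sketch below. Note that the NameError in the following traceback arises only because the earlier cell failed on the placeholder path, leaving settings undefined:

from graphrag.config.create_graphrag_config import create_graphrag_config

# `settings` is the dict parsed from settings.yaml above; root_dir anchors
# any relative paths referenced inside the configuration
graphrag_config = create_graphrag_config(values=settings, root_dir=PROJECT_DIRECTORY)
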
@@ -2096,11 +2095,9 @@ graphrag_config = create_graphrag_config(
 ---------------------------------------------------------------------------
 NameError                                 Traceback (most recent call last)
-Cell In[4], line 4
+Cell In[4], line 3
       1 from graphrag.config.create_graphrag_config import create_graphrag_config
-      3 graphrag_config = create_graphrag_config(
-----> 4     values=settings, root_dir="<project_directory>"
-      5 )
+----> 3 graphrag_config = create_graphrag_config(values=settings, root_dir=PROJECT_DIRECTORY)
 
 NameError: name 'settings' is not defined
@@ -2219,20 +2216,14 @@ Cell In[5], line 1
import pandas as pd
 
-final_nodes = pd.read_parquet("<project_directory>/output/create_final_nodes.parquet")
-final_entities = pd.read_parquet(
-    "<project_directory>/output/create_final_entities.parquet"
-)
-final_communities = pd.read_parquet(
-    "<project_directory>/output/create_final_communities.parquet"
-)
+final_entities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/entities.parquet")
+final_communities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/communities.parquet")
 final_community_reports = pd.read_parquet(
-    "<project_directory>/output/create_final_community_reports.parquet"
+    f"{PROJECT_DIRECTORY}/output/community_reports.parquet"
 )
 
 response, context = await api.global_search(
     config=graphrag_config,
-    nodes=final_nodes,
     entities=final_entities,
     communities=final_communities,
     community_reports=final_community_reports,
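
Assembled, the migrated cell looks roughly like this sketch. The community_level argument is an assumption, since the hunk elides the lines between community_reports and dynamic_community_selection; the api import path is likewise assumed from the notebook's api. usage, and top-level await works because this runs in a notebook:

import pandas as pd

import graphrag.api as api  # assumed: the notebook calls api.global_search

final_entities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/entities.parquet")
final_communities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/communities.parquet")
final_community_reports = pd.read_parquet(
    f"{PROJECT_DIRECTORY}/output/community_reports.parquet"
)

response, context = await api.global_search(
    config=graphrag_config,
    entities=final_entities,
    communities=final_communities,
    community_reports=final_community_reports,
    community_level=2,  # assumed: elided from the hunk shown above
    dynamic_community_selection=False,
    response_type="Multiple Paragraphs",
    query="Who is Scrooge and what are his main relationships?",
)
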
@@ -2282,13 +2267,11 @@ response, context = await api.global_search(
 FileNotFoundError                         Traceback (most recent call last)
 Cell In[6], line 3
       1 import pandas as pd
----> 3 final_nodes = pd.read_parquet("<project_directory>/output/create_final_nodes.parquet")
-      4 final_entities = pd.read_parquet(
-      5     "<project_directory>/output/create_final_entities.parquet"
-      6 )
-      7 final_communities = pd.read_parquet(
-      8     "<project_directory>/output/create_final_communities.parquet"
-      9 )
+----> 3 final_entities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/entities.parquet")
+      4 final_communities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/communities.parquet")
+      5 final_community_reports = pd.read_parquet(
+      6     f"{PROJECT_DIRECTORY}/output/community_reports.parquet"
+      7 )
 
 File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/pandas/io/parquet.py:667, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)
     664 use_nullable_dtypes = False
@@ -2348,7 +2331,7 @@ File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7
     883 handles.append(handle)
     885 # Convert BytesIO or file objects passed with an encoding
-FileNotFoundError: [Errno 2] No such file or directory: '<project_directory>/output/create_final_nodes.parquet'
+FileNotFoundError: [Errno 2] No such file or directory: '<project_directory>/output/entities.parquet'

diff --git a/examples_notebooks/drift_search/index.html b/examples_notebooks/drift_search/index.html
index cc3d3530..9ae3a4ed 100644
--- a/examples_notebooks/drift_search/index.html
+++ b/examples_notebooks/drift_search/index.html
@@ -1889,22 +1889,22 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
 INPUT_DIR = "./inputs/operation dulce"
 LANCEDB_URI = f"{INPUT_DIR}/lancedb"
 
-COMMUNITY_REPORT_TABLE = "create_final_community_reports"
-ENTITY_TABLE = "create_final_nodes"
-ENTITY_EMBEDDING_TABLE = "create_final_entities"
-RELATIONSHIP_TABLE = "create_final_relationships"
-COVARIATE_TABLE = "create_final_covariates"
-TEXT_UNIT_TABLE = "create_final_text_units"
+COMMUNITY_REPORT_TABLE = "community_reports"
+COMMUNITY_TABLE = "communities"
+ENTITY_TABLE = "entities"
+RELATIONSHIP_TABLE = "relationships"
+COVARIATE_TABLE = "covariates"
+TEXT_UNIT_TABLE = "text_units"
 COMMUNITY_LEVEL = 2
 
 # read nodes table to get community and degree data
 entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
-entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
+community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
 
 print(f"Entity df columns: {entity_df.columns}")
 
-entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)
+entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)
 
 # load description embeddings to an in-memory lancedb vectorstore
 # to connect to a remote db, specify url and port values.
@@ -2012,106 +2012,76 @@ text_unit_df.head()
@@ -2184,8 +2154,8 @@ text_embedder = OpenAIEmbedding(
@@ -2161,10 +2161,9 @@ token_encoder = tiktoken.encoding_for_model(llm_model)
# parquet files generated from indexing pipeline
 INPUT_DIR = "./inputs/operation dulce"
-COMMUNITY_TABLE = "create_final_communities"
-COMMUNITY_REPORT_TABLE = "create_final_community_reports"
-ENTITY_TABLE = "create_final_nodes"
-ENTITY_EMBEDDING_TABLE = "create_final_entities"
+COMMUNITY_TABLE = "communities"
+COMMUNITY_REPORT_TABLE = "community_reports"
+ENTITY_TABLE = "entities"
 
 # community level in the Leiden community hierarchy from which we will load the community reports
 # higher value means we use reports from more fine-grained communities (at the cost of higher computation cost)
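
The renames across this and the preceding hunks follow one pattern: the create_final_ prefix is dropped, and the separate nodes/entities tables are consolidated into a single entities table. A hedged helper sketch that captures the mapping; the names are taken from the hunks in this patch, and the helper itself is illustrative, not an official migration utility:

from pathlib import Path

# v1 -> v2 parquet table names, as reflected in the hunks in this patch
V1_TO_V2_TABLES = {
    "create_final_communities": "communities",
    "create_final_community_reports": "community_reports",
    "create_final_nodes": "entities",  # nodes and entities were consolidated
    "create_final_entities": "entities",
    "create_final_relationships": "relationships",
    "create_final_covariates": "covariates",
    "create_final_text_units": "text_units",
}


def v2_table_path(output_dir: str, v1_name: str) -> Path:
    """Map a v1 table name to the v2 parquet file it now lives in."""
    return Path(output_dir) / f"{V1_TO_V2_TABLES.get(v1_name, v1_name)}.parquet"
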
@@ -2207,11 +2205,10 @@ COMMUNITY_LEVEL = 2
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
 entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
 report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
-entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
 
-communities = read_indexer_communities(community_df, entity_df, report_df)
-reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
-entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)
+communities = read_indexer_communities(community_df, report_df)
+reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
+entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)
 
 print(f"Total report count: {len(report_df)}")
 print(
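
Taken together, the adapter calls lose their dependence on the node and embedding frames: read_indexer_communities drops entity_df, while read_indexer_reports and read_indexer_entities take community_df in its place. A sketch of the full v2 loading sequence; the graphrag.query.indexer_adapters import path is an assumption based on the library's layout, since the notebook's import cell is not shown in this patch:

import pandas as pd

# import path assumed; the notebook imports these helpers by name
from graphrag.query.indexer_adapters import (
    read_indexer_communities,
    read_indexer_entities,
    read_indexer_reports,
)

INPUT_DIR = "./inputs/operation dulce"
COMMUNITY_LEVEL = 2

community_df = pd.read_parquet(f"{INPUT_DIR}/communities.parquet")
entity_df = pd.read_parquet(f"{INPUT_DIR}/entities.parquet")
report_df = pd.read_parquet(f"{INPUT_DIR}/community_reports.parquet")

communities = read_indexer_communities(community_df, report_df)
reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)
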
@@ -2245,138 +2241,74 @@ report_df.head()
@@ -2393,8 +2325,8 @@ Report count after filtering by community level 2: 56
@@ -3174,9 +2690,17 @@ print(
@@ -2055,10 +2055,9 @@ token_encoder = tiktoken.encoding_for_model(llm_model)
# parquet files generated from indexing pipeline
 INPUT_DIR = "./inputs/operation dulce"
-COMMUNITY_TABLE = "create_final_communities"
-COMMUNITY_REPORT_TABLE = "create_final_community_reports"
-ENTITY_TABLE = "create_final_nodes"
-ENTITY_EMBEDDING_TABLE = "create_final_entities"
+COMMUNITY_TABLE = "communities"
+COMMUNITY_REPORT_TABLE = "community_reports"
+ENTITY_TABLE = "entities"
 
 # we don't fix a specific community level but instead use an agent to dynamically
 # search through all the community reports to check if they are relevant.
@@ -2101,17 +2099,16 @@ COMMUNITY_LEVEL = None
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
 entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
 report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
-entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
 
-communities = read_indexer_communities(community_df, entity_df, report_df)
+communities = read_indexer_communities(community_df, report_df)
 reports = read_indexer_reports(
     report_df,
-    entity_df,
+    community_df,
     community_level=COMMUNITY_LEVEL,
     dynamic_community_selection=True,
 )
 entities = read_indexer_entities(
-    entity_df, entity_embedding_df, community_level=COMMUNITY_LEVEL
+    entity_df, community_df, community_level=COMMUNITY_LEVEL
 )
 
 print(f"Total report count: {len(report_df)}")
@@ -2153,138 +2149,74 @@ report_df.head()
@@ -2303,8 +2235,8 @@ Report count after filtering by community level None: 72
@@ -2722,12 +2666,17 @@ print(
@@ -2318,9 +2318,9 @@ COMMUNITY_LEVEL = 2
# read nodes table to get community and degree data
 entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
-entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
+community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
 
-entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)
+entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)
 
 # load description embeddings to an in-memory lancedb vectorstore
 # to connect to a remote db, specify url and port values.
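
The comment above refers to the LanceDB store that holds the entity description embeddings. A sketch of that connection, following the LanceDBVectorStore import shown in the drift_search hunk; the collection name is an assumption based on graphrag's default embedding naming, so check your own lancedb directory for the actual table name:

from graphrag.vector_stores.lancedb import LanceDBVectorStore

LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# collection name assumed from graphrag's default naming convention
description_embedding_store = LanceDBVectorStore(
    collection_name="default-entity-description",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
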
@@ -2357,101 +2357,75 @@ entity_df.head()
@@ -2508,101 +2482,74 @@ relationship_df.head()
@@ -2654,9 +2601,76 @@ covariates = {"claims": claims}
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
-reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
+reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
 
 print(f"Report records: {len(report_df)}")
 report_df.head()
 
@@ -2713,137 +2727,74 @@ report_df.head()
@@ -2900,101 +2851,74 @@ text_unit_df.head()
@@ -3078,8 +3002,8 @@ text_embedder = OpenAIEmbedding(
@@ -3383,29 +3347,16 @@ print(result.response)
@@ -3452,78 +3403,16 @@ In summary, Dr. Jordan Hayes is a pivotal member of the Paranormal Military Squa
@@ -3559,97 +3448,23 @@ In summary, Dr. Jordan Hayes is a pivotal member of the Paranormal Military Squa