From be1909b0da84a0a37860f5a60778df0a781784f7 Mon Sep 17 00:00:00 2001 From: KennyZhang1 <90438893+KennyZhang1@users.noreply.github.com> Date: Fri, 7 Feb 2025 22:16:23 +0000 Subject: [PATCH] =?UTF-8?q?Deploying=20to=20gh-pages=20from=20@=20microsof?= =?UTF-8?q?t/graphrag@83cc2daf91fe9c39dbc6591f55885e61a947e4b8=20?= =?UTF-8?q?=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples_notebooks/api_overview/index.html | 65 +- examples_notebooks/drift_search/index.html | 643 +++-- examples_notebooks/global_search/index.html | 1326 ++++++++-- .../index.html | 612 +++-- .../index.html | 127 +- .../index_migration_to_v2/index.html | 2227 ----------------- examples_notebooks/local_search/index.html | 1182 +++++---- 7 files changed, 2858 insertions(+), 3324 deletions(-) rename examples_notebooks/{index_migration_to_v1 => index_migration}/index.html (98%) delete mode 100644 examples_notebooks/index_migration_to_v2/index.html diff --git a/examples_notebooks/api_overview/index.html b/examples_notebooks/api_overview/index.html index 24a72c08..81252a52 100644 --- a/examples_notebooks/api_overview/index.html +++ b/examples_notebooks/api_overview/index.html @@ -1992,13 +1992,11 @@ from graphrag.index.typing import PipelineRunResult
import yaml
 
-PROJECT_DIRECTORY = "<project_directory>"
-settings = yaml.safe_load(open(f"{PROJECT_DIRECTORY}/settings.yaml"))  # noqa: PTH123, SIM115
+settings = yaml.safe_load(open("<project_directory>/settings.yaml"))  # noqa: PTH123, SIM115
 
import yaml -PROJECT_DIRECTORY = "" -settings = yaml.safe_load(open(f"{PROJECT_DIRECTORY}/settings.yaml")) # noqa: PTH123, SIM115
+settings = yaml.safe_load(open("/settings.yaml")) # noqa: PTH123, SIM115 @@ -2013,10 +2011,9 @@ settings = yaml.safe_load(open(f"{PROJECT_DIRECTORY}/settings.yaml")) # noqa: P
 ---------------------------------------------------------------------------
 FileNotFoundError                         Traceback (most recent call last)
-Cell In[3], line 4
+Cell In[3], line 3
       1 import yaml
-      3 PROJECT_DIRECTORY = "<project_directory>"
-----> 4 settings = yaml.safe_load(open(f"{PROJECT_DIRECTORY}/settings.yaml"))  # noqa: PTH123, SIM115
+----> 3 settings = yaml.safe_load(open("<project_directory>/settings.yaml"))  # noqa: PTH123, SIM115
 
 File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/IPython/core/interactiveshell.py:324, in _modified_open(file, *args, **kwargs)
     317 if file in {0, 1, 2}:
@@ -2076,11 +2073,15 @@ File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7
 
 
from graphrag.config.create_graphrag_config import create_graphrag_config
 
-graphrag_config = create_graphrag_config(values=settings, root_dir=PROJECT_DIRECTORY)
+graphrag_config = create_graphrag_config(
+    values=settings, root_dir="<project_directory>"
+)
 
from graphrag.config.create_graphrag_config import create_graphrag_config -graphrag_config = create_graphrag_config(values=settings, root_dir=PROJECT_DIRECTORY)
+graphrag_config = create_graphrag_config( + values=settings, root_dir="" +) @@ -2095,9 +2096,11 @@ graphrag_config = create_graphrag_config(values=settings, root_dir=PROJECT_DIREC
 ---------------------------------------------------------------------------
 NameError                                 Traceback (most recent call last)
-Cell In[4], line 3
+Cell In[4], line 4
       1 from graphrag.config.create_graphrag_config import create_graphrag_config
-----> 3 graphrag_config = create_graphrag_config(values=settings, root_dir=PROJECT_DIRECTORY)
+      3 graphrag_config = create_graphrag_config(
+----> 4     values=settings, root_dir="<project_directory>"
+      5 )
 
 NameError: name 'settings' is not defined
@@ -2216,14 +2219,20 @@ Cell In[5], line 1
import pandas as pd
 
-final_entities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/entities.parquet")
-final_communities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/communities.parquet")
+final_nodes = pd.read_parquet("<project_directory>/output/create_final_nodes.parquet")
+final_entities = pd.read_parquet(
+    "<project_directory>/output/create_final_entities.parquet"
+)
+final_communities = pd.read_parquet(
+    "<project_directory>/output/create_final_communities.parquet"
+)
 final_community_reports = pd.read_parquet(
-    f"{PROJECT_DIRECTORY}/output/community_reports.parquet"
+    "<project_directory>/output/create_final_community_reports.parquet"
 )
 
 response, context = await api.global_search(
     config=graphrag_config,
+    nodes=final_nodes,
     entities=final_entities,
     communities=final_communities,
     community_reports=final_community_reports,
@@ -2235,14 +2244,20 @@ Cell In[5], line 1
 
import pandas as pd -final_entities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/entities.parquet") -final_communities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/communities.parquet") +final_nodes = pd.read_parquet("/output/create_final_nodes.parquet") +final_entities = pd.read_parquet( + "/output/create_final_entities.parquet" +) +final_communities = pd.read_parquet( + "/output/create_final_communities.parquet" +) final_community_reports = pd.read_parquet( - f"{PROJECT_DIRECTORY}/output/community_reports.parquet" + "/output/create_final_community_reports.parquet" ) response, context = await api.global_search( config=graphrag_config, + nodes=final_nodes, entities=final_entities, communities=final_communities, community_reports=final_community_reports, @@ -2250,7 +2265,7 @@ response, context = await api.global_search( dynamic_community_selection=False, response_type="Multiple Paragraphs", query="Who is Scrooge and what are his main relationships?", -)
+) @@ -2267,11 +2282,13 @@ response, context = await api.global_search( FileNotFoundError Traceback (most recent call last) Cell In[6], line 3 1 import pandas as pd -----> 3 final_entities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/entities.parquet") - 4 final_communities = pd.read_parquet(f"{PROJECT_DIRECTORY}/output/communities.parquet") - 5 final_community_reports = pd.read_parquet( - 6 f"{PROJECT_DIRECTORY}/output/community_reports.parquet" - 7 ) +----> 3 final_nodes = pd.read_parquet("<project_directory>/output/create_final_nodes.parquet") + 4 final_entities = pd.read_parquet( + 5 "<project_directory>/output/create_final_entities.parquet" + 6 ) + 7 final_communities = pd.read_parquet( + 8 "<project_directory>/output/create_final_communities.parquet" + 9 ) File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/pandas/io/parquet.py:667, in read_parquet(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs) 664 use_nullable_dtypes = False @@ -2331,7 +2348,7 @@ File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7 883 handles.append(handle) 885 # Convert BytesIO or file objects passed with an encoding -FileNotFoundError: [Errno 2] No such file or directory: '<project_directory>/output/entities.parquet'
+FileNotFoundError: [Errno 2] No such file or directory: '<project_directory>/output/create_final_nodes.parquet' diff --git a/examples_notebooks/drift_search/index.html b/examples_notebooks/drift_search/index.html index 9ae3a4ed..ae85de90 100644 --- a/examples_notebooks/drift_search/index.html +++ b/examples_notebooks/drift_search/index.html @@ -1889,22 +1889,22 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: INPUT_DIR = "./inputs/operation dulce" LANCEDB_URI = f"{INPUT_DIR}/lancedb" -COMMUNITY_REPORT_TABLE = "community_reports" -COMMUNITY_TABLE = "communities" -ENTITY_TABLE = "entities" -RELATIONSHIP_TABLE = "relationships" -COVARIATE_TABLE = "covariates" -TEXT_UNIT_TABLE = "text_units" +COMMUNITY_REPORT_TABLE = "create_final_community_reports" +ENTITY_TABLE = "create_final_nodes" +ENTITY_EMBEDDING_TABLE = "create_final_entities" +RELATIONSHIP_TABLE = "create_final_relationships" +COVARIATE_TABLE = "create_final_covariates" +TEXT_UNIT_TABLE = "create_final_text_units" COMMUNITY_LEVEL = 2 # read nodes table to get community and degree data entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet") -community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet") +entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet") print(f"Entity df columns: {entity_df.columns}") -entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL) +entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL) # load description embeddings to an in-memory lancedb vectorstore # to connect to a remote db, specify url and port values. @@ -1959,22 +1959,22 @@ from graphrag.vector_stores.lancedb import LanceDBVectorStore INPUT_DIR = "./inputs/operation dulce" LANCEDB_URI = f"{INPUT_DIR}/lancedb" -COMMUNITY_REPORT_TABLE = "community_reports" -COMMUNITY_TABLE = "communities" -ENTITY_TABLE = "entities" -RELATIONSHIP_TABLE = "relationships" -COVARIATE_TABLE = "covariates" -TEXT_UNIT_TABLE = "text_units" +COMMUNITY_REPORT_TABLE = "create_final_community_reports" +ENTITY_TABLE = "create_final_nodes" +ENTITY_EMBEDDING_TABLE = "create_final_entities" +RELATIONSHIP_TABLE = "create_final_relationships" +COVARIATE_TABLE = "create_final_covariates" +TEXT_UNIT_TABLE = "create_final_text_units" COMMUNITY_LEVEL = 2 # read nodes table to get community and degree data entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet") -community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet") +entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet") print(f"Entity df columns: {entity_df.columns}") -entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL) +entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL) # load description embeddings to an in-memory lancedb vectorstore # to connect to a remote db, specify url and port values. @@ -2012,76 +2012,106 @@ text_unit_df.head()
- +
@@ -2154,8 +2184,8 @@ text_embedder = OpenAIEmbedding(
-
-
+
+
-
-
-
- -
-
- -
-
+
+
-
- - -
@@ -2448,11 +2351,311 @@ Cell In[5], line 13
diff --git a/examples_notebooks/global_search/index.html b/examples_notebooks/global_search/index.html index ec8b8d28..468dd175 100644 --- a/examples_notebooks/global_search/index.html +++ b/examples_notebooks/global_search/index.html @@ -2133,9 +2133,9 @@ token_encoder = tiktoken.encoding_for_model(llm_model)
@@ -2161,9 +2161,10 @@ token_encoder = tiktoken.encoding_for_model(llm_model)
# parquet files generated from indexing pipeline
 INPUT_DIR = "./inputs/operation dulce"
-COMMUNITY_TABLE = "communities"
-COMMUNITY_REPORT_TABLE = "community_reports"
-ENTITY_TABLE = "entities"
+COMMUNITY_TABLE = "create_final_communities"
+COMMUNITY_REPORT_TABLE = "create_final_community_reports"
+ENTITY_TABLE = "create_final_nodes"
+ENTITY_EMBEDDING_TABLE = "create_final_entities"
 
 # community level in the Leiden community hierarchy from which we will load the community reports
 # higher value means we use reports from more fine-grained communities (at the cost of higher computation cost)
@@ -2171,9 +2172,10 @@ token_encoder = tiktoken.encoding_for_model(llm_model)
# parquet files generated from indexing pipeline INPUT_DIR = "./inputs/operation dulce" -COMMUNITY_TABLE = "communities" -COMMUNITY_REPORT_TABLE = "community_reports" -ENTITY_TABLE = "entities" +COMMUNITY_TABLE = "create_final_communities" +COMMUNITY_REPORT_TABLE = "create_final_community_reports" +ENTITY_TABLE = "create_final_nodes" +ENTITY_EMBEDDING_TABLE = "create_final_entities" # community level in the Leiden community hierarchy from which we will load the community reports # higher value means we use reports from more fine-grained communities (at the cost of higher computation cost) @@ -2205,10 +2207,11 @@ COMMUNITY_LEVEL = 2
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
 entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
 report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
+entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
 
-communities = read_indexer_communities(community_df, report_df)
-reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
-entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)
+communities = read_indexer_communities(community_df, entity_df, report_df)
+reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
+entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)
 
 print(f"Total report count: {len(report_df)}")
 print(
@@ -2220,10 +2223,11 @@ COMMUNITY_LEVEL = 2
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet") entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet") report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet") +entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet") -communities = read_indexer_communities(community_df, report_df) -reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL) -entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL) +communities = read_indexer_communities(community_df, entity_df, report_df) +reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL) +entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL) print(f"Total report count: {len(report_df)}") print( @@ -2241,74 +2245,138 @@ report_df.head()
- +
@@ -2325,8 +2393,8 @@ File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7
-
-
+
+
-
- - -
@@ -2467,8 +2512,8 @@ reduce_llm_params = {
-
-
+
+
-
- - -
@@ -2589,16 +2604,528 @@ print(result.response)
+
+
+ + +
+
+ + +
+
+ + +
+
+ +
@@ -2636,17 +3163,486 @@ result.context_data["reports"]
-
- - @@ -2690,17 +3686,9 @@ print( @@ -2055,9 +2055,10 @@ token_encoder = tiktoken.encoding_for_model(llm_model)
# parquet files generated from indexing pipeline
 INPUT_DIR = "./inputs/operation dulce"
-COMMUNITY_TABLE = "communities"
-COMMUNITY_REPORT_TABLE = "community_reports"
-ENTITY_TABLE = "entities"
+COMMUNITY_TABLE = "create_final_communities"
+COMMUNITY_REPORT_TABLE = "create_final_community_reports"
+ENTITY_TABLE = "create_final_nodes"
+ENTITY_EMBEDDING_TABLE = "create_final_entities"
 
 # we don't fix a specific community level but instead use an agent to dynamicially
 # search through all the community reports to check if they are relevant.
@@ -2065,9 +2066,10 @@ token_encoder = tiktoken.encoding_for_model(llm_model)
# parquet files generated from indexing pipeline INPUT_DIR = "./inputs/operation dulce" -COMMUNITY_TABLE = "communities" -COMMUNITY_REPORT_TABLE = "community_reports" -ENTITY_TABLE = "entities" +COMMUNITY_TABLE = "create_final_communities" +COMMUNITY_REPORT_TABLE = "create_final_community_reports" +ENTITY_TABLE = "create_final_nodes" +ENTITY_EMBEDDING_TABLE = "create_final_entities" # we don't fix a specific community level but instead use an agent to dynamicially # search through all the community reports to check if they are relevant. @@ -2099,16 +2101,17 @@ COMMUNITY_LEVEL = None
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
 entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
 report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
+entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
 
-communities = read_indexer_communities(community_df, report_df)
+communities = read_indexer_communities(community_df, entity_df, report_df)
 reports = read_indexer_reports(
     report_df,
-    community_df,
+    entity_df,
     community_level=COMMUNITY_LEVEL,
     dynamic_community_selection=True,
 )
 entities = read_indexer_entities(
-    entity_df, community_df, community_level=COMMUNITY_LEVEL
+    entity_df, entity_embedding_df, community_level=COMMUNITY_LEVEL
 )
 
 print(f"Total report count: {len(report_df)}")
@@ -2121,16 +2124,17 @@ COMMUNITY_LEVEL = None
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet") entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet") report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet") +entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet") -communities = read_indexer_communities(community_df, report_df) +communities = read_indexer_communities(community_df, entity_df, report_df) reports = read_indexer_reports( report_df, - community_df, + entity_df, community_level=COMMUNITY_LEVEL, dynamic_community_selection=True, ) entities = read_indexer_entities( - entity_df, community_df, community_level=COMMUNITY_LEVEL + entity_df, entity_embedding_df, community_level=COMMUNITY_LEVEL ) print(f"Total report count: {len(report_df)}") @@ -2149,74 +2153,138 @@ report_df.head()
- +
@@ -2235,8 +2303,8 @@ File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7
-
-
+
+
-
- - -
@@ -2415,8 +2448,8 @@ reduce_llm_params = {
-
-
+
+
-
- - -
@@ -2539,14 +2542,317 @@ print(result.response)
 ---------------------------------------------------------------------------
-NameError                                 Traceback (most recent call last)
+RateLimitError                            Traceback (most recent call last)
 Cell In[9], line 1
-----> 1 result = await search_engine.asearch(
+----> 1 result = await search_engine.asearch(
       2     "What is Cosmic Vocalization and who are involved in it?"
       3 )
       5 print(result.response)
 
-NameError: name 'search_engine' is not defined
+File ~/work/graphrag/graphrag/graphrag/query/structured_search/global_search/search.py:156, in GlobalSearch.asearch(self, query, conversation_history, **kwargs) + 153 llm_calls, prompt_tokens, output_tokens = {}, {}, {} + 155 start_time = time.time() +--> 156 context_result = await self.context_builder.build_context( + 157 query=query, + 158 conversation_history=conversation_history, + 159 **self.context_builder_params, + 160 ) + 161 llm_calls["build_context"] = context_result.llm_calls + 162 prompt_tokens["build_context"] = context_result.prompt_tokens + +File ~/work/graphrag/graphrag/graphrag/query/structured_search/global_search/community_context.py:98, in GlobalCommunityContext.build_context(self, query, conversation_history, use_community_summary, column_delimiter, shuffle_data, include_community_rank, min_community_rank, community_rank_name, include_community_weight, community_weight_name, normalize_community_weight, max_tokens, context_name, conversation_history_user_turns_only, conversation_history_max_turns, **kwargs) + 93 community_reports = self.community_reports + 94 if self.dynamic_community_selection is not None: + 95 ( + 96 community_reports, + 97 dynamic_info, +---> 98 ) = await self.dynamic_community_selection.select(query) + 99 llm_calls += dynamic_info["llm_calls"] + 100 prompt_tokens += dynamic_info["prompt_tokens"] + +File ~/work/graphrag/graphrag/graphrag/query/context_builder/dynamic_community_selection.py:109, in DynamicCommunitySelection.select(self, query) + 106 relevant_communities = set() + 108 while queue: +--> 109 gather_results = await asyncio.gather(*[ + 110 rate_relevancy( + 111 query=query, + 112 description=( + 113 self.reports[community].summary + 114 if self.use_summary + 115 else self.reports[community].full_content + 116 ), + 117 llm=self.llm, + 118 token_encoder=self.token_encoder, + 119 rate_query=self.rate_query, + 120 num_repeats=self.num_repeats, + 121 semaphore=self.semaphore, + 122 **self.llm_kwargs, + 123 ) + 124 for community in queue + 125 ]) + 127 communities_to_rate = [] + 128 for community, result in zip(queue, gather_results, strict=True): + +File ~/work/graphrag/graphrag/graphrag/query/context_builder/rate_relevancy.py:54, in rate_relevancy(query, description, llm, token_encoder, rate_query, num_repeats, semaphore, **llm_kwargs) + 52 for _ in range(num_repeats): + 53 async with semaphore if semaphore is not None else nullcontext(): +---> 54 response = await llm.agenerate(messages=messages, **llm_kwargs) + 55 try: + 56 _, parsed_response = try_parse_json_object(response) + +File ~/work/graphrag/graphrag/graphrag/query/llm/oai/chat_openai.py:142, in ChatOpenAI.agenerate(self, messages, streaming, callbacks, **kwargs) + 135 try: + 136 retryer = AsyncRetrying( + 137 stop=stop_after_attempt(self.max_retries), + 138 wait=wait_exponential_jitter(max=10), + 139 reraise=True, + 140 retry=retry_if_exception_type(self.retry_error_types), # type: ignore + 141 ) +--> 142 async for attempt in retryer: + 143 with attempt: + 144 return await self._agenerate( + 145 messages=messages, + 146 streaming=streaming, + 147 callbacks=callbacks, + 148 **kwargs, + 149 ) + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:166, in AsyncRetrying.__anext__(self) + 164 async def __anext__(self) -> AttemptManager: + 165 while True: +--> 166 do = await self.iter(retry_state=self._retry_state) + 167 if do is None: + 168 raise StopAsyncIteration + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/tenacity/asyncio/__init__.py:153, in AsyncRetrying.iter(self, retry_state) + 151 result = None + 152 for action in self.iter_state.actions: +--> 153 result = await action(retry_state) + 154 return result + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/tenacity/_utils.py:99, in wrap_to_async_func.<locals>.inner(*args, **kwargs) + 98 async def inner(*args: typing.Any, **kwargs: typing.Any) -> typing.Any: +---> 99 return call(*args, **kwargs) + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/tenacity/__init__.py:418, in BaseRetrying._post_stop_check_actions.<locals>.exc_check(rs) + 416 retry_exc = self.retry_error_cls(fut) + 417 if self.reraise: +--> 418 raise retry_exc.reraise() + 419 raise retry_exc from fut.exception() + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/tenacity/__init__.py:185, in RetryError.reraise(self) + 183 def reraise(self) -> t.NoReturn: + 184 if self.last_attempt.failed: +--> 185 raise self.last_attempt.result() + 186 raise self + +File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/concurrent/futures/_base.py:449, in Future.result(self, timeout) + 447 raise CancelledError() + 448 elif self._state == FINISHED: +--> 449 return self.__get_result() + 451 self._condition.wait(timeout) + 453 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: + +File /opt/hostedtoolcache/Python/3.11.11/x64/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self) + 399 if self._exception: + 400 try: +--> 401 raise self._exception + 402 finally: + 403 # Break a reference cycle with the exception in self._exception + 404 self = None + +File ~/work/graphrag/graphrag/graphrag/query/llm/oai/chat_openai.py:144, in ChatOpenAI.agenerate(self, messages, streaming, callbacks, **kwargs) + 142 async for attempt in retryer: + 143 with attempt: +--> 144 return await self._agenerate( + 145 messages=messages, + 146 streaming=streaming, + 147 callbacks=callbacks, + 148 **kwargs, + 149 ) + 150 except RetryError as e: + 151 self._reporter.error(f"Error at agenerate(): {e}") + +File ~/work/graphrag/graphrag/graphrag/query/llm/oai/chat_openai.py:268, in ChatOpenAI._agenerate(self, messages, streaming, callbacks, **kwargs) + 266 if not model: + 267 raise ValueError(_MODEL_REQUIRED_MSG) +--> 268 response = await self.async_client.chat.completions.create( # type: ignore + 269 model=model, + 270 messages=messages, # type: ignore + 271 stream=streaming, + 272 **kwargs, + 273 ) + 274 if streaming: + 275 full_response = "" + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/resources/chat/completions.py:1720, in AsyncCompletions.create(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, reasoning_effort, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, extra_headers, extra_query, extra_body, timeout) + 1678 @required_args(["messages", "model"], ["messages", "model", "stream"]) + 1679 async def create( + 1680 self, + (...) + 1717 timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN, + 1718 ) -> ChatCompletion | AsyncStream[ChatCompletionChunk]: + 1719 validate_response_format(response_format) +-> 1720 return await self._post( + 1721 "/chat/completions", + 1722 body=await async_maybe_transform( + 1723 { + 1724 "messages": messages, + 1725 "model": model, + 1726 "audio": audio, + 1727 "frequency_penalty": frequency_penalty, + 1728 "function_call": function_call, + 1729 "functions": functions, + 1730 "logit_bias": logit_bias, + 1731 "logprobs": logprobs, + 1732 "max_completion_tokens": max_completion_tokens, + 1733 "max_tokens": max_tokens, + 1734 "metadata": metadata, + 1735 "modalities": modalities, + 1736 "n": n, + 1737 "parallel_tool_calls": parallel_tool_calls, + 1738 "prediction": prediction, + 1739 "presence_penalty": presence_penalty, + 1740 "reasoning_effort": reasoning_effort, + 1741 "response_format": response_format, + 1742 "seed": seed, + 1743 "service_tier": service_tier, + 1744 "stop": stop, + 1745 "store": store, + 1746 "stream": stream, + 1747 "stream_options": stream_options, + 1748 "temperature": temperature, + 1749 "tool_choice": tool_choice, + 1750 "tools": tools, + 1751 "top_logprobs": top_logprobs, + 1752 "top_p": top_p, + 1753 "user": user, + 1754 }, + 1755 completion_create_params.CompletionCreateParams, + 1756 ), + 1757 options=make_request_options( + 1758 extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout + 1759 ), + 1760 cast_to=ChatCompletion, + 1761 stream=stream or False, + 1762 stream_cls=AsyncStream[ChatCompletionChunk], + 1763 ) + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1849, in AsyncAPIClient.post(self, path, cast_to, body, files, options, stream, stream_cls) + 1835 async def post( + 1836 self, + 1837 path: str, + (...) + 1844 stream_cls: type[_AsyncStreamT] | None = None, + 1845 ) -> ResponseT | _AsyncStreamT: + 1846 opts = FinalRequestOptions.construct( + 1847 method="post", url=path, json_data=body, files=await async_to_httpx_files(files), **options + 1848 ) +-> 1849 return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls) + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1543, in AsyncAPIClient.request(self, cast_to, options, stream, stream_cls, remaining_retries) + 1540 else: + 1541 retries_taken = 0 +-> 1543 return await self._request( + 1544 cast_to=cast_to, + 1545 options=options, + 1546 stream=stream, + 1547 stream_cls=stream_cls, + 1548 retries_taken=retries_taken, + 1549 ) + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1629, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) + 1627 if remaining_retries > 0 and self._should_retry(err.response): + 1628 await err.response.aclose() +-> 1629 return await self._retry_request( + 1630 input_options, + 1631 cast_to, + 1632 retries_taken=retries_taken, + 1633 response_headers=err.response.headers, + 1634 stream=stream, + 1635 stream_cls=stream_cls, + 1636 ) + 1638 # If the response is streamed then we need to explicitly read the response + 1639 # to completion before attempting to access the response text. + 1640 if not err.response.is_closed: + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1676, in AsyncAPIClient._retry_request(self, options, cast_to, retries_taken, response_headers, stream, stream_cls) + 1672 log.info("Retrying request to %s in %f seconds", options.url, timeout) + 1674 await anyio.sleep(timeout) +-> 1676 return await self._request( + 1677 options=options, + 1678 cast_to=cast_to, + 1679 retries_taken=retries_taken + 1, + 1680 stream=stream, + 1681 stream_cls=stream_cls, + 1682 ) + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1629, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) + 1627 if remaining_retries > 0 and self._should_retry(err.response): + 1628 await err.response.aclose() +-> 1629 return await self._retry_request( + 1630 input_options, + 1631 cast_to, + 1632 retries_taken=retries_taken, + 1633 response_headers=err.response.headers, + 1634 stream=stream, + 1635 stream_cls=stream_cls, + 1636 ) + 1638 # If the response is streamed then we need to explicitly read the response + 1639 # to completion before attempting to access the response text. + 1640 if not err.response.is_closed: + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1676, in AsyncAPIClient._retry_request(self, options, cast_to, retries_taken, response_headers, stream, stream_cls) + 1672 log.info("Retrying request to %s in %f seconds", options.url, timeout) + 1674 await anyio.sleep(timeout) +-> 1676 return await self._request( + 1677 options=options, + 1678 cast_to=cast_to, + 1679 retries_taken=retries_taken + 1, + 1680 stream=stream, + 1681 stream_cls=stream_cls, + 1682 ) + + [... skipping similar frames: AsyncAPIClient._request at line 1629 (17 times), AsyncAPIClient._retry_request at line 1676 (17 times)] + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1629, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) + 1627 if remaining_retries > 0 and self._should_retry(err.response): + 1628 await err.response.aclose() +-> 1629 return await self._retry_request( + 1630 input_options, + 1631 cast_to, + 1632 retries_taken=retries_taken, + 1633 response_headers=err.response.headers, + 1634 stream=stream, + 1635 stream_cls=stream_cls, + 1636 ) + 1638 # If the response is streamed then we need to explicitly read the response + 1639 # to completion before attempting to access the response text. + 1640 if not err.response.is_closed: + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1676, in AsyncAPIClient._retry_request(self, options, cast_to, retries_taken, response_headers, stream, stream_cls) + 1672 log.info("Retrying request to %s in %f seconds", options.url, timeout) + 1674 await anyio.sleep(timeout) +-> 1676 return await self._request( + 1677 options=options, + 1678 cast_to=cast_to, + 1679 retries_taken=retries_taken + 1, + 1680 stream=stream, + 1681 stream_cls=stream_cls, + 1682 ) + +File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/openai/_base_client.py:1644, in AsyncAPIClient._request(self, cast_to, options, stream, stream_cls, retries_taken) + 1641 await err.response.aread() + 1643 log.debug("Re-raising status error") +-> 1644 raise self._make_status_error_from_response(err.response) from None + 1646 return await self._process_response( + 1647 cast_to=cast_to, + 1648 options=options, + (...) + 1652 retries_taken=retries_taken, + 1653 ) + +RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
diff --git a/examples_notebooks/index_migration_to_v1/index.html b/examples_notebooks/index_migration/index.html similarity index 98% rename from examples_notebooks/index_migration_to_v1/index.html rename to examples_notebooks/index_migration/index.html index a1c0a3f6..4eaaa3ba 100644 --- a/examples_notebooks/index_migration_to_v1/index.html +++ b/examples_notebooks/index_migration/index.html @@ -16,7 +16,7 @@ - Index migration to v1 - GraphRAG + Index migration - GraphRAG @@ -72,7 +72,7 @@
- + Skip to content @@ -106,7 +106,7 @@
- Index migration to v1 + Index migration
@@ -1352,9 +1352,9 @@
INPUT_DIR = "./inputs/operation dulce"
 LANCEDB_URI = f"{INPUT_DIR}/lancedb"
 
-COMMUNITY_REPORT_TABLE = "community_reports"
-ENTITY_TABLE = "entities"
-COMMUNITY_TABLE = "communities"
-RELATIONSHIP_TABLE = "relationships"
-COVARIATE_TABLE = "covariates"
-TEXT_UNIT_TABLE = "text_units"
+COMMUNITY_REPORT_TABLE = "create_final_community_reports"
+ENTITY_TABLE = "create_final_nodes"
+ENTITY_EMBEDDING_TABLE = "create_final_entities"
+RELATIONSHIP_TABLE = "create_final_relationships"
+COVARIATE_TABLE = "create_final_covariates"
+TEXT_UNIT_TABLE = "create_final_text_units"
 COMMUNITY_LEVEL = 2
 
INPUT_DIR = "./inputs/operation dulce" LANCEDB_URI = f"{INPUT_DIR}/lancedb" -COMMUNITY_REPORT_TABLE = "community_reports" -ENTITY_TABLE = "entities" -COMMUNITY_TABLE = "communities" -RELATIONSHIP_TABLE = "relationships" -COVARIATE_TABLE = "covariates" -TEXT_UNIT_TABLE = "text_units" +COMMUNITY_REPORT_TABLE = "create_final_community_reports" +ENTITY_TABLE = "create_final_nodes" +ENTITY_EMBEDDING_TABLE = "create_final_entities" +RELATIONSHIP_TABLE = "create_final_relationships" +COVARIATE_TABLE = "create_final_covariates" +TEXT_UNIT_TABLE = "create_final_text_units" COMMUNITY_LEVEL = 2
@@ -2318,9 +2318,9 @@ COMMUNITY_LEVEL = 2
# read nodes table to get community and degree data
 entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
-community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
+entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
 
-entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)
+entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)
 
 # load description embeddings to an in-memory lancedb vectorstore
 # to connect to a remote db, specify url and port values.
@@ -2334,9 +2334,9 @@ COMMUNITY_LEVEL = 2
# read nodes table to get community and degree data entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet") -community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet") +entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet") -entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL) +entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL) # load description embeddings to an in-memory lancedb vectorstore # to connect to a remote db, specify url and port values. @@ -2357,75 +2357,101 @@ entity_df.head()
- +
@@ -2482,74 +2508,101 @@ relationship_df.head()
- +
@@ -2601,76 +2654,9 @@ covariates = {"claims": claims}
-
@@ -2707,13 +2693,13 @@ File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
-reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
+reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
 
 print(f"Report records: {len(report_df)}")
 report_df.head()
 
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet") -reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL) +reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL) print(f"Report records: {len(report_df)}") report_df.head()
@@ -2727,74 +2713,137 @@ report_df.head()
- +
@@ -2851,74 +2900,101 @@ text_unit_df.head()
- +
@@ -3002,8 +3078,8 @@ text_embedder = OpenAIEmbedding(
-
-
+
+
-
- - -
@@ -3186,8 +3233,8 @@ llm_params = {
-
-
+
+
-
- - -
@@ -3297,15 +3319,111 @@ print(result.response)
+ +
+
+
@@ -3347,16 +3465,112 @@ print(result.response)
+ +
+
+
@@ -3626,8 +3840,8 @@ Cell In[19], line 1 -
-
+
+
-
- - -
@@ -3735,21 +3925,121 @@ print(candidate_questions.response)
+