diff --git a/config/yaml/index.html b/config/yaml/index.html
index c11bd831..19e1e00c 100644
--- a/config/yaml/index.html
+++ b/config/yaml/index.html
@@ -1590,8 +1590,6 @@

encoding_model

diff --git a/examples_notebooks/drift_search/index.html b/examples_notebooks/drift_search/index.html
index 70d1ad36..0bb2efcd 100644
--- a/examples_notebooks/drift_search/index.html
+++ b/examples_notebooks/drift_search/index.html
@@ -2015,7 +2015,7 @@ Text unit records: 50
-[2024-12-04T22:53:08Z WARN  lance::dataset] No existing dataset at /home/runner/work/graphrag/graphrag/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance, it will be created
+[2024-12-05T17:58:22Z WARN  lance::dataset] No existing dataset at /home/runner/work/graphrag/graphrag/docs/examples_notebooks/inputs/operation dulce/lancedb/default-entity-description.lance, it will be created
 
@@ -2287,15 +2287,130 @@ reports = read_indexer_reports(
-Embeddings saved to inputs/operation dulce/create_final_community_reports_with_embeddings.parquet
+Error embedding chunk {'OpenAIEmbedding': "Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}"}
 
+
+
+
+
+---------------------------------------------------------------------------
+ZeroDivisionError                         Traceback (most recent call last)
+Cell In[4], line 31
+     27     print(f"Embeddings file already exists at {output_path}")
+     28     return pd.read_parquet(output_path)
+---> 31 report_df = embed_community_reports(INPUT_DIR, text_embedder)
+     32 reports = read_indexer_reports(
+     33     report_df,
+     34     entity_df,
+     35     COMMUNITY_LEVEL,
+     36     content_embedding_col="full_content_embeddings",
+     37 )
+
+Cell In[4], line 19, in embed_community_reports(input_dir, embedder, community_report_table)
+     16     error_msg = f"'full_content' column not found in {input_path}"
+     17     raise ValueError(error_msg)
+---> 19 report_df["full_content_embeddings"] = report_df.loc[:, "full_content"].apply(
+     20     lambda x: embedder.embed(x)
+     21 )
+     23 # Save the DataFrame with embeddings to the output path
+     24 report_df.to_parquet(output_path)
+
+File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/pandas/core/series.py:4924, in Series.apply(self, func, convert_dtype, args, by_row, **kwargs)
+   4789 def apply(
+   4790     self,
+   4791     func: AggFuncType,
+   (...)
+   4796     **kwargs,
+   4797 ) -> DataFrame | Series:
+   4798     """
+   4799     Invoke function on values of Series.
+   4800 
+   (...)
+   4915     dtype: float64
+   4916     """
+   4917     return SeriesApply(
+   4918         self,
+   4919         func,
+   4920         convert_dtype=convert_dtype,
+   4921         by_row=by_row,
+   4922         args=args,
+   4923         kwargs=kwargs,
+-> 4924     ).apply()
+
+File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/pandas/core/apply.py:1427, in SeriesApply.apply(self)
+   1424     return self.apply_compat()
+   1426 # self.func is Callable
+-> 1427 return self.apply_standard()
+
+File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/pandas/core/apply.py:1507, in SeriesApply.apply_standard(self)
+   1501 # row-wise access
+   1502 # apply doesn't have a `na_action` keyword and for backward compat reasons
+   1503 # we need to give `na_action="ignore"` for categorical data.
+   1504 # TODO: remove the `na_action="ignore"` when that default has been changed in
+   1505 #  Categorical (GH51645).
+   1506 action = "ignore" if isinstance(obj.dtype, CategoricalDtype) else None
+-> 1507 mapped = obj._map_values(
+   1508     mapper=curried, na_action=action, convert=self.convert_dtype
+   1509 )
+   1511 if len(mapped) and isinstance(mapped[0], ABCSeries):
+   1512     # GH#43986 Need to do list(mapped) in order to get treated as nested
+   1513     #  See also GH#25959 regarding EA support
+   1514     return obj._constructor_expanddim(list(mapped), index=obj.index)
+
+File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/pandas/core/base.py:921, in IndexOpsMixin._map_values(self, mapper, na_action, convert)
+    918 if isinstance(arr, ExtensionArray):
+    919     return arr.map(mapper, na_action=na_action)
+--> 921 return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)
+
+File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/pandas/core/algorithms.py:1743, in map_array(arr, mapper, na_action, convert)
+   1741 values = arr.astype(object, copy=False)
+   1742 if na_action is None:
+-> 1743     return lib.map_infer(values, mapper, convert=convert)
+   1744 else:
+   1745     return lib.map_infer_mask(
+   1746         values, mapper, mask=isna(values).view(np.uint8), convert=convert
+   1747     )
+
+File lib.pyx:2972, in pandas._libs.lib.map_infer()
+
+Cell In[4], line 20, in embed_community_reports.<locals>.<lambda>(x)
+     16     error_msg = f"'full_content' column not found in {input_path}"
+     17     raise ValueError(error_msg)
+     19 report_df["full_content_embeddings"] = report_df.loc[:, "full_content"].apply(
+---> 20     lambda x: embedder.embed(x)
+     21 )
+     23 # Save the DataFrame with embeddings to the output path
+     24 report_df.to_parquet(output_path)
+
+File ~/work/graphrag/graphrag/graphrag/query/llm/oai/embedding.py:96, in OpenAIEmbedding.embed(self, text, **kwargs)
+     90         self._reporter.error(
+     91             message="Error embedding chunk",
+     92             details={self.__class__.__name__: str(e)},
+     93         )
+     95         continue
+---> 96 chunk_embeddings = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
+     97 chunk_embeddings = chunk_embeddings / np.linalg.norm(chunk_embeddings)
+     98 return chunk_embeddings.tolist()
+
+File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/numpy/lib/function_base.py:550, in average(a, axis, weights, returned, keepdims)
+    548     scl = wgt.sum(axis=axis, dtype=result_dtype, **keepdims_kw)
+    549     if np.any(scl == 0.0):
+--> 550         raise ZeroDivisionError(
+    551             "Weights sum to zero, can't be normalized")
+    553     avg = avg_as_array = np.multiply(a, wgt,
+    554                       dtype=result_dtype).sum(axis, **keepdims_kw) / scl
+    556 if returned:
+
+ZeroDivisionError: Weights sum to zero, can't be normalized
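The `ZeroDivisionError` above is a downstream symptom of the 429 `insufficient_quota` error, not an independent bug: `OpenAIEmbedding.embed` logs each failed chunk and continues, so when every chunk fails both `chunk_embeddings` and `chunk_lens` end up empty, and `np.average` raises because its weights sum to zero. A minimal sketch of a defensive guard, written as a standalone helper for illustration rather than the library's actual code:

```python
import numpy as np

def average_chunk_embeddings(chunk_embeddings: list[list[float]],
                             chunk_lens: list[int]) -> list[float] | None:
    """Length-weighted average of per-chunk embeddings, guarding the case
    where every chunk failed to embed (e.g. the quota error above)."""
    if not chunk_embeddings or sum(chunk_lens) == 0:
        # np.average(..., weights=chunk_lens) would raise
        # "Weights sum to zero, can't be normalized" here.
        return None
    avg = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
    return (avg / np.linalg.norm(avg)).tolist()
```

Callers would then need to handle a `None` embedding instead of crashing deep inside `np.average`.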
-
-
+
+
+
+
+
+
+
+
+
+
+
@@ -2378,345 +2522,13 @@ search = DRIFTSearch(
@@ -2752,3316 +2564,16 @@ search = DRIFTSearch(
-
@@ -3409,26 +3495,113 @@ print(result.response)
-### Overview of Dr. Jordan Hayes
-
-Dr. Jordan Hayes is a prominent scientist at Dulce Base, playing a crucial role in the Paranormal Military Squad's efforts to understand and communicate with extraterrestrial entities. Dr. Hayes's work is primarily focused on decoding and analyzing alien signals and codes, which are essential for interstellar communication and potentially interspecies interaction. This involves decrypting algorithms, interpreting cosmic signals, and analyzing the implications of alien society [Data: Entities (2, 17, 47); Relationships (201, 322, 283)].
-
-### Key Contributions and Expertise
-
-Dr. Hayes is noted for their adaptability and skepticism, qualities that are essential given the uncertainties and unknown challenges of their mission. They are deeply involved in the scientific exploration aspects of Operation: Dulce, where their efforts are on the verge of a significant scientific breakthrough. Dr. Hayes leads efforts in isolating and understanding complex alien signals that resemble human cognition, suggesting that these signals are artificial and patterned, indicating a tandem evolution with humanity [Data: Entities (2, 17); Claims (60, 83, 91, 134)].
-
-### Collaborative Efforts and Leadership
-
-In addition to their scientific endeavors, Dr. Hayes is involved in setting up lab stations, operating the mainframe, and playing a crucial role in the command center at Dulce Base. They work closely with colleagues like Alex Mercer, engaging in thoughtful dialogue and showing analytical thinking about the mission's uncertainties. Dr. Hayes's attention to detail is also evident in their discovery of significant panels among secured doorways and their contemplation of the mission's broader ramifications [Data: Entities (2); Relationships (26, 270, 254)].
-
-### Challenges and Philosophical Reflections
-
-Dr. Hayes's work is not without its challenges. They have reflected on their own skepticism and its potential as a blind spot, indicating a moment of self-awareness and growth. This introspection is crucial as they navigate the complexities of alien communication and the profound implications of their findings on physics and the possibilities of their mission [Data: Claims (13, 2); Sources (4, 42)].
-
-In summary, Dr. Jordan Hayes is a vital asset to the Paranormal Military Squad, contributing significantly to the understanding of extraterrestrial communications and the potential for interstellar diplomacy. Their work at Dulce Base is characterized by a blend of scientific rigor, adaptability, and philosophical inquiry, making them a key figure in the ongoing exploration of the unknown.
+Error embedding chunk {'OpenAIEmbedding': "Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}"}
 
+
+
+
+
+---------------------------------------------------------------------------
+ZeroDivisionError                         Traceback (most recent call last)
+Cell In[14], line 2
+      1 question = "Tell me about Dr. Jordan Hayes"
+----> 2 result = await search_engine.asearch(question)
+      3 print(result.response)
+
+File ~/work/graphrag/graphrag/graphrag/query/structured_search/local_search/search.py:67, in LocalSearch.asearch(self, query, conversation_history, **kwargs)
+     65 search_prompt = ""
+     66 llm_calls, prompt_tokens, output_tokens = {}, {}, {}
+---> 67 context_result = self.context_builder.build_context(
+     68     query=query,
+     69     conversation_history=conversation_history,
+     70     **kwargs,
+     71     **self.context_builder_params,
+     72 )
+     73 llm_calls["build_context"] = context_result.llm_calls
+     74 prompt_tokens["build_context"] = context_result.prompt_tokens
+
+File ~/work/graphrag/graphrag/graphrag/query/structured_search/local_search/mixed_context.py:139, in LocalSearchMixedContext.build_context(self, query, conversation_history, include_entity_names, exclude_entity_names, conversation_history_max_turns, conversation_history_user_turns_only, max_tokens, text_unit_prop, community_prop, top_k_mapped_entities, top_k_relationships, include_community_rank, include_entity_rank, rank_description, include_relationship_weight, relationship_ranking_attribute, return_candidate_context, use_community_summary, min_community_rank, community_context_name, column_delimiter, **kwargs)
+    134     pre_user_questions = "\n".join(
+    135         conversation_history.get_user_turns(conversation_history_max_turns)
+    136     )
+    137     query = f"{query}\n{pre_user_questions}"
+--> 139 selected_entities = map_query_to_entities(
+    140     query=query,
+    141     text_embedding_vectorstore=self.entity_text_embeddings,
+    142     text_embedder=self.text_embedder,
+    143     all_entities_dict=self.entities,
+    144     embedding_vectorstore_key=self.embedding_vectorstore_key,
+    145     include_entity_names=include_entity_names,
+    146     exclude_entity_names=exclude_entity_names,
+    147     k=top_k_mapped_entities,
+    148     oversample_scaler=2,
+    149 )
+    151 # build context
+    152 final_context = list[str]()
+
+File ~/work/graphrag/graphrag/graphrag/query/context_builder/entity_extraction.py:58, in map_query_to_entities(query, text_embedding_vectorstore, text_embedder, all_entities_dict, embedding_vectorstore_key, include_entity_names, exclude_entity_names, k, oversample_scaler)
+     54 matched_entities = []
+     55 if query != "":
+     56     # get entities with highest semantic similarity to query
+     57     # oversample to account for excluded entities
+---> 58     search_results = text_embedding_vectorstore.similarity_search_by_text(
+     59         text=query,
+     60         text_embedder=lambda t: text_embedder.embed(t),
+     61         k=k * oversample_scaler,
+     62     )
+     63     for result in search_results:
+     64         if embedding_vectorstore_key == EntityVectorStoreKey.ID and isinstance(
+     65             result.document.id, str
+     66         ):
+
+File ~/work/graphrag/graphrag/graphrag/vector_stores/lancedb.py:134, in LanceDBVectorStore.similarity_search_by_text(self, text, text_embedder, k, **kwargs)
+    130 def similarity_search_by_text(
+    131     self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any
+    132 ) -> list[VectorStoreSearchResult]:
+    133     """Perform a similarity search using a given input text."""
+--> 134     query_embedding = text_embedder(text)
+    135     if query_embedding:
+    136         return self.similarity_search_by_vector(query_embedding, k)
+
+File ~/work/graphrag/graphrag/graphrag/query/context_builder/entity_extraction.py:60, in map_query_to_entities.<locals>.<lambda>(t)
+     54 matched_entities = []
+     55 if query != "":
+     56     # get entities with highest semantic similarity to query
+     57     # oversample to account for excluded entities
+     58     search_results = text_embedding_vectorstore.similarity_search_by_text(
+     59         text=query,
+---> 60         text_embedder=lambda t: text_embedder.embed(t),
+     61         k=k * oversample_scaler,
+     62     )
+     63     for result in search_results:
+     64         if embedding_vectorstore_key == EntityVectorStoreKey.ID and isinstance(
+     65             result.document.id, str
+     66         ):
+
+File ~/work/graphrag/graphrag/graphrag/query/llm/oai/embedding.py:96, in OpenAIEmbedding.embed(self, text, **kwargs)
+     90         self._reporter.error(
+     91             message="Error embedding chunk",
+     92             details={self.__class__.__name__: str(e)},
+     93         )
+     95         continue
+---> 96 chunk_embeddings = np.average(chunk_embeddings, axis=0, weights=chunk_lens)
+     97 chunk_embeddings = chunk_embeddings / np.linalg.norm(chunk_embeddings)
+     98 return chunk_embeddings.tolist()
+
+File ~/.cache/pypoetry/virtualenvs/graphrag-F2jvqev7-py3.11/lib/python3.11/site-packages/numpy/lib/function_base.py:550, in average(a, axis, weights, returned, keepdims)
+    548     scl = wgt.sum(axis=axis, dtype=result_dtype, **keepdims_kw)
+    549     if np.any(scl == 0.0):
+--> 550         raise ZeroDivisionError(
+    551             "Weights sum to zero, can't be normalized")
+    553     avg = avg_as_array = np.multiply(a, wgt,
+    554                       dtype=result_dtype).sum(axis, **keepdims_kw) / scl
+    556 if returned:
+
+ZeroDivisionError: Weights sum to zero, can't be normalized
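Both tracebacks share one root cause: every embedding request returned HTTP 429 with code `insufficient_quota`. Unlike transient rate limiting, an exhausted quota is a billing problem, so retrying cannot succeed; the fix is adding credit or switching keys or endpoints. A hedged sketch of how a caller might separate the two 429 cases — the wrapper, its retry policy, and the model name are illustrative assumptions, not part of graphrag:

```python
import time

import openai

def embed_with_backoff(client: openai.OpenAI, text: str,
                       model: str = "text-embedding-3-small",
                       max_retries: int = 5) -> list[float]:
    """Retry transient 429 rate limits with exponential backoff,
    but fail fast when the account quota is exhausted."""
    for attempt in range(max_retries):
        try:
            return client.embeddings.create(input=text, model=model).data[0].embedding
        except openai.RateLimitError as e:
            if "insufficient_quota" in str(e):
                raise  # out of credit: retrying cannot help
            time.sleep(2 ** attempt)  # transient rate limit: back off and retry
    raise RuntimeError(f"embedding failed after {max_retries} retries")
```

Failing fast on `insufficient_quota` keeps the error visible at the call site instead of surfacing later as the misleading `ZeroDivisionError` seen above.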
+
+
@@ -3473,78 +3646,16 @@ In summary, Dr. Jordan Hayes is a vital asset to the Paranormal Military Squad,
-
-
-
@@ -3580,97 +3691,23 @@ In summary, Dr. Jordan Hayes is a vital asset to the Paranormal Military Squad,
-
-
-
-
-
-
+
+
+
+
+
+
@@ -3728,60 +3784,16 @@ In summary, Dr. Jordan Hayes is a vital asset to the Paranormal Military Squad,
@@ -3821,21 +3833,15 @@ In summary, Dr. Jordan Hayes is a vital asset to the Paranormal Military Squad,
@@ -3949,10 +3955,122 @@ print(candidate_questions.response)
+
+
+
+
diff --git a/search/search_index.json b/search/search_index.json
index ca6ec13e..e9b33690 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config": {"lang": ["en"], "separator": "[\\s\\-]+", "pipeline": ["stopWordFilter"]}, "docs": [{"location": "", "title": "Welcome to GraphRAG", "text": "

👉 Microsoft Research Blog Post 👉 GraphRAG Accelerator 👉 GraphRAG Arxiv

Figure 1: An LLM-generated knowledge graph built using GPT-4 Turbo.

GraphRAG is a structured, hierarchical approach to Retrieval Augmented Generation (RAG), as opposed to naive semantic-search approaches using plain text snippets. The GraphRAG process involves extracting a knowledge graph out of raw text, building a community hierarchy, generating summaries for these communities, and then leveraging these structures when performing RAG-based tasks.

To learn more about GraphRAG and how it can be used to enhance your LLM's ability to reason about your private data, please visit the Microsoft Research Blog Post.

"}, {"location": "#solution-accelerator", "title": "Solution Accelerator \ud83d\ude80", "text": "

For a quick start with the GraphRAG system, we recommend trying the Solution Accelerator package. It provides a user-friendly end-to-end experience with Azure resources.

"}, {"location": "#get-started-with-graphrag", "title": "Get Started with GraphRAG \ud83d\ude80", "text": "

To start using GraphRAG, check out the Get Started guide. For a deeper dive into the main sub-systems, please visit the doc pages for the Indexer and Query packages.

"}, {"location": "#graphrag-vs-baseline-rag", "title": "GraphRAG vs Baseline RAG \ud83d\udd0d", "text": "

Retrieval-Augmented Generation (RAG) is a technique for improving LLM outputs using real-world information. This technique is an important part of most LLM-based tools, and the majority of RAG approaches use vector similarity as the search technique, which we call Baseline RAG. GraphRAG uses knowledge graphs to provide substantial improvements in question-and-answer performance when reasoning about complex information. RAG techniques have shown promise in helping LLMs reason about private datasets - data that the LLM is not trained on and has never seen before, such as an enterprise's proprietary research, business documents, or communications. Baseline RAG was created to help solve this problem, but we observe situations where it performs very poorly. For example, Baseline RAG struggles to connect the dots when answering a question requires traversing disparate pieces of information through their shared attributes, and it performs poorly when asked to holistically understand summarized semantic concepts over large data collections or even singular large documents.

To address this, the tech community is working to develop methods that extend and enhance RAG. Microsoft Research's new approach, GraphRAG, uses LLMs to create a knowledge graph based on an input corpus. This graph, along with community summaries and graph machine learning outputs, is used to augment prompts at query time. GraphRAG shows substantial improvement in answering the two classes of questions described above, demonstrating intelligence or mastery that outperforms other approaches previously applied to private datasets.

"}, {"location": "#the-graphrag-process", "title": "The GraphRAG Process \ud83e\udd16", "text": "

GraphRAG builds upon our prior research and tooling using graph machine learning. The basic steps of the GraphRAG process are as follows:

"}, {"location": "#index", "title": "Index", "text": ""}, {"location": "#query", "title": "Query", "text": "

At query time, these structures are used to provide materials for the LLM context window when answering a question. The primary query modes are:

"}, {"location": "#prompt-tuning", "title": "Prompt Tuning", "text": "

Using GraphRAG with your data out of the box may not yield the best possible results. We strongly recommend fine-tuning your prompts following the Prompt Tuning Guide in our documentation.

"}, {"location": "blog_posts/", "title": "Microsoft Research Blog", "text": "