entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
-entity_df = entity_df[
- (entity_df.type == "entity") & (entity_df.level <= f"level_{COMMUNITY_LEVEL}")
-]
-entity_df["community"] = entity_df["community"].fillna(-1)
-entity_df["community"] = entity_df["community"].astype(int)
-
-entity_df = entity_df.groupby(["title"]).agg({"community": "max"}).reset_index()
-entity_df["community"] = entity_df["community"].astype(str)
-filtered_community_df = entity_df.rename(columns={"community": "community_id"})[
- "community_id"
-].drop_duplicates()
-
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
-report_df = report_df[report_df.level <= f"level_{COMMUNITY_LEVEL}"]
-report_df["rank"] = report_df["rank"].fillna(-1)
-report_df["rank"] = report_df["rank"].astype(int)
-
-report_df = report_df.merge(filtered_community_df, on="community_id", how="inner")
-
-reports = read_community_reports(
- df=report_df,
- id_col="community_id",
- short_id_col="community_id",
- community_col="community_id",
- title_col="title",
- summary_col="summary",
- content_col="full_content",
- rank_col="rank",
- summary_embedding_col=None,
- content_embedding_col=None,
-)
+reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
print(f"Report records: {len(report_df)}")
report_df.head()
diff --git a/posts/query/notebooks/local_search_nb/index.html b/posts/query/notebooks/local_search_nb/index.html
index db0abba1..95405ae7 100644
--- a/posts/query/notebooks/local_search_nb/index.html
+++ b/posts/query/notebooks/local_search_nb/index.html
@@ -286,16 +286,15 @@ a {
import tiktoken
from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
-from graphrag.query.input.loaders.dfs import (
- read_community_reports,
- read_covariates,
- read_entities,
- read_relationships,
- read_text_units,
- store_entity_semantic_embeddings,
+from graphrag.query.indexer_adapters import (
+ read_indexer_covariates,
+ read_indexer_entities,
+ read_indexer_relationships,
+ read_indexer_reports,
+ read_indexer_text_units,
)
-from graphrag.query.input.retrieval.relationships import (
- calculate_relationship_combined_rank,
+from graphrag.query.input.loaders.dfs import (
+ store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
@@ -339,54 +338,9 @@ COMMUNITY_LEVEL =
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
-entity_df = entity_df[
- (entity_df.type == "entity") & (entity_df.level <= f"level_{COMMUNITY_LEVEL}")
-]
-entity_df = entity_df[["title", "degree", "community"]].rename(
- columns={"title": "name", "degree": "rank"}
-)
-
-entity_df["community"] = entity_df["community"].fillna(-1)
-entity_df["community"] = entity_df["community"].astype(int)
-entity_df["rank"] = entity_df["rank"].astype(int)
-
-# for duplicate entities, keep the one with the highest community level
-entity_df = entity_df.groupby(["name", "rank"]).agg({"community": "max"}).reset_index()
-entity_df["community"] = entity_df["community"].apply(lambda x: [str(x)])
-
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
-entity_embedding_df = entity_embedding_df[
- [
- "id",
- "human_readable_id",
- "name",
- "type",
- "description",
- "description_embedding",
- "text_unit_ids",
- ]
-]
-entity_df = entity_df.merge(
- entity_embedding_df, on="name", how="inner"
-).drop_duplicates(subset=["name"])
-
-# read entity dataframe to knowledge model objects
-entities = read_entities(
- df=entity_df,
- id_col="id",
- title_col="name",
- type_col="type",
- short_id_col="human_readable_id",
- description_col="description",
- community_col="community",
- rank_col="rank",
- name_embedding_col=None,
- description_embedding_col="description_embedding",
- graph_embedding_col=None,
- text_unit_ids_col="text_unit_ids",
- document_ids_col=None,
-)
+entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)
# load description embeddings to an in-memory qdrant vectorstore
# to connect to a remote db, specify url and port values.
@@ -409,39 +363,7 @@ entity_df.head
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
-relationship_df = relationship_df[
- [
- "id",
- "human_readable_id",
- "source",
- "target",
- "description",
- "weight",
- "text_unit_ids",
- ]
-]
-relationship_df["id"] = relationship_df["id"].astype(str)
-relationship_df["human_readable_id"] = relationship_df["human_readable_id"].astype(str)
-relationship_df["weight"] = relationship_df["weight"].astype(float)
-relationship_df["text_unit_ids"] = relationship_df["text_unit_ids"].apply(
- lambda x: x.split(",")
-)
-
-relationships = read_relationships(
- df=relationship_df,
- id_col="id",
- short_id_col="human_readable_id",
- source_col="source",
- target_col="target",
- description_col="description",
- weight_col="weight",
- description_embedding_col=None,
- text_unit_ids_col="text_unit_ids",
- document_ids_col=None,
-)
-relationships = calculate_relationship_combined_rank(
- relationships=relationships, entities=entities, ranking_attribute="rank"
-)
+relationships = read_indexer_relationships(relationship_df)
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()
@@ -452,59 +374,10 @@ relationship_df.head
- try:
- covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")
- covariate_df = (
- covariate_df[
- [
- "id",
- "human_readable_id",
- "type",
- "subject_id",
- "subject_type",
- "object_id",
- "status",
- "start_date",
- "end_date",
- "description",
- ]
- ],
- )
+ covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")
-except: # noqa: E722
- columns = [
- "id",
- "human_readable_id",
- "type",
- "subject_id",
- "object_id",
- "status",
- "start_date",
- "end_date",
- "description",
- ]
- covariate_df = pd.DataFrame({column: [] for column in columns})
+claims = read_indexer_covariates(covariate_df)
-covariate_df["id"] = covariate_df["id"].astype(str)
-covariate_df["human_readable_id"] = covariate_df["human_readable_id"].astype(str)
-
-claims = read_covariates(
- df=covariate_df,
- id_col="id",
- short_id_col="human_readable_id",
- subject_col="subject_id",
- subject_type_col=None,
- covariate_type_col="type",
- attributes_cols=[
- "object_id",
- "status",
- "start_date",
- "end_date",
- "description",
- ],
- text_unit_ids_col=None,
- document_ids_col=None,
-)
print(f"Claim records: {len(claims)}")
covariates = {"claims": claims}
@@ -515,71 +388,32 @@ covariates = Read community reports
- # get a list of communities from entity table
-community_df = entity_df[["community"]].copy()
-community_df["community_id"] = community_df["community"].apply(lambda x: str(x[0]))
-community_df = community_df[["community_id"]].drop_duplicates(subset=["community_id"])
-print(f"Community records: {len(community_df)}")
-
-
-
-
-
- report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
-report_df = report_df[report_df.level <= f"level_{COMMUNITY_LEVEL}"]
-
-report_df["rank"] = report_df["rank"].fillna(-1)
-report_df["rank"] = report_df["rank"].astype(int)
-
-report_df = report_df.merge(community_df, on="community_id", how="inner")
-
-reports = read_community_reports(
- df=report_df,
- id_col="community_id",
- short_id_col="community_id",
- community_col="community_id",
- title_col="title",
- summary_col="summary",
- content_col="full_content",
- rank_col="rank",
- summary_embedding_col=None,
- content_embedding_col=None,
-)
+ report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
+reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
print(f"Report records: {len(report_df)}")
report_df.head()
-
Read text units
- text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
+ text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
+text_units = read_indexer_text_units(text_unit_df)
-text_units = read_text_units(
- df=text_unit_df,
- id_col="id",
- short_id_col=None,
- text_col="text",
- embedding_col="text_embedding",
- entities_col=None,
- relationships_col=None,
- covariates_col=None,
-)
print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()
-
+
- api_key = os.environ["GRAPHRAG_API_KEY"]
+ api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
@@ -601,14 +435,14 @@ text_embedder = OpenAIEmbedding=20,
)
-
+
Create local search context builder
- context_builder = LocalSearchMixedContext(
+ context_builder = LocalSearchMixedContext(
community_reports=reports,
text_units=text_units,
entities=entities,
@@ -620,14 +454,14 @@ text_embedder = OpenAIEmbedding=token_encoder,
)
-
+
Create local search engine
- # text_unit_prop: proportion of context window dedicated to related text units
+ # text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
@@ -663,13 +497,13 @@ llm_params = "temperature": 0.0,
}
-
+
- search_engine = LocalSearch(
+ search_engine = LocalSearch(
llm=llm,
context_builder=context_builder,
token_encoder=token_encoder,
@@ -678,34 +512,42 @@ llm_params = ="multiple paragraphs", # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)
-
+
Run local search on sample queries
- result = await search_engine.asearch("Tell me about Agent Mercer")
+ result = await search_engine.asearch("Tell me about Agent Mercer")
+print(result.response)
+
+
+
+
+
+
+
+ question = "Tell me about Dr. Jordan Hayes"
+result = await search_engine.asearch(question)
print(result.response)
-
-
- question = "Tell me about Dr. Jordan Hayes"
-result = await search_engine.asearch(question)
-print(result.response)
-
-
-
-
-
Inspecting the context data used to generate the response
- result.context_data["entities"].head()
+ result.context_data["entities"].head()
+
+
+
+
+
+
+
+ result.context_data["relationships"].head()
@@ -713,7 +555,7 @@ result = await
- result.context_data["relationships"].head()
+ result.context_data["reports"].head()
@@ -721,25 +563,17 @@ result = await
- result.context_data["reports"].head()
+ result.context_data["sources"].head()
-
-
- result.context_data["sources"].head()
-
-
-
-
-
Question Generation
This function takes a list of user queries and generates the next candidate questions.
- question_generator = LocalQuestionGen(
+ question_generator = LocalQuestionGen(
llm=llm,
context_builder=context_builder,
token_encoder=token_encoder,
@@ -747,13 +581,13 @@ result = await
context_builder_params=local_context_params,
)
-
+
- question_history = [
+ question_history = [
"Tell me about Agent Mercer",
"What happens in Dulce military base?",
]
@@ -762,7 +596,7 @@ candidate_questions = )
print(candidate_questions.response)
-
+