diff --git a/posts/query/notebooks/global_search_nb/index.html b/posts/query/notebooks/global_search_nb/index.html index 336a465f..dc9a5587 100644 --- a/posts/query/notebooks/global_search_nb/index.html +++ b/posts/query/notebooks/global_search_nb/index.html @@ -287,7 +287,7 @@ a { import pandas as pd import tiktoken -from graphrag.query.input.loaders.dfs import read_community_reports +from graphrag.query.indexer_adapters import read_indexer_reports from graphrag.query.llm.oai.chat_openai import ChatOpenAI from graphrag.query.llm.oai.typing import OpenaiApiType from graphrag.query.structured_search.global_search.community_context import ( @@ -344,38 +344,9 @@ COMMUNITY_LEVEL =
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
-entity_df = entity_df[
-    (entity_df.type == "entity") & (entity_df.level <= f"level_{COMMUNITY_LEVEL}")
-]
-entity_df["community"] = entity_df["community"].fillna(-1)
-entity_df["community"] = entity_df["community"].astype(int)
-
-entity_df = entity_df.groupby(["title"]).agg({"community": "max"}).reset_index()
-entity_df["community"] = entity_df["community"].astype(str)
-filtered_community_df = entity_df.rename(columns={"community": "community_id"})[
-    "community_id"
-].drop_duplicates()
-
 report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
-report_df = report_df[report_df.level <= f"level_{COMMUNITY_LEVEL}"]
 
-report_df["rank"] = report_df["rank"].fillna(-1)
-report_df["rank"] = report_df["rank"].astype(int)
-
-report_df = report_df.merge(filtered_community_df, on="community_id", how="inner")
-
-reports = read_community_reports(
-    df=report_df,
-    id_col="community_id",
-    short_id_col="community_id",
-    community_col="community_id",
-    title_col="title",
-    summary_col="summary",
-    content_col="full_content",
-    rank_col="rank",
-    summary_embedding_col=None,
-    content_embedding_col=None,
-)
+reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
 
 print(f"Report records: {len(report_df)}")
 report_df.head()
diff --git a/posts/query/notebooks/local_search_nb/index.html b/posts/query/notebooks/local_search_nb/index.html index db0abba1..95405ae7 100644 --- a/posts/query/notebooks/local_search_nb/index.html +++ b/posts/query/notebooks/local_search_nb/index.html @@ -286,16 +286,15 @@ a { import tiktoken from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey -from graphrag.query.input.loaders.dfs import ( - read_community_reports, - read_covariates, - read_entities, - read_relationships, - read_text_units, - store_entity_semantic_embeddings, +from graphrag.query.indexer_adapters import ( + read_indexer_covariates, + read_indexer_entities, + read_indexer_relationships, + read_indexer_reports, + read_indexer_text_units, ) -from graphrag.query.input.retrieval.relationships import ( - calculate_relationship_combined_rank, +from graphrag.query.input.loaders.dfs import ( + store_entity_semantic_embeddings, ) from graphrag.query.llm.oai.chat_openai import ChatOpenAI from graphrag.query.llm.oai.embedding import OpenAIEmbedding @@ -339,54 +338,9 @@ COMMUNITY_LEVEL =
# read nodes table to get community and degree data
 entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
-entity_df = entity_df[
-    (entity_df.type == "entity") & (entity_df.level <= f"level_{COMMUNITY_LEVEL}")
-]
-entity_df = entity_df[["title", "degree", "community"]].rename(
-    columns={"title": "name", "degree": "rank"}
-)
-
-entity_df["community"] = entity_df["community"].fillna(-1)
-entity_df["community"] = entity_df["community"].astype(int)
-entity_df["rank"] = entity_df["rank"].astype(int)
-
-# for duplicate entities, keep the one with the highest community level
-entity_df = entity_df.groupby(["name", "rank"]).agg({"community": "max"}).reset_index()
-entity_df["community"] = entity_df["community"].apply(lambda x: [str(x)])
-
 entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")
-entity_embedding_df = entity_embedding_df[
-    [
-        "id",
-        "human_readable_id",
-        "name",
-        "type",
-        "description",
-        "description_embedding",
-        "text_unit_ids",
-    ]
-]
 
-entity_df = entity_df.merge(
-    entity_embedding_df, on="name", how="inner"
-).drop_duplicates(subset=["name"])
-
-# read entity dataframe to knowledge model objects
-entities = read_entities(
-    df=entity_df,
-    id_col="id",
-    title_col="name",
-    type_col="type",
-    short_id_col="human_readable_id",
-    description_col="description",
-    community_col="community",
-    rank_col="rank",
-    name_embedding_col=None,
-    description_embedding_col="description_embedding",
-    graph_embedding_col=None,
-    text_unit_ids_col="text_unit_ids",
-    document_ids_col=None,
-)
+entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)
 
 # load description embeddings to an in-memory qdrant vectorstore
 # to connect to a remote db, specify url and port values.
@@ -409,39 +363,7 @@ entity_df.head
   
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
-relationship_df = relationship_df[
-    [
-        "id",
-        "human_readable_id",
-        "source",
-        "target",
-        "description",
-        "weight",
-        "text_unit_ids",
-    ]
-]
-relationship_df["id"] = relationship_df["id"].astype(str)
-relationship_df["human_readable_id"] = relationship_df["human_readable_id"].astype(str)
-relationship_df["weight"] = relationship_df["weight"].astype(float)
-relationship_df["text_unit_ids"] = relationship_df["text_unit_ids"].apply(
-    lambda x: x.split(",")
-)
-
-relationships = read_relationships(
-    df=relationship_df,
-    id_col="id",
-    short_id_col="human_readable_id",
-    source_col="source",
-    target_col="target",
-    description_col="description",
-    weight_col="weight",
-    description_embedding_col=None,
-    text_unit_ids_col="text_unit_ids",
-    document_ids_col=None,
-)
-relationships = calculate_relationship_combined_rank(
-    relationships=relationships, entities=entities, ranking_attribute="rank"
-)
+relationships = read_indexer_relationships(relationship_df)
 
 print(f"Relationship count: {len(relationship_df)}")
 relationship_df.head()
@@ -452,59 +374,10 @@ relationship_df.head -
try:
-    covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")
-    covariate_df = (
-        covariate_df[
-            [
-                "id",
-                "human_readable_id",
-                "type",
-                "subject_id",
-                "subject_type",
-                "object_id",
-                "status",
-                "start_date",
-                "end_date",
-                "description",
-            ]
-        ],
-    )
+  
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")
 
-except:  # noqa: E722
-    columns = [
-        "id",
-        "human_readable_id",
-        "type",
-        "subject_id",
-        "object_id",
-        "status",
-        "start_date",
-        "end_date",
-        "description",
-    ]
-    covariate_df = pd.DataFrame({column: [] for column in columns})
+claims = read_indexer_covariates(covariate_df)
 
-covariate_df["id"] = covariate_df["id"].astype(str)
-covariate_df["human_readable_id"] = covariate_df["human_readable_id"].astype(str)
-
-claims = read_covariates(
-    df=covariate_df,
-    id_col="id",
-    short_id_col="human_readable_id",
-    subject_col="subject_id",
-    subject_type_col=None,
-    covariate_type_col="type",
-    attributes_cols=[
-        "object_id",
-        "status",
-        "start_date",
-        "end_date",
-        "description",
-    ],
-    text_unit_ids_col=None,
-    document_ids_col=None,
-)
 print(f"Claim records: {len(claims)}")
 covariates = {"claims": claims}
@@ -515,71 +388,32 @@ covariates = Read community reports
-
# get a list of communities from entity table
-community_df = entity_df[["community"]].copy()
-community_df["community_id"] = community_df["community"].apply(lambda x: str(x[0]))
-community_df = community_df[["community_id"]].drop_duplicates(subset=["community_id"])
-print(f"Community records: {len(community_df)}")
- - -
- -
-
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
-report_df = report_df[report_df.level <= f"level_{COMMUNITY_LEVEL}"]
-
-report_df["rank"] = report_df["rank"].fillna(-1)
-report_df["rank"] = report_df["rank"].astype(int)
-
-report_df = report_df.merge(community_df, on="community_id", how="inner")
-
-reports = read_community_reports(
-    df=report_df,
-    id_col="community_id",
-    short_id_col="community_id",
-    community_col="community_id",
-    title_col="title",
-    summary_col="summary",
-    content_col="full_content",
-    rank_col="rank",
-    summary_embedding_col=None,
-    content_embedding_col=None,
-)
+  
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
+reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
 
 print(f"Report records: {len(report_df)}")
 report_df.head()
-

Read text units

-
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
+  
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
+text_units = read_indexer_text_units(text_unit_df)
 
-text_units = read_text_units(
-    df=text_unit_df,
-    id_col="id",
-    short_id_col=None,
-    text_col="text",
-    embedding_col="text_embedding",
-    entities_col=None,
-    relationships_col=None,
-    covariates_col=None,
-)
 print(f"Text unit records: {len(text_unit_df)}")
 text_unit_df.head()
-
-
api_key = os.environ["GRAPHRAG_API_KEY"]
+  
api_key = os.environ["GRAPHRAG_API_KEY"]
 llm_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
 embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
 
@@ -601,14 +435,14 @@ text_embedder = OpenAIEmbedding=20,
 )
-

Create local search context builder

-
context_builder = LocalSearchMixedContext(
+  
context_builder = LocalSearchMixedContext(
     community_reports=reports,
     text_units=text_units,
     entities=entities,
@@ -620,14 +454,14 @@ text_embedder = OpenAIEmbedding=token_encoder,
 )
-

Create local search engine

-
# text_unit_prop: proportion of context window dedicated to related text units
+  
# text_unit_prop: proportion of context window dedicated to related text units
 # community_prop: proportion of context window dedicated to community reports.
 # The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
 # conversation_history_max_turns: maximum number of turns to include in the conversation history.
@@ -663,13 +497,13 @@ llm_params = "temperature": 0.0,
 }
-
-
search_engine = LocalSearch(
+  
search_engine = LocalSearch(
     llm=llm,
     context_builder=context_builder,
     token_encoder=token_encoder,
@@ -678,34 +512,42 @@ llm_params = ="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
 )
-

Run local search on sample queries

-
result = await search_engine.asearch("Tell me about Agent Mercer")
+  
result = await search_engine.asearch("Tell me about Agent Mercer")
+print(result.response)
+ + +
+ +
+
question = "Tell me about Dr. Jordan Hayes"
+result = await search_engine.asearch(question)
 print(result.response)
- -
-
question = "Tell me about Dr. Jordan Hayes"
-result = await search_engine.asearch(question)
-print(result.response)
- - -

Inspecting the context data used to generate the response

-
result.context_data["entities"].head()
+
result.context_data["entities"].head()
+ + +
+ +
+
result.context_data["relationships"].head()
- -
-
result.context_data["sources"].head()
- - -

Question Generation

This function takes a list of user queries and generates the next candidate questions.

-
question_generator = LocalQuestionGen(
+  
question_generator = LocalQuestionGen(
     llm=llm,
     context_builder=context_builder,
     token_encoder=token_encoder,
@@ -747,13 +581,13 @@ result = await=local_context_params,
 )
-
-
question_history = [
+  
question_history = [
     "Tell me about Agent Mercer",
     "What happens in Dulce military base?",
 ]
@@ -762,7 +596,7 @@ candidate_questions = )
 print(candidate_questions.response)
-