Snapshot full graph (#1889)
Some checks are pending
gh-pages / build (push) Waiting to run
Python CI / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python CI / python-ci (ubuntu-latest, 3.11) (push) Waiting to run
Python CI / python-ci (windows-latest, 3.10) (push) Waiting to run
Python CI / python-ci (windows-latest, 3.11) (push) Waiting to run
Python Integration Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Integration Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Python Notebook Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Notebook Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Python Publish (pypi) / Upload release to PyPI (push) Waiting to run
Python Smoke Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Smoke Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Spellcheck / spellcheck (push) Waiting to run

* Snapshot un-merged entities and relationships

* Semver

* Fix raw df modification
This commit is contained in:
Nathan Evans 2025-04-25 14:14:48 -07:00 committed by GitHub
parent e2a448170a
commit 25b605b6cd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 22 additions and 3 deletions

View File

@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Add option to snapshot raw extracted graph tables."
}

View File

@ -339,6 +339,7 @@ class SnapshotsDefaults:
embeddings: bool = False
graphml: bool = False
raw_graph: bool = False
@dataclass

View File

@ -19,3 +19,7 @@ class SnapshotsConfig(BaseModel):
description="A flag indicating whether to take snapshots of GraphML.",
default=graphrag_config_defaults.snapshots.graphml,
)
raw_graph: bool = Field(
description="A flag indicating whether to take snapshots of the raw extracted graph (entities and relationships) before merging.",
default=graphrag_config_defaults.snapshots.raw_graph,
)

View File

@ -43,7 +43,7 @@ async def run_workflow(
config.root_dir, summarization_llm_settings
)
entities, relationships = await extract_graph(
entities, relationships, raw_entities, raw_relationships = await extract_graph(
text_units=text_units,
callbacks=context.callbacks,
cache=context.cache,
@ -58,6 +58,12 @@ async def run_workflow(
await write_table_to_storage(entities, "entities", context.storage)
await write_table_to_storage(relationships, "relationships", context.storage)
if config.snapshots.raw_graph:
await write_table_to_storage(raw_entities, "raw_entities", context.storage)
await write_table_to_storage(
raw_relationships, "raw_relationships", context.storage
)
return WorkflowFunctionOutput(
result={
"entities": entities,
@ -76,7 +82,7 @@ async def extract_graph(
entity_types: list[str] | None = None,
summarization_strategy: dict[str, Any] | None = None,
summarization_num_threads: int = 4,
) -> tuple[pd.DataFrame, pd.DataFrame]:
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""All the steps to create the base entity graph."""
# this returns a graph for each text unit, to be merged later
extracted_entities, extracted_relationships = await extractor(
@ -103,6 +109,10 @@ async def extract_graph(
callbacks.error(error_msg)
raise ValueError(error_msg)
# copy these as-is, before any summarization, so the raw snapshot tables are returned unmodified
raw_entities = extracted_entities.copy()
raw_relationships = extracted_relationships.copy()
entities, relationships = await get_summarized_entities_relationships(
extracted_entities=extracted_entities,
extracted_relationships=extracted_relationships,
@ -112,7 +122,7 @@ async def extract_graph(
summarization_num_threads=summarization_num_threads,
)
return (entities, relationships)
return (entities, relationships, raw_entities, raw_relationships)
async def get_summarized_entities_relationships(