Snapshot full graph (#1889)

* Snapshot un-merged entities and relationships
* Semver
* Fix raw df modification
2026-01-13 16:47:20 +08:00 · 2025-04-25 14:14:48 -07:00 · 2025-04-25 14:14:48 -07:00 · 25b605b6cd
commit 25b605b6cd
parent e2a448170a
4 changed files with 22 additions and 3 deletions
--- a/.semversioner/next-release/patch-20250422215029679348.json
+++ b/.semversioner/next-release/patch-20250422215029679348.json
@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Add option to snapshot raw extracted graph tables."
+}
--- a/graphrag/config/defaults.py
+++ b/graphrag/config/defaults.py
@ -339,6 +339,7 @@ class SnapshotsDefaults:

    embeddings: bool = False
    graphml: bool = False
+    raw_graph: bool = False


@dataclass
--- a/graphrag/config/models/snapshots_config.py
+++ b/graphrag/config/models/snapshots_config.py
@ -19,3 +19,7 @@ class SnapshotsConfig(BaseModel):
        description="A flag indicating whether to take snapshots of GraphML.",
        default=graphrag_config_defaults.snapshots.graphml,
    )
+    raw_graph: bool = Field(
+        description="A flag indicating whether to take snapshots of the raw extracted graph (entities and relationships) before merging.",
+        default=graphrag_config_defaults.snapshots.raw_graph,
+    )
--- a/graphrag/index/workflows/extract_graph.py
+++ b/graphrag/index/workflows/extract_graph.py
@ -43,7 +43,7 @@ async def run_workflow(
        config.root_dir, summarization_llm_settings
    )

-    entities, relationships = await extract_graph(
+    entities, relationships, raw_entities, raw_relationships = await extract_graph(
        text_units=text_units,
        callbacks=context.callbacks,
        cache=context.cache,
@ -58,6 +58,12 @@ async def run_workflow(
    await write_table_to_storage(entities, "entities", context.storage)
    await write_table_to_storage(relationships, "relationships", context.storage)

+    if config.snapshots.raw_graph:
+        await write_table_to_storage(raw_entities, "raw_entities", context.storage)
+        await write_table_to_storage(
+            raw_relationships, "raw_relationships", context.storage
+        )
+
    return WorkflowFunctionOutput(
        result={
            "entities": entities,
@ -76,7 +82,7 @@ async def extract_graph(
    entity_types: list[str] | None = None,
    summarization_strategy: dict[str, Any] | None = None,
    summarization_num_threads: int = 4,
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """All the steps to create the base entity graph."""
    # this returns a graph for each text unit, to be merged later
    extracted_entities, extracted_relationships = await extractor(
@ -103,6 +109,10 @@ async def extract_graph(
        callbacks.error(error_msg)
        raise ValueError(error_msg)

+    # copy these as is before any summarization
+    raw_entities = extracted_entities.copy()
+    raw_relationships = extracted_relationships.copy()
+
    entities, relationships = await get_summarized_entities_relationships(
        extracted_entities=extracted_entities,
        extracted_relationships=extracted_relationships,
@ -112,7 +122,7 @@ async def extract_graph(
        summarization_num_threads=summarization_num_threads,
    )

-    return (entities, relationships)
+    return (entities, relationships, raw_entities, raw_relationships)


 async def get_summarized_entities_relationships(