mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 00:57:23 +08:00
Snapshot full graph (#1889)
Some checks are pending
gh-pages / build (push) Waiting to run
Python CI / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python CI / python-ci (ubuntu-latest, 3.11) (push) Waiting to run
Python CI / python-ci (windows-latest, 3.10) (push) Waiting to run
Python CI / python-ci (windows-latest, 3.11) (push) Waiting to run
Python Integration Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Integration Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Python Notebook Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Notebook Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Python Publish (pypi) / Upload release to PyPI (push) Waiting to run
Python Smoke Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Smoke Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Spellcheck / spellcheck (push) Waiting to run
Some checks are pending
gh-pages / build (push) Waiting to run
Python CI / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python CI / python-ci (ubuntu-latest, 3.11) (push) Waiting to run
Python CI / python-ci (windows-latest, 3.10) (push) Waiting to run
Python CI / python-ci (windows-latest, 3.11) (push) Waiting to run
Python Integration Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Integration Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Python Notebook Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Notebook Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Python Publish (pypi) / Upload release to PyPI (push) Waiting to run
Python Smoke Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Smoke Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Spellcheck / spellcheck (push) Waiting to run
* Snapshot un-merged entities and relationships * Semver * Fix raw df modification
This commit is contained in:
parent
e2a448170a
commit
25b605b6cd
@ -0,0 +1,4 @@
|
||||
{
|
||||
"type": "patch",
|
||||
"description": "Add option to snapshot raw extractd graph tables."
|
||||
}
|
||||
@ -339,6 +339,7 @@ class SnapshotsDefaults:
|
||||
|
||||
embeddings: bool = False
|
||||
graphml: bool = False
|
||||
raw_graph: bool = False
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@ -19,3 +19,7 @@ class SnapshotsConfig(BaseModel):
|
||||
description="A flag indicating whether to take snapshots of GraphML.",
|
||||
default=graphrag_config_defaults.snapshots.graphml,
|
||||
)
|
||||
raw_graph: bool = Field(
|
||||
description="A flag indicating whether to take snapshots of the raw extracted graph (entities and relationships) before merging.",
|
||||
default=graphrag_config_defaults.snapshots.raw_graph,
|
||||
)
|
||||
|
||||
@ -43,7 +43,7 @@ async def run_workflow(
|
||||
config.root_dir, summarization_llm_settings
|
||||
)
|
||||
|
||||
entities, relationships = await extract_graph(
|
||||
entities, relationships, raw_entities, raw_relationships = await extract_graph(
|
||||
text_units=text_units,
|
||||
callbacks=context.callbacks,
|
||||
cache=context.cache,
|
||||
@ -58,6 +58,12 @@ async def run_workflow(
|
||||
await write_table_to_storage(entities, "entities", context.storage)
|
||||
await write_table_to_storage(relationships, "relationships", context.storage)
|
||||
|
||||
if config.snapshots.raw_graph:
|
||||
await write_table_to_storage(raw_entities, "raw_entities", context.storage)
|
||||
await write_table_to_storage(
|
||||
raw_relationships, "raw_relationships", context.storage
|
||||
)
|
||||
|
||||
return WorkflowFunctionOutput(
|
||||
result={
|
||||
"entities": entities,
|
||||
@ -76,7 +82,7 @@ async def extract_graph(
|
||||
entity_types: list[str] | None = None,
|
||||
summarization_strategy: dict[str, Any] | None = None,
|
||||
summarization_num_threads: int = 4,
|
||||
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
||||
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
||||
"""All the steps to create the base entity graph."""
|
||||
# this returns a graph for each text unit, to be merged later
|
||||
extracted_entities, extracted_relationships = await extractor(
|
||||
@ -103,6 +109,10 @@ async def extract_graph(
|
||||
callbacks.error(error_msg)
|
||||
raise ValueError(error_msg)
|
||||
|
||||
# copy these as is before any summarization
|
||||
raw_entities = extracted_entities.copy()
|
||||
raw_relationships = extracted_relationships.copy()
|
||||
|
||||
entities, relationships = await get_summarized_entities_relationships(
|
||||
extracted_entities=extracted_entities,
|
||||
extracted_relationships=extracted_relationships,
|
||||
@ -112,7 +122,7 @@ async def extract_graph(
|
||||
summarization_num_threads=summarization_num_threads,
|
||||
)
|
||||
|
||||
return (entities, relationships)
|
||||
return (entities, relationships, raw_entities, raw_relationships)
|
||||
|
||||
|
||||
async def get_summarized_entities_relationships(
|
||||
|
||||
Loading…
Reference in New Issue
Block a user