From 61b3d6d56a373e7eeec782be4e460c607c0e1716 Mon Sep 17 00:00:00 2001 From: Nathan Evans Date: Wed, 9 Oct 2024 13:46:44 -0700 Subject: [PATCH 1/6] Migrate helper verbs (#1248) * Remove genid * Move snapshot_rows * Move snapshot * Delete spread_json * Delete unzip * Delete zip * Move unpack_graph * Move compute_edge_combined_degree * Delete create_graph * Delete concat * Delete text replace * Delete text_translate * Move text_split * Inline aggregate override * Move cluster_graph * Move merge_graphs * Semver * Move text_chunk * Move layout_graph and fix some __init__s * Move extract_covariates * Rename text_split -> split_text * Move extract_entities * Move summarize_descriptions * Rename text_chunk -> chunk_text * Move community report creation * Remove verb-level packing operators * Streamline some naming * Streamline param name/order * Move mock LLM data to tests * Fixed missed rename * Update some strategy refs * Rename run_gi * Inject mock responses into integ test config --- .../patch-20241003214516791831.json | 4 + dictionary.txt | 1 - graphrag/config/models/chunking_config.py | 2 +- .../config/models/claim_extraction_config.py | 2 +- .../config/models/cluster_graph_config.py | 2 +- .../config/models/community_reports_config.py | 4 +- graphrag/config/models/embed_graph_config.py | 2 +- .../config/models/entity_extraction_config.py | 4 +- .../models/summarize_descriptions_config.py | 4 +- .../config/models/text_embedding_config.py | 2 +- .../index/flows/create_base_entity_graph.py | 12 +- .../flows/create_base_extracted_entities.py | 24 ++-- .../index/flows/create_base_text_units.py | 85 +++++++++-- .../index/flows/create_final_communities.py | 6 +- .../flows/create_final_community_reports.py | 30 ++-- .../index/flows/create_final_covariates.py | 14 +- .../index/flows/create_final_documents.py | 2 +- graphrag/index/flows/create_final_entities.py | 14 +- graphrag/index/flows/create_final_nodes.py | 16 +-- .../index/flows/create_final_relationships.py | 12 +- .../index/flows/create_final_text_units.py | 2 +- .../index/flows/create_summarized_entities.py | 20 +-- .../index/operations/chunk_text/__init__.py | 8 ++ .../chunk_text/chunk_text.py} | 68 +++------ .../chunk_text/strategies.py} | 28 +++- .../chunk => operations/chunk_text}/typing.py | 20 +++ .../cluster_graph.py | 135 ++++++++++-------- .../compute_edge_combined_degree.py | 44 ++++++ .../index/operations/embed_graph/__init__.py | 3 +- .../operations/embed_graph/embed_graph.py | 26 +++- .../embed_graph/strategies/__init__.py | 4 - .../embed_graph/strategies/node_2_vec.py | 34 ----- .../extract_covariates/__init__.py | 0 .../extract_covariates/extract_covariates.py | 53 +------ .../extract_covariates/strategies.py} | 28 ++-- .../extract_covariates}/typing.py | 11 ++ .../extract_entities}/__init__.py | 4 +- .../extract_entities/extract_entities.py} | 63 +------- .../extract_entities}/strategies/__init__.py | 0 .../strategies/graph_intelligence.py} | 30 ++-- .../extract_entities}/strategies/nltk.py | 4 +- .../extract_entities}/strategies/typing.py | 0 .../layout_graph}/__init__.py | 0 .../layout_graph}/layout_graph.py | 35 +---- .../layout_graph}/methods/__init__.py | 0 .../layout_graph}/methods/umap.py | 2 +- .../layout_graph}/methods/zero.py | 0 .../merge_graphs}/__init__.py | 6 +- .../merge_graphs}/merge_graphs.py | 39 +++-- .../merge_graphs}/typing.py | 0 .../index/{verbs => operations}/snapshot.py | 19 --- .../{verbs => operations}/snapshot_rows.py | 26 +--- graphrag/index/operations/split_text.py | 26 ++++ 
.../summarize_communities/__init__.py | 16 +++ .../prepare_community_reports.py | 24 ---- .../restore_community_hierarchy.py | 22 --- .../summarize_communities/strategies.py} | 32 ++--- .../summarize_communities.py} | 62 ++------ .../summarize_communities}/typing.py | 11 ++ .../summarize_descriptions/__init__.py | 13 ++ .../summarize_descriptions/strategies.py} | 32 ++--- .../summarize_descriptions.py} | 59 ++------ .../summarize_descriptions}/typing.py | 19 ++- .../unpack.py => operations/unpack_graph.py} | 49 +------ graphrag/index/run/run.py | 1 - graphrag/index/verbs/__init__.py | 46 ------ graphrag/index/verbs/covariates/__init__.py | 8 -- .../extract_covariates/strategies/__init__.py | 4 - .../strategies/graph_intelligence/__init__.py | 8 -- .../strategies/graph_intelligence/defaults.py | 10 -- graphrag/index/verbs/entities/__init__.py | 9 -- .../strategies/graph_intelligence/__init__.py | 8 -- .../strategies/graph_intelligence/defaults.py | 25 ---- .../verbs/entities/summarize/__init__.py | 8 -- .../entities/summarize/strategies/__init__.py | 8 -- .../strategies/graph_intelligence/__init__.py | 8 -- .../strategies/graph_intelligence/defaults.py | 17 --- graphrag/index/verbs/genid.py | 80 ----------- graphrag/index/verbs/graph/__init__.py | 34 ----- .../index/verbs/graph/clustering/__init__.py | 8 -- .../graph/clustering/strategies/__init__.py | 4 - .../graph/clustering/strategies/leiden.py | 69 --------- .../index/verbs/graph/clustering/typing.py | 6 - .../graph/compute_edge_combined_degree.py | 93 ------------ graphrag/index/verbs/graph/create.py | 135 ------------------ graphrag/index/verbs/graph/merge/defaults.py | 21 --- graphrag/index/verbs/graph/report/__init__.py | 25 ---- .../prepare_community_reports_claims.py | 50 ------- .../report/prepare_community_reports_edges.py | 48 ------- .../report/prepare_community_reports_nodes.py | 46 ------ .../verbs/graph/report/strategies/__init__.py | 4 - .../strategies/graph_intelligence/__init__.py | 8 -- .../strategies/graph_intelligence/defaults.py | 27 ---- graphrag/index/verbs/overrides/__init__.py | 9 -- graphrag/index/verbs/overrides/aggregate.py | 101 ------------- graphrag/index/verbs/overrides/concat.py | 27 ---- graphrag/index/verbs/spread_json.py | 55 ------- graphrag/index/verbs/text/__init__.py | 16 --- graphrag/index/verbs/text/chunk/__init__.py | 8 -- .../verbs/text/chunk/strategies/__init__.py | 4 - .../verbs/text/chunk/strategies/sentence.py | 26 ---- .../verbs/text/chunk/strategies/typing.py | 17 --- graphrag/index/verbs/text/replace/__init__.py | 8 -- graphrag/index/verbs/text/replace/replace.py | 47 ------ graphrag/index/verbs/text/replace/typing.py | 14 -- graphrag/index/verbs/text/split.py | 54 ------- .../index/verbs/text/translate/__init__.py | 8 -- .../text/translate/strategies/__init__.py | 9 -- .../text/translate/strategies/defaults.py | 8 -- .../verbs/text/translate/strategies/mock.py | 28 ---- .../verbs/text/translate/strategies/openai.py | 93 ------------ .../verbs/text/translate/strategies/typing.py | 25 ---- .../verbs/text/translate/text_translate.py | 120 ---------------- graphrag/index/verbs/unzip.py | 25 ---- graphrag/index/verbs/zip.py | 51 ------- .../workflows/v1/create_base_entity_graph.py | 4 +- .../v1/create_base_extracted_entities.py | 4 +- .../workflows/v1/create_base_text_units.py | 5 +- .../v1/create_final_community_reports.py | 11 +- .../workflows/v1/create_final_covariates.py | 13 +- .../workflows/v1/create_final_entities.py | 3 +- .../index/workflows/v1/create_final_nodes.py | 3 +- 
.../workflows/v1/create_final_text_units.py | 1 + .../v1/create_summarized_entities.py | 6 +- .../v1/subflows/create_base_documents.py | 4 +- .../v1/subflows/create_base_entity_graph.py | 2 +- .../create_base_extracted_entities.py | 16 +-- .../v1/subflows/create_base_text_units.py | 4 +- .../create_final_community_reports.py | 14 +- .../v1/subflows/create_final_covariates.py | 14 +- .../v1/subflows/create_final_documents.py | 2 +- .../v1/subflows/create_final_entities.py | 8 +- .../v1/subflows/create_final_nodes.py | 6 +- .../v1/subflows/create_final_relationships.py | 2 +- .../v1/subflows/create_final_text_units.py | 2 +- .../v1/subflows/create_summarized_entities.py | 12 +- graphrag/prompt_tune/loader/input.py | 13 +- tests/integration/_pipeline/megapipeline.yml | 24 +++- .../test_gi_entity_extraction.py | 16 ++- tests/unit/indexing/verbs/text/test_split.py | 14 +- .../test_create_base_extracted_entities.py | 44 +++++- .../test_create_final_community_reports.py | 56 +++++++- tests/verbs/test_create_final_covariates.py | 32 ++++- .../verbs/test_create_summarized_entities.py | 37 ++++- tests/verbs/util.py | 1 + 145 files changed, 871 insertions(+), 2392 deletions(-) create mode 100644 .semversioner/next-release/patch-20241003214516791831.json create mode 100644 graphrag/index/operations/chunk_text/__init__.py rename graphrag/index/{verbs/text/chunk/text_chunk.py => operations/chunk_text/chunk_text.py} (80%) rename graphrag/index/{verbs/text/chunk/strategies/tokens.py => operations/chunk_text/strategies.py} (78%) rename graphrag/index/{verbs/text/chunk => operations/chunk_text}/typing.py (50%) rename graphrag/index/{verbs/graph/clustering => operations}/cluster_graph.py (69%) create mode 100644 graphrag/index/operations/compute_edge_combined_degree.py delete mode 100644 graphrag/index/operations/embed_graph/strategies/__init__.py delete mode 100644 graphrag/index/operations/embed_graph/strategies/node_2_vec.py rename graphrag/index/{verbs/covariates => operations}/extract_covariates/__init__.py (100%) rename graphrag/index/{verbs/covariates => operations}/extract_covariates/extract_covariates.py (64%) rename graphrag/index/{verbs/covariates/extract_covariates/strategies/graph_intelligence/run_gi_extract_claims.py => operations/extract_covariates/strategies.py} (80%) rename graphrag/index/{verbs/covariates => operations/extract_covariates}/typing.py (81%) rename graphrag/index/{verbs/entities/extraction => operations/extract_entities}/__init__.py (51%) rename graphrag/index/{verbs/entities/extraction/entity_extract.py => operations/extract_entities/extract_entities.py} (74%) rename graphrag/index/{verbs/entities/extraction => operations/extract_entities}/strategies/__init__.py (100%) rename graphrag/index/{verbs/entities/extraction/strategies/graph_intelligence/run_graph_intelligence.py => operations/extract_entities/strategies/graph_intelligence.py} (82%) rename graphrag/index/{verbs/entities/extraction => operations/extract_entities}/strategies/nltk.py (95%) rename graphrag/index/{verbs/entities/extraction => operations/extract_entities}/strategies/typing.py (100%) rename graphrag/index/{verbs/graph/layout => operations/layout_graph}/__init__.py (100%) rename graphrag/index/{verbs/graph/layout => operations/layout_graph}/layout_graph.py (82%) rename graphrag/index/{verbs/graph/layout => operations/layout_graph}/methods/__init__.py (100%) rename graphrag/index/{verbs/graph/layout => operations/layout_graph}/methods/umap.py (97%) rename graphrag/index/{verbs/graph/layout => 
operations/layout_graph}/methods/zero.py (100%) rename graphrag/index/{verbs/graph/merge => operations/merge_graphs}/__init__.py (60%) rename graphrag/index/{verbs/graph/merge => operations/merge_graphs}/merge_graphs.py (90%) rename graphrag/index/{verbs/graph/merge => operations/merge_graphs}/typing.py (100%) rename graphrag/index/{verbs => operations}/snapshot.py (58%) rename graphrag/index/{verbs => operations}/snapshot_rows.py (79%) create mode 100644 graphrag/index/operations/split_text.py create mode 100644 graphrag/index/operations/summarize_communities/__init__.py rename graphrag/index/{verbs/graph/report => operations/summarize_communities}/prepare_community_reports.py (87%) rename graphrag/index/{verbs/graph/report => operations/summarize_communities}/restore_community_hierarchy.py (78%) rename graphrag/index/{verbs/graph/report/strategies/graph_intelligence/run_graph_intelligence.py => operations/summarize_communities/strategies.py} (78%) rename graphrag/index/{verbs/graph/report/create_community_reports.py => operations/summarize_communities/summarize_communities.py} (67%) rename graphrag/index/{verbs/graph/report/strategies => operations/summarize_communities}/typing.py (78%) create mode 100644 graphrag/index/operations/summarize_descriptions/__init__.py rename graphrag/index/{verbs/entities/summarize/strategies/graph_intelligence/run_graph_intelligence.py => operations/summarize_descriptions/strategies.py} (69%) rename graphrag/index/{verbs/entities/summarize/description_summarize.py => operations/summarize_descriptions/summarize_descriptions.py} (83%) rename graphrag/index/{verbs/entities/summarize/strategies => operations/summarize_descriptions}/typing.py (63%) rename graphrag/index/{verbs/graph/unpack.py => operations/unpack_graph.py} (61%) delete mode 100644 graphrag/index/verbs/__init__.py delete mode 100644 graphrag/index/verbs/covariates/__init__.py delete mode 100644 graphrag/index/verbs/covariates/extract_covariates/strategies/__init__.py delete mode 100644 graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/__init__.py delete mode 100644 graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/defaults.py delete mode 100644 graphrag/index/verbs/entities/__init__.py delete mode 100644 graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/__init__.py delete mode 100644 graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/defaults.py delete mode 100644 graphrag/index/verbs/entities/summarize/__init__.py delete mode 100644 graphrag/index/verbs/entities/summarize/strategies/__init__.py delete mode 100644 graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py delete mode 100644 graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/defaults.py delete mode 100644 graphrag/index/verbs/genid.py delete mode 100644 graphrag/index/verbs/graph/__init__.py delete mode 100644 graphrag/index/verbs/graph/clustering/__init__.py delete mode 100644 graphrag/index/verbs/graph/clustering/strategies/__init__.py delete mode 100644 graphrag/index/verbs/graph/clustering/strategies/leiden.py delete mode 100644 graphrag/index/verbs/graph/clustering/typing.py delete mode 100644 graphrag/index/verbs/graph/compute_edge_combined_degree.py delete mode 100644 graphrag/index/verbs/graph/create.py delete mode 100644 graphrag/index/verbs/graph/merge/defaults.py delete mode 100644 graphrag/index/verbs/graph/report/__init__.py delete mode 100644 
graphrag/index/verbs/graph/report/prepare_community_reports_claims.py delete mode 100644 graphrag/index/verbs/graph/report/prepare_community_reports_edges.py delete mode 100644 graphrag/index/verbs/graph/report/prepare_community_reports_nodes.py delete mode 100644 graphrag/index/verbs/graph/report/strategies/__init__.py delete mode 100644 graphrag/index/verbs/graph/report/strategies/graph_intelligence/__init__.py delete mode 100644 graphrag/index/verbs/graph/report/strategies/graph_intelligence/defaults.py delete mode 100644 graphrag/index/verbs/overrides/__init__.py delete mode 100644 graphrag/index/verbs/overrides/aggregate.py delete mode 100644 graphrag/index/verbs/overrides/concat.py delete mode 100644 graphrag/index/verbs/spread_json.py delete mode 100644 graphrag/index/verbs/text/__init__.py delete mode 100644 graphrag/index/verbs/text/chunk/__init__.py delete mode 100644 graphrag/index/verbs/text/chunk/strategies/__init__.py delete mode 100644 graphrag/index/verbs/text/chunk/strategies/sentence.py delete mode 100644 graphrag/index/verbs/text/chunk/strategies/typing.py delete mode 100644 graphrag/index/verbs/text/replace/__init__.py delete mode 100644 graphrag/index/verbs/text/replace/replace.py delete mode 100644 graphrag/index/verbs/text/replace/typing.py delete mode 100644 graphrag/index/verbs/text/split.py delete mode 100644 graphrag/index/verbs/text/translate/__init__.py delete mode 100644 graphrag/index/verbs/text/translate/strategies/__init__.py delete mode 100644 graphrag/index/verbs/text/translate/strategies/defaults.py delete mode 100644 graphrag/index/verbs/text/translate/strategies/mock.py delete mode 100644 graphrag/index/verbs/text/translate/strategies/openai.py delete mode 100644 graphrag/index/verbs/text/translate/strategies/typing.py delete mode 100644 graphrag/index/verbs/text/translate/text_translate.py delete mode 100644 graphrag/index/verbs/unzip.py delete mode 100644 graphrag/index/verbs/zip.py diff --git a/.semversioner/next-release/patch-20241003214516791831.json b/.semversioner/next-release/patch-20241003214516791831.json new file mode 100644 index 00000000..0fa47e64 --- /dev/null +++ b/.semversioner/next-release/patch-20241003214516791831.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Moving verbs around." 
+} diff --git a/dictionary.txt b/dictionary.txt index b7eb072a..ee01cc68 100644 --- a/dictionary.txt +++ b/dictionary.txt @@ -89,7 +89,6 @@ nbconvert binarize prechunked openai -genid umap concat unhot diff --git a/graphrag/config/models/chunking_config.py b/graphrag/config/models/chunking_config.py index 4ca8a8d3..a2b40017 100644 --- a/graphrag/config/models/chunking_config.py +++ b/graphrag/config/models/chunking_config.py @@ -29,7 +29,7 @@ class ChunkingConfig(BaseModel): def resolved_strategy(self, encoding_model: str) -> dict: """Get the resolved chunking strategy.""" - from graphrag.index.verbs.text.chunk import ChunkStrategyType + from graphrag.index.operations.chunk_text import ChunkStrategyType return self.strategy or { "type": ChunkStrategyType.tokens, diff --git a/graphrag/config/models/claim_extraction_config.py b/graphrag/config/models/claim_extraction_config.py index a26fdad2..6a4de8e3 100644 --- a/graphrag/config/models/claim_extraction_config.py +++ b/graphrag/config/models/claim_extraction_config.py @@ -38,7 +38,7 @@ class ClaimExtractionConfig(LLMConfig): def resolved_strategy(self, root_dir: str, encoding_model: str) -> dict: """Get the resolved claim extraction strategy.""" - from graphrag.index.verbs.covariates.extract_covariates import ( + from graphrag.index.operations.extract_covariates import ( ExtractClaimsStrategyType, ) diff --git a/graphrag/config/models/cluster_graph_config.py b/graphrag/config/models/cluster_graph_config.py index 3029baeb..805e5a18 100644 --- a/graphrag/config/models/cluster_graph_config.py +++ b/graphrag/config/models/cluster_graph_config.py @@ -20,7 +20,7 @@ class ClusterGraphConfig(BaseModel): def resolved_strategy(self) -> dict: """Get the resolved cluster strategy.""" - from graphrag.index.verbs.graph.clustering import GraphCommunityStrategyType + from graphrag.index.operations.cluster_graph import GraphCommunityStrategyType return self.strategy or { "type": GraphCommunityStrategyType.leiden, diff --git a/graphrag/config/models/community_reports_config.py b/graphrag/config/models/community_reports_config.py index ab55063c..0eafa81c 100644 --- a/graphrag/config/models/community_reports_config.py +++ b/graphrag/config/models/community_reports_config.py @@ -32,7 +32,9 @@ class CommunityReportsConfig(LLMConfig): def resolved_strategy(self, root_dir) -> dict: """Get the resolved community report extraction strategy.""" - from graphrag.index.verbs.graph.report import CreateCommunityReportsStrategyType + from graphrag.index.operations.summarize_communities import ( + CreateCommunityReportsStrategyType, + ) return self.strategy or { "type": CreateCommunityReportsStrategyType.graph_intelligence, diff --git a/graphrag/config/models/embed_graph_config.py b/graphrag/config/models/embed_graph_config.py index e3f717c0..12dd90cf 100644 --- a/graphrag/config/models/embed_graph_config.py +++ b/graphrag/config/models/embed_graph_config.py @@ -36,7 +36,7 @@ class EmbedGraphConfig(BaseModel): def resolved_strategy(self) -> dict: """Get the resolved node2vec strategy.""" - from graphrag.index.operations.embed_graph.embed_graph import ( + from graphrag.index.operations.embed_graph import ( EmbedGraphStrategyType, ) diff --git a/graphrag/config/models/entity_extraction_config.py b/graphrag/config/models/entity_extraction_config.py index ca160bc4..08055d51 100644 --- a/graphrag/config/models/entity_extraction_config.py +++ b/graphrag/config/models/entity_extraction_config.py @@ -35,7 +35,9 @@ class EntityExtractionConfig(LLMConfig): def resolved_strategy(self, 
root_dir: str, encoding_model: str) -> dict: """Get the resolved entity extraction strategy.""" - from graphrag.index.verbs.entities.extraction import ExtractEntityStrategyType + from graphrag.index.operations.extract_entities import ( + ExtractEntityStrategyType, + ) return self.strategy or { "type": ExtractEntityStrategyType.graph_intelligence, diff --git a/graphrag/config/models/summarize_descriptions_config.py b/graphrag/config/models/summarize_descriptions_config.py index 9747d949..9104a60a 100644 --- a/graphrag/config/models/summarize_descriptions_config.py +++ b/graphrag/config/models/summarize_descriptions_config.py @@ -28,7 +28,9 @@ class SummarizeDescriptionsConfig(LLMConfig): def resolved_strategy(self, root_dir: str) -> dict: """Get the resolved description summarization strategy.""" - from graphrag.index.verbs.entities.summarize import SummarizeStrategyType + from graphrag.index.operations.summarize_descriptions import ( + SummarizeStrategyType, + ) return self.strategy or { "type": SummarizeStrategyType.graph_intelligence, diff --git a/graphrag/config/models/text_embedding_config.py b/graphrag/config/models/text_embedding_config.py index cec0ee46..abd2f2bf 100644 --- a/graphrag/config/models/text_embedding_config.py +++ b/graphrag/config/models/text_embedding_config.py @@ -35,7 +35,7 @@ class TextEmbeddingConfig(LLMConfig): def resolved_strategy(self) -> dict: """Get the resolved text embedding strategy.""" - from graphrag.index.operations.embed_text.embed_text import ( + from graphrag.index.operations.embed_text import ( TextEmbedStrategyType, ) diff --git a/graphrag/index/flows/create_base_entity_graph.py b/graphrag/index/flows/create_base_entity_graph.py index 25f6375e..39880a45 100644 --- a/graphrag/index/flows/create_base_entity_graph.py +++ b/graphrag/index/flows/create_base_entity_graph.py @@ -10,10 +10,10 @@ from datashaper import ( VerbCallbacks, ) -from graphrag.index.operations.embed_graph.embed_graph import embed_graph +from graphrag.index.operations.cluster_graph import cluster_graph +from graphrag.index.operations.embed_graph import embed_graph +from graphrag.index.operations.snapshot_rows import snapshot_rows from graphrag.index.storage import PipelineStorage -from graphrag.index.verbs.graph.clustering.cluster_graph import cluster_graph_df -from graphrag.index.verbs.snapshot_rows import snapshot_rows_df async def create_base_entity_graph( @@ -25,7 +25,7 @@ async def create_base_entity_graph( graphml_snapshot_enabled: bool = False, ) -> pd.DataFrame: """All the steps to create the base entity graph.""" - clustered = cluster_graph_df( + clustered = cluster_graph( entities, callbacks, column="entity_graph", @@ -35,7 +35,7 @@ async def create_base_entity_graph( ) if graphml_snapshot_enabled: - await snapshot_rows_df( + await snapshot_rows( clustered, column="clustered_graph", base_name="clustered_graph", @@ -54,7 +54,7 @@ async def create_base_entity_graph( # take second snapshot after embedding # todo: this could be skipped if embedding isn't performed, otherwise it is a copy of the regular graph?
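# A possible shape for the todo above (sketch only; it assumes the flow's
# embedding toggle is named `embed_graph_enabled`, which is not visible in
# this hunk): the second snapshot only diverges from the first when an
# embedding pass actually ran, so the write below could be gated with
# `if graphml_snapshot_enabled and embed_graph_enabled:` instead.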
if graphml_snapshot_enabled: - await snapshot_rows_df( + await snapshot_rows( clustered, column="entity_graph", base_name="embedded_graph", diff --git a/graphrag/index/flows/create_base_extracted_entities.py b/graphrag/index/flows/create_base_extracted_entities.py index b538f18f..bfbf4d23 100644 --- a/graphrag/index/flows/create_base_extracted_entities.py +++ b/graphrag/index/flows/create_base_extracted_entities.py @@ -12,23 +12,23 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache +from graphrag.index.operations.extract_entities import extract_entities +from graphrag.index.operations.merge_graphs import merge_graphs +from graphrag.index.operations.snapshot import snapshot +from graphrag.index.operations.snapshot_rows import snapshot_rows from graphrag.index.storage import PipelineStorage -from graphrag.index.verbs.entities.extraction.entity_extract import entity_extract_df -from graphrag.index.verbs.graph.merge.merge_graphs import merge_graphs_df -from graphrag.index.verbs.snapshot import snapshot_df -from graphrag.index.verbs.snapshot_rows import snapshot_rows_df async def create_base_extracted_entities( text_units: pd.DataFrame, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, storage: PipelineStorage, column: str, id_column: str, nodes: dict[str, Any], edges: dict[str, Any], - strategy: dict[str, Any] | None, + extraction_strategy: dict[str, Any] | None, async_mode: AsyncType = AsyncType.AsyncIO, entity_types: list[str] | None = None, graphml_snapshot_enabled: bool = False, @@ -36,13 +36,13 @@ async def create_base_extracted_entities( num_threads: int = 4, ) -> pd.DataFrame: """All the steps to extract and format covariates.""" - entity_graph = await entity_extract_df( + entity_graph = await extract_entities( text_units, - cache, callbacks, + cache, column=column, id_column=id_column, - strategy=strategy, + strategy=extraction_strategy, async_mode=async_mode, entity_types=entity_types, to="entities", @@ -51,14 +51,14 @@ async def create_base_extracted_entities( ) if raw_entity_snapshot_enabled: - await snapshot_df( + await snapshot( entity_graph, name="raw_extracted_entities", storage=storage, formats=["json"], ) - merged_graph = merge_graphs_df( + merged_graph = merge_graphs( entity_graph, callbacks, column="entity_graph", @@ -68,7 +68,7 @@ async def create_base_extracted_entities( ) if graphml_snapshot_enabled: - await snapshot_rows_df( + await snapshot_rows( merged_graph, base_name="merged_graph", column="entity_graph", diff --git a/graphrag/index/flows/create_base_text_units.py b/graphrag/index/flows/create_base_text_units.py index 091b9221..4e07e868 100644 --- a/graphrag/index/flows/create_base_text_units.py +++ b/graphrag/index/flows/create_base_text_units.py @@ -3,14 +3,19 @@ """All the steps to transform base text_units.""" +from dataclasses import dataclass from typing import Any, cast import pandas as pd -from datashaper import VerbCallbacks +from datashaper import ( + FieldAggregateOperation, + Progress, + VerbCallbacks, + aggregate_operation_mapping, +) -from graphrag.index.verbs.genid import genid_df -from graphrag.index.verbs.overrides.aggregate import aggregate_df -from graphrag.index.verbs.text.chunk.text_chunk import chunk_df +from graphrag.index.operations.chunk_text import chunk_text +from graphrag.index.utils import gen_md5_hash def create_base_text_units( @@ -19,7 +24,7 @@ def create_base_text_units( chunk_column_name: str, n_tokens_column_name: str, chunk_by_columns: list[str], - strategy: dict[str, Any] | None = 
None, + chunk_strategy: dict[str, Any] | None = None, ) -> pd.DataFrame: """All the steps to transform base text_units.""" sort = documents.sort_values(by=["id"], ascending=[True]) @@ -28,7 +33,9 @@ def create_base_text_units( zip(*[sort[col] for col in ["id", "text"]], strict=True) ) - aggregated = aggregate_df( + callbacks.progress(Progress(percent=0)) + + aggregated = _aggregate_df( sort, groupby=[*chunk_by_columns] if len(chunk_by_columns) > 0 else None, aggregations=[ @@ -40,12 +47,14 @@ def create_base_text_units( ], ) - chunked = chunk_df( + callbacks.progress(Progress(percent=1)) + + chunked = chunk_text( aggregated, column="texts", to="chunks", callbacks=callbacks, - strategy=strategy, + strategy=chunk_strategy, ) chunked = cast(pd.DataFrame, chunked[[*chunk_by_columns, "chunks"]]) @@ -56,11 +65,9 @@ def create_base_text_units( }, inplace=True, ) - - chunked = genid_df( - chunked, to="chunk_id", method="md5_hash", hash=[chunk_column_name] + chunked["chunk_id"] = chunked.apply( + lambda row: gen_md5_hash(row, [chunk_column_name]), axis=1 ) - chunked[["document_ids", chunk_column_name, n_tokens_column_name]] = pd.DataFrame( chunked[chunk_column_name].tolist(), index=chunked.index ) @@ -69,3 +76,57 @@ def create_base_text_units( return cast( pd.DataFrame, chunked[chunked[chunk_column_name].notna()].reset_index(drop=True) ) + + +# TODO: would be nice to inline this completely in the main method with pandas +def _aggregate_df( + input: pd.DataFrame, + aggregations: list[dict[str, Any]], + groupby: list[str] | None = None, +) -> pd.DataFrame: + """Aggregate method definition.""" + aggregations_to_apply = _load_aggregations(aggregations) + df_aggregations = { + agg.column: _get_pandas_agg_operation(agg) + for agg in aggregations_to_apply.values() + } + if groupby is None: + output_grouped = input.groupby(lambda _x: True) + else: + output_grouped = input.groupby(groupby, sort=False) + output = cast(pd.DataFrame, output_grouped.agg(df_aggregations)) + output.rename( + columns={agg.column: agg.to for agg in aggregations_to_apply.values()}, + inplace=True, + ) + output.columns = [agg.to for agg in aggregations_to_apply.values()] + return output.reset_index() + + +@dataclass +class Aggregation: + """Aggregation class method definition.""" + + column: str | None + operation: str + to: str + + # Only useful for the concat operation + separator: str | None = None + + +def _get_pandas_agg_operation(agg: Aggregation) -> Any: + if agg.operation == "string_concat": + return (agg.separator or ",").join + return aggregate_operation_mapping[FieldAggregateOperation(agg.operation)] + + +def _load_aggregations( + aggregations: list[dict[str, Any]], +) -> dict[str, Aggregation]: + return { + aggregation["column"]: Aggregation( + aggregation["column"], aggregation["operation"], aggregation["to"] + ) + for aggregation in aggregations + } diff --git a/graphrag/index/flows/create_final_communities.py b/graphrag/index/flows/create_final_communities.py index 51654121..23c84c56 100644 --- a/graphrag/index/flows/create_final_communities.py +++ b/graphrag/index/flows/create_final_communities.py @@ -8,7 +8,7 @@ from datashaper import ( VerbCallbacks, ) -from graphrag.index.verbs.graph.unpack import unpack_graph_df +from graphrag.index.operations.unpack_graph import unpack_graph def create_final_communities( @@ -16,8 +16,8 @@ def create_final_communities( callbacks: VerbCallbacks, ) -> pd.DataFrame: """All the steps to transform final communities.""" - graph_nodes = unpack_graph_df(entity_graph, callbacks, 
"clustered_graph", "nodes") - graph_edges = unpack_graph_df(entity_graph, callbacks, "clustered_graph", "edges") + graph_nodes = unpack_graph(entity_graph, callbacks, "clustered_graph", "nodes") + graph_edges = unpack_graph(entity_graph, callbacks, "clustered_graph", "edges") # Merge graph_nodes with graph_edges for both source and target matches source_clusters = graph_nodes.merge( diff --git a/graphrag/index/flows/create_final_community_reports.py b/graphrag/index/flows/create_final_community_reports.py index ddf7ea69..3556fa3f 100644 --- a/graphrag/index/flows/create_final_community_reports.py +++ b/graphrag/index/flows/create_final_community_reports.py @@ -31,15 +31,11 @@ from graphrag.index.graph.extractors.community_reports.schemas import ( NODE_ID, NODE_NAME, ) -from graphrag.index.operations.embed_text.embed_text import embed_text -from graphrag.index.verbs.graph.report.create_community_reports import ( - create_community_reports_df, -) -from graphrag.index.verbs.graph.report.prepare_community_reports import ( - prepare_community_reports_df, -) -from graphrag.index.verbs.graph.report.restore_community_hierarchy import ( - restore_community_hierarchy_df, +from graphrag.index.operations.embed_text import embed_text +from graphrag.index.operations.summarize_communities import ( + prepare_community_reports, + restore_community_hierarchy, + summarize_communities, ) @@ -49,7 +45,7 @@ async def create_final_community_reports( claims_input: pd.DataFrame | None, callbacks: VerbCallbacks, cache: PipelineCache, - strategy: dict, + summarization_strategy: dict, async_mode: AsyncType = AsyncType.AsyncIO, num_threads: int = 4, full_content_text_embed: dict | None = None, @@ -64,19 +60,23 @@ async def create_final_community_reports( if claims_input is not None: claims = _prep_claims(claims_input) - community_hierarchy = restore_community_hierarchy_df(nodes) + community_hierarchy = restore_community_hierarchy(nodes) - local_contexts = prepare_community_reports_df( - nodes, edges, claims, callbacks, strategy.get("max_input_length", 16_000) + local_contexts = prepare_community_reports( + nodes, + edges, + claims, + callbacks, + summarization_strategy.get("max_input_length", 16_000), ) - community_reports = await create_community_reports_df( + community_reports = await summarize_communities( local_contexts, nodes, community_hierarchy, callbacks, cache, - strategy, + summarization_strategy, async_mode=async_mode, num_threads=num_threads, ) diff --git a/graphrag/index/flows/create_final_covariates.py b/graphrag/index/flows/create_final_covariates.py index 98b352e4..09ec9f8f 100644 --- a/graphrag/index/flows/create_final_covariates.py +++ b/graphrag/index/flows/create_final_covariates.py @@ -13,30 +13,30 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache -from graphrag.index.verbs.covariates.extract_covariates.extract_covariates import ( - extract_covariates_df, +from graphrag.index.operations.extract_covariates import ( + extract_covariates, ) async def create_final_covariates( text_units: pd.DataFrame, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, column: str, covariate_type: str, - strategy: dict[str, Any] | None, + extraction_strategy: dict[str, Any] | None, async_mode: AsyncType = AsyncType.AsyncIO, entity_types: list[str] | None = None, num_threads: int = 4, ) -> pd.DataFrame: """All the steps to extract and format covariates.""" - covariates = await extract_covariates_df( + covariates = await extract_covariates( text_units, - cache, callbacks, 
+ cache, column, covariate_type, - strategy, + extraction_strategy, async_mode, entity_types, num_threads, diff --git a/graphrag/index/flows/create_final_documents.py b/graphrag/index/flows/create_final_documents.py index c8f35132..29504000 100644 --- a/graphrag/index/flows/create_final_documents.py +++ b/graphrag/index/flows/create_final_documents.py @@ -9,7 +9,7 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache -from graphrag.index.operations.embed_text.embed_text import embed_text +from graphrag.index.operations.embed_text import embed_text async def create_final_documents( diff --git a/graphrag/index/flows/create_final_entities.py b/graphrag/index/flows/create_final_entities.py index e653f0f1..9601cb31 100644 --- a/graphrag/index/flows/create_final_entities.py +++ b/graphrag/index/flows/create_final_entities.py @@ -9,22 +9,22 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache -from graphrag.index.operations.embed_text.embed_text import embed_text -from graphrag.index.verbs.graph.unpack import unpack_graph_df -from graphrag.index.verbs.text.split import text_split_df +from graphrag.index.operations.embed_text import embed_text +from graphrag.index.operations.split_text import split_text +from graphrag.index.operations.unpack_graph import unpack_graph async def create_final_entities( entity_graph: pd.DataFrame, callbacks: VerbCallbacks, cache: PipelineCache, - name_text_embed: dict, - description_text_embed: dict, + name_text_embed: dict | None = None, + description_text_embed: dict | None = None, ) -> pd.DataFrame: """All the steps to transform final entities.""" # Process nodes nodes = ( - unpack_graph_df(entity_graph, callbacks, "clustered_graph", "nodes") + unpack_graph(entity_graph, callbacks, "clustered_graph", "nodes") .rename(columns={"label": "name"}) .loc[ :, @@ -44,7 +44,7 @@ async def create_final_entities( nodes = nodes.loc[nodes["name"].notna()] # Split 'source_id' column into 'text_unit_ids' - nodes = text_split_df( + nodes = split_text( nodes, column="source_id", separator=",", to="text_unit_ids" ).drop(columns=["source_id"]) diff --git a/graphrag/index/flows/create_final_nodes.py b/graphrag/index/flows/create_final_nodes.py index 4597a6f0..fb0b6890 100644 --- a/graphrag/index/flows/create_final_nodes.py +++ b/graphrag/index/flows/create_final_nodes.py @@ -10,27 +10,27 @@ from datashaper import ( VerbCallbacks, ) +from graphrag.index.operations.layout_graph import layout_graph +from graphrag.index.operations.snapshot import snapshot +from graphrag.index.operations.unpack_graph import unpack_graph from graphrag.index.storage import PipelineStorage -from graphrag.index.verbs.graph.layout.layout_graph import layout_graph_df -from graphrag.index.verbs.graph.unpack import unpack_graph_df -from graphrag.index.verbs.snapshot import snapshot_df async def create_final_nodes( entity_graph: pd.DataFrame, callbacks: VerbCallbacks, storage: PipelineStorage, - strategy: dict[str, Any], + layout_strategy: dict[str, Any], level_for_node_positions: int, snapshot_top_level_nodes: bool = False, ) -> pd.DataFrame: """All the steps to transform final nodes.""" laid_out_entity_graph = cast( pd.DataFrame, - layout_graph_df( + layout_graph( entity_graph, callbacks, - strategy, + layout_strategy, embeddings_column="embeddings", graph_column="clustered_graph", to="node_positions", @@ -40,7 +40,7 @@ async def create_final_nodes( nodes = cast( pd.DataFrame, - unpack_graph_df( + unpack_graph( laid_out_entity_graph, callbacks, 
column="positioned_graph", type="nodes" ), ) @@ -51,7 +51,7 @@ async def create_final_nodes( nodes = cast(pd.DataFrame, nodes[["id", "x", "y"]]) if snapshot_top_level_nodes: - await snapshot_df( + await snapshot( nodes, name="top_level_nodes", storage=storage, diff --git a/graphrag/index/flows/create_final_relationships.py b/graphrag/index/flows/create_final_relationships.py index 8e5bcd7d..ba82c5bc 100644 --- a/graphrag/index/flows/create_final_relationships.py +++ b/graphrag/index/flows/create_final_relationships.py @@ -11,11 +11,11 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache -from graphrag.index.operations.embed_text.embed_text import embed_text -from graphrag.index.verbs.graph.compute_edge_combined_degree import ( - compute_edge_combined_degree_df, +from graphrag.index.operations.compute_edge_combined_degree import ( + compute_edge_combined_degree, ) -from graphrag.index.verbs.graph.unpack import unpack_graph_df +from graphrag.index.operations.embed_text import embed_text +from graphrag.index.operations.unpack_graph import unpack_graph async def create_final_relationships( @@ -26,7 +26,7 @@ async def create_final_relationships( description_text_embed: dict | None = None, ) -> pd.DataFrame: """All the steps to transform final relationships.""" - graph_edges = unpack_graph_df(entity_graph, callbacks, "clustered_graph", "edges") + graph_edges = unpack_graph(entity_graph, callbacks, "clustered_graph", "edges") graph_edges.rename(columns={"source_id": "text_unit_ids"}, inplace=True) @@ -49,7 +49,7 @@ async def create_final_relationships( filtered_nodes = nodes[nodes["level"] == 0].reset_index(drop=True) filtered_nodes = cast(pd.DataFrame, filtered_nodes[["title", "degree"]]) - edge_combined_degree = compute_edge_combined_degree_df( + edge_combined_degree = compute_edge_combined_degree( pruned_edges, filtered_nodes, to="rank", diff --git a/graphrag/index/flows/create_final_text_units.py b/graphrag/index/flows/create_final_text_units.py index 2522c4c9..a63d797f 100644 --- a/graphrag/index/flows/create_final_text_units.py +++ b/graphrag/index/flows/create_final_text_units.py @@ -11,7 +11,7 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache -from graphrag.index.operations.embed_text.embed_text import embed_text +from graphrag.index.operations.embed_text import embed_text async def create_final_text_units( diff --git a/graphrag/index/flows/create_summarized_entities.py b/graphrag/index/flows/create_summarized_entities.py index dc5c6d25..a9a5d59a 100644 --- a/graphrag/index/flows/create_summarized_entities.py +++ b/graphrag/index/flows/create_summarized_entities.py @@ -11,35 +11,35 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache -from graphrag.index.storage import PipelineStorage -from graphrag.index.verbs.entities.summarize.description_summarize import ( - summarize_descriptions_df, +from graphrag.index.operations.snapshot_rows import snapshot_rows +from graphrag.index.operations.summarize_descriptions import ( + summarize_descriptions, ) -from graphrag.index.verbs.snapshot_rows import snapshot_rows_df +from graphrag.index.storage import PipelineStorage async def create_summarized_entities( entities: pd.DataFrame, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, storage: PipelineStorage, - strategy: dict[str, Any] | None = None, + summarization_strategy: dict[str, Any] | None = None, num_threads: int = 4, graphml_snapshot_enabled: bool = False, ) -> pd.DataFrame: """All the steps to 
summarize entities.""" - summarized = await summarize_descriptions_df( + summarized = await summarize_descriptions( entities, - cache, callbacks, + cache, column="entity_graph", to="entity_graph", - strategy=strategy, + strategy=summarization_strategy, num_threads=num_threads, ) if graphml_snapshot_enabled: - await snapshot_rows_df( + await snapshot_rows( summarized, column="entity_graph", base_name="summarized_graph", diff --git a/graphrag/index/operations/chunk_text/__init__.py b/graphrag/index/operations/chunk_text/__init__.py new file mode 100644 index 00000000..273ff0ab --- /dev/null +++ b/graphrag/index/operations/chunk_text/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""The Indexing Engine text chunk package root.""" + +from .chunk_text import ChunkStrategy, ChunkStrategyType, chunk_text + +__all__ = ["ChunkStrategy", "ChunkStrategyType", "chunk_text"] diff --git a/graphrag/index/verbs/text/chunk/text_chunk.py b/graphrag/index/operations/chunk_text/chunk_text.py similarity index 80% rename from graphrag/index/verbs/text/chunk/text_chunk.py rename to graphrag/index/operations/chunk_text/chunk_text.py index 436fdbec..bbcc750c 100644 --- a/graphrag/index/verbs/text/chunk/text_chunk.py +++ b/graphrag/index/operations/chunk_text/chunk_text.py @@ -3,59 +3,30 @@ """A module containing _get_num_total, chunk, run_strategy and load_strategy methods definitions.""" -from enum import Enum from typing import Any, cast import pandas as pd from datashaper import ( ProgressTicker, - TableContainer, VerbCallbacks, - VerbInput, progress_ticker, - verb, ) -from .strategies.typing import ChunkStrategy as ChunkStrategy -from .typing import ChunkInput +from .typing import ChunkInput, ChunkStrategy, ChunkStrategyType -def _get_num_total(output: pd.DataFrame, column: str) -> int: - num_total = 0 - for row in output[column]: - if isinstance(row, str): - num_total += 1 - else: - num_total += len(row) - return num_total - - -class ChunkStrategyType(str, Enum): - """ChunkStrategy class definition.""" - - tokens = "tokens" - sentence = "sentence" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -@verb(name="chunk") -def chunk( - input: VerbInput, +def chunk_text( + input: pd.DataFrame, column: str, to: str, callbacks: VerbCallbacks, strategy: dict[str, Any] | None = None, - **_kwargs, -) -> TableContainer: +) -> pd.DataFrame: """ Chunk a piece of text into smaller pieces. 
## Usage ```yaml - verb: text_chunk args: column: # The name of the column containing the text to chunk, this can either be a column with text, or a column with a list[tuple[doc_id, str]] to: # The name of the column to output the chunks to @@ -85,21 +56,6 @@ def chunk( type: sentence ``` """ - input_table = cast(pd.DataFrame, input.get_input()) - - output = chunk_df(input_table, column, to, callbacks, strategy) - - return TableContainer(table=output) - - -def chunk_df( - input: pd.DataFrame, - column: str, - to: str, - callbacks: VerbCallbacks, - strategy: dict[str, Any] | None = None, -) -> pd.DataFrame: - """Chunk a piece of text into smaller pieces.""" output = input if strategy is None: strategy = {} @@ -161,17 +117,27 @@ def load_strategy(strategy: ChunkStrategyType) -> ChunkStrategy: """Load strategy method definition.""" match strategy: case ChunkStrategyType.tokens: - from .strategies.tokens import run as run_tokens + from .strategies import run_tokens return run_tokens case ChunkStrategyType.sentence: # NLTK from graphrag.index.bootstrap import bootstrap - from .strategies.sentence import run as run_sentence + from .strategies import run_sentences bootstrap() - return run_sentence + return run_sentences case _: msg = f"Unknown strategy: {strategy}" raise ValueError(msg) + + +def _get_num_total(output: pd.DataFrame, column: str) -> int: + num_total = 0 + for row in output[column]: + if isinstance(row, str): + num_total += 1 + else: + num_total += len(row) + return num_total diff --git a/graphrag/index/verbs/text/chunk/strategies/tokens.py b/graphrag/index/operations/chunk_text/strategies.py similarity index 78% rename from graphrag/index/verbs/text/chunk/strategies/tokens.py rename to graphrag/index/operations/chunk_text/strategies.py index 6426c783..7507784b 100644 --- a/graphrag/index/verbs/text/chunk/strategies/tokens.py +++ b/graphrag/index/operations/chunk_text/strategies.py @@ -1,23 +1,25 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""A module containing run and split_text_on_tokens methods definition.""" +"""A module containing chunk strategies.""" from collections.abc import Iterable from typing import Any +import nltk import tiktoken from datashaper import ProgressTicker import graphrag.config.defaults as defs from graphrag.index.text_splitting import Tokenizer -from graphrag.index.verbs.text.chunk.typing import TextChunk + +from .typing import TextChunk -def run( +def run_tokens( input: list[str], args: dict[str, Any], tick: ProgressTicker ) -> Iterable[TextChunk]: - """Chunks text into multiple parts. 
A pipeline verb.""" + """Chunks text into chunks based on encoding tokens.""" tokens_per_chunk = args.get("chunk_size", defs.CHUNK_SIZE) chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP) encoding_name = args.get("encoding_name", defs.ENCODING_MODEL) @@ -31,7 +33,7 @@ def run( def decode(tokens: list[int]) -> str: return enc.decode(tokens) - return split_text_on_tokens( + return _split_text_on_tokens( input, Tokenizer( chunk_overlap=chunk_overlap, @@ -45,7 +47,7 @@ def run( # Adapted from - https://github.com/langchain-ai/langchain/blob/77b359edf5df0d37ef0d539f678cf64f5557cb54/libs/langchain/langchain/text_splitter.py#L471 # So we could have better control over the chunking process -def split_text_on_tokens( +def _split_text_on_tokens( texts: list[str], enc: Tokenizer, tick: ProgressTicker ) -> list[TextChunk]: """Split incoming text and return chunks.""" @@ -79,3 +81,17 @@ def split_text_on_tokens( chunk_ids = input_ids[start_idx:cur_idx] return result + + +def run_sentences( + input: list[str], _args: dict[str, Any], tick: ProgressTicker +) -> Iterable[TextChunk]: + """Chunks text into multiple parts by sentence.""" + for doc_idx, text in enumerate(input): + sentences = nltk.sent_tokenize(text) + for sentence in sentences: + yield TextChunk( + text_chunk=sentence, + source_doc_indices=[doc_idx], + ) + tick(1) diff --git a/graphrag/index/verbs/text/chunk/typing.py b/graphrag/index/operations/chunk_text/typing.py similarity index 50% rename from graphrag/index/verbs/text/chunk/typing.py rename to graphrag/index/operations/chunk_text/typing.py index 3a42cf68..ebfa4db9 100644 --- a/graphrag/index/verbs/text/chunk/typing.py +++ b/graphrag/index/operations/chunk_text/typing.py @@ -3,7 +3,12 @@ """A module containing 'TextChunk' model.""" +from collections.abc import Callable, Iterable from dataclasses import dataclass +from enum import Enum +from typing import Any + +from datashaper import ProgressTicker @dataclass @@ -17,3 +22,18 @@ class TextChunk: ChunkInput = str | list[str] | list[tuple[str, str]] """Input to a chunking strategy. 
Can be a string, a list of strings, or a list of tuples of (id, text).""" + +ChunkStrategy = Callable[ + [list[str], dict[str, Any], ProgressTicker], Iterable[TextChunk] +] + + +class ChunkStrategyType(str, Enum): + """ChunkStrategy class definition.""" + + tokens = "tokens" + sentence = "sentence" + + def __repr__(self): + """Get a string representation.""" + return f'"{self.value}"' diff --git a/graphrag/index/verbs/graph/clustering/cluster_graph.py b/graphrag/index/operations/cluster_graph.py similarity index 69% rename from graphrag/index/verbs/graph/clustering/cluster_graph.py rename to graphrag/index/operations/cluster_graph.py index 969d116e..731c4b5b 100644 --- a/graphrag/index/verbs/graph/clustering/cluster_graph.py +++ b/graphrag/index/operations/cluster_graph.py @@ -10,65 +10,29 @@ from typing import Any, cast import networkx as nx import pandas as pd -from datashaper import TableContainer, VerbCallbacks, VerbInput, progress_iterable, verb +from datashaper import VerbCallbacks, progress_iterable +from graspologic.partition import hierarchical_leiden +from graphrag.index.graph.utils import stable_largest_connected_component from graphrag.index.utils import gen_uuid, load_graph -from .typing import Communities +Communities = list[tuple[int, str, list[str]]] + + +class GraphCommunityStrategyType(str, Enum): + """GraphCommunityStrategyType class definition.""" + + leiden = "leiden" + + def __repr__(self): + """Get a string representation.""" + return f'"{self.value}"' + log = logging.getLogger(__name__) -@verb(name="cluster_graph") def cluster_graph( - input: VerbInput, - callbacks: VerbCallbacks, - strategy: dict[str, Any], - column: str, - to: str, - level_to: str | None = None, - **_kwargs, -) -> TableContainer: - """ - Apply a hierarchical clustering algorithm to a graph. The graph is expected to be in graphml format. The verb outputs a new column containing the clustered graph, and a new column containing the level of the graph. - - ## Usage - ```yaml - verb: cluster_graph - args: - column: entity_graph # The name of the column containing the graph, should be a graphml graph - to: clustered_graph # The name of the column to output the clustered graph to - level_to: level # The name of the column to output the level to - strategy: # See strategies section below - ``` - - ## Strategies - The cluster graph verb uses a strategy to cluster the graph. The strategy is a json object which defines the strategy to use. The following strategies are available: - - ### leiden - This strategy uses the leiden algorithm to cluster a graph. 
The strategy config is as follows: - ```yaml - strategy: - type: leiden - max_cluster_size: 10 # Optional, The max cluster size to use, default: 10 - use_lcc: true # Optional, if the largest connected component should be used with the leiden algorithm, default: true - seed: 0xDEADBEEF # Optional, the seed to use for the leiden algorithm, default: 0xDEADBEEF - levels: [0, 1] # Optional, the levels to output, default: all the levels detected - - ``` - """ - output_df = cluster_graph_df( - cast(pd.DataFrame, input.get_input()), - callbacks, - strategy, - column, - to, - level_to=level_to, - ) - return TableContainer(table=output_df) - - -def cluster_graph_df( input: pd.DataFrame, callbacks: VerbCallbacks, strategy: dict[str, Any], @@ -157,16 +121,6 @@ def apply_clustering( return graph -class GraphCommunityStrategyType(str, Enum): - """GraphCommunityStrategyType class definition.""" - - leiden = "leiden" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - def run_layout( strategy: dict[str, Any], graphml_or_graph: str | nx.Graph ) -> Communities: @@ -180,8 +134,6 @@ def run_layout( strategy_type = strategy.get("type", GraphCommunityStrategyType.leiden) match strategy_type: case GraphCommunityStrategyType.leiden: - from .strategies.leiden import run as run_leiden - clusters = run_leiden(graph, strategy) case _: msg = f"Unknown clustering strategy {strategy_type}" @@ -192,3 +144,60 @@ def run_layout( for cluster_id, nodes in clusters[level].items(): results.append((level, cluster_id, nodes)) return results + + +def run_leiden( + graph: nx.Graph, args: dict[str, Any] +) -> dict[int, dict[str, list[str]]]: + """Run method definition.""" + max_cluster_size = args.get("max_cluster_size", 10) + use_lcc = args.get("use_lcc", True) + if args.get("verbose", False): + log.info( + "Running leiden with max_cluster_size=%s, lcc=%s", max_cluster_size, use_lcc + ) + + node_id_to_community_map = _compute_leiden_communities( + graph=graph, + max_cluster_size=max_cluster_size, + use_lcc=use_lcc, + seed=args.get("seed", 0xDEADBEEF), + ) + levels = args.get("levels") + + # If they don't pass in levels, use them all + if levels is None: + levels = sorted(node_id_to_community_map.keys()) + + results_by_level: dict[int, dict[str, list[str]]] = {} + for level in levels: + result = {} + results_by_level[level] = result + for node_id, raw_community_id in node_id_to_community_map[level].items(): + community_id = str(raw_community_id) + if community_id not in result: + result[community_id] = [] + result[community_id].append(node_id) + return results_by_level + + +# Taken from graph_intelligence & adapted +def _compute_leiden_communities( + graph: nx.Graph | nx.DiGraph, + max_cluster_size: int, + use_lcc: bool, + seed=0xDEADBEEF, +) -> dict[int, dict[str, int]]: + """Return Leiden root communities.""" + if use_lcc: + graph = stable_largest_connected_component(graph) + + community_mapping = hierarchical_leiden( + graph, max_cluster_size=max_cluster_size, random_seed=seed + ) + results: dict[int, dict[str, int]] = {} + for partition in community_mapping: + results[partition.level] = results.get(partition.level, {}) + results[partition.level][partition.node] = partition.cluster + + return results diff --git a/graphrag/index/operations/compute_edge_combined_degree.py b/graphrag/index/operations/compute_edge_combined_degree.py new file mode 100644 index 00000000..e0a81be0 --- /dev/null +++ b/graphrag/index/operations/compute_edge_combined_degree.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024 
Microsoft Corporation. +# Licensed under the MIT License + +"""A module containing compute_edge_combined_degree methods definition.""" + +import pandas as pd + + +def compute_edge_combined_degree( + edge_df: pd.DataFrame, + node_degree_df: pd.DataFrame, + to: str, + node_name_column: str, + node_degree_column: str, + edge_source_column: str, + edge_target_column: str, +) -> pd.DataFrame: + """Compute the combined degree for each edge in a graph.""" + if to in edge_df.columns: + return edge_df + + def join_to_degree(df: pd.DataFrame, column: str) -> pd.DataFrame: + degree_column = _degree_colname(column) + result = df.merge( + node_degree_df.rename( + columns={node_name_column: column, node_degree_column: degree_column} + ), + on=column, + how="left", + ) + result[degree_column] = result[degree_column].fillna(0) + return result + + output_df = join_to_degree(edge_df, edge_source_column) + output_df = join_to_degree(output_df, edge_target_column) + output_df[to] = ( + output_df[_degree_colname(edge_source_column)] + + output_df[_degree_colname(edge_target_column)] + ) + return output_df + + +def _degree_colname(column: str) -> str: + return f"{column}_degree" diff --git a/graphrag/index/operations/embed_graph/__init__.py b/graphrag/index/operations/embed_graph/__init__.py index 4ca8168c..a47441b4 100644 --- a/graphrag/index/operations/embed_graph/__init__.py +++ b/graphrag/index/operations/embed_graph/__init__.py @@ -4,5 +4,6 @@ """The Indexing Engine graph embed package root.""" from .embed_graph import EmbedGraphStrategyType, embed_graph +from .typing import NodeEmbeddings -__all__ = ["EmbedGraphStrategyType", "embed_graph"] +__all__ = ["EmbedGraphStrategyType", "NodeEmbeddings", "embed_graph"] diff --git a/graphrag/index/operations/embed_graph/embed_graph.py b/graphrag/index/operations/embed_graph/embed_graph.py index f38051b1..ab125a93 100644 --- a/graphrag/index/operations/embed_graph/embed_graph.py +++ b/graphrag/index/operations/embed_graph/embed_graph.py @@ -10,6 +10,8 @@ import networkx as nx import pandas as pd from datashaper import VerbCallbacks, derive_from_rows +from graphrag.index.graph.embedding import embed_nod2vec +from graphrag.index.graph.utils import stable_largest_connected_component from graphrag.index.utils import load_graph from .typing import NodeEmbeddings @@ -85,9 +87,29 @@ def run_embeddings( graph = load_graph(graphml_or_graph) match strategy: case EmbedGraphStrategyType.node2vec: - from .strategies.node_2_vec import run as run_node_2_vec - return run_node_2_vec(graph, args) case _: msg = f"Unknown strategy {strategy}" raise ValueError(msg) + + +def run_node_2_vec(graph: nx.Graph, args: dict[str, Any]) -> NodeEmbeddings: + """Run method definition.""" + if args.get("use_lcc", True): + graph = stable_largest_connected_component(graph) + + # create graph embedding using node2vec + embeddings = embed_nod2vec( + graph=graph, + dimensions=args.get("dimensions", 1536), + num_walks=args.get("num_walks", 10), + walk_length=args.get("walk_length", 40), + window_size=args.get("window_size", 2), + iterations=args.get("iterations", 3), + random_seed=args.get("random_seed", 86), + ) + + pairs = zip(embeddings.nodes, embeddings.embeddings.tolist(), strict=True) + sorted_pairs = sorted(pairs, key=lambda x: x[0]) + + return dict(sorted_pairs) diff --git a/graphrag/index/operations/embed_graph/strategies/__init__.py b/graphrag/index/operations/embed_graph/strategies/__init__.py deleted file mode 100644 index ef85198e..00000000 --- 
a/graphrag/index/operations/embed_graph/strategies/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Text Embedding strategies.""" diff --git a/graphrag/index/operations/embed_graph/strategies/node_2_vec.py b/graphrag/index/operations/embed_graph/strategies/node_2_vec.py deleted file mode 100644 index 82abc825..00000000 --- a/graphrag/index/operations/embed_graph/strategies/node_2_vec.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing run method definition.""" - -from typing import Any - -import networkx as nx - -from graphrag.index.graph.embedding import embed_nod2vec -from graphrag.index.graph.utils import stable_largest_connected_component -from graphrag.index.operations.embed_graph.typing import NodeEmbeddings - - -def run(graph: nx.Graph, args: dict[str, Any]) -> NodeEmbeddings: - """Run method definition.""" - if args.get("use_lcc", True): - graph = stable_largest_connected_component(graph) - - # create graph embedding using node2vec - embeddings = embed_nod2vec( - graph=graph, - dimensions=args.get("dimensions", 1536), - num_walks=args.get("num_walks", 10), - walk_length=args.get("walk_length", 40), - window_size=args.get("window_size", 2), - iterations=args.get("iterations", 3), - random_seed=args.get("random_seed", 86), - ) - - pairs = zip(embeddings.nodes, embeddings.embeddings.tolist(), strict=True) - sorted_pairs = sorted(pairs, key=lambda x: x[0]) - - return dict(sorted_pairs) diff --git a/graphrag/index/verbs/covariates/extract_covariates/__init__.py b/graphrag/index/operations/extract_covariates/__init__.py similarity index 100% rename from graphrag/index/verbs/covariates/extract_covariates/__init__.py rename to graphrag/index/operations/extract_covariates/__init__.py diff --git a/graphrag/index/verbs/covariates/extract_covariates/extract_covariates.py b/graphrag/index/operations/extract_covariates/extract_covariates.py similarity index 64% rename from graphrag/index/verbs/covariates/extract_covariates/extract_covariates.py rename to graphrag/index/operations/extract_covariates/extract_covariates.py index 92785efe..1ee5f51c 100644 --- a/graphrag/index/verbs/covariates/extract_covariates/extract_covariates.py +++ b/graphrag/index/operations/extract_covariates/extract_covariates.py @@ -5,70 +5,29 @@ import logging from dataclasses import asdict -from enum import Enum -from typing import Any, cast +from typing import Any import pandas as pd from datashaper import ( AsyncType, - TableContainer, VerbCallbacks, - VerbInput, derive_from_rows, - verb, ) from graphrag.index.cache import PipelineCache -from graphrag.index.verbs.covariates.typing import Covariate, CovariateExtractStrategy + +from .typing import Covariate, CovariateExtractStrategy, ExtractClaimsStrategyType log = logging.getLogger(__name__) -class ExtractClaimsStrategyType(str, Enum): - """ExtractClaimsStrategyType class definition.""" - - graph_intelligence = "graph_intelligence" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - DEFAULT_ENTITY_TYPES = ["organization", "person", "geo", "event"] -@verb(name="extract_covariates") async def extract_covariates( - input: VerbInput, - cache: PipelineCache, - callbacks: VerbCallbacks, - column: str, - covariate_type: str, - strategy: dict[str, Any] | None, - async_mode: AsyncType = AsyncType.AsyncIO, - entity_types: list[str] | None = None, - **kwargs, -) -> TableContainer: 
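The hunk here shows the migration pattern that repeats throughout this patch: the `@verb`-decorated wrapper that unboxed a `VerbInput` into a DataFrame and re-boxed the result in a `TableContainer` is deleted, and the former `*_df` helper is promoted to the public function. A minimal, self-contained sketch of the resulting calling convention — every name below is an illustrative stand-in, not the repo's API:

```python
import asyncio

import pandas as pd


# Hypothetical stand-in for any migrated operation: plain DataFrame in,
# plain DataFrame out, with no VerbInput/TableContainer boxing around it.
async def extract_things(input: pd.DataFrame, column: str, to: str) -> pd.DataFrame:
    output = input.copy()
    output[to] = output[column].str.upper()  # stand-in for the real strategy call
    return output


async def main() -> None:
    frame = pd.DataFrame({"text": ["alpha", "beta"]})
    result = await extract_things(frame, column="text", to="things")
    print(result)


asyncio.run(main())
```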
- """Extract claims from a piece of text.""" - source = cast(pd.DataFrame, input.get_input()) - output = await extract_covariates_df( - source, - cache, - callbacks, - column, - covariate_type, - strategy, - async_mode, - entity_types, - **kwargs, - ) - return TableContainer(table=output) - - -async def extract_covariates_df( input: pd.DataFrame, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, column: str, covariate_type: str, strategy: dict[str, Any] | None, @@ -113,9 +72,9 @@ def load_strategy(strategy_type: ExtractClaimsStrategyType) -> CovariateExtractS """Load strategy method definition.""" match strategy_type: case ExtractClaimsStrategyType.graph_intelligence: - from .strategies.graph_intelligence import run as run_gi + from .strategies import run_graph_intelligence - return run_gi + return run_graph_intelligence case _: msg = f"Unknown strategy: {strategy_type}" raise ValueError(msg) diff --git a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/run_gi_extract_claims.py b/graphrag/index/operations/extract_covariates/strategies.py similarity index 80% rename from graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/run_gi_extract_claims.py rename to graphrag/index/operations/extract_covariates/strategies.py index b9315b2d..2ef83e51 100644 --- a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/run_gi_extract_claims.py +++ b/graphrag/index/operations/extract_covariates/strategies.py @@ -9,35 +9,31 @@ from typing import Any from datashaper import VerbCallbacks import graphrag.config.defaults as defs -from graphrag.config.enums import LLMType from graphrag.index.cache import PipelineCache from graphrag.index.graph.extractors.claims import ClaimExtractor from graphrag.index.llm import load_llm -from graphrag.index.verbs.covariates.typing import ( +from graphrag.llm import CompletionLLM + +from .typing import ( Covariate, CovariateExtractionResult, ) -from graphrag.llm import CompletionLLM - -from .defaults import MOCK_LLM_RESPONSES -async def run( +async def run_graph_intelligence( input: str | Iterable[str], entity_types: list[str], resolved_entities_map: dict[str, str], - reporter: VerbCallbacks, - pipeline_cache: PipelineCache, + callbacks: VerbCallbacks, + cache: PipelineCache, strategy_config: dict[str, Any], ) -> CovariateExtractionResult: """Run the Claim extraction chain.""" - llm_config = strategy_config.get( - "llm", {"type": LLMType.StaticResponse, "responses": MOCK_LLM_RESPONSES} - ) - llm_type = llm_config.get("type", LLMType.StaticResponse) - llm = load_llm("claim_extraction", llm_type, reporter, pipeline_cache, llm_config) + llm_config = strategy_config.get("llm", {}) + llm_type = llm_config.get("type") + llm = load_llm("claim_extraction", llm_type, callbacks, cache, llm_config) return await _execute( - llm, input, entity_types, resolved_entities_map, reporter, strategy_config + llm, input, entity_types, resolved_entities_map, callbacks, strategy_config ) @@ -46,7 +42,7 @@ async def _execute( texts: Iterable[str], entity_types: list[str], resolved_entities_map: dict[str, str], - reporter: VerbCallbacks, + callbacks: VerbCallbacks, strategy_config: dict[str, Any], ) -> CovariateExtractionResult: extraction_prompt = strategy_config.get("extraction_prompt") @@ -62,7 +58,7 @@ async def _execute( max_gleanings=max_gleanings, encoding_model=encoding_model, on_error=lambda e, s, d: ( - reporter.error("Claim Extraction Error", e, s, d) if reporter else None + 
callbacks.error("Claim Extraction Error", e, s, d) if callbacks else None ), ) diff --git a/graphrag/index/verbs/covariates/typing.py b/graphrag/index/operations/extract_covariates/typing.py similarity index 81% rename from graphrag/index/verbs/covariates/typing.py rename to graphrag/index/operations/extract_covariates/typing.py index 0e0c5fb7..c0cb9663 100644 --- a/graphrag/index/verbs/covariates/typing.py +++ b/graphrag/index/operations/extract_covariates/typing.py @@ -5,6 +5,7 @@ from collections.abc import Awaitable, Callable, Iterable from dataclasses import dataclass +from enum import Enum from typing import Any from datashaper import VerbCallbacks @@ -48,3 +49,13 @@ CovariateExtractStrategy = Callable[ ], Awaitable[CovariateExtractionResult], ] + + +class ExtractClaimsStrategyType(str, Enum): + """ExtractClaimsStrategyType class definition.""" + + graph_intelligence = "graph_intelligence" + + def __repr__(self): + """Get a string representation.""" + return f'"{self.value}"' diff --git a/graphrag/index/verbs/entities/extraction/__init__.py b/graphrag/index/operations/extract_entities/__init__.py similarity index 51% rename from graphrag/index/verbs/entities/extraction/__init__.py rename to graphrag/index/operations/extract_entities/__init__.py index 46e6d545..579b57df 100644 --- a/graphrag/index/verbs/entities/extraction/__init__.py +++ b/graphrag/index/operations/extract_entities/__init__.py @@ -3,6 +3,6 @@ """The Indexing Engine entities extraction package root.""" -from .entity_extract import ExtractEntityStrategyType, entity_extract +from .extract_entities import ExtractEntityStrategyType, extract_entities -__all__ = ["ExtractEntityStrategyType", "entity_extract"] +__all__ = ["ExtractEntityStrategyType", "extract_entities"] diff --git a/graphrag/index/verbs/entities/extraction/entity_extract.py b/graphrag/index/operations/extract_entities/extract_entities.py similarity index 74% rename from graphrag/index/verbs/entities/extraction/entity_extract.py rename to graphrag/index/operations/extract_entities/extract_entities.py index e5c8eff2..77f29dd6 100644 --- a/graphrag/index/verbs/entities/extraction/entity_extract.py +++ b/graphrag/index/operations/extract_entities/extract_entities.py @@ -5,16 +5,13 @@ import logging from enum import Enum -from typing import Any, cast +from typing import Any import pandas as pd from datashaper import ( AsyncType, - TableContainer, VerbCallbacks, - VerbInput, derive_from_rows, - verb, ) from graphrag.index.bootstrap import bootstrap @@ -40,43 +37,10 @@ class ExtractEntityStrategyType(str, Enum): DEFAULT_ENTITY_TYPES = ["organization", "person", "geo", "event"] -@verb(name="entity_extract") -async def entity_extract( - input: VerbInput, - cache: PipelineCache, - callbacks: VerbCallbacks, - column: str, - id_column: str, - to: str, - strategy: dict[str, Any] | None, - graph_to: str | None = None, - async_mode: AsyncType = AsyncType.AsyncIO, - entity_types=DEFAULT_ENTITY_TYPES, - **kwargs, -) -> TableContainer: - """Extract entities from a piece of text.""" - source = cast(pd.DataFrame, input.get_input()) - output = await entity_extract_df( - source, - cache, - callbacks, - column, - id_column, - to, - strategy, - graph_to, - async_mode, - entity_types, - **kwargs, - ) - - return TableContainer(table=output) - - -async def entity_extract_df( +async def extract_entities( input: pd.DataFrame, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, column: str, id_column: str, to: str, @@ -90,24 +54,7 @@ async def entity_extract_df( 
Extract entities from a piece of text. ## Usage - ### json - ```json - { - "verb": "entity_extract", - "args": { - "column": "the_document_text_column_to_extract_entities_from", /* In general this will be your document text column */ - "id_column": "the_column_with_the_unique_id_for_each_row", /* In general this will be your document id */ - "to": "the_column_to_output_the_entities_to", /* This will be a list[dict[str, Any]] a list of entities, with a name, and additional attributes */ - "graph_to": "the_column_to_output_the_graphml_to", /* Optional: This will be a graphml graph in string form which represents the entities and their relationships */ - "strategy": {...} , see strategies section below - "entity_types": ["list", "of", "entity", "types", "to", "extract"] /* Optional: This will limit the entity types extracted, default: ["organization", "person", "geo", "event"] */ - "summarize_descriptions" : true | false /* Optional: This will summarize the descriptions of the entities and relationships, default: true */ - } - } - ``` - ### yaml ```yaml - verb: entity_extract args: column: the_document_text_column_to_extract_entities_from id_column: the_column_with_the_unique_id_for_each_row @@ -218,9 +165,9 @@ def _load_strategy(strategy_type: ExtractEntityStrategyType) -> EntityExtractStr """Load strategy method definition.""" match strategy_type: case ExtractEntityStrategyType.graph_intelligence: - from .strategies.graph_intelligence import run_gi + from .strategies.graph_intelligence import run_graph_intelligence - return run_gi + return run_graph_intelligence case ExtractEntityStrategyType.nltk: bootstrap() diff --git a/graphrag/index/verbs/entities/extraction/strategies/__init__.py b/graphrag/index/operations/extract_entities/strategies/__init__.py similarity index 100% rename from graphrag/index/verbs/entities/extraction/strategies/__init__.py rename to graphrag/index/operations/extract_entities/strategies/__init__.py diff --git a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/run_graph_intelligence.py b/graphrag/index/operations/extract_entities/strategies/graph_intelligence.py similarity index 82% rename from graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/run_graph_intelligence.py rename to graphrag/index/operations/extract_entities/strategies/graph_intelligence.py index 06284879..1536df34 100644 --- a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/run_graph_intelligence.py +++ b/graphrag/index/operations/extract_entities/strategies/graph_intelligence.py @@ -1,51 +1,49 @@ # Copyright (c) 2024 Microsoft Corporation. 
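Both extraction operations now resolve their strategy the same way: match on a `str`-valued enum, import the runner lazily so optional dependencies (NLTK, for example) only load when selected, and return it. A compact, runnable sketch of that dispatch shape, using hypothetical names:

```python
from collections.abc import Callable
from enum import Enum


class ExampleStrategyType(str, Enum):
    """Hypothetical mirror of the strategy enums used in this patch."""

    graph_intelligence = "graph_intelligence"


def run_graph_intelligence(text: str) -> str:
    return f"processed: {text}"  # stand-in for the real strategy runner


def load_strategy(strategy_type: ExampleStrategyType) -> Callable[[str], str]:
    match strategy_type:
        case ExampleStrategyType.graph_intelligence:
            return run_graph_intelligence
        case _:
            msg = f"Unknown strategy: {strategy_type}"
            raise ValueError(msg)


runner = load_strategy(ExampleStrategyType("graph_intelligence"))
print(runner("hello"))  # processed: hello
```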
# Licensed under the MIT License -"""A module containing run_gi, run_extract_entities and _create_text_splitter methods to run graph intelligence.""" +"""A module containing run_graph_intelligence, run_extract_entities and _create_text_splitter methods to run graph intelligence.""" import networkx as nx from datashaper import VerbCallbacks import graphrag.config.defaults as defs -from graphrag.config.enums import LLMType from graphrag.index.cache import PipelineCache -from graphrag.index.graph.extractors.graph import GraphExtractor +from graphrag.index.graph.extractors import GraphExtractor from graphrag.index.llm import load_llm from graphrag.index.text_splitting import ( NoopTextSplitter, TextSplitter, TokenTextSplitter, ) -from graphrag.index.verbs.entities.extraction.strategies.typing import ( +from graphrag.llm import CompletionLLM + +from .typing import ( Document, EntityExtractionResult, EntityTypes, StrategyConfig, ) -from graphrag.llm import CompletionLLM - -from .defaults import DEFAULT_LLM_CONFIG -async def run_gi( +async def run_graph_intelligence( docs: list[Document], entity_types: EntityTypes, - reporter: VerbCallbacks, - pipeline_cache: PipelineCache, + callbacks: VerbCallbacks, + cache: PipelineCache, args: StrategyConfig, ) -> EntityExtractionResult: """Run the graph intelligence entity extraction strategy.""" - llm_config = args.get("llm", DEFAULT_LLM_CONFIG) - llm_type = llm_config.get("type", LLMType.StaticResponse) - llm = load_llm("entity_extraction", llm_type, reporter, pipeline_cache, llm_config) - return await run_extract_entities(llm, docs, entity_types, reporter, args) + llm_config = args.get("llm", {}) + llm_type = llm_config.get("type") + llm = load_llm("entity_extraction", llm_type, callbacks, cache, llm_config) + return await run_extract_entities(llm, docs, entity_types, callbacks, args) async def run_extract_entities( llm: CompletionLLM, docs: list[Document], entity_types: EntityTypes, - reporter: VerbCallbacks | None, + callbacks: VerbCallbacks | None, args: StrategyConfig, ) -> EntityExtractionResult: """Run the entity extraction chain.""" @@ -76,7 +74,7 @@ async def run_extract_entities( encoding_model=encoding_model, max_gleanings=max_gleanings, on_error=lambda e, s, d: ( - reporter.error("Entity Extraction Error", e, s, d) if reporter else None + callbacks.error("Entity Extraction Error", e, s, d) if callbacks else None ), ) text_list = [doc.text.strip() for doc in docs] diff --git a/graphrag/index/verbs/entities/extraction/strategies/nltk.py b/graphrag/index/operations/extract_entities/strategies/nltk.py similarity index 95% rename from graphrag/index/verbs/entities/extraction/strategies/nltk.py rename to graphrag/index/operations/extract_entities/strategies/nltk.py index 48d4dae4..9403c5a5 100644 --- a/graphrag/index/verbs/entities/extraction/strategies/nltk.py +++ b/graphrag/index/operations/extract_entities/strategies/nltk.py @@ -19,8 +19,8 @@ words.ensure_loaded() async def run( # noqa RUF029 async is required for interface docs: list[Document], entity_types: EntityTypes, - reporter: VerbCallbacks, # noqa ARG001 - pipeline_cache: PipelineCache, # noqa ARG001 + callbacks: VerbCallbacks, # noqa ARG001 + cache: PipelineCache, # noqa ARG001 args: StrategyConfig, # noqa ARG001 ) -> EntityExtractionResult: """Run method definition.""" diff --git a/graphrag/index/verbs/entities/extraction/strategies/typing.py b/graphrag/index/operations/extract_entities/strategies/typing.py similarity index 100% rename from 
graphrag/index/verbs/entities/extraction/strategies/typing.py rename to graphrag/index/operations/extract_entities/strategies/typing.py diff --git a/graphrag/index/verbs/graph/layout/__init__.py b/graphrag/index/operations/layout_graph/__init__.py similarity index 100% rename from graphrag/index/verbs/graph/layout/__init__.py rename to graphrag/index/operations/layout_graph/__init__.py diff --git a/graphrag/index/verbs/graph/layout/layout_graph.py b/graphrag/index/operations/layout_graph/layout_graph.py similarity index 82% rename from graphrag/index/verbs/graph/layout/layout_graph.py rename to graphrag/index/operations/layout_graph/layout_graph.py index 9721fdfa..d2b23266 100644 --- a/graphrag/index/verbs/graph/layout/layout_graph.py +++ b/graphrag/index/operations/layout_graph/layout_graph.py @@ -8,10 +8,10 @@ from typing import Any, cast import networkx as nx import pandas as pd -from datashaper import TableContainer, VerbCallbacks, VerbInput, progress_callback, verb +from datashaper import VerbCallbacks, progress_callback from graphrag.index.graph.visualization import GraphLayout -from graphrag.index.operations.embed_graph.typing import NodeEmbeddings +from graphrag.index.operations.embed_graph import NodeEmbeddings from graphrag.index.utils import load_graph @@ -26,23 +26,20 @@ class LayoutGraphStrategyType(str, Enum): return f'"{self.value}"' -@verb(name="layout_graph") def layout_graph( - input: VerbInput, + input_df: pd.DataFrame, callbacks: VerbCallbacks, strategy: dict[str, Any], embeddings_column: str, graph_column: str, to: str, graph_to: str | None = None, - **_kwargs: dict, -) -> TableContainer: +): """ Apply a layout algorithm to a graph. The graph is expected to be in graphml format. The verb outputs a new column containing the laid out graph. 
## Usage ```yaml - verb: layout_graph args: graph_column: clustered_graph # The name of the column containing the graph, should be a graphml graph embeddings_column: embeddings # The name of the column containing the embeddings @@ -63,24 +60,6 @@ def layout_graph( min_dist: 0.75 # Optional, The min distance to use for the umap algorithm, default: 0.75 ``` """ - input_df = cast(pd.DataFrame, input.get_input()) - output_df = layout_graph_df( - input_df, callbacks, strategy, embeddings_column, graph_column, to, graph_to - ) - - return TableContainer(table=output_df) - - -def layout_graph_df( - input_df: pd.DataFrame, - callbacks: VerbCallbacks, - strategy: dict[str, Any], - embeddings_column: str, - graph_column: str, - to: str, - graph_to: str | None = None, -): - """Apply a layout algorithm to a graph.""" output_df = input_df num_items = len(output_df) strategy_type = strategy.get("type", LayoutGraphStrategyType.umap) @@ -118,7 +97,7 @@ def _run_layout( graphml_or_graph: str | nx.Graph, embeddings: NodeEmbeddings, args: dict[str, Any], - reporter: VerbCallbacks, + callbacks: VerbCallbacks, ) -> GraphLayout: graph = load_graph(graphml_or_graph) match strategy: @@ -129,7 +108,7 @@ def _run_layout( graph, embeddings, args, - lambda e, stack, d: reporter.error("Error in Umap", e, stack, d), + lambda e, stack, d: callbacks.error("Error in Umap", e, stack, d), ) case LayoutGraphStrategyType.zero: from .methods.zero import run as run_zero @@ -137,7 +116,7 @@ def _run_layout( return run_zero( graph, args, - lambda e, stack, d: reporter.error("Error in Zero", e, stack, d), + lambda e, stack, d: callbacks.error("Error in Zero", e, stack, d), ) case _: msg = f"Unknown strategy {strategy}" diff --git a/graphrag/index/verbs/graph/layout/methods/__init__.py b/graphrag/index/operations/layout_graph/methods/__init__.py similarity index 100% rename from graphrag/index/verbs/graph/layout/methods/__init__.py rename to graphrag/index/operations/layout_graph/methods/__init__.py diff --git a/graphrag/index/verbs/graph/layout/methods/umap.py b/graphrag/index/operations/layout_graph/methods/umap.py similarity index 97% rename from graphrag/index/verbs/graph/layout/methods/umap.py rename to graphrag/index/operations/layout_graph/methods/umap.py index d0e00b3c..636fd9a6 100644 --- a/graphrag/index/verbs/graph/layout/methods/umap.py +++ b/graphrag/index/operations/layout_graph/methods/umap.py @@ -15,7 +15,7 @@ from graphrag.index.graph.visualization import ( NodePosition, compute_umap_positions, ) -from graphrag.index.operations.embed_graph.typing import NodeEmbeddings +from graphrag.index.operations.embed_graph import NodeEmbeddings from graphrag.index.typing import ErrorHandlerFn # TODO: This could be handled more elegantly, like what columns to use diff --git a/graphrag/index/verbs/graph/layout/methods/zero.py b/graphrag/index/operations/layout_graph/methods/zero.py similarity index 100% rename from graphrag/index/verbs/graph/layout/methods/zero.py rename to graphrag/index/operations/layout_graph/methods/zero.py diff --git a/graphrag/index/verbs/graph/merge/__init__.py b/graphrag/index/operations/merge_graphs/__init__.py similarity index 60% rename from graphrag/index/verbs/graph/merge/__init__.py rename to graphrag/index/operations/merge_graphs/__init__.py index f7188279..f3b957dd 100644 --- a/graphrag/index/verbs/graph/merge/__init__.py +++ b/graphrag/index/operations/merge_graphs/__init__.py @@ -1,8 +1,10 @@ # Copyright (c) 2024 Microsoft Corporation. 
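For orientation, the `zero` layout method referenced in the docs above is the no-UMAP fallback: every node is pinned at the origin, with degree carried through as a size hint. A rough, self-contained approximation of that behavior — illustrative only, not the exact shape returned by `methods/zero.py`:

```python
import networkx as nx


# Illustrative approximation of the zero-layout fallback: all nodes at the
# origin, node degree used as a size hint.
def zero_layout(graph: nx.Graph) -> list[dict]:
    return [
        {"label": str(node), "x": 0.0, "y": 0.0, "size": float(graph.degree(node))}
        for node in graph.nodes()
    ]


g = nx.karate_club_graph()
print(zero_layout(g)[:3])
```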
# Licensed under the MIT License -"""The Indexing Engine graph merge package root.""" +"""merge_graphs operation.""" from .merge_graphs import merge_graphs -__all__ = ["merge_graphs"] +__all__ = [ + "merge_graphs", +] diff --git a/graphrag/index/verbs/graph/merge/merge_graphs.py b/graphrag/index/operations/merge_graphs/merge_graphs.py similarity index 90% rename from graphrag/index/verbs/graph/merge/merge_graphs.py rename to graphrag/index/operations/merge_graphs/merge_graphs.py index d5a4c9f5..ca654e6e 100644 --- a/graphrag/index/verbs/graph/merge/merge_graphs.py +++ b/graphrag/index/operations/merge_graphs/merge_graphs.py @@ -7,15 +7,10 @@ from typing import Any, cast import networkx as nx import pandas as pd -from datashaper import TableContainer, VerbCallbacks, VerbInput, progress_iterable, verb +from datashaper import VerbCallbacks, progress_iterable from graphrag.index.utils import load_graph -from .defaults import ( - DEFAULT_CONCAT_SEPARATOR, - DEFAULT_EDGE_OPERATIONS, - DEFAULT_NODE_OPERATIONS, -) from .typing import ( BasicMergeOperation, DetailedAttributeMergeOperation, @@ -23,25 +18,23 @@ from .typing import ( StringOperation, ) +DEFAULT_NODE_OPERATIONS = { + "*": { + "operation": BasicMergeOperation.Replace, + } +} + +DEFAULT_EDGE_OPERATIONS = { + "*": { + "operation": BasicMergeOperation.Replace, + }, + "weight": "sum", +} + +DEFAULT_CONCAT_SEPARATOR = "," + -@verb(name="merge_graphs") def merge_graphs( - input: VerbInput, - callbacks: VerbCallbacks, - column: str, - to: str, - nodes: dict[str, Any] = DEFAULT_NODE_OPERATIONS, - edges: dict[str, Any] = DEFAULT_EDGE_OPERATIONS, - **_kwargs, -) -> TableContainer: - """Merge multiple graphs together. The graphs are expected to be in graphml format. The verb outputs a new column containing the merged graph.""" - input_df = cast(pd.DataFrame, input.get_input()) - output = merge_graphs_df(input_df, callbacks, column, to, nodes, edges) - - return TableContainer(table=output) - - -def merge_graphs_df( input: pd.DataFrame, callbacks: VerbCallbacks, column: str, diff --git a/graphrag/index/verbs/graph/merge/typing.py b/graphrag/index/operations/merge_graphs/typing.py similarity index 100% rename from graphrag/index/verbs/graph/merge/typing.py rename to graphrag/index/operations/merge_graphs/typing.py diff --git a/graphrag/index/verbs/snapshot.py b/graphrag/index/operations/snapshot.py similarity index 58% rename from graphrag/index/verbs/snapshot.py rename to graphrag/index/operations/snapshot.py index 032e1951..2b85c3ea 100644 --- a/graphrag/index/verbs/snapshot.py +++ b/graphrag/index/operations/snapshot.py @@ -3,31 +3,12 @@ """A module containing snapshot method definition.""" -from typing import cast - import pandas as pd -from datashaper import TableContainer, VerbInput, verb from graphrag.index.storage import PipelineStorage -@verb(name="snapshot") async def snapshot( - input: VerbInput, - name: str, - formats: list[str], - storage: PipelineStorage, - **_kwargs: dict, -) -> TableContainer: - """Take a entire snapshot of the tabular data.""" - source = cast(pd.DataFrame, input.get_input()) - - await snapshot_df(source, name, formats, storage) - - return TableContainer(table=source) - - -async def snapshot_df( input: pd.DataFrame, name: str, formats: list[str], diff --git a/graphrag/index/verbs/snapshot_rows.py b/graphrag/index/operations/snapshot_rows.py similarity index 79% rename from graphrag/index/verbs/snapshot_rows.py rename to graphrag/index/operations/snapshot_rows.py index e4b567aa..5abd771b 100644 --- 
a/graphrag/index/verbs/snapshot_rows.py +++ b/graphrag/index/operations/snapshot_rows.py @@ -5,10 +5,9 @@ import json from dataclasses import dataclass -from typing import Any, cast +from typing import Any import pandas as pd -from datashaper import TableContainer, VerbInput, verb from graphrag.index.storage import PipelineStorage @@ -21,30 +20,7 @@ class FormatSpecifier: extension: str -@verb(name="snapshot_rows") async def snapshot_rows( - input: VerbInput, - column: str | None, - base_name: str, - storage: PipelineStorage, - formats: list[str | dict[str, Any]], - row_name_column: str | None = None, - **_kwargs: dict, -) -> TableContainer: - """Take a by-row snapshot of the tabular data.""" - source = cast(pd.DataFrame, input.get_input()) - await snapshot_rows_df( - source, - column, - base_name, - storage, - formats, - row_name_column, - ) - return TableContainer(table=source) - - -async def snapshot_rows_df( input: pd.DataFrame, column: str | None, base_name: str, diff --git a/graphrag/index/operations/split_text.py b/graphrag/index/operations/split_text.py new file mode 100644 index 00000000..7a9a9076 --- /dev/null +++ b/graphrag/index/operations/split_text.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""A module containing the split_text method definition.""" + +import pandas as pd + + +def split_text( + input: pd.DataFrame, column: str, to: str, separator: str = "," +) -> pd.DataFrame: + """Split a column into a list of strings.""" + output = input + + def _apply_split(row): + if row[column] is None or isinstance(row[column], list): + return row[column] + if row[column] == "": + return [] + if not isinstance(row[column], str): + message = f"Expected {column} to be a string, but got {type(row[column])}" + raise TypeError(message) + return row[column].split(separator) + + output[to] = output.apply(_apply_split, axis=1) + return output diff --git a/graphrag/index/operations/summarize_communities/__init__.py b/graphrag/index/operations/summarize_communities/__init__.py new file mode 100644 index 00000000..d3065198 --- /dev/null +++ b/graphrag/index/operations/summarize_communities/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024 Microsoft Corporation. 
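The new `split_text` module above is small enough to exercise directly. A usage sketch, assuming graphrag is installed at this revision so the new module path resolves:

```python
import pandas as pd

from graphrag.index.operations.split_text import split_text  # module added above

frame = pd.DataFrame({"tags": ["a,b,c", "", None]})
result = split_text(frame, column="tags", to="tag_list")
print(result["tag_list"].tolist())  # [['a', 'b', 'c'], [], None]
```

Note from the code above that `output = input`, so the frame passed in is modified in place as well as returned — the same behavior as the verb it replaces.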
+# Licensed under the MIT License + +"""Community summarization modules.""" + +from .prepare_community_reports import prepare_community_reports +from .restore_community_hierarchy import restore_community_hierarchy +from .summarize_communities import summarize_communities +from .typing import CreateCommunityReportsStrategyType + +__all__ = [ + "CreateCommunityReportsStrategyType", + "prepare_community_reports", + "restore_community_hierarchy", + "summarize_communities", +] diff --git a/graphrag/index/verbs/graph/report/prepare_community_reports.py b/graphrag/index/operations/summarize_communities/prepare_community_reports.py similarity index 87% rename from graphrag/index/verbs/graph/report/prepare_community_reports.py rename to graphrag/index/operations/summarize_communities/prepare_community_reports.py index a6a3a24f..1a54f7d9 100644 --- a/graphrag/index/verbs/graph/report/prepare_community_reports.py +++ b/graphrag/index/operations/summarize_communities/prepare_community_reports.py @@ -8,11 +8,8 @@ from typing import cast import pandas as pd from datashaper import ( - TableContainer, VerbCallbacks, - VerbInput, progress_iterable, - verb, ) import graphrag.index.graph.extractors.community_reports.schemas as schemas @@ -25,32 +22,11 @@ from graphrag.index.graph.extractors.community_reports import ( set_context_size, sort_context, ) -from graphrag.index.utils.ds_util import get_named_input_table, get_required_input_table log = logging.getLogger(__name__) -@verb(name="prepare_community_reports") def prepare_community_reports( - input: VerbInput, - callbacks: VerbCallbacks, - max_tokens: int = 16_000, - **_kwargs, -) -> TableContainer: - """Prep communities for report generation.""" - # Prepare Community Reports - nodes = cast(pd.DataFrame, get_required_input_table(input, "nodes").table) - edges = cast(pd.DataFrame, get_required_input_table(input, "edges").table) - claims = get_named_input_table(input, "claims") - if claims: - claims = cast(pd.DataFrame, claims.table) - - output = prepare_community_reports_df(nodes, edges, claims, callbacks, max_tokens) - - return TableContainer(table=output) - - -def prepare_community_reports_df( nodes, edges, claims, diff --git a/graphrag/index/verbs/graph/report/restore_community_hierarchy.py b/graphrag/index/operations/summarize_communities/restore_community_hierarchy.py similarity index 78% rename from graphrag/index/verbs/graph/report/restore_community_hierarchy.py rename to graphrag/index/operations/summarize_communities/restore_community_hierarchy.py index 2bf5fd00..368e4b05 100644 --- a/graphrag/index/verbs/graph/report/restore_community_hierarchy.py +++ b/graphrag/index/operations/summarize_communities/restore_community_hierarchy.py @@ -4,37 +4,15 @@ """A module containing create_graph, _get_node_attributes, _get_edge_attributes and _get_attribute_column_mapping methods definition.""" import logging -from typing import cast import pandas as pd -from datashaper import TableContainer, VerbInput, verb import graphrag.index.graph.extractors.community_reports.schemas as schemas log = logging.getLogger(__name__) -@verb(name="restore_community_hierarchy") def restore_community_hierarchy( - input: VerbInput, - name_column: str = schemas.NODE_NAME, - community_column: str = schemas.NODE_COMMUNITY, - level_column: str = schemas.NODE_LEVEL, - **_kwargs, -) -> TableContainer: - """Restore the community hierarchy from the node data.""" - node_df: pd.DataFrame = cast(pd.DataFrame, input.get_input()) - - output = restore_community_hierarchy_df( - node_df, - 
name_column=name_column, - community_column=community_column, - level_column=level_column, - ) - return TableContainer(table=output) - - -def restore_community_hierarchy_df( input: pd.DataFrame, name_column: str = schemas.NODE_NAME, community_column: str = schemas.NODE_COMMUNITY, diff --git a/graphrag/index/verbs/graph/report/strategies/graph_intelligence/run_graph_intelligence.py b/graphrag/index/operations/summarize_communities/strategies.py similarity index 78% rename from graphrag/index/verbs/graph/report/strategies/graph_intelligence/run_graph_intelligence.py rename to graphrag/index/operations/summarize_communities/strategies.py index d9a5235b..2653e41f 100644 --- a/graphrag/index/verbs/graph/report/strategies/graph_intelligence/run_graph_intelligence.py +++ b/graphrag/index/operations/summarize_communities/strategies.py @@ -9,41 +9,37 @@ import traceback from datashaper import VerbCallbacks -from graphrag.config.enums import LLMType from graphrag.index.cache import PipelineCache from graphrag.index.graph.extractors.community_reports import ( CommunityReportsExtractor, ) from graphrag.index.llm import load_llm from graphrag.index.utils.rate_limiter import RateLimiter -from graphrag.index.verbs.graph.report.strategies.typing import ( +from graphrag.llm import CompletionLLM + +from .typing import ( CommunityReport, StrategyConfig, ) -from graphrag.llm import CompletionLLM -from .defaults import MOCK_RESPONSES +DEFAULT_CHUNK_SIZE = 3000 log = logging.getLogger(__name__) -async def run( +async def run_graph_intelligence( community: str | int, input: str, level: int, - reporter: VerbCallbacks, - pipeline_cache: PipelineCache, + callbacks: VerbCallbacks, + cache: PipelineCache, args: StrategyConfig, ) -> CommunityReport | None: """Run the graph intelligence entity extraction strategy.""" - llm_config = args.get( - "llm", {"type": LLMType.StaticResponse, "responses": MOCK_RESPONSES} - ) - llm_type = llm_config.get("type", LLMType.StaticResponse) - llm = load_llm( - "community_reporting", llm_type, reporter, pipeline_cache, llm_config - ) - return await _run_extractor(llm, community, input, level, args, reporter) + llm_config = args.get("llm", {}) + llm_type = llm_config.get("type") + llm = load_llm("community_reporting", llm_type, callbacks, cache, llm_config) + return await _run_extractor(llm, community, input, level, args, callbacks) async def _run_extractor( @@ -52,7 +48,7 @@ async def _run_extractor( input: str, level: int, args: StrategyConfig, - reporter: VerbCallbacks, + callbacks: VerbCallbacks, ) -> CommunityReport | None: # RateLimiter rate_limiter = RateLimiter(rate=1, per=60) @@ -60,7 +56,7 @@ async def _run_extractor( llm, extraction_prompt=args.get("extraction_prompt", None), max_report_length=args.get("max_report_length", None), - on_error=lambda e, stack, _data: reporter.error( + on_error=lambda e, stack, _data: callbacks.error( "Community Report Extraction Error", e, stack ), ) @@ -86,7 +82,7 @@ async def _run_extractor( ) except Exception as e: log.exception("Error processing community: %s", community) - reporter.error("Community Report Extraction Error", e, traceback.format_exc()) + callbacks.error("Community Report Extraction Error", e, traceback.format_exc()) return None diff --git a/graphrag/index/verbs/graph/report/create_community_reports.py b/graphrag/index/operations/summarize_communities/summarize_communities.py similarity index 67% rename from graphrag/index/verbs/graph/report/create_community_reports.py rename to 
graphrag/index/operations/summarize_communities/summarize_communities.py index 1764362c..a704fcc1 100644 --- a/graphrag/index/verbs/graph/report/create_community_reports.py +++ b/graphrag/index/operations/summarize_communities/summarize_communities.py @@ -4,19 +4,14 @@ """A module containing create_community_reports and load_strategy methods definition.""" import logging -from enum import Enum -from typing import cast import pandas as pd from datashaper import ( AsyncType, NoopVerbCallbacks, - TableContainer, VerbCallbacks, - VerbInput, derive_from_rows, progress_ticker, - verb, ) import graphrag.config.defaults as defaults @@ -26,54 +21,17 @@ from graphrag.index.graph.extractors.community_reports import ( get_levels, prep_community_report_context, ) -from graphrag.index.utils.ds_util import get_required_input_table -from .strategies.typing import CommunityReport, CommunityReportsStrategy +from .typing import ( + CommunityReport, + CommunityReportsStrategy, + CreateCommunityReportsStrategyType, +) log = logging.getLogger(__name__) -class CreateCommunityReportsStrategyType(str, Enum): - """CreateCommunityReportsStrategyType class definition.""" - - graph_intelligence = "graph_intelligence" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -@verb(name="create_community_reports") -async def create_community_reports( - input: VerbInput, - callbacks: VerbCallbacks, - cache: PipelineCache, - strategy: dict, - async_mode: AsyncType = AsyncType.AsyncIO, - num_threads: int = 4, - **_kwargs, -) -> TableContainer: - """Generate community summaries.""" - log.debug("create_community_reports strategy=%s", strategy) - local_contexts = cast(pd.DataFrame, input.get_input()) - nodes = get_required_input_table(input, "nodes").table - community_hierarchy = get_required_input_table(input, "community_hierarchy").table - - output = await create_community_reports_df( - local_contexts, - nodes, - community_hierarchy, - callbacks, - cache, - strategy, - async_mode=async_mode, - num_threads=num_threads, - ) - - return TableContainer(table=output) - - -async def create_community_reports_df( +async def summarize_communities( local_contexts, nodes, community_hierarchy, @@ -106,8 +64,8 @@ async def create_community_reports_df( community_id=record[schemas.NODE_COMMUNITY], community_level=record[schemas.COMMUNITY_LEVEL], community_context=record[schemas.CONTEXT_STRING], - cache=cache, callbacks=callbacks, + cache=cache, strategy=strategy, ) tick() @@ -127,8 +85,8 @@ async def create_community_reports_df( async def _generate_report( runner: CommunityReportsStrategy, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, strategy: dict, community_id: int | str, community_level: int, @@ -146,9 +104,9 @@ def load_strategy( """Load strategy method definition.""" match strategy: case CreateCommunityReportsStrategyType.graph_intelligence: - from .strategies.graph_intelligence import run + from .strategies import run_graph_intelligence - return run + return run_graph_intelligence case _: msg = f"Unknown strategy: {strategy}" raise ValueError(msg) diff --git a/graphrag/index/verbs/graph/report/strategies/typing.py b/graphrag/index/operations/summarize_communities/typing.py similarity index 78% rename from graphrag/index/verbs/graph/report/strategies/typing.py rename to graphrag/index/operations/summarize_communities/typing.py index 087c7247..a5a01cbb 100644 --- a/graphrag/index/verbs/graph/report/strategies/typing.py +++ 
b/graphrag/index/operations/summarize_communities/typing.py @@ -4,6 +4,7 @@ """A module containing 'Finding' and 'CommunityReport' models.""" from collections.abc import Awaitable, Callable +from enum import Enum from typing import Any from datashaper import VerbCallbacks @@ -50,3 +51,13 @@ CommunityReportsStrategy = Callable[ ], Awaitable[CommunityReport | None], ] + + +class CreateCommunityReportsStrategyType(str, Enum): + """CreateCommunityReportsStrategyType class definition.""" + + graph_intelligence = "graph_intelligence" + + def __repr__(self): + """Get a string representation.""" + return f'"{self.value}"' diff --git a/graphrag/index/operations/summarize_descriptions/__init__.py b/graphrag/index/operations/summarize_descriptions/__init__.py new file mode 100644 index 00000000..55f818d1 --- /dev/null +++ b/graphrag/index/operations/summarize_descriptions/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Root package for description summarization.""" + +from .summarize_descriptions import summarize_descriptions +from .typing import SummarizationStrategy, SummarizeStrategyType + +__all__ = [ + "SummarizationStrategy", + "SummarizeStrategyType", + "summarize_descriptions", +] diff --git a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/run_graph_intelligence.py b/graphrag/index/operations/summarize_descriptions/strategies.py similarity index 69% rename from graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/run_graph_intelligence.py rename to graphrag/index/operations/summarize_descriptions/strategies.py index 57a1ecd2..91ff0d31 100644 --- a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/run_graph_intelligence.py +++ b/graphrag/index/operations/summarize_descriptions/strategies.py @@ -1,38 +1,34 @@ # Copyright (c) 2024 Microsoft Corporation. 
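The strategy enums being consolidated into these `typing.py` modules are all `str` subclasses with a quoting `__repr__`. That combination lets raw strings from YAML config compare equal to enum members and keeps dumped configs readable. A quick demonstration using the enum exactly as defined above:

```python
from enum import Enum


class CreateCommunityReportsStrategyType(str, Enum):
    """CreateCommunityReportsStrategyType class definition."""

    graph_intelligence = "graph_intelligence"

    def __repr__(self):
        """Get a string representation."""
        return f'"{self.value}"'


member = CreateCommunityReportsStrategyType.graph_intelligence
print(member == "graph_intelligence")  # True: str subclass compares to raw config values
print(repr(member))                    # "graph_intelligence" (quoted, log-friendly)
```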
# Licensed under the MIT License -"""A module containing run_gi, run_resolve_entities and _create_text_list_splitter methods to run graph intelligence.""" +"""A module containing run_graph_intelligence, run_resolve_entities and _create_text_list_splitter methods to run graph intelligence.""" from datashaper import VerbCallbacks -from graphrag.config.enums import LLMType from graphrag.index.cache import PipelineCache from graphrag.index.graph.extractors.summarize import SummarizeExtractor from graphrag.index.llm import load_llm -from graphrag.index.verbs.entities.summarize.strategies.typing import ( +from graphrag.llm import CompletionLLM + +from .typing import ( StrategyConfig, SummarizedDescriptionResult, ) -from graphrag.llm import CompletionLLM - -from .defaults import DEFAULT_LLM_CONFIG -async def run( +async def run_graph_intelligence( described_items: str | tuple[str, str], descriptions: list[str], - reporter: VerbCallbacks, - pipeline_cache: PipelineCache, + callbacks: VerbCallbacks, + cache: PipelineCache, args: StrategyConfig, ) -> SummarizedDescriptionResult: """Run the graph intelligence entity extraction strategy.""" - llm_config = args.get("llm", DEFAULT_LLM_CONFIG) - llm_type = llm_config.get("type", LLMType.StaticResponse) - llm = load_llm( - "summarize_descriptions", llm_type, reporter, pipeline_cache, llm_config - ) + llm_config = args.get("llm", {}) + llm_type = llm_config.get("type") + llm = load_llm("summarize_descriptions", llm_type, callbacks, cache, llm_config) return await run_summarize_descriptions( - llm, described_items, descriptions, reporter, args + llm, described_items, descriptions, callbacks, args ) @@ -40,7 +36,7 @@ async def run_summarize_descriptions( llm: CompletionLLM, items: str | tuple[str, str], descriptions: list[str], - reporter: VerbCallbacks, + callbacks: VerbCallbacks, args: StrategyConfig, ) -> SummarizedDescriptionResult: """Run the entity extraction chain.""" @@ -56,8 +52,8 @@ async def run_summarize_descriptions( entity_name_key=entity_name_key, input_descriptions_key=input_descriptions_key, on_error=lambda e, stack, details: ( - reporter.error("Entity Extraction Error", e, stack, details) - if reporter + callbacks.error("Entity Extraction Error", e, stack, details) + if callbacks else None ), max_summary_length=args.get("max_summary_length", None), diff --git a/graphrag/index/verbs/entities/summarize/description_summarize.py b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py similarity index 83% rename from graphrag/index/verbs/entities/summarize/description_summarize.py rename to graphrag/index/operations/summarize_descriptions/summarize_descriptions.py index 40200b4c..40af8dfc 100644 --- a/graphrag/index/verbs/entities/summarize/description_summarize.py +++ b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py @@ -5,72 +5,32 @@ import asyncio import logging -from enum import Enum -from typing import Any, NamedTuple, cast +from typing import Any, cast import networkx as nx import pandas as pd from datashaper import ( ProgressTicker, - TableContainer, VerbCallbacks, - VerbInput, progress_ticker, - verb, ) from graphrag.index.cache import PipelineCache from graphrag.index.utils import load_graph -from .strategies.typing import SummarizationStrategy +from .typing import ( + DescriptionSummarizeRow, + SummarizationStrategy, + SummarizeStrategyType, +) log = logging.getLogger(__name__) -class DescriptionSummarizeRow(NamedTuple): - """DescriptionSummarizeRow class definition.""" - - graph: Any - - 
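For context on what `summarize_descriptions` does with the graph it receives: each node and edge accumulates a list of candidate descriptions during extraction, and the summarization strategy collapses that list into a single description. A self-contained sketch of the data flow, with a trivial dedupe-and-join standing in for the LLM-backed `SummarizeExtractor`:

```python
import networkx as nx


def summarize(descriptions: list[str]) -> str:
    """Stand-in for the LLM summarizer: dedupe (order-preserving) and join."""
    return " ".join(dict.fromkeys(descriptions))


graph = nx.Graph()
graph.add_node(
    "COMPANY_A",
    description=[
        "Company_A is a test company",
        "Company_A is a test company",
        "Person_C is its director",
    ],
)

# Walk the graph and collapse each node's description list in place.
for _, data in graph.nodes(data=True):
    data["description"] = summarize(data["description"])

print(graph.nodes["COMPANY_A"]["description"])
```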
-class SummarizeStrategyType(str, Enum): - """SummarizeStrategyType class definition.""" - - graph_intelligence = "graph_intelligence" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -@verb(name="summarize_descriptions") async def summarize_descriptions( - input: VerbInput, - cache: PipelineCache, - callbacks: VerbCallbacks, - column: str, - to: str, - strategy: dict[str, Any] | None = None, - **kwargs, -) -> TableContainer: - """Summarize entity and relationship descriptions from an entity graph.""" - source = cast(pd.DataFrame, input.get_input()) - output = await summarize_descriptions_df( - source, - cache, - callbacks, - column=column, - to=to, - strategy=strategy, - **kwargs, - ) - return TableContainer(table=output) - - -async def summarize_descriptions_df( input: pd.DataFrame, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, column: str, to: str, strategy: dict[str, Any] | None = None, @@ -99,7 +59,6 @@ async def summarize_descriptions_df( ### yaml ```yaml - verb: entity_extract args: column: the_document_text_column_to_extract_descriptions_from to: the_column_to_output_the_summarized_descriptions_to @@ -221,9 +180,9 @@ def load_strategy(strategy_type: SummarizeStrategyType) -> SummarizationStrategy """Load strategy method definition.""" match strategy_type: case SummarizeStrategyType.graph_intelligence: - from .strategies.graph_intelligence import run as run_gi + from .strategies import run_graph_intelligence - return run_gi + return run_graph_intelligence case _: msg = f"Unknown strategy: {strategy_type}" raise ValueError(msg) diff --git a/graphrag/index/verbs/entities/summarize/strategies/typing.py b/graphrag/index/operations/summarize_descriptions/typing.py similarity index 63% rename from graphrag/index/verbs/entities/summarize/strategies/typing.py rename to graphrag/index/operations/summarize_descriptions/typing.py index 39829503..4e957cf4 100644 --- a/graphrag/index/verbs/entities/summarize/strategies/typing.py +++ b/graphrag/index/operations/summarize_descriptions/typing.py @@ -5,7 +5,8 @@ from collections.abc import Awaitable, Callable from dataclasses import dataclass -from typing import Any +from enum import Enum +from typing import Any, NamedTuple from datashaper import VerbCallbacks @@ -32,3 +33,19 @@ SummarizationStrategy = Callable[ ], Awaitable[SummarizedDescriptionResult], ] + + +class DescriptionSummarizeRow(NamedTuple): + """DescriptionSummarizeRow class definition.""" + + graph: Any + + +class SummarizeStrategyType(str, Enum): + """SummarizeStrategyType class definition.""" + + graph_intelligence = "graph_intelligence" + + def __repr__(self): + """Get a string representation.""" + return f'"{self.value}"' diff --git a/graphrag/index/verbs/graph/unpack.py b/graphrag/index/operations/unpack_graph.py similarity index 61% rename from graphrag/index/verbs/graph/unpack.py rename to graphrag/index/operations/unpack_graph.py index 3ab99a56..ad9f7381 100644 --- a/graphrag/index/verbs/graph/unpack.py +++ b/graphrag/index/operations/unpack_graph.py @@ -7,57 +7,20 @@ from typing import Any, cast import networkx as nx import pandas as pd -from datashaper import TableContainer, VerbCallbacks, VerbInput, progress_iterable, verb +from datashaper import VerbCallbacks, progress_iterable from graphrag.index.utils import load_graph default_copy = ["level"] -@verb(name="unpack_graph") def unpack_graph( - input: VerbInput, - callbacks: VerbCallbacks, - column: str, - type: str, # noqa A002 - copy: list[str] | None = 
None, - embeddings_column: str = "embeddings", - **kwargs, -) -> TableContainer: - """ - Unpack nodes or edges from a graphml graph, into a list of nodes or edges. - - This verb will create columns for each attribute in a node or edge. - - ## Usage - ```yaml - verb: unpack_graph - args: - type: node # The type of data to unpack, one of: node, edge. node will create a node list, edge will create an edge list - column: # The name of the column containing the graph, should be a graphml graph - ``` - """ - input_df = input.get_input() - output_df = unpack_graph_df( - cast(pd.DataFrame, input_df), - callbacks, - column, - type, - copy, - embeddings_column, - kwargs=kwargs, - ) - return TableContainer(table=output_df) - - -def unpack_graph_df( input_df: pd.DataFrame, callbacks: VerbCallbacks, column: str, type: str, # noqa A002 copy: list[str] | None = None, embeddings_column: str = "embeddings", - **kwargs, ) -> pd.DataFrame: """Unpack nodes or edges from a graphml graph, into a list of nodes or edges.""" if copy is None: @@ -83,7 +46,6 @@ def unpack_graph_df( cast(str | nx.Graph, row[column]), type, embeddings, - kwargs, ) ]) @@ -94,19 +56,18 @@ def _run_unpack( graphml_or_graph: str | nx.Graph, unpack_type: str, embeddings: dict[str, list[float]], - args: dict[str, Any], ) -> list[dict[str, Any]]: graph = load_graph(graphml_or_graph) if unpack_type == "nodes": - return _unpack_nodes(graph, embeddings, args) + return _unpack_nodes(graph, embeddings) if unpack_type == "edges": - return _unpack_edges(graph, args) + return _unpack_edges(graph) msg = f"Unknown type {unpack_type}" raise ValueError(msg) def _unpack_nodes( - graph: nx.Graph, embeddings: dict[str, list[float]], _args: dict[str, Any] + graph: nx.Graph, embeddings: dict[str, list[float]] ) -> list[dict[str, Any]]: return [ { @@ -118,7 +79,7 @@ def _unpack_nodes( ] -def _unpack_edges(graph: nx.Graph, _args: dict[str, Any]) -> list[dict[str, Any]]: +def _unpack_edges(graph: nx.Graph) -> list[dict[str, Any]]: return [ { "source": source_id, diff --git a/graphrag/index/run/run.py b/graphrag/index/run/run.py index 0ef973ed..dd50c4a1 100644 --- a/graphrag/index/run/run.py +++ b/graphrag/index/run/run.py @@ -47,7 +47,6 @@ from graphrag.index.typing import PipelineRunResult # Register all verbs from graphrag.index.update.dataframes import get_delta_docs, update_dataframe_outputs -from graphrag.index.verbs import * # noqa from graphrag.index.workflows import ( VerbDefinitions, WorkflowDefinitions, diff --git a/graphrag/index/verbs/__init__.py b/graphrag/index/verbs/__init__.py deleted file mode 100644 index 5859d983..00000000 --- a/graphrag/index/verbs/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
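With the unused `kwargs` plumbing stripped out above, `_unpack_edges` reduces to a plain attribute flatten. A self-contained sketch of the shape it produces, mirroring the trimmed function in the hunk above:

```python
import networkx as nx


def unpack_edges(graph: nx.Graph) -> list[dict]:
    """Flatten each edge into a dict: source, target, then edge attributes."""
    return [
        {"source": source_id, "target": target_id, **(edge_data or {})}
        for source_id, target_id, edge_data in graph.edges(data=True)
    ]


g = nx.Graph()
g.add_edge("COMPANY_A", "COMPANY_B", weight=2.0)
print(unpack_edges(g))  # [{'source': 'COMPANY_A', 'target': 'COMPANY_B', 'weight': 2.0}]
```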
-# Licensed under the MIT License - -"""A module containing get_default_verbs method definition.""" - -from .covariates import extract_covariates -from .entities import entity_extract, summarize_descriptions -from .genid import genid -from .graph import ( - cluster_graph, - create_community_reports, - create_graph, - layout_graph, - merge_graphs, - unpack_graph, -) -from .overrides import aggregate, concat -from .snapshot import snapshot -from .snapshot_rows import snapshot_rows -from .spread_json import spread_json -from .text import chunk, text_split, text_translate -from .unzip import unzip -from .zip import zip_verb - -__all__ = [ - "aggregate", - "chunk", - "cluster_graph", - "concat", - "create_community_reports", - "create_graph", - "entity_extract", - "extract_covariates", - "genid", - "layout_graph", - "merge_graphs", - "snapshot", - "snapshot_rows", - "spread_json", - "summarize_descriptions", - "text_split", - "text_translate", - "unpack_graph", - "unzip", - "zip_verb", -] diff --git a/graphrag/index/verbs/covariates/__init__.py b/graphrag/index/verbs/covariates/__init__.py deleted file mode 100644 index cdebee22..00000000 --- a/graphrag/index/verbs/covariates/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine covariates package root.""" - -from .extract_covariates import extract_covariates - -__all__ = ["extract_covariates"] diff --git a/graphrag/index/verbs/covariates/extract_covariates/strategies/__init__.py b/graphrag/index/verbs/covariates/extract_covariates/strategies/__init__.py deleted file mode 100644 index 605c66f8..00000000 --- a/graphrag/index/verbs/covariates/extract_covariates/strategies/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine text extract claims strategies package root.""" diff --git a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/__init__.py b/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/__init__.py deleted file mode 100644 index ab01f06f..00000000 --- a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine text extract claims strategies graph intelligence package root.""" - -from .run_gi_extract_claims import run - -__all__ = ["run"] diff --git a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/defaults.py b/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/defaults.py deleted file mode 100644 index a777f296..00000000 --- a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/defaults.py +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
Microsoft Corporation.
-# Licensed under the MIT License - -"""A file containing MOCK_LLM_RESPONSES definition.""" - -MOCK_LLM_RESPONSES = [ - """ -(COMPANY A<|>GOVERNMENT AGENCY B<|>ANTI-COMPETITIVE PRACTICES<|>TRUE<|>2022-01-10T00:00:00<|>2022-01-10T00:00:00<|>Company A was found to engage in anti-competitive practices because it was fined for bid rigging in multiple public tenders published by Government Agency B according to an article published on 2022/01/10<|>According to an article published on 2022/01/10, Company A was fined for bid rigging while participating in multiple public tenders published by Government Agency B.) - """.strip() -] diff --git a/graphrag/index/verbs/entities/__init__.py b/graphrag/index/verbs/entities/__init__.py deleted file mode 100644 index 2f55d710..00000000 --- a/graphrag/index/verbs/entities/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine entities package root.""" - -from .extraction import entity_extract -from .summarize import summarize_descriptions - -__all__ = ["entity_extract", "summarize_descriptions"] diff --git a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/__init__.py b/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/__init__.py deleted file mode 100644 index 083c0e41..00000000 --- a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine graph intelligence package root.""" - -from .run_graph_intelligence import run_gi - -__all__ = ["run_gi"] diff --git a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/defaults.py b/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/defaults.py deleted file mode 100644 index 237e6657..00000000 --- a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/defaults.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A file containing some default responses.""" - -from graphrag.config.enums import LLMType - -MOCK_LLM_RESPONSES = [ - """ - ("entity"<|>COMPANY_A<|>COMPANY<|>Company_A is a test company) - ## - ("entity"<|>COMPANY_B<|>COMPANY<|>Company_B owns Company_A and also shares an address with Company_A) - ## - ("entity"<|>PERSON_C<|>PERSON<|>Person_C is director of Company_A) - ## - ("relationship"<|>COMPANY_A<|>COMPANY_B<|>Company_A and Company_B are related because Company_A is 100% owned by Company_B and the two companies also share the same address)<|>2) - ## - ("relationship"<|>COMPANY_A<|>PERSON_C<|>Company_A and Person_C are related because Person_C is director of Company_A<|>1)) - """.strip() -] - -DEFAULT_LLM_CONFIG = { - "type": LLMType.StaticResponse, - "responses": MOCK_LLM_RESPONSES, -} diff --git a/graphrag/index/verbs/entities/summarize/__init__.py b/graphrag/index/verbs/entities/summarize/__init__.py deleted file mode 100644 index d7e9a5d9..00000000 --- a/graphrag/index/verbs/entities/summarize/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
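These deletions remove the last of the baked-in mock LLM defaults, so any caller that wants canned responses must now pass them explicitly through the strategy config. A sketch of what such an explicit config looks like, reusing the keys from the deleted `DEFAULT_LLM_CONFIG`/`MOCK_LLM_RESPONSES` constants — the exact test wiring is outside this excerpt:

```python
from graphrag.config.enums import LLMType

# Hypothetical explicit strategy config replacing the deleted module-level
# mock defaults: the static responses travel with the config, not the code.
strategy = {
    "llm": {
        "type": LLMType.StaticResponse,
        "responses": [
            "This is a MOCK response for the LLM. It is summarized!",
        ],
    }
}
```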
-# Licensed under the MIT License - -"""Root package for entity summarization.""" - -from .description_summarize import SummarizeStrategyType, summarize_descriptions - -__all__ = ["SummarizeStrategyType", "summarize_descriptions"] diff --git a/graphrag/index/verbs/entities/summarize/strategies/__init__.py b/graphrag/index/verbs/entities/summarize/strategies/__init__.py deleted file mode 100644 index 28c398e6..00000000 --- a/graphrag/index/verbs/entities/summarize/strategies/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Indexing Engine - Summarization Strategies Package.""" - -from .typing import SummarizationStrategy - -__all__ = ["SummarizationStrategy"] diff --git a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py b/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py deleted file mode 100644 index a98d9406..00000000 --- a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Entity summarization graph intelligence package root.""" - -from .run_graph_intelligence import run - -__all__ = ["run"] diff --git a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/defaults.py b/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/defaults.py deleted file mode 100644 index 8ac42aa1..00000000 --- a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/defaults.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A file containing some default responses.""" - -from graphrag.config.enums import LLMType - -MOCK_LLM_RESPONSES = [ - """ - This is a MOCK response for the LLM. It is summarized! - """.strip() -] - -DEFAULT_LLM_CONFIG = { - "type": LLMType.StaticResponse, - "responses": MOCK_LLM_RESPONSES, -} diff --git a/graphrag/index/verbs/genid.py b/graphrag/index/verbs/genid.py deleted file mode 100644 index 58ab581f..00000000 --- a/graphrag/index/verbs/genid.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing genid method definition.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - -from graphrag.index.utils import gen_md5_hash - - -@verb(name="genid") -def genid( - input: VerbInput, - to: str, - method: str = "md5_hash", - hash: list[str] | None = None, # noqa A002 - **_kwargs: dict, -) -> TableContainer: - """ - Generate a unique id for each row in the tabular data. 
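The `genid` verb being deleted here wrapped `gen_md5_hash` from `graphrag.index.utils`, which remains available to callers that need stable row ids. An inline approximation of the md5 method, for illustration only (the real helper may differ in detail):

```python
import hashlib

import pandas as pd


def gen_md5_hash(row: pd.Series, columns: list[str]) -> str:
    """Inline approximation of graphrag.index.utils.gen_md5_hash."""
    joined = "".join(str(row[column]) for column in columns)
    return hashlib.md5(joined.encode("utf-8")).hexdigest()  # noqa: S324 - id, not crypto


frame = pd.DataFrame({"title": ["doc1", "doc2"], "text": ["alpha", "beta"]})
frame["id"] = frame.apply(lambda row: gen_md5_hash(row, ["title", "text"]), axis=1)
print(frame[["title", "id"]])
```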
- - ## Usage - ### json - ```json - { - "verb": "genid", - "args": { - "to": "id_output_column_name", /* The name of the column to output the id to */ - "method": "md5_hash", /* The method to use to generate the id */ - "hash": ["list", "of", "column", "names"] /* only if using md5_hash */, - "seed": 034324 /* The random seed to use with UUID */ - } - } - ``` - - ### yaml - ```yaml - verb: genid - args: - to: id_output_column_name - method: md5_hash - hash: - - list - - of - - column - - names - seed: 034324 - ``` - """ - data = cast(pd.DataFrame, input.source.table) - - output = genid_df(data, to, method, hash) - - return TableContainer(table=output) - - -def genid_df( - input: pd.DataFrame, - to: str, - method: str = "md5_hash", - hash: list[str] | None = None, # noqa A002 -): - """Generate a unique id for each row in the tabular data.""" - data = input - match method: - case "md5_hash": - if not hash: - msg = 'Must specify the "hash" columns to use md5_hash method' - raise ValueError(msg) - data[to] = data.apply(lambda row: gen_md5_hash(row, hash), axis=1) - case "increment": - data[to] = data.index + 1 - case _: - msg = f"Unknown method {method}" - raise ValueError(msg) - - return data diff --git a/graphrag/index/verbs/graph/__init__.py b/graphrag/index/verbs/graph/__init__.py deleted file mode 100644 index f252a9f3..00000000 --- a/graphrag/index/verbs/graph/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine graph package root.""" - -from .clustering import cluster_graph -from .compute_edge_combined_degree import compute_edge_combined_degree -from .create import DEFAULT_EDGE_ATTRIBUTES, DEFAULT_NODE_ATTRIBUTES, create_graph -from .layout import layout_graph -from .merge import merge_graphs -from .report import ( - create_community_reports, - prepare_community_reports, - prepare_community_reports_claims, - prepare_community_reports_edges, - restore_community_hierarchy, -) -from .unpack import unpack_graph - -__all__ = [ - "DEFAULT_EDGE_ATTRIBUTES", - "DEFAULT_NODE_ATTRIBUTES", - "cluster_graph", - "compute_edge_combined_degree", - "create_community_reports", - "create_graph", - "layout_graph", - "merge_graphs", - "prepare_community_reports", - "prepare_community_reports_claims", - "prepare_community_reports_edges", - "restore_community_hierarchy", - "unpack_graph", -] diff --git a/graphrag/index/verbs/graph/clustering/__init__.py b/graphrag/index/verbs/graph/clustering/__init__.py deleted file mode 100644 index a5db89bb..00000000 --- a/graphrag/index/verbs/graph/clustering/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine graph clustering package root.""" - -from .cluster_graph import GraphCommunityStrategyType, cluster_graph - -__all__ = ["GraphCommunityStrategyType", "cluster_graph"] diff --git a/graphrag/index/verbs/graph/clustering/strategies/__init__.py b/graphrag/index/verbs/graph/clustering/strategies/__init__.py deleted file mode 100644 index 16a03f12..00000000 --- a/graphrag/index/verbs/graph/clustering/strategies/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
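For reference, the `md5_hash` path of the deleted `genid` verb reduces to hashing a few columns per row. A standalone pandas sketch; the serialization (joining the selected column values in order) mirrors what `gen_md5_hash` in `graphrag.index.utils` does, but treat that detail as an assumption here:

```python
import hashlib

import pandas as pd


def md5_row_id(row: pd.Series, hash_columns: list[str]) -> str:
    # Assumption: gen_md5_hash concatenates the chosen column values before hashing.
    joined = "".join(str(row[col]) for col in hash_columns)
    return hashlib.md5(joined.encode("utf-8")).hexdigest()  # noqa: S324 (non-crypto id)


df = pd.DataFrame({"title": ["doc a", "doc b"], "text": ["x", "y"]})
df["id"] = df.apply(lambda row: md5_row_id(row, ["title", "text"]), axis=1)
print(df[["title", "id"]])
```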
-# Licensed under the MIT License - -"""Graph Clustering Strategies.""" diff --git a/graphrag/index/verbs/graph/clustering/strategies/leiden.py b/graphrag/index/verbs/graph/clustering/strategies/leiden.py deleted file mode 100644 index ffc36880..00000000 --- a/graphrag/index/verbs/graph/clustering/strategies/leiden.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing run and _compute_leiden_communities methods definitions.""" - -import logging -from typing import Any - -import networkx as nx -from graspologic.partition import hierarchical_leiden - -from graphrag.index.graph.utils import stable_largest_connected_component - -log = logging.getLogger(__name__) - - -def run(graph: nx.Graph, args: dict[str, Any]) -> dict[int, dict[str, list[str]]]: - """Run method definition.""" - max_cluster_size = args.get("max_cluster_size", 10) - use_lcc = args.get("use_lcc", True) - if args.get("verbose", False): - log.info( - "Running leiden with max_cluster_size=%s, lcc=%s", max_cluster_size, use_lcc - ) - - node_id_to_community_map = _compute_leiden_communities( - graph=graph, - max_cluster_size=max_cluster_size, - use_lcc=use_lcc, - seed=args.get("seed", 0xDEADBEEF), - ) - levels = args.get("levels") - - # If they don't pass in levels, use them all - if levels is None: - levels = sorted(node_id_to_community_map.keys()) - - results_by_level: dict[int, dict[str, list[str]]] = {} - for level in levels: - result = {} - results_by_level[level] = result - for node_id, raw_community_id in node_id_to_community_map[level].items(): - community_id = str(raw_community_id) - if community_id not in result: - result[community_id] = [] - result[community_id].append(node_id) - return results_by_level - - -# Taken from graph_intelligence & adapted -def _compute_leiden_communities( - graph: nx.Graph | nx.DiGraph, - max_cluster_size: int, - use_lcc: bool, - seed=0xDEADBEEF, -) -> dict[int, dict[str, int]]: - """Return Leiden root communities.""" - if use_lcc: - graph = stable_largest_connected_component(graph) - - community_mapping = hierarchical_leiden( - graph, max_cluster_size=max_cluster_size, random_seed=seed - ) - results: dict[int, dict[str, int]] = {} - for partition in community_mapping: - results[partition.level] = results.get(partition.level, {}) - results[partition.level][partition.node] = partition.cluster - - return results diff --git a/graphrag/index/verbs/graph/clustering/typing.py b/graphrag/index/verbs/graph/clustering/typing.py deleted file mode 100644 index 4d6fc7e6..00000000 --- a/graphrag/index/verbs/graph/clustering/typing.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing Communities list definition.""" - -Communities = list[tuple[int, str, list[str]]] diff --git a/graphrag/index/verbs/graph/compute_edge_combined_degree.py b/graphrag/index/verbs/graph/compute_edge_combined_degree.py deleted file mode 100644 index 59101376..00000000 --- a/graphrag/index/verbs/graph/compute_edge_combined_degree.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
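The Leiden strategy above is a thin wrapper over graspologic. A runnable sketch on a toy graph that produces the same `level -> community -> nodes` shape as the deleted `run` method (assumes `graspologic` and `networkx` are installed):

```python
import networkx as nx
from graspologic.partition import hierarchical_leiden

graph = nx.karate_club_graph()
# Same call the deleted strategy made; a fixed seed keeps the partition reproducible.
partitions = hierarchical_leiden(graph, max_cluster_size=10, random_seed=0xDEADBEEF)

results_by_level: dict[int, dict[str, list[str]]] = {}
for partition in partitions:
    communities = results_by_level.setdefault(partition.level, {})
    communities.setdefault(str(partition.cluster), []).append(str(partition.node))

for level, communities in sorted(results_by_level.items()):
    print(level, {cid: len(nodes) for cid, nodes in communities.items()})
```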
-# Licensed under the MIT License - -"""A module containing create_graph, _get_node_attributes, _get_edge_attributes and _get_attribute_column_mapping methods definition.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - -from graphrag.index.utils.ds_util import get_required_input_table - - -@verb(name="compute_edge_combined_degree") -def compute_edge_combined_degree( - input: VerbInput, - to: str = "rank", - node_name_column: str = "title", - node_degree_column: str = "degree", - edge_source_column: str = "source", - edge_target_column: str = "target", - **_kwargs, -) -> TableContainer: - """ - Compute the combined degree for each edge in a graph. - - Inputs Tables: - - input: The edge table - - nodes: The nodes table. - - Args: - - to: The name of the column to output the combined degree to. Default="rank" - """ - edge_df: pd.DataFrame = cast(pd.DataFrame, input.get_input()) - node_degree_df = _get_node_degree_table(input, node_name_column, node_degree_column) - - output_df = compute_edge_combined_degree_df( - edge_df, - node_degree_df, - to, - node_name_column, - node_degree_column, - edge_source_column, - edge_target_column, - ) - - return TableContainer(table=output_df) - - -def compute_edge_combined_degree_df( - edge_df: pd.DataFrame, - node_degree_df: pd.DataFrame, - to: str, - node_name_column: str, - node_degree_column: str, - edge_source_column: str, - edge_target_column: str, -) -> pd.DataFrame: - """Compute the combined degree for each edge in a graph.""" - if to in edge_df.columns: - return edge_df - - def join_to_degree(df: pd.DataFrame, column: str) -> pd.DataFrame: - degree_column = _degree_colname(column) - result = df.merge( - node_degree_df.rename( - columns={node_name_column: column, node_degree_column: degree_column} - ), - on=column, - how="left", - ) - result[degree_column] = result[degree_column].fillna(0) - return result - - output_df = join_to_degree(edge_df, edge_source_column) - output_df = join_to_degree(output_df, edge_target_column) - output_df[to] = ( - output_df[_degree_colname(edge_source_column)] - + output_df[_degree_colname(edge_target_column)] - ) - return output_df - - -def _degree_colname(column: str) -> str: - return f"{column}_degree" - - -def _get_node_degree_table( - input: VerbInput, node_name_column: str, node_degree_column: str -) -> pd.DataFrame: - nodes_container = get_required_input_table(input, "nodes") - nodes = cast(pd.DataFrame, nodes_container.table) - return cast(pd.DataFrame, nodes[[node_name_column, node_degree_column]]) diff --git a/graphrag/index/verbs/graph/create.py b/graphrag/index/verbs/graph/create.py deleted file mode 100644 index eaf06284..00000000 --- a/graphrag/index/verbs/graph/create.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
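Stripped of the verb plumbing, the combined degree is two left-joins against the node-degree table plus a sum. A self-contained pandas sketch using the verb's default column names:

```python
import pandas as pd

nodes = pd.DataFrame({"title": ["a", "b", "c"], "degree": [2, 1, 1]})
edges = pd.DataFrame({"source": ["a", "a"], "target": ["b", "c"]})


def join_degree(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Attach the degree of the node referenced by `column` as `<column>_degree`."""
    degrees = nodes.rename(columns={"title": column, "degree": f"{column}_degree"})
    joined = df.merge(degrees, on=column, how="left")
    joined[f"{column}_degree"] = joined[f"{column}_degree"].fillna(0)
    return joined


edges = join_degree(join_degree(edges, "source"), "target")
edges["rank"] = edges["source_degree"] + edges["target_degree"]
print(edges[["source", "target", "rank"]])  # both edges rank 3 (2 + 1)
```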
-# Licensed under the MIT License - -"""A module containing create_graph, _get_node_attributes, _get_edge_attributes and _get_attribute_column_mapping methods definition.""" - -from typing import Any - -import networkx as nx -import pandas as pd -from datashaper import TableContainer, VerbCallbacks, VerbInput, progress_iterable, verb - -from graphrag.index.utils import clean_str - -DEFAULT_NODE_ATTRIBUTES = ["label", "type", "id", "name", "description", "community"] -DEFAULT_EDGE_ATTRIBUTES = ["label", "type", "name", "source", "target"] - - -@verb(name="create_graph") -def create_graph( - input: VerbInput, - callbacks: VerbCallbacks, - to: str, - type: str, # noqa A002 - graph_type: str = "undirected", - **kwargs, -) -> TableContainer: - """ - Create a graph from a dataframe. The verb outputs a new column containing the graph. - - > Note: This will roll up all rows into a single graph. - - ## Usage - ```yaml - verb: create_graph - args: - type: node # The type of graph to create, one of: node, edge - to: <column_name> # The name of the column to output the graph to, this will be a graphml graph - attributes: # The attributes for the nodes / edges - # If using the node type, the following attributes are required: - id: <id_column_name> - - # If using the edge type, the following attributes are required: - source: <source_column_name> - target: <target_column_name> - - # Other attributes can be added as follows: - <attribute_name>: <column_name> - ... for each attribute - ``` - """ - if type != "node" and type != "edge": - msg = f"Unknown type {type}" - raise ValueError(msg) - - input_df = input.get_input() - num_total = len(input_df) - out_graph: nx.Graph = _create_nx_graph(graph_type) - - in_attributes = ( - _get_node_attributes(kwargs) if type == "node" else _get_edge_attributes(kwargs) - ) - - # At this point, _get_node_attributes and _get_edge_attributes have already validated - id_col = in_attributes.get( - "id", in_attributes.get("label", in_attributes.get("name", None)) - ) - source_col = in_attributes.get("source", None) - target_col = in_attributes.get("target", None) - - for _, row in progress_iterable(input_df.iterrows(), callbacks.progress, num_total): - item_attributes = { - clean_str(key): _clean_value(row[value]) - for key, value in in_attributes.items() - if value in row - } - if type == "node": - id = clean_str(row[id_col]) - out_graph.add_node(id, **item_attributes) - elif type == "edge": - source = clean_str(row[source_col]) - target = clean_str(row[target_col]) - out_graph.add_edge(source, target, **item_attributes) - - graphml_string = "".join(nx.generate_graphml(out_graph)) - output_df = pd.DataFrame([{to: graphml_string}]) - return TableContainer(table=output_df) - - -def _clean_value(value: Any) -> str: - if value is None: - return "" - if isinstance(value, str): - return clean_str(value) - - msg = f"Value must be a string or None, got {type(value)}" - raise TypeError(msg) - - -def _get_node_attributes(args: dict[str, Any]) -> dict[str, Any]: - mapping = _get_attribute_column_mapping( - args.get("attributes", DEFAULT_NODE_ATTRIBUTES) - ) - if "id" not in mapping and "label" not in mapping and "name" not in mapping: - msg = "You must specify an id, label, or name column in the node attributes" - raise ValueError(msg) - return mapping - - -def _get_edge_attributes(args: dict[str, Any]) -> dict[str, Any]: - mapping = _get_attribute_column_mapping( - args.get("attributes", DEFAULT_EDGE_ATTRIBUTES) - ) - if "source" not in mapping or "target" not in mapping: - msg = "You must specify a source and target column in the edge attributes" - raise ValueError(msg) - return
mapping - - -def _get_attribute_column_mapping( - in_attributes: dict[str, Any] | list[str], -) -> dict[str, str]: - # Its already a attribute: column dict - if isinstance(in_attributes, dict): - return { - **in_attributes, - } - - return {attrib: attrib for attrib in in_attributes} - - -def _create_nx_graph(graph_type: str) -> nx.Graph: - if graph_type == "directed": - return nx.DiGraph() - - return nx.Graph() diff --git a/graphrag/index/verbs/graph/merge/defaults.py b/graphrag/index/verbs/graph/merge/defaults.py deleted file mode 100644 index 80c60331..00000000 --- a/graphrag/index/verbs/graph/merge/defaults.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A file containing DEFAULT_NODE_OPERATIONS, DEFAULT_EDGE_OPERATIONS and DEFAULT_CONCAT_SEPARATOR values definition.""" - -from .typing import BasicMergeOperation - -DEFAULT_NODE_OPERATIONS = { - "*": { - "operation": BasicMergeOperation.Replace, - } -} - -DEFAULT_EDGE_OPERATIONS = { - "*": { - "operation": BasicMergeOperation.Replace, - }, - "weight": "sum", -} - -DEFAULT_CONCAT_SEPARATOR = "," diff --git a/graphrag/index/verbs/graph/report/__init__.py b/graphrag/index/verbs/graph/report/__init__.py deleted file mode 100644 index e47d9cce..00000000 --- a/graphrag/index/verbs/graph/report/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine graph report package root.""" - -from .create_community_reports import ( - CreateCommunityReportsStrategyType, - create_community_reports, -) -from .prepare_community_reports import prepare_community_reports -from .prepare_community_reports_claims import prepare_community_reports_claims -from .prepare_community_reports_edges import prepare_community_reports_edges -from .prepare_community_reports_nodes import prepare_community_reports_nodes -from .restore_community_hierarchy import restore_community_hierarchy - -__all__ = [ - "CreateCommunityReportsStrategyType", - "create_community_reports", - "create_community_reports", - "prepare_community_reports", - "prepare_community_reports_claims", - "prepare_community_reports_edges", - "prepare_community_reports_nodes", - "restore_community_hierarchy", -] diff --git a/graphrag/index/verbs/graph/report/prepare_community_reports_claims.py b/graphrag/index/verbs/graph/report/prepare_community_reports_claims.py deleted file mode 100644 index aa9a7907..00000000 --- a/graphrag/index/verbs/graph/report/prepare_community_reports_claims.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
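The core of the deleted create_graph verb, minus validation and callbacks: roll a dataframe of edge rows up into one networkx graph and serialize it to a GraphML string. A minimal sketch with toy data (undirected, edge mode):

```python
import networkx as nx
import pandas as pd

edges = pd.DataFrame({
    "source": ["COMPANY_A", "COMPANY_A"],
    "target": ["COMPANY_B", "PERSON_C"],
    "weight": [2, 1],
})

graph = nx.Graph()
for _, row in edges.iterrows():
    # Columns beyond source/target become edge attributes, as in the verb's mapping.
    graph.add_edge(row["source"], row["target"], weight=row["weight"])

graphml_string = "".join(nx.generate_graphml(graph))
print(graphml_string[:72], "...")
```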
-# Licensed under the MIT License - -"""A module containing create_graph, _get_node_attributes, _get_edge_attributes and _get_attribute_column_mapping methods definition.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - -from graphrag.index.graph.extractors.community_reports.schemas import ( - CLAIM_DESCRIPTION, - CLAIM_DETAILS, - CLAIM_ID, - CLAIM_STATUS, - CLAIM_SUBJECT, - CLAIM_TYPE, -) - -_MISSING_DESCRIPTION = "No Description" - - -@verb(name="prepare_community_reports_claims") -def prepare_community_reports_claims( - input: VerbInput, - to: str = CLAIM_DETAILS, - id_column: str = CLAIM_ID, - description_column: str = CLAIM_DESCRIPTION, - subject_column: str = CLAIM_SUBJECT, - type_column: str = CLAIM_TYPE, - status_column: str = CLAIM_STATUS, - **_kwargs, -) -> TableContainer: - """Merge claim details into an object.""" - claim_df: pd.DataFrame = cast(pd.DataFrame, input.get_input()) - claim_df = claim_df.fillna(value={description_column: _MISSING_DESCRIPTION}) - - # merge values of five columns into a map column - claim_df[to] = claim_df.apply( - lambda x: { - id_column: x[id_column], - subject_column: x[subject_column], - type_column: x[type_column], - status_column: x[status_column], - description_column: x[description_column], - }, - axis=1, - ) - - return TableContainer(table=claim_df) diff --git a/graphrag/index/verbs/graph/report/prepare_community_reports_edges.py b/graphrag/index/verbs/graph/report/prepare_community_reports_edges.py deleted file mode 100644 index b568aba0..00000000 --- a/graphrag/index/verbs/graph/report/prepare_community_reports_edges.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing create_graph, _get_node_attributes, _get_edge_attributes and _get_attribute_column_mapping methods definition.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - -from graphrag.index.graph.extractors.community_reports.schemas import ( - EDGE_DEGREE, - EDGE_DESCRIPTION, - EDGE_DETAILS, - EDGE_ID, - EDGE_SOURCE, - EDGE_TARGET, -) - -_MISSING_DESCRIPTION = "No Description" - - -@verb(name="prepare_community_reports_edges") -def prepare_community_reports_edges( - input: VerbInput, - to: str = EDGE_DETAILS, - id_column: str = EDGE_ID, - source_column: str = EDGE_SOURCE, - target_column: str = EDGE_TARGET, - description_column: str = EDGE_DESCRIPTION, - degree_column: str = EDGE_DEGREE, - **_kwargs, -) -> TableContainer: - """Merge edge details into an object.""" - edge_df: pd.DataFrame = cast(pd.DataFrame, input.get_input()).fillna( - value={description_column: _MISSING_DESCRIPTION} - ) - edge_df[to] = edge_df.apply( - lambda x: { - id_column: x[id_column], - source_column: x[source_column], - target_column: x[target_column], - description_column: x[description_column], - degree_column: x[degree_column], - }, - axis=1, - ) - return TableContainer(table=edge_df) diff --git a/graphrag/index/verbs/graph/report/prepare_community_reports_nodes.py b/graphrag/index/verbs/graph/report/prepare_community_reports_nodes.py deleted file mode 100644 index f159c125..00000000 --- a/graphrag/index/verbs/graph/report/prepare_community_reports_nodes.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing create_graph, _get_node_attributes, _get_edge_attributes and _get_attribute_column_mapping methods definition.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - -from graphrag.index.graph.extractors.community_reports.schemas import ( - NODE_DEGREE, - NODE_DESCRIPTION, - NODE_DETAILS, - NODE_ID, - NODE_NAME, -) - -_MISSING_DESCRIPTION = "No Description" - - -@verb(name="prepare_community_reports_nodes") -def prepare_community_reports_nodes( - input: VerbInput, - to: str = NODE_DETAILS, - id_column: str = NODE_ID, - name_column: str = NODE_NAME, - description_column: str = NODE_DESCRIPTION, - degree_column: str = NODE_DEGREE, - **_kwargs, -) -> TableContainer: - """Merge edge details into an object.""" - node_df = cast(pd.DataFrame, input.get_input()) - node_df = node_df.fillna(value={description_column: _MISSING_DESCRIPTION}) - - # merge values of four columns into a map column - node_df[to] = node_df.apply( - lambda x: { - id_column: x[id_column], - name_column: x[name_column], - description_column: x[description_column], - degree_column: x[degree_column], - }, - axis=1, - ) - return TableContainer(table=node_df) diff --git a/graphrag/index/verbs/graph/report/strategies/__init__.py b/graphrag/index/verbs/graph/report/strategies/__init__.py deleted file mode 100644 index 87d1f9e2..00000000 --- a/graphrag/index/verbs/graph/report/strategies/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine graph report strategies package root.""" diff --git a/graphrag/index/verbs/graph/report/strategies/graph_intelligence/__init__.py b/graphrag/index/verbs/graph/report/strategies/graph_intelligence/__init__.py deleted file mode 100644 index 7f51d790..00000000 --- a/graphrag/index/verbs/graph/report/strategies/graph_intelligence/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine graph report strategies graph intelligence package root.""" - -from .run_graph_intelligence import run - -__all__ = ["run"] diff --git a/graphrag/index/verbs/graph/report/strategies/graph_intelligence/defaults.py b/graphrag/index/verbs/graph/report/strategies/graph_intelligence/defaults.py deleted file mode 100644 index c184fb8e..00000000 --- a/graphrag/index/verbs/graph/report/strategies/graph_intelligence/defaults.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
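All three prepare_community_reports_* verbs share one move: fill missing descriptions, then fold a fixed set of columns into a single dict-valued details column. A pandas sketch of the pattern (plain column names stand in for the schema constants such as NODE_ID and NODE_NAME):

```python
import pandas as pd

nodes = pd.DataFrame({
    "id": [0, 1],
    "name": ["COMPANY_A", "PERSON_C"],
    "description": ["A test company", None],
    "degree": [2, 1],
})

nodes = nodes.fillna(value={"description": "No Description"})
detail_columns = ["id", "name", "description", "degree"]
nodes["node_details"] = nodes.apply(
    lambda row: {col: row[col] for col in detail_columns}, axis=1
)
print(nodes["node_details"].iloc[1])
# {'id': 1, 'name': 'PERSON_C', 'description': 'No Description', 'degree': 1}
```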
-# Licensed under the MIT License - -"""A file containing DEFAULT_CHUNK_SIZE and MOCK_RESPONSES definitions.""" - -import json - -DEFAULT_CHUNK_SIZE = 3000 - -MOCK_RESPONSES = [ - json.dumps({ - "title": "<report_title>", - "summary": "<executive_summary>", - "rating": 2, - "rating_explanation": "<rating_explanation>", - "findings": [ - { - "summary": "<insight_1_summary>", - "explanation": "<insight_1_explanation>", - }, - { - "summary": "<insight_2_summary>", - "explanation": "<insight_2_explanation>", - }, - ], - }) -] diff --git a/graphrag/index/verbs/overrides/__init__.py b/graphrag/index/verbs/overrides/__init__.py deleted file mode 100644 --- a/graphrag/index/verbs/overrides/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine overrides package root.""" - -from .aggregate import aggregate -from .concat import concat - -__all__ = ["aggregate", "concat"] diff --git a/graphrag/index/verbs/overrides/aggregate.py b/graphrag/index/verbs/overrides/aggregate.py deleted file mode 100644 --- a/graphrag/index/verbs/overrides/aggregate.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing aggregate method definition.""" - -from dataclasses import dataclass -from typing import Any, cast - -import pandas as pd -from datashaper import ( - FieldAggregateOperation, - Progress, - Table, - TableContainer, - VerbCallbacks, - VerbInput, - aggregate_operation_mapping, - verb, -) - - -@verb(name="aggregate_override") -def aggregate( - input: VerbInput, - callbacks: VerbCallbacks, - aggregations: list[dict[str, Any]], - groupby: list[str] | None = None, - **_kwargs: dict, -) -> TableContainer: - """Aggregate method definition.""" - input_table = input.get_input() - callbacks.progress(Progress(percent=0)) - - output = aggregate_df(input_table, aggregations, groupby) - - callbacks.progress(Progress(percent=1)) - - return TableContainer(table=output) - - -def aggregate_df( - input_table: Table, - aggregations: list[dict[str, Any]], - groupby: list[str] | None = None, -) -> pd.DataFrame: - """Aggregate method definition.""" - aggregations_to_apply = _load_aggregations(aggregations) - df_aggregations = { - agg.column: _get_pandas_agg_operation(agg) - for agg in aggregations_to_apply.values() - } - if groupby is None: - output_grouped = input_table.groupby(lambda _x: True) - else: - output_grouped = input_table.groupby(groupby, sort=False) - output = cast(pd.DataFrame, output_grouped.agg(df_aggregations)) - output.rename( - columns={agg.column: agg.to for agg in aggregations_to_apply.values()}, - inplace=True, - ) - output.columns = [agg.to for agg in aggregations_to_apply.values()] - return output.reset_index() - - -@dataclass -class Aggregation: - """Aggregation class method definition.""" - - column: str | None - operation: str - to: str - - # Only useful for the concat operation - separator: str | None = None - - -def _get_pandas_agg_operation(agg: Aggregation) -> Any: - # TODO: Merge into datashaper - if agg.operation == "string_concat": - return (agg.separator or ",").join - return aggregate_operation_mapping[FieldAggregateOperation(agg.operation)] - - -def _load_aggregations( - aggregations: list[dict[str, Any]], -) -> dict[str, Aggregation]: - return { - aggregation["column"]: Aggregation( - aggregation["column"], aggregation["operation"], aggregation["to"] - ) - for aggregation in aggregations - } diff --git a/graphrag/index/verbs/overrides/concat.py b/graphrag/index/verbs/overrides/concat.py deleted file mode 100644 index 7a0f0e2c..00000000 --- a/graphrag/index/verbs/overrides/concat.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing concat method definition.""" - -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - - -@verb(name="concat_override") -def concat( - input: VerbInput, - columnwise: bool = False, - **_kwargs: dict, -) -> TableContainer: - """Concat method definition.""" - input_table = cast(pd.DataFrame, input.get_input()) - others = cast(list[pd.DataFrame], input.get_others()) - if columnwise: - output = pd.concat([input_table, *others], axis=1) - else: - output = pd.concat([input_table, *others], ignore_index=True) - return TableContainer(table=output) diff --git a/graphrag/index/verbs/spread_json.py b/graphrag/index/verbs/spread_json.py deleted file mode 100644 index 38656e12..00000000 --- a/graphrag/index/verbs/spread_json.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation.
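The override exists mainly to bolt string_concat onto datashaper's aggregate mapping; in _get_pandas_agg_operation above, that case maps straight to str.join. The same behavior in plain pandas, without the verb machinery:

```python
import pandas as pd

df = pd.DataFrame({
    "community": [1, 1, 2],
    "weight": [1.0, 2.0, 5.0],
    "title": ["a", "b", "c"],
})

out = (
    df.groupby("community", sort=False)
    .agg(
        weight=("weight", "sum"),
        titles=("title", ",".join),  # the string_concat special case
    )
    .reset_index()
)
print(out)  # community 1 -> weight 3.0, titles "a,b"; community 2 -> 5.0, "c"
```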
-# Licensed under the MIT License - -"""A module containing spread_json method definition.""" - -import logging - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - -from graphrag.index.utils import is_null - -# TODO: Check if this is already a thing -DEFAULT_COPY = ["level"] - - -@verb(name="spread_json") -def spread_json( - input: VerbInput, - column: str, - copy: list[str] | None = None, - **_kwargs: dict, -) -> TableContainer: - """ - Unpack a column containing a tuple into multiple columns. - - id|json|b - 1|{"x":5,"y":6}|b - - is converted to - - id|x|y|b - -------- - 1|5|6|b - """ - if copy is None: - copy = DEFAULT_COPY - data = input.get_input() - - results = [] - for _, row in data.iterrows(): - try: - cleaned_row = {col: row[col] for col in copy} - rest_row = row[column] if row[column] is not None else {} - - if is_null(rest_row): - rest_row = {} - - results.append({**cleaned_row, **rest_row}) # type: ignore - except Exception: - logging.exception("Error spreading row: %s", row) - raise - data = pd.DataFrame(results, index=data.index) - - return TableContainer(table=data) diff --git a/graphrag/index/verbs/text/__init__.py b/graphrag/index/verbs/text/__init__.py deleted file mode 100644 index bd7ddc16..00000000 --- a/graphrag/index/verbs/text/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine text package root.""" - -from .chunk.text_chunk import chunk -from .replace import replace -from .split import text_split -from .translate import text_translate - -__all__ = [ - "chunk", - "replace", - "text_split", - "text_translate", -] diff --git a/graphrag/index/verbs/text/chunk/__init__.py b/graphrag/index/verbs/text/chunk/__init__.py deleted file mode 100644 index 4e2a7729..00000000 --- a/graphrag/index/verbs/text/chunk/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine text chunk package root.""" - -from .text_chunk import ChunkStrategy, ChunkStrategyType, chunk - -__all__ = ["ChunkStrategy", "ChunkStrategyType", "chunk"] diff --git a/graphrag/index/verbs/text/chunk/strategies/__init__.py b/graphrag/index/verbs/text/chunk/strategies/__init__.py deleted file mode 100644 index 0f15fcb2..00000000 --- a/graphrag/index/verbs/text/chunk/strategies/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine text chunk strategies package root.""" diff --git a/graphrag/index/verbs/text/chunk/strategies/sentence.py b/graphrag/index/verbs/text/chunk/strategies/sentence.py deleted file mode 100644 index 687def1d..00000000 --- a/graphrag/index/verbs/text/chunk/strategies/sentence.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing run method definition.""" - -from collections.abc import Iterable -from typing import Any - -import nltk -from datashaper import ProgressTicker - -from .typing import TextChunk - - -def run( - input: list[str], _args: dict[str, Any], tick: ProgressTicker -) -> Iterable[TextChunk]: - """Chunks text into multiple parts. 
A pipeline verb.""" - for doc_idx, text in enumerate(input): - sentences = nltk.sent_tokenize(text) - for sentence in sentences: - yield TextChunk( - text_chunk=sentence, - source_doc_indices=[doc_idx], - ) - tick(1) diff --git a/graphrag/index/verbs/text/chunk/strategies/typing.py b/graphrag/index/verbs/text/chunk/strategies/typing.py deleted file mode 100644 index b4e833c8..00000000 --- a/graphrag/index/verbs/text/chunk/strategies/typing.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing ChunkStrategy definition.""" - -from collections.abc import Callable, Iterable -from typing import Any - -from datashaper import ProgressTicker - -from graphrag.index.verbs.text.chunk.typing import TextChunk - -# Given a list of document texts, return a list of tuples of (source_doc_indices, text_chunk) - -ChunkStrategy = Callable[ - [list[str], dict[str, Any], ProgressTicker], Iterable[TextChunk] -] diff --git a/graphrag/index/verbs/text/replace/__init__.py b/graphrag/index/verbs/text/replace/__init__.py deleted file mode 100644 index f863415f..00000000 --- a/graphrag/index/verbs/text/replace/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine text replace package root.""" - -from .replace import text_replace - -__all__ = ["text_replace"] diff --git a/graphrag/index/verbs/text/replace/replace.py b/graphrag/index/verbs/text/replace/replace.py deleted file mode 100644 index 386fac34..00000000 --- a/graphrag/index/verbs/text/replace/replace.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing replace and _apply_replacements methods.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - -from .typing import Replacement - - -@verb(name="text_replace") -def text_replace( - input: VerbInput, - column: str, - to: str, - replacements: list[dict[str, str]], - **_kwargs: dict, -) -> TableContainer: - """ - Apply a set of replacements to a piece of text. - - ## Usage - ```yaml - verb: text_replace - args: - column: # The name of the column containing the text to replace - to: # The name of the column to write the replaced text to - replacements: # A list of replacements to apply - - pattern: # The regex pattern to find - replacement: # The string to replace with - ``` - """ - output = cast(pd.DataFrame, input.get_input()) - parsed_replacements = [Replacement(**r) for r in replacements] - output[to] = output[column].apply( - lambda text: _apply_replacements(text, parsed_replacements) - ) - return TableContainer(table=output) - - -def _apply_replacements(text: str, replacements: list[Replacement]) -> str: - for r in replacements: - text = text.replace(r.pattern, r.replacement) - return text diff --git a/graphrag/index/verbs/text/replace/typing.py b/graphrag/index/verbs/text/replace/typing.py deleted file mode 100644 index 45beef9f..00000000 --- a/graphrag/index/verbs/text/replace/typing.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
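The sentence strategy is a thin loop over nltk's sentence tokenizer, emitting one chunk per sentence along with its source document index. A standalone sketch (nltk must be installed; newer nltk releases fetch the tokenizer model as "punkt_tab" rather than "punkt"):

```python
from collections.abc import Iterable

import nltk

nltk.download("punkt", quiet=True)  # tokenizer model; required once per environment


def sentence_chunks(texts: list[str]) -> Iterable[tuple[str, list[int]]]:
    """Yield (sentence, source_doc_indices) pairs, one per sentence."""
    for doc_idx, text in enumerate(texts):
        for sentence in nltk.sent_tokenize(text):
            yield sentence, [doc_idx]


docs = ["First sentence. Second sentence.", "Another document."]
for chunk, sources in sentence_chunks(docs):
    print(sources, chunk)
```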
-# Licensed under the MIT License - -"""A module containing 'Replacement' model.""" - -from dataclasses import dataclass - - -@dataclass -class Replacement: - """Replacement class definition.""" - - pattern: str - replacement: str diff --git a/graphrag/index/verbs/text/split.py b/graphrag/index/verbs/text/split.py deleted file mode 100644 index b1339ff4..00000000 --- a/graphrag/index/verbs/text/split.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing the text_split method definition.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - - -@verb(name="text_split") -def text_split( - input: VerbInput, - column: str, - to: str, - separator: str = ",", - **_kwargs: dict, -) -> TableContainer: - """ - Split a piece of text into a list of strings based on a delimiter. The verb outputs a new column containing a list of strings. - - ## Usage - - ```yaml - verb: text_split - args: - column: text # The name of the column containing the text to split - to: split_text # The name of the column to output the split text to - separator: "," # The separator to split the text on, defaults to "," - ``` - """ - output = text_split_df(cast(pd.DataFrame, input.get_input()), column, to, separator) - return TableContainer(table=output) - - -def text_split_df( - input: pd.DataFrame, column: str, to: str, separator: str = "," -) -> pd.DataFrame: - """Split a column into a list of strings.""" - output = input - - def _apply_split(row): - if row[column] is None or isinstance(row[column], list): - return row[column] - if row[column] == "": - return [] - if not isinstance(row[column], str): - message = f"Expected {column} to be a string, but got {type(row[column])}" - raise TypeError(message) - return row[column].split(separator) - - output[to] = output.apply(_apply_split, axis=1) - return output diff --git a/graphrag/index/verbs/text/translate/__init__.py b/graphrag/index/verbs/text/translate/__init__.py deleted file mode 100644 index ad830dfa..00000000 --- a/graphrag/index/verbs/text/translate/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine text translate package root.""" - -from .text_translate import text_translate - -__all__ = ["text_translate"] diff --git a/graphrag/index/verbs/text/translate/strategies/__init__.py b/graphrag/index/verbs/text/translate/strategies/__init__.py deleted file mode 100644 index d418bbae..00000000 --- a/graphrag/index/verbs/text/translate/strategies/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine translate strategies package root.""" - -from .mock import run as run_mock -from .openai import run as run_openai - -__all__ = ["run_mock", "run_openai"] diff --git a/graphrag/index/verbs/text/translate/strategies/defaults.py b/graphrag/index/verbs/text/translate/strategies/defaults.py deleted file mode 100644 index 003e00eb..00000000 --- a/graphrag/index/verbs/text/translate/strategies/defaults.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A file containing TRANSLATION_PROMPT value definition.""" - -TRANSLATION_PROMPT = """ - You are a helpful assistant. Translate into {language} the following text, and make sure all of the text is in {language}. 
- """.strip() diff --git a/graphrag/index/verbs/text/translate/strategies/mock.py b/graphrag/index/verbs/text/translate/strategies/mock.py deleted file mode 100644 index 58a5a999..00000000 --- a/graphrag/index/verbs/text/translate/strategies/mock.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing run and _summarize_text methods definitions.""" - -from typing import Any - -from datashaper import VerbCallbacks - -from graphrag.index.cache import PipelineCache - -from .typing import TextTranslationResult - - -async def run( # noqa RUF029 async is required for interface - input: str | list[str], - _args: dict[str, Any], - _reporter: VerbCallbacks, - _cache: PipelineCache, -) -> TextTranslationResult: - """Run the Claim extraction chain.""" - input = [input] if isinstance(input, str) else input - return TextTranslationResult(translations=[_translate_text(text) for text in input]) - - -def _translate_text(text: str) -> str: - """Translate a single piece of text.""" - return f"{text} translated" diff --git a/graphrag/index/verbs/text/translate/strategies/openai.py b/graphrag/index/verbs/text/translate/strategies/openai.py deleted file mode 100644 index 49c47b34..00000000 --- a/graphrag/index/verbs/text/translate/strategies/openai.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing run, _translate_text and _create_translation_prompt methods definition.""" - -import logging -import traceback -from typing import Any - -from datashaper import VerbCallbacks - -import graphrag.config.defaults as defs -from graphrag.config.enums import LLMType -from graphrag.index.cache import PipelineCache -from graphrag.index.llm import load_llm -from graphrag.index.text_splitting import TokenTextSplitter -from graphrag.llm import CompletionLLM - -from .defaults import TRANSLATION_PROMPT as DEFAULT_TRANSLATION_PROMPT -from .typing import TextTranslationResult - -log = logging.getLogger(__name__) - - -async def run( - input: str | list[str], - args: dict[str, Any], - callbacks: VerbCallbacks, - pipeline_cache: PipelineCache, -) -> TextTranslationResult: - """Run the Claim extraction chain.""" - llm_config = args.get("llm", {"type": LLMType.StaticResponse}) - llm_type = llm_config.get("type", LLMType.StaticResponse) - llm = load_llm( - "text_translation", - llm_type, - callbacks, - pipeline_cache, - llm_config, - chat_only=True, - ) - language = args.get("language", "English") - prompt = args.get("prompt") - chunk_size = args.get("chunk_size", defs.CHUNK_SIZE) - chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP) - - input = [input] if isinstance(input, str) else input - return TextTranslationResult( - translations=[ - await _translate_text( - text, language, prompt, llm, chunk_size, chunk_overlap, callbacks - ) - for text in input - ] - ) - - -async def _translate_text( - text: str, - language: str, - prompt: str | None, - llm: CompletionLLM, - chunk_size: int, - chunk_overlap: int, - callbacks: VerbCallbacks, -) -> str: - """Translate a single piece of text.""" - splitter = TokenTextSplitter( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - ) - - out = "" - chunks = splitter.split_text(text) - for chunk in chunks: - try: - result = await llm( - chunk, - history=[ - { - "role": "system", - "content": (prompt or DEFAULT_TRANSLATION_PROMPT), - } - ], - variables={"language": language}, - ) - out += result.output or "" - except 
Exception as e: - log.exception("error translating text") - callbacks.error("Error translating text", e, traceback.format_exc()) - out += "" - - return out diff --git a/graphrag/index/verbs/text/translate/strategies/typing.py b/graphrag/index/verbs/text/translate/strategies/typing.py deleted file mode 100644 index d91ed735..00000000 --- a/graphrag/index/verbs/text/translate/strategies/typing.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing 'TextTranslationResult' model.""" - -from collections.abc import Awaitable, Callable -from dataclasses import dataclass -from typing import Any - -from datashaper import VerbCallbacks - -from graphrag.index.cache import PipelineCache - - -@dataclass -class TextTranslationResult: - """Text translation result class definition.""" - - translations: list[str] - - -TextTranslationStrategy = Callable[ - [list[str], dict[str, Any], VerbCallbacks, PipelineCache], - Awaitable[TextTranslationResult], -] diff --git a/graphrag/index/verbs/text/translate/text_translate.py b/graphrag/index/verbs/text/translate/text_translate.py deleted file mode 100644 index 8d0faffe..00000000 --- a/graphrag/index/verbs/text/translate/text_translate.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing text_translate methods definition.""" - -from enum import Enum -from typing import Any, cast - -import pandas as pd -from datashaper import ( - AsyncType, - TableContainer, - VerbCallbacks, - VerbInput, - derive_from_rows, - verb, -) - -from graphrag.index.cache import PipelineCache - -from .strategies.typing import TextTranslationStrategy - - -class TextTranslateStrategyType(str, Enum): - """TextTranslateStrategyType class definition.""" - - openai = "openai" - mock = "mock" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -@verb(name="text_translate") -async def text_translate( - input: VerbInput, - cache: PipelineCache, - callbacks: VerbCallbacks, - text_column: str, - to: str, - strategy: dict[str, Any], - async_mode: AsyncType = AsyncType.AsyncIO, - **kwargs, -) -> TableContainer: - """ - Translate a piece of text into another language. - - ## Usage - ```yaml - verb: text_translate - args: - text_column: # The name of the column containing the text to translate - to: # The name of the column to write the translated text to - strategy: # The strategy to use to translate the text, see below for more details - ``` - - ## Strategies - The text translate verb uses a strategy to translate the text. The strategy is an object which defines the strategy to use. The following strategies are available: - - ### openai - This strategy uses openai to translate a piece of text. In particular it uses a LLM to translate a piece of text. 
The strategy config is as follows: - - ```yaml - strategy: - type: openai - language: english # The language to translate to, default: english - prompt: # The prompt to use for the translation, default: None - chunk_size: 2500 # The chunk size to use for the translation, default: 2500 - chunk_overlap: 0 # The chunk overlap to use for the translation, default: 0 - llm: # The configuration for the LLM - type: openai_chat # the type of llm to use, available options are: openai_chat, azure_openai_chat - api_key: !ENV ${GRAPHRAG_OPENAI_API_KEY} # The api key to use for openai - model: !ENV ${GRAPHRAG_OPENAI_MODEL:gpt-4-turbo-preview} # The model to use for openai - max_tokens: !ENV ${GRAPHRAG_MAX_TOKENS:6000} # The max tokens to use for openai - organization: !ENV ${GRAPHRAG_OPENAI_ORGANIZATION} # The organization to use for openai - ``` - """ - output_df = cast(pd.DataFrame, input.get_input()) - strategy_type = strategy["type"] - strategy_args = {**strategy} - strategy_exec = _load_strategy(strategy_type) - - async def run_strategy(row): - text = row[text_column] - result = await strategy_exec(text, strategy_args, callbacks, cache) - - # If it is a single string, then return just the translation for that string - if isinstance(text, str): - return result.translations[0] - - # Otherwise, return a list of translations, one for each item in the input - return list(result.translations) - - results = await derive_from_rows( - output_df, - run_strategy, - callbacks, - scheduling_type=async_mode, - num_threads=kwargs.get("num_threads", 4), - ) - output_df[to] = results - return TableContainer(table=output_df) - - -def _load_strategy(strategy: TextTranslateStrategyType) -> TextTranslationStrategy: - match strategy: - case TextTranslateStrategyType.openai: - from .strategies.openai import run as run_openai - - return run_openai - - case TextTranslateStrategyType.mock: - from .strategies.mock import run as run_mock - - return run_mock - - case _: - msg = f"Unknown strategy: {strategy}" - raise ValueError(msg) diff --git a/graphrag/index/verbs/unzip.py b/graphrag/index/verbs/unzip.py deleted file mode 100644 index 4d8c8da0..00000000 --- a/graphrag/index/verbs/unzip.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing unzip method definition.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - - -# TODO: Check if this is already a thing -# Takes 1|(x,y)|b -# and converts to -# 1|x|y|b -@verb(name="unzip") -def unzip( - input: VerbInput, column: str, to: list[str], **_kwargs: dict -) -> TableContainer: - """Unpacks a column containing a tuple into multiple columns.""" - table = cast(pd.DataFrame, input.get_input()) - - table[to] = pd.DataFrame(table[column].tolist(), index=table.index) - - return TableContainer(table=table) diff --git a/graphrag/index/verbs/zip.py b/graphrag/index/verbs/zip.py deleted file mode 100644 index 462395d3..00000000 --- a/graphrag/index/verbs/zip.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing ds_zip method definition.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - - -@verb(name="zip") -def zip_verb( - input: VerbInput, - to: str, - columns: list[str], - type: str | None = None, # noqa A002 - **_kwargs: dict, -) -> TableContainer: - """ - Zip columns together. 
- - ## Usage - TODO - - """ - table = cast(pd.DataFrame, input.get_input()) - if type is None: - table[to] = list(zip(*[table[col] for col in columns], strict=True)) - - # This one is a little weird - elif type == "dict": - if len(columns) != 2: - msg = f"Expected exactly two columns for a dict, got {columns}" - raise ValueError(msg) - key_col, value_col = columns - - results = [] - for _, row in table.iterrows(): - keys = row[key_col] - values = row[value_col] - output = {} - if len(keys) != len(values): - msg = f"Expected same number of keys and values, got {len(keys)} keys and {len(values)} values" - raise ValueError(msg) - for idx, key in enumerate(keys): - output[key] = values[idx] - results.append(output) - - table[to] = results - return TableContainer(table=table.reset_index(drop=True)) diff --git a/graphrag/index/workflows/v1/create_base_entity_graph.py b/graphrag/index/workflows/v1/create_base_entity_graph.py index 351b2b67..dadaece5 100644 --- a/graphrag/index/workflows/v1/create_base_entity_graph.py +++ b/graphrag/index/workflows/v1/create_base_entity_graph.py @@ -21,7 +21,7 @@ def build_steps( "cluster_graph", {"strategy": {"type": "leiden"}}, ) - clustering_strategy = clustering_config["strategy"] + clustering_strategy = clustering_config.get("strategy") embed_graph_config = config.get( "embed_graph", @@ -36,7 +36,7 @@ def build_steps( } }, ) - embedding_strategy = embed_graph_config["strategy"] + embedding_strategy = embed_graph_config.get("strategy") embed_graph_enabled = config.get("embed_graph_enabled", False) or False graphml_snapshot_enabled = config.get("graphml_snapshot", False) or False diff --git a/graphrag/index/workflows/v1/create_base_extracted_entities.py b/graphrag/index/workflows/v1/create_base_extracted_entities.py index 7d0ea603..e18266b3 100644 --- a/graphrag/index/workflows/v1/create_base_extracted_entities.py +++ b/graphrag/index/workflows/v1/create_base_extracted_entities.py @@ -24,7 +24,7 @@ def build_steps( column = entity_extraction_config.get("text_column", "chunk") id_column = entity_extraction_config.get("id_column", "chunk_id") async_mode = entity_extraction_config.get("async_mode", AsyncType.AsyncIO) - strategy = entity_extraction_config.get("strategy") + extraction_strategy = entity_extraction_config.get("strategy") num_threads = entity_extraction_config.get("num_threads", 4) entity_types = entity_extraction_config.get("entity_types") @@ -71,7 +71,7 @@ def build_steps( "column": column, "id_column": id_column, "async_mode": async_mode, - "strategy": strategy, + "extraction_strategy": extraction_strategy, "num_threads": num_threads, "entity_types": entity_types, "nodes": nodes, diff --git a/graphrag/index/workflows/v1/create_base_text_units.py b/graphrag/index/workflows/v1/create_base_text_units.py index da2d1374..5a1d470c 100644 --- a/graphrag/index/workflows/v1/create_base_text_units.py +++ b/graphrag/index/workflows/v1/create_base_text_units.py @@ -22,7 +22,8 @@ def build_steps( chunk_column_name = config.get("chunk_column", "chunk") chunk_by_columns = config.get("chunk_by", []) or [] n_tokens_column_name = config.get("n_tokens_column", "n_tokens") - text_chunk = config.get("text_chunk", {}) + text_chunk_config = config.get("text_chunk", {}) + chunk_strategy = text_chunk_config.get("strategy") return [ { "verb": "create_base_text_units", @@ -30,7 +31,7 @@ def build_steps( "chunk_column_name": chunk_column_name, "n_tokens_column_name": n_tokens_column_name, "chunk_by_columns": chunk_by_columns, - **text_chunk, + "chunk_strategy": chunk_strategy, 
}, "input": {"source": DEFAULT_INPUT_NAME}, }, diff --git a/graphrag/index/workflows/v1/create_final_community_reports.py b/graphrag/index/workflows/v1/create_final_community_reports.py index 5e933c8e..e859397d 100644 --- a/graphrag/index/workflows/v1/create_final_community_reports.py +++ b/graphrag/index/workflows/v1/create_final_community_reports.py @@ -19,6 +19,10 @@ def build_steps( """ covariates_enabled = config.get("covariates_enabled", False) create_community_reports_config = config.get("create_community_reports", {}) + summarization_strategy = create_community_reports_config.get("strategy") + async_mode = create_community_reports_config.get("async_mode") + num_threads = create_community_reports_config.get("num_threads") + base_text_embed = config.get("text_embed", {}) community_report_full_content_embed_config = config.get( "community_report_full_content_embed", base_text_embed @@ -43,9 +47,6 @@ def build_steps( { "verb": "create_final_community_reports", "args": { - "skip_full_content_embedding": skip_full_content_embedding, - "skip_summary_embedding": skip_summary_embedding, - "skip_title_embedding": skip_title_embedding, "full_content_text_embed": community_report_full_content_embed_config if not skip_full_content_embedding else None, @@ -55,7 +56,9 @@ def build_steps( "title_text_embed": community_report_title_embed_config if not skip_title_embedding else None, - **create_community_reports_config, + "summarization_strategy": summarization_strategy, + "async_mode": async_mode, + "num_threads": num_threads, }, "input": input, }, diff --git a/graphrag/index/workflows/v1/create_final_covariates.py b/graphrag/index/workflows/v1/create_final_covariates.py index 2b558547..1fdab708 100644 --- a/graphrag/index/workflows/v1/create_final_covariates.py +++ b/graphrag/index/workflows/v1/create_final_covariates.py @@ -3,7 +3,9 @@ """A module containing build_steps method definition.""" -from datashaper import AsyncType +from datashaper import ( + AsyncType, +) from graphrag.index.config import PipelineWorkflowConfig, PipelineWorkflowStep @@ -21,9 +23,13 @@ def build_steps( * `workflow:create_base_extracted_entities` """ claim_extract_config = config.get("claim_extract", {}) + extraction_strategy = claim_extract_config.get("strategy") + async_mode = claim_extract_config.get("async_mode", AsyncType.AsyncIO) + num_threads = claim_extract_config.get("num_threads") + chunk_column = config.get("chunk_column", "chunk") chunk_id_column = config.get("chunk_id_column", "chunk_id") - async_mode = config.get("async_mode", AsyncType.AsyncIO) + return [ { "verb": "create_final_covariates", @@ -31,8 +37,9 @@ def build_steps( "column": chunk_column, "id_column": chunk_id_column, "covariate_type": "claim", + "extraction_strategy": extraction_strategy, "async_mode": async_mode, - **claim_extract_config, + "num_threads": num_threads, }, "input": {"source": "workflow:create_base_text_units"}, }, diff --git a/graphrag/index/workflows/v1/create_final_entities.py b/graphrag/index/workflows/v1/create_final_entities.py index d7393f4a..7242800f 100644 --- a/graphrag/index/workflows/v1/create_final_entities.py +++ b/graphrag/index/workflows/v1/create_final_entities.py @@ -22,6 +22,7 @@ def build_steps( entity_name_description_embed_config = config.get( "entity_name_description_embed", base_text_embed ) + skip_name_embedding = config.get("skip_name_embedding", False) skip_description_embedding = config.get("skip_description_embedding", False) @@ -29,8 +30,6 @@ def build_steps( { "verb": "create_final_entities", 
"args": { - "skip_name_embedding": skip_name_embedding, - "skip_description_embedding": skip_description_embedding, "name_text_embed": entity_name_embed_config if not skip_name_embedding else None, diff --git a/graphrag/index/workflows/v1/create_final_nodes.py b/graphrag/index/workflows/v1/create_final_nodes.py index 65ab516d..aecc804c 100644 --- a/graphrag/index/workflows/v1/create_final_nodes.py +++ b/graphrag/index/workflows/v1/create_final_nodes.py @@ -27,6 +27,7 @@ def build_steps( }, }, ) + layout_strategy = layout_graph_config.get("strategy") level_for_node_positions = config.get("level_for_node_positions", 0) return [ @@ -34,7 +35,7 @@ def build_steps( "id": "laid_out_entity_graph", "verb": "create_final_nodes", "args": { - **layout_graph_config, + "layout_strategy": layout_strategy, "level_for_node_positions": level_for_node_positions, "snapshot_top_level_nodes": snapshot_top_level_nodes, }, diff --git a/graphrag/index/workflows/v1/create_final_text_units.py b/graphrag/index/workflows/v1/create_final_text_units.py index 7453f5cc..0638253a 100644 --- a/graphrag/index/workflows/v1/create_final_text_units.py +++ b/graphrag/index/workflows/v1/create_final_text_units.py @@ -21,6 +21,7 @@ def build_steps( """ base_text_embed = config.get("text_embed", {}) text_unit_text_embed_config = config.get("text_unit_text_embed", base_text_embed) + skip_text_unit_embedding = config.get("skip_text_unit_embedding", False) covariates_enabled = config.get("covariates_enabled", False) diff --git a/graphrag/index/workflows/v1/create_summarized_entities.py b/graphrag/index/workflows/v1/create_summarized_entities.py index d4d95786..53821814 100644 --- a/graphrag/index/workflows/v1/create_summarized_entities.py +++ b/graphrag/index/workflows/v1/create_summarized_entities.py @@ -18,8 +18,8 @@ def build_steps( * `workflow:create_base_text_units` """ summarize_descriptions_config = config.get("summarize_descriptions", {}) - strategy = summarize_descriptions_config.get("strategy", {}) - num_threads = strategy.get("num_threads", 4) + summarization_strategy = summarize_descriptions_config.get("strategy") + num_threads = summarize_descriptions_config.get("num_threads", 4) graphml_snapshot_enabled = config.get("graphml_snapshot", False) or False @@ -27,7 +27,7 @@ def build_steps( { "verb": "create_summarized_entities", "args": { - "strategy": strategy, + "summarization_strategy": summarization_strategy, "num_threads": num_threads, "graphml_snapshot_enabled": graphml_snapshot_enabled, }, diff --git a/graphrag/index/workflows/v1/subflows/create_base_documents.py b/graphrag/index/workflows/v1/subflows/create_base_documents.py index 6e682e67..c3e52098 100644 --- a/graphrag/index/workflows/v1/subflows/create_base_documents.py +++ b/graphrag/index/workflows/v1/subflows/create_base_documents.py @@ -29,7 +29,9 @@ def create_base_documents( source = cast(pd.DataFrame, input.get_input()) text_units = cast(pd.DataFrame, get_required_input_table(input, "text_units").table) - output = create_base_documents_flow(source, text_units, document_attribute_columns) + output = create_base_documents_flow( + source, text_units, document_attribute_columns=document_attribute_columns + ) return create_verb_result( cast( diff --git a/graphrag/index/workflows/v1/subflows/create_base_entity_graph.py b/graphrag/index/workflows/v1/subflows/create_base_entity_graph.py index ea1630dd..009da03f 100644 --- a/graphrag/index/workflows/v1/subflows/create_base_entity_graph.py +++ b/graphrag/index/workflows/v1/subflows/create_base_entity_graph.py @@ 
-42,7 +42,7 @@ async def create_base_entity_graph( storage, clustering_strategy, embedding_strategy, - graphml_snapshot_enabled, + graphml_snapshot_enabled=graphml_snapshot_enabled, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/index/workflows/v1/subflows/create_base_extracted_entities.py b/graphrag/index/workflows/v1/subflows/create_base_extracted_entities.py index 846467f3..34660e01 100644 --- a/graphrag/index/workflows/v1/subflows/create_base_extracted_entities.py +++ b/graphrag/index/workflows/v1/subflows/create_base_extracted_entities.py @@ -25,14 +25,14 @@ from graphrag.index.storage import PipelineStorage @verb(name="create_base_extracted_entities", treats_input_tables_as_immutable=True) async def create_base_extracted_entities( input: VerbInput, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, storage: PipelineStorage, column: str, id_column: str, nodes: dict[str, Any], edges: dict[str, Any], - strategy: dict[str, Any] | None, + extraction_strategy: dict[str, Any] | None, async_mode: AsyncType = AsyncType.AsyncIO, entity_types: list[str] | None = None, num_threads: int = 4, @@ -45,18 +45,18 @@ async def create_base_extracted_entities( output = await create_base_extracted_entities_flow( source, - cache, callbacks, + cache, storage, column, id_column, nodes, edges, - strategy, - async_mode, - entity_types, - graphml_snapshot_enabled, - raw_entity_snapshot_enabled, + extraction_strategy, + async_mode=async_mode, + entity_types=entity_types, + graphml_snapshot_enabled=graphml_snapshot_enabled, + raw_entity_snapshot_enabled=raw_entity_snapshot_enabled, num_threads=num_threads, ) diff --git a/graphrag/index/workflows/v1/subflows/create_base_text_units.py b/graphrag/index/workflows/v1/subflows/create_base_text_units.py index 370abc59..18c65008 100644 --- a/graphrag/index/workflows/v1/subflows/create_base_text_units.py +++ b/graphrag/index/workflows/v1/subflows/create_base_text_units.py @@ -26,7 +26,7 @@ def create_base_text_units( chunk_column_name: str, n_tokens_column_name: str, chunk_by_columns: list[str], - strategy: dict[str, Any] | None = None, + chunk_strategy: dict[str, Any] | None = None, **_kwargs: dict, ) -> VerbResult: """All the steps to transform base text_units.""" @@ -38,7 +38,7 @@ def create_base_text_units( chunk_column_name, n_tokens_column_name, chunk_by_columns, - strategy, + chunk_strategy=chunk_strategy, ) return create_verb_result( diff --git a/graphrag/index/workflows/v1/subflows/create_final_community_reports.py b/graphrag/index/workflows/v1/subflows/create_final_community_reports.py index 48074412..e44f6e65 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_community_reports.py +++ b/graphrag/index/workflows/v1/subflows/create_final_community_reports.py @@ -27,7 +27,7 @@ async def create_final_community_reports( input: VerbInput, callbacks: VerbCallbacks, cache: PipelineCache, - strategy: dict, + summarization_strategy: dict, async_mode: AsyncType = AsyncType.AsyncIO, num_threads: int = 4, full_content_text_embed: dict | None = None, @@ -49,12 +49,12 @@ async def create_final_community_reports( claims, callbacks, cache, - strategy, - async_mode, - num_threads, - full_content_text_embed, - summary_text_embed, - title_text_embed, + summarization_strategy, + async_mode=async_mode, + num_threads=num_threads, + full_content_text_embed=full_content_text_embed, + summary_text_embed=summary_text_embed, + title_text_embed=title_text_embed, ) return create_verb_result( diff --git 
a/graphrag/index/workflows/v1/subflows/create_final_covariates.py b/graphrag/index/workflows/v1/subflows/create_final_covariates.py index 8a6c3c90..d0812bc5 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_covariates.py +++ b/graphrag/index/workflows/v1/subflows/create_final_covariates.py @@ -24,11 +24,11 @@ from graphrag.index.flows.create_final_covariates import ( @verb(name="create_final_covariates", treats_input_tables_as_immutable=True) async def create_final_covariates( input: VerbInput, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, column: str, covariate_type: str, - strategy: dict[str, Any] | None, + extraction_strategy: dict[str, Any] | None, async_mode: AsyncType = AsyncType.AsyncIO, entity_types: list[str] | None = None, num_threads: int = 4, @@ -39,14 +39,14 @@ async def create_final_covariates( output = await create_final_covariates_flow( source, - cache, callbacks, + cache, column, covariate_type, - strategy, - async_mode, - entity_types, - num_threads, + extraction_strategy, + async_mode=async_mode, + entity_types=entity_types, + num_threads=num_threads, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/index/workflows/v1/subflows/create_final_documents.py b/graphrag/index/workflows/v1/subflows/create_final_documents.py index 2f665df7..bc883552 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_documents.py +++ b/graphrag/index/workflows/v1/subflows/create_final_documents.py @@ -38,7 +38,7 @@ async def create_final_documents( source, callbacks, cache, - raw_content_text_embed, + raw_content_text_embed=raw_content_text_embed, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/index/workflows/v1/subflows/create_final_entities.py b/graphrag/index/workflows/v1/subflows/create_final_entities.py index 54a10ebf..fa3ae898 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_entities.py +++ b/graphrag/index/workflows/v1/subflows/create_final_entities.py @@ -28,8 +28,8 @@ async def create_final_entities( input: VerbInput, callbacks: VerbCallbacks, cache: PipelineCache, - name_text_embed: dict, - description_text_embed: dict, + name_text_embed: dict | None = None, + description_text_embed: dict | None = None, **_kwargs: dict, ) -> VerbResult: """All the steps to transform final entities.""" @@ -39,8 +39,8 @@ async def create_final_entities( source, callbacks, cache, - name_text_embed, - description_text_embed, + name_text_embed=name_text_embed, + description_text_embed=description_text_embed, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/index/workflows/v1/subflows/create_final_nodes.py b/graphrag/index/workflows/v1/subflows/create_final_nodes.py index 9a1754cb..8971652d 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_nodes.py +++ b/graphrag/index/workflows/v1/subflows/create_final_nodes.py @@ -25,7 +25,7 @@ async def create_final_nodes( input: VerbInput, callbacks: VerbCallbacks, storage: PipelineStorage, - strategy: dict[str, Any], + layout_strategy: dict[str, Any], level_for_node_positions: int, snapshot_top_level_nodes: bool = False, **_kwargs: dict, @@ -37,9 +37,9 @@ async def create_final_nodes( source, callbacks, storage, - strategy, + layout_strategy, level_for_node_positions, - snapshot_top_level_nodes, + snapshot_top_level_nodes=snapshot_top_level_nodes, ) return create_verb_result( diff --git a/graphrag/index/workflows/v1/subflows/create_final_relationships.py b/graphrag/index/workflows/v1/subflows/create_final_relationships.py 
index d28a535c..24222846 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_relationships.py +++ b/graphrag/index/workflows/v1/subflows/create_final_relationships.py @@ -41,7 +41,7 @@ async def create_final_relationships( nodes, callbacks, cache, - description_text_embed, + description_text_embed=description_text_embed, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/index/workflows/v1/subflows/create_final_text_units.py b/graphrag/index/workflows/v1/subflows/create_final_text_units.py index 15d592c1..14bd8399 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_text_units.py +++ b/graphrag/index/workflows/v1/subflows/create_final_text_units.py @@ -50,7 +50,7 @@ async def create_final_text_units( final_covariates, callbacks, cache, - text_text_embed, + text_text_embed=text_text_embed, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/index/workflows/v1/subflows/create_summarized_entities.py b/graphrag/index/workflows/v1/subflows/create_summarized_entities.py index 2d5c917d..4ac2dd5b 100644 --- a/graphrag/index/workflows/v1/subflows/create_summarized_entities.py +++ b/graphrag/index/workflows/v1/subflows/create_summarized_entities.py @@ -27,10 +27,10 @@ from graphrag.index.storage import PipelineStorage ) async def create_summarized_entities( input: VerbInput, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, storage: PipelineStorage, - strategy: dict[str, Any] | None = None, + summarization_strategy: dict[str, Any] | None = None, num_threads: int = 4, graphml_snapshot_enabled: bool = False, **_kwargs: dict, @@ -40,12 +40,12 @@ async def create_summarized_entities( output = await create_summarized_entities_flow( source, - cache, callbacks, + cache, storage, - strategy, - num_threads, - graphml_snapshot_enabled, + summarization_strategy, + num_threads=num_threads, + graphml_snapshot_enabled=graphml_snapshot_enabled, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/prompt_tune/loader/input.py b/graphrag/prompt_tune/loader/input.py index 06799905..a61f0a47 100644 --- a/graphrag/prompt_tune/loader/input.py +++ b/graphrag/prompt_tune/loader/input.py @@ -3,18 +3,16 @@ """Input loading module.""" -from typing import cast - import numpy as np import pandas as pd -from datashaper import NoopVerbCallbacks, TableContainer, VerbInput +from datashaper import NoopVerbCallbacks import graphrag.config.defaults as defs from graphrag.config.models.graph_rag_config import GraphRagConfig from graphrag.index.input import load_input from graphrag.index.llm import load_llm_embeddings +from graphrag.index.operations.chunk_text import chunk_text from graphrag.index.progress.types import ProgressReporter -from graphrag.index.verbs import chunk from graphrag.llm.types.llm_types import EmbeddingLLM from graphrag.prompt_tune.types import DocSelectionType @@ -62,23 +60,20 @@ async def load_docs_in_chunks( dataset = await load_input(config.input, reporter, root) # covert to text units - input = VerbInput(input=TableContainer(table=dataset)) chunk_strategy = config.chunks.resolved_strategy(defs.ENCODING_MODEL) # Use smaller chunks, to avoid huge prompts chunk_strategy["chunk_size"] = chunk_size chunk_strategy["chunk_overlap"] = MIN_CHUNK_OVERLAP - dataset_chunks_table_container = chunk( - input, + dataset_chunks = chunk_text( + dataset, column="text", to="chunks", callbacks=NoopVerbCallbacks(), strategy=chunk_strategy, ) - dataset_chunks = cast(pd.DataFrame, dataset_chunks_table_container.table) - # Select 
chunks into a new df and explode it chunks_df = pd.DataFrame(dataset_chunks["chunks"].explode()) # type: ignore diff --git a/tests/integration/_pipeline/megapipeline.yml b/tests/integration/_pipeline/megapipeline.yml index e8f51d26..a6004b9b 100644 --- a/tests/integration/_pipeline/megapipeline.yml +++ b/tests/integration/_pipeline/megapipeline.yml @@ -24,7 +24,29 @@ workflows: graphml_snapshot: True entity_extract: strategy: - type: nltk + type: graph_intelligence + llm: + type: static_response + responses: + - '("entity"<|>COMPANY_A<|>COMPANY<|>Company_A is a test company) + ## + ("entity"<|>COMPANY_B<|>COMPANY<|>Company_B owns Company_A and also shares an address with Company_A) + ## + ("entity"<|>PERSON_C<|>PERSON<|>Person_C is director of Company_A) + ## + ("relationship"<|>COMPANY_A<|>COMPANY_B<|>Company_A and Company_B are related because Company_A is 100% owned by Company_B and the two companies also share the same address)<|>2) + ## + ("relationship"<|>COMPANY_A<|>PERSON_C<|>Company_A and Person_C are related because Person_C is director of Company_A<|>1))' + + - name: create_summarized_entities + config: + summarize_descriptions: + strategy: + type: graph_intelligence + llm: + type: static_response + responses: + - This is a MOCK response for the LLM. It is summarized! - name: create_base_entity_graph config: diff --git a/tests/unit/indexing/verbs/entities/extraction/strategies/graph_intelligence/test_gi_entity_extraction.py b/tests/unit/indexing/verbs/entities/extraction/strategies/graph_intelligence/test_gi_entity_extraction.py index 1ebca2bf..31a83a26 100644 --- a/tests/unit/indexing/verbs/entities/extraction/strategies/graph_intelligence/test_gi_entity_extraction.py +++ b/tests/unit/indexing/verbs/entities/extraction/strategies/graph_intelligence/test_gi_entity_extraction.py @@ -4,10 +4,12 @@ import unittest import networkx as nx -from graphrag.index.verbs.entities.extraction.strategies.graph_intelligence.run_graph_intelligence import ( - Document, +from graphrag.index.operations.extract_entities.strategies.graph_intelligence import ( run_extract_entities, ) +from graphrag.index.operations.extract_entities.strategies.typing import ( + Document, +) from tests.unit.indexing.verbs.helpers.mock_llm import create_mock_llm @@ -16,7 +18,7 @@ class TestRunChain(unittest.IsolatedAsyncioTestCase): results = await run_extract_entities( docs=[Document("test_text", "1")], entity_types=["person"], - reporter=None, + callbacks=None, args={ "prechunked": True, "max_gleanings": 0, @@ -51,7 +53,7 @@ class TestRunChain(unittest.IsolatedAsyncioTestCase): results = await run_extract_entities( docs=[Document("text_1", "1"), Document("text_2", "2")], entity_types=["person"], - reporter=None, + callbacks=None, args={ "prechunked": True, "max_gleanings": 0, @@ -88,7 +90,7 @@ class TestRunChain(unittest.IsolatedAsyncioTestCase): results = await run_extract_entities( docs=[Document("text_1", "1"), Document("text_2", "2")], entity_types=["person"], - reporter=None, + callbacks=None, args={ "prechunked": True, "max_gleanings": 0, @@ -133,7 +135,7 @@ class TestRunChain(unittest.IsolatedAsyncioTestCase): results = await run_extract_entities( docs=[Document("text_1", "1"), Document("text_2", "2")], entity_types=["person"], - reporter=None, + callbacks=None, args={ "prechunked": True, "max_gleanings": 0, @@ -181,7 +183,7 @@ class TestRunChain(unittest.IsolatedAsyncioTestCase): results = await run_extract_entities( docs=[Document("text_1", "1"), Document("text_2", "2")], entity_types=["person"], - 
reporter=None, + callbacks=None, args={ "prechunked": True, "max_gleanings": 0, diff --git a/tests/unit/indexing/verbs/text/test_split.py b/tests/unit/indexing/verbs/text/test_split.py index d9ced064..abbb0eef 100644 --- a/tests/unit/indexing/verbs/text/test_split.py +++ b/tests/unit/indexing/verbs/text/test_split.py @@ -5,34 +5,34 @@ import unittest import pandas as pd import pytest -from graphrag.index.verbs.text.split import text_split_df +from graphrag.index.operations.split_text import split_text class TestTextSplit(unittest.TestCase): def test_empty_string(self): input = pd.DataFrame([{"in": ""}]) - result = text_split_df(input, "in", "out", ",").to_dict(orient="records") + result = split_text(input, "in", "out", ",").to_dict(orient="records") assert len(result) == 1 assert result[0]["out"] == [] def test_string_without_seperator(self): input = pd.DataFrame([{"in": "test_string_without_seperator"}]) - result = text_split_df(input, "in", "out", ",").to_dict(orient="records") + result = split_text(input, "in", "out", ",").to_dict(orient="records") assert len(result) == 1 assert result[0]["out"] == ["test_string_without_seperator"] def test_string_with_seperator(self): input = pd.DataFrame([{"in": "test_1,test_2"}]) - result = text_split_df(input, "in", "out", ",").to_dict(orient="records") + result = split_text(input, "in", "out", ",").to_dict(orient="records") assert len(result) == 1 assert result[0]["out"] == ["test_1", "test_2"] def test_row_with_list_as_column(self): input = pd.DataFrame([{"in": ["test_1", "test_2"]}]) - result = text_split_df(input, "in", "out", ",").to_dict(orient="records") + result = split_text(input, "in", "out", ",").to_dict(orient="records") assert len(result) == 1 assert result[0]["out"] == ["test_1", "test_2"] @@ -40,11 +40,11 @@ class TestTextSplit(unittest.TestCase): def test_non_string_column_throws_error(self): input = pd.DataFrame([{"in": 5}]) with pytest.raises(TypeError): - text_split_df(input, "in", "out", ",").to_dict(orient="records") + split_text(input, "in", "out", ",").to_dict(orient="records") def test_more_than_one_row_returns_correctly(self): input = pd.DataFrame([{"in": "row_1_1,row_1_2"}, {"in": "row_2_1,row_2_2"}]) - result = text_split_df(input, "in", "out", ",").to_dict(orient="records") + result = split_text(input, "in", "out", ",").to_dict(orient="records") assert len(result) == 2 assert result[0]["out"] == ["row_1_1", "row_1_2"] diff --git a/tests/verbs/test_create_base_extracted_entities.py b/tests/verbs/test_create_base_extracted_entities.py index 029126c1..57ca6003 100644 --- a/tests/verbs/test_create_base_extracted_entities.py +++ b/tests/verbs/test_create_base_extracted_entities.py @@ -2,7 +2,10 @@ # Licensed under the MIT License import networkx as nx +import pytest +from datashaper.errors import VerbParallelizationError +from graphrag.config.enums import LLMType from graphrag.index.storage.memory_pipeline_storage import MemoryPipelineStorage from graphrag.index.workflows.v1.create_base_extracted_entities import ( build_steps, @@ -16,6 +19,25 @@ from .util import ( load_input_tables, ) +MOCK_LLM_RESPONSES = [ + """ + ("entity"<|>COMPANY_A<|>COMPANY<|>Company_A is a test company) + ## + ("entity"<|>COMPANY_B<|>COMPANY<|>Company_B owns Company_A and also shares an address with Company_A) + ## + ("entity"<|>PERSON_C<|>PERSON<|>Person_C is director of Company_A) + ## + ("relationship"<|>COMPANY_A<|>COMPANY_B<|>Company_A and Company_B are related because Company_A is 100% owned by Company_B and the two companies also share the same 
address)<|>2) + ## + ("relationship"<|>COMPANY_A<|>PERSON_C<|>Company_A and Person_C are related because Person_C is director of Company_A<|>1)) + """.strip() +] + +MOCK_LLM_CONFIG = { + "type": LLMType.StaticResponse, + "responses": MOCK_LLM_RESPONSES, +} + async def test_create_base_extracted_entities(): input_tables = load_input_tables(["workflow:create_base_text_units"]) @@ -25,7 +47,7 @@ async def test_create_base_extracted_entities(): config = get_config_for_workflow(workflow_name) - del config["entity_extract"]["strategy"]["llm"] + config["entity_extract"]["strategy"]["llm"] = MOCK_LLM_CONFIG steps = build_steps(config) @@ -57,7 +79,7 @@ async def test_create_base_extracted_entities_with_snapshots(): config = get_config_for_workflow(workflow_name) - del config["entity_extract"]["strategy"]["llm"] + config["entity_extract"]["strategy"]["llm"] = MOCK_LLM_CONFIG config["raw_entity_snapshot"] = True config["graphml_snapshot"] = True @@ -76,3 +98,21 @@ assert actual.columns == expected.columns assert storage.keys() == ["raw_extracted_entities.json", "merged_graph.graphml"] + + +async def test_create_base_extracted_entities_missing_llm_throws(): + input_tables = load_input_tables(["workflow:create_base_text_units"]) + + config = get_config_for_workflow(workflow_name) + + del config["entity_extract"]["strategy"]["llm"] + + steps = build_steps(config) + + with pytest.raises(VerbParallelizationError): + await get_workflow_output( + input_tables, + { + "steps": steps, + }, + ) diff --git a/tests/verbs/test_create_final_community_reports.py b/tests/verbs/test_create_final_community_reports.py index b4f5ecf7..cb846e24 100644 --- a/tests/verbs/test_create_final_community_reports.py +++ b/tests/verbs/test_create_final_community_reports.py @@ -1,6 +1,12 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License +import json + +import pytest +from datashaper.errors import VerbParallelizationError + +from graphrag.config.enums import LLMType from graphrag.index.workflows.v1.create_final_community_reports import ( build_steps, workflow_name, @@ -14,6 +20,27 @@ from .util import ( load_input_tables, ) +MOCK_RESPONSES = [ + json.dumps({ + "title": "", + "summary": "", + "rating": 2, + "rating_explanation": "", + "findings": [ + { + "summary": "", + "explanation": "", + }, + ], + }) +] + +MOCK_LLM_CONFIG = {"type": LLMType.StaticResponse, "responses": MOCK_RESPONSES} diff --git a/tests/verbs/test_create_final_covariates.py b/tests/verbs/test_create_final_covariates.py --- a/tests/verbs/test_create_final_covariates.py +++ b/tests/verbs/test_create_final_covariates.py +MOCK_LLM_RESPONSES = [ + """ + (COMPANY A<|>GOVERNMENT AGENCY B<|>ANTI-COMPETITIVE PRACTICES<|>TRUE<|>2022-01-10T00:00:00<|>2022-01-10T00:00:00<|>Company A was found to engage in anti-competitive practices because it was fined for bid rigging in multiple public tenders published by Government Agency B according to an article published on 2022/01/10<|>According to an article published on 2022/01/10, Company A was fined for bid rigging while participating in multiple public tenders published by Government Agency B.) + """.strip() +] + +MOCK_LLM_CONFIG = {"type": LLMType.StaticResponse, "responses": MOCK_LLM_RESPONSES} + async def test_create_final_covariates(): input_tables = load_input_tables(["workflow:create_base_text_units"]) @@ -22,8 +33,7 @@ async def test_create_final_covariates(): config = get_config_for_workflow(workflow_name) - # deleting the llm config results in a default mock injection in run_gi_extract_claims - del config["claim_extract"]["strategy"]["llm"] + config["claim_extract"]["strategy"]["llm"] = MOCK_LLM_CONFIG steps = build_steps(config) @@ -66,3 +76,21 @@ actual["source_text"][0] == "According to an article published on 2022/01/10, Company A was fined for bid rigging while participating in multiple public tenders published by Government Agency B." ) + + +async def test_create_final_covariates_missing_llm_throws(): + input_tables = load_input_tables(["workflow:create_base_text_units"]) + + config = get_config_for_workflow(workflow_name) + + del config["claim_extract"]["strategy"]["llm"] + + steps = build_steps(config) + + with pytest.raises(VerbParallelizationError): + await get_workflow_output( + input_tables, + { + "steps": steps, + }, + ) diff --git a/tests/verbs/test_create_summarized_entities.py b/tests/verbs/test_create_summarized_entities.py index 7d9ac9b5..c36c9b53 100644 --- a/tests/verbs/test_create_summarized_entities.py +++ b/tests/verbs/test_create_summarized_entities.py @@ -2,7 +2,9 @@ # Licensed under the MIT License import networkx as nx +import pytest +from graphrag.config.enums import LLMType from graphrag.index.storage.memory_pipeline_storage import MemoryPipelineStorage from graphrag.index.workflows.v1.create_summarized_entities import ( build_steps, @@ -16,6 +18,17 @@ from .util import ( load_input_tables, ) +MOCK_LLM_RESPONSES = [ + """ + This is a MOCK response for the LLM. It is summarized!
+ """.strip() +] + +MOCK_LLM_CONFIG = { + "type": LLMType.StaticResponse, + "responses": MOCK_LLM_RESPONSES, +} + async def test_create_summarized_entities(): input_tables = load_input_tables([ @@ -27,7 +40,7 @@ async def test_create_summarized_entities(): config = get_config_for_workflow(workflow_name) - del config["summarize_descriptions"]["strategy"]["llm"] + config["summarize_descriptions"]["strategy"]["llm"] = MOCK_LLM_CONFIG steps = build_steps(config) @@ -76,7 +89,7 @@ async def test_create_summarized_entities_with_snapshots(): config = get_config_for_workflow(workflow_name) - del config["summarize_descriptions"]["strategy"]["llm"] + config["summarize_descriptions"]["strategy"]["llm"] = MOCK_LLM_CONFIG config["graphml_snapshot"] = True steps = build_steps(config) @@ -94,3 +107,23 @@ async def test_create_summarized_entities_with_snapshots(): assert storage.keys() == [ "summarized_graph.graphml", ], "Graph snapshot keys differ" + + +async def test_create_summarized_entities_missing_llm_throws(): + input_tables = load_input_tables([ + "workflow:create_base_extracted_entities", + ]) + + config = get_config_for_workflow(workflow_name) + + del config["summarize_descriptions"]["strategy"]["llm"] + + steps = build_steps(config) + + with pytest.raises(ValueError): # noqa PT011 + await get_workflow_output( + input_tables, + { + "steps": steps, + }, + ) diff --git a/tests/verbs/util.py b/tests/verbs/util.py index 4dff5399..2d6d5b6d 100644 --- a/tests/verbs/util.py +++ b/tests/verbs/util.py @@ -53,6 +53,7 @@ def get_config_for_workflow(name: str) -> PipelineWorkflowConfig: pipeline_config = create_pipeline_config(config) result = next(conf for conf in pipeline_config.workflows if conf.name == name) + return cast(PipelineWorkflowConfig, result.config) From d66901e67e5b49805649a28153e45d432e6f9d6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JunHo=20Kim=20=28=EA=B9=80=EC=A4=80=ED=98=B8=29?= Date: Thu, 10 Oct 2024 06:16:50 +0900 Subject: [PATCH 2/6] Update description of GRAPHRAG_CACHE_BASE_DIR in env_vars.md (#1213) * Update description of GRAPHRAG_CACHE_BASE_DIR in env_vars.md Clarified that `GRAPHRAG_CACHE_BASE_DIR` refers to the base directory path for cache files rather than reporting outputs. This improves the accuracy of the documentation and helps users understand the correct usage of this environment variable. * Update description of `GRAPHRAG_CACHE_BASE_DIR` Simplified the description of `GRAPHRAG_CACHE_BASE_DIR` to make it clearer. Changed "base directory path" to "base path" for conciseness. --------- Co-authored-by: Alonso Guevara --- docsite/posts/config/env_vars.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docsite/posts/config/env_vars.md b/docsite/posts/config/env_vars.md index 406ed366..d2164873 100644 --- a/docsite/posts/config/env_vars.md +++ b/docsite/posts/config/env_vars.md @@ -178,7 +178,7 @@ This section controls the cache mechanism used by the pipeline. This is used to | `GRAPHRAG_CACHE_STORAGE_ACCOUNT_BLOB_URL` | The Azure Storage blob endpoint to use when in `blob` mode and using managed identity. Will have the format `https://.blob.core.windows.net` | `str` | optional | None | | `GRAPHRAG_CACHE_CONNECTION_STRING` | The Azure Storage connection string to use when in `blob` mode. | `str` | optional | None | | `GRAPHRAG_CACHE_CONTAINER_NAME` | The Azure Storage container name to use when in `blob` mode. | `str` | optional | None | -| `GRAPHRAG_CACHE_BASE_DIR` | The base path to the reporting outputs. 
| `str` | optional | None | +| `GRAPHRAG_CACHE_BASE_DIR` | The base path to the cache files. | `str` | optional | None | ## Reporting From d4a0a590f4db059ae8e7bdd8c293cedd15ec741a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JunHo=20Kim=20=28=EA=B9=80=EC=A4=80=ED=98=B8=29?= Date: Thu, 10 Oct 2024 06:20:18 +0900 Subject: [PATCH 3/6] Change config.json references to settings.json in the configuration document. (#1221) Updated the configuration documentation to reflect the default filenames for the configuration file. Default config files are `["settings.yaml", "settings.yml", "settings.json"]` https://github.com/microsoft/graphrag/blob/ce71bcf7fbe9811058f6bbc1eb725c4a1d960e7e/graphrag/config/config_file_loader.py#L15 Co-authored-by: Alonso Guevara --- docsite/posts/config/json_yaml.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docsite/posts/config/json_yaml.md b/docsite/posts/config/json_yaml.md index 8c2e5701..394b4b2d 100644 --- a/docsite/posts/config/json_yaml.md +++ b/docsite/posts/config/json_yaml.md @@ -6,7 +6,7 @@ layout: page date: 2023-01-03 --- -The default configuration mode may be configured by using a `config.json` or `config.yml` file in the data project root. If a `.env` file is present along with this config file, then it will be loaded, and the environment variables defined therein will be available for token replacements in your configuration document using `${ENV_VAR}` syntax. +The default configuration mode may be configured by using a `settings.json` or `settings.yml` file in the data project root. If a `.env` file is present along with this config file, then it will be loaded, and the environment variables defined therein will be available for token replacements in your configuration document using `${ENV_VAR}` syntax. For example: @@ -14,7 +14,7 @@ For example: # .env API_KEY=some_api_key -# config.json +# settings.json { "llm": { "api_key": "${API_KEY}" From 9fa6b91684a193bb9024eb359585481d1634ff5e Mon Sep 17 00:00:00 2001 From: Alonso Guevara Date: Wed, 9 Oct 2024 17:01:54 -0600 Subject: [PATCH 4/6] Chore/community context clean (#1262) * Update community_context.py to check conversation_history_context's value In the concatenation code (lines 90-96), conversation_history_context is joined with community_context without considering the case where conversation_history_context is empty (""). When it is empty, the concatenation should be skipped, since it would otherwise leave community_context, or each element of community_context, with a stray leading "\n\n". Introducing a context_prefix that checks the state of conversation_history_context handles both cases: an empty conversation_history_context yields an empty prefix for concatenation, and a non-empty one preserves the behavior of the previous code.
* Format and semver * Code cleanup --------- Co-authored-by: ZeyuTeng96 <96521059+ZeyuTeng96@users.noreply.github.com> --- .../patch-20241009221929632018.json | 4 ++++ .../global_search/community_context.py | 24 ++++++++++++------- 2 files changed, 20 insertions(+), 8 deletions(-) create mode 100644 .semversioner/next-release/patch-20241009221929632018.json diff --git a/.semversioner/next-release/patch-20241009221929632018.json b/.semversioner/next-release/patch-20241009221929632018.json new file mode 100644 index 00000000..ec560627 --- /dev/null +++ b/.semversioner/next-release/patch-20241009221929632018.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Small cleanup in community context history building" +} diff --git a/graphrag/query/structured_search/global_search/community_context.py b/graphrag/query/structured_search/global_search/community_context.py index d63320c8..f5991526 100644 --- a/graphrag/query/structured_search/global_search/community_context.py +++ b/graphrag/query/structured_search/global_search/community_context.py @@ -87,13 +87,21 @@ class GlobalCommunityContext(GlobalContextBuilder): context_name=context_name, random_state=self.random_state, ) - if isinstance(community_context, list): - final_context = [ - f"{conversation_history_context}\n\n{context}" - for context in community_context - ] - else: - final_context = f"{conversation_history_context}\n\n{community_context}" + # Prepare context_prefix based on whether conversation_history_context exists + context_prefix = ( + f"{conversation_history_context}\n\n" + if conversation_history_context + else "" + ) + + final_context = ( + [f"{context_prefix}{context}" for context in community_context] + if isinstance(community_context, list) + else f"{context_prefix}{community_context}" + ) + + # Update the final context data with the provided community_context_data final_context_data.update(community_context_data) - return (final_context, final_context_data) + + return final_context, final_context_data From cd4f1fa9bac34e43a668bbff54294c96d280df04 Mon Sep 17 00:00:00 2001 From: Sumit K Bhuttan Date: Wed, 9 Oct 2024 19:09:17 -0400 Subject: [PATCH 5/6] Adding fix per comment on Issue-692 (#1255) Co-authored-by: Josh Bradley Co-authored-by: Alonso Guevara --- docsite/posts/get_started.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docsite/posts/get_started.md b/docsite/posts/get_started.md index b0ea2664..eb1989bb 100644 --- a/docsite/posts/get_started.md +++ b/docsite/posts/get_started.md @@ -48,7 +48,7 @@ mkdir -p ./ragtest/input Now let's get a copy of A Christmas Carol by Charles Dickens from a trusted source ```sh -curl https://www.gutenberg.org/cache/epub/24022/pg24022.txt > ./ragtest/input/book.txt +curl https://www.gutenberg.org/cache/epub/24022/pg24022.txt -o ./ragtest/input/book.txt ``` Next we'll inject some required config variables: From ce8749bd19a199e2edb798584d3b83c85f8aabce Mon Sep 17 00:00:00 2001 From: 9prodhi Date: Wed, 9 Oct 2024 18:26:28 -0500 Subject: [PATCH 6/6] Fix: Add await to LLM execution for async handling (#1206) Co-authored-by: Alonso Guevara --- graphrag/llm/openai/openai_completion_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphrag/llm/openai/openai_completion_llm.py b/graphrag/llm/openai/openai_completion_llm.py index bdbac6c1..74511c02 100644 --- a/graphrag/llm/openai/openai_completion_llm.py +++ b/graphrag/llm/openai/openai_completion_llm.py @@ -39,5 +39,5 @@ class OpenAICompletionLLM(BaseLLM[CompletionInput, CompletionOutput]): 
args = get_completion_llm_args( kwargs.get("model_parameters"), self.configuration ) - completion = self.client.completions.create(prompt=input, **args) + completion = await self.client.completions.create(prompt=input, **args) return completion.choices[0].text
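
The `await` added above matters because the underlying client call is asynchronous here: `completions.create` on an async client returns a coroutine, so without `await` the method would hand back the coroutine object itself and the subsequent `completion.choices` lookup would fail at runtime. A minimal standalone sketch of the corrected pattern, assuming the `openai` v1 `AsyncOpenAI` client and the `gpt-3.5-turbo-instruct` completions model (illustrative choices, not the graphrag wrapper itself):

```python
import asyncio

from openai import AsyncOpenAI


async def complete(prompt: str) -> str:
    client = AsyncOpenAI()  # assumes OPENAI_API_KEY is set in the environment
    # completions.create on the async client returns a coroutine; forgetting
    # `await` here would make `completion` the coroutine object itself, and
    # `completion.choices` would raise AttributeError at runtime.
    completion = await client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
    )
    return completion.choices[0].text


if __name__ == "__main__":
    print(asyncio.run(complete("Say hello.")))
```

With the synchronous `OpenAI` client the original non-awaited call would have been correct, which is likely how the missing `await` slipped through unnoticed.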