diff --git a/.semversioner/next-release/patch-20241003214516791831.json b/.semversioner/next-release/patch-20241003214516791831.json new file mode 100644 index 00000000..0fa47e64 --- /dev/null +++ b/.semversioner/next-release/patch-20241003214516791831.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Moving verbs around." +} diff --git a/.semversioner/next-release/patch-20241009221929632018.json b/.semversioner/next-release/patch-20241009221929632018.json new file mode 100644 index 00000000..ec560627 --- /dev/null +++ b/.semversioner/next-release/patch-20241009221929632018.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Small cleanup in community context history building" +} diff --git a/dictionary.txt b/dictionary.txt index b7eb072a..ee01cc68 100644 --- a/dictionary.txt +++ b/dictionary.txt @@ -89,7 +89,6 @@ nbconvert binarize prechunked openai -genid umap concat unhot diff --git a/docsite/posts/config/env_vars.md b/docsite/posts/config/env_vars.md index 406ed366..d2164873 100644 --- a/docsite/posts/config/env_vars.md +++ b/docsite/posts/config/env_vars.md @@ -178,7 +178,7 @@ This section controls the cache mechanism used by the pipeline. This is used to | `GRAPHRAG_CACHE_STORAGE_ACCOUNT_BLOB_URL` | The Azure Storage blob endpoint to use when in `blob` mode and using managed identity. Will have the format `https://.blob.core.windows.net` | `str` | optional | None | | `GRAPHRAG_CACHE_CONNECTION_STRING` | The Azure Storage connection string to use when in `blob` mode. | `str` | optional | None | | `GRAPHRAG_CACHE_CONTAINER_NAME` | The Azure Storage container name to use when in `blob` mode. | `str` | optional | None | -| `GRAPHRAG_CACHE_BASE_DIR` | The base path to the reporting outputs. | `str` | optional | None | +| `GRAPHRAG_CACHE_BASE_DIR` | The base path to the cache files. | `str` | optional | None | ## Reporting diff --git a/docsite/posts/config/json_yaml.md b/docsite/posts/config/json_yaml.md index 8c2e5701..394b4b2d 100644 --- a/docsite/posts/config/json_yaml.md +++ b/docsite/posts/config/json_yaml.md @@ -6,7 +6,7 @@ layout: page date: 2023-01-03 --- -The default configuration mode may be configured by using a `config.json` or `config.yml` file in the data project root. If a `.env` file is present along with this config file, then it will be loaded, and the environment variables defined therein will be available for token replacements in your configuration document using `${ENV_VAR}` syntax. +The default configuration mode may be configured by using a `settings.json` or `settings.yml` file in the data project root. If a `.env` file is present along with this config file, then it will be loaded, and the environment variables defined therein will be available for token replacements in your configuration document using `${ENV_VAR}` syntax. 
For example: @@ -14,7 +14,7 @@ For example: # .env API_KEY=some_api_key -# config.json +# settings.json { "llm": { "api_key": "${API_KEY}" diff --git a/docsite/posts/get_started.md b/docsite/posts/get_started.md index b0ea2664..eb1989bb 100644 --- a/docsite/posts/get_started.md +++ b/docsite/posts/get_started.md @@ -48,7 +48,7 @@ mkdir -p ./ragtest/input Now let's get a copy of A Christmas Carol by Charles Dickens from a trusted source ```sh -curl https://www.gutenberg.org/cache/epub/24022/pg24022.txt > ./ragtest/input/book.txt +curl https://www.gutenberg.org/cache/epub/24022/pg24022.txt -o ./ragtest/input/book.txt ``` Next we'll inject some required config variables: diff --git a/graphrag/config/models/chunking_config.py b/graphrag/config/models/chunking_config.py index 4ca8a8d3..a2b40017 100644 --- a/graphrag/config/models/chunking_config.py +++ b/graphrag/config/models/chunking_config.py @@ -29,7 +29,7 @@ class ChunkingConfig(BaseModel): def resolved_strategy(self, encoding_model: str) -> dict: """Get the resolved chunking strategy.""" - from graphrag.index.verbs.text.chunk import ChunkStrategyType + from graphrag.index.operations.chunk_text import ChunkStrategyType return self.strategy or { "type": ChunkStrategyType.tokens, diff --git a/graphrag/config/models/claim_extraction_config.py b/graphrag/config/models/claim_extraction_config.py index a26fdad2..6a4de8e3 100644 --- a/graphrag/config/models/claim_extraction_config.py +++ b/graphrag/config/models/claim_extraction_config.py @@ -38,7 +38,7 @@ class ClaimExtractionConfig(LLMConfig): def resolved_strategy(self, root_dir: str, encoding_model: str) -> dict: """Get the resolved claim extraction strategy.""" - from graphrag.index.verbs.covariates.extract_covariates import ( + from graphrag.index.operations.extract_covariates import ( ExtractClaimsStrategyType, ) diff --git a/graphrag/config/models/cluster_graph_config.py b/graphrag/config/models/cluster_graph_config.py index 3029baeb..805e5a18 100644 --- a/graphrag/config/models/cluster_graph_config.py +++ b/graphrag/config/models/cluster_graph_config.py @@ -20,7 +20,7 @@ class ClusterGraphConfig(BaseModel): def resolved_strategy(self) -> dict: """Get the resolved cluster strategy.""" - from graphrag.index.verbs.graph.clustering import GraphCommunityStrategyType + from graphrag.index.operations.cluster_graph import GraphCommunityStrategyType return self.strategy or { "type": GraphCommunityStrategyType.leiden, diff --git a/graphrag/config/models/community_reports_config.py b/graphrag/config/models/community_reports_config.py index ab55063c..0eafa81c 100644 --- a/graphrag/config/models/community_reports_config.py +++ b/graphrag/config/models/community_reports_config.py @@ -32,7 +32,9 @@ class CommunityReportsConfig(LLMConfig): def resolved_strategy(self, root_dir) -> dict: """Get the resolved community report extraction strategy.""" - from graphrag.index.verbs.graph.report import CreateCommunityReportsStrategyType + from graphrag.index.operations.summarize_communities import ( + CreateCommunityReportsStrategyType, + ) return self.strategy or { "type": CreateCommunityReportsStrategyType.graph_intelligence, diff --git a/graphrag/config/models/embed_graph_config.py b/graphrag/config/models/embed_graph_config.py index e3f717c0..12dd90cf 100644 --- a/graphrag/config/models/embed_graph_config.py +++ b/graphrag/config/models/embed_graph_config.py @@ -36,7 +36,7 @@ class EmbedGraphConfig(BaseModel): def resolved_strategy(self) -> dict: """Get the resolved node2vec strategy.""" - from 
graphrag.index.operations.embed_graph.embed_graph import ( + from graphrag.index.operations.embed_graph import ( EmbedGraphStrategyType, ) diff --git a/graphrag/config/models/entity_extraction_config.py b/graphrag/config/models/entity_extraction_config.py index ca160bc4..08055d51 100644 --- a/graphrag/config/models/entity_extraction_config.py +++ b/graphrag/config/models/entity_extraction_config.py @@ -35,7 +35,9 @@ class EntityExtractionConfig(LLMConfig): def resolved_strategy(self, root_dir: str, encoding_model: str) -> dict: """Get the resolved entity extraction strategy.""" - from graphrag.index.verbs.entities.extraction import ExtractEntityStrategyType + from graphrag.index.operations.extract_entities import ( + ExtractEntityStrategyType, + ) return self.strategy or { "type": ExtractEntityStrategyType.graph_intelligence, diff --git a/graphrag/config/models/summarize_descriptions_config.py b/graphrag/config/models/summarize_descriptions_config.py index 9747d949..9104a60a 100644 --- a/graphrag/config/models/summarize_descriptions_config.py +++ b/graphrag/config/models/summarize_descriptions_config.py @@ -28,7 +28,9 @@ class SummarizeDescriptionsConfig(LLMConfig): def resolved_strategy(self, root_dir: str) -> dict: """Get the resolved description summarization strategy.""" - from graphrag.index.verbs.entities.summarize import SummarizeStrategyType + from graphrag.index.operations.summarize_descriptions import ( + SummarizeStrategyType, + ) return self.strategy or { "type": SummarizeStrategyType.graph_intelligence, diff --git a/graphrag/config/models/text_embedding_config.py b/graphrag/config/models/text_embedding_config.py index cec0ee46..abd2f2bf 100644 --- a/graphrag/config/models/text_embedding_config.py +++ b/graphrag/config/models/text_embedding_config.py @@ -35,7 +35,7 @@ class TextEmbeddingConfig(LLMConfig): def resolved_strategy(self) -> dict: """Get the resolved text embedding strategy.""" - from graphrag.index.operations.embed_text.embed_text import ( + from graphrag.index.operations.embed_text import ( TextEmbedStrategyType, ) diff --git a/graphrag/index/flows/create_base_entity_graph.py b/graphrag/index/flows/create_base_entity_graph.py index 25f6375e..39880a45 100644 --- a/graphrag/index/flows/create_base_entity_graph.py +++ b/graphrag/index/flows/create_base_entity_graph.py @@ -10,10 +10,10 @@ from datashaper import ( VerbCallbacks, ) -from graphrag.index.operations.embed_graph.embed_graph import embed_graph +from graphrag.index.operations.cluster_graph import cluster_graph +from graphrag.index.operations.embed_graph import embed_graph +from graphrag.index.operations.snapshot_rows import snapshot_rows from graphrag.index.storage import PipelineStorage -from graphrag.index.verbs.graph.clustering.cluster_graph import cluster_graph_df -from graphrag.index.verbs.snapshot_rows import snapshot_rows_df async def create_base_entity_graph( @@ -25,7 +25,7 @@ async def create_base_entity_graph( graphml_snapshot_enabled: bool = False, ) -> pd.DataFrame: """All the steps to create the base entity graph.""" - clustered = cluster_graph_df( + clustered = cluster_graph( entities, callbacks, column="entity_graph", @@ -35,7 +35,7 @@ async def create_base_entity_graph( ) if graphml_snapshot_enabled: - await snapshot_rows_df( + await snapshot_rows( clustered, column="clustered_graph", base_name="clustered_graph", @@ -54,7 +54,7 @@ async def create_base_entity_graph( # take second snapshot after embedding # todo: this could be skipped if embedding isn't performed, other wise it is a copy of the 
regular graph? if graphml_snapshot_enabled: - await snapshot_rows_df( + await snapshot_rows( clustered, column="entity_graph", base_name="embedded_graph", diff --git a/graphrag/index/flows/create_base_extracted_entities.py b/graphrag/index/flows/create_base_extracted_entities.py index b538f18f..bfbf4d23 100644 --- a/graphrag/index/flows/create_base_extracted_entities.py +++ b/graphrag/index/flows/create_base_extracted_entities.py @@ -12,23 +12,23 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache +from graphrag.index.operations.extract_entities import extract_entities +from graphrag.index.operations.merge_graphs import merge_graphs +from graphrag.index.operations.snapshot import snapshot +from graphrag.index.operations.snapshot_rows import snapshot_rows from graphrag.index.storage import PipelineStorage -from graphrag.index.verbs.entities.extraction.entity_extract import entity_extract_df -from graphrag.index.verbs.graph.merge.merge_graphs import merge_graphs_df -from graphrag.index.verbs.snapshot import snapshot_df -from graphrag.index.verbs.snapshot_rows import snapshot_rows_df async def create_base_extracted_entities( text_units: pd.DataFrame, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, storage: PipelineStorage, column: str, id_column: str, nodes: dict[str, Any], edges: dict[str, Any], - strategy: dict[str, Any] | None, + extraction_strategy: dict[str, Any] | None, async_mode: AsyncType = AsyncType.AsyncIO, entity_types: list[str] | None = None, graphml_snapshot_enabled: bool = False, @@ -36,13 +36,13 @@ async def create_base_extracted_entities( num_threads: int = 4, ) -> pd.DataFrame: """All the steps to extract and format covariates.""" - entity_graph = await entity_extract_df( + entity_graph = await extract_entities( text_units, - cache, callbacks, + cache, column=column, id_column=id_column, - strategy=strategy, + strategy=extraction_strategy, async_mode=async_mode, entity_types=entity_types, to="entities", @@ -51,14 +51,14 @@ async def create_base_extracted_entities( ) if raw_entity_snapshot_enabled: - await snapshot_df( + await snapshot( entity_graph, name="raw_extracted_entities", storage=storage, formats=["json"], ) - merged_graph = merge_graphs_df( + merged_graph = merge_graphs( entity_graph, callbacks, column="entity_graph", @@ -68,7 +68,7 @@ async def create_base_extracted_entities( ) if graphml_snapshot_enabled: - await snapshot_rows_df( + await snapshot_rows( merged_graph, base_name="merged_graph", column="entity_graph", diff --git a/graphrag/index/flows/create_base_text_units.py b/graphrag/index/flows/create_base_text_units.py index 091b9221..4e07e868 100644 --- a/graphrag/index/flows/create_base_text_units.py +++ b/graphrag/index/flows/create_base_text_units.py @@ -3,14 +3,19 @@ """All the steps to transform base text_units.""" +from dataclasses import dataclass from typing import Any, cast import pandas as pd -from datashaper import VerbCallbacks +from datashaper import ( + FieldAggregateOperation, + Progress, + VerbCallbacks, + aggregate_operation_mapping, +) -from graphrag.index.verbs.genid import genid_df -from graphrag.index.verbs.overrides.aggregate import aggregate_df -from graphrag.index.verbs.text.chunk.text_chunk import chunk_df +from graphrag.index.operations.chunk_text import chunk_text +from graphrag.index.utils import gen_md5_hash def create_base_text_units( @@ -19,7 +24,7 @@ def create_base_text_units( chunk_column_name: str, n_tokens_column_name: str, chunk_by_columns: list[str], - strategy: dict[str, 
Any] | None = None, + chunk_strategy: dict[str, Any] | None = None, ) -> pd.DataFrame: """All the steps to transform base text_units.""" sort = documents.sort_values(by=["id"], ascending=[True]) @@ -28,7 +33,9 @@ def create_base_text_units( zip(*[sort[col] for col in ["id", "text"]], strict=True) ) - aggregated = aggregate_df( + callbacks.progress(Progress(percent=0)) + + aggregated = _aggregate_df( sort, groupby=[*chunk_by_columns] if len(chunk_by_columns) > 0 else None, aggregations=[ @@ -40,12 +47,14 @@ def create_base_text_units( ], ) - chunked = chunk_df( + callbacks.progress(Progress(percent=1)) + + chunked = chunk_text( aggregated, column="texts", to="chunks", callbacks=callbacks, - strategy=strategy, + strategy=chunk_strategy, ) chunked = cast(pd.DataFrame, chunked[[*chunk_by_columns, "chunks"]]) @@ -56,11 +65,9 @@ def create_base_text_units( }, inplace=True, ) - - chunked = genid_df( - chunked, to="chunk_id", method="md5_hash", hash=[chunk_column_name] + chunked["chunk_id"] = chunked.apply( + lambda row: gen_md5_hash(row, [chunk_column_name]), axis=1 ) - chunked[["document_ids", chunk_column_name, n_tokens_column_name]] = pd.DataFrame( chunked[chunk_column_name].tolist(), index=chunked.index ) @@ -69,3 +76,57 @@ def create_base_text_units( return cast( pd.DataFrame, chunked[chunked[chunk_column_name].notna()].reset_index(drop=True) ) + + +# TODO: would be nice to inline this completely in the main method with pandas +def _aggregate_df( + input: pd.DataFrame, + aggregations: list[dict[str, Any]], + groupby: list[str] | None = None, +) -> pd.DataFrame: + """Aggregate method definition.""" + aggregations_to_apply = _load_aggregations(aggregations) + df_aggregations = { + agg.column: _get_pandas_agg_operation(agg) + for agg in aggregations_to_apply.values() + } + if groupby is None: + output_grouped = input.groupby(lambda _x: True) + else: + output_grouped = input.groupby(groupby, sort=False) + output = cast(pd.DataFrame, output_grouped.agg(df_aggregations)) + output.rename( + columns={agg.column: agg.to for agg in aggregations_to_apply.values()}, + inplace=True, + ) + output.columns = [agg.to for agg in aggregations_to_apply.values()] + return output.reset_index() + + +@dataclass +class Aggregation: + """Aggregation class method definition.""" + + column: str | None + operation: str + to: str + + # Only useful for the concat operation + separator: str | None = None + + +def _get_pandas_agg_operation(agg: Aggregation) -> Any: + if agg.operation == "string_concat": + return (agg.separator or ",").join + return aggregate_operation_mapping[FieldAggregateOperation(agg.operation)] + + +def _load_aggregations( + aggregations: list[dict[str, Any]], +) -> dict[str, Aggregation]: + return { + aggregation["column"]: Aggregation( + aggregation["column"], aggregation["operation"], aggregation["to"] + ) + for aggregation in aggregations + } diff --git a/graphrag/index/flows/create_final_communities.py b/graphrag/index/flows/create_final_communities.py index 51654121..23c84c56 100644 --- a/graphrag/index/flows/create_final_communities.py +++ b/graphrag/index/flows/create_final_communities.py @@ -8,7 +8,7 @@ from datashaper import ( VerbCallbacks, ) -from graphrag.index.verbs.graph.unpack import unpack_graph_df +from graphrag.index.operations.unpack_graph import unpack_graph def create_final_communities( @@ -16,8 +16,8 @@ def create_final_communities( callbacks: VerbCallbacks, ) -> pd.DataFrame: """All the steps to transform final communities.""" - graph_nodes = unpack_graph_df(entity_graph, 
callbacks, "clustered_graph", "nodes") - graph_edges = unpack_graph_df(entity_graph, callbacks, "clustered_graph", "edges") + graph_nodes = unpack_graph(entity_graph, callbacks, "clustered_graph", "nodes") + graph_edges = unpack_graph(entity_graph, callbacks, "clustered_graph", "edges") # Merge graph_nodes with graph_edges for both source and target matches source_clusters = graph_nodes.merge( diff --git a/graphrag/index/flows/create_final_community_reports.py b/graphrag/index/flows/create_final_community_reports.py index ddf7ea69..3556fa3f 100644 --- a/graphrag/index/flows/create_final_community_reports.py +++ b/graphrag/index/flows/create_final_community_reports.py @@ -31,15 +31,11 @@ from graphrag.index.graph.extractors.community_reports.schemas import ( NODE_ID, NODE_NAME, ) -from graphrag.index.operations.embed_text.embed_text import embed_text -from graphrag.index.verbs.graph.report.create_community_reports import ( - create_community_reports_df, -) -from graphrag.index.verbs.graph.report.prepare_community_reports import ( - prepare_community_reports_df, -) -from graphrag.index.verbs.graph.report.restore_community_hierarchy import ( - restore_community_hierarchy_df, +from graphrag.index.operations.embed_text import embed_text +from graphrag.index.operations.summarize_communities import ( + prepare_community_reports, + restore_community_hierarchy, + summarize_communities, ) @@ -49,7 +45,7 @@ async def create_final_community_reports( claims_input: pd.DataFrame | None, callbacks: VerbCallbacks, cache: PipelineCache, - strategy: dict, + summarization_strategy: dict, async_mode: AsyncType = AsyncType.AsyncIO, num_threads: int = 4, full_content_text_embed: dict | None = None, @@ -64,19 +60,23 @@ async def create_final_community_reports( if claims_input is not None: claims = _prep_claims(claims_input) - community_hierarchy = restore_community_hierarchy_df(nodes) + community_hierarchy = restore_community_hierarchy(nodes) - local_contexts = prepare_community_reports_df( - nodes, edges, claims, callbacks, strategy.get("max_input_length", 16_000) + local_contexts = prepare_community_reports( + nodes, + edges, + claims, + callbacks, + summarization_strategy.get("max_input_length", 16_000), ) - community_reports = await create_community_reports_df( + community_reports = await summarize_communities( local_contexts, nodes, community_hierarchy, callbacks, cache, - strategy, + summarization_strategy, async_mode=async_mode, num_threads=num_threads, ) diff --git a/graphrag/index/flows/create_final_covariates.py b/graphrag/index/flows/create_final_covariates.py index 98b352e4..09ec9f8f 100644 --- a/graphrag/index/flows/create_final_covariates.py +++ b/graphrag/index/flows/create_final_covariates.py @@ -13,30 +13,30 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache -from graphrag.index.verbs.covariates.extract_covariates.extract_covariates import ( - extract_covariates_df, +from graphrag.index.operations.extract_covariates import ( + extract_covariates, ) async def create_final_covariates( text_units: pd.DataFrame, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, column: str, covariate_type: str, - strategy: dict[str, Any] | None, + extraction_strategy: dict[str, Any] | None, async_mode: AsyncType = AsyncType.AsyncIO, entity_types: list[str] | None = None, num_threads: int = 4, ) -> pd.DataFrame: """All the steps to extract and format covariates.""" - covariates = await extract_covariates_df( + covariates = await extract_covariates( text_units, - cache, 
callbacks, + cache, column, covariate_type, - strategy, + extraction_strategy, async_mode, entity_types, num_threads, diff --git a/graphrag/index/flows/create_final_documents.py b/graphrag/index/flows/create_final_documents.py index c8f35132..29504000 100644 --- a/graphrag/index/flows/create_final_documents.py +++ b/graphrag/index/flows/create_final_documents.py @@ -9,7 +9,7 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache -from graphrag.index.operations.embed_text.embed_text import embed_text +from graphrag.index.operations.embed_text import embed_text async def create_final_documents( diff --git a/graphrag/index/flows/create_final_entities.py b/graphrag/index/flows/create_final_entities.py index e653f0f1..9601cb31 100644 --- a/graphrag/index/flows/create_final_entities.py +++ b/graphrag/index/flows/create_final_entities.py @@ -9,22 +9,22 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache -from graphrag.index.operations.embed_text.embed_text import embed_text -from graphrag.index.verbs.graph.unpack import unpack_graph_df -from graphrag.index.verbs.text.split import text_split_df +from graphrag.index.operations.embed_text import embed_text +from graphrag.index.operations.split_text import split_text +from graphrag.index.operations.unpack_graph import unpack_graph async def create_final_entities( entity_graph: pd.DataFrame, callbacks: VerbCallbacks, cache: PipelineCache, - name_text_embed: dict, - description_text_embed: dict, + name_text_embed: dict | None = None, + description_text_embed: dict | None = None, ) -> pd.DataFrame: """All the steps to transform final entities.""" # Process nodes nodes = ( - unpack_graph_df(entity_graph, callbacks, "clustered_graph", "nodes") + unpack_graph(entity_graph, callbacks, "clustered_graph", "nodes") .rename(columns={"label": "name"}) .loc[ :, @@ -44,7 +44,7 @@ async def create_final_entities( nodes = nodes.loc[nodes["name"].notna()] # Split 'source_id' column into 'text_unit_ids' - nodes = text_split_df( + nodes = split_text( nodes, column="source_id", separator=",", to="text_unit_ids" ).drop(columns=["source_id"]) diff --git a/graphrag/index/flows/create_final_nodes.py b/graphrag/index/flows/create_final_nodes.py index 4597a6f0..fb0b6890 100644 --- a/graphrag/index/flows/create_final_nodes.py +++ b/graphrag/index/flows/create_final_nodes.py @@ -10,27 +10,27 @@ from datashaper import ( VerbCallbacks, ) +from graphrag.index.operations.layout_graph import layout_graph +from graphrag.index.operations.snapshot import snapshot +from graphrag.index.operations.unpack_graph import unpack_graph from graphrag.index.storage import PipelineStorage -from graphrag.index.verbs.graph.layout.layout_graph import layout_graph_df -from graphrag.index.verbs.graph.unpack import unpack_graph_df -from graphrag.index.verbs.snapshot import snapshot_df async def create_final_nodes( entity_graph: pd.DataFrame, callbacks: VerbCallbacks, storage: PipelineStorage, - strategy: dict[str, Any], + layout_strategy: dict[str, Any], level_for_node_positions: int, snapshot_top_level_nodes: bool = False, ) -> pd.DataFrame: """All the steps to transform final nodes.""" laid_out_entity_graph = cast( pd.DataFrame, - layout_graph_df( + layout_graph( entity_graph, callbacks, - strategy, + layout_strategy, embeddings_column="embeddings", graph_column="clustered_graph", to="node_positions", @@ -40,7 +40,7 @@ async def create_final_nodes( nodes = cast( pd.DataFrame, - unpack_graph_df( + unpack_graph( laid_out_entity_graph, callbacks, 
column="positioned_graph", type="nodes" ), ) @@ -51,7 +51,7 @@ async def create_final_nodes( nodes = cast(pd.DataFrame, nodes[["id", "x", "y"]]) if snapshot_top_level_nodes: - await snapshot_df( + await snapshot( nodes, name="top_level_nodes", storage=storage, diff --git a/graphrag/index/flows/create_final_relationships.py b/graphrag/index/flows/create_final_relationships.py index 8e5bcd7d..ba82c5bc 100644 --- a/graphrag/index/flows/create_final_relationships.py +++ b/graphrag/index/flows/create_final_relationships.py @@ -11,11 +11,11 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache -from graphrag.index.operations.embed_text.embed_text import embed_text -from graphrag.index.verbs.graph.compute_edge_combined_degree import ( - compute_edge_combined_degree_df, +from graphrag.index.operations.compute_edge_combined_degree import ( + compute_edge_combined_degree, ) -from graphrag.index.verbs.graph.unpack import unpack_graph_df +from graphrag.index.operations.embed_text import embed_text +from graphrag.index.operations.unpack_graph import unpack_graph async def create_final_relationships( @@ -26,7 +26,7 @@ async def create_final_relationships( description_text_embed: dict | None = None, ) -> pd.DataFrame: """All the steps to transform final relationships.""" - graph_edges = unpack_graph_df(entity_graph, callbacks, "clustered_graph", "edges") + graph_edges = unpack_graph(entity_graph, callbacks, "clustered_graph", "edges") graph_edges.rename(columns={"source_id": "text_unit_ids"}, inplace=True) @@ -49,7 +49,7 @@ async def create_final_relationships( filtered_nodes = nodes[nodes["level"] == 0].reset_index(drop=True) filtered_nodes = cast(pd.DataFrame, filtered_nodes[["title", "degree"]]) - edge_combined_degree = compute_edge_combined_degree_df( + edge_combined_degree = compute_edge_combined_degree( pruned_edges, filtered_nodes, to="rank", diff --git a/graphrag/index/flows/create_final_text_units.py b/graphrag/index/flows/create_final_text_units.py index 2522c4c9..a63d797f 100644 --- a/graphrag/index/flows/create_final_text_units.py +++ b/graphrag/index/flows/create_final_text_units.py @@ -11,7 +11,7 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache -from graphrag.index.operations.embed_text.embed_text import embed_text +from graphrag.index.operations.embed_text import embed_text async def create_final_text_units( diff --git a/graphrag/index/flows/create_summarized_entities.py b/graphrag/index/flows/create_summarized_entities.py index dc5c6d25..a9a5d59a 100644 --- a/graphrag/index/flows/create_summarized_entities.py +++ b/graphrag/index/flows/create_summarized_entities.py @@ -11,35 +11,35 @@ from datashaper import ( ) from graphrag.index.cache import PipelineCache -from graphrag.index.storage import PipelineStorage -from graphrag.index.verbs.entities.summarize.description_summarize import ( - summarize_descriptions_df, +from graphrag.index.operations.snapshot_rows import snapshot_rows +from graphrag.index.operations.summarize_descriptions import ( + summarize_descriptions, ) -from graphrag.index.verbs.snapshot_rows import snapshot_rows_df +from graphrag.index.storage import PipelineStorage async def create_summarized_entities( entities: pd.DataFrame, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, storage: PipelineStorage, - strategy: dict[str, Any] | None = None, + summarization_strategy: dict[str, Any] | None = None, num_threads: int = 4, graphml_snapshot_enabled: bool = False, ) -> pd.DataFrame: """All the steps to 
summarize entities.""" - summarized = await summarize_descriptions_df( + summarized = await summarize_descriptions( entities, - cache, callbacks, + cache, column="entity_graph", to="entity_graph", - strategy=strategy, + strategy=summarization_strategy, num_threads=num_threads, ) if graphml_snapshot_enabled: - await snapshot_rows_df( + await snapshot_rows( summarized, column="entity_graph", base_name="summarized_graph", diff --git a/graphrag/index/operations/chunk_text/__init__.py b/graphrag/index/operations/chunk_text/__init__.py new file mode 100644 index 00000000..273ff0ab --- /dev/null +++ b/graphrag/index/operations/chunk_text/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""The Indexing Engine text chunk package root.""" + +from .chunk_text import ChunkStrategy, ChunkStrategyType, chunk_text + +__all__ = ["ChunkStrategy", "ChunkStrategyType", "chunk_text"] diff --git a/graphrag/index/verbs/text/chunk/text_chunk.py b/graphrag/index/operations/chunk_text/chunk_text.py similarity index 80% rename from graphrag/index/verbs/text/chunk/text_chunk.py rename to graphrag/index/operations/chunk_text/chunk_text.py index 436fdbec..bbcc750c 100644 --- a/graphrag/index/verbs/text/chunk/text_chunk.py +++ b/graphrag/index/operations/chunk_text/chunk_text.py @@ -3,59 +3,30 @@ """A module containing _get_num_total, chunk, run_strategy and load_strategy methods definitions.""" -from enum import Enum from typing import Any, cast import pandas as pd from datashaper import ( ProgressTicker, - TableContainer, VerbCallbacks, - VerbInput, progress_ticker, - verb, ) -from .strategies.typing import ChunkStrategy as ChunkStrategy -from .typing import ChunkInput +from .typing import ChunkInput, ChunkStrategy, ChunkStrategyType -def _get_num_total(output: pd.DataFrame, column: str) -> int: - num_total = 0 - for row in output[column]: - if isinstance(row, str): - num_total += 1 - else: - num_total += len(row) - return num_total - - -class ChunkStrategyType(str, Enum): - """ChunkStrategy class definition.""" - - tokens = "tokens" - sentence = "sentence" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -@verb(name="chunk") -def chunk( - input: VerbInput, +def chunk_text( + input: pd.DataFrame, column: str, to: str, callbacks: VerbCallbacks, strategy: dict[str, Any] | None = None, - **_kwargs, -) -> TableContainer: +) -> pd.DataFrame: """ Chunk a piece of text into smaller pieces. 
## Usage ```yaml - verb: text_chunk args: column: # The name of the column containing the text to chunk, this can either be a column with text, or a column with a list[tuple[doc_id, str]] to: # The name of the column to output the chunks to @@ -85,21 +56,6 @@ def chunk( type: sentence ``` """ - input_table = cast(pd.DataFrame, input.get_input()) - - output = chunk_df(input_table, column, to, callbacks, strategy) - - return TableContainer(table=output) - - -def chunk_df( - input: pd.DataFrame, - column: str, - to: str, - callbacks: VerbCallbacks, - strategy: dict[str, Any] | None = None, -) -> pd.DataFrame: - """Chunk a piece of text into smaller pieces.""" output = input if strategy is None: strategy = {} @@ -161,17 +117,27 @@ def load_strategy(strategy: ChunkStrategyType) -> ChunkStrategy: """Load strategy method definition.""" match strategy: case ChunkStrategyType.tokens: - from .strategies.tokens import run as run_tokens + from .strategies import run_tokens return run_tokens case ChunkStrategyType.sentence: # NLTK from graphrag.index.bootstrap import bootstrap - from .strategies.sentence import run as run_sentence + from .strategies import run_sentences bootstrap() - return run_sentence + return run_sentences case _: msg = f"Unknown strategy: {strategy}" raise ValueError(msg) + + +def _get_num_total(output: pd.DataFrame, column: str) -> int: + num_total = 0 + for row in output[column]: + if isinstance(row, str): + num_total += 1 + else: + num_total += len(row) + return num_total diff --git a/graphrag/index/verbs/text/chunk/strategies/tokens.py b/graphrag/index/operations/chunk_text/strategies.py similarity index 78% rename from graphrag/index/verbs/text/chunk/strategies/tokens.py rename to graphrag/index/operations/chunk_text/strategies.py index 6426c783..7507784b 100644 --- a/graphrag/index/verbs/text/chunk/strategies/tokens.py +++ b/graphrag/index/operations/chunk_text/strategies.py @@ -1,23 +1,25 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License -"""A module containing run and split_text_on_tokens methods definition.""" +"""A module containing chunk strategies.""" from collections.abc import Iterable from typing import Any +import nltk import tiktoken from datashaper import ProgressTicker import graphrag.config.defaults as defs from graphrag.index.text_splitting import Tokenizer -from graphrag.index.verbs.text.chunk.typing import TextChunk + +from .typing import TextChunk -def run( +def run_tokens( input: list[str], args: dict[str, Any], tick: ProgressTicker ) -> Iterable[TextChunk]: - """Chunks text into multiple parts. 
A pipeline verb.""" + """Chunks text into chunks based on encoding tokens.""" tokens_per_chunk = args.get("chunk_size", defs.CHUNK_SIZE) chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP) encoding_name = args.get("encoding_name", defs.ENCODING_MODEL) @@ -31,7 +33,7 @@ def run( def decode(tokens: list[int]) -> str: return enc.decode(tokens) - return split_text_on_tokens( + return _split_text_on_tokens( input, Tokenizer( chunk_overlap=chunk_overlap, @@ -45,7 +47,7 @@ def run( # Adapted from - https://github.com/langchain-ai/langchain/blob/77b359edf5df0d37ef0d539f678cf64f5557cb54/libs/langchain/langchain/text_splitter.py#L471 # So we could have better control over the chunking process -def split_text_on_tokens( +def _split_text_on_tokens( texts: list[str], enc: Tokenizer, tick: ProgressTicker ) -> list[TextChunk]: """Split incoming text and return chunks.""" @@ -79,3 +81,17 @@ def split_text_on_tokens( chunk_ids = input_ids[start_idx:cur_idx] return result + + +def run_sentences( + input: list[str], _args: dict[str, Any], tick: ProgressTicker +) -> Iterable[TextChunk]: + """Chunks text into multiple parts by sentence.""" + for doc_idx, text in enumerate(input): + sentences = nltk.sent_tokenize(text) + for sentence in sentences: + yield TextChunk( + text_chunk=sentence, + source_doc_indices=[doc_idx], + ) + tick(1) diff --git a/graphrag/index/verbs/text/chunk/typing.py b/graphrag/index/operations/chunk_text/typing.py similarity index 50% rename from graphrag/index/verbs/text/chunk/typing.py rename to graphrag/index/operations/chunk_text/typing.py index 3a42cf68..ebfa4db9 100644 --- a/graphrag/index/verbs/text/chunk/typing.py +++ b/graphrag/index/operations/chunk_text/typing.py @@ -3,7 +3,12 @@ """A module containing 'TextChunk' model.""" +from collections.abc import Callable, Iterable from dataclasses import dataclass +from enum import Enum +from typing import Any + +from datashaper import ProgressTicker @dataclass @@ -17,3 +22,18 @@ class TextChunk: ChunkInput = str | list[str] | list[tuple[str, str]] """Input to a chunking strategy. 
Can be a string, a list of strings, or a list of tuples of (id, text).""" + +ChunkStrategy = Callable[ + [list[str], dict[str, Any], ProgressTicker], Iterable[TextChunk] +] + + +class ChunkStrategyType(str, Enum): + """ChunkStrategy class definition.""" + + tokens = "tokens" + sentence = "sentence" + + def __repr__(self): + """Get a string representation.""" + return f'"{self.value}"' diff --git a/graphrag/index/verbs/graph/clustering/cluster_graph.py b/graphrag/index/operations/cluster_graph.py similarity index 69% rename from graphrag/index/verbs/graph/clustering/cluster_graph.py rename to graphrag/index/operations/cluster_graph.py index 969d116e..731c4b5b 100644 --- a/graphrag/index/verbs/graph/clustering/cluster_graph.py +++ b/graphrag/index/operations/cluster_graph.py @@ -10,65 +10,29 @@ from typing import Any, cast import networkx as nx import pandas as pd -from datashaper import TableContainer, VerbCallbacks, VerbInput, progress_iterable, verb +from datashaper import VerbCallbacks, progress_iterable +from graspologic.partition import hierarchical_leiden +from graphrag.index.graph.utils import stable_largest_connected_component from graphrag.index.utils import gen_uuid, load_graph -from .typing import Communities +Communities = list[tuple[int, str, list[str]]] + + +class GraphCommunityStrategyType(str, Enum): + """GraphCommunityStrategyType class definition.""" + + leiden = "leiden" + + def __repr__(self): + """Get a string representation.""" + return f'"{self.value}"' + log = logging.getLogger(__name__) -@verb(name="cluster_graph") def cluster_graph( - input: VerbInput, - callbacks: VerbCallbacks, - strategy: dict[str, Any], - column: str, - to: str, - level_to: str | None = None, - **_kwargs, -) -> TableContainer: - """ - Apply a hierarchical clustering algorithm to a graph. The graph is expected to be in graphml format. The verb outputs a new column containing the clustered graph, and a new column containing the level of the graph. - - ## Usage - ```yaml - verb: cluster_graph - args: - column: entity_graph # The name of the column containing the graph, should be a graphml graph - to: clustered_graph # The name of the column to output the clustered graph to - level_to: level # The name of the column to output the level to - strategy: # See strategies section below - ``` - - ## Strategies - The cluster graph verb uses a strategy to cluster the graph. The strategy is a json object which defines the strategy to use. The following strategies are available: - - ### leiden - This strategy uses the leiden algorithm to cluster a graph. 
The strategy config is as follows: - ```yaml - strategy: - type: leiden - max_cluster_size: 10 # Optional, The max cluster size to use, default: 10 - use_lcc: true # Optional, if the largest connected component should be used with the leiden algorithm, default: true - seed: 0xDEADBEEF # Optional, the seed to use for the leiden algorithm, default: 0xDEADBEEF - levels: [0, 1] # Optional, the levels to output, default: all the levels detected - - ``` - """ - output_df = cluster_graph_df( - cast(pd.DataFrame, input.get_input()), - callbacks, - strategy, - column, - to, - level_to=level_to, - ) - return TableContainer(table=output_df) - - -def cluster_graph_df( input: pd.DataFrame, callbacks: VerbCallbacks, strategy: dict[str, Any], @@ -157,16 +121,6 @@ def apply_clustering( return graph -class GraphCommunityStrategyType(str, Enum): - """GraphCommunityStrategyType class definition.""" - - leiden = "leiden" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - def run_layout( strategy: dict[str, Any], graphml_or_graph: str | nx.Graph ) -> Communities: @@ -180,8 +134,6 @@ def run_layout( strategy_type = strategy.get("type", GraphCommunityStrategyType.leiden) match strategy_type: case GraphCommunityStrategyType.leiden: - from .strategies.leiden import run as run_leiden - clusters = run_leiden(graph, strategy) case _: msg = f"Unknown clustering strategy {strategy_type}" @@ -192,3 +144,60 @@ def run_layout( for cluster_id, nodes in clusters[level].items(): results.append((level, cluster_id, nodes)) return results + + +def run_leiden( + graph: nx.Graph, args: dict[str, Any] +) -> dict[int, dict[str, list[str]]]: + """Run method definition.""" + max_cluster_size = args.get("max_cluster_size", 10) + use_lcc = args.get("use_lcc", True) + if args.get("verbose", False): + log.info( + "Running leiden with max_cluster_size=%s, lcc=%s", max_cluster_size, use_lcc + ) + + node_id_to_community_map = _compute_leiden_communities( + graph=graph, + max_cluster_size=max_cluster_size, + use_lcc=use_lcc, + seed=args.get("seed", 0xDEADBEEF), + ) + levels = args.get("levels") + + # If they don't pass in levels, use them all + if levels is None: + levels = sorted(node_id_to_community_map.keys()) + + results_by_level: dict[int, dict[str, list[str]]] = {} + for level in levels: + result = {} + results_by_level[level] = result + for node_id, raw_community_id in node_id_to_community_map[level].items(): + community_id = str(raw_community_id) + if community_id not in result: + result[community_id] = [] + result[community_id].append(node_id) + return results_by_level + + +# Taken from graph_intelligence & adapted +def _compute_leiden_communities( + graph: nx.Graph | nx.DiGraph, + max_cluster_size: int, + use_lcc: bool, + seed=0xDEADBEEF, +) -> dict[int, dict[str, int]]: + """Return Leiden root communities.""" + if use_lcc: + graph = stable_largest_connected_component(graph) + + community_mapping = hierarchical_leiden( + graph, max_cluster_size=max_cluster_size, random_seed=seed + ) + results: dict[int, dict[str, int]] = {} + for partition in community_mapping: + results[partition.level] = results.get(partition.level, {}) + results[partition.level][partition.node] = partition.cluster + + return results diff --git a/graphrag/index/operations/compute_edge_combined_degree.py b/graphrag/index/operations/compute_edge_combined_degree.py new file mode 100644 index 00000000..e0a81be0 --- /dev/null +++ b/graphrag/index/operations/compute_edge_combined_degree.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024 
Microsoft Corporation. +# Licensed under the MIT License + +"""A module containing compute_edge_combined_degree methods definition.""" + +import pandas as pd + + +def compute_edge_combined_degree( + edge_df: pd.DataFrame, + node_degree_df: pd.DataFrame, + to: str, + node_name_column: str, + node_degree_column: str, + edge_source_column: str, + edge_target_column: str, +) -> pd.DataFrame: + """Compute the combined degree for each edge in a graph.""" + if to in edge_df.columns: + return edge_df + + def join_to_degree(df: pd.DataFrame, column: str) -> pd.DataFrame: + degree_column = _degree_colname(column) + result = df.merge( + node_degree_df.rename( + columns={node_name_column: column, node_degree_column: degree_column} + ), + on=column, + how="left", + ) + result[degree_column] = result[degree_column].fillna(0) + return result + + output_df = join_to_degree(edge_df, edge_source_column) + output_df = join_to_degree(output_df, edge_target_column) + output_df[to] = ( + output_df[_degree_colname(edge_source_column)] + + output_df[_degree_colname(edge_target_column)] + ) + return output_df + + +def _degree_colname(column: str) -> str: + return f"{column}_degree" diff --git a/graphrag/index/operations/embed_graph/__init__.py b/graphrag/index/operations/embed_graph/__init__.py index 4ca8168c..a47441b4 100644 --- a/graphrag/index/operations/embed_graph/__init__.py +++ b/graphrag/index/operations/embed_graph/__init__.py @@ -4,5 +4,6 @@ """The Indexing Engine graph embed package root.""" from .embed_graph import EmbedGraphStrategyType, embed_graph +from .typing import NodeEmbeddings -__all__ = ["EmbedGraphStrategyType", "embed_graph"] +__all__ = ["EmbedGraphStrategyType", "NodeEmbeddings", "embed_graph"] diff --git a/graphrag/index/operations/embed_graph/embed_graph.py b/graphrag/index/operations/embed_graph/embed_graph.py index f38051b1..ab125a93 100644 --- a/graphrag/index/operations/embed_graph/embed_graph.py +++ b/graphrag/index/operations/embed_graph/embed_graph.py @@ -10,6 +10,8 @@ import networkx as nx import pandas as pd from datashaper import VerbCallbacks, derive_from_rows +from graphrag.index.graph.embedding import embed_nod2vec +from graphrag.index.graph.utils import stable_largest_connected_component from graphrag.index.utils import load_graph from .typing import NodeEmbeddings @@ -85,9 +87,29 @@ def run_embeddings( graph = load_graph(graphml_or_graph) match strategy: case EmbedGraphStrategyType.node2vec: - from .strategies.node_2_vec import run as run_node_2_vec - return run_node_2_vec(graph, args) case _: msg = f"Unknown strategy {strategy}" raise ValueError(msg) + + +def run_node_2_vec(graph: nx.Graph, args: dict[str, Any]) -> NodeEmbeddings: + """Run method definition.""" + if args.get("use_lcc", True): + graph = stable_largest_connected_component(graph) + + # create graph embedding using node2vec + embeddings = embed_nod2vec( + graph=graph, + dimensions=args.get("dimensions", 1536), + num_walks=args.get("num_walks", 10), + walk_length=args.get("walk_length", 40), + window_size=args.get("window_size", 2), + iterations=args.get("iterations", 3), + random_seed=args.get("random_seed", 86), + ) + + pairs = zip(embeddings.nodes, embeddings.embeddings.tolist(), strict=True) + sorted_pairs = sorted(pairs, key=lambda x: x[0]) + + return dict(sorted_pairs) diff --git a/graphrag/index/operations/embed_graph/strategies/__init__.py b/graphrag/index/operations/embed_graph/strategies/__init__.py deleted file mode 100644 index ef85198e..00000000 --- 
a/graphrag/index/operations/embed_graph/strategies/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Text Embedding strategies.""" diff --git a/graphrag/index/operations/embed_graph/strategies/node_2_vec.py b/graphrag/index/operations/embed_graph/strategies/node_2_vec.py deleted file mode 100644 index 82abc825..00000000 --- a/graphrag/index/operations/embed_graph/strategies/node_2_vec.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing run method definition.""" - -from typing import Any - -import networkx as nx - -from graphrag.index.graph.embedding import embed_nod2vec -from graphrag.index.graph.utils import stable_largest_connected_component -from graphrag.index.operations.embed_graph.typing import NodeEmbeddings - - -def run(graph: nx.Graph, args: dict[str, Any]) -> NodeEmbeddings: - """Run method definition.""" - if args.get("use_lcc", True): - graph = stable_largest_connected_component(graph) - - # create graph embedding using node2vec - embeddings = embed_nod2vec( - graph=graph, - dimensions=args.get("dimensions", 1536), - num_walks=args.get("num_walks", 10), - walk_length=args.get("walk_length", 40), - window_size=args.get("window_size", 2), - iterations=args.get("iterations", 3), - random_seed=args.get("random_seed", 86), - ) - - pairs = zip(embeddings.nodes, embeddings.embeddings.tolist(), strict=True) - sorted_pairs = sorted(pairs, key=lambda x: x[0]) - - return dict(sorted_pairs) diff --git a/graphrag/index/verbs/covariates/extract_covariates/__init__.py b/graphrag/index/operations/extract_covariates/__init__.py similarity index 100% rename from graphrag/index/verbs/covariates/extract_covariates/__init__.py rename to graphrag/index/operations/extract_covariates/__init__.py diff --git a/graphrag/index/verbs/covariates/extract_covariates/extract_covariates.py b/graphrag/index/operations/extract_covariates/extract_covariates.py similarity index 64% rename from graphrag/index/verbs/covariates/extract_covariates/extract_covariates.py rename to graphrag/index/operations/extract_covariates/extract_covariates.py index 92785efe..1ee5f51c 100644 --- a/graphrag/index/verbs/covariates/extract_covariates/extract_covariates.py +++ b/graphrag/index/operations/extract_covariates/extract_covariates.py @@ -5,70 +5,29 @@ import logging from dataclasses import asdict -from enum import Enum -from typing import Any, cast +from typing import Any import pandas as pd from datashaper import ( AsyncType, - TableContainer, VerbCallbacks, - VerbInput, derive_from_rows, - verb, ) from graphrag.index.cache import PipelineCache -from graphrag.index.verbs.covariates.typing import Covariate, CovariateExtractStrategy + +from .typing import Covariate, CovariateExtractStrategy, ExtractClaimsStrategyType log = logging.getLogger(__name__) -class ExtractClaimsStrategyType(str, Enum): - """ExtractClaimsStrategyType class definition.""" - - graph_intelligence = "graph_intelligence" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - DEFAULT_ENTITY_TYPES = ["organization", "person", "geo", "event"] -@verb(name="extract_covariates") async def extract_covariates( - input: VerbInput, - cache: PipelineCache, - callbacks: VerbCallbacks, - column: str, - covariate_type: str, - strategy: dict[str, Any] | None, - async_mode: AsyncType = AsyncType.AsyncIO, - entity_types: list[str] | None = None, - **kwargs, -) -> TableContainer: 
- """Extract claims from a piece of text.""" - source = cast(pd.DataFrame, input.get_input()) - output = await extract_covariates_df( - source, - cache, - callbacks, - column, - covariate_type, - strategy, - async_mode, - entity_types, - **kwargs, - ) - return TableContainer(table=output) - - -async def extract_covariates_df( input: pd.DataFrame, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, column: str, covariate_type: str, strategy: dict[str, Any] | None, @@ -113,9 +72,9 @@ def load_strategy(strategy_type: ExtractClaimsStrategyType) -> CovariateExtractS """Load strategy method definition.""" match strategy_type: case ExtractClaimsStrategyType.graph_intelligence: - from .strategies.graph_intelligence import run as run_gi + from .strategies import run_graph_intelligence - return run_gi + return run_graph_intelligence case _: msg = f"Unknown strategy: {strategy_type}" raise ValueError(msg) diff --git a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/run_gi_extract_claims.py b/graphrag/index/operations/extract_covariates/strategies.py similarity index 80% rename from graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/run_gi_extract_claims.py rename to graphrag/index/operations/extract_covariates/strategies.py index b9315b2d..2ef83e51 100644 --- a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/run_gi_extract_claims.py +++ b/graphrag/index/operations/extract_covariates/strategies.py @@ -9,35 +9,31 @@ from typing import Any from datashaper import VerbCallbacks import graphrag.config.defaults as defs -from graphrag.config.enums import LLMType from graphrag.index.cache import PipelineCache from graphrag.index.graph.extractors.claims import ClaimExtractor from graphrag.index.llm import load_llm -from graphrag.index.verbs.covariates.typing import ( +from graphrag.llm import CompletionLLM + +from .typing import ( Covariate, CovariateExtractionResult, ) -from graphrag.llm import CompletionLLM - -from .defaults import MOCK_LLM_RESPONSES -async def run( +async def run_graph_intelligence( input: str | Iterable[str], entity_types: list[str], resolved_entities_map: dict[str, str], - reporter: VerbCallbacks, - pipeline_cache: PipelineCache, + callbacks: VerbCallbacks, + cache: PipelineCache, strategy_config: dict[str, Any], ) -> CovariateExtractionResult: """Run the Claim extraction chain.""" - llm_config = strategy_config.get( - "llm", {"type": LLMType.StaticResponse, "responses": MOCK_LLM_RESPONSES} - ) - llm_type = llm_config.get("type", LLMType.StaticResponse) - llm = load_llm("claim_extraction", llm_type, reporter, pipeline_cache, llm_config) + llm_config = strategy_config.get("llm", {}) + llm_type = llm_config.get("type") + llm = load_llm("claim_extraction", llm_type, callbacks, cache, llm_config) return await _execute( - llm, input, entity_types, resolved_entities_map, reporter, strategy_config + llm, input, entity_types, resolved_entities_map, callbacks, strategy_config ) @@ -46,7 +42,7 @@ async def _execute( texts: Iterable[str], entity_types: list[str], resolved_entities_map: dict[str, str], - reporter: VerbCallbacks, + callbacks: VerbCallbacks, strategy_config: dict[str, Any], ) -> CovariateExtractionResult: extraction_prompt = strategy_config.get("extraction_prompt") @@ -62,7 +58,7 @@ async def _execute( max_gleanings=max_gleanings, encoding_model=encoding_model, on_error=lambda e, s, d: ( - reporter.error("Claim Extraction Error", e, s, d) if reporter else None + 
callbacks.error("Claim Extraction Error", e, s, d) if callbacks else None ), ) diff --git a/graphrag/index/verbs/covariates/typing.py b/graphrag/index/operations/extract_covariates/typing.py similarity index 81% rename from graphrag/index/verbs/covariates/typing.py rename to graphrag/index/operations/extract_covariates/typing.py index 0e0c5fb7..c0cb9663 100644 --- a/graphrag/index/verbs/covariates/typing.py +++ b/graphrag/index/operations/extract_covariates/typing.py @@ -5,6 +5,7 @@ from collections.abc import Awaitable, Callable, Iterable from dataclasses import dataclass +from enum import Enum from typing import Any from datashaper import VerbCallbacks @@ -48,3 +49,13 @@ CovariateExtractStrategy = Callable[ ], Awaitable[CovariateExtractionResult], ] + + +class ExtractClaimsStrategyType(str, Enum): + """ExtractClaimsStrategyType class definition.""" + + graph_intelligence = "graph_intelligence" + + def __repr__(self): + """Get a string representation.""" + return f'"{self.value}"' diff --git a/graphrag/index/verbs/entities/extraction/__init__.py b/graphrag/index/operations/extract_entities/__init__.py similarity index 51% rename from graphrag/index/verbs/entities/extraction/__init__.py rename to graphrag/index/operations/extract_entities/__init__.py index 46e6d545..579b57df 100644 --- a/graphrag/index/verbs/entities/extraction/__init__.py +++ b/graphrag/index/operations/extract_entities/__init__.py @@ -3,6 +3,6 @@ """The Indexing Engine entities extraction package root.""" -from .entity_extract import ExtractEntityStrategyType, entity_extract +from .extract_entities import ExtractEntityStrategyType, extract_entities -__all__ = ["ExtractEntityStrategyType", "entity_extract"] +__all__ = ["ExtractEntityStrategyType", "extract_entities"] diff --git a/graphrag/index/verbs/entities/extraction/entity_extract.py b/graphrag/index/operations/extract_entities/extract_entities.py similarity index 74% rename from graphrag/index/verbs/entities/extraction/entity_extract.py rename to graphrag/index/operations/extract_entities/extract_entities.py index e5c8eff2..77f29dd6 100644 --- a/graphrag/index/verbs/entities/extraction/entity_extract.py +++ b/graphrag/index/operations/extract_entities/extract_entities.py @@ -5,16 +5,13 @@ import logging from enum import Enum -from typing import Any, cast +from typing import Any import pandas as pd from datashaper import ( AsyncType, - TableContainer, VerbCallbacks, - VerbInput, derive_from_rows, - verb, ) from graphrag.index.bootstrap import bootstrap @@ -40,43 +37,10 @@ class ExtractEntityStrategyType(str, Enum): DEFAULT_ENTITY_TYPES = ["organization", "person", "geo", "event"] -@verb(name="entity_extract") -async def entity_extract( - input: VerbInput, - cache: PipelineCache, - callbacks: VerbCallbacks, - column: str, - id_column: str, - to: str, - strategy: dict[str, Any] | None, - graph_to: str | None = None, - async_mode: AsyncType = AsyncType.AsyncIO, - entity_types=DEFAULT_ENTITY_TYPES, - **kwargs, -) -> TableContainer: - """Extract entities from a piece of text.""" - source = cast(pd.DataFrame, input.get_input()) - output = await entity_extract_df( - source, - cache, - callbacks, - column, - id_column, - to, - strategy, - graph_to, - async_mode, - entity_types, - **kwargs, - ) - - return TableContainer(table=output) - - -async def entity_extract_df( +async def extract_entities( input: pd.DataFrame, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, column: str, id_column: str, to: str, @@ -90,24 +54,7 @@ async def entity_extract_df( 
Extract entities from a piece of text. ## Usage - ### json - ```json - { - "verb": "entity_extract", - "args": { - "column": "the_document_text_column_to_extract_entities_from", /* In general this will be your document text column */ - "id_column": "the_column_with_the_unique_id_for_each_row", /* In general this will be your document id */ - "to": "the_column_to_output_the_entities_to", /* This will be a list[dict[str, Any]] a list of entities, with a name, and additional attributes */ - "graph_to": "the_column_to_output_the_graphml_to", /* Optional: This will be a graphml graph in string form which represents the entities and their relationships */ - "strategy": {...} , see strategies section below - "entity_types": ["list", "of", "entity", "types", "to", "extract"] /* Optional: This will limit the entity types extracted, default: ["organization", "person", "geo", "event"] */ - "summarize_descriptions" : true | false /* Optional: This will summarize the descriptions of the entities and relationships, default: true */ - } - } - ``` - ### yaml ```yaml - verb: entity_extract args: column: the_document_text_column_to_extract_entities_from id_column: the_column_with_the_unique_id_for_each_row @@ -218,9 +165,9 @@ def _load_strategy(strategy_type: ExtractEntityStrategyType) -> EntityExtractStr """Load strategy method definition.""" match strategy_type: case ExtractEntityStrategyType.graph_intelligence: - from .strategies.graph_intelligence import run_gi + from .strategies.graph_intelligence import run_graph_intelligence - return run_gi + return run_graph_intelligence case ExtractEntityStrategyType.nltk: bootstrap() diff --git a/graphrag/index/verbs/entities/extraction/strategies/__init__.py b/graphrag/index/operations/extract_entities/strategies/__init__.py similarity index 100% rename from graphrag/index/verbs/entities/extraction/strategies/__init__.py rename to graphrag/index/operations/extract_entities/strategies/__init__.py diff --git a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/run_graph_intelligence.py b/graphrag/index/operations/extract_entities/strategies/graph_intelligence.py similarity index 82% rename from graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/run_graph_intelligence.py rename to graphrag/index/operations/extract_entities/strategies/graph_intelligence.py index 06284879..1536df34 100644 --- a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/run_graph_intelligence.py +++ b/graphrag/index/operations/extract_entities/strategies/graph_intelligence.py @@ -1,51 +1,49 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -"""A module containing run_gi, run_extract_entities and _create_text_splitter methods to run graph intelligence.""" +"""A module containing run_graph_intelligence, run_extract_entities and _create_text_splitter methods to run graph intelligence.""" import networkx as nx from datashaper import VerbCallbacks import graphrag.config.defaults as defs -from graphrag.config.enums import LLMType from graphrag.index.cache import PipelineCache -from graphrag.index.graph.extractors.graph import GraphExtractor +from graphrag.index.graph.extractors import GraphExtractor from graphrag.index.llm import load_llm from graphrag.index.text_splitting import ( NoopTextSplitter, TextSplitter, TokenTextSplitter, ) -from graphrag.index.verbs.entities.extraction.strategies.typing import ( +from graphrag.llm import CompletionLLM + +from .typing import ( Document, EntityExtractionResult, EntityTypes, StrategyConfig, ) -from graphrag.llm import CompletionLLM - -from .defaults import DEFAULT_LLM_CONFIG -async def run_gi( +async def run_graph_intelligence( docs: list[Document], entity_types: EntityTypes, - reporter: VerbCallbacks, - pipeline_cache: PipelineCache, + callbacks: VerbCallbacks, + cache: PipelineCache, args: StrategyConfig, ) -> EntityExtractionResult: """Run the graph intelligence entity extraction strategy.""" - llm_config = args.get("llm", DEFAULT_LLM_CONFIG) - llm_type = llm_config.get("type", LLMType.StaticResponse) - llm = load_llm("entity_extraction", llm_type, reporter, pipeline_cache, llm_config) - return await run_extract_entities(llm, docs, entity_types, reporter, args) + llm_config = args.get("llm", {}) + llm_type = llm_config.get("type") + llm = load_llm("entity_extraction", llm_type, callbacks, cache, llm_config) + return await run_extract_entities(llm, docs, entity_types, callbacks, args) async def run_extract_entities( llm: CompletionLLM, docs: list[Document], entity_types: EntityTypes, - reporter: VerbCallbacks | None, + callbacks: VerbCallbacks | None, args: StrategyConfig, ) -> EntityExtractionResult: """Run the entity extraction chain.""" @@ -76,7 +74,7 @@ async def run_extract_entities( encoding_model=encoding_model, max_gleanings=max_gleanings, on_error=lambda e, s, d: ( - reporter.error("Entity Extraction Error", e, s, d) if reporter else None + callbacks.error("Entity Extraction Error", e, s, d) if callbacks else None ), ) text_list = [doc.text.strip() for doc in docs] diff --git a/graphrag/index/verbs/entities/extraction/strategies/nltk.py b/graphrag/index/operations/extract_entities/strategies/nltk.py similarity index 95% rename from graphrag/index/verbs/entities/extraction/strategies/nltk.py rename to graphrag/index/operations/extract_entities/strategies/nltk.py index 48d4dae4..9403c5a5 100644 --- a/graphrag/index/verbs/entities/extraction/strategies/nltk.py +++ b/graphrag/index/operations/extract_entities/strategies/nltk.py @@ -19,8 +19,8 @@ words.ensure_loaded() async def run( # noqa RUF029 async is required for interface docs: list[Document], entity_types: EntityTypes, - reporter: VerbCallbacks, # noqa ARG001 - pipeline_cache: PipelineCache, # noqa ARG001 + callbacks: VerbCallbacks, # noqa ARG001 + cache: PipelineCache, # noqa ARG001 args: StrategyConfig, # noqa ARG001 ) -> EntityExtractionResult: """Run method definition.""" diff --git a/graphrag/index/verbs/entities/extraction/strategies/typing.py b/graphrag/index/operations/extract_entities/strategies/typing.py similarity index 100% rename from 
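The rewritten strategy above drops the mock-response fallback (`DEFAULT_LLM_CONFIG` / `LLMType.StaticResponse`). A minimal sketch of the resulting behavior, using nothing beyond the two `get` calls from the patch:

```python
# Before the patch, args.get("llm", DEFAULT_LLM_CONFIG) silently fell back to a
# StaticResponse mock. After it, an empty strategy config yields an empty dict
# and a None llm_type, so the LLM must be configured explicitly.
args: dict = {}

llm_config = args.get("llm", {})   # -> {}
llm_type = llm_config.get("type")  # -> None

assert llm_config == {}
assert llm_type is None
```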
graphrag/index/verbs/entities/extraction/strategies/typing.py rename to graphrag/index/operations/extract_entities/strategies/typing.py diff --git a/graphrag/index/verbs/graph/layout/__init__.py b/graphrag/index/operations/layout_graph/__init__.py similarity index 100% rename from graphrag/index/verbs/graph/layout/__init__.py rename to graphrag/index/operations/layout_graph/__init__.py diff --git a/graphrag/index/verbs/graph/layout/layout_graph.py b/graphrag/index/operations/layout_graph/layout_graph.py similarity index 82% rename from graphrag/index/verbs/graph/layout/layout_graph.py rename to graphrag/index/operations/layout_graph/layout_graph.py index 9721fdfa..d2b23266 100644 --- a/graphrag/index/verbs/graph/layout/layout_graph.py +++ b/graphrag/index/operations/layout_graph/layout_graph.py @@ -8,10 +8,10 @@ from typing import Any, cast import networkx as nx import pandas as pd -from datashaper import TableContainer, VerbCallbacks, VerbInput, progress_callback, verb +from datashaper import VerbCallbacks, progress_callback from graphrag.index.graph.visualization import GraphLayout -from graphrag.index.operations.embed_graph.typing import NodeEmbeddings +from graphrag.index.operations.embed_graph import NodeEmbeddings from graphrag.index.utils import load_graph @@ -26,23 +26,20 @@ class LayoutGraphStrategyType(str, Enum): return f'"{self.value}"' -@verb(name="layout_graph") def layout_graph( - input: VerbInput, + input_df: pd.DataFrame, callbacks: VerbCallbacks, strategy: dict[str, Any], embeddings_column: str, graph_column: str, to: str, graph_to: str | None = None, - **_kwargs: dict, -) -> TableContainer: +): """ Apply a layout algorithm to a graph. The graph is expected to be in graphml format. The verb outputs a new column containing the laid out graph. 
## Usage ```yaml - verb: layout_graph args: graph_column: clustered_graph # The name of the column containing the graph, should be a graphml graph embeddings_column: embeddings # The name of the column containing the embeddings @@ -63,24 +60,6 @@ def layout_graph( min_dist: 0.75 # Optional, The min distance to use for the umap algorithm, default: 0.75 ``` """ - input_df = cast(pd.DataFrame, input.get_input()) - output_df = layout_graph_df( - input_df, callbacks, strategy, embeddings_column, graph_column, to, graph_to - ) - - return TableContainer(table=output_df) - - -def layout_graph_df( - input_df: pd.DataFrame, - callbacks: VerbCallbacks, - strategy: dict[str, Any], - embeddings_column: str, - graph_column: str, - to: str, - graph_to: str | None = None, -): - """Apply a layout algorithm to a graph.""" output_df = input_df num_items = len(output_df) strategy_type = strategy.get("type", LayoutGraphStrategyType.umap) @@ -118,7 +97,7 @@ def _run_layout( graphml_or_graph: str | nx.Graph, embeddings: NodeEmbeddings, args: dict[str, Any], - reporter: VerbCallbacks, + callbacks: VerbCallbacks, ) -> GraphLayout: graph = load_graph(graphml_or_graph) match strategy: @@ -129,7 +108,7 @@ def _run_layout( graph, embeddings, args, - lambda e, stack, d: reporter.error("Error in Umap", e, stack, d), + lambda e, stack, d: callbacks.error("Error in Umap", e, stack, d), ) case LayoutGraphStrategyType.zero: from .methods.zero import run as run_zero @@ -137,7 +116,7 @@ def _run_layout( return run_zero( graph, args, - lambda e, stack, d: reporter.error("Error in Zero", e, stack, d), + lambda e, stack, d: callbacks.error("Error in Zero", e, stack, d), ) case _: msg = f"Unknown strategy {strategy}" diff --git a/graphrag/index/verbs/graph/layout/methods/__init__.py b/graphrag/index/operations/layout_graph/methods/__init__.py similarity index 100% rename from graphrag/index/verbs/graph/layout/methods/__init__.py rename to graphrag/index/operations/layout_graph/methods/__init__.py diff --git a/graphrag/index/verbs/graph/layout/methods/umap.py b/graphrag/index/operations/layout_graph/methods/umap.py similarity index 97% rename from graphrag/index/verbs/graph/layout/methods/umap.py rename to graphrag/index/operations/layout_graph/methods/umap.py index d0e00b3c..636fd9a6 100644 --- a/graphrag/index/verbs/graph/layout/methods/umap.py +++ b/graphrag/index/operations/layout_graph/methods/umap.py @@ -15,7 +15,7 @@ from graphrag.index.graph.visualization import ( NodePosition, compute_umap_positions, ) -from graphrag.index.operations.embed_graph.typing import NodeEmbeddings +from graphrag.index.operations.embed_graph import NodeEmbeddings from graphrag.index.typing import ErrorHandlerFn # TODO: This could be handled more elegantly, like what columns to use diff --git a/graphrag/index/verbs/graph/layout/methods/zero.py b/graphrag/index/operations/layout_graph/methods/zero.py similarity index 100% rename from graphrag/index/verbs/graph/layout/methods/zero.py rename to graphrag/index/operations/layout_graph/methods/zero.py diff --git a/graphrag/index/verbs/graph/merge/__init__.py b/graphrag/index/operations/merge_graphs/__init__.py similarity index 60% rename from graphrag/index/verbs/graph/merge/__init__.py rename to graphrag/index/operations/merge_graphs/__init__.py index f7188279..f3b957dd 100644 --- a/graphrag/index/verbs/graph/merge/__init__.py +++ b/graphrag/index/operations/merge_graphs/__init__.py @@ -1,8 +1,10 @@ # Copyright (c) 2024 Microsoft Corporation. 
# Licensed under the MIT License -"""The Indexing Engine graph merge package root.""" +"""merge_graphs operation.""" from .merge_graphs import merge_graphs -__all__ = ["merge_graphs"] +__all__ = [ + "merge_graphs", +] diff --git a/graphrag/index/verbs/graph/merge/merge_graphs.py b/graphrag/index/operations/merge_graphs/merge_graphs.py similarity index 90% rename from graphrag/index/verbs/graph/merge/merge_graphs.py rename to graphrag/index/operations/merge_graphs/merge_graphs.py index d5a4c9f5..ca654e6e 100644 --- a/graphrag/index/verbs/graph/merge/merge_graphs.py +++ b/graphrag/index/operations/merge_graphs/merge_graphs.py @@ -7,15 +7,10 @@ from typing import Any, cast import networkx as nx import pandas as pd -from datashaper import TableContainer, VerbCallbacks, VerbInput, progress_iterable, verb +from datashaper import VerbCallbacks, progress_iterable from graphrag.index.utils import load_graph -from .defaults import ( - DEFAULT_CONCAT_SEPARATOR, - DEFAULT_EDGE_OPERATIONS, - DEFAULT_NODE_OPERATIONS, -) from .typing import ( BasicMergeOperation, DetailedAttributeMergeOperation, @@ -23,25 +18,23 @@ from .typing import ( StringOperation, ) +DEFAULT_NODE_OPERATIONS = { + "*": { + "operation": BasicMergeOperation.Replace, + } +} + +DEFAULT_EDGE_OPERATIONS = { + "*": { + "operation": BasicMergeOperation.Replace, + }, + "weight": "sum", +} + +DEFAULT_CONCAT_SEPARATOR = "," + -@verb(name="merge_graphs") def merge_graphs( - input: VerbInput, - callbacks: VerbCallbacks, - column: str, - to: str, - nodes: dict[str, Any] = DEFAULT_NODE_OPERATIONS, - edges: dict[str, Any] = DEFAULT_EDGE_OPERATIONS, - **_kwargs, -) -> TableContainer: - """Merge multiple graphs together. The graphs are expected to be in graphml format. The verb outputs a new column containing the merged graph.""" - input_df = cast(pd.DataFrame, input.get_input()) - output = merge_graphs_df(input_df, callbacks, column, to, nodes, edges) - - return TableContainer(table=output) - - -def merge_graphs_df( input: pd.DataFrame, callbacks: VerbCallbacks, column: str, diff --git a/graphrag/index/verbs/graph/merge/typing.py b/graphrag/index/operations/merge_graphs/typing.py similarity index 100% rename from graphrag/index/verbs/graph/merge/typing.py rename to graphrag/index/operations/merge_graphs/typing.py diff --git a/graphrag/index/verbs/snapshot.py b/graphrag/index/operations/snapshot.py similarity index 58% rename from graphrag/index/verbs/snapshot.py rename to graphrag/index/operations/snapshot.py index 032e1951..2b85c3ea 100644 --- a/graphrag/index/verbs/snapshot.py +++ b/graphrag/index/operations/snapshot.py @@ -3,31 +3,12 @@ """A module containing snapshot method definition.""" -from typing import cast - import pandas as pd -from datashaper import TableContainer, VerbInput, verb from graphrag.index.storage import PipelineStorage -@verb(name="snapshot") async def snapshot( - input: VerbInput, - name: str, - formats: list[str], - storage: PipelineStorage, - **_kwargs: dict, -) -> TableContainer: - """Take a entire snapshot of the tabular data.""" - source = cast(pd.DataFrame, input.get_input()) - - await snapshot_df(source, name, formats, storage) - - return TableContainer(table=source) - - -async def snapshot_df( input: pd.DataFrame, name: str, formats: list[str], diff --git a/graphrag/index/verbs/snapshot_rows.py b/graphrag/index/operations/snapshot_rows.py similarity index 79% rename from graphrag/index/verbs/snapshot_rows.py rename to graphrag/index/operations/snapshot_rows.py index e4b567aa..5abd771b 100644 --- 
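With `defaults.py` folded into `merge_graphs.py`, the inlined tables above are the whole default merge policy: every node/edge attribute is replaced by the incoming value, except edge `weight`, which is summed. A rough illustration of how such a table could be interpreted; `merge_edge_attrs` is a hypothetical helper, not the patch's implementation:

```python
DEFAULT_EDGE_OPERATIONS = {
    "*": {"operation": "replace"},
    "weight": "sum",  # string shorthand for a per-attribute operation
}


def merge_edge_attrs(existing: dict, incoming: dict) -> dict:
    """Hypothetical sketch: apply the per-attribute operation table."""
    merged = dict(existing)
    for key, value in incoming.items():
        op = DEFAULT_EDGE_OPERATIONS.get(key, DEFAULT_EDGE_OPERATIONS["*"])
        op = op if isinstance(op, str) else op["operation"]
        if op == "sum":
            merged[key] = merged.get(key, 0) + value
        else:  # "replace": last writer wins
            merged[key] = value
    return merged


# weights accumulate across subgraphs; other attributes take the latest value
assert merge_edge_attrs({"weight": 1.0, "level": 0}, {"weight": 2.0, "level": 1}) == {
    "weight": 3.0,
    "level": 1,
}
```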
a/graphrag/index/verbs/snapshot_rows.py +++ b/graphrag/index/operations/snapshot_rows.py @@ -5,10 +5,9 @@ import json from dataclasses import dataclass -from typing import Any, cast +from typing import Any import pandas as pd -from datashaper import TableContainer, VerbInput, verb from graphrag.index.storage import PipelineStorage @@ -21,30 +20,7 @@ class FormatSpecifier: extension: str -@verb(name="snapshot_rows") async def snapshot_rows( - input: VerbInput, - column: str | None, - base_name: str, - storage: PipelineStorage, - formats: list[str | dict[str, Any]], - row_name_column: str | None = None, - **_kwargs: dict, -) -> TableContainer: - """Take a by-row snapshot of the tabular data.""" - source = cast(pd.DataFrame, input.get_input()) - await snapshot_rows_df( - source, - column, - base_name, - storage, - formats, - row_name_column, - ) - return TableContainer(table=source) - - -async def snapshot_rows_df( input: pd.DataFrame, column: str | None, base_name: str, diff --git a/graphrag/index/operations/split_text.py b/graphrag/index/operations/split_text.py new file mode 100644 index 00000000..7a9a9076 --- /dev/null +++ b/graphrag/index/operations/split_text.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""A module containing the split_text method definition.""" + +import pandas as pd + + +def split_text( + input: pd.DataFrame, column: str, to: str, separator: str = "," +) -> pd.DataFrame: + """Split a column into a list of strings.""" + output = input + + def _apply_split(row): + if row[column] is None or isinstance(row[column], list): + return row[column] + if row[column] == "": + return [] + if not isinstance(row[column], str): + message = f"Expected {column} to be a string, but got {type(row[column])}" + raise TypeError(message) + return row[column].split(separator) + + output[to] = output.apply(_apply_split, axis=1) + return output diff --git a/graphrag/index/operations/summarize_communities/__init__.py b/graphrag/index/operations/summarize_communities/__init__.py new file mode 100644 index 00000000..d3065198 --- /dev/null +++ b/graphrag/index/operations/summarize_communities/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024 Microsoft Corporation. 
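The new `split_text` operation above is a plain DataFrame-in, DataFrame-out helper. A small usage sketch (the column names are illustrative only):

```python
import pandas as pd

from graphrag.index.operations.split_text import split_text

df = pd.DataFrame({"tags": ["a,b,c", "", None]})
out = split_text(df, column="tags", to="tag_list")

# strings are split on the separator, "" becomes [], None and lists pass
# through unchanged; any other type raises TypeError
assert out["tag_list"].tolist() == [["a", "b", "c"], [], None]
```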
+# Licensed under the MIT License + +"""Community summarization modules.""" + +from .prepare_community_reports import prepare_community_reports +from .restore_community_hierarchy import restore_community_hierarchy +from .summarize_communities import summarize_communities +from .typing import CreateCommunityReportsStrategyType + +__all__ = [ + "CreateCommunityReportsStrategyType", + "prepare_community_reports", + "restore_community_hierarchy", + "summarize_communities", +] diff --git a/graphrag/index/verbs/graph/report/prepare_community_reports.py b/graphrag/index/operations/summarize_communities/prepare_community_reports.py similarity index 87% rename from graphrag/index/verbs/graph/report/prepare_community_reports.py rename to graphrag/index/operations/summarize_communities/prepare_community_reports.py index a6a3a24f..1a54f7d9 100644 --- a/graphrag/index/verbs/graph/report/prepare_community_reports.py +++ b/graphrag/index/operations/summarize_communities/prepare_community_reports.py @@ -8,11 +8,8 @@ from typing import cast import pandas as pd from datashaper import ( - TableContainer, VerbCallbacks, - VerbInput, progress_iterable, - verb, ) import graphrag.index.graph.extractors.community_reports.schemas as schemas @@ -25,32 +22,11 @@ from graphrag.index.graph.extractors.community_reports import ( set_context_size, sort_context, ) -from graphrag.index.utils.ds_util import get_named_input_table, get_required_input_table log = logging.getLogger(__name__) -@verb(name="prepare_community_reports") def prepare_community_reports( - input: VerbInput, - callbacks: VerbCallbacks, - max_tokens: int = 16_000, - **_kwargs, -) -> TableContainer: - """Prep communities for report generation.""" - # Prepare Community Reports - nodes = cast(pd.DataFrame, get_required_input_table(input, "nodes").table) - edges = cast(pd.DataFrame, get_required_input_table(input, "edges").table) - claims = get_named_input_table(input, "claims") - if claims: - claims = cast(pd.DataFrame, claims.table) - - output = prepare_community_reports_df(nodes, edges, claims, callbacks, max_tokens) - - return TableContainer(table=output) - - -def prepare_community_reports_df( nodes, edges, claims, diff --git a/graphrag/index/verbs/graph/report/restore_community_hierarchy.py b/graphrag/index/operations/summarize_communities/restore_community_hierarchy.py similarity index 78% rename from graphrag/index/verbs/graph/report/restore_community_hierarchy.py rename to graphrag/index/operations/summarize_communities/restore_community_hierarchy.py index 2bf5fd00..368e4b05 100644 --- a/graphrag/index/verbs/graph/report/restore_community_hierarchy.py +++ b/graphrag/index/operations/summarize_communities/restore_community_hierarchy.py @@ -4,37 +4,15 @@ """A module containing create_graph, _get_node_attributes, _get_edge_attributes and _get_attribute_column_mapping methods definition.""" import logging -from typing import cast import pandas as pd -from datashaper import TableContainer, VerbInput, verb import graphrag.index.graph.extractors.community_reports.schemas as schemas log = logging.getLogger(__name__) -@verb(name="restore_community_hierarchy") def restore_community_hierarchy( - input: VerbInput, - name_column: str = schemas.NODE_NAME, - community_column: str = schemas.NODE_COMMUNITY, - level_column: str = schemas.NODE_LEVEL, - **_kwargs, -) -> TableContainer: - """Restore the community hierarchy from the node data.""" - node_df: pd.DataFrame = cast(pd.DataFrame, input.get_input()) - - output = restore_community_hierarchy_df( - node_df, - 
name_column=name_column, - community_column=community_column, - level_column=level_column, - ) - return TableContainer(table=output) - - -def restore_community_hierarchy_df( input: pd.DataFrame, name_column: str = schemas.NODE_NAME, community_column: str = schemas.NODE_COMMUNITY, diff --git a/graphrag/index/verbs/graph/report/strategies/graph_intelligence/run_graph_intelligence.py b/graphrag/index/operations/summarize_communities/strategies.py similarity index 78% rename from graphrag/index/verbs/graph/report/strategies/graph_intelligence/run_graph_intelligence.py rename to graphrag/index/operations/summarize_communities/strategies.py index d9a5235b..2653e41f 100644 --- a/graphrag/index/verbs/graph/report/strategies/graph_intelligence/run_graph_intelligence.py +++ b/graphrag/index/operations/summarize_communities/strategies.py @@ -9,41 +9,37 @@ import traceback from datashaper import VerbCallbacks -from graphrag.config.enums import LLMType from graphrag.index.cache import PipelineCache from graphrag.index.graph.extractors.community_reports import ( CommunityReportsExtractor, ) from graphrag.index.llm import load_llm from graphrag.index.utils.rate_limiter import RateLimiter -from graphrag.index.verbs.graph.report.strategies.typing import ( +from graphrag.llm import CompletionLLM + +from .typing import ( CommunityReport, StrategyConfig, ) -from graphrag.llm import CompletionLLM -from .defaults import MOCK_RESPONSES +DEFAULT_CHUNK_SIZE = 3000 log = logging.getLogger(__name__) -async def run( +async def run_graph_intelligence( community: str | int, input: str, level: int, - reporter: VerbCallbacks, - pipeline_cache: PipelineCache, + callbacks: VerbCallbacks, + cache: PipelineCache, args: StrategyConfig, ) -> CommunityReport | None: """Run the graph intelligence entity extraction strategy.""" - llm_config = args.get( - "llm", {"type": LLMType.StaticResponse, "responses": MOCK_RESPONSES} - ) - llm_type = llm_config.get("type", LLMType.StaticResponse) - llm = load_llm( - "community_reporting", llm_type, reporter, pipeline_cache, llm_config - ) - return await _run_extractor(llm, community, input, level, args, reporter) + llm_config = args.get("llm", {}) + llm_type = llm_config.get("type") + llm = load_llm("community_reporting", llm_type, callbacks, cache, llm_config) + return await _run_extractor(llm, community, input, level, args, callbacks) async def _run_extractor( @@ -52,7 +48,7 @@ async def _run_extractor( input: str, level: int, args: StrategyConfig, - reporter: VerbCallbacks, + callbacks: VerbCallbacks, ) -> CommunityReport | None: # RateLimiter rate_limiter = RateLimiter(rate=1, per=60) @@ -60,7 +56,7 @@ async def _run_extractor( llm, extraction_prompt=args.get("extraction_prompt", None), max_report_length=args.get("max_report_length", None), - on_error=lambda e, stack, _data: reporter.error( + on_error=lambda e, stack, _data: callbacks.error( "Community Report Extraction Error", e, stack ), ) @@ -86,7 +82,7 @@ async def _run_extractor( ) except Exception as e: log.exception("Error processing community: %s", community) - reporter.error("Community Report Extraction Error", e, traceback.format_exc()) + callbacks.error("Community Report Extraction Error", e, traceback.format_exc()) return None diff --git a/graphrag/index/verbs/graph/report/create_community_reports.py b/graphrag/index/operations/summarize_communities/summarize_communities.py similarity index 67% rename from graphrag/index/verbs/graph/report/create_community_reports.py rename to 
graphrag/index/operations/summarize_communities/summarize_communities.py index 1764362c..a704fcc1 100644 --- a/graphrag/index/verbs/graph/report/create_community_reports.py +++ b/graphrag/index/operations/summarize_communities/summarize_communities.py @@ -4,19 +4,14 @@ """A module containing create_community_reports and load_strategy methods definition.""" import logging -from enum import Enum -from typing import cast import pandas as pd from datashaper import ( AsyncType, NoopVerbCallbacks, - TableContainer, VerbCallbacks, - VerbInput, derive_from_rows, progress_ticker, - verb, ) import graphrag.config.defaults as defaults @@ -26,54 +21,17 @@ from graphrag.index.graph.extractors.community_reports import ( get_levels, prep_community_report_context, ) -from graphrag.index.utils.ds_util import get_required_input_table -from .strategies.typing import CommunityReport, CommunityReportsStrategy +from .typing import ( + CommunityReport, + CommunityReportsStrategy, + CreateCommunityReportsStrategyType, +) log = logging.getLogger(__name__) -class CreateCommunityReportsStrategyType(str, Enum): - """CreateCommunityReportsStrategyType class definition.""" - - graph_intelligence = "graph_intelligence" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -@verb(name="create_community_reports") -async def create_community_reports( - input: VerbInput, - callbacks: VerbCallbacks, - cache: PipelineCache, - strategy: dict, - async_mode: AsyncType = AsyncType.AsyncIO, - num_threads: int = 4, - **_kwargs, -) -> TableContainer: - """Generate community summaries.""" - log.debug("create_community_reports strategy=%s", strategy) - local_contexts = cast(pd.DataFrame, input.get_input()) - nodes = get_required_input_table(input, "nodes").table - community_hierarchy = get_required_input_table(input, "community_hierarchy").table - - output = await create_community_reports_df( - local_contexts, - nodes, - community_hierarchy, - callbacks, - cache, - strategy, - async_mode=async_mode, - num_threads=num_threads, - ) - - return TableContainer(table=output) - - -async def create_community_reports_df( +async def summarize_communities( local_contexts, nodes, community_hierarchy, @@ -106,8 +64,8 @@ async def create_community_reports_df( community_id=record[schemas.NODE_COMMUNITY], community_level=record[schemas.COMMUNITY_LEVEL], community_context=record[schemas.CONTEXT_STRING], - cache=cache, callbacks=callbacks, + cache=cache, strategy=strategy, ) tick() @@ -127,8 +85,8 @@ async def create_community_reports_df( async def _generate_report( runner: CommunityReportsStrategy, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, strategy: dict, community_id: int | str, community_level: int, @@ -146,9 +104,9 @@ def load_strategy( """Load strategy method definition.""" match strategy: case CreateCommunityReportsStrategyType.graph_intelligence: - from .strategies.graph_intelligence import run + from .strategies import run_graph_intelligence - return run + return run_graph_intelligence case _: msg = f"Unknown strategy: {strategy}" raise ValueError(msg) diff --git a/graphrag/index/verbs/graph/report/strategies/typing.py b/graphrag/index/operations/summarize_communities/typing.py similarity index 78% rename from graphrag/index/verbs/graph/report/strategies/typing.py rename to graphrag/index/operations/summarize_communities/typing.py index 087c7247..a5a01cbb 100644 --- a/graphrag/index/verbs/graph/report/strategies/typing.py +++ 
b/graphrag/index/operations/summarize_communities/typing.py @@ -4,6 +4,7 @@ """A module containing 'Finding' and 'CommunityReport' models.""" from collections.abc import Awaitable, Callable +from enum import Enum from typing import Any from datashaper import VerbCallbacks @@ -50,3 +51,13 @@ CommunityReportsStrategy = Callable[ ], Awaitable[CommunityReport | None], ] + + +class CreateCommunityReportsStrategyType(str, Enum): + """CreateCommunityReportsStrategyType class definition.""" + + graph_intelligence = "graph_intelligence" + + def __repr__(self): + """Get a string representation.""" + return f'"{self.value}"' diff --git a/graphrag/index/operations/summarize_descriptions/__init__.py b/graphrag/index/operations/summarize_descriptions/__init__.py new file mode 100644 index 00000000..55f818d1 --- /dev/null +++ b/graphrag/index/operations/summarize_descriptions/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 Microsoft Corporation. +# Licensed under the MIT License + +"""Root package for description summarization.""" + +from .summarize_descriptions import summarize_descriptions +from .typing import SummarizationStrategy, SummarizeStrategyType + +__all__ = [ + "SummarizationStrategy", + "SummarizeStrategyType", + "summarize_descriptions", +] diff --git a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/run_graph_intelligence.py b/graphrag/index/operations/summarize_descriptions/strategies.py similarity index 69% rename from graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/run_graph_intelligence.py rename to graphrag/index/operations/summarize_descriptions/strategies.py index 57a1ecd2..91ff0d31 100644 --- a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/run_graph_intelligence.py +++ b/graphrag/index/operations/summarize_descriptions/strategies.py @@ -1,38 +1,34 @@ # Copyright (c) 2024 Microsoft Corporation. 
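Both strategy-type enums that moved into `typing.py` subclass `str`, which is what lets raw strings from a resolved strategy dict compare equal to enum members; the overridden `__repr__` renders the quoted value. A quick sketch of that behavior, re-declaring the enum locally for illustration:

```python
from enum import Enum


class CreateCommunityReportsStrategyType(str, Enum):
    """Local re-declaration mirroring the definition in typing.py."""

    graph_intelligence = "graph_intelligence"

    def __repr__(self):
        return f'"{self.value}"'


# a plain config string matches the enum member directly...
assert CreateCommunityReportsStrategyType.graph_intelligence == "graph_intelligence"
# ...and repr() shows the quoted value rather than the member name
assert repr(CreateCommunityReportsStrategyType.graph_intelligence) == '"graph_intelligence"'
```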
# Licensed under the MIT License -"""A module containing run_gi, run_resolve_entities and _create_text_list_splitter methods to run graph intelligence.""" +"""A module containing run_graph_intelligence, run_summarize_descriptions and _create_text_list_splitter methods to run graph intelligence.""" from datashaper import VerbCallbacks -from graphrag.config.enums import LLMType from graphrag.index.cache import PipelineCache from graphrag.index.graph.extractors.summarize import SummarizeExtractor from graphrag.index.llm import load_llm -from graphrag.index.verbs.entities.summarize.strategies.typing import ( +from graphrag.llm import CompletionLLM + +from .typing import ( StrategyConfig, SummarizedDescriptionResult, ) -from graphrag.llm import CompletionLLM - -from .defaults import DEFAULT_LLM_CONFIG -async def run( +async def run_graph_intelligence( described_items: str | tuple[str, str], descriptions: list[str], - reporter: VerbCallbacks, - pipeline_cache: PipelineCache, + callbacks: VerbCallbacks, + cache: PipelineCache, args: StrategyConfig, ) -> SummarizedDescriptionResult: """Run the graph intelligence entity extraction strategy.""" - llm_config = args.get("llm", DEFAULT_LLM_CONFIG) - llm_type = llm_config.get("type", LLMType.StaticResponse) - llm = load_llm( - "summarize_descriptions", llm_type, reporter, pipeline_cache, llm_config - ) + llm_config = args.get("llm", {}) + llm_type = llm_config.get("type") + llm = load_llm("summarize_descriptions", llm_type, callbacks, cache, llm_config) return await run_summarize_descriptions( - llm, described_items, descriptions, reporter, args + llm, described_items, descriptions, callbacks, args ) @@ -40,7 +36,7 @@ async def run_summarize_descriptions( llm: CompletionLLM, items: str | tuple[str, str], descriptions: list[str], - reporter: VerbCallbacks, + callbacks: VerbCallbacks, args: StrategyConfig, ) -> SummarizedDescriptionResult: """Run the entity extraction chain.""" @@ -56,8 +52,8 @@ async def run_summarize_descriptions( entity_name_key=entity_name_key, input_descriptions_key=input_descriptions_key, on_error=lambda e, stack, details: ( - reporter.error("Entity Extraction Error", e, stack, details) - if reporter + callbacks.error("Entity Extraction Error", e, stack, details) + if callbacks else None ), max_summary_length=args.get("max_summary_length", None), diff --git a/graphrag/index/verbs/entities/summarize/description_summarize.py b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py similarity index 83% rename from graphrag/index/verbs/entities/summarize/description_summarize.py rename to graphrag/index/operations/summarize_descriptions/summarize_descriptions.py index 40200b4c..40af8dfc 100644 --- a/graphrag/index/verbs/entities/summarize/description_summarize.py +++ b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py @@ -5,72 +5,32 @@ import asyncio import logging -from enum import Enum -from typing import Any, NamedTuple, cast +from typing import Any, cast import networkx as nx import pandas as pd from datashaper import ( ProgressTicker, - TableContainer, VerbCallbacks, - VerbInput, progress_ticker, - verb, ) from graphrag.index.cache import PipelineCache from graphrag.index.utils import load_graph -from .strategies.typing import SummarizationStrategy +from .typing import ( + DescriptionSummarizeRow, + SummarizationStrategy, + SummarizeStrategyType, +) log = logging.getLogger(__name__) -class DescriptionSummarizeRow(NamedTuple): - """DescriptionSummarizeRow class definition.""" - - graph: Any - -
-class SummarizeStrategyType(str, Enum): - """SummarizeStrategyType class definition.""" - - graph_intelligence = "graph_intelligence" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' - - -@verb(name="summarize_descriptions") async def summarize_descriptions( - input: VerbInput, - cache: PipelineCache, - callbacks: VerbCallbacks, - column: str, - to: str, - strategy: dict[str, Any] | None = None, - **kwargs, -) -> TableContainer: - """Summarize entity and relationship descriptions from an entity graph.""" - source = cast(pd.DataFrame, input.get_input()) - output = await summarize_descriptions_df( - source, - cache, - callbacks, - column=column, - to=to, - strategy=strategy, - **kwargs, - ) - return TableContainer(table=output) - - -async def summarize_descriptions_df( input: pd.DataFrame, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, column: str, to: str, strategy: dict[str, Any] | None = None, @@ -99,7 +59,6 @@ async def summarize_descriptions_df( ### yaml ```yaml - verb: entity_extract args: column: the_document_text_column_to_extract_descriptions_from to: the_column_to_output_the_summarized_descriptions_to @@ -221,9 +180,9 @@ def load_strategy(strategy_type: SummarizeStrategyType) -> SummarizationStrategy """Load strategy method definition.""" match strategy_type: case SummarizeStrategyType.graph_intelligence: - from .strategies.graph_intelligence import run as run_gi + from .strategies import run_graph_intelligence - return run_gi + return run_graph_intelligence case _: msg = f"Unknown strategy: {strategy_type}" raise ValueError(msg) diff --git a/graphrag/index/verbs/entities/summarize/strategies/typing.py b/graphrag/index/operations/summarize_descriptions/typing.py similarity index 63% rename from graphrag/index/verbs/entities/summarize/strategies/typing.py rename to graphrag/index/operations/summarize_descriptions/typing.py index 39829503..4e957cf4 100644 --- a/graphrag/index/verbs/entities/summarize/strategies/typing.py +++ b/graphrag/index/operations/summarize_descriptions/typing.py @@ -5,7 +5,8 @@ from collections.abc import Awaitable, Callable from dataclasses import dataclass -from typing import Any +from enum import Enum +from typing import Any, NamedTuple from datashaper import VerbCallbacks @@ -32,3 +33,19 @@ SummarizationStrategy = Callable[ ], Awaitable[SummarizedDescriptionResult], ] + + +class DescriptionSummarizeRow(NamedTuple): + """DescriptionSummarizeRow class definition.""" + + graph: Any + + +class SummarizeStrategyType(str, Enum): + """SummarizeStrategyType class definition.""" + + graph_intelligence = "graph_intelligence" + + def __repr__(self): + """Get a string representation.""" + return f'"{self.value}"' diff --git a/graphrag/index/verbs/graph/unpack.py b/graphrag/index/operations/unpack_graph.py similarity index 61% rename from graphrag/index/verbs/graph/unpack.py rename to graphrag/index/operations/unpack_graph.py index 3ab99a56..ad9f7381 100644 --- a/graphrag/index/verbs/graph/unpack.py +++ b/graphrag/index/operations/unpack_graph.py @@ -7,57 +7,20 @@ from typing import Any, cast import networkx as nx import pandas as pd -from datashaper import TableContainer, VerbCallbacks, VerbInput, progress_iterable, verb +from datashaper import VerbCallbacks, progress_iterable from graphrag.index.utils import load_graph default_copy = ["level"] -@verb(name="unpack_graph") def unpack_graph( - input: VerbInput, - callbacks: VerbCallbacks, - column: str, - type: str, # noqa A002 - copy: list[str] | None = 
None, - embeddings_column: str = "embeddings", - **kwargs, -) -> TableContainer: - """ - Unpack nodes or edges from a graphml graph, into a list of nodes or edges. - - This verb will create columns for each attribute in a node or edge. - - ## Usage - ```yaml - verb: unpack_graph - args: - type: node # The type of data to unpack, one of: node, edge. node will create a node list, edge will create an edge list - column: # The name of the column containing the graph, should be a graphml graph - ``` - """ - input_df = input.get_input() - output_df = unpack_graph_df( - cast(pd.DataFrame, input_df), - callbacks, - column, - type, - copy, - embeddings_column, - kwargs=kwargs, - ) - return TableContainer(table=output_df) - - -def unpack_graph_df( input_df: pd.DataFrame, callbacks: VerbCallbacks, column: str, type: str, # noqa A002 copy: list[str] | None = None, embeddings_column: str = "embeddings", - **kwargs, ) -> pd.DataFrame: """Unpack nodes or edges from a graphml graph, into a list of nodes or edges.""" if copy is None: @@ -83,7 +46,6 @@ def unpack_graph_df( cast(str | nx.Graph, row[column]), type, embeddings, - kwargs, ) ]) @@ -94,19 +56,18 @@ def _run_unpack( graphml_or_graph: str | nx.Graph, unpack_type: str, embeddings: dict[str, list[float]], - args: dict[str, Any], ) -> list[dict[str, Any]]: graph = load_graph(graphml_or_graph) if unpack_type == "nodes": - return _unpack_nodes(graph, embeddings, args) + return _unpack_nodes(graph, embeddings) if unpack_type == "edges": - return _unpack_edges(graph, args) + return _unpack_edges(graph) msg = f"Unknown type {unpack_type}" raise ValueError(msg) def _unpack_nodes( - graph: nx.Graph, embeddings: dict[str, list[float]], _args: dict[str, Any] + graph: nx.Graph, embeddings: dict[str, list[float]] ) -> list[dict[str, Any]]: return [ { @@ -118,7 +79,7 @@ def _unpack_nodes( ] -def _unpack_edges(graph: nx.Graph, _args: dict[str, Any]) -> list[dict[str, Any]]: +def _unpack_edges(graph: nx.Graph) -> list[dict[str, Any]]: return [ { "source": source_id, diff --git a/graphrag/index/run/run.py b/graphrag/index/run/run.py index 0ef973ed..dd50c4a1 100644 --- a/graphrag/index/run/run.py +++ b/graphrag/index/run/run.py @@ -47,7 +47,6 @@ from graphrag.index.typing import PipelineRunResult # Register all verbs from graphrag.index.update.dataframes import get_delta_docs, update_dataframe_outputs -from graphrag.index.verbs import * # noqa from graphrag.index.workflows import ( VerbDefinitions, WorkflowDefinitions, diff --git a/graphrag/index/verbs/__init__.py b/graphrag/index/verbs/__init__.py deleted file mode 100644 index 5859d983..00000000 --- a/graphrag/index/verbs/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
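With the `@verb` decorators and `TableContainer` plumbing removed, operations like `unpack_graph` are now called as ordinary functions: a DataFrame and a `VerbCallbacks` in, a DataFrame out. A hedged sketch of a direct call; the toy graph, column names, and the assumption that a missing `embeddings` column is tolerated are all illustrative, not from the patch:

```python
import networkx as nx
import pandas as pd
from datashaper import NoopVerbCallbacks

from graphrag.index.operations.unpack_graph import unpack_graph

# build a tiny graphml payload of the kind the pipeline stores per row
graph = nx.Graph()
graph.add_node("A", type="PERSON")
graph.add_node("B", type="ORGANIZATION")
graph.add_edge("A", "B", weight=1.0)
graphml = "".join(nx.generate_graphml(graph))

# "level" is included because it is the default copy column
df = pd.DataFrame({"clustered_graph": [graphml], "level": [0]})

# no VerbInput/TableContainer wrapping, and no **kwargs pass-through anymore
nodes = unpack_graph(
    df, NoopVerbCallbacks(), column="clustered_graph", type="nodes"
)
```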
-# Licensed under the MIT License - -"""A module containing get_default_verbs method definition.""" - -from .covariates import extract_covariates -from .entities import entity_extract, summarize_descriptions -from .genid import genid -from .graph import ( - cluster_graph, - create_community_reports, - create_graph, - layout_graph, - merge_graphs, - unpack_graph, -) -from .overrides import aggregate, concat -from .snapshot import snapshot -from .snapshot_rows import snapshot_rows -from .spread_json import spread_json -from .text import chunk, text_split, text_translate -from .unzip import unzip -from .zip import zip_verb - -__all__ = [ - "aggregate", - "chunk", - "cluster_graph", - "concat", - "create_community_reports", - "create_graph", - "entity_extract", - "extract_covariates", - "genid", - "layout_graph", - "merge_graphs", - "snapshot", - "snapshot_rows", - "spread_json", - "summarize_descriptions", - "text_split", - "text_translate", - "unpack_graph", - "unzip", - "zip_verb", -] diff --git a/graphrag/index/verbs/covariates/__init__.py b/graphrag/index/verbs/covariates/__init__.py deleted file mode 100644 index cdebee22..00000000 --- a/graphrag/index/verbs/covariates/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine covariates package root.""" - -from .extract_covariates import extract_covariates - -__all__ = ["extract_covariates"] diff --git a/graphrag/index/verbs/covariates/extract_covariates/strategies/__init__.py b/graphrag/index/verbs/covariates/extract_covariates/strategies/__init__.py deleted file mode 100644 index 605c66f8..00000000 --- a/graphrag/index/verbs/covariates/extract_covariates/strategies/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine text extract claims strategies package root.""" diff --git a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/__init__.py b/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/__init__.py deleted file mode 100644 index ab01f06f..00000000 --- a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine text extract claims strategies graph intelligence package root.""" - -from .run_gi_extract_claims import run - -__all__ = ["run"] diff --git a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/defaults.py b/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/defaults.py deleted file mode 100644 index a777f296..00000000 --- a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/defaults.py +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A file containing MOCK_LLM_RESPONSES definition.""" - -MOCK_LLM_RESPONSES = [ - """ -(COMPANY A<|>GOVERNMENT AGENCY B<|>ANTI-COMPETITIVE PRACTICES<|>TRUE<|>2022-01-10T00:00:00<|>2022-01-10T00:00:00<|>Company A was found to engage in anti-competitive practices because it was fined for bid rigging in multiple public tenders published by Government Agency B according to an article published on 2022/01/10<|>According to an article published on 2022/01/10, Company A was fined for bid rigging while participating in multiple public tenders published by Government Agency B.) - """.strip() -] diff --git a/graphrag/index/verbs/entities/__init__.py b/graphrag/index/verbs/entities/__init__.py deleted file mode 100644 index 2f55d710..00000000 --- a/graphrag/index/verbs/entities/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine entities package root.""" - -from .extraction import entity_extract -from .summarize import summarize_descriptions - -__all__ = ["entity_extract", "summarize_descriptions"] diff --git a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/__init__.py b/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/__init__.py deleted file mode 100644 index 083c0e41..00000000 --- a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine graph intelligence package root.""" - -from .run_graph_intelligence import run_gi - -__all__ = ["run_gi"] diff --git a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/defaults.py b/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/defaults.py deleted file mode 100644 index 237e6657..00000000 --- a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/defaults.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A file containing some default responses.""" - -from graphrag.config.enums import LLMType - -MOCK_LLM_RESPONSES = [ - """ - ("entity"<|>COMPANY_A<|>COMPANY<|>Company_A is a test company) - ## - ("entity"<|>COMPANY_B<|>COMPANY<|>Company_B owns Company_A and also shares an address with Company_A) - ## - ("entity"<|>PERSON_C<|>PERSON<|>Person_C is director of Company_A) - ## - ("relationship"<|>COMPANY_A<|>COMPANY_B<|>Company_A and Company_B are related because Company_A is 100% owned by Company_B and the two companies also share the same address)<|>2) - ## - ("relationship"<|>COMPANY_A<|>PERSON_C<|>Company_A and Person_C are related because Person_C is director of Company_A<|>1)) - """.strip() -] - -DEFAULT_LLM_CONFIG = { - "type": LLMType.StaticResponse, - "responses": MOCK_LLM_RESPONSES, -} diff --git a/graphrag/index/verbs/entities/summarize/__init__.py b/graphrag/index/verbs/entities/summarize/__init__.py deleted file mode 100644 index d7e9a5d9..00000000 --- a/graphrag/index/verbs/entities/summarize/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Root package for entity summarization.""" - -from .description_summarize import SummarizeStrategyType, summarize_descriptions - -__all__ = ["SummarizeStrategyType", "summarize_descriptions"] diff --git a/graphrag/index/verbs/entities/summarize/strategies/__init__.py b/graphrag/index/verbs/entities/summarize/strategies/__init__.py deleted file mode 100644 index 28c398e6..00000000 --- a/graphrag/index/verbs/entities/summarize/strategies/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""Indexing Engine - Summarization Strategies Package.""" - -from .typing import SummarizationStrategy - -__all__ = ["SummarizationStrategy"] diff --git a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py b/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py deleted file mode 100644 index a98d9406..00000000 --- a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Entity summarization graph intelligence package root.""" - -from .run_graph_intelligence import run - -__all__ = ["run"] diff --git a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/defaults.py b/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/defaults.py deleted file mode 100644 index 8ac42aa1..00000000 --- a/graphrag/index/verbs/entities/summarize/strategies/graph_intelligence/defaults.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A file containing some default responses.""" - -from graphrag.config.enums import LLMType - -MOCK_LLM_RESPONSES = [ - """ - This is a MOCK response for the LLM. It is summarized! - """.strip() -] - -DEFAULT_LLM_CONFIG = { - "type": LLMType.StaticResponse, - "responses": MOCK_LLM_RESPONSES, -} diff --git a/graphrag/index/verbs/genid.py b/graphrag/index/verbs/genid.py deleted file mode 100644 index 58ab581f..00000000 --- a/graphrag/index/verbs/genid.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing genid method definition.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - -from graphrag.index.utils import gen_md5_hash - - -@verb(name="genid") -def genid( - input: VerbInput, - to: str, - method: str = "md5_hash", - hash: list[str] | None = None, # noqa A002 - **_kwargs: dict, -) -> TableContainer: - """ - Generate a unique id for each row in the tabular data. 
- - ## Usage - ### json - ```json - { - "verb": "genid", - "args": { - "to": "id_output_column_name", /* The name of the column to output the id to */ - "method": "md5_hash", /* The method to use to generate the id */ - "hash": ["list", "of", "column", "names"] /* only if using md5_hash */, - "seed": 034324 /* The random seed to use with UUID */ - } - } - ``` - - ### yaml - ```yaml - verb: genid - args: - to: id_output_column_name - method: md5_hash - hash: - - list - - of - - column - - names - seed: 034324 - ``` - """ - data = cast(pd.DataFrame, input.source.table) - - output = genid_df(data, to, method, hash) - - return TableContainer(table=output) - - -def genid_df( - input: pd.DataFrame, - to: str, - method: str = "md5_hash", - hash: list[str] | None = None, # noqa A002 -): - """Generate a unique id for each row in the tabular data.""" - data = input - match method: - case "md5_hash": - if not hash: - msg = 'Must specify the "hash" columns to use md5_hash method' - raise ValueError(msg) - data[to] = data.apply(lambda row: gen_md5_hash(row, hash), axis=1) - case "increment": - data[to] = data.index + 1 - case _: - msg = f"Unknown method {method}" - raise ValueError(msg) - - return data diff --git a/graphrag/index/verbs/graph/__init__.py b/graphrag/index/verbs/graph/__init__.py deleted file mode 100644 index f252a9f3..00000000 --- a/graphrag/index/verbs/graph/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine graph package root.""" - -from .clustering import cluster_graph -from .compute_edge_combined_degree import compute_edge_combined_degree -from .create import DEFAULT_EDGE_ATTRIBUTES, DEFAULT_NODE_ATTRIBUTES, create_graph -from .layout import layout_graph -from .merge import merge_graphs -from .report import ( - create_community_reports, - prepare_community_reports, - prepare_community_reports_claims, - prepare_community_reports_edges, - restore_community_hierarchy, -) -from .unpack import unpack_graph - -__all__ = [ - "DEFAULT_EDGE_ATTRIBUTES", - "DEFAULT_NODE_ATTRIBUTES", - "cluster_graph", - "compute_edge_combined_degree", - "create_community_reports", - "create_graph", - "layout_graph", - "merge_graphs", - "prepare_community_reports", - "prepare_community_reports_claims", - "prepare_community_reports_edges", - "restore_community_hierarchy", - "unpack_graph", -] diff --git a/graphrag/index/verbs/graph/clustering/__init__.py b/graphrag/index/verbs/graph/clustering/__init__.py deleted file mode 100644 index a5db89bb..00000000 --- a/graphrag/index/verbs/graph/clustering/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""The Indexing Engine graph clustering package root.""" - -from .cluster_graph import GraphCommunityStrategyType, cluster_graph - -__all__ = ["GraphCommunityStrategyType", "cluster_graph"] diff --git a/graphrag/index/verbs/graph/clustering/strategies/__init__.py b/graphrag/index/verbs/graph/clustering/strategies/__init__.py deleted file mode 100644 index 16a03f12..00000000 --- a/graphrag/index/verbs/graph/clustering/strategies/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""Graph Clustering Strategies.""" diff --git a/graphrag/index/verbs/graph/clustering/strategies/leiden.py b/graphrag/index/verbs/graph/clustering/strategies/leiden.py deleted file mode 100644 index ffc36880..00000000 --- a/graphrag/index/verbs/graph/clustering/strategies/leiden.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing run and _compute_leiden_communities methods definitions.""" - -import logging -from typing import Any - -import networkx as nx -from graspologic.partition import hierarchical_leiden - -from graphrag.index.graph.utils import stable_largest_connected_component - -log = logging.getLogger(__name__) - - -def run(graph: nx.Graph, args: dict[str, Any]) -> dict[int, dict[str, list[str]]]: - """Run method definition.""" - max_cluster_size = args.get("max_cluster_size", 10) - use_lcc = args.get("use_lcc", True) - if args.get("verbose", False): - log.info( - "Running leiden with max_cluster_size=%s, lcc=%s", max_cluster_size, use_lcc - ) - - node_id_to_community_map = _compute_leiden_communities( - graph=graph, - max_cluster_size=max_cluster_size, - use_lcc=use_lcc, - seed=args.get("seed", 0xDEADBEEF), - ) - levels = args.get("levels") - - # If they don't pass in levels, use them all - if levels is None: - levels = sorted(node_id_to_community_map.keys()) - - results_by_level: dict[int, dict[str, list[str]]] = {} - for level in levels: - result = {} - results_by_level[level] = result - for node_id, raw_community_id in node_id_to_community_map[level].items(): - community_id = str(raw_community_id) - if community_id not in result: - result[community_id] = [] - result[community_id].append(node_id) - return results_by_level - - -# Taken from graph_intelligence & adapted -def _compute_leiden_communities( - graph: nx.Graph | nx.DiGraph, - max_cluster_size: int, - use_lcc: bool, - seed=0xDEADBEEF, -) -> dict[int, dict[str, int]]: - """Return Leiden root communities.""" - if use_lcc: - graph = stable_largest_connected_component(graph) - - community_mapping = hierarchical_leiden( - graph, max_cluster_size=max_cluster_size, random_seed=seed - ) - results: dict[int, dict[str, int]] = {} - for partition in community_mapping: - results[partition.level] = results.get(partition.level, {}) - results[partition.level][partition.node] = partition.cluster - - return results diff --git a/graphrag/index/verbs/graph/clustering/typing.py b/graphrag/index/verbs/graph/clustering/typing.py deleted file mode 100644 index 4d6fc7e6..00000000 --- a/graphrag/index/verbs/graph/clustering/typing.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing Communities list definition.""" - -Communities = list[tuple[int, str, list[str]]] diff --git a/graphrag/index/verbs/graph/compute_edge_combined_degree.py b/graphrag/index/verbs/graph/compute_edge_combined_degree.py deleted file mode 100644 index 59101376..00000000 --- a/graphrag/index/verbs/graph/compute_edge_combined_degree.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing create_graph, _get_node_attributes, _get_edge_attributes and _get_attribute_column_mapping methods definition.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - -from graphrag.index.utils.ds_util import get_required_input_table - - -@verb(name="compute_edge_combined_degree") -def compute_edge_combined_degree( - input: VerbInput, - to: str = "rank", - node_name_column: str = "title", - node_degree_column: str = "degree", - edge_source_column: str = "source", - edge_target_column: str = "target", - **_kwargs, -) -> TableContainer: - """ - Compute the combined degree for each edge in a graph. - - Inputs Tables: - - input: The edge table - - nodes: The nodes table. - - Args: - - to: The name of the column to output the combined degree to. Default="rank" - """ - edge_df: pd.DataFrame = cast(pd.DataFrame, input.get_input()) - node_degree_df = _get_node_degree_table(input, node_name_column, node_degree_column) - - output_df = compute_edge_combined_degree_df( - edge_df, - node_degree_df, - to, - node_name_column, - node_degree_column, - edge_source_column, - edge_target_column, - ) - - return TableContainer(table=output_df) - - -def compute_edge_combined_degree_df( - edge_df: pd.DataFrame, - node_degree_df: pd.DataFrame, - to: str, - node_name_column: str, - node_degree_column: str, - edge_source_column: str, - edge_target_column: str, -) -> pd.DataFrame: - """Compute the combined degree for each edge in a graph.""" - if to in edge_df.columns: - return edge_df - - def join_to_degree(df: pd.DataFrame, column: str) -> pd.DataFrame: - degree_column = _degree_colname(column) - result = df.merge( - node_degree_df.rename( - columns={node_name_column: column, node_degree_column: degree_column} - ), - on=column, - how="left", - ) - result[degree_column] = result[degree_column].fillna(0) - return result - - output_df = join_to_degree(edge_df, edge_source_column) - output_df = join_to_degree(output_df, edge_target_column) - output_df[to] = ( - output_df[_degree_colname(edge_source_column)] - + output_df[_degree_colname(edge_target_column)] - ) - return output_df - - -def _degree_colname(column: str) -> str: - return f"{column}_degree" - - -def _get_node_degree_table( - input: VerbInput, node_name_column: str, node_degree_column: str -) -> pd.DataFrame: - nodes_container = get_required_input_table(input, "nodes") - nodes = cast(pd.DataFrame, nodes_container.table) - return cast(pd.DataFrame, nodes[[node_name_column, node_degree_column]]) diff --git a/graphrag/index/verbs/graph/create.py b/graphrag/index/verbs/graph/create.py deleted file mode 100644 index eaf06284..00000000 --- a/graphrag/index/verbs/graph/create.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. 
-# Licensed under the MIT License - -"""A module containing create_graph, _get_node_attributes, _get_edge_attributes and _get_attribute_column_mapping methods definition.""" - -from typing import Any - -import networkx as nx -import pandas as pd -from datashaper import TableContainer, VerbCallbacks, VerbInput, progress_iterable, verb - -from graphrag.index.utils import clean_str - -DEFAULT_NODE_ATTRIBUTES = ["label", "type", "id", "name", "description", "community"] -DEFAULT_EDGE_ATTRIBUTES = ["label", "type", "name", "source", "target"] - - -@verb(name="create_graph") -def create_graph( - input: VerbInput, - callbacks: VerbCallbacks, - to: str, - type: str, # noqa A002 - graph_type: str = "undirected", - **kwargs, -) -> TableContainer: - """ - Create a graph from a dataframe. The verb outputs a new column containing the graph. - - > Note: This will roll up all rows into a single graph. - - ## Usage - ```yaml - verb: create_graph - args: - type: node # The type of graph to create, one of: node, edge - to: # The name of the column to output the graph to, this will be a graphml graph - attributes: # The attributes for the nodes / edges - # If using the node type, the following attributes are required: - id: - - # If using the edge type, the following attributes are required: - source: - target: - - # Other attributes can be added as follows: - : - ... for each attribute - ``` - """ - if type != "node" and type != "edge": - msg = f"Unknown type {type}" - raise ValueError(msg) - - input_df = input.get_input() - num_total = len(input_df) - out_graph: nx.Graph = _create_nx_graph(graph_type) - - in_attributes = ( - _get_node_attributes(kwargs) if type == "node" else _get_edge_attributes(kwargs) - ) - - # At this point, _get_node_attributes and _get_edge_attributes have already validated - id_col = in_attributes.get( - "id", in_attributes.get("label", in_attributes.get("name", None)) - ) - source_col = in_attributes.get("source", None) - target_col = in_attributes.get("target", None) - - for _, row in progress_iterable(input_df.iterrows(), callbacks.progress, num_total): - item_attributes = { - clean_str(key): _clean_value(row[value]) - for key, value in in_attributes.items() - if value in row - } - if type == "node": - id = clean_str(row[id_col]) - out_graph.add_node(id, **item_attributes) - elif type == "edge": - source = clean_str(row[source_col]) - target = clean_str(row[target_col]) - out_graph.add_edge(source, target, **item_attributes) - - graphml_string = "".join(nx.generate_graphml(out_graph)) - output_df = pd.DataFrame([{to: graphml_string}]) - return TableContainer(table=output_df) - - -def _clean_value(value: Any) -> str: - if value is None: - return "" - if isinstance(value, str): - return clean_str(value) - - msg = f"Value must be a string or None, got {type(value)}" - raise TypeError(msg) - - -def _get_node_attributes(args: dict[str, Any]) -> dict[str, Any]: - mapping = _get_attribute_column_mapping( - args.get("attributes", DEFAULT_NODE_ATTRIBUTES) - ) - if "id" not in mapping and "label" not in mapping and "name" not in mapping: - msg = "You must specify an id, label, or name column in the node attributes" - raise ValueError(msg) - return mapping - - -def _get_edge_attributes(args: dict[str, Any]) -> dict[str, Any]: - mapping = _get_attribute_column_mapping( - args.get("attributes", DEFAULT_EDGE_ATTRIBUTES) - ) - if "source" not in mapping or "target" not in mapping: - msg = "You must specify a source and target column in the edge attributes" - raise ValueError(msg) - return 
diff --git a/graphrag/index/verbs/graph/merge/defaults.py b/graphrag/index/verbs/graph/merge/defaults.py
deleted file mode 100644
index 80c60331..00000000
--- a/graphrag/index/verbs/graph/merge/defaults.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A file containing DEFAULT_NODE_OPERATIONS, DEFAULT_EDGE_OPERATIONS and DEFAULT_CONCAT_SEPARATOR values definition."""
-
-from .typing import BasicMergeOperation
-
-DEFAULT_NODE_OPERATIONS = {
-    "*": {
-        "operation": BasicMergeOperation.Replace,
-    }
-}
-
-DEFAULT_EDGE_OPERATIONS = {
-    "*": {
-        "operation": BasicMergeOperation.Replace,
-    },
-    "weight": "sum",
-}
-
-DEFAULT_CONCAT_SEPARATOR = ","
diff --git a/graphrag/index/verbs/graph/report/__init__.py b/graphrag/index/verbs/graph/report/__init__.py
deleted file mode 100644
index e47d9cce..00000000
--- a/graphrag/index/verbs/graph/report/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""The Indexing Engine graph report package root."""
-
-from .create_community_reports import (
-    CreateCommunityReportsStrategyType,
-    create_community_reports,
-)
-from .prepare_community_reports import prepare_community_reports
-from .prepare_community_reports_claims import prepare_community_reports_claims
-from .prepare_community_reports_edges import prepare_community_reports_edges
-from .prepare_community_reports_nodes import prepare_community_reports_nodes
-from .restore_community_hierarchy import restore_community_hierarchy
-
-__all__ = [
-    "CreateCommunityReportsStrategyType",
-    "create_community_reports",
-    "prepare_community_reports",
-    "prepare_community_reports_claims",
-    "prepare_community_reports_edges",
-    "prepare_community_reports_nodes",
-    "restore_community_hierarchy",
-]
diff --git a/graphrag/index/verbs/graph/report/prepare_community_reports_claims.py b/graphrag/index/verbs/graph/report/prepare_community_reports_claims.py
deleted file mode 100644
index aa9a7907..00000000
--- a/graphrag/index/verbs/graph/report/prepare_community_reports_claims.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing prepare_community_reports_claims method definition."""
-
-from typing import cast
-
-import pandas as pd
-from datashaper import TableContainer, VerbInput, verb
-
-from graphrag.index.graph.extractors.community_reports.schemas import (
-    CLAIM_DESCRIPTION,
-    CLAIM_DETAILS,
-    CLAIM_ID,
-    CLAIM_STATUS,
-    CLAIM_SUBJECT,
-    CLAIM_TYPE,
-)
-
-_MISSING_DESCRIPTION = "No Description"
-
-
-@verb(name="prepare_community_reports_claims")
-def prepare_community_reports_claims(
-    input: VerbInput,
-    to: str = CLAIM_DETAILS,
-    id_column: str = CLAIM_ID,
-    description_column: str = CLAIM_DESCRIPTION,
-    subject_column: str = CLAIM_SUBJECT,
-    type_column: str = CLAIM_TYPE,
-    status_column: str = CLAIM_STATUS,
-    **_kwargs,
-) -> TableContainer:
-    """Merge claim details into an object."""
-    claim_df: pd.DataFrame = cast(pd.DataFrame, input.get_input())
-    claim_df = claim_df.fillna(value={description_column: _MISSING_DESCRIPTION})
-
-    # merge values of five columns into a map column
-    claim_df[to] = claim_df.apply(
-        lambda x: {
-            id_column: x[id_column],
-            subject_column: x[subject_column],
-            type_column: x[type_column],
-            status_column: x[status_column],
-            description_column: x[description_column],
-        },
-        axis=1,
-    )
-
-    return TableContainer(table=claim_df)
diff --git a/graphrag/index/verbs/graph/report/prepare_community_reports_edges.py b/graphrag/index/verbs/graph/report/prepare_community_reports_edges.py
deleted file mode 100644
index b568aba0..00000000
--- a/graphrag/index/verbs/graph/report/prepare_community_reports_edges.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing prepare_community_reports_edges method definition."""
-
-from typing import cast
-
-import pandas as pd
-from datashaper import TableContainer, VerbInput, verb
-
-from graphrag.index.graph.extractors.community_reports.schemas import (
-    EDGE_DEGREE,
-    EDGE_DESCRIPTION,
-    EDGE_DETAILS,
-    EDGE_ID,
-    EDGE_SOURCE,
-    EDGE_TARGET,
-)
-
-_MISSING_DESCRIPTION = "No Description"
-
-
-@verb(name="prepare_community_reports_edges")
-def prepare_community_reports_edges(
-    input: VerbInput,
-    to: str = EDGE_DETAILS,
-    id_column: str = EDGE_ID,
-    source_column: str = EDGE_SOURCE,
-    target_column: str = EDGE_TARGET,
-    description_column: str = EDGE_DESCRIPTION,
-    degree_column: str = EDGE_DEGREE,
-    **_kwargs,
-) -> TableContainer:
-    """Merge edge details into an object."""
-    edge_df: pd.DataFrame = cast(pd.DataFrame, input.get_input()).fillna(
-        value={description_column: _MISSING_DESCRIPTION}
-    )
-    edge_df[to] = edge_df.apply(
-        lambda x: {
-            id_column: x[id_column],
-            source_column: x[source_column],
-            target_column: x[target_column],
-            description_column: x[description_column],
-            degree_column: x[degree_column],
-        },
-        axis=1,
-    )
-    return TableContainer(table=edge_df)
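The prepare_community_reports_* verbs being deleted in this stretch of the diff all follow one shape: fill missing descriptions with a sentinel, then fold several columns into a single details mapping per row. A minimal sketch of that pattern with hypothetical claim rows:

```python
import pandas as pd

# Hypothetical claim rows; missing descriptions fall back to a sentinel.
claims = pd.DataFrame({
    "id": ["c1"],
    "subject_id": ["COMPANY_A"],
    "description": [None],
})
claims = claims.fillna(value={"description": "No Description"})

# Fold the per-row fields into one mapping column, as the prepare_* verbs do.
claims["claim_details"] = claims.apply(
    lambda x: {
        "id": x["id"],
        "subject_id": x["subject_id"],
        "description": x["description"],
    },
    axis=1,
)
print(claims["claim_details"][0])
```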
diff --git a/graphrag/index/verbs/graph/report/prepare_community_reports_nodes.py b/graphrag/index/verbs/graph/report/prepare_community_reports_nodes.py
deleted file mode 100644
index f159c125..00000000
--- a/graphrag/index/verbs/graph/report/prepare_community_reports_nodes.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing prepare_community_reports_nodes method definition."""
-
-from typing import cast
-
-import pandas as pd
-from datashaper import TableContainer, VerbInput, verb
-
-from graphrag.index.graph.extractors.community_reports.schemas import (
-    NODE_DEGREE,
-    NODE_DESCRIPTION,
-    NODE_DETAILS,
-    NODE_ID,
-    NODE_NAME,
-)
-
-_MISSING_DESCRIPTION = "No Description"
-
-
-@verb(name="prepare_community_reports_nodes")
-def prepare_community_reports_nodes(
-    input: VerbInput,
-    to: str = NODE_DETAILS,
-    id_column: str = NODE_ID,
-    name_column: str = NODE_NAME,
-    description_column: str = NODE_DESCRIPTION,
-    degree_column: str = NODE_DEGREE,
-    **_kwargs,
-) -> TableContainer:
-    """Merge node details into an object."""
-    node_df = cast(pd.DataFrame, input.get_input())
-    node_df = node_df.fillna(value={description_column: _MISSING_DESCRIPTION})
-
-    # merge values of four columns into a map column
-    node_df[to] = node_df.apply(
-        lambda x: {
-            id_column: x[id_column],
-            name_column: x[name_column],
-            description_column: x[description_column],
-            degree_column: x[degree_column],
-        },
-        axis=1,
-    )
-    return TableContainer(table=node_df)
diff --git a/graphrag/index/verbs/graph/report/strategies/__init__.py b/graphrag/index/verbs/graph/report/strategies/__init__.py
deleted file mode 100644
index 87d1f9e2..00000000
--- a/graphrag/index/verbs/graph/report/strategies/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""The Indexing Engine graph report strategies package root."""
diff --git a/graphrag/index/verbs/graph/report/strategies/graph_intelligence/__init__.py b/graphrag/index/verbs/graph/report/strategies/graph_intelligence/__init__.py
deleted file mode 100644
index 7f51d790..00000000
--- a/graphrag/index/verbs/graph/report/strategies/graph_intelligence/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""The Indexing Engine graph report strategies graph intelligence package root."""
-
-from .run_graph_intelligence import run
-
-__all__ = ["run"]
diff --git a/graphrag/index/verbs/graph/report/strategies/graph_intelligence/defaults.py b/graphrag/index/verbs/graph/report/strategies/graph_intelligence/defaults.py
deleted file mode 100644
index c184fb8e..00000000
--- a/graphrag/index/verbs/graph/report/strategies/graph_intelligence/defaults.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A file containing DEFAULT_CHUNK_SIZE and MOCK_RESPONSES definitions."""
-
-import json
-
-DEFAULT_CHUNK_SIZE = 3000
-
-MOCK_RESPONSES = [
-    json.dumps({
-        "title": "<report_title>",
-        "summary": "<executive_summary>",
-        "rating": 2,
-        "rating_explanation": "<rating_explanation>",
-        "findings": [
-            {
-                "summary": "<insight_1_summary>",
-                "explanation": "<insight_1_explanation>",
-            },
-            {
-                "summary": "<insight_2_summary>",
-                "explanation": "<insight_2_explanation>",
-            },
-        ],
-    })
-]
diff --git a/graphrag/index/verbs/overrides/aggregate.py b/graphrag/index/verbs/overrides/aggregate.py
deleted file mode 100644
--- a/graphrag/index/verbs/overrides/aggregate.py
+++ /dev/null
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing aggregate method definition."""
-
-from dataclasses import dataclass
-from typing import Any, cast
-
-import pandas as pd
-from datashaper import (
-    FieldAggregateOperation,
-    Progress,
-    Table,
-    TableContainer,
-    VerbCallbacks,
-    VerbInput,
-    aggregate_operation_mapping,
-    verb,
-)
-
-
-@verb(name="aggregate_override")
-def aggregate(
-    input: VerbInput,
-    callbacks: VerbCallbacks,
-    aggregations: list[dict[str, Any]],
-    groupby: list[str] | None = None,
-    **_kwargs: dict,
-) -> TableContainer:
-    """Aggregate method definition."""
-    input_table = input.get_input()
-    callbacks.progress(Progress(percent=0))
-
-    output = aggregate_df(input_table, aggregations, groupby)
-
-    callbacks.progress(Progress(percent=1))
-
-    return TableContainer(table=output)
-
-
-def aggregate_df(
-    input_table: Table,
-    aggregations: list[dict[str, Any]],
-    groupby: list[str] | None = None,
-) -> pd.DataFrame:
-    """Aggregate method definition."""
-    aggregations_to_apply = _load_aggregations(aggregations)
-    df_aggregations = {
-        agg.column: _get_pandas_agg_operation(agg)
-        for agg in aggregations_to_apply.values()
-    }
-    if groupby is None:
-        output_grouped = input_table.groupby(lambda _x: True)
-    else:
-        output_grouped = input_table.groupby(groupby, sort=False)
-    output = cast(pd.DataFrame, output_grouped.agg(df_aggregations))
-    output.rename(
-        columns={agg.column: agg.to for agg in aggregations_to_apply.values()},
-        inplace=True,
-    )
-    output.columns = [agg.to for agg in aggregations_to_apply.values()]
-    return output.reset_index()
-
-
-@dataclass
-class Aggregation:
-    """Aggregation class method definition."""
-
-    column: str | None
-    operation: str
-    to: str
-
-    # Only useful for the concat operation
-    separator: str | None = None
-
-
-def _get_pandas_agg_operation(agg: Aggregation) -> Any:
-    # TODO: Merge into datashaper
-    if agg.operation == "string_concat":
-        return (agg.separator or ",").join
-    return aggregate_operation_mapping[FieldAggregateOperation(agg.operation)]
-
-
-def _load_aggregations(
-    aggregations: list[dict[str, Any]],
-) -> dict[str, Aggregation]:
-    return {
-        aggregation["column"]: Aggregation(
-            aggregation["column"], aggregation["operation"], aggregation["to"]
-        )
-        for aggregation in aggregations
-    }
diff --git a/graphrag/index/verbs/overrides/concat.py b/graphrag/index/verbs/overrides/concat.py
deleted file mode 100644
index 7a0f0e2c..00000000
--- a/graphrag/index/verbs/overrides/concat.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing concat method definition."""
-
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-from typing import cast
-
-import pandas as pd
-from datashaper import TableContainer, VerbInput, verb
-
-
-@verb(name="concat_override")
-def concat(
-    input: VerbInput,
-    columnwise: bool = False,
-    **_kwargs: dict,
-) -> TableContainer:
-    """Concat method definition."""
-    input_table = cast(pd.DataFrame, input.get_input())
-    others = cast(list[pd.DataFrame], input.get_others())
-    if columnwise:
-        output = pd.concat([input_table, *others], axis=1)
-    else:
-        output = pd.concat([input_table, *others], ignore_index=True)
-    return TableContainer(table=output)
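The deleted aggregate override is, at its core, a grouped aggregation whose output columns are renamed per the spec. A simplified sketch with hypothetical data; it maps operation names straight to pandas, whereas the original resolved them through datashaper's aggregate_operation_mapping:

```python
import pandas as pd

# Hypothetical input table and aggregation spec in the shape aggregate_df expected.
table = pd.DataFrame({"community": [1, 1, 2], "weight": [1.0, 2.0, 5.0]})
aggregations = [{"column": "weight", "operation": "sum", "to": "total_weight"}]

grouped = table.groupby(["community"], sort=False)
output = grouped.agg({agg["column"]: agg["operation"] for agg in aggregations})
output.columns = [agg["to"] for agg in aggregations]
print(output.reset_index())  # one row per community with the summed weight
```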
diff --git a/graphrag/index/verbs/spread_json.py b/graphrag/index/verbs/spread_json.py
deleted file mode 100644
index 38656e12..00000000
--- a/graphrag/index/verbs/spread_json.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing spread_json method definition."""
-
-import logging
-
-import pandas as pd
-from datashaper import TableContainer, VerbInput, verb
-
-from graphrag.index.utils import is_null
-
-# TODO: Check if this is already a thing
-DEFAULT_COPY = ["level"]
-
-
-@verb(name="spread_json")
-def spread_json(
-    input: VerbInput,
-    column: str,
-    copy: list[str] | None = None,
-    **_kwargs: dict,
-) -> TableContainer:
-    """
-    Unpack a column containing a JSON object into multiple columns.
-
-    id|json|b
-    1|{"x":5,"y":6}|b
-
-    is converted to
-
-    id|x|y|b
-    --------
-    1|5|6|b
-    """
-    if copy is None:
-        copy = DEFAULT_COPY
-    data = input.get_input()
-
-    results = []
-    for _, row in data.iterrows():
-        try:
-            cleaned_row = {col: row[col] for col in copy}
-            rest_row = row[column] if row[column] is not None else {}
-
-            if is_null(rest_row):
-                rest_row = {}
-
-            results.append({**cleaned_row, **rest_row})  # type: ignore
-        except Exception:
-            logging.exception("Error spreading row: %s", row)
-            raise
-    data = pd.DataFrame(results, index=data.index)
-
-    return TableContainer(table=data)
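The spread_json removal above is essentially dict-unpacking a JSON column into top-level columns while carrying selected columns over. A minimal sketch with a hypothetical row:

```python
import pandas as pd

# Hypothetical input: one carried-over column plus a JSON payload column.
data = pd.DataFrame([{"level": 0, "json": {"x": 5, "y": 6}}])

results = []
for _, row in data.iterrows():
    copied = {"level": row["level"]}  # columns carried over verbatim
    rest = row["json"] if row["json"] is not None else {}
    results.append({**copied, **rest})  # spread the JSON keys into columns

print(pd.DataFrame(results, index=data.index))  # columns: level, x, y
```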
diff --git a/graphrag/index/verbs/text/__init__.py b/graphrag/index/verbs/text/__init__.py
deleted file mode 100644
index bd7ddc16..00000000
--- a/graphrag/index/verbs/text/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""The Indexing Engine text package root."""
-
-from .chunk.text_chunk import chunk
-from .replace import replace
-from .split import text_split
-from .translate import text_translate
-
-__all__ = [
-    "chunk",
-    "replace",
-    "text_split",
-    "text_translate",
-]
diff --git a/graphrag/index/verbs/text/chunk/__init__.py b/graphrag/index/verbs/text/chunk/__init__.py
deleted file mode 100644
index 4e2a7729..00000000
--- a/graphrag/index/verbs/text/chunk/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""The Indexing Engine text chunk package root."""
-
-from .text_chunk import ChunkStrategy, ChunkStrategyType, chunk
-
-__all__ = ["ChunkStrategy", "ChunkStrategyType", "chunk"]
diff --git a/graphrag/index/verbs/text/chunk/strategies/__init__.py b/graphrag/index/verbs/text/chunk/strategies/__init__.py
deleted file mode 100644
index 0f15fcb2..00000000
--- a/graphrag/index/verbs/text/chunk/strategies/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""The Indexing Engine text chunk strategies package root."""
diff --git a/graphrag/index/verbs/text/chunk/strategies/sentence.py b/graphrag/index/verbs/text/chunk/strategies/sentence.py
deleted file mode 100644
index 687def1d..00000000
--- a/graphrag/index/verbs/text/chunk/strategies/sentence.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing run method definition."""
-
-from collections.abc import Iterable
-from typing import Any
-
-import nltk
-from datashaper import ProgressTicker
-
-from .typing import TextChunk
-
-
-def run(
-    input: list[str], _args: dict[str, Any], tick: ProgressTicker
-) -> Iterable[TextChunk]:
-    """Chunks text into multiple parts. A chunking strategy implementation."""
-    for doc_idx, text in enumerate(input):
-        sentences = nltk.sent_tokenize(text)
-        for sentence in sentences:
-            yield TextChunk(
-                text_chunk=sentence,
-                source_doc_indices=[doc_idx],
-            )
-        tick(1)
diff --git a/graphrag/index/verbs/text/chunk/strategies/typing.py b/graphrag/index/verbs/text/chunk/strategies/typing.py
deleted file mode 100644
index b4e833c8..00000000
--- a/graphrag/index/verbs/text/chunk/strategies/typing.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing ChunkStrategy definition."""
-
-from collections.abc import Callable, Iterable
-from typing import Any
-
-from datashaper import ProgressTicker
-
-from graphrag.index.verbs.text.chunk.typing import TextChunk
-
-# Given a list of document texts, return a list of tuples of (source_doc_indices, text_chunk)
-
-ChunkStrategy = Callable[
-    [list[str], dict[str, Any], ProgressTicker], Iterable[TextChunk]
-]
diff --git a/graphrag/index/verbs/text/replace/__init__.py b/graphrag/index/verbs/text/replace/__init__.py
deleted file mode 100644
index f863415f..00000000
--- a/graphrag/index/verbs/text/replace/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""The Indexing Engine text replace package root."""
-
-from .replace import text_replace
-
-__all__ = ["text_replace"]
diff --git a/graphrag/index/verbs/text/replace/replace.py b/graphrag/index/verbs/text/replace/replace.py
deleted file mode 100644
index 386fac34..00000000
--- a/graphrag/index/verbs/text/replace/replace.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing text_replace and _apply_replacements methods."""
-
-from typing import cast
-
-import pandas as pd
-from datashaper import TableContainer, VerbInput, verb
-
-from .typing import Replacement
-
-
-@verb(name="text_replace")
-def text_replace(
-    input: VerbInput,
-    column: str,
-    to: str,
-    replacements: list[dict[str, str]],
-    **_kwargs: dict,
-) -> TableContainer:
-    """
-    Apply a set of replacements to a piece of text.
-
-    ## Usage
-    ```yaml
-    verb: text_replace
-    args:
-        column: <column name> # The name of the column containing the text to replace
-        to: <column name> # The name of the column to write the replaced text to
-        replacements: # A list of replacements to apply
-            - pattern: <pattern> # The literal string to find (matched with str.replace, not as a regex)
-              replacement: <replacement> # The string to replace with
-    ```
-    """
-    output = cast(pd.DataFrame, input.get_input())
-    parsed_replacements = [Replacement(**r) for r in replacements]
-    output[to] = output[column].apply(
-        lambda text: _apply_replacements(text, parsed_replacements)
-    )
-    return TableContainer(table=output)
-
-
-def _apply_replacements(text: str, replacements: list[Replacement]) -> str:
-    for r in replacements:
-        text = text.replace(r.pattern, r.replacement)
-    return text
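One behavior of the deleted text_replace verb worth keeping in mind: replacements go through plain str.replace, so patterns match literally rather than as regexes. A minimal sketch with a hypothetical replacement spec:

```python
import pandas as pd

# Hypothetical replacement spec; patterns are literal substrings, not regexes.
replacements = [{"pattern": "Mr.", "replacement": "Mister"}]
df = pd.DataFrame({"text": ["Mr. Scrooge and Mr. Marley"]})

def apply_replacements(text: str) -> str:
    # Literal substring replacement, mirroring the deleted _apply_replacements.
    for r in replacements:
        text = text.replace(r["pattern"], r["replacement"])
    return text

df["out"] = df["text"].apply(apply_replacements)
print(df["out"][0])  # Mister Scrooge and Mister Marley
```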
diff --git a/graphrag/index/verbs/text/replace/typing.py b/graphrag/index/verbs/text/replace/typing.py
deleted file mode 100644
index 45beef9f..00000000
--- a/graphrag/index/verbs/text/replace/typing.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing 'Replacement' model."""
-
-from dataclasses import dataclass
-
-
-@dataclass
-class Replacement:
-    """Replacement class definition."""
-
-    pattern: str
-    replacement: str
diff --git a/graphrag/index/verbs/text/split.py b/graphrag/index/verbs/text/split.py
deleted file mode 100644
index b1339ff4..00000000
--- a/graphrag/index/verbs/text/split.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing the text_split method definition."""
-
-from typing import cast
-
-import pandas as pd
-from datashaper import TableContainer, VerbInput, verb
-
-
-@verb(name="text_split")
-def text_split(
-    input: VerbInput,
-    column: str,
-    to: str,
-    separator: str = ",",
-    **_kwargs: dict,
-) -> TableContainer:
-    """
-    Split a piece of text into a list of strings based on a delimiter. The verb outputs a new column containing a list of strings.
-
-    ## Usage
-
-    ```yaml
-    verb: text_split
-    args:
-        column: text # The name of the column containing the text to split
-        to: split_text # The name of the column to output the split text to
-        separator: "," # The separator to split the text on, defaults to ","
-    ```
-    """
-    output = text_split_df(cast(pd.DataFrame, input.get_input()), column, to, separator)
-    return TableContainer(table=output)
-
-
-def text_split_df(
-    input: pd.DataFrame, column: str, to: str, separator: str = ","
-) -> pd.DataFrame:
-    """Split a column into a list of strings."""
-    output = input
-
-    def _apply_split(row):
-        if row[column] is None or isinstance(row[column], list):
-            return row[column]
-        if row[column] == "":
-            return []
-        if not isinstance(row[column], str):
-            message = f"Expected {column} to be a string, but got {type(row[column])}"
-            raise TypeError(message)
-        return row[column].split(separator)
-
-    output[to] = output.apply(_apply_split, axis=1)
-    return output
diff --git a/graphrag/index/verbs/text/translate/__init__.py b/graphrag/index/verbs/text/translate/__init__.py
deleted file mode 100644
index ad830dfa..00000000
--- a/graphrag/index/verbs/text/translate/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""The Indexing Engine text translate package root."""
-
-from .text_translate import text_translate
-
-__all__ = ["text_translate"]
diff --git a/graphrag/index/verbs/text/translate/strategies/__init__.py b/graphrag/index/verbs/text/translate/strategies/__init__.py
deleted file mode 100644
index d418bbae..00000000
--- a/graphrag/index/verbs/text/translate/strategies/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""The Indexing Engine translate strategies package root."""
-
-from .mock import run as run_mock
-from .openai import run as run_openai
-
-__all__ = ["run_mock", "run_openai"]
diff --git a/graphrag/index/verbs/text/translate/strategies/defaults.py b/graphrag/index/verbs/text/translate/strategies/defaults.py
deleted file mode 100644
index 003e00eb..00000000
--- a/graphrag/index/verbs/text/translate/strategies/defaults.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A file containing TRANSLATION_PROMPT value definition."""
-
-TRANSLATION_PROMPT = """
-    You are a helpful assistant. Translate into {language} the following text, and make sure all of the text is in {language}.
-    """.strip()
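The text_split verb deleted a little earlier in this diff has edge-case handling that is easy to miss: None and list values pass through, an empty string becomes an empty list, and everything else splits on the separator. A minimal sketch of that logic with hypothetical values:

```python
import pandas as pd

df = pd.DataFrame({"text": ["a,b", "", None]})

def split_cell(value):
    # Mirrors the deleted text_split_df edge cases: None/list pass through,
    # an empty string becomes an empty list, otherwise split on the separator.
    if value is None or isinstance(value, list):
        return value
    if value == "":
        return []
    return value.split(",")

df["out"] = df["text"].apply(split_cell)
print(df["out"].tolist())  # [['a', 'b'], [], None]
```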
- """.strip() diff --git a/graphrag/index/verbs/text/translate/strategies/mock.py b/graphrag/index/verbs/text/translate/strategies/mock.py deleted file mode 100644 index 58a5a999..00000000 --- a/graphrag/index/verbs/text/translate/strategies/mock.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing run and _summarize_text methods definitions.""" - -from typing import Any - -from datashaper import VerbCallbacks - -from graphrag.index.cache import PipelineCache - -from .typing import TextTranslationResult - - -async def run( # noqa RUF029 async is required for interface - input: str | list[str], - _args: dict[str, Any], - _reporter: VerbCallbacks, - _cache: PipelineCache, -) -> TextTranslationResult: - """Run the Claim extraction chain.""" - input = [input] if isinstance(input, str) else input - return TextTranslationResult(translations=[_translate_text(text) for text in input]) - - -def _translate_text(text: str) -> str: - """Translate a single piece of text.""" - return f"{text} translated" diff --git a/graphrag/index/verbs/text/translate/strategies/openai.py b/graphrag/index/verbs/text/translate/strategies/openai.py deleted file mode 100644 index 49c47b34..00000000 --- a/graphrag/index/verbs/text/translate/strategies/openai.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing run, _translate_text and _create_translation_prompt methods definition.""" - -import logging -import traceback -from typing import Any - -from datashaper import VerbCallbacks - -import graphrag.config.defaults as defs -from graphrag.config.enums import LLMType -from graphrag.index.cache import PipelineCache -from graphrag.index.llm import load_llm -from graphrag.index.text_splitting import TokenTextSplitter -from graphrag.llm import CompletionLLM - -from .defaults import TRANSLATION_PROMPT as DEFAULT_TRANSLATION_PROMPT -from .typing import TextTranslationResult - -log = logging.getLogger(__name__) - - -async def run( - input: str | list[str], - args: dict[str, Any], - callbacks: VerbCallbacks, - pipeline_cache: PipelineCache, -) -> TextTranslationResult: - """Run the Claim extraction chain.""" - llm_config = args.get("llm", {"type": LLMType.StaticResponse}) - llm_type = llm_config.get("type", LLMType.StaticResponse) - llm = load_llm( - "text_translation", - llm_type, - callbacks, - pipeline_cache, - llm_config, - chat_only=True, - ) - language = args.get("language", "English") - prompt = args.get("prompt") - chunk_size = args.get("chunk_size", defs.CHUNK_SIZE) - chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP) - - input = [input] if isinstance(input, str) else input - return TextTranslationResult( - translations=[ - await _translate_text( - text, language, prompt, llm, chunk_size, chunk_overlap, callbacks - ) - for text in input - ] - ) - - -async def _translate_text( - text: str, - language: str, - prompt: str | None, - llm: CompletionLLM, - chunk_size: int, - chunk_overlap: int, - callbacks: VerbCallbacks, -) -> str: - """Translate a single piece of text.""" - splitter = TokenTextSplitter( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - ) - - out = "" - chunks = splitter.split_text(text) - for chunk in chunks: - try: - result = await llm( - chunk, - history=[ - { - "role": "system", - "content": (prompt or DEFAULT_TRANSLATION_PROMPT), - } - ], - variables={"language": language}, - ) - out += result.output or "" - except 
diff --git a/graphrag/index/verbs/text/translate/strategies/openai.py b/graphrag/index/verbs/text/translate/strategies/openai.py
deleted file mode 100644
index 49c47b34..00000000
--- a/graphrag/index/verbs/text/translate/strategies/openai.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing run and _translate_text methods definition."""
-
-import logging
-import traceback
-from typing import Any
-
-from datashaper import VerbCallbacks
-
-import graphrag.config.defaults as defs
-from graphrag.config.enums import LLMType
-from graphrag.index.cache import PipelineCache
-from graphrag.index.llm import load_llm
-from graphrag.index.text_splitting import TokenTextSplitter
-from graphrag.llm import CompletionLLM
-
-from .defaults import TRANSLATION_PROMPT as DEFAULT_TRANSLATION_PROMPT
-from .typing import TextTranslationResult
-
-log = logging.getLogger(__name__)
-
-
-async def run(
-    input: str | list[str],
-    args: dict[str, Any],
-    callbacks: VerbCallbacks,
-    pipeline_cache: PipelineCache,
-) -> TextTranslationResult:
-    """Run the text translation chain."""
-    llm_config = args.get("llm", {"type": LLMType.StaticResponse})
-    llm_type = llm_config.get("type", LLMType.StaticResponse)
-    llm = load_llm(
-        "text_translation",
-        llm_type,
-        callbacks,
-        pipeline_cache,
-        llm_config,
-        chat_only=True,
-    )
-    language = args.get("language", "English")
-    prompt = args.get("prompt")
-    chunk_size = args.get("chunk_size", defs.CHUNK_SIZE)
-    chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)
-
-    input = [input] if isinstance(input, str) else input
-    return TextTranslationResult(
-        translations=[
-            await _translate_text(
-                text, language, prompt, llm, chunk_size, chunk_overlap, callbacks
-            )
-            for text in input
-        ]
-    )
-
-
-async def _translate_text(
-    text: str,
-    language: str,
-    prompt: str | None,
-    llm: CompletionLLM,
-    chunk_size: int,
-    chunk_overlap: int,
-    callbacks: VerbCallbacks,
-) -> str:
-    """Translate a single piece of text."""
-    splitter = TokenTextSplitter(
-        chunk_size=chunk_size,
-        chunk_overlap=chunk_overlap,
-    )
-
-    out = ""
-    chunks = splitter.split_text(text)
-    for chunk in chunks:
-        try:
-            result = await llm(
-                chunk,
-                history=[
-                    {
-                        "role": "system",
-                        "content": (prompt or DEFAULT_TRANSLATION_PROMPT),
-                    }
-                ],
-                variables={"language": language},
-            )
-            out += result.output or ""
-        except Exception as e:
-            log.exception("error translating text")
-            callbacks.error("Error translating text", e, traceback.format_exc())
-            out += ""
-
-    return out
diff --git a/graphrag/index/verbs/text/translate/strategies/typing.py b/graphrag/index/verbs/text/translate/strategies/typing.py
deleted file mode 100644
index d91ed735..00000000
--- a/graphrag/index/verbs/text/translate/strategies/typing.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing 'TextTranslationResult' model."""
-
-from collections.abc import Awaitable, Callable
-from dataclasses import dataclass
-from typing import Any
-
-from datashaper import VerbCallbacks
-
-from graphrag.index.cache import PipelineCache
-
-
-@dataclass
-class TextTranslationResult:
-    """Text translation result class definition."""
-
-    translations: list[str]
-
-
-TextTranslationStrategy = Callable[
-    [list[str], dict[str, Any], VerbCallbacks, PipelineCache],
-    Awaitable[TextTranslationResult],
-]
diff --git a/graphrag/index/verbs/text/translate/text_translate.py b/graphrag/index/verbs/text/translate/text_translate.py
deleted file mode 100644
index 8d0faffe..00000000
--- a/graphrag/index/verbs/text/translate/text_translate.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing text_translate methods definition."""
-
-from enum import Enum
-from typing import Any, cast
-
-import pandas as pd
-from datashaper import (
-    AsyncType,
-    TableContainer,
-    VerbCallbacks,
-    VerbInput,
-    derive_from_rows,
-    verb,
-)
-
-from graphrag.index.cache import PipelineCache
-
-from .strategies.typing import TextTranslationStrategy
-
-
-class TextTranslateStrategyType(str, Enum):
-    """TextTranslateStrategyType class definition."""
-
-    openai = "openai"
-    mock = "mock"
-
-    def __repr__(self):
-        """Get a string representation."""
-        return f'"{self.value}"'
-
-
-@verb(name="text_translate")
-async def text_translate(
-    input: VerbInput,
-    cache: PipelineCache,
-    callbacks: VerbCallbacks,
-    text_column: str,
-    to: str,
-    strategy: dict[str, Any],
-    async_mode: AsyncType = AsyncType.AsyncIO,
-    **kwargs,
-) -> TableContainer:
-    """
-    Translate a piece of text into another language.
-
-    ## Usage
-    ```yaml
-    verb: text_translate
-    args:
-        text_column: <column name> # The name of the column containing the text to translate
-        to: <column name> # The name of the column to write the translated text to
-        strategy: <strategy config> # The strategy to use to translate the text, see below for more details
-    ```
-
-    ## Strategies
-    The text translate verb uses a strategy to translate the text. The strategy is an object which defines the strategy to use. The following strategies are available:
-
-    ### openai
-    This strategy uses an OpenAI LLM to translate a piece of text.
The strategy config is as follows: - - ```yaml - strategy: - type: openai - language: english # The language to translate to, default: english - prompt: # The prompt to use for the translation, default: None - chunk_size: 2500 # The chunk size to use for the translation, default: 2500 - chunk_overlap: 0 # The chunk overlap to use for the translation, default: 0 - llm: # The configuration for the LLM - type: openai_chat # the type of llm to use, available options are: openai_chat, azure_openai_chat - api_key: !ENV ${GRAPHRAG_OPENAI_API_KEY} # The api key to use for openai - model: !ENV ${GRAPHRAG_OPENAI_MODEL:gpt-4-turbo-preview} # The model to use for openai - max_tokens: !ENV ${GRAPHRAG_MAX_TOKENS:6000} # The max tokens to use for openai - organization: !ENV ${GRAPHRAG_OPENAI_ORGANIZATION} # The organization to use for openai - ``` - """ - output_df = cast(pd.DataFrame, input.get_input()) - strategy_type = strategy["type"] - strategy_args = {**strategy} - strategy_exec = _load_strategy(strategy_type) - - async def run_strategy(row): - text = row[text_column] - result = await strategy_exec(text, strategy_args, callbacks, cache) - - # If it is a single string, then return just the translation for that string - if isinstance(text, str): - return result.translations[0] - - # Otherwise, return a list of translations, one for each item in the input - return list(result.translations) - - results = await derive_from_rows( - output_df, - run_strategy, - callbacks, - scheduling_type=async_mode, - num_threads=kwargs.get("num_threads", 4), - ) - output_df[to] = results - return TableContainer(table=output_df) - - -def _load_strategy(strategy: TextTranslateStrategyType) -> TextTranslationStrategy: - match strategy: - case TextTranslateStrategyType.openai: - from .strategies.openai import run as run_openai - - return run_openai - - case TextTranslateStrategyType.mock: - from .strategies.mock import run as run_mock - - return run_mock - - case _: - msg = f"Unknown strategy: {strategy}" - raise ValueError(msg) diff --git a/graphrag/index/verbs/unzip.py b/graphrag/index/verbs/unzip.py deleted file mode 100644 index 4d8c8da0..00000000 --- a/graphrag/index/verbs/unzip.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing unzip method definition.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - - -# TODO: Check if this is already a thing -# Takes 1|(x,y)|b -# and converts to -# 1|x|y|b -@verb(name="unzip") -def unzip( - input: VerbInput, column: str, to: list[str], **_kwargs: dict -) -> TableContainer: - """Unpacks a column containing a tuple into multiple columns.""" - table = cast(pd.DataFrame, input.get_input()) - - table[to] = pd.DataFrame(table[column].tolist(), index=table.index) - - return TableContainer(table=table) diff --git a/graphrag/index/verbs/zip.py b/graphrag/index/verbs/zip.py deleted file mode 100644 index 462395d3..00000000 --- a/graphrag/index/verbs/zip.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing ds_zip method definition.""" - -from typing import cast - -import pandas as pd -from datashaper import TableContainer, VerbInput, verb - - -@verb(name="zip") -def zip_verb( - input: VerbInput, - to: str, - columns: list[str], - type: str | None = None, # noqa A002 - **_kwargs: dict, -) -> TableContainer: - """ - Zip columns together. 
- - ## Usage - TODO - - """ - table = cast(pd.DataFrame, input.get_input()) - if type is None: - table[to] = list(zip(*[table[col] for col in columns], strict=True)) - - # This one is a little weird - elif type == "dict": - if len(columns) != 2: - msg = f"Expected exactly two columns for a dict, got {columns}" - raise ValueError(msg) - key_col, value_col = columns - - results = [] - for _, row in table.iterrows(): - keys = row[key_col] - values = row[value_col] - output = {} - if len(keys) != len(values): - msg = f"Expected same number of keys and values, got {len(keys)} keys and {len(values)} values" - raise ValueError(msg) - for idx, key in enumerate(keys): - output[key] = values[idx] - results.append(output) - - table[to] = results - return TableContainer(table=table.reset_index(drop=True)) diff --git a/graphrag/index/workflows/v1/create_base_entity_graph.py b/graphrag/index/workflows/v1/create_base_entity_graph.py index 351b2b67..dadaece5 100644 --- a/graphrag/index/workflows/v1/create_base_entity_graph.py +++ b/graphrag/index/workflows/v1/create_base_entity_graph.py @@ -21,7 +21,7 @@ def build_steps( "cluster_graph", {"strategy": {"type": "leiden"}}, ) - clustering_strategy = clustering_config["strategy"] + clustering_strategy = clustering_config.get("strategy") embed_graph_config = config.get( "embed_graph", @@ -36,7 +36,7 @@ def build_steps( } }, ) - embedding_strategy = embed_graph_config["strategy"] + embedding_strategy = embed_graph_config.get("strategy") embed_graph_enabled = config.get("embed_graph_enabled", False) or False graphml_snapshot_enabled = config.get("graphml_snapshot", False) or False diff --git a/graphrag/index/workflows/v1/create_base_extracted_entities.py b/graphrag/index/workflows/v1/create_base_extracted_entities.py index 7d0ea603..e18266b3 100644 --- a/graphrag/index/workflows/v1/create_base_extracted_entities.py +++ b/graphrag/index/workflows/v1/create_base_extracted_entities.py @@ -24,7 +24,7 @@ def build_steps( column = entity_extraction_config.get("text_column", "chunk") id_column = entity_extraction_config.get("id_column", "chunk_id") async_mode = entity_extraction_config.get("async_mode", AsyncType.AsyncIO) - strategy = entity_extraction_config.get("strategy") + extraction_strategy = entity_extraction_config.get("strategy") num_threads = entity_extraction_config.get("num_threads", 4) entity_types = entity_extraction_config.get("entity_types") @@ -71,7 +71,7 @@ def build_steps( "column": column, "id_column": id_column, "async_mode": async_mode, - "strategy": strategy, + "extraction_strategy": extraction_strategy, "num_threads": num_threads, "entity_types": entity_types, "nodes": nodes, diff --git a/graphrag/index/workflows/v1/create_base_text_units.py b/graphrag/index/workflows/v1/create_base_text_units.py index da2d1374..5a1d470c 100644 --- a/graphrag/index/workflows/v1/create_base_text_units.py +++ b/graphrag/index/workflows/v1/create_base_text_units.py @@ -22,7 +22,8 @@ def build_steps( chunk_column_name = config.get("chunk_column", "chunk") chunk_by_columns = config.get("chunk_by", []) or [] n_tokens_column_name = config.get("n_tokens_column", "n_tokens") - text_chunk = config.get("text_chunk", {}) + text_chunk_config = config.get("text_chunk", {}) + chunk_strategy = text_chunk_config.get("strategy") return [ { "verb": "create_base_text_units", @@ -30,7 +31,7 @@ def build_steps( "chunk_column_name": chunk_column_name, "n_tokens_column_name": n_tokens_column_name, "chunk_by_columns": chunk_by_columns, - **text_chunk, + "chunk_strategy": chunk_strategy, 
}, "input": {"source": DEFAULT_INPUT_NAME}, }, diff --git a/graphrag/index/workflows/v1/create_final_community_reports.py b/graphrag/index/workflows/v1/create_final_community_reports.py index 5e933c8e..e859397d 100644 --- a/graphrag/index/workflows/v1/create_final_community_reports.py +++ b/graphrag/index/workflows/v1/create_final_community_reports.py @@ -19,6 +19,10 @@ def build_steps( """ covariates_enabled = config.get("covariates_enabled", False) create_community_reports_config = config.get("create_community_reports", {}) + summarization_strategy = create_community_reports_config.get("strategy") + async_mode = create_community_reports_config.get("async_mode") + num_threads = create_community_reports_config.get("num_threads") + base_text_embed = config.get("text_embed", {}) community_report_full_content_embed_config = config.get( "community_report_full_content_embed", base_text_embed @@ -43,9 +47,6 @@ def build_steps( { "verb": "create_final_community_reports", "args": { - "skip_full_content_embedding": skip_full_content_embedding, - "skip_summary_embedding": skip_summary_embedding, - "skip_title_embedding": skip_title_embedding, "full_content_text_embed": community_report_full_content_embed_config if not skip_full_content_embedding else None, @@ -55,7 +56,9 @@ def build_steps( "title_text_embed": community_report_title_embed_config if not skip_title_embedding else None, - **create_community_reports_config, + "summarization_strategy": summarization_strategy, + "async_mode": async_mode, + "num_threads": num_threads, }, "input": input, }, diff --git a/graphrag/index/workflows/v1/create_final_covariates.py b/graphrag/index/workflows/v1/create_final_covariates.py index 2b558547..1fdab708 100644 --- a/graphrag/index/workflows/v1/create_final_covariates.py +++ b/graphrag/index/workflows/v1/create_final_covariates.py @@ -3,7 +3,9 @@ """A module containing build_steps method definition.""" -from datashaper import AsyncType +from datashaper import ( + AsyncType, +) from graphrag.index.config import PipelineWorkflowConfig, PipelineWorkflowStep @@ -21,9 +23,13 @@ def build_steps( * `workflow:create_base_extracted_entities` """ claim_extract_config = config.get("claim_extract", {}) + extraction_strategy = claim_extract_config.get("strategy") + async_mode = claim_extract_config.get("async_mode", AsyncType.AsyncIO) + num_threads = claim_extract_config.get("num_threads") + chunk_column = config.get("chunk_column", "chunk") chunk_id_column = config.get("chunk_id_column", "chunk_id") - async_mode = config.get("async_mode", AsyncType.AsyncIO) + return [ { "verb": "create_final_covariates", @@ -31,8 +37,9 @@ def build_steps( "column": chunk_column, "id_column": chunk_id_column, "covariate_type": "claim", + "extraction_strategy": extraction_strategy, "async_mode": async_mode, - **claim_extract_config, + "num_threads": num_threads, }, "input": {"source": "workflow:create_base_text_units"}, }, diff --git a/graphrag/index/workflows/v1/create_final_entities.py b/graphrag/index/workflows/v1/create_final_entities.py index d7393f4a..7242800f 100644 --- a/graphrag/index/workflows/v1/create_final_entities.py +++ b/graphrag/index/workflows/v1/create_final_entities.py @@ -22,6 +22,7 @@ def build_steps( entity_name_description_embed_config = config.get( "entity_name_description_embed", base_text_embed ) + skip_name_embedding = config.get("skip_name_embedding", False) skip_description_embedding = config.get("skip_description_embedding", False) @@ -29,8 +30,6 @@ def build_steps( { "verb": "create_final_entities", 
"args": { - "skip_name_embedding": skip_name_embedding, - "skip_description_embedding": skip_description_embedding, "name_text_embed": entity_name_embed_config if not skip_name_embedding else None, diff --git a/graphrag/index/workflows/v1/create_final_nodes.py b/graphrag/index/workflows/v1/create_final_nodes.py index 65ab516d..aecc804c 100644 --- a/graphrag/index/workflows/v1/create_final_nodes.py +++ b/graphrag/index/workflows/v1/create_final_nodes.py @@ -27,6 +27,7 @@ def build_steps( }, }, ) + layout_strategy = layout_graph_config.get("strategy") level_for_node_positions = config.get("level_for_node_positions", 0) return [ @@ -34,7 +35,7 @@ def build_steps( "id": "laid_out_entity_graph", "verb": "create_final_nodes", "args": { - **layout_graph_config, + "layout_strategy": layout_strategy, "level_for_node_positions": level_for_node_positions, "snapshot_top_level_nodes": snapshot_top_level_nodes, }, diff --git a/graphrag/index/workflows/v1/create_final_text_units.py b/graphrag/index/workflows/v1/create_final_text_units.py index 7453f5cc..0638253a 100644 --- a/graphrag/index/workflows/v1/create_final_text_units.py +++ b/graphrag/index/workflows/v1/create_final_text_units.py @@ -21,6 +21,7 @@ def build_steps( """ base_text_embed = config.get("text_embed", {}) text_unit_text_embed_config = config.get("text_unit_text_embed", base_text_embed) + skip_text_unit_embedding = config.get("skip_text_unit_embedding", False) covariates_enabled = config.get("covariates_enabled", False) diff --git a/graphrag/index/workflows/v1/create_summarized_entities.py b/graphrag/index/workflows/v1/create_summarized_entities.py index d4d95786..53821814 100644 --- a/graphrag/index/workflows/v1/create_summarized_entities.py +++ b/graphrag/index/workflows/v1/create_summarized_entities.py @@ -18,8 +18,8 @@ def build_steps( * `workflow:create_base_text_units` """ summarize_descriptions_config = config.get("summarize_descriptions", {}) - strategy = summarize_descriptions_config.get("strategy", {}) - num_threads = strategy.get("num_threads", 4) + summarization_strategy = summarize_descriptions_config.get("strategy") + num_threads = summarize_descriptions_config.get("num_threads", 4) graphml_snapshot_enabled = config.get("graphml_snapshot", False) or False @@ -27,7 +27,7 @@ def build_steps( { "verb": "create_summarized_entities", "args": { - "strategy": strategy, + "summarization_strategy": summarization_strategy, "num_threads": num_threads, "graphml_snapshot_enabled": graphml_snapshot_enabled, }, diff --git a/graphrag/index/workflows/v1/subflows/create_base_documents.py b/graphrag/index/workflows/v1/subflows/create_base_documents.py index 6e682e67..c3e52098 100644 --- a/graphrag/index/workflows/v1/subflows/create_base_documents.py +++ b/graphrag/index/workflows/v1/subflows/create_base_documents.py @@ -29,7 +29,9 @@ def create_base_documents( source = cast(pd.DataFrame, input.get_input()) text_units = cast(pd.DataFrame, get_required_input_table(input, "text_units").table) - output = create_base_documents_flow(source, text_units, document_attribute_columns) + output = create_base_documents_flow( + source, text_units, document_attribute_columns=document_attribute_columns + ) return create_verb_result( cast( diff --git a/graphrag/index/workflows/v1/subflows/create_base_entity_graph.py b/graphrag/index/workflows/v1/subflows/create_base_entity_graph.py index ea1630dd..009da03f 100644 --- a/graphrag/index/workflows/v1/subflows/create_base_entity_graph.py +++ b/graphrag/index/workflows/v1/subflows/create_base_entity_graph.py @@ 
-42,7 +42,7 @@ async def create_base_entity_graph( storage, clustering_strategy, embedding_strategy, - graphml_snapshot_enabled, + graphml_snapshot_enabled=graphml_snapshot_enabled, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/index/workflows/v1/subflows/create_base_extracted_entities.py b/graphrag/index/workflows/v1/subflows/create_base_extracted_entities.py index 846467f3..34660e01 100644 --- a/graphrag/index/workflows/v1/subflows/create_base_extracted_entities.py +++ b/graphrag/index/workflows/v1/subflows/create_base_extracted_entities.py @@ -25,14 +25,14 @@ from graphrag.index.storage import PipelineStorage @verb(name="create_base_extracted_entities", treats_input_tables_as_immutable=True) async def create_base_extracted_entities( input: VerbInput, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, storage: PipelineStorage, column: str, id_column: str, nodes: dict[str, Any], edges: dict[str, Any], - strategy: dict[str, Any] | None, + extraction_strategy: dict[str, Any] | None, async_mode: AsyncType = AsyncType.AsyncIO, entity_types: list[str] | None = None, num_threads: int = 4, @@ -45,18 +45,18 @@ async def create_base_extracted_entities( output = await create_base_extracted_entities_flow( source, - cache, callbacks, + cache, storage, column, id_column, nodes, edges, - strategy, - async_mode, - entity_types, - graphml_snapshot_enabled, - raw_entity_snapshot_enabled, + extraction_strategy, + async_mode=async_mode, + entity_types=entity_types, + graphml_snapshot_enabled=graphml_snapshot_enabled, + raw_entity_snapshot_enabled=raw_entity_snapshot_enabled, num_threads=num_threads, ) diff --git a/graphrag/index/workflows/v1/subflows/create_base_text_units.py b/graphrag/index/workflows/v1/subflows/create_base_text_units.py index 370abc59..18c65008 100644 --- a/graphrag/index/workflows/v1/subflows/create_base_text_units.py +++ b/graphrag/index/workflows/v1/subflows/create_base_text_units.py @@ -26,7 +26,7 @@ def create_base_text_units( chunk_column_name: str, n_tokens_column_name: str, chunk_by_columns: list[str], - strategy: dict[str, Any] | None = None, + chunk_strategy: dict[str, Any] | None = None, **_kwargs: dict, ) -> VerbResult: """All the steps to transform base text_units.""" @@ -38,7 +38,7 @@ def create_base_text_units( chunk_column_name, n_tokens_column_name, chunk_by_columns, - strategy, + chunk_strategy=chunk_strategy, ) return create_verb_result( diff --git a/graphrag/index/workflows/v1/subflows/create_final_community_reports.py b/graphrag/index/workflows/v1/subflows/create_final_community_reports.py index 48074412..e44f6e65 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_community_reports.py +++ b/graphrag/index/workflows/v1/subflows/create_final_community_reports.py @@ -27,7 +27,7 @@ async def create_final_community_reports( input: VerbInput, callbacks: VerbCallbacks, cache: PipelineCache, - strategy: dict, + summarization_strategy: dict, async_mode: AsyncType = AsyncType.AsyncIO, num_threads: int = 4, full_content_text_embed: dict | None = None, @@ -49,12 +49,12 @@ async def create_final_community_reports( claims, callbacks, cache, - strategy, - async_mode, - num_threads, - full_content_text_embed, - summary_text_embed, - title_text_embed, + summarization_strategy, + async_mode=async_mode, + num_threads=num_threads, + full_content_text_embed=full_content_text_embed, + summary_text_embed=summary_text_embed, + title_text_embed=title_text_embed, ) return create_verb_result( diff --git 
a/graphrag/index/workflows/v1/subflows/create_final_covariates.py b/graphrag/index/workflows/v1/subflows/create_final_covariates.py index 8a6c3c90..d0812bc5 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_covariates.py +++ b/graphrag/index/workflows/v1/subflows/create_final_covariates.py @@ -24,11 +24,11 @@ from graphrag.index.flows.create_final_covariates import ( @verb(name="create_final_covariates", treats_input_tables_as_immutable=True) async def create_final_covariates( input: VerbInput, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, column: str, covariate_type: str, - strategy: dict[str, Any] | None, + extraction_strategy: dict[str, Any] | None, async_mode: AsyncType = AsyncType.AsyncIO, entity_types: list[str] | None = None, num_threads: int = 4, @@ -39,14 +39,14 @@ async def create_final_covariates( output = await create_final_covariates_flow( source, - cache, callbacks, + cache, column, covariate_type, - strategy, - async_mode, - entity_types, - num_threads, + extraction_strategy, + async_mode=async_mode, + entity_types=entity_types, + num_threads=num_threads, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/index/workflows/v1/subflows/create_final_documents.py b/graphrag/index/workflows/v1/subflows/create_final_documents.py index 2f665df7..bc883552 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_documents.py +++ b/graphrag/index/workflows/v1/subflows/create_final_documents.py @@ -38,7 +38,7 @@ async def create_final_documents( source, callbacks, cache, - raw_content_text_embed, + raw_content_text_embed=raw_content_text_embed, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/index/workflows/v1/subflows/create_final_entities.py b/graphrag/index/workflows/v1/subflows/create_final_entities.py index 54a10ebf..fa3ae898 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_entities.py +++ b/graphrag/index/workflows/v1/subflows/create_final_entities.py @@ -28,8 +28,8 @@ async def create_final_entities( input: VerbInput, callbacks: VerbCallbacks, cache: PipelineCache, - name_text_embed: dict, - description_text_embed: dict, + name_text_embed: dict | None = None, + description_text_embed: dict | None = None, **_kwargs: dict, ) -> VerbResult: """All the steps to transform final entities.""" @@ -39,8 +39,8 @@ async def create_final_entities( source, callbacks, cache, - name_text_embed, - description_text_embed, + name_text_embed=name_text_embed, + description_text_embed=description_text_embed, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/index/workflows/v1/subflows/create_final_nodes.py b/graphrag/index/workflows/v1/subflows/create_final_nodes.py index 9a1754cb..8971652d 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_nodes.py +++ b/graphrag/index/workflows/v1/subflows/create_final_nodes.py @@ -25,7 +25,7 @@ async def create_final_nodes( input: VerbInput, callbacks: VerbCallbacks, storage: PipelineStorage, - strategy: dict[str, Any], + layout_strategy: dict[str, Any], level_for_node_positions: int, snapshot_top_level_nodes: bool = False, **_kwargs: dict, @@ -37,9 +37,9 @@ async def create_final_nodes( source, callbacks, storage, - strategy, + layout_strategy, level_for_node_positions, - snapshot_top_level_nodes, + snapshot_top_level_nodes=snapshot_top_level_nodes, ) return create_verb_result( diff --git a/graphrag/index/workflows/v1/subflows/create_final_relationships.py b/graphrag/index/workflows/v1/subflows/create_final_relationships.py 
index d28a535c..24222846 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_relationships.py +++ b/graphrag/index/workflows/v1/subflows/create_final_relationships.py @@ -41,7 +41,7 @@ async def create_final_relationships( nodes, callbacks, cache, - description_text_embed, + description_text_embed=description_text_embed, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/index/workflows/v1/subflows/create_final_text_units.py b/graphrag/index/workflows/v1/subflows/create_final_text_units.py index 15d592c1..14bd8399 100644 --- a/graphrag/index/workflows/v1/subflows/create_final_text_units.py +++ b/graphrag/index/workflows/v1/subflows/create_final_text_units.py @@ -50,7 +50,7 @@ async def create_final_text_units( final_covariates, callbacks, cache, - text_text_embed, + text_text_embed=text_text_embed, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/index/workflows/v1/subflows/create_summarized_entities.py b/graphrag/index/workflows/v1/subflows/create_summarized_entities.py index 2d5c917d..4ac2dd5b 100644 --- a/graphrag/index/workflows/v1/subflows/create_summarized_entities.py +++ b/graphrag/index/workflows/v1/subflows/create_summarized_entities.py @@ -27,10 +27,10 @@ from graphrag.index.storage import PipelineStorage ) async def create_summarized_entities( input: VerbInput, - cache: PipelineCache, callbacks: VerbCallbacks, + cache: PipelineCache, storage: PipelineStorage, - strategy: dict[str, Any] | None = None, + summarization_strategy: dict[str, Any] | None = None, num_threads: int = 4, graphml_snapshot_enabled: bool = False, **_kwargs: dict, @@ -40,12 +40,12 @@ async def create_summarized_entities( output = await create_summarized_entities_flow( source, - cache, callbacks, + cache, storage, - strategy, - num_threads, - graphml_snapshot_enabled, + summarization_strategy, + num_threads=num_threads, + graphml_snapshot_enabled=graphml_snapshot_enabled, ) return create_verb_result(cast(Table, output)) diff --git a/graphrag/llm/openai/openai_completion_llm.py b/graphrag/llm/openai/openai_completion_llm.py index bdbac6c1..74511c02 100644 --- a/graphrag/llm/openai/openai_completion_llm.py +++ b/graphrag/llm/openai/openai_completion_llm.py @@ -39,5 +39,5 @@ class OpenAICompletionLLM(BaseLLM[CompletionInput, CompletionOutput]): args = get_completion_llm_args( kwargs.get("model_parameters"), self.configuration ) - completion = self.client.completions.create(prompt=input, **args) + completion = await self.client.completions.create(prompt=input, **args) return completion.choices[0].text diff --git a/graphrag/prompt_tune/loader/input.py b/graphrag/prompt_tune/loader/input.py index 06799905..a61f0a47 100644 --- a/graphrag/prompt_tune/loader/input.py +++ b/graphrag/prompt_tune/loader/input.py @@ -3,18 +3,16 @@ """Input loading module.""" -from typing import cast - import numpy as np import pandas as pd -from datashaper import NoopVerbCallbacks, TableContainer, VerbInput +from datashaper import NoopVerbCallbacks import graphrag.config.defaults as defs from graphrag.config.models.graph_rag_config import GraphRagConfig from graphrag.index.input import load_input from graphrag.index.llm import load_llm_embeddings +from graphrag.index.operations.chunk_text import chunk_text from graphrag.index.progress.types import ProgressReporter -from graphrag.index.verbs import chunk from graphrag.llm.types.llm_types import EmbeddingLLM from graphrag.prompt_tune.types import DocSelectionType @@ -62,23 +60,20 @@ async def load_docs_in_chunks( dataset = await 
load_input(config.input, reporter, root)
 
     # convert to text units
-    input = VerbInput(input=TableContainer(table=dataset))
     chunk_strategy = config.chunks.resolved_strategy(defs.ENCODING_MODEL)
     # Use smaller chunks, to avoid huge prompts
     chunk_strategy["chunk_size"] = chunk_size
     chunk_strategy["chunk_overlap"] = MIN_CHUNK_OVERLAP
 
-    dataset_chunks_table_container = chunk(
-        input,
+    dataset_chunks = chunk_text(
+        dataset,
         column="text",
         to="chunks",
         callbacks=NoopVerbCallbacks(),
         strategy=chunk_strategy,
     )
 
-    dataset_chunks = cast(pd.DataFrame, dataset_chunks_table_container.table)
-
     # Select chunks into a new df and explode it
     chunks_df = pd.DataFrame(dataset_chunks["chunks"].explode())  # type: ignore
diff --git a/graphrag/query/structured_search/global_search/community_context.py b/graphrag/query/structured_search/global_search/community_context.py
index d63320c8..f5991526 100644
--- a/graphrag/query/structured_search/global_search/community_context.py
+++ b/graphrag/query/structured_search/global_search/community_context.py
@@ -87,13 +87,21 @@ class GlobalCommunityContext(GlobalContextBuilder):
             context_name=context_name,
             random_state=self.random_state,
         )
-        if isinstance(community_context, list):
-            final_context = [
-                f"{conversation_history_context}\n\n{context}"
-                for context in community_context
-            ]
-        else:
-            final_context = f"{conversation_history_context}\n\n{community_context}"
+        # Prepare context_prefix based on whether conversation_history_context exists
+        context_prefix = (
+            f"{conversation_history_context}\n\n"
+            if conversation_history_context
+            else ""
+        )
+
+        final_context = (
+            [f"{context_prefix}{context}" for context in community_context]
+            if isinstance(community_context, list)
+            else f"{context_prefix}{community_context}"
+        )
+
+        # Update the final context data with the provided community_context_data
         final_context_data.update(community_context_data)
-        return (final_context, final_context_data)
+
+        return final_context, final_context_data
diff --git a/tests/integration/_pipeline/megapipeline.yml b/tests/integration/_pipeline/megapipeline.yml
index e8f51d26..a6004b9b 100644
--- a/tests/integration/_pipeline/megapipeline.yml
+++ b/tests/integration/_pipeline/megapipeline.yml
@@ -24,7 +24,29 @@ workflows:
       graphml_snapshot: True
       entity_extract:
         strategy:
-          type: nltk
+          type: graph_intelligence
+          llm:
+            type: static_response
+            responses:
+              - '("entity"<|>COMPANY_A<|>COMPANY<|>Company_A is a test company)
+                ##
+                ("entity"<|>COMPANY_B<|>COMPANY<|>Company_B owns Company_A and also shares an address with Company_A)
+                ##
+                ("entity"<|>PERSON_C<|>PERSON<|>Person_C is director of Company_A)
+                ##
+                ("relationship"<|>COMPANY_A<|>COMPANY_B<|>Company_A and Company_B are related because Company_A is 100% owned by Company_B and the two companies also share the same address)<|>2)
+                ##
+                ("relationship"<|>COMPANY_A<|>PERSON_C<|>Company_A and Person_C are related because Person_C is director of Company_A<|>1))'
+
+  - name: create_summarized_entities
+    config:
+      summarize_descriptions:
+        strategy:
+          type: graph_intelligence
+          llm:
+            type: static_response
+            responses:
+              - This is a MOCK response for the LLM. It is summarized!
 
   - name: create_base_entity_graph
     config:
diff --git a/tests/unit/indexing/verbs/entities/extraction/strategies/graph_intelligence/test_gi_entity_extraction.py b/tests/unit/indexing/verbs/entities/extraction/strategies/graph_intelligence/test_gi_entity_extraction.py
index 1ebca2bf..31a83a26 100644
--- a/tests/unit/indexing/verbs/entities/extraction/strategies/graph_intelligence/test_gi_entity_extraction.py
+++ b/tests/unit/indexing/verbs/entities/extraction/strategies/graph_intelligence/test_gi_entity_extraction.py
@@ -4,10 +4,12 @@
 import unittest
 
 import networkx as nx
 
-from graphrag.index.verbs.entities.extraction.strategies.graph_intelligence.run_graph_intelligence import (
-    Document,
+from graphrag.index.operations.extract_entities.strategies.graph_intelligence import (
     run_extract_entities,
 )
+from graphrag.index.operations.extract_entities.strategies.typing import (
+    Document,
+)
 from tests.unit.indexing.verbs.helpers.mock_llm import create_mock_llm
@@ -16,7 +18,7 @@ class TestRunChain(unittest.IsolatedAsyncioTestCase):
         results = await run_extract_entities(
             docs=[Document("test_text", "1")],
             entity_types=["person"],
-            reporter=None,
+            callbacks=None,
             args={
                 "prechunked": True,
                 "max_gleanings": 0,
@@ -51,7 +53,7 @@ class TestRunChain(unittest.IsolatedAsyncioTestCase):
         results = await run_extract_entities(
             docs=[Document("text_1", "1"), Document("text_2", "2")],
             entity_types=["person"],
-            reporter=None,
+            callbacks=None,
             args={
                 "prechunked": True,
                 "max_gleanings": 0,
@@ -88,7 +90,7 @@ class TestRunChain(unittest.IsolatedAsyncioTestCase):
         results = await run_extract_entities(
             docs=[Document("text_1", "1"), Document("text_2", "2")],
             entity_types=["person"],
-            reporter=None,
+            callbacks=None,
             args={
                 "prechunked": True,
                 "max_gleanings": 0,
@@ -133,7 +135,7 @@ class TestRunChain(unittest.IsolatedAsyncioTestCase):
         results = await run_extract_entities(
             docs=[Document("text_1", "1"), Document("text_2", "2")],
             entity_types=["person"],
-            reporter=None,
+            callbacks=None,
             args={
                 "prechunked": True,
                 "max_gleanings": 0,
@@ -181,7 +183,7 @@ class TestRunChain(unittest.IsolatedAsyncioTestCase):
         results = await run_extract_entities(
             docs=[Document("text_1", "1"), Document("text_2", "2")],
             entity_types=["person"],
-            reporter=None,
+            callbacks=None,
             args={
                 "prechunked": True,
                 "max_gleanings": 0,
diff --git a/tests/unit/indexing/verbs/text/test_split.py b/tests/unit/indexing/verbs/text/test_split.py
index d9ced064..abbb0eef 100644
--- a/tests/unit/indexing/verbs/text/test_split.py
+++ b/tests/unit/indexing/verbs/text/test_split.py
@@ -5,34 +5,34 @@ import unittest
 
 import pandas as pd
 import pytest
 
-from graphrag.index.verbs.text.split import text_split_df
+from graphrag.index.operations.split_text import split_text
 
 
 class TestTextSplit(unittest.TestCase):
     def test_empty_string(self):
         input = pd.DataFrame([{"in": ""}])
-        result = text_split_df(input, "in", "out", ",").to_dict(orient="records")
+        result = split_text(input, "in", "out", ",").to_dict(orient="records")
 
         assert len(result) == 1
         assert result[0]["out"] == []
 
     def test_string_without_seperator(self):
         input = pd.DataFrame([{"in": "test_string_without_seperator"}])
-        result = text_split_df(input, "in", "out", ",").to_dict(orient="records")
+        result = split_text(input, "in", "out", ",").to_dict(orient="records")
 
         assert len(result) == 1
         assert result[0]["out"] == ["test_string_without_seperator"]
 
     def test_string_with_seperator(self):
         input = pd.DataFrame([{"in": "test_1,test_2"}])
-        result = text_split_df(input, "in", "out", ",").to_dict(orient="records")
+        result = split_text(input, "in", "out", ",").to_dict(orient="records")
 
         assert len(result) == 1
         assert result[0]["out"] == ["test_1", "test_2"]
 
     def test_row_with_list_as_column(self):
         input = pd.DataFrame([{"in": ["test_1", "test_2"]}])
-        result = text_split_df(input, "in", "out", ",").to_dict(orient="records")
+        result = split_text(input, "in", "out", ",").to_dict(orient="records")
 
         assert len(result) == 1
         assert result[0]["out"] == ["test_1", "test_2"]
 
@@ -40,11 +40,11 @@
     def test_non_string_column_throws_error(self):
         input = pd.DataFrame([{"in": 5}])
         with pytest.raises(TypeError):
-            text_split_df(input, "in", "out", ",").to_dict(orient="records")
+            split_text(input, "in", "out", ",").to_dict(orient="records")
 
     def test_more_than_one_row_returns_correctly(self):
         input = pd.DataFrame([{"in": "row_1_1,row_1_2"}, {"in": "row_2_1,row_2_2"}])
-        result = text_split_df(input, "in", "out", ",").to_dict(orient="records")
+        result = split_text(input, "in", "out", ",").to_dict(orient="records")
 
         assert len(result) == 2
         assert result[0]["out"] == ["row_1_1", "row_1_2"]
diff --git a/tests/verbs/test_create_base_extracted_entities.py b/tests/verbs/test_create_base_extracted_entities.py
index 029126c1..57ca6003 100644
--- a/tests/verbs/test_create_base_extracted_entities.py
+++ b/tests/verbs/test_create_base_extracted_entities.py
@@ -2,7 +2,10 @@
 # Licensed under the MIT License
 
 import networkx as nx
+import pytest
+from datashaper.errors import VerbParallelizationError
 
+from graphrag.config.enums import LLMType
 from graphrag.index.storage.memory_pipeline_storage import MemoryPipelineStorage
 from graphrag.index.workflows.v1.create_base_extracted_entities import (
     build_steps,
@@ -16,6 +19,25 @@ from .util import (
     get_config_for_workflow,
     get_workflow_output,
     load_input_tables,
 )
 
+MOCK_LLM_RESPONSES = [
+    """
+    ("entity"<|>COMPANY_A<|>COMPANY<|>Company_A is a test company)
+    ##
+    ("entity"<|>COMPANY_B<|>COMPANY<|>Company_B owns Company_A and also shares an address with Company_A)
+    ##
+    ("entity"<|>PERSON_C<|>PERSON<|>Person_C is director of Company_A)
+    ##
+    ("relationship"<|>COMPANY_A<|>COMPANY_B<|>Company_A and Company_B are related because Company_A is 100% owned by Company_B and the two companies also share the same address)<|>2)
+    ##
+    ("relationship"<|>COMPANY_A<|>PERSON_C<|>Company_A and Person_C are related because Person_C is director of Company_A<|>1))
+    """.strip()
+]
+
+MOCK_LLM_CONFIG = {
+    "type": LLMType.StaticResponse,
+    "responses": MOCK_LLM_RESPONSES,
+}
+
 
 async def test_create_base_extracted_entities():
     input_tables = load_input_tables(["workflow:create_base_text_units"])
@@ -25,7 +47,7 @@ async def test_create_base_extracted_entities():
 
     config = get_config_for_workflow(workflow_name)
 
-    del config["entity_extract"]["strategy"]["llm"]
+    config["entity_extract"]["strategy"]["llm"] = MOCK_LLM_CONFIG
 
     steps = build_steps(config)
 
@@ -57,7 +79,7 @@ async def test_create_base_extracted_entities_with_snapshots():
 
     config = get_config_for_workflow(workflow_name)
 
-    del config["entity_extract"]["strategy"]["llm"]
+    config["entity_extract"]["strategy"]["llm"] = MOCK_LLM_CONFIG
 
     config["raw_entity_snapshot"] = True
     config["graphml_snapshot"] = True
@@ -76,3 +98,21 @@ async def test_create_base_extracted_entities_with_snapshots():
     assert actual.columns == expected.columns
 
     assert storage.keys() == ["raw_extracted_entities.json", "merged_graph.graphml"]
+
+
+async def test_create_base_extracted_entities_missing_llm_throws():
load_input_tables(["workflow:create_base_text_units"]) + + config = get_config_for_workflow(workflow_name) + + del config["entity_extract"]["strategy"]["llm"] + + steps = build_steps(config) + + with pytest.raises(VerbParallelizationError): + await get_workflow_output( + input_tables, + { + "steps": steps, + }, + ) diff --git a/tests/verbs/test_create_final_community_reports.py b/tests/verbs/test_create_final_community_reports.py index b4f5ecf7..cb846e24 100644 --- a/tests/verbs/test_create_final_community_reports.py +++ b/tests/verbs/test_create_final_community_reports.py @@ -1,6 +1,12 @@ # Copyright (c) 2024 Microsoft Corporation. # Licensed under the MIT License +import json + +import pytest +from datashaper.errors import VerbParallelizationError + +from graphrag.config.enums import LLMType from graphrag.index.workflows.v1.create_final_community_reports import ( build_steps, workflow_name, @@ -14,6 +20,27 @@ from .util import ( load_input_tables, ) +MOCK_RESPONSES = [ + json.dumps({ + "title": "", + "summary": "", + "rating": 2, + "rating_explanation": "", + "findings": [ + { + "summary": "", + "explanation": "", + "explanation": "GOVERNMENT AGENCY B<|>ANTI-COMPETITIVE PRACTICES<|>TRUE<|>2022-01-10T00:00:00<|>2022-01-10T00:00:00<|>Company A was found to engage in anti-competitive practices because it was fined for bid rigging in multiple public tenders published by Government Agency B according to an article published on 2022/01/10<|>According to an article published on 2022/01/10, Company A was fined for bid rigging while participating in multiple public tenders published by Government Agency B.) + """.strip() +] + +MOCK_LLM_CONFIG = {"type": LLMType.StaticResponse, "responses": MOCK_LLM_RESPONSES} + async def test_create_final_covariates(): input_tables = load_input_tables(["workflow:create_base_text_units"]) @@ -22,8 +33,7 @@ async def test_create_final_covariates(): config = get_config_for_workflow(workflow_name) - # deleting the llm config results in a default mock injection in run_gi_extract_claims - del config["claim_extract"]["strategy"]["llm"] + config["claim_extract"]["strategy"]["llm"] = MOCK_LLM_CONFIG steps = build_steps(config) @@ -66,3 +76,21 @@ async def test_create_final_covariates(): actual["source_text"][0] == "According to an article published on 2022/01/10, Company A was fined for bid rigging while participating in multiple public tenders published by Government Agency B." ) + + +async def test_create_final_covariates_missing_llm_throws(): + input_tables = load_input_tables(["workflow:create_base_text_units"]) + + config = get_config_for_workflow(workflow_name) + + del config["claim_extract"]["strategy"]["llm"] + + steps = build_steps(config) + + with pytest.raises(VerbParallelizationError): + await get_workflow_output( + input_tables, + { + "steps": steps, + }, + ) diff --git a/tests/verbs/test_create_summarized_entities.py b/tests/verbs/test_create_summarized_entities.py index 7d9ac9b5..c36c9b53 100644 --- a/tests/verbs/test_create_summarized_entities.py +++ b/tests/verbs/test_create_summarized_entities.py @@ -2,7 +2,9 @@ # Licensed under the MIT License import networkx as nx +import pytest +from graphrag.config.enums import LLMType from graphrag.index.storage.memory_pipeline_storage import MemoryPipelineStorage from graphrag.index.workflows.v1.create_summarized_entities import ( build_steps, @@ -16,6 +18,17 @@ from .util import ( load_input_tables, ) +MOCK_LLM_RESPONSES = [ + """ + This is a MOCK response for the LLM. It is summarized! 
+ """.strip() +] + +MOCK_LLM_CONFIG = { + "type": LLMType.StaticResponse, + "responses": MOCK_LLM_RESPONSES, +} + async def test_create_summarized_entities(): input_tables = load_input_tables([ @@ -27,7 +40,7 @@ async def test_create_summarized_entities(): config = get_config_for_workflow(workflow_name) - del config["summarize_descriptions"]["strategy"]["llm"] + config["summarize_descriptions"]["strategy"]["llm"] = MOCK_LLM_CONFIG steps = build_steps(config) @@ -76,7 +89,7 @@ async def test_create_summarized_entities_with_snapshots(): config = get_config_for_workflow(workflow_name) - del config["summarize_descriptions"]["strategy"]["llm"] + config["summarize_descriptions"]["strategy"]["llm"] = MOCK_LLM_CONFIG config["graphml_snapshot"] = True steps = build_steps(config) @@ -94,3 +107,23 @@ async def test_create_summarized_entities_with_snapshots(): assert storage.keys() == [ "summarized_graph.graphml", ], "Graph snapshot keys differ" + + +async def test_create_summarized_entities_missing_llm_throws(): + input_tables = load_input_tables([ + "workflow:create_base_extracted_entities", + ]) + + config = get_config_for_workflow(workflow_name) + + del config["summarize_descriptions"]["strategy"]["llm"] + + steps = build_steps(config) + + with pytest.raises(ValueError): # noqa PT011 + await get_workflow_output( + input_tables, + { + "steps": steps, + }, + ) diff --git a/tests/verbs/util.py b/tests/verbs/util.py index 4dff5399..2d6d5b6d 100644 --- a/tests/verbs/util.py +++ b/tests/verbs/util.py @@ -53,6 +53,7 @@ def get_config_for_workflow(name: str) -> PipelineWorkflowConfig: pipeline_config = create_pipeline_config(config) result = next(conf for conf in pipeline_config.workflows if conf.name == name) + return cast(PipelineWorkflowConfig, result.config)