diff --git a/.semversioner/next-release/minor-20250916182815141332.json b/.semversioner/next-release/minor-20250916182815141332.json new file mode 100644 index 00000000..666af666 --- /dev/null +++ b/.semversioner/next-release/minor-20250916182815141332.json @@ -0,0 +1,4 @@ +{ + "type": "minor", + "description": "Add config for NLP async mode." +} diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index dc84fc45..8d299cc4 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -214,6 +214,7 @@ class ExtractGraphNLPDefaults: normalize_edge_weights: bool = True text_analyzer: TextAnalyzerDefaults = field(default_factory=TextAnalyzerDefaults) concurrent_requests: int = 25 + async_mode: AsyncType = AsyncType.Threaded @dataclass diff --git a/graphrag/config/init_content.py b/graphrag/config/init_content.py index d08abb1c..1eb60cf1 100644 --- a/graphrag/config/init_content.py +++ b/graphrag/config/init_content.py @@ -112,6 +112,7 @@ summarize_descriptions: extract_graph_nlp: text_analyzer: extractor_type: {graphrag_config_defaults.extract_graph_nlp.text_analyzer.extractor_type.value} # [regex_english, syntactic_parser, cfg] + async_mode: {graphrag_config_defaults.extract_graph_nlp.async_mode.value} # or asyncio cluster_graph: max_cluster_size: {graphrag_config_defaults.cluster_graph.max_cluster_size} diff --git a/graphrag/config/models/extract_graph_nlp_config.py b/graphrag/config/models/extract_graph_nlp_config.py index 52b8a4eb..5ab587cf 100644 --- a/graphrag/config/models/extract_graph_nlp_config.py +++ b/graphrag/config/models/extract_graph_nlp_config.py @@ -6,7 +6,7 @@ from pydantic import BaseModel, Field from graphrag.config.defaults import graphrag_config_defaults -from graphrag.config.enums import NounPhraseExtractorType +from graphrag.config.enums import AsyncType, NounPhraseExtractorType class TextAnalyzerConfig(BaseModel): @@ -68,3 +68,7 @@ class ExtractGraphNLPConfig(BaseModel): description="The number of threads to use for the extraction process.", default=graphrag_config_defaults.extract_graph_nlp.concurrent_requests, ) + async_mode: AsyncType = Field( + description="The async mode to use.", + default=graphrag_config_defaults.extract_graph_nlp.async_mode, + ) diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py index 1c868594..dca2644c 100644 --- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py +++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py @@ -24,12 +24,17 @@ async def build_noun_graph( text_analyzer: BaseNounPhraseExtractor, normalize_edge_weights: bool, num_threads: int = 4, + async_mode: AsyncType = AsyncType.Threaded, cache: PipelineCache | None = None, ) -> tuple[pd.DataFrame, pd.DataFrame]: """Build a noun graph from text units.""" text_units = text_unit_df.loc[:, ["id", "text"]] nodes_df = await _extract_nodes( - text_units, text_analyzer, num_threads=num_threads, cache=cache + text_units, + text_analyzer, + num_threads=num_threads, + async_mode=async_mode, + cache=cache, ) edges_df = _extract_edges(nodes_df, normalize_edge_weights=normalize_edge_weights) return (nodes_df, edges_df) @@ -39,6 +44,7 @@ async def _extract_nodes( text_unit_df: pd.DataFrame, text_analyzer: BaseNounPhraseExtractor, num_threads: int = 4, + async_mode: AsyncType = AsyncType.Threaded, cache: PipelineCache | None = None, ) -> pd.DataFrame: """ @@ -64,7 +70,7 @@ async def _extract_nodes( text_unit_df, extract, num_threads=num_threads, - async_type=AsyncType.Threaded, + async_type=async_mode, progress_msg="extract noun phrases progress: ", ) diff --git a/graphrag/index/workflows/extract_graph_nlp.py b/graphrag/index/workflows/extract_graph_nlp.py index 0becdaa8..90afedf4 100644 --- a/graphrag/index/workflows/extract_graph_nlp.py +++ b/graphrag/index/workflows/extract_graph_nlp.py @@ -61,6 +61,7 @@ async def extract_graph_nlp( text_analyzer=text_analyzer, normalize_edge_weights=extraction_config.normalize_edge_weights, num_threads=extraction_config.concurrent_requests, + async_mode=extraction_config.async_mode, cache=cache, )