Configure async for NLP extraction (#2059)

* Make async mode configurable for NLP extraction
* Semver
2026-01-13 16:47:20 +08:00 · 2025-09-16 11:52:18 -07:00 · 2025-09-16 11:52:18 -07:00 · 6c66b7c30f
commit 6c66b7c30f
parent a398cc38bb
6 changed files with 20 additions and 3 deletions
--- a/.semversioner/next-release/minor-20250916182815141332.json
+++ b/.semversioner/next-release/minor-20250916182815141332.json
@ -0,0 +1,4 @@
+{
+  "type": "minor",
+  "description": "Add config for NLP async mode."
+}
--- a/graphrag/config/defaults.py
+++ b/graphrag/config/defaults.py
@ -214,6 +214,7 @@ class ExtractGraphNLPDefaults:
    normalize_edge_weights: bool = True
    text_analyzer: TextAnalyzerDefaults = field(default_factory=TextAnalyzerDefaults)
    concurrent_requests: int = 25
+    async_mode: AsyncType = AsyncType.Threaded


@dataclass
--- a/graphrag/config/init_content.py
+++ b/graphrag/config/init_content.py
@ -112,6 +112,7 @@ summarize_descriptions:
 extract_graph_nlp:
  text_analyzer:
    extractor_type: {graphrag_config_defaults.extract_graph_nlp.text_analyzer.extractor_type.value} # [regex_english, syntactic_parser, cfg]
+  async_mode: {graphrag_config_defaults.extract_graph_nlp.async_mode.value} # or asyncio

 cluster_graph:
  max_cluster_size: {graphrag_config_defaults.cluster_graph.max_cluster_size}
--- a/graphrag/config/models/extract_graph_nlp_config.py
+++ b/graphrag/config/models/extract_graph_nlp_config.py
@ -6,7 +6,7 @@
 from pydantic import BaseModel, Field

 from graphrag.config.defaults import graphrag_config_defaults
-from graphrag.config.enums import NounPhraseExtractorType
+from graphrag.config.enums import AsyncType, NounPhraseExtractorType


 class TextAnalyzerConfig(BaseModel):
@ -68,3 +68,7 @@ class ExtractGraphNLPConfig(BaseModel):
        description="The number of threads to use for the extraction process.",
        default=graphrag_config_defaults.extract_graph_nlp.concurrent_requests,
    )
+    async_mode: AsyncType = Field(
+        description="The async mode to use.",
+        default=graphrag_config_defaults.extract_graph_nlp.async_mode,
+    )
--- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py
+++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py
@ -24,12 +24,17 @@ async def build_noun_graph(
    text_analyzer: BaseNounPhraseExtractor,
    normalize_edge_weights: bool,
    num_threads: int = 4,
+    async_mode: AsyncType = AsyncType.Threaded,
    cache: PipelineCache | None = None,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Build a noun graph from text units."""
    text_units = text_unit_df.loc[:, ["id", "text"]]
    nodes_df = await _extract_nodes(
-        text_units, text_analyzer, num_threads=num_threads, cache=cache
+        text_units,
+        text_analyzer,
+        num_threads=num_threads,
+        async_mode=async_mode,
+        cache=cache,
    )
    edges_df = _extract_edges(nodes_df, normalize_edge_weights=normalize_edge_weights)
    return (nodes_df, edges_df)
@ -39,6 +44,7 @@ async def _extract_nodes(
    text_unit_df: pd.DataFrame,
    text_analyzer: BaseNounPhraseExtractor,
    num_threads: int = 4,
+    async_mode: AsyncType = AsyncType.Threaded,
    cache: PipelineCache | None = None,
 ) -> pd.DataFrame:
    """
@ -64,7 +70,7 @@ async def _extract_nodes(
        text_unit_df,
        extract,
        num_threads=num_threads,
-        async_type=AsyncType.Threaded,
+        async_type=async_mode,
        progress_msg="extract noun phrases progress: ",
    )

--- a/graphrag/index/workflows/extract_graph_nlp.py
+++ b/graphrag/index/workflows/extract_graph_nlp.py
@ -61,6 +61,7 @@ async def extract_graph_nlp(
        text_analyzer=text_analyzer,
        normalize_edge_weights=extraction_config.normalize_edge_weights,
        num_threads=extraction_config.concurrent_requests,
+        async_mode=extraction_config.async_mode,
        cache=cache,
    )