mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-13 16:47:20 +08:00
Configure async for NLP extraction (#2059)
Some checks failed
gh-pages / build (push) Has been cancelled
Python CI / python-ci (ubuntu-latest, 3.10) (push) Has been cancelled
Python CI / python-ci (ubuntu-latest, 3.11) (push) Has been cancelled
Python CI / python-ci (windows-latest, 3.10) (push) Has been cancelled
Python CI / python-ci (windows-latest, 3.11) (push) Has been cancelled
Python Integration Tests / python-ci (ubuntu-latest, 3.10) (push) Has been cancelled
Python Integration Tests / python-ci (windows-latest, 3.10) (push) Has been cancelled
Python Notebook Tests / python-ci (ubuntu-latest, 3.10) (push) Has been cancelled
Python Notebook Tests / python-ci (windows-latest, 3.10) (push) Has been cancelled
Python Publish (pypi) / Upload release to PyPI (push) Has been cancelled
Python Smoke Tests / python-ci (ubuntu-latest, 3.10) (push) Has been cancelled
Python Smoke Tests / python-ci (windows-latest, 3.10) (push) Has been cancelled
Spellcheck / spellcheck (push) Has been cancelled
Some checks failed
gh-pages / build (push) Has been cancelled
Python CI / python-ci (ubuntu-latest, 3.10) (push) Has been cancelled
Python CI / python-ci (ubuntu-latest, 3.11) (push) Has been cancelled
Python CI / python-ci (windows-latest, 3.10) (push) Has been cancelled
Python CI / python-ci (windows-latest, 3.11) (push) Has been cancelled
Python Integration Tests / python-ci (ubuntu-latest, 3.10) (push) Has been cancelled
Python Integration Tests / python-ci (windows-latest, 3.10) (push) Has been cancelled
Python Notebook Tests / python-ci (ubuntu-latest, 3.10) (push) Has been cancelled
Python Notebook Tests / python-ci (windows-latest, 3.10) (push) Has been cancelled
Python Publish (pypi) / Upload release to PyPI (push) Has been cancelled
Python Smoke Tests / python-ci (ubuntu-latest, 3.10) (push) Has been cancelled
Python Smoke Tests / python-ci (windows-latest, 3.10) (push) Has been cancelled
Spellcheck / spellcheck (push) Has been cancelled
* Make async mode configurable for NLP extraction
* Semver
This commit is contained in:
parent
a398cc38bb
commit
6c66b7c30f
@ -0,0 +1,4 @@
|
||||
{
|
||||
"type": "minor",
|
||||
"description": "Add config for NLP async mode."
|
||||
}
|
||||
@ -214,6 +214,7 @@ class ExtractGraphNLPDefaults:
|
||||
normalize_edge_weights: bool = True
|
||||
text_analyzer: TextAnalyzerDefaults = field(default_factory=TextAnalyzerDefaults)
|
||||
concurrent_requests: int = 25
|
||||
async_mode: AsyncType = AsyncType.Threaded
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@ -112,6 +112,7 @@ summarize_descriptions:
|
||||
extract_graph_nlp:
|
||||
text_analyzer:
|
||||
extractor_type: {graphrag_config_defaults.extract_graph_nlp.text_analyzer.extractor_type.value} # [regex_english, syntactic_parser, cfg]
|
||||
async_mode: {graphrag_config_defaults.extract_graph_nlp.async_mode.value} # or asyncio
|
||||
|
||||
cluster_graph:
|
||||
max_cluster_size: {graphrag_config_defaults.cluster_graph.max_cluster_size}
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from graphrag.config.defaults import graphrag_config_defaults
|
||||
from graphrag.config.enums import NounPhraseExtractorType
|
||||
from graphrag.config.enums import AsyncType, NounPhraseExtractorType
|
||||
|
||||
|
||||
class TextAnalyzerConfig(BaseModel):
|
||||
@ -68,3 +68,7 @@ class ExtractGraphNLPConfig(BaseModel):
|
||||
description="The number of threads to use for the extraction process.",
|
||||
default=graphrag_config_defaults.extract_graph_nlp.concurrent_requests,
|
||||
)
|
||||
async_mode: AsyncType = Field(
|
||||
description="The async mode to use.",
|
||||
default=graphrag_config_defaults.extract_graph_nlp.async_mode,
|
||||
)
|
||||
|
||||
@ -24,12 +24,17 @@ async def build_noun_graph(
|
||||
text_analyzer: BaseNounPhraseExtractor,
|
||||
normalize_edge_weights: bool,
|
||||
num_threads: int = 4,
|
||||
async_mode: AsyncType = AsyncType.Threaded,
|
||||
cache: PipelineCache | None = None,
|
||||
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
||||
"""Build a noun graph from text units."""
|
||||
text_units = text_unit_df.loc[:, ["id", "text"]]
|
||||
nodes_df = await _extract_nodes(
|
||||
text_units, text_analyzer, num_threads=num_threads, cache=cache
|
||||
text_units,
|
||||
text_analyzer,
|
||||
num_threads=num_threads,
|
||||
async_mode=async_mode,
|
||||
cache=cache,
|
||||
)
|
||||
edges_df = _extract_edges(nodes_df, normalize_edge_weights=normalize_edge_weights)
|
||||
return (nodes_df, edges_df)
|
||||
@ -39,6 +44,7 @@ async def _extract_nodes(
|
||||
text_unit_df: pd.DataFrame,
|
||||
text_analyzer: BaseNounPhraseExtractor,
|
||||
num_threads: int = 4,
|
||||
async_mode: AsyncType = AsyncType.Threaded,
|
||||
cache: PipelineCache | None = None,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
@ -64,7 +70,7 @@ async def _extract_nodes(
|
||||
text_unit_df,
|
||||
extract,
|
||||
num_threads=num_threads,
|
||||
async_type=AsyncType.Threaded,
|
||||
async_type=async_mode,
|
||||
progress_msg="extract noun phrases progress: ",
|
||||
)
|
||||
|
||||
|
||||
@ -61,6 +61,7 @@ async def extract_graph_nlp(
|
||||
text_analyzer=text_analyzer,
|
||||
normalize_edge_weights=extraction_config.normalize_edge_weights,
|
||||
num_threads=extraction_config.concurrent_requests,
|
||||
async_mode=extraction_config.async_mode,
|
||||
cache=cache,
|
||||
)
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user