Mirror of https://github.com/microsoft/graphrag.git (synced 2026-01-14 00:57:23 +08:00)

Commit 9aa94dfd86: Streamline config
Parent: 896a48ce1e
@@ -3,15 +3,11 @@
 
 """Chunk strategy type enumeration."""
 
-from enum import Enum
+from enum import StrEnum
 
 
-class ChunkStrategyType(str, Enum):
+class ChunkStrategyType(StrEnum):
     """ChunkStrategy class definition."""
 
-    tokens = "tokens"
-    sentence = "sentence"
-
-    def __repr__(self):
-        """Get a string representation."""
-        return f'"{self.value}"'
+    Tokens = "tokens"
+    Sentence = "sentence"
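The move to StrEnum (Python 3.11+) is what makes the (str, Enum) mixin and the custom __repr__ unnecessary: members already behave as plain strings. A minimal sketch of the behavior the renamed members rely on, using only names from the hunk above:

    from enum import StrEnum

    class ChunkStrategyType(StrEnum):
        Tokens = "tokens"
        Sentence = "sentence"

    # Members compare, parse, and format as their underlying values,
    # so config strings like "tokens" round-trip without extra handling.
    assert ChunkStrategyType.Tokens == "tokens"
    assert ChunkStrategyType("sentence") is ChunkStrategyType.Sentence
    assert f"{ChunkStrategyType.Tokens}" == "tokens"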
@@ -59,14 +59,14 @@ def create_chunker(
 
     if chunker_strategy not in chunker_factory:
         match chunker_strategy:
-            case ChunkStrategyType.tokens:
+            case ChunkStrategyType.Tokens:
                 from graphrag.chunking.token_chunker import TokenChunker
 
-                register_chunker(ChunkStrategyType.tokens, TokenChunker)
-            case ChunkStrategyType.sentence:
+                register_chunker(ChunkStrategyType.Tokens, TokenChunker)
+            case ChunkStrategyType.Sentence:
                 from graphrag.chunking.sentence_chunker import SentenceChunker
 
-                register_chunker(ChunkStrategyType.sentence, SentenceChunker)
+                register_chunker(ChunkStrategyType.Sentence, SentenceChunker)
             case _:
                 msg = f"ChunkingConfig.strategy '{chunker_strategy}' is not registered in the ChunkerFactory. Registered types: {', '.join(chunker_factory.keys())}."
                 raise ValueError(msg)
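Only the member names change here; the factory's lazy registration flow stays the same. A usage sketch based on the calls that appear in the tests later in this diff (the import locations of ChunkingConfig and create_chunker are not shown here and are assumed):

    from graphrag.chunking.chunk_strategy_type import ChunkStrategyType

    # The first call for a built-in strategy registers it, then instantiates it;
    # strategies that were never registered raise the ValueError built above.
    config = ChunkingConfig(strategy=ChunkStrategyType.Sentence)
    chunker = create_chunker(config)
    chunks = chunker.chunk("First sentence. Second sentence.")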
@@ -3,36 +3,38 @@
 
 """Parameterization settings for the default configuration."""
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field
 
 from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
-from graphrag.config.defaults import graphrag_config_defaults
 
 
 class ChunkingConfig(BaseModel):
     """Configuration section for chunking."""
 
+    model_config = ConfigDict(extra="allow")
+    """Allow extra fields to support custom cache implementations."""
+
     strategy: str = Field(
         description="The chunking strategy to use.",
-        default=ChunkStrategyType.tokens,
+        default=ChunkStrategyType.Tokens,
     )
-    size: int = Field(
+    size: int | None = Field(
         description="The chunk size to use.",
-        default=graphrag_config_defaults.chunks.size,
+        default=None,
     )
-    overlap: int = Field(
+    overlap: int | None = Field(
         description="The chunk overlap to use.",
-        default=graphrag_config_defaults.chunks.overlap,
+        default=None,
     )
-    encoding_model: str = Field(
+    encoding_model: str | None = Field(
         description="The encoding model to use.",
-        default=graphrag_config_defaults.chunks.encoding_model,
+        default=None,
     )
-    prepend_metadata: bool = Field(
+    prepend_metadata: bool | None = Field(
         description="Prepend metadata into each chunk.",
-        default=graphrag_config_defaults.chunks.prepend_metadata,
+        default=None,
    )
-    chunk_size_includes_metadata: bool = Field(
+    chunk_size_includes_metadata: bool | None = Field(
         description="Count metadata in max tokens.",
-        default=graphrag_config_defaults.chunks.chunk_size_includes_metadata,
+        default=None,
     )
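Net effect on the model, assuming pydantic v2 semantics: every tunable except strategy now defaults to None, so an unset value is distinguishable from an explicit one, and extra="allow" keeps unknown keys instead of rejecting them. A small sketch (my_custom_option is a hypothetical extra key):

    cfg = ChunkingConfig()
    assert cfg.strategy == "tokens"                  # ChunkStrategyType.Tokens
    assert cfg.size is None and cfg.overlap is None  # defaults now resolved elsewhere

    cfg = ChunkingConfig(size=800, my_custom_option=True)
    assert cfg.size == 800
    assert cfg.model_extra == {"my_custom_option": True}  # pydantic v2 stores extras here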
@@ -15,7 +15,7 @@ from graphrag.chunking.chunker import Chunker
 class SentenceChunker(Chunker):
     """A chunker that splits text into sentence-based chunks."""
 
-    def __init__(self, prepend_metadata: bool, **kwargs: Any) -> None:
+    def __init__(self, prepend_metadata: bool = False, **kwargs: Any) -> None:
         """Create a sentence chunker instance."""
         self._prepend_metadata = prepend_metadata
         bootstrap()
@@ -24,9 +24,9 @@ class TokenChunker(Chunker):
         size: int,
         overlap: int,
-        encoding_model: str,
-        prepend_metadata: bool,
-        chunk_size_includes_metadata: bool,
+        tokenizer: Tokenizer,
+        prepend_metadata: bool = False,
+        chunk_size_includes_metadata: bool = False,
         **kwargs: Any,
     ) -> None:
         """Create a token chunker instance."""
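Both constructors gain defaults for the metadata flags, and TokenChunker's parameter list swaps the encoding_model string for a Tokenizer instance, so direct construction shrinks to the essentials. A sketch (my_tokenizer is a placeholder for whatever Tokenizer implementation is in use):

    sentence_chunker = SentenceChunker()  # prepend_metadata defaults to False
    token_chunker = TokenChunker(
        size=1200,
        overlap=100,
        tokenizer=my_tokenizer,  # placeholder object satisfying the Tokenizer type
    )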
@@ -60,9 +60,9 @@ class BasicSearchDefaults:
 class ChunksDefaults:
     """Default values for chunks."""
 
+    strategy: str = ChunkStrategyType.Tokens
     size: int = 1200
     overlap: int = 100
-    strategy: ClassVar[ChunkStrategyType] = ChunkStrategyType.tokens
     encoding_model: str = ENCODING_MODEL
     prepend_metadata: bool = False
     chunk_size_includes_metadata: bool = False
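ChunksDefaults stays the single source of chunking defaults, which is presumably what the None-defaulted ChunkingConfig fields fall back to at runtime. A hypothetical resolution sketch, not code from this commit:

    # Unset (None) config values resolve against the class-level defaults.
    size = cfg.size if cfg.size is not None else graphrag_config_defaults.chunks.size              # 1200
    overlap = cfg.overlap if cfg.overlap is not None else graphrag_config_defaults.chunks.overlap  # 100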
@@ -55,8 +55,10 @@ input:
   file_type: {graphrag_config_defaults.input.file_type.value} # [csv, text, json]
 
 chunks:
+  strategy: {graphrag_config_defaults.chunks.strategy}
   size: {graphrag_config_defaults.chunks.size}
   overlap: {graphrag_config_defaults.chunks.overlap}
+  encoding_model: {graphrag_config_defaults.chunks.encoding_model}
 
 ### Output/storage settings ###
 ## If blob storage is specified in the following four sections,
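The two new template lines interpolate the chunking defaults straight into the generated settings.yaml, and because the default strategy is a StrEnum value it renders as the plain string. A sketch using the ChunksDefaults values shown earlier (the encoding_model value comes from ENCODING_MODEL, which this diff does not show):

    assert f"strategy: {graphrag_config_defaults.chunks.strategy}" == "strategy: tokens"

    # Rendered chunks block of the generated settings.yaml:
    # chunks:
    #   strategy: tokens
    #   size: 1200
    #   overlap: 100
    #   encoding_model: <value of ENCODING_MODEL>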
@@ -92,4 +92,3 @@ def create_base_text_units(
     return cast(
         "pd.DataFrame", text_units[text_units["text"].notna()].reset_index(drop=True)
     )
-
@@ -31,7 +31,7 @@ class TestRunSentences:
     def test_basic_functionality(self):
        """Test basic sentence splitting without metadata"""
        input = "This is a test. Another sentence."
-        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence))
+        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
        chunks = chunker.chunk(input)
 
        assert len(chunks) == 2
@@ -41,14 +41,14 @@ class TestRunSentences:
     def test_multiple_documents(self):
        """Test processing multiple input documents"""
        input = ["First. Document.", "Second. Doc."]
-        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence))
+        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
        chunks = [chunk for doc in input for chunk in chunker.chunk(doc)]
        assert len(chunks) == 4
 
     def test_mixed_whitespace_handling(self):
        """Test input with irregular whitespace"""
        input = " Sentence with spaces. Another one! "
-        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence))
+        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
        chunks = chunker.chunk(input)
        assert chunks[0] == " Sentence with spaces."
        assert chunks[1] == "Another one!"
@@ -67,7 +67,7 @@ class TestRunTokens:
             size=5,
             overlap=1,
             encoding_model="fake-encoding",
-            strategy=ChunkStrategyType.tokens,
+            strategy=ChunkStrategyType.Tokens,
         )
 
         chunker = create_chunker(config, tokenizer=tokenizer)
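The updated test passes a tokenizer straight through create_chunker to the new TokenChunker parameter. A sketch of what a test double might look like if the Tokenizer contract is essentially encode/decode; the real protocol is not shown in this diff, so the method names here are assumptions:

    class FakeTokenizer:
        """Stand-in tokenizer; the encode/decode contract is assumed, not taken from this diff."""

        def encode(self, text: str) -> list[str]:
            return text.split(" ")

        def decode(self, tokens: list[str]) -> str:
            return " ".join(tokens)

    config = ChunkingConfig(
        size=5,
        overlap=1,
        encoding_model="fake-encoding",
        strategy=ChunkStrategyType.Tokens,
    )
    chunker = create_chunker(config, tokenizer=FakeTokenizer())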