Streamline config

Nathan Evans 2025-12-18 17:15:04 -08:00
parent 896a48ce1e
commit 9aa94dfd86
9 changed files with 33 additions and 34 deletions

View File

@@ -3,15 +3,11 @@
 """Chunk strategy type enumeration."""
-from enum import Enum
+from enum import StrEnum
-class ChunkStrategyType(str, Enum):
+class ChunkStrategyType(StrEnum):
     """ChunkStrategy class definition."""
-    tokens = "tokens"
-    sentence = "sentence"
-    def __repr__(self):
-        """Get a string representation."""
-        return f'"{self.value}"'
+    Tokens = "tokens"
+    Sentence = "sentence"
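For context, a minimal sketch (not part of the commit) of what the switch buys: `StrEnum` members are plain strings and format as their raw values, which is presumably why the hand-written `__repr__` that quoted `self.value` could be dropped. Requires Python 3.11+.

```python
from enum import Enum, StrEnum  # StrEnum: Python 3.11+


class OldStrategy(str, Enum):  # the pre-commit pattern
    tokens = "tokens"


class NewStrategy(StrEnum):  # the post-commit pattern
    Tokens = "tokens"


# Both compare equal to the raw string value.
assert OldStrategy.tokens == "tokens"
assert NewStrategy.Tokens == "tokens"

# StrEnum members also *format* as the raw value, which is what config/YAML output wants.
assert f"{NewStrategy.Tokens}" == "tokens"
assert str(NewStrategy.Tokens) == "tokens"
```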

View File

@@ -59,14 +59,14 @@ def create_chunker(
     if chunker_strategy not in chunker_factory:
         match chunker_strategy:
-            case ChunkStrategyType.tokens:
+            case ChunkStrategyType.Tokens:
                 from graphrag.chunking.token_chunker import TokenChunker
-                register_chunker(ChunkStrategyType.tokens, TokenChunker)
-            case ChunkStrategyType.sentence:
+                register_chunker(ChunkStrategyType.Tokens, TokenChunker)
+            case ChunkStrategyType.Sentence:
                 from graphrag.chunking.sentence_chunker import SentenceChunker
-                register_chunker(ChunkStrategyType.sentence, SentenceChunker)
+                register_chunker(ChunkStrategyType.Sentence, SentenceChunker)
             case _:
                 msg = f"ChunkingConfig.strategy '{chunker_strategy}' is not registered in the ChunkerFactory. Registered types: {', '.join(chunker_factory.keys())}."
                 raise ValueError(msg)
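The hunk above lazily registers the built-in chunkers on first use. Below is a standalone sketch of that pattern; the stand-in classes, registry shape, and `create_chunker` body are assumptions, not repo code.

```python
from typing import Any, Callable


class TokenChunker:  # stand-in for graphrag.chunking.token_chunker.TokenChunker
    def __init__(self, **kwargs: Any) -> None: ...


class SentenceChunker:  # stand-in for graphrag.chunking.sentence_chunker.SentenceChunker
    def __init__(self, **kwargs: Any) -> None: ...


# Registry of strategy name -> chunker class, filled in lazily.
chunker_factory: dict[str, Callable[..., object]] = {}


def register_chunker(strategy: str, chunker_cls: Callable[..., object]) -> None:
    chunker_factory[strategy] = chunker_cls


def create_chunker(chunker_strategy: str, **kwargs: Any) -> object:
    # Register built-ins on first request; unknown strategies raise, as in the hunk above.
    if chunker_strategy not in chunker_factory:
        match chunker_strategy:
            case "tokens":
                register_chunker("tokens", TokenChunker)
            case "sentence":
                register_chunker("sentence", SentenceChunker)
            case _:
                msg = f"Strategy '{chunker_strategy}' is not registered."
                raise ValueError(msg)
    return chunker_factory[chunker_strategy](**kwargs)


chunker = create_chunker("sentence")
```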

View File

@@ -3,36 +3,38 @@
 """Parameterization settings for the default configuration."""
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field
 from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
 from graphrag.config.defaults import graphrag_config_defaults
 class ChunkingConfig(BaseModel):
     """Configuration section for chunking."""
+    model_config = ConfigDict(extra="allow")
+    """Allow extra fields to support custom cache implementations."""
     strategy: str = Field(
         description="The chunking strategy to use.",
-        default=ChunkStrategyType.tokens,
+        default=ChunkStrategyType.Tokens,
     )
-    size: int = Field(
+    size: int | None = Field(
         description="The chunk size to use.",
-        default=graphrag_config_defaults.chunks.size,
+        default=None,
     )
-    overlap: int = Field(
+    overlap: int | None = Field(
         description="The chunk overlap to use.",
-        default=graphrag_config_defaults.chunks.overlap,
+        default=None,
     )
-    encoding_model: str = Field(
+    encoding_model: str | None = Field(
         description="The encoding model to use.",
-        default=graphrag_config_defaults.chunks.encoding_model,
+        default=None,
     )
-    prepend_metadata: bool = Field(
+    prepend_metadata: bool | None = Field(
         description="Prepend metadata into each chunk.",
-        default=graphrag_config_defaults.chunks.prepend_metadata,
+        default=None,
     )
-    chunk_size_includes_metadata: bool = Field(
+    chunk_size_includes_metadata: bool | None = Field(
         description="Count metadata in max tokens.",
-        default=graphrag_config_defaults.chunks.chunk_size_includes_metadata,
+        default=None,
     )
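A small pydantic v2 sketch (not repo code) of what `extra="allow"` plus `None` defaults enables: unknown keys ride along on the model, and `None`-valued fields can be excluded when the settings are handed to a chunker, so the chunker's own defaults win.

```python
from pydantic import BaseModel, ConfigDict, Field


class ChunkingConfigSketch(BaseModel):
    model_config = ConfigDict(extra="allow")

    strategy: str = Field(default="tokens")
    size: int | None = Field(default=None)
    overlap: int | None = Field(default=None)


cfg = ChunkingConfigSketch(size=800, custom_flag=True)

# Unknown keys are kept because of extra="allow" ...
assert cfg.model_dump()["custom_flag"] is True

# ... and dropping None-valued fields leaves only what the user actually set,
# so a chunker constructed from this dict falls back to its own defaults.
assert cfg.model_dump(exclude_none=True) == {
    "strategy": "tokens",
    "size": 800,
    "custom_flag": True,
}
```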

View File

@@ -15,7 +15,7 @@ from graphrag.chunking.chunker import Chunker
 class SentenceChunker(Chunker):
     """A chunker that splits text into sentence-based chunks."""
-    def __init__(self, prepend_metadata: bool, **kwargs: Any) -> None:
+    def __init__(self, prepend_metadata: bool = False, **kwargs: Any) -> None:
         """Create a sentence chunker instance."""
         self._prepend_metadata = prepend_metadata
         bootstrap()
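A tiny sketch (hypothetical class mirroring only the signature above) of why the keyword default matters: a sparse config can be splatted into the constructor without ever supplying `prepend_metadata`, and unrelated keys are absorbed by `**kwargs`.

```python
from typing import Any


class SentenceChunkerSketch:  # hypothetical stand-in mirroring the signature above
    def __init__(self, prepend_metadata: bool = False, **kwargs: Any) -> None:
        self._prepend_metadata = prepend_metadata


# A config that never mentions prepend_metadata (and carries an unrelated key) still works.
chunker = SentenceChunkerSketch(**{"strategy": "sentence"})
assert chunker._prepend_metadata is False
```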

View File

@@ -24,9 +24,9 @@ class TokenChunker(Chunker):
         size: int,
         overlap: int,
         encoding_model: str,
-        prepend_metadata: bool,
-        chunk_size_includes_metadata: bool,
         tokenizer: Tokenizer,
+        prepend_metadata: bool = False,
+        chunk_size_includes_metadata: bool = False,
         **kwargs: Any,
     ) -> None:
         """Create a token chunker instance."""

View File

@@ -60,9 +60,9 @@ class BasicSearchDefaults:
 class ChunksDefaults:
     """Default values for chunks."""
+    strategy: str = ChunkStrategyType.Tokens
     size: int = 1200
     overlap: int = 100
-    strategy: ClassVar[ChunkStrategyType] = ChunkStrategyType.tokens
     encoding_model: str = ENCODING_MODEL
     prepend_metadata: bool = False
    chunk_size_includes_metadata: bool = False
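Assuming the defaults container is a dataclass (the hunk does not show its decorator), the practical difference between the two `strategy` declarations is that a `ClassVar` annotation is ignored by dataclass machinery while a plain annotated field is not. A sketch, not repo code:

```python
from dataclasses import dataclass, fields
from typing import ClassVar


@dataclass
class WithClassVar:
    strategy: ClassVar[str] = "tokens"  # class-level constant: not a dataclass field
    size: int = 1200


@dataclass
class WithPlainField:
    strategy: str = "tokens"  # ordinary field: participates in fields()/asdict()
    size: int = 1200


assert [f.name for f in fields(WithClassVar)] == ["size"]
assert [f.name for f in fields(WithPlainField)] == ["strategy", "size"]
```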

View File

@@ -55,8 +55,10 @@ input:
   file_type: {graphrag_config_defaults.input.file_type.value} # [csv, text, json]
 chunks:
+  strategy: {graphrag_config_defaults.chunks.strategy}
   size: {graphrag_config_defaults.chunks.size}
   overlap: {graphrag_config_defaults.chunks.overlap}
+  encoding_model: {graphrag_config_defaults.chunks.encoding_model}
 ### Output/storage settings ###
 ## If blob storage is specified in the following four sections,
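A tiny sketch of how the added `strategy` line renders now that the default is a `StrEnum` member; the surrounding YAML is illustrative, not taken from the repo.

```python
from enum import StrEnum


class Strategy(StrEnum):  # stand-in for ChunkStrategyType
    Tokens = "tokens"


# The f-string template above interpolates the member directly; with StrEnum that
# yields the bare value rather than an enum repr.
rendered = f"chunks:\n  strategy: {Strategy.Tokens}\n"
assert rendered == "chunks:\n  strategy: tokens\n"
```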

View File

@@ -92,4 +92,3 @@ def create_base_text_units(
     return cast(
         "pd.DataFrame", text_units[text_units["text"].notna()].reset_index(drop=True)
     )
-

View File

@@ -31,7 +31,7 @@ class TestRunSentences:
     def test_basic_functionality(self):
         """Test basic sentence splitting without metadata"""
         input = "This is a test. Another sentence."
-        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence))
+        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = chunker.chunk(input)
         assert len(chunks) == 2
@@ -41,14 +41,14 @@
     def test_multiple_documents(self):
         """Test processing multiple input documents"""
         input = ["First. Document.", "Second. Doc."]
-        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence))
+        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = [chunk for doc in input for chunk in chunker.chunk(doc)]
         assert len(chunks) == 4
     def test_mixed_whitespace_handling(self):
         """Test input with irregular whitespace"""
         input = " Sentence with spaces. Another one! "
-        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence))
+        chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = chunker.chunk(input)
         assert chunks[0] == " Sentence with spaces."
         assert chunks[1] == "Another one!"
@@ -67,7 +67,7 @@ class TestRunTokens:
             size=5,
             overlap=1,
             encoding_model="fake-encoding",
-            strategy=ChunkStrategyType.tokens,
+            strategy=ChunkStrategyType.Tokens,
         )
         chunker = create_chunker(config, tokenizer=tokenizer)
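One more sketch (stand-in enum and model, not repo imports) of why the renamed members slot into these tests and into the `str`-typed `strategy` field without further changes: `StrEnum` members are real strings and can still be looked up by value.

```python
from enum import StrEnum

from pydantic import BaseModel


class Strategy(StrEnum):  # stand-in for ChunkStrategyType
    Tokens = "tokens"
    Sentence = "sentence"


class ConfigSketch(BaseModel):  # stand-in for ChunkingConfig's strategy field
    strategy: str = Strategy.Tokens


# Members are real strings, can be looked up by value, and validate as str.
assert Strategy("sentence") is Strategy.Sentence
assert ConfigSketch(strategy=Strategy.Sentence).strategy == "sentence"
```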