diff --git a/packages/graphrag/graphrag/chunking/chunk_strategy_type.py b/packages/graphrag/graphrag/chunking/chunk_strategy_type.py index e6ab40b9..df3c9801 100644 --- a/packages/graphrag/graphrag/chunking/chunk_strategy_type.py +++ b/packages/graphrag/graphrag/chunking/chunk_strategy_type.py @@ -3,15 +3,11 @@ """Chunk strategy type enumeration.""" -from enum import Enum +from enum import StrEnum -class ChunkStrategyType(str, Enum): +class ChunkStrategyType(StrEnum): """ChunkStrategy class definition.""" - tokens = "tokens" - sentence = "sentence" - - def __repr__(self): - """Get a string representation.""" - return f'"{self.value}"' + Tokens = "tokens" + Sentence = "sentence" diff --git a/packages/graphrag/graphrag/chunking/chunker_factory.py b/packages/graphrag/graphrag/chunking/chunker_factory.py index d7bdc834..5986f73e 100644 --- a/packages/graphrag/graphrag/chunking/chunker_factory.py +++ b/packages/graphrag/graphrag/chunking/chunker_factory.py @@ -59,14 +59,14 @@ def create_chunker( if chunker_strategy not in chunker_factory: match chunker_strategy: - case ChunkStrategyType.tokens: + case ChunkStrategyType.Tokens: from graphrag.chunking.token_chunker import TokenChunker - register_chunker(ChunkStrategyType.tokens, TokenChunker) - case ChunkStrategyType.sentence: + register_chunker(ChunkStrategyType.Tokens, TokenChunker) + case ChunkStrategyType.Sentence: from graphrag.chunking.sentence_chunker import SentenceChunker - register_chunker(ChunkStrategyType.sentence, SentenceChunker) + register_chunker(ChunkStrategyType.Sentence, SentenceChunker) case _: msg = f"ChunkingConfig.strategy '{chunker_strategy}' is not registered in the ChunkerFactory. Registered types: {', '.join(chunker_factory.keys())}." 
raise ValueError(msg) diff --git a/packages/graphrag/graphrag/chunking/chunking_config.py b/packages/graphrag/graphrag/chunking/chunking_config.py index 77b8080f..a111f5d2 100644 --- a/packages/graphrag/graphrag/chunking/chunking_config.py +++ b/packages/graphrag/graphrag/chunking/chunking_config.py @@ -3,36 +3,38 @@ """Parameterization settings for the default configuration.""" -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field from graphrag.chunking.chunk_strategy_type import ChunkStrategyType -from graphrag.config.defaults import graphrag_config_defaults class ChunkingConfig(BaseModel): """Configuration section for chunking.""" + model_config = ConfigDict(extra="allow") + """Allow extra fields to support custom chunker implementations.""" + strategy: str = Field( description="The chunking strategy to use.", - default=ChunkStrategyType.tokens, + default=ChunkStrategyType.Tokens, ) - size: int = Field( + size: int | None = Field( description="The chunk size to use.", - default=graphrag_config_defaults.chunks.size, + default=None, ) - overlap: int = Field( + overlap: int | None = Field( description="The chunk overlap to use.", - default=graphrag_config_defaults.chunks.overlap, + default=None, ) - encoding_model: str = Field( + encoding_model: str | None = Field( description="The encoding model to use.", - default=graphrag_config_defaults.chunks.encoding_model, + default=None, ) - prepend_metadata: bool = Field( + prepend_metadata: bool | None = Field( description="Prepend metadata into each chunk.", - default=graphrag_config_defaults.chunks.prepend_metadata, + default=None, ) - chunk_size_includes_metadata: bool = Field( + chunk_size_includes_metadata: bool | None = Field( description="Count metadata in max tokens.", - default=graphrag_config_defaults.chunks.chunk_size_includes_metadata, + default=None, ) diff --git a/packages/graphrag/graphrag/chunking/sentence_chunker.py 
b/packages/graphrag/graphrag/chunking/sentence_chunker.py index af75fcd1..bc68e4b6 100644 --- a/packages/graphrag/graphrag/chunking/sentence_chunker.py +++ b/packages/graphrag/graphrag/chunking/sentence_chunker.py @@ -15,7 +15,7 @@ from graphrag.chunking.chunker import Chunker class SentenceChunker(Chunker): """A chunker that splits text into sentence-based chunks.""" - def __init__(self, prepend_metadata: bool, **kwargs: Any) -> None: + def __init__(self, prepend_metadata: bool = False, **kwargs: Any) -> None: """Create a sentence chunker instance.""" self._prepend_metadata = prepend_metadata bootstrap() diff --git a/packages/graphrag/graphrag/chunking/token_chunker.py b/packages/graphrag/graphrag/chunking/token_chunker.py index 41848fd0..d688b23d 100644 --- a/packages/graphrag/graphrag/chunking/token_chunker.py +++ b/packages/graphrag/graphrag/chunking/token_chunker.py @@ -24,9 +24,9 @@ class TokenChunker(Chunker): size: int, overlap: int, encoding_model: str, - prepend_metadata: bool, - chunk_size_includes_metadata: bool, tokenizer: Tokenizer, + prepend_metadata: bool = False, + chunk_size_includes_metadata: bool = False, **kwargs: Any, ) -> None: """Create a token chunker instance.""" diff --git a/packages/graphrag/graphrag/config/defaults.py b/packages/graphrag/graphrag/config/defaults.py index f7e904d7..e3f1f338 100644 --- a/packages/graphrag/graphrag/config/defaults.py +++ b/packages/graphrag/graphrag/config/defaults.py @@ -60,9 +60,9 @@ class BasicSearchDefaults: class ChunksDefaults: """Default values for chunks.""" + strategy: str = ChunkStrategyType.Tokens size: int = 1200 overlap: int = 100 - strategy: ClassVar[ChunkStrategyType] = ChunkStrategyType.tokens encoding_model: str = ENCODING_MODEL prepend_metadata: bool = False chunk_size_includes_metadata: bool = False diff --git a/packages/graphrag/graphrag/config/init_content.py b/packages/graphrag/graphrag/config/init_content.py index 45674068..8d2c85af 100644 --- 
a/packages/graphrag/graphrag/config/init_content.py +++ b/packages/graphrag/graphrag/config/init_content.py @@ -55,8 +55,10 @@ input: file_type: {graphrag_config_defaults.input.file_type.value} # [csv, text, json] chunks: + strategy: {graphrag_config_defaults.chunks.strategy} size: {graphrag_config_defaults.chunks.size} overlap: {graphrag_config_defaults.chunks.overlap} + encoding_model: {graphrag_config_defaults.chunks.encoding_model} ### Output/storage settings ### ## If blob storage is specified in the following four sections, diff --git a/packages/graphrag/graphrag/index/workflows/create_base_text_units.py b/packages/graphrag/graphrag/index/workflows/create_base_text_units.py index 94d2bf0f..04371a29 100644 --- a/packages/graphrag/graphrag/index/workflows/create_base_text_units.py +++ b/packages/graphrag/graphrag/index/workflows/create_base_text_units.py @@ -92,4 +92,3 @@ def create_base_text_units( return cast( "pd.DataFrame", text_units[text_units["text"].notna()].reset_index(drop=True) ) - diff --git a/tests/unit/chunking/test_chunker.py b/tests/unit/chunking/test_chunker.py index bb4add8e..2041294d 100644 --- a/tests/unit/chunking/test_chunker.py +++ b/tests/unit/chunking/test_chunker.py @@ -31,7 +31,7 @@ class TestRunSentences: def test_basic_functionality(self): """Test basic sentence splitting without metadata""" input = "This is a test. Another sentence." - chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence)) + chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence)) chunks = chunker.chunk(input) assert len(chunks) == 2 @@ -41,14 +41,14 @@ class TestRunSentences: def test_multiple_documents(self): """Test processing multiple input documents""" input = ["First. Document.", "Second. 
Doc."] - chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence)) + chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence)) chunks = [chunk for doc in input for chunk in chunker.chunk(doc)] assert len(chunks) == 4 def test_mixed_whitespace_handling(self): """Test input with irregular whitespace""" input = " Sentence with spaces. Another one! " - chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.sentence)) + chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence)) chunks = chunker.chunk(input) assert chunks[0] == " Sentence with spaces." assert chunks[1] == "Another one!" @@ -67,7 +67,7 @@ class TestRunTokens: size=5, overlap=1, encoding_model="fake-encoding", - strategy=ChunkStrategyType.tokens, + strategy=ChunkStrategyType.Tokens, ) chunker = create_chunker(config, tokenizer=tokenizer)