Split apart chunker module

This commit is contained in:
Nathan Evans 2025-12-17 16:12:25 -08:00
parent 81240ab2e3
commit 461291706f
6 changed files with 134 additions and 106 deletions

View File

@ -4,20 +4,8 @@
"""A module containing the 'Chunker' class."""
from abc import ABC, abstractmethod
from collections.abc import Callable
from typing import Any
import nltk
from graphrag_common.factory.factory import Factory, ServiceScope
from graphrag.chunking.bootstrap import bootstrap
from graphrag.config.enums import ChunkStrategyType
from graphrag.config.models.chunking_config import ChunkingConfig
from graphrag.index.text_splitting.text_splitting import (
split_single_text_on_tokens,
)
from graphrag.tokenizer.get_tokenizer import get_tokenizer
class Chunker(ABC):
"""Abstract base class for text chunkers."""
@ -29,95 +17,3 @@ class Chunker(ABC):
@abstractmethod
def chunk(self, text: str) -> list[str]:
    """Split ``text`` into chunks.

    Abstract hook: concrete chunkers override this and return the
    resulting pieces as a list of strings.
    """
class TokenChunker(Chunker):
    """Chunker whose chunk boundaries are measured in tokens."""

    def __init__(
        self,
        size: int,
        overlap: int,
        encoding_model: str,
        **kwargs: Any,
    ) -> None:
        """Store the chunking parameters.

        Args
        ----
            - size: int
                Forwarded to the splitter as ``tokens_per_chunk``.
            - overlap: int
                Forwarded to the splitter as ``chunk_overlap``.
            - encoding_model: str
                Encoding model name used to look up the tokenizer.
            - **kwargs: Any
                Accepted so extra configuration keys do not fail; unused here.
        """
        self._size, self._overlap = size, overlap
        self._encoding_model = encoding_model

    def chunk(self, text: str) -> list[str]:
        """Split ``text`` with ``split_single_text_on_tokens``.

        The tokenizer is looked up on every call from the stored encoding
        model, and its ``encode`` / ``decode`` callables drive the splitter.
        """
        codec = get_tokenizer(encoding_model=self._encoding_model)
        encode, decode = codec.encode, codec.decode
        return split_single_text_on_tokens(
            text,
            chunk_overlap=self._overlap,
            tokens_per_chunk=self._size,
            encode=encode,
            decode=decode,
        )
class SentenceChunker(Chunker):
    """Chunker that emits one chunk per sentence."""

    def __init__(self, **kwargs: Any) -> None:
        """Run ``bootstrap()`` once per instance; keyword arguments are unused.

        NOTE(review): ``bootstrap`` presumably prepares the NLTK resources
        needed by ``sent_tokenize`` - confirm in graphrag.chunking.bootstrap.
        """
        bootstrap()

    def chunk(self, text: str) -> list[str]:
        """Return the sentences that ``nltk.sent_tokenize`` finds in ``text``."""
        sentences = nltk.sent_tokenize(text)
        return sentences
class ChunkerFactory(Factory[Chunker]):
    """Factory specialised for producing ``Chunker`` instances.

    Adds no behavior of its own; it only binds the generic ``Factory``
    to the ``Chunker`` type.
    """
# Module-level factory instance shared by register_chunker() and create_chunker().
chunker_factory = ChunkerFactory()
def register_chunker(
    chunker_type: str,
    chunker_initializer: Callable[..., Chunker],
    scope: ServiceScope = "transient",
) -> None:
    """Add a custom chunker implementation to the module-level factory.

    Args
    ----
        - chunker_type: str
            Identifier under which the chunker is registered.
        - chunker_initializer: Callable[..., Chunker]
            Callable that builds the chunker.
        - scope: ServiceScope
            Passed through to the factory's ``register`` call;
            defaults to "transient".
    """
    chunker_factory.register(chunker_type, chunker_initializer, scope)
def create_chunker(config: ChunkingConfig) -> Chunker:
    """Build the chunker selected by ``config.strategy``.

    The built-in token and sentence chunkers are registered on first use;
    any strategy already present in the factory is used as-is.

    Args
    ----
        - config: ChunkingConfig
            Chunking configuration; its dumped fields become the
            chunker's init arguments.

    Returns
    -------
        Chunker
            The chunker created by the factory.

    Raises
    ------
        ValueError
            If the strategy is neither registered nor one of the built-ins.
    """
    settings = config.model_dump()
    chunker_strategy = config.strategy

    # Fast path: the strategy was registered earlier (built-in or custom).
    if chunker_strategy in chunker_factory:
        return chunker_factory.create(chunker_strategy, init_args=settings)

    # First use of a built-in strategy: register it before creating.
    if chunker_strategy == ChunkStrategyType.tokens:
        chunker_factory.register(ChunkStrategyType.tokens, TokenChunker)
    elif chunker_strategy == ChunkStrategyType.sentence:
        chunker_factory.register(ChunkStrategyType.sentence, SentenceChunker)
    else:
        msg = f"ChunkingConfig.strategy '{chunker_strategy}' is not registered in the ChunkerFactory. Registered types: {', '.join(chunker_factory.keys())}."
        raise ValueError(msg)

    return chunker_factory.create(chunker_strategy, init_args=settings)

View File

@ -0,0 +1,69 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""A module containing 'ChunkerFactory', 'register_chunker', and 'create_chunker'."""
from collections.abc import Callable
from graphrag_common.factory.factory import Factory, ServiceScope
from graphrag.chunking.chunker import Chunker
from graphrag.config.enums import ChunkStrategyType
from graphrag.config.models.chunking_config import ChunkingConfig
class ChunkerFactory(Factory[Chunker]):
    """Factory specialised for producing ``Chunker`` instances.

    Adds no behavior of its own; it only binds the generic ``Factory``
    to the ``Chunker`` type.
    """
# Module-level factory instance shared by register_chunker() and create_chunker().
chunker_factory = ChunkerFactory()
def register_chunker(
    chunker_type: str,
    chunker_initializer: Callable[..., Chunker],
    scope: ServiceScope = "transient",
) -> None:
    """Add a custom chunker implementation to the module-level factory.

    Args
    ----
        - chunker_type: str
            Identifier under which the chunker is registered.
        - chunker_initializer: Callable[..., Chunker]
            Callable that builds the chunker.
        - scope: ServiceScope
            Passed through to the factory's ``register`` call;
            defaults to "transient".
    """
    chunker_factory.register(chunker_type, chunker_initializer, scope)
def create_chunker(config: ChunkingConfig) -> Chunker:
    """Build the chunker selected by ``config.strategy``.

    The built-in token and sentence chunkers are imported and registered
    on first use; any strategy already present in the factory is used as-is.

    Args
    ----
        - config: ChunkingConfig
            Chunking configuration; its dumped fields become the
            chunker's init arguments.

    Returns
    -------
        Chunker
            The chunker created by the factory.

    Raises
    ------
        ValueError
            If the strategy is neither registered nor one of the built-ins.
    """
    settings = config.model_dump()
    chunker_strategy = config.strategy

    # Fast path: the strategy was registered earlier (built-in or custom).
    if chunker_strategy in chunker_factory:
        return chunker_factory.create(chunker_strategy, init_args=settings)

    # First use of a built-in strategy: import it only now, so the
    # implementation module is loaded solely when that strategy is requested.
    if chunker_strategy == ChunkStrategyType.tokens:
        from graphrag.chunking.token_chunker import TokenChunker

        chunker_factory.register(ChunkStrategyType.tokens, TokenChunker)
    elif chunker_strategy == ChunkStrategyType.sentence:
        from graphrag.chunking.sentence_chunker import SentenceChunker

        chunker_factory.register(ChunkStrategyType.sentence, SentenceChunker)
    else:
        msg = f"ChunkingConfig.strategy '{chunker_strategy}' is not registered in the ChunkerFactory. Registered types: {', '.join(chunker_factory.keys())}."
        raise ValueError(msg)

    return chunker_factory.create(chunker_strategy, init_args=settings)

View File

@ -0,0 +1,23 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""A module containing 'SentenceChunker' class."""
from typing import Any
import nltk
from graphrag.chunking.bootstrap import bootstrap
from graphrag.chunking.chunker import Chunker
class SentenceChunker(Chunker):
    """Chunker that emits one chunk per sentence."""

    def __init__(self, **kwargs: Any) -> None:
        """Run ``bootstrap()`` once per instance; keyword arguments are unused.

        NOTE(review): ``bootstrap`` presumably prepares the NLTK resources
        needed by ``sent_tokenize`` - confirm in graphrag.chunking.bootstrap.
        """
        bootstrap()

    def chunk(self, text: str) -> list[str]:
        """Return the sentences that ``nltk.sent_tokenize`` finds in ``text``."""
        sentences = nltk.sent_tokenize(text)
        return sentences

View File

@ -0,0 +1,39 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""A module containing 'TokenChunker' class."""
from typing import Any
from graphrag.chunking.chunker import Chunker
from graphrag.index.text_splitting.text_splitting import (
split_single_text_on_tokens,
)
from graphrag.tokenizer.get_tokenizer import get_tokenizer
class TokenChunker(Chunker):
    """Chunker whose chunk boundaries are measured in tokens."""

    def __init__(
        self,
        size: int,
        overlap: int,
        encoding_model: str,
        **kwargs: Any,
    ) -> None:
        """Store the chunking parameters.

        Args
        ----
            - size: int
                Forwarded to the splitter as ``tokens_per_chunk``.
            - overlap: int
                Forwarded to the splitter as ``chunk_overlap``.
            - encoding_model: str
                Encoding model name used to look up the tokenizer.
            - **kwargs: Any
                Accepted so extra configuration keys do not fail; unused here.
        """
        self._size, self._overlap = size, overlap
        self._encoding_model = encoding_model

    def chunk(self, text: str) -> list[str]:
        """Split ``text`` with ``split_single_text_on_tokens``.

        The tokenizer is looked up on every call from the stored encoding
        model, and its ``encode`` / ``decode`` callables drive the splitter.
        """
        codec = get_tokenizer(encoding_model=self._encoding_model)
        encode, decode = codec.encode, codec.decode
        return split_single_text_on_tokens(
            text,
            chunk_overlap=self._overlap,
            tokens_per_chunk=self._size,
            encode=encode,
            decode=decode,
        )

View File

@ -10,7 +10,8 @@ from typing import Any, cast
import pandas as pd
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
from graphrag.chunking.chunker import Chunker, create_chunker
from graphrag.chunking.chunker import Chunker
from graphrag.chunking.chunker_factory import create_chunker
from graphrag.config.models.chunking_config import ChunkingConfig
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.index.typing.context import PipelineRunContext

View File

@ -4,7 +4,7 @@
from unittest.mock import Mock, patch
from graphrag.chunking.bootstrap import bootstrap
from graphrag.chunking.chunker import create_chunker
from graphrag.chunking.chunker_factory import create_chunker
from graphrag.config.enums import ChunkStrategyType
from graphrag.config.models.chunking_config import ChunkingConfig
from graphrag.tokenizer.get_tokenizer import get_tokenizer