mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 00:57:23 +08:00
Some checks failed
Python Build and Type Check / python-ci (ubuntu-latest, 3.11) (push) Has been cancelled
Python Build and Type Check / python-ci (ubuntu-latest, 3.13) (push) Has been cancelled
Python Build and Type Check / python-ci (windows-latest, 3.11) (push) Has been cancelled
Python Build and Type Check / python-ci (windows-latest, 3.13) (push) Has been cancelled
Python Integration Tests / python-ci (ubuntu-latest, 3.13) (push) Has been cancelled
Python Integration Tests / python-ci (windows-latest, 3.13) (push) Has been cancelled
Python Notebook Tests / python-ci (ubuntu-latest, 3.13) (push) Has been cancelled
Python Notebook Tests / python-ci (windows-latest, 3.13) (push) Has been cancelled
Python Smoke Tests / python-ci (ubuntu-latest, 3.13) (push) Has been cancelled
Python Smoke Tests / python-ci (windows-latest, 3.13) (push) Has been cancelled
Python Unit Tests / python-ci (ubuntu-latest, 3.13) (push) Has been cancelled
Python Unit Tests / python-ci (windows-latest, 3.13) (push) Has been cancelled
* Delete NoopTextSplitter * Delete unused check_token_limit * Add base chunking factory and migrate workflow to use it * Split apart chunker module * Co-locate chunking/splitting * Collapse token splitting functionality into one class/function * Restore create_base_text_units parameterization * Move Tokenizer base class to common package * Move pre-pending into chunkers * Streamline config * Fix defaults construction * Add prepending tests * Remove chunk_size_includes_metadata config * Revert ChunkingDocument interface * Move metadata prepending to a util * Move Tokenizer back to GR core * Fix tokenizer removal from chunker * Set defaults for chunking config * Move chunking to monorepo package * Format * Typo * Add ChunkResult model * Streamline chunking config * Add missing version updates for graphrag_chunking
78 lines
2.4 KiB
Python
78 lines
2.4 KiB
Python
# Copyright (c) 2024 Microsoft Corporation.
|
|
# Licensed under the MIT License
|
|
|
|
"""A module containing 'ChunkerFactory', 'register_chunker', and 'create_chunker'."""
|
|
|
|
from collections.abc import Callable
|
|
|
|
from graphrag_common.factory.factory import Factory, ServiceScope
|
|
|
|
from graphrag_chunking.chunk_strategy_type import ChunkerType
|
|
from graphrag_chunking.chunker import Chunker
|
|
from graphrag_chunking.chunking_config import ChunkingConfig
|
|
|
|
|
|
class ChunkerFactory(Factory[Chunker]):
    """Factory specialised for Chunker instances; adds nothing beyond the Factory[Chunker] base."""
|
|
|
|
|
|
# Module-level ChunkerFactory instance shared by register_chunker and create_chunker.
chunker_factory = ChunkerFactory()
|
|
|
|
|
|
def register_chunker(
    chunker_type: str,
    chunker_initializer: Callable[..., Chunker],
    scope: ServiceScope = "transient",
) -> None:
    """Add a chunker implementation to the module-level chunker factory.

    Args
    ----
        - chunker_type: str
            Identifier under which the chunker is registered.
        - chunker_initializer: Callable[..., Chunker]
            Callable that produces a Chunker; stored in the factory under
            chunker_type.
        - scope: ServiceScope
            Scope forwarded unchanged to the factory's register call.
            Defaults to "transient".
    """
    chunker_factory.register(chunker_type, chunker_initializer, scope)
|
|
|
|
|
|
def create_chunker(
|
|
config: ChunkingConfig,
|
|
encode: Callable[[str], list[int]] | None = None,
|
|
decode: Callable[[list[int]], str] | None = None,
|
|
) -> Chunker:
|
|
"""Create a chunker implementation based on the given configuration.
|
|
|
|
Args
|
|
----
|
|
- config: ChunkingConfig
|
|
The chunker configuration to use.
|
|
|
|
Returns
|
|
-------
|
|
Chunker
|
|
The created chunker implementation.
|
|
"""
|
|
config_model = config.model_dump()
|
|
if encode is not None:
|
|
config_model["encode"] = encode
|
|
if decode is not None:
|
|
config_model["decode"] = decode
|
|
chunker_strategy = config.type
|
|
|
|
if chunker_strategy not in chunker_factory:
|
|
match chunker_strategy:
|
|
case ChunkerType.Tokens:
|
|
from graphrag_chunking.token_chunker import TokenChunker
|
|
|
|
register_chunker(ChunkerType.Tokens, TokenChunker)
|
|
case ChunkerType.Sentence:
|
|
from graphrag_chunking.sentence_chunker import SentenceChunker
|
|
|
|
register_chunker(ChunkerType.Sentence, SentenceChunker)
|
|
case _:
|
|
msg = f"ChunkingConfig.strategy '{chunker_strategy}' is not registered in the ChunkerFactory. Registered types: {', '.join(chunker_factory.keys())}."
|
|
raise ValueError(msg)
|
|
|
|
return chunker_factory.create(chunker_strategy, init_args=config_model)
|