mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 00:57:23 +08:00
Split apart chunker module
This commit is contained in:
parent
81240ab2e3
commit
461291706f
@ -4,20 +4,8 @@
|
||||
"""A module containing the 'Chunker' class."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
import nltk
|
||||
from graphrag_common.factory.factory import Factory, ServiceScope
|
||||
|
||||
from graphrag.chunking.bootstrap import bootstrap
|
||||
from graphrag.config.enums import ChunkStrategyType
|
||||
from graphrag.config.models.chunking_config import ChunkingConfig
|
||||
from graphrag.index.text_splitting.text_splitting import (
|
||||
split_single_text_on_tokens,
|
||||
)
|
||||
from graphrag.tokenizer.get_tokenizer import get_tokenizer
|
||||
|
||||
|
||||
class Chunker(ABC):
|
||||
"""Abstract base class for text chunkers."""
|
||||
@ -29,95 +17,3 @@ class Chunker(ABC):
|
||||
@abstractmethod
|
||||
def chunk(self, text: str) -> list[str]:
|
||||
"""Chunk method definition."""
|
||||
|
||||
|
||||
class TokenChunker(Chunker):
    """Chunker that cuts text into windows measured in tokens."""

    def __init__(
        self,
        size: int,
        overlap: int,
        encoding_model: str,
        **kwargs: Any,
    ) -> None:
        """Store the token window settings.

        Args
        ----
            - size: int
                Forwarded to the splitter as ``tokens_per_chunk``.
            - overlap: int
                Forwarded to the splitter as ``chunk_overlap``.
            - encoding_model: str
                Encoding model name handed to ``get_tokenizer``.
            - **kwargs: Any
                Accepted and ignored.
        """
        self._encoding_model = encoding_model
        self._overlap = overlap
        self._size = size

    def chunk(self, text: str) -> list[str]:
        """Split the text into token-based chunks."""
        # The tokenizer is looked up on every call rather than cached.
        tok = get_tokenizer(encoding_model=self._encoding_model)
        split_options = {
            "chunk_overlap": self._overlap,
            "tokens_per_chunk": self._size,
            "encode": tok.encode,
            "decode": tok.decode,
        }
        return split_single_text_on_tokens(text, **split_options)
|
||||
|
||||
|
||||
class SentenceChunker(Chunker):
    """Chunker that produces one chunk per sentence."""

    def __init__(self, **kwargs: Any) -> None:
        """Run the chunking bootstrap; keyword arguments are accepted and ignored."""
        # NOTE(review): presumably prepares the NLTK resources needed by
        # sent_tokenize - confirm against graphrag.chunking.bootstrap.
        bootstrap()

    def chunk(self, text: str) -> list[str]:
        """Return the sentences of the text as split by ``nltk.sent_tokenize``."""
        sentences = nltk.sent_tokenize(text)
        return sentences
|
||||
|
||||
|
||||
class ChunkerFactory(Factory[Chunker]):
    """Factory specialization that produces Chunker instances."""


# Module-level factory instance used by register_chunker and create_chunker.
chunker_factory = ChunkerFactory()
|
||||
|
||||
|
||||
def register_chunker(
    chunker_type: str,
    chunker_initializer: Callable[..., Chunker],
    scope: ServiceScope = "transient",
) -> None:
    """Add a chunker implementation to the module-level factory.

    Args
    ----
        - chunker_type: str
            The id under which the chunker is registered.
        - chunker_initializer: Callable[..., Chunker]
            The callable that builds the chunker.
        - scope: ServiceScope
            Forwarded unchanged to the factory's ``register`` call;
            defaults to "transient".
    """
    registration = (chunker_type, chunker_initializer, scope)
    chunker_factory.register(*registration)
|
||||
|
||||
|
||||
def create_chunker(config: ChunkingConfig) -> Chunker:
    """Build the chunker selected by ``config.strategy``.

    Args
    ----
        - config: ChunkingConfig
            The chunking configuration; its ``model_dump()`` output is passed
            to the factory as the chunker's init arguments.

    Returns
    -------
        Chunker
            The chunker instance created by the factory.

    Raises
    ------
        ValueError
            If the strategy is neither registered nor one of the built-in
            strategies handled below.
    """
    init_args = config.model_dump()
    chunker_strategy = config.strategy

    # Already registered (built-in used before, or custom): create directly.
    if chunker_strategy in chunker_factory:
        return chunker_factory.create(chunker_strategy, init_args=init_args)

    # Built-in strategies are registered on first use.
    if chunker_strategy == ChunkStrategyType.tokens:
        chunker_factory.register(ChunkStrategyType.tokens, TokenChunker)
    elif chunker_strategy == ChunkStrategyType.sentence:
        chunker_factory.register(ChunkStrategyType.sentence, SentenceChunker)
    else:
        msg = f"ChunkingConfig.strategy '{chunker_strategy}' is not registered in the ChunkerFactory. Registered types: {', '.join(chunker_factory.keys())}."
        raise ValueError(msg)

    return chunker_factory.create(chunker_strategy, init_args=init_args)
|
||||
|
||||
69
packages/graphrag/graphrag/chunking/chunker_factory.py
Normal file
69
packages/graphrag/graphrag/chunking/chunker_factory.py
Normal file
@ -0,0 +1,69 @@
|
||||
# Copyright (c) 2024 Microsoft Corporation.
|
||||
# Licensed under the MIT License
|
||||
|
||||
"""A module containing 'ChunkerFactory', 'register_chunker', and 'create_chunker'."""
|
||||
|
||||
from collections.abc import Callable
|
||||
|
||||
from graphrag_common.factory.factory import Factory, ServiceScope
|
||||
|
||||
from graphrag.chunking.chunker import Chunker
|
||||
from graphrag.config.enums import ChunkStrategyType
|
||||
from graphrag.config.models.chunking_config import ChunkingConfig
|
||||
|
||||
|
||||
class ChunkerFactory(Factory[Chunker]):
    """Factory specialization that produces Chunker instances."""


# Module-level factory instance used by register_chunker and create_chunker.
chunker_factory = ChunkerFactory()
|
||||
|
||||
|
||||
def register_chunker(
    chunker_type: str,
    chunker_initializer: Callable[..., Chunker],
    scope: ServiceScope = "transient",
) -> None:
    """Add a chunker implementation to the module-level factory.

    Args
    ----
        - chunker_type: str
            The id under which the chunker is registered.
        - chunker_initializer: Callable[..., Chunker]
            The callable that builds the chunker.
        - scope: ServiceScope
            Forwarded unchanged to the factory's ``register`` call;
            defaults to "transient".
    """
    registration = (chunker_type, chunker_initializer, scope)
    chunker_factory.register(*registration)
|
||||
|
||||
|
||||
def create_chunker(config: ChunkingConfig) -> Chunker:
    """Build the chunker selected by ``config.strategy``.

    Args
    ----
        - config: ChunkingConfig
            The chunking configuration; its ``model_dump()`` output is passed
            to the factory as the chunker's init arguments.

    Returns
    -------
        Chunker
            The chunker instance created by the factory.

    Raises
    ------
        ValueError
            If the strategy is neither registered nor one of the built-in
            strategies handled below.
    """
    init_args = config.model_dump()
    chunker_strategy = config.strategy

    # Already registered (built-in used before, or custom): create directly.
    if chunker_strategy in chunker_factory:
        return chunker_factory.create(chunker_strategy, init_args=init_args)

    # Built-in strategies are imported and registered on first use, so each
    # implementation module is only loaded when its strategy is requested.
    if chunker_strategy == ChunkStrategyType.tokens:
        from graphrag.chunking.token_chunker import TokenChunker

        chunker_factory.register(ChunkStrategyType.tokens, TokenChunker)
    elif chunker_strategy == ChunkStrategyType.sentence:
        from graphrag.chunking.sentence_chunker import SentenceChunker

        chunker_factory.register(ChunkStrategyType.sentence, SentenceChunker)
    else:
        msg = f"ChunkingConfig.strategy '{chunker_strategy}' is not registered in the ChunkerFactory. Registered types: {', '.join(chunker_factory.keys())}."
        raise ValueError(msg)

    return chunker_factory.create(chunker_strategy, init_args=init_args)
|
||||
23
packages/graphrag/graphrag/chunking/sentence_chunker.py
Normal file
23
packages/graphrag/graphrag/chunking/sentence_chunker.py
Normal file
@ -0,0 +1,23 @@
|
||||
# Copyright (c) 2024 Microsoft Corporation.
|
||||
# Licensed under the MIT License
|
||||
|
||||
"""A module containing 'SentenceChunker' class."""
|
||||
|
||||
from typing import Any
|
||||
|
||||
import nltk
|
||||
|
||||
from graphrag.chunking.bootstrap import bootstrap
|
||||
from graphrag.chunking.chunker import Chunker
|
||||
|
||||
|
||||
class SentenceChunker(Chunker):
    """Chunker that produces one chunk per sentence."""

    def __init__(self, **kwargs: Any) -> None:
        """Run the chunking bootstrap; keyword arguments are accepted and ignored."""
        # NOTE(review): presumably prepares the NLTK resources needed by
        # sent_tokenize - confirm against graphrag.chunking.bootstrap.
        bootstrap()

    def chunk(self, text: str) -> list[str]:
        """Return the sentences of the text as split by ``nltk.sent_tokenize``."""
        sentences = nltk.sent_tokenize(text)
        return sentences
|
||||
39
packages/graphrag/graphrag/chunking/token_chunker.py
Normal file
39
packages/graphrag/graphrag/chunking/token_chunker.py
Normal file
@ -0,0 +1,39 @@
|
||||
# Copyright (c) 2024 Microsoft Corporation.
|
||||
# Licensed under the MIT License
|
||||
|
||||
"""A module containing 'TokenChunker' class."""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from graphrag.chunking.chunker import Chunker
|
||||
from graphrag.index.text_splitting.text_splitting import (
|
||||
split_single_text_on_tokens,
|
||||
)
|
||||
from graphrag.tokenizer.get_tokenizer import get_tokenizer
|
||||
|
||||
|
||||
class TokenChunker(Chunker):
    """Chunker that cuts text into windows measured in tokens."""

    def __init__(
        self,
        size: int,
        overlap: int,
        encoding_model: str,
        **kwargs: Any,
    ) -> None:
        """Store the token window settings.

        Args
        ----
            - size: int
                Forwarded to the splitter as ``tokens_per_chunk``.
            - overlap: int
                Forwarded to the splitter as ``chunk_overlap``.
            - encoding_model: str
                Encoding model name handed to ``get_tokenizer``.
            - **kwargs: Any
                Accepted and ignored.
        """
        self._encoding_model = encoding_model
        self._overlap = overlap
        self._size = size

    def chunk(self, text: str) -> list[str]:
        """Split the text into token-based chunks."""
        # The tokenizer is looked up on every call rather than cached.
        tok = get_tokenizer(encoding_model=self._encoding_model)
        split_options = {
            "chunk_overlap": self._overlap,
            "tokens_per_chunk": self._size,
            "encode": tok.encode,
            "decode": tok.decode,
        }
        return split_single_text_on_tokens(text, **split_options)
|
||||
@ -10,7 +10,8 @@ from typing import Any, cast
|
||||
import pandas as pd
|
||||
|
||||
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
|
||||
from graphrag.chunking.chunker import Chunker, create_chunker
|
||||
from graphrag.chunking.chunker import Chunker
|
||||
from graphrag.chunking.chunker_factory import create_chunker
|
||||
from graphrag.config.models.chunking_config import ChunkingConfig
|
||||
from graphrag.config.models.graph_rag_config import GraphRagConfig
|
||||
from graphrag.index.typing.context import PipelineRunContext
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
from graphrag.chunking.bootstrap import bootstrap
|
||||
from graphrag.chunking.chunker import create_chunker
|
||||
from graphrag.chunking.chunker_factory import create_chunker
|
||||
from graphrag.config.enums import ChunkStrategyType
|
||||
from graphrag.config.models.chunking_config import ChunkingConfig
|
||||
from graphrag.tokenizer.get_tokenizer import get_tokenizer
|
||||
|
||||
Loading…
Reference in New Issue
Block a user