Revert ChunkingDocument interface

Nathan Evans 2025-12-22 10:17:27 -08:00
parent 026474a073
commit c8dbb029f4
7 changed files with 11 additions and 64 deletions

View File

@@ -6,18 +6,14 @@
 from abc import ABC, abstractmethod
 from typing import Any
-from graphrag.chunking.chunking_document import ChunkingDocument
 class Chunker(ABC):
-    """Abstract base class for text chunkers."""
+    """Abstract base class for document chunkers."""
     @abstractmethod
     def __init__(self, **kwargs: Any) -> None:
        """Create a chunker instance."""
     @abstractmethod
-    def chunk(
-        self, document: ChunkingDocument, metadata: dict | None = None
-    ) -> list[str]:
+    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
        """Chunk method definition."""

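For illustration, a minimal concrete chunker against the reverted interface might look like the sketch below. The WhitespaceChunker name and its splitting logic are hypothetical, not part of this change; the sketch only shows that chunk() now receives a plain string rather than a ChunkingDocument.

from typing import Any

from graphrag.chunking.chunker import Chunker


class WhitespaceChunker(Chunker):
    """Hypothetical chunker that splits text on whitespace runs."""

    def __init__(self, **kwargs: Any) -> None:
        """Create a chunker instance."""

    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
        """Return whitespace-delimited chunks of the input text."""
        return text.split()
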
View File

@@ -1,15 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""A module containing the 'Chunker' class."""
-from abc import ABC, abstractmethod
-from typing import Any
-class ChunkingDocument(ABC):
-    """Abstract base class for documents that need to be chunked. If you want to use a text-based chunker, ensure __str__ is implemented."""
-    @abstractmethod
-    def __init__(self, **kwargs: Any) -> None:
-        """Create a chunking document instance."""

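Before this revert, the intent of the abstraction above was that any source could feed a text-based chunker as long as it rendered itself to a string via __str__. A hypothetical subclass (illustrative only; the import below stops resolving once this commit deletes the module) would have looked roughly like:

from typing import Any

from graphrag.chunking.chunking_document import ChunkingDocument


class TitledChunkingDocument(ChunkingDocument):
    """Hypothetical document that prepends its title when rendered as text."""

    def __init__(self, title: str, body: str, **kwargs: Any) -> None:
        """Create a chunking document instance."""
        self._title = title
        self._body = body

    def __str__(self) -> str:
        """Render the document as plain text for a text-based chunker."""
        return f"{self._title}\n\n{self._body}"
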
View File

@@ -9,7 +9,6 @@ import nltk
 from graphrag.chunking.bootstrap_nltk import bootstrap
 from graphrag.chunking.chunker import Chunker
-from graphrag.chunking.chunking_document import ChunkingDocument
 class SentenceChunker(Chunker):
@@ -20,11 +19,8 @@ class SentenceChunker(Chunker):
         self._prepend_metadata = prepend_metadata
         bootstrap()
-    def chunk(
-        self, document: ChunkingDocument, metadata: dict | None = None
-    ) -> list[str]:
+    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
        """Chunk the text into sentence-based chunks."""
-        text = str(document)
         chunks = nltk.sent_tokenize(text)
         if self._prepend_metadata and metadata is not None:

View File

@@ -1,20 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""A module containing the 'Chunker' class."""
-from typing import Any
-from graphrag.chunking.chunking_document import ChunkingDocument
-class TextChunkingDocument(ChunkingDocument):
-    """Represents a basic text document for chunking."""
-    def __init__(self, text: str, **kwargs: Any) -> None:
-        """Create a chunking document instance."""
-        self._text = text
-    def __str__(self) -> str:
-        """Get the text of the document."""
-        return self._text

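The practical effect of deleting TextChunkingDocument is visible at every call site: strings are no longer wrapped before chunking. A rough before/after sketch, using the factory and config path exercised by the tests at the bottom of this commit; the pre-revert lines are kept as comments because the class no longer exists after this change.

from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
from graphrag.chunking.chunker_factory import create_chunker
from graphrag.chunking.chunking_config import ChunkingConfig

chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))

# before the revert, callers wrapped the string first:
#   document = TextChunkingDocument(text="This is a test. Another sentence.")
#   chunks = chunker.chunk(document)
# after the revert, the string is passed directly:
chunks = chunker.chunk("This is a test. Another sentence.")
# expected: one chunk per sentence, e.g. ["This is a test.", "Another sentence."]
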
View File

@@ -9,7 +9,6 @@ from typing import Any
 from graphrag_common.types.tokenizer import Tokenizer
 from graphrag.chunking.chunker import Chunker
-from graphrag.chunking.chunking_document import ChunkingDocument
 EncodedText = list[int]
 DecodeFn = Callable[[EncodedText], str]
@@ -33,13 +32,8 @@ class TokenChunker(Chunker):
         self._prepend_metadata = prepend_metadata
         self._tokenizer = tokenizer
-    def chunk(
-        self, document: ChunkingDocument, metadata: dict | None = None
-    ) -> list[str]:
+    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
        """Chunk the text into token-based chunks."""
         # we have to create and measure the metadata first to account for the length when chunking
-        text = str(document)
         chunks = split_text_on_tokens(
             text,
             chunk_size=self._size,

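The token strategy exercises the reverted signature the same way. A sketch based on the configuration values used in the tests below; whether the factory supplies a default tokenizer when none is configured is an assumption here (the tests mock the tiktoken encoder for this path).

from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
from graphrag.chunking.chunker_factory import create_chunker
from graphrag.chunking.chunking_config import ChunkingConfig

# roughly `size` tokens per chunk, with `overlap` tokens shared between neighbors
config = ChunkingConfig(strategy=ChunkStrategyType.Tokens, size=5, overlap=1)
chunker = create_chunker(config)
chunks = chunker.chunk("Marley was dead: to begin with. There is no doubt whatever about that.")
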
View File

@@ -13,7 +13,6 @@ from graphrag_common.types.tokenizer import Tokenizer
 from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
 from graphrag.chunking.chunker import Chunker
 from graphrag.chunking.chunker_factory import create_chunker
-from graphrag.chunking.text_chunking_document import TextChunkingDocument
 from graphrag.config.models.graph_rag_config import GraphRagConfig
 from graphrag.index.typing.context import PipelineRunContext
 from graphrag.index.typing.workflow import WorkflowFunctionOutput
@@ -67,8 +66,7 @@ def create_base_text_units(
         metadata = row.get("metadata")
         if (metadata is not None) and isinstance(metadata, str):
             metadata = json.loads(metadata)
-        document = TextChunkingDocument(text=row["text"])
-        row["chunks"] = chunker.chunk(document, metadata=metadata)
+        row["chunks"] = chunker.chunk(row["text"], metadata=metadata)
         tick()
         logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
         return row

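Within create_base_text_units, each input row now feeds its text column straight to the configured chunker. A simplified standalone sketch of that per-row step; the chunk_row helper and the pandas apply wiring are assumptions for illustration, not the file's actual structure.

import json

import pandas as pd


def chunk_row(row: pd.Series, chunker) -> pd.Series:
    """Chunk one document row, decoding JSON metadata if present."""
    metadata = row.get("metadata")
    if metadata is not None and isinstance(metadata, str):
        metadata = json.loads(metadata)
    row["chunks"] = chunker.chunk(row["text"], metadata=metadata)
    return row


# applied across the documents frame, e.g.:
#   documents = documents.apply(lambda row: chunk_row(row, chunker), axis=1)
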
View File

@@ -7,7 +7,6 @@ from graphrag.chunking.bootstrap_nltk import bootstrap
 from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
 from graphrag.chunking.chunker_factory import create_chunker
 from graphrag.chunking.chunking_config import ChunkingConfig
-from graphrag.chunking.text_chunking_document import TextChunkingDocument
 from graphrag.chunking.token_chunker import (
     split_text_on_tokens,
 )
@@ -32,7 +31,7 @@ class TestRunSentences:
     def test_basic_functionality(self):
        """Test basic sentence splitting without metadata"""
-        input = TextChunkingDocument(text="This is a test. Another sentence.")
+        input = "This is a test. Another sentence."
         chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = chunker.chunk(input)
@@ -43,7 +42,7 @@ class TestRunSentences:
     def test_mixed_whitespace_handling(self):
        """Test input with irregular whitespace"""
-        input = TextChunkingDocument(text=" Sentence with spaces. Another one! ")
+        input = " Sentence with spaces. Another one! "
         chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = chunker.chunk(input)
         assert chunks[0] == " Sentence with spaces."
@@ -51,7 +50,7 @@ class TestRunSentences:
     def test_prepend_metadata(self):
        """Test prepending metadata to chunks"""
-        input = TextChunkingDocument(text="This is a test. Another sentence.")
+        input = "This is a test. Another sentence."
         config = ChunkingConfig(
             strategy=ChunkStrategyType.Sentence, prepend_metadata=True
         )
@@ -70,9 +69,8 @@ class TestRunTokens:
         mock_encoder.decode.side_effect = lambda x: bytes(x).decode()
         mock_get_encoding.return_value = mock_encoder
-        input = TextChunkingDocument(
-            text="Marley was dead: to begin with. There is no doubt whatever about that. The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner. Scrooge signed it. And Scrooge's name was good upon 'Change, for anything he chose to put his hand to."
-        )
+        input = "Marley was dead: to begin with. There is no doubt whatever about that. The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner. Scrooge signed it. And Scrooge's name was good upon 'Change, for anything he chose to put his hand to."
         config = ChunkingConfig(
             size=5,
             overlap=1,
@@ -88,7 +86,7 @@ class TestRunTokens:
     def test_prepend_metadata(self):
        """Test prepending metadata to chunks"""
         mocked_tokenizer = MockTokenizer()
-        input = TextChunkingDocument(text="This is a test.")
+        input = "This is a test."
         config = ChunkingConfig(
             strategy=ChunkStrategyType.Tokens, size=5, overlap=0, prepend_metadata=True
         )