Mirror of https://github.com/microsoft/graphrag.git (synced 2026-01-14 00:57:23 +08:00)
Revert ChunkingDocument interface
commit c8dbb029f4
parent 026474a073
@@ -6,18 +6,14 @@
 from abc import ABC, abstractmethod
 from typing import Any
 
-from graphrag.chunking.chunking_document import ChunkingDocument
-
 
 class Chunker(ABC):
-    """Abstract base class for document chunkers."""
+    """Abstract base class for text chunkers."""
 
     @abstractmethod
     def __init__(self, **kwargs: Any) -> None:
         """Create a chunker instance."""
 
     @abstractmethod
-    def chunk(
-        self, document: ChunkingDocument, metadata: dict | None = None
-    ) -> list[str]:
+    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
         """Chunk method definition."""
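For orientation, a minimal sketch of a concrete chunker against the restored text-based interface. The WhitespaceChunker class and its splitting rule are illustrative assumptions, not part of this commit:

# Hypothetical example only: a concrete subclass of the text-based Chunker interface.
from typing import Any

from graphrag.chunking.chunker import Chunker


class WhitespaceChunker(Chunker):
    """Illustrative chunker that regroups whitespace-separated words into fixed-size chunks."""

    def __init__(self, words_per_chunk: int = 50, **kwargs: Any) -> None:
        """Create a chunker instance."""
        self._words_per_chunk = words_per_chunk

    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
        """Split on whitespace and regroup words into chunks (metadata is ignored here)."""
        words = text.split()
        return [
            " ".join(words[i : i + self._words_per_chunk])
            for i in range(0, len(words), self._words_per_chunk)
        ]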
@@ -1,15 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing the 'Chunker' class."""
-
-from abc import ABC, abstractmethod
-from typing import Any
-
-
-class ChunkingDocument(ABC):
-    """Abstract base class for documents that need to be chunked. If you want to use a text-based chunker, ensure __str__ is implemented."""
-
-    @abstractmethod
-    def __init__(self, **kwargs: Any) -> None:
-        """Create a chunking document instance."""
@@ -9,7 +9,6 @@ import nltk
 
 from graphrag.chunking.bootstrap_nltk import bootstrap
 from graphrag.chunking.chunker import Chunker
-from graphrag.chunking.chunking_document import ChunkingDocument
 
 
 class SentenceChunker(Chunker):
@@ -20,11 +19,8 @@ class SentenceChunker(Chunker):
         self._prepend_metadata = prepend_metadata
         bootstrap()
 
-    def chunk(
-        self, document: ChunkingDocument, metadata: dict | None = None
-    ) -> list[str]:
+    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
         """Chunk the text into sentence-based chunks."""
-        text = str(document)
         chunks = nltk.sent_tokenize(text)
 
         if self._prepend_metadata and metadata is not None:
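The sentence chunker now receives the text directly. A hedged sketch of the underlying call, assuming bootstrap() roughly amounts to fetching the NLTK punkt data (the exact resource it prepares is not shown in this diff):

# Illustrative only: the nltk call that SentenceChunker.chunk delegates to,
# with an explicit punkt download standing in for bootstrap().
import nltk

nltk.download("punkt", quiet=True)

text = "This is a test. Another sentence."
chunks = nltk.sent_tokenize(text)
print(chunks)  # ['This is a test.', 'Another sentence.']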
@@ -1,20 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing the 'Chunker' class."""
-
-from typing import Any
-
-from graphrag.chunking.chunking_document import ChunkingDocument
-
-
-class TextChunkingDocument(ChunkingDocument):
-    """Represents a basic text document for chunking."""
-
-    def __init__(self, text: str, **kwargs: Any) -> None:
-        """Create a chunking document instance."""
-        self._text = text
-
-    def __str__(self) -> str:
-        """Get the text of the document."""
-        return self._text
@@ -9,7 +9,6 @@ from typing import Any
 from graphrag_common.types.tokenizer import Tokenizer
 
 from graphrag.chunking.chunker import Chunker
-from graphrag.chunking.chunking_document import ChunkingDocument
 
 EncodedText = list[int]
 DecodeFn = Callable[[EncodedText], str]
@@ -33,13 +32,8 @@ class TokenChunker(Chunker):
         self._prepend_metadata = prepend_metadata
         self._tokenizer = tokenizer
 
-    def chunk(
-        self, document: ChunkingDocument, metadata: dict | None = None
-    ) -> list[str]:
+    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
         """Chunk the text into token-based chunks."""
         # we have to create and measure the metadata first to account for the length when chunking
-        text = str(document)
-
         chunks = split_text_on_tokens(
             text,
             chunk_size=self._size,
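For readers unfamiliar with token-window chunking, a rough self-contained sketch of the sliding-window behavior behind split_text_on_tokens. The encode/decode callables are stand-ins and do not reflect the graphrag_common Tokenizer API:

# Sketch only: slide a window of chunk_size tokens, stepping by chunk_size - overlap.
from collections.abc import Callable

EncodedText = list[int]


def split_on_tokens_sketch(
    text: str,
    chunk_size: int,
    overlap: int,
    encode: Callable[[str], EncodedText],
    decode: Callable[[EncodedText], str],
) -> list[str]:
    """Chunk text into overlapping windows of at most chunk_size tokens."""
    tokens = encode(text)
    chunks: list[str] = []
    step = max(chunk_size - overlap, 1)
    for start in range(0, len(tokens), step):
        window = tokens[start : start + chunk_size]
        if not window:
            break
        chunks.append(decode(window))
    return chunks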
@@ -13,7 +13,6 @@ from graphrag_common.types.tokenizer import Tokenizer
 from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
 from graphrag.chunking.chunker import Chunker
 from graphrag.chunking.chunker_factory import create_chunker
-from graphrag.chunking.text_chunking_document import TextChunkingDocument
 from graphrag.config.models.graph_rag_config import GraphRagConfig
 from graphrag.index.typing.context import PipelineRunContext
 from graphrag.index.typing.workflow import WorkflowFunctionOutput
@@ -67,8 +66,7 @@ def create_base_text_units(
         metadata = row.get("metadata")
         if (metadata is not None) and isinstance(metadata, str):
             metadata = json.loads(metadata)
-        document = TextChunkingDocument(text=row["text"])
-        row["chunks"] = chunker.chunk(document, metadata=metadata)
+        row["chunks"] = chunker.chunk(row["text"], metadata=metadata)
         tick()
         logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
         return row
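After the revert, the workflow's per-row chunking reduces to a plain-string call. A hedged usage sketch assembled from the imports and call site shown in this diff; the sentence strategy and the sample row are assumptions:

# Illustrative only: mirrors the reverted call site rather than the full workflow.
import json

from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
from graphrag.chunking.chunker_factory import create_chunker
from graphrag.chunking.chunking_config import ChunkingConfig

chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))

row = {"text": "This is a test. Another sentence.", "metadata": '{"title": "example"}'}
metadata = row.get("metadata")
if (metadata is not None) and isinstance(metadata, str):
    metadata = json.loads(metadata)  # metadata may arrive as a JSON string
row["chunks"] = chunker.chunk(row["text"], metadata=metadata)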
@@ -7,7 +7,6 @@ from graphrag.chunking.bootstrap_nltk import bootstrap
 from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
 from graphrag.chunking.chunker_factory import create_chunker
 from graphrag.chunking.chunking_config import ChunkingConfig
-from graphrag.chunking.text_chunking_document import TextChunkingDocument
 from graphrag.chunking.token_chunker import (
     split_text_on_tokens,
 )
@@ -32,7 +31,7 @@ class TestRunSentences:
 
     def test_basic_functionality(self):
         """Test basic sentence splitting without metadata"""
-        input = TextChunkingDocument(text="This is a test. Another sentence.")
+        input = "This is a test. Another sentence."
         chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = chunker.chunk(input)
 
@@ -43,7 +42,7 @@ class TestRunSentences:
     def test_mixed_whitespace_handling(self):
         """Test input with irregular whitespace"""
 
-        input = TextChunkingDocument(text=" Sentence with spaces. Another one! ")
+        input = " Sentence with spaces. Another one! "
         chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = chunker.chunk(input)
         assert chunks[0] == " Sentence with spaces."
@@ -51,7 +50,7 @@ class TestRunSentences:
 
     def test_prepend_metadata(self):
         """Test prepending metadata to chunks"""
-        input = TextChunkingDocument(text="This is a test. Another sentence.")
+        input = "This is a test. Another sentence."
         config = ChunkingConfig(
             strategy=ChunkStrategyType.Sentence, prepend_metadata=True
         )
@@ -70,9 +69,8 @@ class TestRunTokens:
         mock_encoder.decode.side_effect = lambda x: bytes(x).decode()
         mock_get_encoding.return_value = mock_encoder
 
-        input = TextChunkingDocument(
-            text="Marley was dead: to begin with. There is no doubt whatever about that. The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner. Scrooge signed it. And Scrooge's name was good upon 'Change, for anything he chose to put his hand to."
-        )
+        input = "Marley was dead: to begin with. There is no doubt whatever about that. The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner. Scrooge signed it. And Scrooge's name was good upon 'Change, for anything he chose to put his hand to."
 
         config = ChunkingConfig(
             size=5,
             overlap=1,
@@ -88,7 +86,7 @@ class TestRunTokens:
     def test_prepend_metadata(self):
         """Test prepending metadata to chunks"""
         mocked_tokenizer = MockTokenizer()
-        input = TextChunkingDocument(text="This is a test.")
+        input = "This is a test."
         config = ChunkingConfig(
             strategy=ChunkStrategyType.Tokens, size=5, overlap=0, prepend_metadata=True
        )