Mirror of https://github.com/microsoft/graphrag.git (synced 2026-01-14 00:57:23 +08:00)
Revert ChunkingDocument interface
commit c8dbb029f4
parent 026474a073
@@ -6,18 +6,14 @@
 from abc import ABC, abstractmethod
 from typing import Any
 
-from graphrag.chunking.chunking_document import ChunkingDocument
-
 
 class Chunker(ABC):
-    """Abstract base class for document chunkers."""
+    """Abstract base class for text chunkers."""
 
     @abstractmethod
     def __init__(self, **kwargs: Any) -> None:
         """Create a chunker instance."""
 
     @abstractmethod
-    def chunk(
-        self, document: ChunkingDocument, metadata: dict | None = None
-    ) -> list[str]:
+    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
         """Chunk method definition."""
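For orientation, a minimal sketch of a concrete chunker against the restored text-based interface. The WhitespaceChunker class and its splitting rule are illustrative assumptions, not part of this commit:

# Hypothetical example only: a concrete subclass of the text-based Chunker interface.
from typing import Any

from graphrag.chunking.chunker import Chunker


class WhitespaceChunker(Chunker):
    """Illustrative chunker that regroups whitespace-separated words into fixed-size chunks."""

    def __init__(self, words_per_chunk: int = 50, **kwargs: Any) -> None:
        """Create a chunker instance."""
        self._words_per_chunk = words_per_chunk

    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
        """Split on whitespace and regroup words into chunks (metadata is ignored here)."""
        words = text.split()
        return [
            " ".join(words[i : i + self._words_per_chunk])
            for i in range(0, len(words), self._words_per_chunk)
        ]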
@@ -1,15 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing the 'Chunker' class."""
-
-from abc import ABC, abstractmethod
-from typing import Any
-
-
-class ChunkingDocument(ABC):
-    """Abstract base class for documents that need to be chunked. If you want to use a text-based chunker, ensure __str__ is implemented."""
-
-    @abstractmethod
-    def __init__(self, **kwargs: Any) -> None:
-        """Create a chunking document instance."""
@@ -9,7 +9,6 @@ import nltk
 
 from graphrag.chunking.bootstrap_nltk import bootstrap
 from graphrag.chunking.chunker import Chunker
-from graphrag.chunking.chunking_document import ChunkingDocument
 
 
 class SentenceChunker(Chunker):
@@ -20,11 +19,8 @@ class SentenceChunker(Chunker):
         self._prepend_metadata = prepend_metadata
         bootstrap()
 
-    def chunk(
-        self, document: ChunkingDocument, metadata: dict | None = None
-    ) -> list[str]:
+    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
         """Chunk the text into sentence-based chunks."""
-        text = str(document)
         chunks = nltk.sent_tokenize(text)
 
         if self._prepend_metadata and metadata is not None:
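The sentence chunker now receives the text directly. A hedged sketch of the underlying call, assuming bootstrap() roughly amounts to fetching the NLTK punkt data (the exact resource it prepares is not shown in this diff):

# Illustrative only: the nltk call that SentenceChunker.chunk delegates to,
# with an explicit punkt download standing in for bootstrap().
import nltk

nltk.download("punkt", quiet=True)

text = "This is a test. Another sentence."
chunks = nltk.sent_tokenize(text)
print(chunks)  # ['This is a test.', 'Another sentence.']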
@@ -1,20 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing the 'Chunker' class."""
-
-from typing import Any
-
-from graphrag.chunking.chunking_document import ChunkingDocument
-
-
-class TextChunkingDocument(ChunkingDocument):
-    """Represents a basic text document for chunking."""
-
-    def __init__(self, text: str, **kwargs: Any) -> None:
-        """Create a chunking document instance."""
-        self._text = text
-
-    def __str__(self) -> str:
-        """Get the text of the document."""
-        return self._text
@@ -9,7 +9,6 @@ from typing import Any
 from graphrag_common.types.tokenizer import Tokenizer
 
 from graphrag.chunking.chunker import Chunker
-from graphrag.chunking.chunking_document import ChunkingDocument
 
 EncodedText = list[int]
 DecodeFn = Callable[[EncodedText], str]
@@ -33,13 +32,8 @@ class TokenChunker(Chunker):
         self._prepend_metadata = prepend_metadata
         self._tokenizer = tokenizer
 
-    def chunk(
-        self, document: ChunkingDocument, metadata: dict | None = None
-    ) -> list[str]:
+    def chunk(self, text: str, metadata: dict | None = None) -> list[str]:
         """Chunk the text into token-based chunks."""
         # we have to create and measure the metadata first to account for the length when chunking
-        text = str(document)
-
         chunks = split_text_on_tokens(
             text,
             chunk_size=self._size,
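For readers unfamiliar with token-window chunking, a rough self-contained sketch of the sliding-window behavior behind split_text_on_tokens. The encode/decode callables are stand-ins and do not reflect the graphrag_common Tokenizer API:

# Sketch only: slide a window of chunk_size tokens, stepping by chunk_size - overlap.
from collections.abc import Callable

EncodedText = list[int]


def split_on_tokens_sketch(
    text: str,
    chunk_size: int,
    overlap: int,
    encode: Callable[[str], EncodedText],
    decode: Callable[[EncodedText], str],
) -> list[str]:
    """Chunk text into overlapping windows of at most chunk_size tokens."""
    tokens = encode(text)
    chunks: list[str] = []
    step = max(chunk_size - overlap, 1)
    for start in range(0, len(tokens), step):
        window = tokens[start : start + chunk_size]
        if not window:
            break
        chunks.append(decode(window))
    return chunks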
@@ -13,7 +13,6 @@ from graphrag_common.types.tokenizer import Tokenizer
 from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
 from graphrag.chunking.chunker import Chunker
 from graphrag.chunking.chunker_factory import create_chunker
-from graphrag.chunking.text_chunking_document import TextChunkingDocument
 from graphrag.config.models.graph_rag_config import GraphRagConfig
 from graphrag.index.typing.context import PipelineRunContext
 from graphrag.index.typing.workflow import WorkflowFunctionOutput
@@ -67,8 +66,7 @@ def create_base_text_units(
         metadata = row.get("metadata")
         if (metadata is not None) and isinstance(metadata, str):
             metadata = json.loads(metadata)
-        document = TextChunkingDocument(text=row["text"])
-        row["chunks"] = chunker.chunk(document, metadata=metadata)
+        row["chunks"] = chunker.chunk(row["text"], metadata=metadata)
         tick()
         logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
         return row
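After the revert, the workflow's per-row chunking reduces to a plain-string call. A hedged usage sketch assembled from the imports and call site shown in this diff; the sentence strategy and the sample row are assumptions:

# Illustrative only: mirrors the reverted call site rather than the full workflow.
import json

from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
from graphrag.chunking.chunker_factory import create_chunker
from graphrag.chunking.chunking_config import ChunkingConfig

chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))

row = {"text": "This is a test. Another sentence.", "metadata": '{"title": "example"}'}
metadata = row.get("metadata")
if (metadata is not None) and isinstance(metadata, str):
    metadata = json.loads(metadata)  # metadata may arrive as a JSON string
row["chunks"] = chunker.chunk(row["text"], metadata=metadata)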
@@ -7,7 +7,6 @@ from graphrag.chunking.bootstrap_nltk import bootstrap
 from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
 from graphrag.chunking.chunker_factory import create_chunker
 from graphrag.chunking.chunking_config import ChunkingConfig
-from graphrag.chunking.text_chunking_document import TextChunkingDocument
 from graphrag.chunking.token_chunker import (
     split_text_on_tokens,
 )
@@ -32,7 +31,7 @@ class TestRunSentences:
 
     def test_basic_functionality(self):
         """Test basic sentence splitting without metadata"""
-        input = TextChunkingDocument(text="This is a test. Another sentence.")
+        input = "This is a test. Another sentence."
         chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = chunker.chunk(input)
 
@@ -43,7 +42,7 @@ class TestRunSentences:
     def test_mixed_whitespace_handling(self):
         """Test input with irregular whitespace"""
 
-        input = TextChunkingDocument(text=" Sentence with spaces. Another one! ")
+        input = " Sentence with spaces. Another one! "
         chunker = create_chunker(ChunkingConfig(strategy=ChunkStrategyType.Sentence))
         chunks = chunker.chunk(input)
         assert chunks[0] == " Sentence with spaces."
@@ -51,7 +50,7 @@ class TestRunSentences:
 
     def test_prepend_metadata(self):
         """Test prepending metadata to chunks"""
-        input = TextChunkingDocument(text="This is a test. Another sentence.")
+        input = "This is a test. Another sentence."
         config = ChunkingConfig(
             strategy=ChunkStrategyType.Sentence, prepend_metadata=True
         )
@@ -70,9 +69,8 @@ class TestRunTokens:
         mock_encoder.decode.side_effect = lambda x: bytes(x).decode()
         mock_get_encoding.return_value = mock_encoder
 
-        input = TextChunkingDocument(
-            text="Marley was dead: to begin with. There is no doubt whatever about that. The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner. Scrooge signed it. And Scrooge's name was good upon 'Change, for anything he chose to put his hand to."
-        )
+        input = "Marley was dead: to begin with. There is no doubt whatever about that. The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner. Scrooge signed it. And Scrooge's name was good upon 'Change, for anything he chose to put his hand to."
 
         config = ChunkingConfig(
             size=5,
             overlap=1,
@@ -88,7 +86,7 @@ class TestRunTokens:
     def test_prepend_metadata(self):
         """Test prepending metadata to chunks"""
         mocked_tokenizer = MockTokenizer()
-        input = TextChunkingDocument(text="This is a test.")
+        input = "This is a test."
         config = ChunkingConfig(
             strategy=ChunkStrategyType.Tokens, size=5, overlap=0, prepend_metadata=True
        )