Move chunking to monorepo package

Nathan Evans 2025-12-22 11:57:12 -08:00
parent d9ba63f4d6
commit bd968f2710
21 changed files with 1383 additions and 1269 deletions

View File

@@ -0,0 +1,32 @@
# GraphRAG Chunking
This package contains a collection of text chunkers, a core config model, and a factory for acquiring instances.
## Examples
Basic sentence chunking with nltk
```python
from graphrag_chunking.sentence_chunker import SentenceChunker

chunker = SentenceChunker()
chunks = chunker.chunk("This is a test. Another sentence.")
print(chunks) # ["This is a test.", "Another sentence."]
```
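The sentence chunker relies on nltk for sentence tokenization. If the required nltk data may be missing, the package's bootstrap helper can be called first (a minimal sketch; `bootstrap()` taking no arguments and fetching the nltk resources is assumed from the package tests):
```python
from graphrag_chunking.bootstrap_nltk import bootstrap
from graphrag_chunking.sentence_chunker import SentenceChunker

bootstrap()  # assumption: ensures the nltk sentence-tokenizer data is available
chunker = SentenceChunker()
chunks = chunker.chunk("First sentence. Second sentence.")
```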
Token chunking
```python
import tiktoken

from graphrag_chunking.token_chunker import TokenChunker

tokenizer = tiktoken.get_encoding("o200k_base")
chunker = TokenChunker(size=3, overlap=0, encode=tokenizer.encode, decode=tokenizer.decode)
chunks = chunker.chunk("This is a random test fragment of some text")
print(chunks) # ["This is a", " random test fragment", " of some text"]
```
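Overlap works the same way with the token chunker; each chunk repeats the trailing tokens of the previous one (a sketch using the same constructor arguments; exact chunk boundaries depend on the tokenizer):
```python
tokenizer = tiktoken.get_encoding("o200k_base")
chunker = TokenChunker(size=2, overlap=1, encode=tokenizer.encode, decode=tokenizer.decode)
chunks = chunker.chunk("This is a random test")
# With overlap=1, adjacent chunks share one token, e.g. ["This is", " is a", " a random", ...]
```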
Using the factory via the `create_chunker` helper
```python
import tiktoken

from graphrag_chunking.chunker_factory import create_chunker
from graphrag_chunking.chunking_config import ChunkingConfig

tokenizer = tiktoken.get_encoding("o200k_base")
config = ChunkingConfig(
    strategy="tokens",
    size=3,
    overlap=0
)
chunker = create_chunker(config, tokenizer.encode, tokenizer.decode)
...
```

View File

@@ -7,9 +7,9 @@ from collections.abc import Callable
from graphrag_common.factory.factory import Factory, ServiceScope
from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
from graphrag.chunking.chunker import Chunker
from graphrag.chunking.chunking_config import ChunkingConfig
from graphrag_chunking.chunk_strategy_type import ChunkStrategyType
from graphrag_chunking.chunker import Chunker
from graphrag_chunking.chunking_config import ChunkingConfig
class ChunkerFactory(Factory[Chunker]):
@@ -63,11 +63,11 @@ def create_chunker(
if chunker_strategy not in chunker_factory:
match chunker_strategy:
case ChunkStrategyType.Tokens:
from graphrag.chunking.token_chunker import TokenChunker
from graphrag_chunking.token_chunker import TokenChunker
register_chunker(ChunkStrategyType.Tokens, TokenChunker)
case ChunkStrategyType.Sentence:
from graphrag.chunking.sentence_chunker import SentenceChunker
from graphrag_chunking.sentence_chunker import SentenceChunker
register_chunker(ChunkStrategyType.Sentence, SentenceChunker)
case _:

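For reference, the lazy registration above can also be driven explicitly. The sketch below assumes `register_chunker` and `create_chunker` are both importable from `graphrag_chunking.chunker_factory`; the public import location of `register_chunker` is an assumption, and the size/overlap values are arbitrary.
```python
import tiktoken

from graphrag_chunking.chunk_strategy_type import ChunkStrategyType
from graphrag_chunking.chunker_factory import create_chunker, register_chunker
from graphrag_chunking.chunking_config import ChunkingConfig
from graphrag_chunking.token_chunker import TokenChunker

# Register the built-in token chunker up front; create_chunker otherwise
# registers it lazily the first time the "tokens" strategy is requested.
register_chunker(ChunkStrategyType.Tokens, TokenChunker)

tokenizer = tiktoken.get_encoding("o200k_base")
config = ChunkingConfig(strategy="tokens", size=200, overlap=20)
chunker = create_chunker(config, tokenizer.encode, tokenizer.decode)
```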
View File

@@ -5,7 +5,7 @@
from pydantic import BaseModel, ConfigDict, Field
from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
from graphrag_chunking.chunk_strategy_type import ChunkStrategyType
class ChunkingConfig(BaseModel):

View File

@@ -7,8 +7,8 @@ from typing import Any
import nltk
from graphrag.chunking.bootstrap_nltk import bootstrap
from graphrag.chunking.chunker import Chunker
from graphrag_chunking.bootstrap_nltk import bootstrap
from graphrag_chunking.chunker import Chunker
class SentenceChunker(Chunker):

View File

@@ -6,7 +6,7 @@
from collections.abc import Callable
from typing import Any
from graphrag.chunking.chunker import Chunker
from graphrag_chunking.chunker import Chunker
class TokenChunker(Chunker):

View File

@@ -0,0 +1,43 @@
[project]
name = "graphrag-chunking"
version = "2.7.0"
description = "Chunking utilities for GraphRAG"
authors = [
{name = "Alonso Guevara Fernández", email = "alonsog@microsoft.com"},
{name = "Andrés Morales Esquivel", email = "andresmor@microsoft.com"},
{name = "Chris Trevino", email = "chtrevin@microsoft.com"},
{name = "David Tittsworth", email = "datittsw@microsoft.com"},
{name = "Dayenne de Souza", email = "ddesouza@microsoft.com"},
{name = "Derek Worthen", email = "deworthe@microsoft.com"},
{name = "Gaudy Blanco Meneses", email = "gaudyb@microsoft.com"},
{name = "Ha Trinh", email = "trinhha@microsoft.com"},
{name = "Jonathan Larson", email = "jolarso@microsoft.com"},
{name = "Josh Bradley", email = "joshbradley@microsoft.com"},
{name = "Kate Lytvynets", email = "kalytv@microsoft.com"},
{name = "Kenny Zhang", email = "zhangken@microsoft.com"},
{name = "Mónica Carvajal"},
{name = "Nathan Evans", email = "naevans@microsoft.com"},
{name = "Rodrigo Racanicci", email = "rracanicci@microsoft.com"},
{name = "Sarah Smith", email = "smithsarah@microsoft.com"},
]
license = {text = "MIT"}
readme = "README.md"
requires-python = ">=3.11,<3.14"
classifiers = [
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
]
dependencies = [
"graphrag-common==2.7.0",
"pydantic~=2.10",
]
[project.urls]
Source = "https://github.com/microsoft/graphrag"
[build-system]
requires = ["hatchling>=1.27.0,<2.0.0"]
build-backend = "hatchling.build"

View File

@@ -8,9 +8,9 @@ from pathlib import Path
from typing import ClassVar
from graphrag_cache import CacheType
from graphrag_chunking.chunk_strategy_type import ChunkStrategyType
from graphrag_storage import StorageType
from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
from graphrag.config.embeddings import default_embeddings
from graphrag.config.enums import (
AsyncType,

View File

@@ -8,11 +8,11 @@ from pathlib import Path
from devtools import pformat
from graphrag_cache import CacheConfig
from graphrag_chunking.chunking_config import ChunkingConfig
from graphrag_storage import StorageConfig, StorageType
from pydantic import BaseModel, Field, model_validator
import graphrag.config.defaults as defs
from graphrag.chunking.chunking_config import ChunkingConfig
from graphrag.config.defaults import graphrag_config_defaults
from graphrag.config.enums import VectorStoreType
from graphrag.config.models.basic_search_config import BasicSearchConfig

View File

@@ -8,9 +8,9 @@ import logging
from dataclasses import dataclass
import numpy as np
from graphrag_chunking.token_chunker import split_text_on_tokens
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
from graphrag.chunking.token_chunker import split_text_on_tokens
from graphrag.index.utils.is_null import is_null
from graphrag.language_model.protocol.base import EmbeddingModel
from graphrag.logger.progress import ProgressTicker, progress_ticker

View File

@@ -8,11 +8,11 @@ import logging
from typing import Any, cast
import pandas as pd
from graphrag_chunking.add_metadata import add_metadata
from graphrag_chunking.chunker import Chunker
from graphrag_chunking.chunker_factory import create_chunker
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
from graphrag.chunking.add_metadata import add_metadata
from graphrag.chunking.chunker import Chunker
from graphrag.chunking.chunker_factory import create_chunker
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.index.typing.context import PipelineRunContext
from graphrag.index.typing.workflow import WorkflowFunctionOutput
@@ -71,7 +71,7 @@ def create_base_text_units(
metadata = row.get("metadata", None)
if prepend_metadata and metadata is not None:
metadata = json.loads(metadata) if isinstance(metadata, str) else metadata
row["chunks"] = [add_metadata(chunk, metadata) for chunk in row["chunks"]]
row["chunks"] = [add_metadata(chunk, metadata, line_delimiter=".\n") for chunk in row["chunks"]]
tick()
logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
return row
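For illustration, the changed line above prepends parsed document metadata to every chunk with an explicit line delimiter. A minimal sketch of that call pattern follows; the sample metadata and chunk strings are hypothetical.
```python
import json

from graphrag_chunking.add_metadata import add_metadata

# Hypothetical row values for illustration only.
metadata = '{"title": "Sample Document"}'
chunks = ["First chunk of text.", "Second chunk of text."]

parsed = json.loads(metadata) if isinstance(metadata, str) else metadata
chunks_with_metadata = [
    add_metadata(chunk, parsed, line_delimiter=".\n") for chunk in chunks
]
```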

View File

@@ -9,10 +9,10 @@ from typing import Any
import numpy as np
import pandas as pd
from graphrag_cache.noop_cache import NoopCache
from graphrag_chunking.chunker_factory import create_chunker
from graphrag_storage import create_storage
from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks
from graphrag.chunking.chunker_factory import create_chunker
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.index.input.factory import InputReaderFactory
from graphrag.index.operations.embed_text.run_embed_text import (

View File

@@ -53,6 +53,7 @@ package = false
members = ["packages/*"]
[tool.uv.sources]
graphrag-chunking = { workspace = true }
graphrag-common = { workspace = true }
graphrag-storage = { workspace = true }
graphrag-cache = { workspace = true }

View File

@@ -3,15 +3,15 @@
from unittest.mock import Mock, patch
from graphrag.chunking.bootstrap_nltk import bootstrap
from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
from graphrag.chunking.chunker_factory import create_chunker
from graphrag.chunking.chunking_config import ChunkingConfig
from graphrag.chunking.token_chunker import (
split_text_on_tokens,
)
from graphrag.tokenizer.get_tokenizer import get_tokenizer
from graphrag.tokenizer.tokenizer import Tokenizer
from graphrag_chunking.bootstrap_nltk import bootstrap
from graphrag_chunking.chunk_strategy_type import ChunkStrategyType
from graphrag_chunking.chunker_factory import create_chunker
from graphrag_chunking.chunking_config import ChunkingConfig
from graphrag_chunking.token_chunker import (
split_text_on_tokens,
)
class MockTokenizer(Tokenizer):
@@ -22,9 +22,6 @@ class MockTokenizer(Tokenizer):
return "".join(chr(id) for id in tokens)
tokenizer = get_tokenizer()
class TestRunSentences:
def setup_method(self, method):
bootstrap()
@@ -73,6 +70,7 @@ class TestRunTokens:
def test_split_text_str_empty():
tokenizer = get_tokenizer()
result = split_text_on_tokens(
"",
chunk_size=5,
@@ -112,9 +110,9 @@ def test_split_text_on_tokens():
assert result == expected_splits
def test_split_text_on_tokens_no_overlap():
def test_split_text_on_tokens_one_overlap():
text = "This is a test text, meaning to be taken seriously by this test only."
tok = get_tokenizer(encoding_model="cl100k_base")
tokenizer = get_tokenizer(encoding_model="o200k_base")
expected_splits = [
"This is",
@@ -125,10 +123,10 @@ def test_split_text_on_tokens_no_overlap():
", meaning",
" meaning to",
" to be",
" be taken", # cspell:disable-line
" taken seriously", # cspell:disable-line
" be taken",
" taken seriously",
" seriously by",
" by this", # cspell:disable-line
" by this",
" this test",
" test only",
" only.",
@@ -138,7 +136,31 @@
text=text,
chunk_size=2,
chunk_overlap=1,
decode=tok.decode,
encode=tok.encode,
decode=tokenizer.decode,
encode=tokenizer.encode,
)
assert result == expected_splits
def test_split_text_on_tokens_no_overlap():
text = "This is a test text, meaning to be taken seriously by this test only."
tokenizer = get_tokenizer(encoding_model="o200k_base")
expected_splits = [
"This is a",
" test text,",
" meaning to be",
" taken seriously by",
" this test only",
".",
]
result = split_text_on_tokens(
text=text,
chunk_size=3,
chunk_overlap=0,
decode=tokenizer.decode,
encode=tokenizer.encode,
)
assert result == expected_splits

View File

@@ -1,7 +1,7 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
from graphrag.chunking.add_metadata import add_metadata
from graphrag_chunking.add_metadata import add_metadata
def test_add_metadata_one_row():

View File

@@ -4,7 +4,6 @@
from dataclasses import asdict
import graphrag.config.defaults as defs
from graphrag.chunking.chunking_config import ChunkingConfig
from graphrag.config.models.basic_search_config import BasicSearchConfig
from graphrag.config.models.cluster_graph_config import ClusterGraphConfig
from graphrag.config.models.community_reports_config import CommunityReportsConfig
@@ -29,6 +28,7 @@ from graphrag.config.models.summarize_descriptions_config import (
)
from graphrag.config.models.vector_store_config import VectorStoreConfig
from graphrag_cache import CacheConfig
from graphrag_chunking.chunking_config import ChunkingConfig
from graphrag_storage import StorageConfig
from pydantic import BaseModel

uv.lock (generated, 2482 changes)

File diff suppressed because it is too large