mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 00:57:23 +08:00
Move chunking to monorepo package
This commit is contained in:
parent
d9ba63f4d6
commit
bd968f2710
32
packages/graphrag-chunking/README.md
Normal file
32
packages/graphrag-chunking/README.md
Normal file
@ -0,0 +1,32 @@
|
||||
# GraphRAG Chunking
|
||||
|
||||
This package contains a collection of text chunkers, a core config model, and a factory for acquiring instances.
|
||||
|
||||
## Examples
|
||||
|
||||
Basic sentence chunking with nltk
|
||||
```python
|
||||
chunker = SentenceChunker()
|
||||
chunks = chunker.chunk("This is a test. Another sentence.")
|
||||
print(chunks) # ["This is a test.", "Another sentence."]
|
||||
```
|
||||
|
||||
Token chunking
|
||||
```python
|
||||
tokenizer = tiktoken.get_encoding("o200k_base")
|
||||
chunker = TokenChunker(size=3, overlap=0, encode=tokenizer.encode, decode=tokenizer.decode)
|
||||
chunks = chunker.chunk("This is a random test fragment of some text")
|
||||
print(chunks) # ["This is a", " random test fragment", " of some text"]
|
||||
```
|
||||
|
||||
Using the factory via helper util
|
||||
```python
|
||||
tokenizer = tiktoken.get_encoding("o200k_base")
|
||||
config = ChunkingConfig(
|
||||
strategy="tokens",
|
||||
size=3,
|
||||
overlap=0
|
||||
)
|
||||
chunker = create_chunker(config, tokenizer.encode, tokenizer.decode)
|
||||
...
|
||||
```
|
||||
@ -7,9 +7,9 @@ from collections.abc import Callable
|
||||
|
||||
from graphrag_common.factory.factory import Factory, ServiceScope
|
||||
|
||||
from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
|
||||
from graphrag.chunking.chunker import Chunker
|
||||
from graphrag.chunking.chunking_config import ChunkingConfig
|
||||
from graphrag_chunking.chunk_strategy_type import ChunkStrategyType
|
||||
from graphrag_chunking.chunker import Chunker
|
||||
from graphrag_chunking.chunking_config import ChunkingConfig
|
||||
|
||||
|
||||
class ChunkerFactory(Factory[Chunker]):
|
||||
@ -63,11 +63,11 @@ def create_chunker(
|
||||
if chunker_strategy not in chunker_factory:
|
||||
match chunker_strategy:
|
||||
case ChunkStrategyType.Tokens:
|
||||
from graphrag.chunking.token_chunker import TokenChunker
|
||||
from graphrag_chunking.token_chunker import TokenChunker
|
||||
|
||||
register_chunker(ChunkStrategyType.Tokens, TokenChunker)
|
||||
case ChunkStrategyType.Sentence:
|
||||
from graphrag.chunking.sentence_chunker import SentenceChunker
|
||||
from graphrag_chunking.sentence_chunker import SentenceChunker
|
||||
|
||||
register_chunker(ChunkStrategyType.Sentence, SentenceChunker)
|
||||
case _:
|
||||
@ -5,7 +5,7 @@
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
|
||||
from graphrag_chunking.chunk_strategy_type import ChunkStrategyType
|
||||
|
||||
|
||||
class ChunkingConfig(BaseModel):
|
||||
@ -7,8 +7,8 @@ from typing import Any
|
||||
|
||||
import nltk
|
||||
|
||||
from graphrag.chunking.bootstrap_nltk import bootstrap
|
||||
from graphrag.chunking.chunker import Chunker
|
||||
from graphrag_chunking.bootstrap_nltk import bootstrap
|
||||
from graphrag_chunking.chunker import Chunker
|
||||
|
||||
|
||||
class SentenceChunker(Chunker):
|
||||
@ -6,7 +6,7 @@
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
|
||||
from graphrag.chunking.chunker import Chunker
|
||||
from graphrag_chunking.chunker import Chunker
|
||||
|
||||
|
||||
class TokenChunker(Chunker):
|
||||
43
packages/graphrag-chunking/pyproject.toml
Normal file
43
packages/graphrag-chunking/pyproject.toml
Normal file
@ -0,0 +1,43 @@
|
||||
[project]
|
||||
name = "graphrag-chunking"
|
||||
version = "2.7.0"
|
||||
description = "Chunking utilities for GraphRAG"
|
||||
authors = [
|
||||
{name = "Alonso Guevara Fernández", email = "alonsog@microsoft.com"},
|
||||
{name = "Andrés Morales Esquivel", email = "andresmor@microsoft.com"},
|
||||
{name = "Chris Trevino", email = "chtrevin@microsoft.com"},
|
||||
{name = "David Tittsworth", email = "datittsw@microsoft.com"},
|
||||
{name = "Dayenne de Souza", email = "ddesouza@microsoft.com"},
|
||||
{name = "Derek Worthen", email = "deworthe@microsoft.com"},
|
||||
{name = "Gaudy Blanco Meneses", email = "gaudyb@microsoft.com"},
|
||||
{name = "Ha Trinh", email = "trinhha@microsoft.com"},
|
||||
{name = "Jonathan Larson", email = "jolarso@microsoft.com"},
|
||||
{name = "Josh Bradley", email = "joshbradley@microsoft.com"},
|
||||
{name = "Kate Lytvynets", email = "kalytv@microsoft.com"},
|
||||
{name = "Kenny Zhang", email = "zhangken@microsoft.com"},
|
||||
{name = "Mónica Carvajal"},
|
||||
{name = "Nathan Evans", email = "naevans@microsoft.com"},
|
||||
{name = "Rodrigo Racanicci", email = "rracanicci@microsoft.com"},
|
||||
{name = "Sarah Smith", email = "smithsarah@microsoft.com"},
|
||||
]
|
||||
license = {text = "MIT"}
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11,<3.14"
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
]
|
||||
dependencies = [
|
||||
"graphrag-common==2.7.0",
|
||||
"pydantic~=2.10",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Source = "https://github.com/microsoft/graphrag"
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling>=1.27.0,<2.0.0"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
@ -8,9 +8,9 @@ from pathlib import Path
|
||||
from typing import ClassVar
|
||||
|
||||
from graphrag_cache import CacheType
|
||||
from graphrag_chunking.chunk_strategy_type import ChunkStrategyType
|
||||
from graphrag_storage import StorageType
|
||||
|
||||
from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
|
||||
from graphrag.config.embeddings import default_embeddings
|
||||
from graphrag.config.enums import (
|
||||
AsyncType,
|
||||
|
||||
@ -8,11 +8,11 @@ from pathlib import Path
|
||||
|
||||
from devtools import pformat
|
||||
from graphrag_cache import CacheConfig
|
||||
from graphrag_chunking.chunking_config import ChunkingConfig
|
||||
from graphrag_storage import StorageConfig, StorageType
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
|
||||
import graphrag.config.defaults as defs
|
||||
from graphrag.chunking.chunking_config import ChunkingConfig
|
||||
from graphrag.config.defaults import graphrag_config_defaults
|
||||
from graphrag.config.enums import VectorStoreType
|
||||
from graphrag.config.models.basic_search_config import BasicSearchConfig
|
||||
|
||||
@ -8,9 +8,9 @@ import logging
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
from graphrag_chunking.token_chunker import split_text_on_tokens
|
||||
|
||||
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
|
||||
from graphrag.chunking.token_chunker import split_text_on_tokens
|
||||
from graphrag.index.utils.is_null import is_null
|
||||
from graphrag.language_model.protocol.base import EmbeddingModel
|
||||
from graphrag.logger.progress import ProgressTicker, progress_ticker
|
||||
|
||||
@ -8,11 +8,11 @@ import logging
|
||||
from typing import Any, cast
|
||||
|
||||
import pandas as pd
|
||||
from graphrag_chunking.add_metadata import add_metadata
|
||||
from graphrag_chunking.chunker import Chunker
|
||||
from graphrag_chunking.chunker_factory import create_chunker
|
||||
|
||||
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
|
||||
from graphrag.chunking.add_metadata import add_metadata
|
||||
from graphrag.chunking.chunker import Chunker
|
||||
from graphrag.chunking.chunker_factory import create_chunker
|
||||
from graphrag.config.models.graph_rag_config import GraphRagConfig
|
||||
from graphrag.index.typing.context import PipelineRunContext
|
||||
from graphrag.index.typing.workflow import WorkflowFunctionOutput
|
||||
@ -71,7 +71,7 @@ def create_base_text_units(
|
||||
metadata = row.get("metadata", None)
|
||||
if prepend_metadata and metadata is not None:
|
||||
metadata = json.loads(metadata) if isinstance(metadata, str) else metadata
|
||||
row["chunks"] = [add_metadata(chunk, metadata) for chunk in row["chunks"]]
|
||||
row["chunks"] = [add_metadata(chunk, metadata, line_delimiter=".\n") for chunk in row["chunks"]]
|
||||
tick()
|
||||
logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
|
||||
return row
|
||||
|
||||
@ -9,10 +9,10 @@ from typing import Any
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from graphrag_cache.noop_cache import NoopCache
|
||||
from graphrag_chunking.chunker_factory import create_chunker
|
||||
from graphrag_storage import create_storage
|
||||
|
||||
from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks
|
||||
from graphrag.chunking.chunker_factory import create_chunker
|
||||
from graphrag.config.models.graph_rag_config import GraphRagConfig
|
||||
from graphrag.index.input.factory import InputReaderFactory
|
||||
from graphrag.index.operations.embed_text.run_embed_text import (
|
||||
|
||||
@ -53,6 +53,7 @@ package = false
|
||||
members = ["packages/*"]
|
||||
|
||||
[tool.uv.sources]
|
||||
graphrag-chunking = { workspace = true }
|
||||
graphrag-common = { workspace = true }
|
||||
graphrag-storage = { workspace = true }
|
||||
graphrag-cache = { workspace = true }
|
||||
|
||||
@ -3,15 +3,15 @@
|
||||
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
from graphrag.chunking.bootstrap_nltk import bootstrap
|
||||
from graphrag.chunking.chunk_strategy_type import ChunkStrategyType
|
||||
from graphrag.chunking.chunker_factory import create_chunker
|
||||
from graphrag.chunking.chunking_config import ChunkingConfig
|
||||
from graphrag.chunking.token_chunker import (
|
||||
split_text_on_tokens,
|
||||
)
|
||||
from graphrag.tokenizer.get_tokenizer import get_tokenizer
|
||||
from graphrag.tokenizer.tokenizer import Tokenizer
|
||||
from graphrag_chunking.bootstrap_nltk import bootstrap
|
||||
from graphrag_chunking.chunk_strategy_type import ChunkStrategyType
|
||||
from graphrag_chunking.chunker_factory import create_chunker
|
||||
from graphrag_chunking.chunking_config import ChunkingConfig
|
||||
from graphrag_chunking.token_chunker import (
|
||||
split_text_on_tokens,
|
||||
)
|
||||
|
||||
|
||||
class MockTokenizer(Tokenizer):
|
||||
@ -22,9 +22,6 @@ class MockTokenizer(Tokenizer):
|
||||
return "".join(chr(id) for id in tokens)
|
||||
|
||||
|
||||
tokenizer = get_tokenizer()
|
||||
|
||||
|
||||
class TestRunSentences:
|
||||
def setup_method(self, method):
|
||||
bootstrap()
|
||||
@ -73,6 +70,7 @@ class TestRunTokens:
|
||||
|
||||
|
||||
def test_split_text_str_empty():
|
||||
tokenizer = get_tokenizer()
|
||||
result = split_text_on_tokens(
|
||||
"",
|
||||
chunk_size=5,
|
||||
@ -112,9 +110,9 @@ def test_split_text_on_tokens():
|
||||
assert result == expected_splits
|
||||
|
||||
|
||||
def test_split_text_on_tokens_no_overlap():
|
||||
def test_split_text_on_tokens_one_overlap():
|
||||
text = "This is a test text, meaning to be taken seriously by this test only."
|
||||
tok = get_tokenizer(encoding_model="cl100k_base")
|
||||
tokenizer = get_tokenizer(encoding_model="o200k_base")
|
||||
|
||||
expected_splits = [
|
||||
"This is",
|
||||
@ -125,10 +123,10 @@ def test_split_text_on_tokens_no_overlap():
|
||||
", meaning",
|
||||
" meaning to",
|
||||
" to be",
|
||||
" be taken", # cspell:disable-line
|
||||
" taken seriously", # cspell:disable-line
|
||||
" be taken",
|
||||
" taken seriously",
|
||||
" seriously by",
|
||||
" by this", # cspell:disable-line
|
||||
" by this",
|
||||
" this test",
|
||||
" test only",
|
||||
" only.",
|
||||
@ -138,7 +136,31 @@ def test_split_text_on_tokens_no_overlap():
|
||||
text=text,
|
||||
chunk_size=2,
|
||||
chunk_overlap=1,
|
||||
decode=tok.decode,
|
||||
encode=tok.encode,
|
||||
decode=tokenizer.decode,
|
||||
encode=tokenizer.encode,
|
||||
)
|
||||
assert result == expected_splits
|
||||
|
||||
|
||||
def test_split_text_on_tokens_no_overlap():
|
||||
text = "This is a test text, meaning to be taken seriously by this test only."
|
||||
tokenizer = get_tokenizer(encoding_model="o200k_base")
|
||||
|
||||
expected_splits = [
|
||||
"This is a",
|
||||
" test text,",
|
||||
" meaning to be",
|
||||
" taken seriously by",
|
||||
" this test only",
|
||||
".",
|
||||
]
|
||||
|
||||
result = split_text_on_tokens(
|
||||
text=text,
|
||||
chunk_size=3,
|
||||
chunk_overlap=0,
|
||||
decode=tokenizer.decode,
|
||||
encode=tokenizer.encode,
|
||||
)
|
||||
|
||||
assert result == expected_splits
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# Copyright (c) 2024 Microsoft Corporation.
|
||||
# Licensed under the MIT License
|
||||
|
||||
from graphrag.chunking.add_metadata import add_metadata
|
||||
from graphrag_chunking.add_metadata import add_metadata
|
||||
|
||||
|
||||
def test_add_metadata_one_row():
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
from dataclasses import asdict
|
||||
|
||||
import graphrag.config.defaults as defs
|
||||
from graphrag.chunking.chunking_config import ChunkingConfig
|
||||
from graphrag.config.models.basic_search_config import BasicSearchConfig
|
||||
from graphrag.config.models.cluster_graph_config import ClusterGraphConfig
|
||||
from graphrag.config.models.community_reports_config import CommunityReportsConfig
|
||||
@ -29,6 +28,7 @@ from graphrag.config.models.summarize_descriptions_config import (
|
||||
)
|
||||
from graphrag.config.models.vector_store_config import VectorStoreConfig
|
||||
from graphrag_cache import CacheConfig
|
||||
from graphrag_chunking.chunking_config import ChunkingConfig
|
||||
from graphrag_storage import StorageConfig
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user