Set defaults for chunking config

This commit is contained in:
Nathan Evans 2025-12-22 11:25:53 -08:00
parent b32f403e8f
commit d9ba63f4d6
5 changed files with 46 additions and 35 deletions

View File

@ -0,0 +1,19 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""A module containing 'add_metadata' function."""
def add_metadata(
    text: str,
    metadata: dict,
    delimiter: str = ": ",
    line_delimiter: str = "\n",
    append: bool = False,
) -> str:
    """Add metadata to the given text, prepending by default.

    Each item of ``metadata`` is rendered as one ``key<delimiter>value`` row.
    Rows are joined with ``line_delimiter`` and the rendered block always ends
    with one trailing ``line_delimiter``.

    Args:
        text: The text to decorate.
        metadata: Mapping whose items are written in iteration order.
        delimiter: Separator placed between each key and its value.
        line_delimiter: Separator placed after every rendered row.
        append: When True the metadata block is placed after ``text``;
            otherwise it is placed before it.

    Returns:
        ``text`` combined with the rendered metadata block, or ``text``
        unchanged when ``metadata`` is empty.
    """
    if not metadata:
        # Nothing to render: without this guard an empty mapping would still
        # contribute a lone line_delimiter to the result.
        return text
    metadata_str = (
        line_delimiter.join(f"{k}{delimiter}{v}" for k, v in metadata.items())
        + line_delimiter
    )
    # Note: in append mode no separator is inserted between text and the
    # first metadata row; the block is concatenated directly.
    return text + metadata_str if append else metadata_str + text

View File

@ -18,19 +18,19 @@ class ChunkingConfig(BaseModel):
description="The chunking strategy to use.",
default=ChunkStrategyType.Tokens,
)
size: int | None = Field(
description="The chunk size to use.",
default=None,
)
overlap: int | None = Field(
description="The chunk overlap to use.",
default=None,
)
encoding_model: str | None = Field(
description="The encoding model to use.",
default=None,
)
prepend_metadata: bool | None = Field(
# Chunk size is a plain int with a concrete default of 1200.
# NOTE(review): the unit is not stated here — presumably tokens under the
# default Tokens strategy; confirm against the chunker implementation.
size: int = Field(
description="The chunk size to use.",
default=1200,
)
# Chunk overlap is a plain int with a concrete default of 100.
# NOTE(review): presumably expressed in the same unit as `size` — confirm.
overlap: int = Field(
description="The chunk overlap to use.",
default=100,
)
# Controls whether metadata is prepended into each chunk; disabled by default.
prepend_metadata: bool = Field(
description="Prepend metadata into each chunk.",
default=False,
)

View File

@ -1,15 +0,0 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""A module containing 'prepend_metadata' function."""
def prepend_metadata(
    text: str, metadata: dict, delimiter: str = ": ", line_delimiter: str = "\n"
) -> str:
    """Return ``text`` with the metadata rows placed in front of it.

    Every item of ``metadata`` becomes one ``key<delimiter>value`` row; each
    row, including the last, is followed by ``line_delimiter``.
    """
    rows = [f"{key}{delimiter}{value}" for key, value in metadata.items()]
    header = line_delimiter.join(rows) + line_delimiter
    return header + text

View File

@ -10,9 +10,9 @@ from typing import Any, cast
import pandas as pd
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
from graphrag.chunking.add_metadata import add_metadata
from graphrag.chunking.chunker import Chunker
from graphrag.chunking.chunker_factory import create_chunker
from graphrag.chunking.prepend_metadata import prepend_metadata as prepend_metadata_fn
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.index.typing.context import PipelineRunContext
from graphrag.index.typing.workflow import WorkflowFunctionOutput
@ -71,9 +71,7 @@ def create_base_text_units(
metadata = row.get("metadata", None)
if prepend_metadata and metadata is not None:
metadata = json.loads(metadata) if isinstance(metadata, str) else metadata
row["chunks"] = [
prepend_metadata_fn(chunk, metadata) for chunk in row["chunks"]
]
row["chunks"] = [add_metadata(chunk, metadata) for chunk in row["chunks"]]
tick()
logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
return row

View File

@ -1,33 +1,42 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
from graphrag.chunking.prepend_metadata import prepend_metadata
from graphrag.chunking.add_metadata import add_metadata
def test_prepend_metadata_one_row():
def test_add_metadata_one_row():
"""Test that a single metadata entry is prepended to each chunk by default."""
chunks = ["This is a test.", "Another sentence."]
metadata = {"message": "hello"}
results = [prepend_metadata(chunk, metadata) for chunk in chunks]
results = [add_metadata(chunk, metadata) for chunk in chunks]
assert results[0] == "message: hello\nThis is a test."
assert results[1] == "message: hello\nAnother sentence."
def test_prepend_metadata_multiple_rows():
def test_add_metadata_one_row_append():
"""Test that metadata is appended after each chunk when append=True."""
chunks = ["This is a test.", "Another sentence."]
metadata = {"message": "hello"}
results = [add_metadata(chunk, metadata, append=True) for chunk in chunks]
# The expected strings pin the exact layout: the metadata row follows the
# chunk text directly (no separator in between) and ends with a newline.
assert results[0] == "This is a test.message: hello\n"
assert results[1] == "Another sentence.message: hello\n"
def test_add_metadata_multiple_rows():
"""Test that several metadata entries are prepended as separate rows, in dict order."""
chunks = ["This is a test.", "Another sentence."]
metadata = {"message": "hello", "tag": "first"}
results = [prepend_metadata(chunk, metadata) for chunk in chunks]
results = [add_metadata(chunk, metadata) for chunk in chunks]
assert results[0] == "message: hello\ntag: first\nThis is a test."
assert results[1] == "message: hello\ntag: first\nAnother sentence."
def test_prepend_metadata_custom_delimiters():
def test_add_metadata_custom_delimiters():
"""Test prepending metadata to chunks"""
chunks = ["This is a test.", "Another sentence."]
metadata = {"message": "hello", "tag": "first"}
results = [
prepend_metadata(chunk, metadata, delimiter="-", line_delimiter="_")
add_metadata(chunk, metadata, delimiter="-", line_delimiter="_")
for chunk in chunks
]
assert results[0] == "message-hello_tag-first_This is a test."