Set defaults for chunking config

2026-01-14 00:57:23 +08:00 · 2025-12-22 11:25:53 -08:00 · 2025-12-22 11:25:53 -08:00 · d9ba63f4d6
commit d9ba63f4d6
parent b32f403e8f
5 changed files with 46 additions and 35 deletions
--- a/packages/graphrag/graphrag/chunking/add_metadata.py
+++ b/packages/graphrag/graphrag/chunking/add_metadata.py
@ -0,0 +1,19 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""A module containing 'add_metadata' function."""
+
+
+def add_metadata(
+    text: str,
+    metadata: dict,
+    delimiter: str = ": ",
+    line_delimiter: str = "\n",
+    append: bool = False,
+) -> str:
+    """Add metadata to the given text, prepending by default. This utility writes the dict as rows of key/value pairs."""
+    metadata_str = (
+        line_delimiter.join(f"{k}{delimiter}{v}" for k, v in metadata.items())
+        + line_delimiter
+    )
+    return text + metadata_str if append else metadata_str + text
--- a/packages/graphrag/graphrag/chunking/chunking_config.py
+++ b/packages/graphrag/graphrag/chunking/chunking_config.py
@ -18,19 +18,19 @@ class ChunkingConfig(BaseModel):
        description="The chunking strategy to use.",
        default=ChunkStrategyType.Tokens,
    )
-    size: int | None = Field(
-        description="The chunk size to use.",
-        default=None,
-    )
-    overlap: int | None = Field(
-        description="The chunk overlap to use.",
-        default=None,
-    )
    encoding_model: str | None = Field(
        description="The encoding model to use.",
        default=None,
    )
-    prepend_metadata: bool | None = Field(
+    size: int = Field(
+        description="The chunk size to use.",
+        default=1200,
+    )
+    overlap: int = Field(
+        description="The chunk overlap to use.",
+        default=100,
+    )
+    prepend_metadata: bool = Field(
        description="Prepend metadata into each chunk.",
        default=False,
    )
--- a/packages/graphrag/graphrag/chunking/prepend_metadata.py
+++ b/packages/graphrag/graphrag/chunking/prepend_metadata.py
@ -1,15 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""A module containing 'prepend_metadata' function."""
-
-
-def prepend_metadata(
-    text: str, metadata: dict, delimiter: str = ": ", line_delimiter: str = "\n"
-) -> str:
-    """Prepend metadata to the given text. This utility writes the dict as rows of key/value pairs."""
-    metadata_str = (
-        line_delimiter.join(f"{k}{delimiter}{v}" for k, v in metadata.items())
-        + line_delimiter
-    )
-    return metadata_str + text
--- a/packages/graphrag/graphrag/index/workflows/create_base_text_units.py
+++ b/packages/graphrag/graphrag/index/workflows/create_base_text_units.py
@ -10,9 +10,9 @@ from typing import Any, cast
 import pandas as pd

 from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
+from graphrag.chunking.add_metadata import add_metadata
 from graphrag.chunking.chunker import Chunker
 from graphrag.chunking.chunker_factory import create_chunker
-from graphrag.chunking.prepend_metadata import prepend_metadata as prepend_metadata_fn
 from graphrag.config.models.graph_rag_config import GraphRagConfig
 from graphrag.index.typing.context import PipelineRunContext
 from graphrag.index.typing.workflow import WorkflowFunctionOutput
@ -71,9 +71,7 @@ def create_base_text_units(
        metadata = row.get("metadata", None)
        if prepend_metadata and metadata is not None:
            metadata = json.loads(metadata) if isinstance(metadata, str) else metadata
-            row["chunks"] = [
-                prepend_metadata_fn(chunk, metadata) for chunk in row["chunks"]
-            ]
+            row["chunks"] = [add_metadata(chunk, metadata) for chunk in row["chunks"]]
        tick()
        logger.info("chunker progress:  %d/%d", row_index + 1, total_rows)
        return row
--- a/tests/unit/chunking/test_prepend_metadata.py
+++ b/tests/unit/chunking/test_prepend_metadata.py
@ -1,33 +1,42 @@
 # Copyright (c) 2024 Microsoft Corporation.
 # Licensed under the MIT License

-from graphrag.chunking.prepend_metadata import prepend_metadata
+from graphrag.chunking.add_metadata import add_metadata


-def test_prepend_metadata_one_row():
+def test_add_metadata_one_row():
    """Test prepending metadata to chunks"""
    chunks = ["This is a test.", "Another sentence."]
    metadata = {"message": "hello"}
-    results = [prepend_metadata(chunk, metadata) for chunk in chunks]
+    results = [add_metadata(chunk, metadata) for chunk in chunks]
    assert results[0] == "message: hello\nThis is a test."
    assert results[1] == "message: hello\nAnother sentence."


-def test_prepend_metadata_multiple_rows():
+def test_add_metadata_one_row_append():
+    """Test appending metadata to chunks"""
+    chunks = ["This is a test.", "Another sentence."]
+    metadata = {"message": "hello"}
+    results = [add_metadata(chunk, metadata, append=True) for chunk in chunks]
+    assert results[0] == "This is a test.message: hello\n"
+    assert results[1] == "Another sentence.message: hello\n"
+
+
+def test_add_metadata_multiple_rows():
    """Test prepending metadata to chunks"""
    chunks = ["This is a test.", "Another sentence."]
    metadata = {"message": "hello", "tag": "first"}
-    results = [prepend_metadata(chunk, metadata) for chunk in chunks]
+    results = [add_metadata(chunk, metadata) for chunk in chunks]
    assert results[0] == "message: hello\ntag: first\nThis is a test."
    assert results[1] == "message: hello\ntag: first\nAnother sentence."


-def test_prepend_metadata_custom_delimiters():
+def test_add_metadata_custom_delimiters():
    """Test prepending metadata to chunks"""
    chunks = ["This is a test.", "Another sentence."]
    metadata = {"message": "hello", "tag": "first"}
    results = [
-        prepend_metadata(chunk, metadata, delimiter="-", line_delimiter="_")
+        add_metadata(chunk, metadata, delimiter="-", line_delimiter="_")
        for chunk in chunks
    ]
    assert results[0] == "message-hello_tag-first_This is a test."