mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 00:57:23 +08:00
Set defaults for chunking config
This commit is contained in:
parent
b32f403e8f
commit
d9ba63f4d6
19
packages/graphrag/graphrag/chunking/add_metadata.py
Normal file
19
packages/graphrag/graphrag/chunking/add_metadata.py
Normal file
@ -0,0 +1,19 @@
|
||||
# Copyright (c) 2024 Microsoft Corporation.
|
||||
# Licensed under the MIT License
|
||||
|
||||
"""A module containing 'prepend_metadata' function."""
|
||||
|
||||
|
||||
def add_metadata(
    text: str,
    metadata: dict,
    delimiter: str = ": ",
    line_delimiter: str = "\n",
    append: bool = False,
) -> str:
    """Add metadata to the given text, prepending by default.

    The metadata dict is written as rows of key/value pairs: each row is
    ``key + delimiter + value`` and every row, including the last one, is
    terminated by ``line_delimiter``.

    Args:
        text: The text to decorate.
        metadata: Mapping whose items are rendered as rows, in iteration order.
        delimiter: Separator placed between a key and its value.
        line_delimiter: Separator placed after each row.
        append: When True the metadata rows are placed after the text instead
            of before it. No extra separator is inserted between the text and
            the first row.

    Returns:
        The text combined with the rendered metadata rows, or the text
        unchanged when there is no metadata to write.
    """
    # With no rows to render, joining yields "" and the text would otherwise
    # gain a lone line_delimiter and nothing else.
    if not metadata:
        return text

    metadata_str = (
        line_delimiter.join(f"{k}{delimiter}{v}" for k, v in metadata.items())
        + line_delimiter
    )
    return text + metadata_str if append else metadata_str + text
|
||||
@ -18,19 +18,19 @@ class ChunkingConfig(BaseModel):
|
||||
description="The chunking strategy to use.",
|
||||
default=ChunkStrategyType.Tokens,
|
||||
)
|
||||
size: int | None = Field(
|
||||
description="The chunk size to use.",
|
||||
default=None,
|
||||
)
|
||||
overlap: int | None = Field(
|
||||
description="The chunk overlap to use.",
|
||||
default=None,
|
||||
)
|
||||
encoding_model: str | None = Field(
|
||||
description="The encoding model to use.",
|
||||
default=None,
|
||||
)
|
||||
prepend_metadata: bool | None = Field(
|
||||
size: int = Field(
|
||||
description="The chunk size to use.",
|
||||
default=1200,
|
||||
)
|
||||
overlap: int = Field(
|
||||
description="The chunk overlap to use.",
|
||||
default=100,
|
||||
)
|
||||
prepend_metadata: bool = Field(
|
||||
description="Prepend metadata into each chunk.",
|
||||
default=False,
|
||||
)
|
||||
|
||||
@ -1,15 +0,0 @@
|
||||
# Copyright (c) 2024 Microsoft Corporation.
|
||||
# Licensed under the MIT License
|
||||
|
||||
"""A module containing 'prepend_metadata' function."""
|
||||
|
||||
|
||||
def prepend_metadata(
    text: str, metadata: dict, delimiter: str = ": ", line_delimiter: str = "\n"
) -> str:
    """Return the text with the metadata written ahead of it as key/value rows.

    Each dict item becomes one ``key + delimiter + value`` row; rows are joined
    with ``line_delimiter`` and one more ``line_delimiter`` follows the last row.
    """
    rows = [f"{key}{delimiter}{value}" for key, value in metadata.items()]
    header = line_delimiter.join(rows) + line_delimiter
    return header + text
|
||||
@ -10,9 +10,9 @@ from typing import Any, cast
|
||||
import pandas as pd
|
||||
|
||||
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
|
||||
from graphrag.chunking.add_metadata import add_metadata
|
||||
from graphrag.chunking.chunker import Chunker
|
||||
from graphrag.chunking.chunker_factory import create_chunker
|
||||
from graphrag.chunking.prepend_metadata import prepend_metadata as prepend_metadata_fn
|
||||
from graphrag.config.models.graph_rag_config import GraphRagConfig
|
||||
from graphrag.index.typing.context import PipelineRunContext
|
||||
from graphrag.index.typing.workflow import WorkflowFunctionOutput
|
||||
@ -71,9 +71,7 @@ def create_base_text_units(
|
||||
metadata = row.get("metadata", None)
|
||||
if prepend_metadata and metadata is not None:
|
||||
metadata = json.loads(metadata) if isinstance(metadata, str) else metadata
|
||||
row["chunks"] = [
|
||||
prepend_metadata_fn(chunk, metadata) for chunk in row["chunks"]
|
||||
]
|
||||
row["chunks"] = [add_metadata(chunk, metadata) for chunk in row["chunks"]]
|
||||
tick()
|
||||
logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
|
||||
return row
|
||||
|
||||
@ -1,33 +1,42 @@
|
||||
# Copyright (c) 2024 Microsoft Corporation.
|
||||
# Licensed under the MIT License
|
||||
|
||||
from graphrag.chunking.prepend_metadata import prepend_metadata
|
||||
from graphrag.chunking.add_metadata import add_metadata
|
||||
|
||||
|
||||
def test_prepend_metadata_one_row():
|
||||
def test_add_metadata_one_row():
|
||||
"""Test prepending metadata to chunks"""
|
||||
chunks = ["This is a test.", "Another sentence."]
|
||||
metadata = {"message": "hello"}
|
||||
results = [prepend_metadata(chunk, metadata) for chunk in chunks]
|
||||
results = [add_metadata(chunk, metadata) for chunk in chunks]
|
||||
assert results[0] == "message: hello\nThis is a test."
|
||||
assert results[1] == "message: hello\nAnother sentence."
|
||||
|
||||
|
||||
def test_prepend_metadata_multiple_rows():
|
||||
def test_add_metadata_one_row_append():
    """Test appending a single metadata row to the end of each chunk"""
    metadata = {"message": "hello"}
    first, second = (
        add_metadata(chunk, metadata, append=True)
        for chunk in ("This is a test.", "Another sentence.")
    )
    # The row is attached directly after the chunk text and keeps its trailing newline.
    assert first == "This is a test.message: hello\n"
    assert second == "Another sentence.message: hello\n"
|
||||
|
||||
|
||||
def test_add_metadata_multiple_rows():
|
||||
"""Test prepending metadata to chunks"""
|
||||
chunks = ["This is a test.", "Another sentence."]
|
||||
metadata = {"message": "hello", "tag": "first"}
|
||||
results = [prepend_metadata(chunk, metadata) for chunk in chunks]
|
||||
results = [add_metadata(chunk, metadata) for chunk in chunks]
|
||||
assert results[0] == "message: hello\ntag: first\nThis is a test."
|
||||
assert results[1] == "message: hello\ntag: first\nAnother sentence."
|
||||
|
||||
|
||||
def test_prepend_metadata_custom_delimiters():
|
||||
def test_add_metadata_custom_delimiters():
|
||||
"""Test prepending metadata to chunks"""
|
||||
chunks = ["This is a test.", "Another sentence."]
|
||||
metadata = {"message": "hello", "tag": "first"}
|
||||
results = [
|
||||
prepend_metadata(chunk, metadata, delimiter="-", line_delimiter="_")
|
||||
add_metadata(chunk, metadata, delimiter="-", line_delimiter="_")
|
||||
for chunk in chunks
|
||||
]
|
||||
assert results[0] == "message-hello_tag-first_This is a test."
|
||||
|
||||
Loading…
Reference in New Issue
Block a user