Align input config type name with other factory configs

This commit is contained in:
Nathan Evans 2026-01-08 15:41:28 -08:00
parent 2f6d075b97
commit a671aa4fe4
25 changed files with 78 additions and 65 deletions

View File

@ -87,9 +87,9 @@ Our pipeline can ingest .csv, .txt, or .json data from an input location. See th
- `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
- `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.
- `file_type` **text|csv|json** - The type of input data to load. Default is `text`
- `type` **text|csv|json** - The type of input data to load. Default is `text`
- `encoding` **str** - The encoding of the input file. Default is `utf-8`
- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$`, `.*\.txt$`, or `.*\.json$` depending on the specified `file_type`, but you can customize it if needed.
- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$`, `.*\.txt$`, or `.*\.json$` depending on the specified `type`, but you can customize it if needed.
- `text_column` **str** - (CSV/JSON only) The text column name. If unset we expect a column named `text`.
- `title_column` **str** - (CSV/JSON only) The title column name, filename will be used if unset.
- `metadata` **list[str]** - (CSV/JSON only) The additional document attributes fields to keep.

View File

@ -116,7 +116,7 @@ settings.yaml
```yaml
input:
file_type: text
type: text
metadata: [title]
chunks:
@ -194,7 +194,7 @@ settings.yaml
```yaml
input:
file_type: json
type: json
title_column: headline
text_column: content

View File

@ -13,12 +13,12 @@ This package provides input document loading utilities for GraphRAG, supporting
Basic usage with the factory:
```python
from graphrag_input import create_input_reader, InputConfig, InputFileType
from graphrag_input import create_input_reader, InputConfig, InputType
from graphrag_storage import StorageConfig, create_storage
config = InputConfig(
storage=StorageConfig(base_dir="./input"),
file_type=InputFileType.Csv,
type=InputType.Csv,
text_column="content",
title_column="title",
)

View File

@ -5,15 +5,15 @@
from graphrag_input.get_property import get_property
from graphrag_input.input_config import InputConfig
from graphrag_input.input_file_type import InputFileType
from graphrag_input.input_reader import InputReader
from graphrag_input.input_reader_factory import create_input_reader
from graphrag_input.input_type import InputType
from graphrag_input.text_document import TextDocument
__all__ = [
"InputConfig",
"InputFileType",
"InputReader",
"InputType",
"TextDocument",
"create_input_reader",
"get_property",

View File

@ -15,6 +15,12 @@ logger = logging.getLogger(__name__)
class CSVFileReader(StructuredFileReader):
"""Reader implementation for csv files."""
def __init__(self, file_pattern: str | None = None, **kwargs):
super().__init__(
file_pattern=file_pattern if file_pattern is not None else ".*\\.csv$",
**kwargs,
)
async def read_file(self, path: str) -> list[TextDocument]:
"""Read a csv file into a list of documents.

View File

@ -6,7 +6,7 @@
from graphrag_storage import StorageConfig
from pydantic import BaseModel, ConfigDict, Field
from graphrag_input.input_file_type import InputFileType
from graphrag_input.input_type import InputType
class InputConfig(BaseModel):
@ -19,9 +19,9 @@ class InputConfig(BaseModel):
description="The storage configuration to use for reading input documents.",
default_factory=lambda: StorageConfig(base_dir="input"),
)
file_type: str = Field(
type: str = Field(
description="The input file type to use.",
default=InputFileType.Text,
default=InputType.Text,
)
encoding: str | None = Field(
description="The input file encoding to use.",

View File

@ -24,30 +24,19 @@ class InputReader(metaclass=ABCMeta):
def __init__(
self,
storage: Storage,
file_type: str,
encoding: str = "utf-8",
file_pattern: str | None = None,
**kwargs,
):
self._storage = storage
self._file_type = file_type
self._encoding = encoding
# built-in readers set a default pattern if none is provided
# this is usually just the file type itself, e.g., the file extension
pattern = (
file_pattern if file_pattern is not None else f".*\\.{self._file_type}$"
)
if file_pattern is None and self._file_type == "text":
pattern = ".*\\.txt$"
self._file_pattern = pattern
self._file_pattern = file_pattern
async def read_files(self) -> list[TextDocument]:
"""Load files from storage and apply a loader function based on file type. Process metadata on the results if needed."""
files = list(self._storage.find(re.compile(self._file_pattern)))
if len(files) == 0:
msg = f"No {self._file_type} files found in storage" # TODO: use a storage __str__ to define it per impl
msg = f"No {self._file_pattern} matches found in storage"
logger.warning(msg)
files = []
@ -63,11 +52,11 @@ class InputReader(metaclass=ABCMeta):
logger.info(
"Found %d %s files, loading %d",
len(files),
self._file_type,
self._file_pattern,
len(documents),
)
total_files_log = (
f"Total number of unfiltered {self._file_type} rows: {len(documents)}"
f"Total number of unfiltered {self._file_pattern} rows: {len(documents)}"
)
logger.info(total_files_log)

View File

@ -11,8 +11,8 @@ from graphrag_common.factory.factory import ServiceScope
from graphrag_storage.storage import Storage
from graphrag_input.input_config import InputConfig
from graphrag_input.input_file_type import InputFileType
from graphrag_input.input_reader import InputReader
from graphrag_input.input_type import InputType
logger = logging.getLogger(__name__)
@ -57,28 +57,28 @@ def create_input_reader(config: InputConfig, storage: Storage) -> InputReader:
The created input reader implementation.
"""
config_model = config.model_dump()
input_strategy = config.file_type
input_strategy = config.type
if input_strategy not in input_reader_factory:
match input_strategy:
case InputFileType.Csv:
case InputType.Csv:
from graphrag_input.csv import CSVFileReader
register_input_reader(InputFileType.Csv, CSVFileReader)
case InputFileType.Text:
register_input_reader(InputType.Csv, CSVFileReader)
case InputType.Text:
from graphrag_input.text import TextFileReader
register_input_reader(InputFileType.Text, TextFileReader)
case InputFileType.Json:
register_input_reader(InputType.Text, TextFileReader)
case InputType.Json:
from graphrag_input.json import JSONFileReader
register_input_reader(InputFileType.Json, JSONFileReader)
case InputFileType.JsonLines:
register_input_reader(InputType.Json, JSONFileReader)
case InputType.JsonLines:
from graphrag_input.jsonl import JSONLinesFileReader
register_input_reader(InputFileType.JsonLines, JSONLinesFileReader)
register_input_reader(InputType.JsonLines, JSONLinesFileReader)
case _:
msg = f"InputConfig.file_type '{input_strategy}' is not registered in the InputReaderFactory. Registered types: {', '.join(input_reader_factory.keys())}."
msg = f"InputConfig.type '{input_strategy}' is not registered in the InputReaderFactory. Registered types: {', '.join(input_reader_factory.keys())}."
raise ValueError(msg)
config_model["storage"] = storage

View File

@ -6,7 +6,7 @@
from enum import StrEnum
class InputFileType(StrEnum):
class InputType(StrEnum):
"""The input file type for the pipeline."""
Csv = "csv"

View File

@ -15,6 +15,12 @@ logger = logging.getLogger(__name__)
class JSONFileReader(StructuredFileReader):
"""Reader implementation for json files."""
def __init__(self, file_pattern: str | None = None, **kwargs):
super().__init__(
file_pattern=file_pattern if file_pattern is not None else ".*\\.json$",
**kwargs,
)
async def read_file(self, path: str) -> list[TextDocument]:
"""Read a JSON file into a list of documents.

View File

@ -15,6 +15,12 @@ logger = logging.getLogger(__name__)
class JSONLinesFileReader(StructuredFileReader):
"""Reader implementation for json lines files."""
def __init__(self, file_pattern: str | None = None, **kwargs):
super().__init__(
file_pattern=file_pattern if file_pattern is not None else ".*\\.jsonl$",
**kwargs,
)
async def read_file(self, path: str) -> list[TextDocument]:
"""Read a JSON lines file into a list of documents.

View File

@ -16,6 +16,12 @@ logger = logging.getLogger(__name__)
class TextFileReader(InputReader):
"""Reader implementation for text files."""
def __init__(self, file_pattern: str | None = None, **kwargs):
super().__init__(
file_pattern=file_pattern if file_pattern is not None else ".*\\.txt$",
**kwargs,
)
async def read_file(self, path: str) -> list[TextDocument]:
"""Read a text file into a DataFrame of documents.

View File

@ -9,7 +9,7 @@ from typing import ClassVar
from graphrag_cache import CacheType
from graphrag_chunking.chunk_strategy_type import ChunkerType
from graphrag_input import InputFileType
from graphrag_input import InputType
from graphrag_storage import StorageType
from graphrag.config.embeddings import default_embeddings
@ -237,7 +237,7 @@ class InputDefaults:
"""Default values for input."""
storage: InputStorageDefaults = field(default_factory=InputStorageDefaults)
file_type: ClassVar[InputFileType] = InputFileType.Text
type: ClassVar[InputType] = InputType.Text
encoding: str | None = None
file_pattern: None = None
id_column: None = None

View File

@ -52,7 +52,7 @@ input:
storage:
type: {graphrag_config_defaults.input.storage.type} # [file, blob, cosmosdb]
base_dir: "{graphrag_config_defaults.input.storage.base_dir}"
file_type: {graphrag_config_defaults.input.file_type.value} # [csv, text, json]
type: {graphrag_config_defaults.input.type.value} # [csv, text, json, jsonl]
chunking:
type: {graphrag_config_defaults.chunking.type}

View File

@ -1,6 +1,6 @@
{
"input_path": "./tests/fixtures/azure",
"input_file_type": "text",
"input_type": "text",
"index_method": "standard",
"workflow_config": {
"skip_assert": true,

View File

@ -14,7 +14,7 @@ input:
connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING}
container_name: azurefixture
base_dir: input
file_type: text
type: text
cache:

View File

@ -1,6 +1,6 @@
{
"input_path": "./tests/fixtures/min-csv",
"input_file_type": "text",
"input_type": "text",
"index_method": "standard",
"workflow_config": {
"load_input_documents": {

View File

@ -35,7 +35,7 @@ vector_store:
overwrite: True
input:
file_type: csv
type: csv
snapshots:
embeddings: true

View File

@ -1,6 +1,6 @@
{
"input_path": "./tests/fixtures/text",
"input_file_type": "text",
"input_type": "text",
"index_method": "fast",
"workflow_config": {
"load_input_documents": {

View File

@ -126,7 +126,7 @@ class TestIndexer:
def __run_indexer(
self,
root: Path,
input_file_type: str,
input_type: str,
index_method: str,
):
command = [
@ -232,7 +232,7 @@ class TestIndexer:
def test_fixture(
self,
input_path: str,
input_file_type: str,
input_type: str,
index_method: str,
workflow_config: dict[str, dict[str, Any]],
query_config: list[dict[str, str]],
@ -248,7 +248,7 @@ class TestIndexer:
dispose = asyncio.run(prepare_azurite_data(input_path, azure))
print("running indexer")
self.__run_indexer(root, input_file_type, index_method)
self.__run_indexer(root, input_type, index_method)
print("indexer complete")
if dispose is not None:

View File

@ -144,7 +144,7 @@ def assert_cache_configs(actual: CacheConfig, expected: CacheConfig) -> None:
def assert_input_configs(actual: InputConfig, expected: InputConfig) -> None:
assert_storage_config(actual.storage, expected.storage)
assert actual.file_type == expected.file_type
assert actual.type == expected.type
assert actual.encoding == expected.encoding
assert actual.file_pattern == expected.file_pattern
assert actual.text_column == expected.text_column

View File

@ -1,7 +1,7 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
from graphrag_input import InputConfig, InputFileType, create_input_reader
from graphrag_input import InputConfig, InputType, create_input_reader
from graphrag_storage import StorageConfig, create_storage
@ -10,7 +10,7 @@ async def test_csv_loader_one_file():
storage=StorageConfig(
base_dir="tests/unit/indexing/input/data/one-csv",
),
file_type=InputFileType.Csv,
type=InputType.Csv,
file_pattern=".*\\.csv$",
)
storage = create_storage(config.storage)
@ -30,7 +30,7 @@ async def test_csv_loader_one_file_with_title():
storage=StorageConfig(
base_dir="tests/unit/indexing/input/data/one-csv",
),
file_type=InputFileType.Csv,
type=InputType.Csv,
title_column="title",
)
storage = create_storage(config.storage)
@ -45,7 +45,7 @@ async def test_csv_loader_multiple_files():
storage=StorageConfig(
base_dir="tests/unit/indexing/input/data/multiple-csvs",
),
file_type=InputFileType.Csv,
type=InputType.Csv,
)
storage = create_storage(config.storage)
reader = create_input_reader(config, storage)

View File

@ -1,7 +1,7 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
from graphrag_input import InputConfig, InputFileType, create_input_reader
from graphrag_input import InputConfig, InputType, create_input_reader
from graphrag_storage import StorageConfig, create_storage
@ -10,7 +10,7 @@ async def test_json_loader_one_file_one_object():
storage=StorageConfig(
base_dir="tests/unit/indexing/input/data/one-json-one-object",
),
file_type=InputFileType.Json,
type=InputType.Json,
file_pattern=".*\\.json$",
)
storage = create_storage(config.storage)
@ -29,7 +29,7 @@ async def test_json_loader_one_file_multiple_objects():
storage=StorageConfig(
base_dir="tests/unit/indexing/input/data/one-json-multiple-objects",
),
file_type=InputFileType.Json,
type=InputType.Json,
)
storage = create_storage(config.storage)
reader = create_input_reader(config, storage)
@ -44,7 +44,7 @@ async def test_json_loader_one_file_with_title():
storage=StorageConfig(
base_dir="tests/unit/indexing/input/data/one-json-one-object",
),
file_type=InputFileType.Json,
type=InputType.Json,
title_column="title",
)
storage = create_storage(config.storage)
@ -59,7 +59,7 @@ async def test_json_loader_multiple_files():
storage=StorageConfig(
base_dir="tests/unit/indexing/input/data/multiple-jsons",
),
file_type=InputFileType.Json,
type=InputType.Json,
)
storage = create_storage(config.storage)
reader = create_input_reader(config, storage)

View File

@ -1,7 +1,7 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
from graphrag_input import InputConfig, InputFileType, create_input_reader
from graphrag_input import InputConfig, InputType, create_input_reader
from graphrag_storage import StorageConfig, create_storage
@ -10,7 +10,7 @@ async def test_jsonl_loader_one_file_multiple_objects():
storage=StorageConfig(
base_dir="tests/unit/indexing/input/data/one-jsonl",
),
file_type=InputFileType.JsonLines,
type=InputType.JsonLines,
file_pattern=".*\\.jsonl$",
)
storage = create_storage(config.storage)
@ -30,7 +30,7 @@ async def test_jsonl_loader_one_file_with_title():
storage=StorageConfig(
base_dir="tests/unit/indexing/input/data/one-jsonl",
),
file_type=InputFileType.JsonLines,
type=InputType.JsonLines,
title_column="title",
)
storage = create_storage(config.storage)

View File

@ -1,7 +1,7 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
from graphrag_input import InputConfig, InputFileType, create_input_reader
from graphrag_input import InputConfig, InputType, create_input_reader
from graphrag_storage import StorageConfig, create_storage
@ -10,7 +10,7 @@ async def test_txt_loader_one_file():
storage=StorageConfig(
base_dir="tests/unit/indexing/input/data/one-txt",
),
file_type=InputFileType.Text,
type=InputType.Text,
file_pattern=".*\\.txt$",
)
storage = create_storage(config.storage)
@ -26,7 +26,7 @@ async def test_txt_loader_multiple_files():
storage=StorageConfig(
base_dir="tests/unit/indexing/input/data/multiple-txts",
),
file_type=InputFileType.Text,
type=InputType.Text,
file_pattern=".*\\.txt$",
)
storage = create_storage(config.storage)