Mirror of https://github.com/microsoft/graphrag.git (synced 2026-01-14 00:57:23 +08:00)
Align input config type name with other factory configs

parent 2f6d075b97
commit a671aa4fe4
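In short, the commit renames `InputConfig.file_type` to `InputConfig.type` and the `InputFileType` enum to `InputType`, matching the `type` field used by the other factory configs (e.g., `storage.type` and `chunking.type` in the settings template below). A minimal before/after sketch, adapted from the README change in this diff:

```python
from graphrag_input import InputConfig, InputType

# Old spelling (removed by this commit):
#   config = InputConfig(file_type=InputFileType.Csv)
# New spelling:
config = InputConfig(type=InputType.Csv)
```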
@@ -87,9 +87,9 @@ Our pipeline can ingest .csv, .txt, or .json data from an input location. See th
 - `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
 - `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
 - `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.
-- `file_type` **text|csv|json** - The type of input data to load. Default is `text`
+- `type` **text|csv|json** - The type of input data to load. Default is `text`
 - `encoding` **str** - The encoding of the input file. Default is `utf-8`
-- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$`, `.*\.txt$`, or `.*\.json$` depending on the specified `file_type`, but you can customize it if needed.
+- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$`, `.*\.txt$`, or `.*\.json$` depending on the specified `type`, but you can customize it if needed.
 - `text_column` **str** - (CSV/JSON only) The text column name. If unset we expect a column named `text`.
 - `title_column` **str** - (CSV/JSON only) The title column name, filename will be used if unset.
 - `metadata` **list[str]** - (CSV/JSON only) The additional document attributes fields to keep.
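For orientation, a minimal sketch mapping the options above onto the renamed config object, assuming the `InputConfig` fields shown later in this diff (all values are illustrative; the blob/cosmosdb options in the table are not repeated here):

```python
from graphrag_input import InputConfig, InputType
from graphrag_storage import StorageConfig

config = InputConfig(
    storage=StorageConfig(base_dir="./input"),  # illustrative path
    type=InputType.Csv,          # renamed from file_type in this commit
    encoding="utf-8",
    file_pattern=".*\\.csv$",
    text_column="content",
    title_column="title",
    metadata=["title"],
)
```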
@@ -116,7 +116,7 @@ settings.yaml
 
 ```yaml
 input:
-  file_type: text
+  type: text
   metadata: [title]
 
 chunks:
@@ -194,7 +194,7 @@ settings.yaml
 
 ```yaml
 input:
-  file_type: json
+  type: json
   title_column: headline
   text_column: content
 
@@ -13,12 +13,12 @@ This package provides input document loading utilities for GraphRAG, supporting
 
 Basic usage with the factory:
 ```python
-from graphrag_input import create_input_reader, InputConfig, InputFileType
+from graphrag_input import create_input_reader, InputConfig, InputType
 from graphrag_storage import StorageConfig, create_storage
 
 config = InputConfig(
     storage=StorageConfig(base_dir="./input"),
-    file_type=InputFileType.Csv,
+    type=InputType.Csv,
     text_column="content",
     title_column="title",
 )
@@ -5,15 +5,15 @@
 
 from graphrag_input.get_property import get_property
 from graphrag_input.input_config import InputConfig
-from graphrag_input.input_file_type import InputFileType
 from graphrag_input.input_reader import InputReader
 from graphrag_input.input_reader_factory import create_input_reader
+from graphrag_input.input_type import InputType
 from graphrag_input.text_document import TextDocument
 
 __all__ = [
     "InputConfig",
-    "InputFileType",
     "InputReader",
+    "InputType",
     "TextDocument",
     "create_input_reader",
     "get_property",
@@ -15,6 +15,12 @@ logger = logging.getLogger(__name__)
 class CSVFileReader(StructuredFileReader):
     """Reader implementation for csv files."""
 
+    def __init__(self, file_pattern: str | None = None, **kwargs):
+        super().__init__(
+            file_pattern=file_pattern if file_pattern is not None else ".*\\.csv$",
+            **kwargs,
+        )
+
     async def read_file(self, path: str) -> list[TextDocument]:
         """Read a csv file into a list of documents.
 
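The JSON, JSON Lines, and text readers below gain the same default-pattern `__init__`. As a sketch of the idiom only (this class is hypothetical, not part of the commit), a custom reader would supply its own default the same way:

```python
# Hypothetical reader, illustrative only: mirrors CSVFileReader above by
# passing a subclass-specific default file pattern to the base class.
class MarkdownFileReader(StructuredFileReader):
    """Reader implementation for markdown files (illustrative)."""

    def __init__(self, file_pattern: str | None = None, **kwargs):
        super().__init__(
            file_pattern=file_pattern if file_pattern is not None else ".*\\.md$",
            **kwargs,
        )

    # read_file(path) would still be implemented, as in the readers shown here.
```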
@@ -6,7 +6,7 @@
 from graphrag_storage import StorageConfig
 from pydantic import BaseModel, ConfigDict, Field
 
-from graphrag_input.input_file_type import InputFileType
+from graphrag_input.input_type import InputType
 
 
 class InputConfig(BaseModel):
@@ -19,9 +19,9 @@ class InputConfig(BaseModel):
         description="The storage configuration to use for reading input documents.",
         default_factory=lambda: StorageConfig(base_dir="input"),
     )
-    file_type: str = Field(
+    type: str = Field(
         description="The input file type to use.",
-        default=InputFileType.Text,
+        default=InputType.Text,
     )
     encoding: str | None = Field(
         description="The input file encoding to use.",
@@ -24,30 +24,19 @@ class InputReader(metaclass=ABCMeta):
     def __init__(
         self,
         storage: Storage,
-        file_type: str,
         encoding: str = "utf-8",
         file_pattern: str | None = None,
         **kwargs,
     ):
         self._storage = storage
-        self._file_type = file_type
         self._encoding = encoding
-
-        # built-in readers set a default pattern if none is provided
-        # this is usually just the file type itself, e.g., the file extension
-        pattern = (
-            file_pattern if file_pattern is not None else f".*\\.{self._file_type}$"
-        )
-        if file_pattern is None and self._file_type == "text":
-            pattern = ".*\\.txt$"
-
-        self._file_pattern = pattern
+        self._file_pattern = file_pattern
 
     async def read_files(self) -> list[TextDocument]:
         """Load files from storage and apply a loader function based on file type. Process metadata on the results if needed."""
         files = list(self._storage.find(re.compile(self._file_pattern)))
         if len(files) == 0:
-            msg = f"No {self._file_type} files found in storage"  # TODO: use a storage __str__ to define it per impl
+            msg = f"No {self._file_pattern} matches found in storage"
             logger.warning(msg)
             files = []
@@ -63,11 +52,11 @@ class InputReader(metaclass=ABCMeta):
         logger.info(
             "Found %d %s files, loading %d",
             len(files),
-            self._file_type,
+            self._file_pattern,
             len(documents),
         )
         total_files_log = (
-            f"Total number of unfiltered {self._file_type} rows: {len(documents)}"
+            f"Total number of unfiltered {self._file_pattern} rows: {len(documents)}"
         )
         logger.info(total_files_log)
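With the derived-pattern fallback removed from the base class, `re.compile(self._file_pattern)` now relies on each reader's `__init__` supplying a non-`None` default (as `CSVFileReader` does above and the JSON/JSONL/text readers do below). A quick standalone sanity check of one such default:

```python
import re

# Default pattern supplied by TextFileReader when none is given.
pattern = re.compile(".*\\.txt$")

assert pattern.match("input/book.txt")
assert not pattern.match("input/book.csv")
```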
@@ -11,8 +11,8 @@ from graphrag_common.factory.factory import ServiceScope
 from graphrag_storage.storage import Storage
 
 from graphrag_input.input_config import InputConfig
-from graphrag_input.input_file_type import InputFileType
 from graphrag_input.input_reader import InputReader
+from graphrag_input.input_type import InputType
 
 logger = logging.getLogger(__name__)
 
@@ -57,28 +57,28 @@ def create_input_reader(config: InputConfig, storage: Storage) -> InputReader:
         The created input reader implementation.
     """
     config_model = config.model_dump()
-    input_strategy = config.file_type
+    input_strategy = config.type
 
     if input_strategy not in input_reader_factory:
         match input_strategy:
-            case InputFileType.Csv:
+            case InputType.Csv:
                 from graphrag_input.csv import CSVFileReader
 
-                register_input_reader(InputFileType.Csv, CSVFileReader)
-            case InputFileType.Text:
+                register_input_reader(InputType.Csv, CSVFileReader)
+            case InputType.Text:
                 from graphrag_input.text import TextFileReader
 
-                register_input_reader(InputFileType.Text, TextFileReader)
-            case InputFileType.Json:
+                register_input_reader(InputType.Text, TextFileReader)
+            case InputType.Json:
                 from graphrag_input.json import JSONFileReader
 
-                register_input_reader(InputFileType.Json, JSONFileReader)
-            case InputFileType.JsonLines:
+                register_input_reader(InputType.Json, JSONFileReader)
+            case InputType.JsonLines:
                 from graphrag_input.jsonl import JSONLinesFileReader
 
-                register_input_reader(InputFileType.JsonLines, JSONLinesFileReader)
+                register_input_reader(InputType.JsonLines, JSONLinesFileReader)
             case _:
-                msg = f"InputConfig.file_type '{input_strategy}' is not registered in the InputReaderFactory. Registered types: {', '.join(input_reader_factory.keys())}."
+                msg = f"InputConfig.type '{input_strategy}' is not registered in the InputReaderFactory. Registered types: {', '.join(input_reader_factory.keys())}."
                 raise ValueError(msg)
 
     config_model["storage"] = storage
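Because `InputConfig.type` is typed as a plain `str` (see the `InputConfig` change above), the registry is not limited to the built-in `InputType` values. A hedged sketch of registering a custom reader, assuming `register_input_reader` is importable from `graphrag_input.input_reader_factory` (the module that calls it above) and reusing the hypothetical `MarkdownFileReader` sketched earlier:

```python
from graphrag_input import InputConfig, create_input_reader
# Assumption: register_input_reader is defined in this module, as used above.
from graphrag_input.input_reader_factory import register_input_reader
from graphrag_storage import StorageConfig, create_storage

# Register the hypothetical reader under a custom key.
register_input_reader("markdown", MarkdownFileReader)

config = InputConfig(
    storage=StorageConfig(base_dir="./input"),  # illustrative path
    type="markdown",
    file_pattern=".*\\.md$",
)
storage = create_storage(config.storage)
reader = create_input_reader(config, storage)  # resolves via the registry
```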
@@ -6,7 +6,7 @@
 from enum import StrEnum
 
 
-class InputFileType(StrEnum):
+class InputType(StrEnum):
     """The input file type for the pipeline."""
 
     Csv = "csv"
@@ -15,6 +15,12 @@ logger = logging.getLogger(__name__)
 class JSONFileReader(StructuredFileReader):
     """Reader implementation for json files."""
 
+    def __init__(self, file_pattern: str | None = None, **kwargs):
+        super().__init__(
+            file_pattern=file_pattern if file_pattern is not None else ".*\\.json$",
+            **kwargs,
+        )
+
     async def read_file(self, path: str) -> list[TextDocument]:
         """Read a JSON file into a list of documents.
 
@@ -15,6 +15,12 @@ logger = logging.getLogger(__name__)
 class JSONLinesFileReader(StructuredFileReader):
     """Reader implementation for json lines files."""
 
+    def __init__(self, file_pattern: str | None = None, **kwargs):
+        super().__init__(
+            file_pattern=file_pattern if file_pattern is not None else ".*\\.jsonl$",
+            **kwargs,
+        )
+
     async def read_file(self, path: str) -> list[TextDocument]:
         """Read a JSON lines file into a list of documents.
 
@@ -16,6 +16,12 @@ logger = logging.getLogger(__name__)
 class TextFileReader(InputReader):
     """Reader implementation for text files."""
 
+    def __init__(self, file_pattern: str | None = None, **kwargs):
+        super().__init__(
+            file_pattern=file_pattern if file_pattern is not None else ".*\\.txt$",
+            **kwargs,
+        )
+
     async def read_file(self, path: str) -> list[TextDocument]:
         """Read a text file into a DataFrame of documents.
 
@@ -9,7 +9,7 @@ from typing import ClassVar
 
 from graphrag_cache import CacheType
 from graphrag_chunking.chunk_strategy_type import ChunkerType
-from graphrag_input import InputFileType
+from graphrag_input import InputType
 from graphrag_storage import StorageType
 
 from graphrag.config.embeddings import default_embeddings
@@ -237,7 +237,7 @@ class InputDefaults:
     """Default values for input."""
 
     storage: InputStorageDefaults = field(default_factory=InputStorageDefaults)
-    file_type: ClassVar[InputFileType] = InputFileType.Text
+    type: ClassVar[InputType] = InputType.Text
     encoding: str | None = None
     file_pattern: None = None
     id_column: None = None
@@ -52,7 +52,7 @@ input:
   storage:
     type: {graphrag_config_defaults.input.storage.type} # [file, blob, cosmosdb]
     base_dir: "{graphrag_config_defaults.input.storage.base_dir}"
-  file_type: {graphrag_config_defaults.input.file_type.value} # [csv, text, json]
+  type: {graphrag_config_defaults.input.type.value} # [csv, text, json]
 
 chunking:
   type: {graphrag_config_defaults.chunking.type}
tests/fixtures/azure/config.json (vendored, 2 changes)
@@ -1,6 +1,6 @@
 {
     "input_path": "./tests/fixtures/azure",
-    "input_file_type": "text",
+    "input_type": "text",
     "index_method": "standard",
     "workflow_config": {
         "skip_assert": true,
tests/fixtures/azure/settings.yml (vendored, 2 changes)
@@ -14,7 +14,7 @@ input:
     connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING}
     container_name: azurefixture
     base_dir: input
-  file_type: text
+  type: text
 
 
 cache:
tests/fixtures/min-csv/config.json (vendored, 2 changes)
@@ -1,6 +1,6 @@
 {
     "input_path": "./tests/fixtures/min-csv",
-    "input_file_type": "text",
+    "input_type": "text",
     "index_method": "standard",
     "workflow_config": {
         "load_input_documents": {
tests/fixtures/min-csv/settings.yml (vendored, 2 changes)
@@ -35,7 +35,7 @@ vector_store:
   overwrite: True
 
 input:
-  file_type: csv
+  type: csv
 
 snapshots:
   embeddings: true
tests/fixtures/text/config.json (vendored, 2 changes)
@@ -1,6 +1,6 @@
 {
     "input_path": "./tests/fixtures/text",
-    "input_file_type": "text",
+    "input_type": "text",
     "index_method": "fast",
     "workflow_config": {
         "load_input_documents": {
@@ -126,7 +126,7 @@ class TestIndexer:
     def __run_indexer(
         self,
         root: Path,
-        input_file_type: str,
+        input_type: str,
         index_method: str,
     ):
         command = [
@@ -232,7 +232,7 @@ class TestIndexer:
     def test_fixture(
         self,
         input_path: str,
-        input_file_type: str,
+        input_type: str,
         index_method: str,
         workflow_config: dict[str, dict[str, Any]],
         query_config: list[dict[str, str]],
@@ -248,7 +248,7 @@ class TestIndexer:
         dispose = asyncio.run(prepare_azurite_data(input_path, azure))
 
         print("running indexer")
-        self.__run_indexer(root, input_file_type, index_method)
+        self.__run_indexer(root, input_type, index_method)
         print("indexer complete")
 
         if dispose is not None:
@@ -144,7 +144,7 @@ def assert_cache_configs(actual: CacheConfig, expected: CacheConfig) -> None:
 
 def assert_input_configs(actual: InputConfig, expected: InputConfig) -> None:
     assert_storage_config(actual.storage, expected.storage)
-    assert actual.file_type == expected.file_type
+    assert actual.type == expected.type
     assert actual.encoding == expected.encoding
     assert actual.file_pattern == expected.file_pattern
     assert actual.text_column == expected.text_column
@@ -1,7 +1,7 @@
 # Copyright (c) 2024 Microsoft Corporation.
 # Licensed under the MIT License
 
-from graphrag_input import InputConfig, InputFileType, create_input_reader
+from graphrag_input import InputConfig, InputType, create_input_reader
 from graphrag_storage import StorageConfig, create_storage
 
 
@@ -10,7 +10,7 @@ async def test_csv_loader_one_file():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-csv",
         ),
-        file_type=InputFileType.Csv,
+        type=InputType.Csv,
         file_pattern=".*\\.csv$",
     )
     storage = create_storage(config.storage)
@@ -30,7 +30,7 @@ async def test_csv_loader_one_file_with_title():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-csv",
         ),
-        file_type=InputFileType.Csv,
+        type=InputType.Csv,
         title_column="title",
     )
     storage = create_storage(config.storage)
@@ -45,7 +45,7 @@ async def test_csv_loader_multiple_files():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/multiple-csvs",
         ),
-        file_type=InputFileType.Csv,
+        type=InputType.Csv,
     )
     storage = create_storage(config.storage)
     reader = create_input_reader(config, storage)
@@ -1,7 +1,7 @@
 # Copyright (c) 2024 Microsoft Corporation.
 # Licensed under the MIT License
 
-from graphrag_input import InputConfig, InputFileType, create_input_reader
+from graphrag_input import InputConfig, InputType, create_input_reader
 from graphrag_storage import StorageConfig, create_storage
 
 
@@ -10,7 +10,7 @@ async def test_json_loader_one_file_one_object():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-json-one-object",
         ),
-        file_type=InputFileType.Json,
+        type=InputType.Json,
         file_pattern=".*\\.json$",
     )
     storage = create_storage(config.storage)
@@ -29,7 +29,7 @@ async def test_json_loader_one_file_multiple_objects():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-json-multiple-objects",
         ),
-        file_type=InputFileType.Json,
+        type=InputType.Json,
     )
     storage = create_storage(config.storage)
     reader = create_input_reader(config, storage)
@@ -44,7 +44,7 @@ async def test_json_loader_one_file_with_title():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-json-one-object",
         ),
-        file_type=InputFileType.Json,
+        type=InputType.Json,
         title_column="title",
     )
     storage = create_storage(config.storage)
@@ -59,7 +59,7 @@ async def test_json_loader_multiple_files():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/multiple-jsons",
         ),
-        file_type=InputFileType.Json,
+        type=InputType.Json,
     )
     storage = create_storage(config.storage)
     reader = create_input_reader(config, storage)
@@ -1,7 +1,7 @@
 # Copyright (c) 2024 Microsoft Corporation.
 # Licensed under the MIT License
 
-from graphrag_input import InputConfig, InputFileType, create_input_reader
+from graphrag_input import InputConfig, InputType, create_input_reader
 from graphrag_storage import StorageConfig, create_storage
 
 
@@ -10,7 +10,7 @@ async def test_jsonl_loader_one_file_multiple_objects():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-jsonl",
         ),
-        file_type=InputFileType.JsonLines,
+        type=InputType.JsonLines,
         file_pattern=".*\\.jsonl$",
     )
     storage = create_storage(config.storage)
@@ -30,7 +30,7 @@ async def test_jsonl_loader_one_file_with_title():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-jsonl",
         ),
-        file_type=InputFileType.JsonLines,
+        type=InputType.JsonLines,
         title_column="title",
     )
     storage = create_storage(config.storage)
@@ -1,7 +1,7 @@
 # Copyright (c) 2024 Microsoft Corporation.
 # Licensed under the MIT License
 
-from graphrag_input import InputConfig, InputFileType, create_input_reader
+from graphrag_input import InputConfig, InputType, create_input_reader
 from graphrag_storage import StorageConfig, create_storage
 
 
@@ -10,7 +10,7 @@ async def test_txt_loader_one_file():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-txt",
         ),
-        file_type=InputFileType.Text,
+        type=InputType.Text,
         file_pattern=".*\\.txt$",
     )
     storage = create_storage(config.storage)
@@ -26,7 +26,7 @@ async def test_txt_loader_multiple_files():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/multiple-txts",
         ),
-        file_type=InputFileType.Text,
+        type=InputType.Text,
         file_pattern=".*\\.txt$",
     )
     storage = create_storage(config.storage)