Mirror of https://github.com/microsoft/graphrag.git (synced 2026-01-14 00:57:23 +08:00)
Align input config type name with other factory configs

parent 2f6d075b97
commit a671aa4fe4
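In short, the commit renames `InputConfig.file_type` to `InputConfig.type` and the `InputFileType` enum to `InputType`, matching the `type` field used by the other factory configs (e.g., `storage.type` and `chunking.type` in the settings template below). A minimal before/after sketch, adapted from the README change in this diff:

```python
from graphrag_input import InputConfig, InputType

# Old spelling (removed by this commit):
#   config = InputConfig(file_type=InputFileType.Csv)
# New spelling:
config = InputConfig(type=InputType.Csv)
```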
@@ -87,9 +87,9 @@ Our pipeline can ingest .csv, .txt, or .json data from an input location. See th
 - `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
 - `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
 - `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.
-- `file_type` **text|csv|json** - The type of input data to load. Default is `text`
+- `type` **text|csv|json** - The type of input data to load. Default is `text`
 - `encoding` **str** - The encoding of the input file. Default is `utf-8`
-- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$`, `.*\.txt$`, or `.*\.json$` depending on the specified `file_type`, but you can customize it if needed.
+- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$`, `.*\.txt$`, or `.*\.json$` depending on the specified `type`, but you can customize it if needed.
 - `text_column` **str** - (CSV/JSON only) The text column name. If unset we expect a column named `text`.
 - `title_column` **str** - (CSV/JSON only) The title column name, filename will be used if unset.
 - `metadata` **list[str]** - (CSV/JSON only) The additional document attributes fields to keep.
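For orientation, a minimal sketch mapping the options above onto the renamed config object, assuming the `InputConfig` fields shown later in this diff (all values are illustrative; the blob/cosmosdb options in the table are not repeated here):

```python
from graphrag_input import InputConfig, InputType
from graphrag_storage import StorageConfig

config = InputConfig(
    storage=StorageConfig(base_dir="./input"),  # illustrative path
    type=InputType.Csv,          # renamed from file_type in this commit
    encoding="utf-8",
    file_pattern=".*\\.csv$",
    text_column="content",
    title_column="title",
    metadata=["title"],
)
```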
@@ -116,7 +116,7 @@ settings.yaml
 
 ```yaml
 input:
-  file_type: text
+  type: text
   metadata: [title]
 
 chunks:
@@ -194,7 +194,7 @@ settings.yaml
 
 ```yaml
 input:
-  file_type: json
+  type: json
   title_column: headline
   text_column: content
 
@@ -13,12 +13,12 @@ This package provides input document loading utilities for GraphRAG, supporting
 
 Basic usage with the factory:
 ```python
-from graphrag_input import create_input_reader, InputConfig, InputFileType
+from graphrag_input import create_input_reader, InputConfig, InputType
 from graphrag_storage import StorageConfig, create_storage
 
 config = InputConfig(
     storage=StorageConfig(base_dir="./input"),
-    file_type=InputFileType.Csv,
+    type=InputType.Csv,
     text_column="content",
     title_column="title",
 )
@@ -5,15 +5,15 @@
 
 from graphrag_input.get_property import get_property
 from graphrag_input.input_config import InputConfig
-from graphrag_input.input_file_type import InputFileType
 from graphrag_input.input_reader import InputReader
 from graphrag_input.input_reader_factory import create_input_reader
+from graphrag_input.input_type import InputType
 from graphrag_input.text_document import TextDocument
 
 __all__ = [
     "InputConfig",
-    "InputFileType",
     "InputReader",
+    "InputType",
     "TextDocument",
     "create_input_reader",
     "get_property",
@@ -15,6 +15,12 @@ logger = logging.getLogger(__name__)
 class CSVFileReader(StructuredFileReader):
     """Reader implementation for csv files."""
 
+    def __init__(self, file_pattern: str | None = None, **kwargs):
+        super().__init__(
+            file_pattern=file_pattern if file_pattern is not None else ".*\\.csv$",
+            **kwargs,
+        )
+
     async def read_file(self, path: str) -> list[TextDocument]:
         """Read a csv file into a list of documents.
 
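The JSON, JSON Lines, and text readers below gain the same default-pattern `__init__`. As a sketch of the idiom only (this class is hypothetical, not part of the commit), a custom reader would supply its own default the same way:

```python
# Hypothetical reader, illustrative only: mirrors CSVFileReader above by
# passing a subclass-specific default file pattern to the base class.
class MarkdownFileReader(StructuredFileReader):
    """Reader implementation for markdown files (illustrative)."""

    def __init__(self, file_pattern: str | None = None, **kwargs):
        super().__init__(
            file_pattern=file_pattern if file_pattern is not None else ".*\\.md$",
            **kwargs,
        )

    # read_file(path) would still be implemented, as in the readers shown here.
```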
@@ -6,7 +6,7 @@
 from graphrag_storage import StorageConfig
 from pydantic import BaseModel, ConfigDict, Field
 
-from graphrag_input.input_file_type import InputFileType
+from graphrag_input.input_type import InputType
 
 
 class InputConfig(BaseModel):
@@ -19,9 +19,9 @@ class InputConfig(BaseModel):
         description="The storage configuration to use for reading input documents.",
         default_factory=lambda: StorageConfig(base_dir="input"),
     )
-    file_type: str = Field(
+    type: str = Field(
         description="The input file type to use.",
-        default=InputFileType.Text,
+        default=InputType.Text,
     )
     encoding: str | None = Field(
         description="The input file encoding to use.",
@@ -24,30 +24,19 @@ class InputReader(metaclass=ABCMeta):
     def __init__(
         self,
         storage: Storage,
-        file_type: str,
         encoding: str = "utf-8",
         file_pattern: str | None = None,
         **kwargs,
     ):
         self._storage = storage
-        self._file_type = file_type
         self._encoding = encoding
-
-        # built-in readers set a default pattern if none is provided
-        # this is usually just the file type itself, e.g., the file extension
-        pattern = (
-            file_pattern if file_pattern is not None else f".*\\.{self._file_type}$"
-        )
-        if file_pattern is None and self._file_type == "text":
-            pattern = ".*\\.txt$"
-
-        self._file_pattern = pattern
+        self._file_pattern = file_pattern
 
     async def read_files(self) -> list[TextDocument]:
         """Load files from storage and apply a loader function based on file type. Process metadata on the results if needed."""
         files = list(self._storage.find(re.compile(self._file_pattern)))
         if len(files) == 0:
-            msg = f"No {self._file_type} files found in storage"  # TODO: use a storage __str__ to define it per impl
+            msg = f"No {self._file_pattern} matches found in storage"
             logger.warning(msg)
             files = []
@@ -63,11 +52,11 @@ class InputReader(metaclass=ABCMeta):
         logger.info(
             "Found %d %s files, loading %d",
             len(files),
-            self._file_type,
+            self._file_pattern,
             len(documents),
         )
         total_files_log = (
-            f"Total number of unfiltered {self._file_type} rows: {len(documents)}"
+            f"Total number of unfiltered {self._file_pattern} rows: {len(documents)}"
         )
         logger.info(total_files_log)
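With the derived-pattern fallback removed from the base class, `re.compile(self._file_pattern)` now relies on each reader's `__init__` supplying a non-`None` default (as `CSVFileReader` does above and the JSON/JSONL/text readers do below). A quick standalone sanity check of one such default:

```python
import re

# Default pattern supplied by TextFileReader when none is given.
pattern = re.compile(".*\\.txt$")

assert pattern.match("input/book.txt")
assert not pattern.match("input/book.csv")
```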
@@ -11,8 +11,8 @@ from graphrag_common.factory.factory import ServiceScope
 from graphrag_storage.storage import Storage
 
 from graphrag_input.input_config import InputConfig
-from graphrag_input.input_file_type import InputFileType
 from graphrag_input.input_reader import InputReader
+from graphrag_input.input_type import InputType
 
 logger = logging.getLogger(__name__)
 
@@ -57,28 +57,28 @@ def create_input_reader(config: InputConfig, storage: Storage) -> InputReader:
         The created input reader implementation.
     """
     config_model = config.model_dump()
-    input_strategy = config.file_type
+    input_strategy = config.type
 
     if input_strategy not in input_reader_factory:
         match input_strategy:
-            case InputFileType.Csv:
+            case InputType.Csv:
                 from graphrag_input.csv import CSVFileReader
 
-                register_input_reader(InputFileType.Csv, CSVFileReader)
-            case InputFileType.Text:
+                register_input_reader(InputType.Csv, CSVFileReader)
+            case InputType.Text:
                 from graphrag_input.text import TextFileReader
 
-                register_input_reader(InputFileType.Text, TextFileReader)
-            case InputFileType.Json:
+                register_input_reader(InputType.Text, TextFileReader)
+            case InputType.Json:
                 from graphrag_input.json import JSONFileReader
 
-                register_input_reader(InputFileType.Json, JSONFileReader)
-            case InputFileType.JsonLines:
+                register_input_reader(InputType.Json, JSONFileReader)
+            case InputType.JsonLines:
                 from graphrag_input.jsonl import JSONLinesFileReader
 
-                register_input_reader(InputFileType.JsonLines, JSONLinesFileReader)
+                register_input_reader(InputType.JsonLines, JSONLinesFileReader)
             case _:
-                msg = f"InputConfig.file_type '{input_strategy}' is not registered in the InputReaderFactory. Registered types: {', '.join(input_reader_factory.keys())}."
+                msg = f"InputConfig.type '{input_strategy}' is not registered in the InputReaderFactory. Registered types: {', '.join(input_reader_factory.keys())}."
                 raise ValueError(msg)
 
     config_model["storage"] = storage
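Because `InputConfig.type` is typed as a plain `str` (see the `InputConfig` change above), the registry is not limited to the built-in `InputType` values. A hedged sketch of registering a custom reader, assuming `register_input_reader` is importable from `graphrag_input.input_reader_factory` (the module that calls it above) and reusing the hypothetical `MarkdownFileReader` sketched earlier:

```python
from graphrag_input import InputConfig, create_input_reader
# Assumption: register_input_reader is defined in this module, as used above.
from graphrag_input.input_reader_factory import register_input_reader
from graphrag_storage import StorageConfig, create_storage

# Register the hypothetical reader under a custom key.
register_input_reader("markdown", MarkdownFileReader)

config = InputConfig(
    storage=StorageConfig(base_dir="./input"),  # illustrative path
    type="markdown",
    file_pattern=".*\\.md$",
)
storage = create_storage(config.storage)
reader = create_input_reader(config, storage)  # resolves via the registry
```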
@@ -6,7 +6,7 @@
 from enum import StrEnum
 
 
-class InputFileType(StrEnum):
+class InputType(StrEnum):
     """The input file type for the pipeline."""
 
     Csv = "csv"
@@ -15,6 +15,12 @@ logger = logging.getLogger(__name__)
 class JSONFileReader(StructuredFileReader):
     """Reader implementation for json files."""
 
+    def __init__(self, file_pattern: str | None = None, **kwargs):
+        super().__init__(
+            file_pattern=file_pattern if file_pattern is not None else ".*\\.json$",
+            **kwargs,
+        )
+
     async def read_file(self, path: str) -> list[TextDocument]:
         """Read a JSON file into a list of documents.
 
@@ -15,6 +15,12 @@ logger = logging.getLogger(__name__)
 class JSONLinesFileReader(StructuredFileReader):
     """Reader implementation for json lines files."""
 
+    def __init__(self, file_pattern: str | None = None, **kwargs):
+        super().__init__(
+            file_pattern=file_pattern if file_pattern is not None else ".*\\.jsonl$",
+            **kwargs,
+        )
+
     async def read_file(self, path: str) -> list[TextDocument]:
         """Read a JSON lines file into a list of documents.
 
@@ -16,6 +16,12 @@ logger = logging.getLogger(__name__)
 class TextFileReader(InputReader):
     """Reader implementation for text files."""
 
+    def __init__(self, file_pattern: str | None = None, **kwargs):
+        super().__init__(
+            file_pattern=file_pattern if file_pattern is not None else ".*\\.txt$",
+            **kwargs,
+        )
+
     async def read_file(self, path: str) -> list[TextDocument]:
         """Read a text file into a DataFrame of documents.
 
@@ -9,7 +9,7 @@ from typing import ClassVar
 
 from graphrag_cache import CacheType
 from graphrag_chunking.chunk_strategy_type import ChunkerType
-from graphrag_input import InputFileType
+from graphrag_input import InputType
 from graphrag_storage import StorageType
 
 from graphrag.config.embeddings import default_embeddings
@@ -237,7 +237,7 @@ class InputDefaults:
     """Default values for input."""
 
     storage: InputStorageDefaults = field(default_factory=InputStorageDefaults)
-    file_type: ClassVar[InputFileType] = InputFileType.Text
+    type: ClassVar[InputType] = InputType.Text
     encoding: str | None = None
     file_pattern: None = None
     id_column: None = None
@@ -52,7 +52,7 @@ input:
   storage:
     type: {graphrag_config_defaults.input.storage.type} # [file, blob, cosmosdb]
     base_dir: "{graphrag_config_defaults.input.storage.base_dir}"
-  file_type: {graphrag_config_defaults.input.file_type.value} # [csv, text, json]
+  type: {graphrag_config_defaults.input.type.value} # [csv, text, json]
 
 chunking:
   type: {graphrag_config_defaults.chunking.type}
tests/fixtures/azure/config.json (vendored, 2 changes)
@@ -1,6 +1,6 @@
 {
     "input_path": "./tests/fixtures/azure",
-    "input_file_type": "text",
+    "input_type": "text",
     "index_method": "standard",
     "workflow_config": {
         "skip_assert": true,
tests/fixtures/azure/settings.yml (vendored, 2 changes)
@@ -14,7 +14,7 @@ input:
     connection_string: ${LOCAL_BLOB_STORAGE_CONNECTION_STRING}
     container_name: azurefixture
     base_dir: input
-  file_type: text
+  type: text
 
 
 cache:
tests/fixtures/min-csv/config.json (vendored, 2 changes)
@@ -1,6 +1,6 @@
 {
     "input_path": "./tests/fixtures/min-csv",
-    "input_file_type": "text",
+    "input_type": "text",
     "index_method": "standard",
     "workflow_config": {
         "load_input_documents": {
tests/fixtures/min-csv/settings.yml (vendored, 2 changes)
@@ -35,7 +35,7 @@ vector_store:
   overwrite: True
 
 input:
-  file_type: csv
+  type: csv
 
 snapshots:
   embeddings: true
tests/fixtures/text/config.json (vendored, 2 changes)
@@ -1,6 +1,6 @@
 {
     "input_path": "./tests/fixtures/text",
-    "input_file_type": "text",
+    "input_type": "text",
     "index_method": "fast",
     "workflow_config": {
         "load_input_documents": {
@@ -126,7 +126,7 @@ class TestIndexer:
     def __run_indexer(
         self,
         root: Path,
-        input_file_type: str,
+        input_type: str,
         index_method: str,
     ):
         command = [
@@ -232,7 +232,7 @@ class TestIndexer:
     def test_fixture(
         self,
         input_path: str,
-        input_file_type: str,
+        input_type: str,
         index_method: str,
         workflow_config: dict[str, dict[str, Any]],
         query_config: list[dict[str, str]],
@@ -248,7 +248,7 @@ class TestIndexer:
         dispose = asyncio.run(prepare_azurite_data(input_path, azure))
 
         print("running indexer")
-        self.__run_indexer(root, input_file_type, index_method)
+        self.__run_indexer(root, input_type, index_method)
         print("indexer complete")
 
         if dispose is not None:
@@ -144,7 +144,7 @@ def assert_cache_configs(actual: CacheConfig, expected: CacheConfig) -> None:
 
 def assert_input_configs(actual: InputConfig, expected: InputConfig) -> None:
     assert_storage_config(actual.storage, expected.storage)
-    assert actual.file_type == expected.file_type
+    assert actual.type == expected.type
     assert actual.encoding == expected.encoding
     assert actual.file_pattern == expected.file_pattern
     assert actual.text_column == expected.text_column
@@ -1,7 +1,7 @@
 # Copyright (c) 2024 Microsoft Corporation.
 # Licensed under the MIT License
 
-from graphrag_input import InputConfig, InputFileType, create_input_reader
+from graphrag_input import InputConfig, InputType, create_input_reader
 from graphrag_storage import StorageConfig, create_storage
 
 
@@ -10,7 +10,7 @@ async def test_csv_loader_one_file():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-csv",
         ),
-        file_type=InputFileType.Csv,
+        type=InputType.Csv,
         file_pattern=".*\\.csv$",
     )
     storage = create_storage(config.storage)
@@ -30,7 +30,7 @@ async def test_csv_loader_one_file_with_title():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-csv",
         ),
-        file_type=InputFileType.Csv,
+        type=InputType.Csv,
         title_column="title",
     )
     storage = create_storage(config.storage)
@@ -45,7 +45,7 @@ async def test_csv_loader_multiple_files():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/multiple-csvs",
         ),
-        file_type=InputFileType.Csv,
+        type=InputType.Csv,
     )
     storage = create_storage(config.storage)
     reader = create_input_reader(config, storage)
@@ -1,7 +1,7 @@
 # Copyright (c) 2024 Microsoft Corporation.
 # Licensed under the MIT License
 
-from graphrag_input import InputConfig, InputFileType, create_input_reader
+from graphrag_input import InputConfig, InputType, create_input_reader
 from graphrag_storage import StorageConfig, create_storage
 
 
@@ -10,7 +10,7 @@ async def test_json_loader_one_file_one_object():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-json-one-object",
         ),
-        file_type=InputFileType.Json,
+        type=InputType.Json,
         file_pattern=".*\\.json$",
     )
     storage = create_storage(config.storage)
@@ -29,7 +29,7 @@ async def test_json_loader_one_file_multiple_objects():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-json-multiple-objects",
         ),
-        file_type=InputFileType.Json,
+        type=InputType.Json,
     )
     storage = create_storage(config.storage)
     reader = create_input_reader(config, storage)
@@ -44,7 +44,7 @@ async def test_json_loader_one_file_with_title():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-json-one-object",
         ),
-        file_type=InputFileType.Json,
+        type=InputType.Json,
         title_column="title",
     )
     storage = create_storage(config.storage)
@@ -59,7 +59,7 @@ async def test_json_loader_multiple_files():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/multiple-jsons",
         ),
-        file_type=InputFileType.Json,
+        type=InputType.Json,
     )
     storage = create_storage(config.storage)
     reader = create_input_reader(config, storage)
@@ -1,7 +1,7 @@
 # Copyright (c) 2024 Microsoft Corporation.
 # Licensed under the MIT License
 
-from graphrag_input import InputConfig, InputFileType, create_input_reader
+from graphrag_input import InputConfig, InputType, create_input_reader
 from graphrag_storage import StorageConfig, create_storage
 
 
@@ -10,7 +10,7 @@ async def test_jsonl_loader_one_file_multiple_objects():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-jsonl",
         ),
-        file_type=InputFileType.JsonLines,
+        type=InputType.JsonLines,
         file_pattern=".*\\.jsonl$",
     )
     storage = create_storage(config.storage)
@@ -30,7 +30,7 @@ async def test_jsonl_loader_one_file_with_title():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-jsonl",
         ),
-        file_type=InputFileType.JsonLines,
+        type=InputType.JsonLines,
         title_column="title",
     )
     storage = create_storage(config.storage)
@@ -1,7 +1,7 @@
 # Copyright (c) 2024 Microsoft Corporation.
 # Licensed under the MIT License
 
-from graphrag_input import InputConfig, InputFileType, create_input_reader
+from graphrag_input import InputConfig, InputType, create_input_reader
 from graphrag_storage import StorageConfig, create_storage
 
 
@@ -10,7 +10,7 @@ async def test_txt_loader_one_file():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/one-txt",
         ),
-        file_type=InputFileType.Text,
+        type=InputType.Text,
         file_pattern=".*\\.txt$",
     )
     storage = create_storage(config.storage)
@@ -26,7 +26,7 @@ async def test_txt_loader_multiple_files():
         storage=StorageConfig(
             base_dir="tests/unit/indexing/input/data/multiple-txts",
         ),
-        file_type=InputFileType.Text,
+        type=InputType.Text,
         file_pattern=".*\\.txt$",
     )
     storage = create_storage(config.storage)