Add json lines (jsonl) input support

This commit is contained in:
Nathan Evans 2026-01-06 14:59:49 -08:00
parent a03df1b350
commit 8b45208ba9
10 changed files with 106 additions and 16 deletions

View File

@ -16,14 +16,14 @@ class CSVFileReader(StructuredFileReader):
"""Reader implementation for csv files."""
async def read_file(self, path: str) -> list[TextDocument]:
"""Read a csv file into a DataFrame of documents.
"""Read a csv file into a list of documents.
Args:
- path - The path to read the file from.
Returns
-------
- output - DataFrame with a row for each document in the file.
- output - list with a TextDocument for each row in the file.
"""
file = await self._storage.get(path)

View File

@ -15,6 +15,8 @@ class InputFileType(StrEnum):
"""The text input type."""
Json = "json"
"""The JSON input type."""
JsonLines = "jsonl"
"""The JSON Lines input type."""
def __repr__(self):
"""Get a string representation."""

View File

@ -73,6 +73,10 @@ def create_input_reader(config: InputConfig, storage: Storage) -> InputReader:
from graphrag.index.input.json import JSONFileReader
register_input_reader(InputFileType.Json, JSONFileReader)
case InputFileType.JsonLines:
from graphrag.index.input.jsonl import JSONLinesFileReader
register_input_reader(InputFileType.JsonLines, JSONLinesFileReader)
case _:
msg = f"InputConfig.file_type '{input_strategy}' is not registered in the InputReaderFactory. Registered types: {', '.join(input_reader_factory.keys())}."
raise ValueError(msg)

View File

@ -16,14 +16,14 @@ class JSONFileReader(StructuredFileReader):
"""Reader implementation for json files."""
async def read_file(self, path: str) -> list[TextDocument]:
"""Read a JSON file into a DataFrame of documents.
"""Read a JSON file into a list of documents.
Args:
- path - The path to read the file from.
Returns
-------
- output - DataFrame with a row for each document in the file.
- output - list with a TextDocument for each row in the file.
"""
text = await self._storage.get(path, encoding=self._encoding)
as_json = json.loads(text)

View File

@ -0,0 +1,32 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""A module containing 'JSONLinesFileReader' model."""
import json
import logging
from graphrag.index.input.structured_file_reader import StructuredFileReader
from graphrag.index.input.text_document import TextDocument
logger = logging.getLogger(__name__)
class JSONLinesFileReader(StructuredFileReader):
    """Reader implementation for JSON Lines (jsonl) files."""

    async def read_file(self, path: str) -> list[TextDocument]:
        """Read a JSON lines file into a list of documents.

        This differs from standard JSON files in that each line is a separate JSON object.
        Blank or whitespace-only lines are skipped.

        Args:
            - path - The path to read the file from.

        Returns
        -------
            - output - list with a TextDocument for each row in the file.

        Raises
        ------
            - json.JSONDecodeError - if a non-blank line is not valid JSON.
        """
        text = await self._storage.get(path, encoding=self._encoding)
        # Split only on CR / LF. str.splitlines() also breaks on characters such
        # as U+2028, U+2029 and U+0085, which may appear unescaped inside a JSON
        # string and would cut a valid record in half. Raw CR / LF can never
        # occur inside a JSON string, so they are safe record separators.
        # Mapping "\r" to "\n" turns "\r\n" into an empty line, which is dropped
        # by the blank-line filter below.
        lines = text.replace("\r", "\n").split("\n")
        # Blank lines (trailing newline, spacing between records) hold no
        # record; skipping them avoids json.loads("") raising JSONDecodeError.
        rows = [json.loads(line) for line in lines if line.strip()]
        return await self.process_data_columns(rows, path)

View File

@ -35,7 +35,7 @@ class StructuredFileReader(InputReader):
) -> list[TextDocument]:
"""Process configured data columns from a list of loaded dicts."""
documents = []
for row in rows:
for index, row in enumerate(rows):
# text column is required - harvest from dict
text = row[self._text_column]
# id is optional - harvest from dict or generate a hash from the text
@ -45,7 +45,9 @@ class StructuredFileReader(InputReader):
else gen_sha512_hash({"text": text}, ["text"])
)
# title is optional - harvest from dict or use filename
title = row[self._title_column] if self._title_column else str(path)
title = (
row[self._title_column] if self._title_column else f"{path} ({index})"
)
creation_date = await self._storage.get_creation_date(path)
documents.append(
TextDocument(

View File

@ -0,0 +1,3 @@
{ "title": "Hello", "text": "Hi how are you today?"}
{ "title": "Goodbye", "text": "I'm outta here"}
{ "title": "Adios", "text": "See you later"}

View File

@ -21,7 +21,7 @@ async def test_csv_loader_one_file():
reader = create_input_reader(config, storage)
documents = await reader.read_files()
assert len(documents) == 2
assert documents[0].title == "input.csv"
assert documents[0].title == "input.csv (0)"
assert documents[0].metadata is None
@ -31,7 +31,6 @@ async def test_csv_loader_one_file_with_title():
base_dir="tests/unit/indexing/input/data/one-csv",
),
file_type=InputFileType.Csv,
file_pattern=".*\\.csv$",
title_column="title",
)
storage = create_storage(config.storage)
@ -47,7 +46,6 @@ async def test_csv_loader_one_file_with_metadata():
base_dir="tests/unit/indexing/input/data/one-csv",
),
file_type=InputFileType.Csv,
file_pattern=".*\\.csv$",
title_column="title",
metadata=["title"],
)
@ -64,7 +62,6 @@ async def test_csv_loader_multiple_files():
base_dir="tests/unit/indexing/input/data/multiple-csvs",
),
file_type=InputFileType.Csv,
file_pattern=".*\\.csv$",
)
storage = create_storage(config.storage)
reader = create_input_reader(config, storage)

View File

@ -19,7 +19,7 @@ async def test_json_loader_one_file_one_object():
reader = create_input_reader(config, storage)
documents = await reader.read_files()
assert len(documents) == 1
assert documents[0].title == "input.json"
assert documents[0].title == "input.json (0)"
assert documents[0].metadata is None
@ -29,13 +29,13 @@ async def test_json_loader_one_file_multiple_objects():
base_dir="tests/unit/indexing/input/data/one-json-multiple-objects",
),
file_type=InputFileType.Json,
file_pattern=".*\\.json$",
)
storage = create_storage(config.storage)
reader = create_input_reader(config, storage)
documents = await reader.read_files()
assert len(documents) == 3
assert documents[0].title == "input.json"
assert documents[0].title == "input.json (0)"
assert documents[1].title == "input.json (1)"
async def test_json_loader_one_file_with_title():
@ -44,7 +44,6 @@ async def test_json_loader_one_file_with_title():
base_dir="tests/unit/indexing/input/data/one-json-one-object",
),
file_type=InputFileType.Json,
file_pattern=".*\\.json$",
title_column="title",
)
storage = create_storage(config.storage)
@ -60,7 +59,6 @@ async def test_json_loader_one_file_with_metadata():
base_dir="tests/unit/indexing/input/data/one-json-one-object",
),
file_type=InputFileType.Json,
file_pattern=".*\\.json$",
title_column="title",
metadata=["title"],
)
@ -77,7 +75,6 @@ async def test_json_loader_multiple_files():
base_dir="tests/unit/indexing/input/data/multiple-jsons",
),
file_type=InputFileType.Json,
file_pattern=".*\\.json$",
)
storage = create_storage(config.storage)
reader = create_input_reader(config, storage)

View File

@ -0,0 +1,53 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
from graphrag.index.input.input_config import InputConfig
from graphrag.index.input.input_file_type import InputFileType
from graphrag.index.input.input_reader_factory import create_input_reader
from graphrag_storage import StorageConfig, create_storage
async def test_jsonl_loader_one_file_multiple_objects():
    """Read the one-jsonl fixture without a title column and check count and default title."""
    storage_config = StorageConfig(
        base_dir="tests/unit/indexing/input/data/one-jsonl",
    )
    input_config = InputConfig(
        storage=storage_config,
        file_type=InputFileType.JsonLines,
        file_pattern=".*\\.jsonl$",
    )
    loader = create_input_reader(input_config, create_storage(input_config.storage))
    docs = await loader.read_files()
    # The fixture holds three records, one per line.
    assert len(docs) == 3
    # With no title column configured, the title is the file name plus the row index.
    first_doc = docs[0]
    assert first_doc.title == "input.jsonl (0)"
async def test_jsonl_loader_one_file_with_title():
    """Read the one-jsonl fixture with title_column set and check the title comes from the record."""
    storage_config = StorageConfig(
        base_dir="tests/unit/indexing/input/data/one-jsonl",
    )
    input_config = InputConfig(
        storage=storage_config,
        file_type=InputFileType.JsonLines,
        title_column="title",
    )
    loader = create_input_reader(input_config, create_storage(input_config.storage))
    docs = await loader.read_files()
    assert len(docs) == 3
    # The first record's "title" field is used as the document title.
    first_doc = docs[0]
    assert first_doc.title == "Hello"
async def test_jsonl_loader_one_file_with_metadata():
    """Read the one-jsonl fixture with metadata=["title"] and check the metadata dict of the first document."""
    storage_config = StorageConfig(
        base_dir="tests/unit/indexing/input/data/one-jsonl",
    )
    input_config = InputConfig(
        storage=storage_config,
        file_type=InputFileType.JsonLines,
        title_column="title",
        metadata=["title"],
    )
    loader = create_input_reader(input_config, create_storage(input_config.storage))
    docs = await loader.read_files()
    assert len(docs) == 3
    # Only the configured metadata column is expected in the metadata dict.
    first_doc = docs[0]
    assert first_doc.metadata == {"title": "Hello"}