mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 00:57:23 +08:00
Add json lines (jsonl) input support
This commit is contained in:
parent
a03df1b350
commit
8b45208ba9
@ -16,14 +16,14 @@ class CSVFileReader(StructuredFileReader):
|
||||
"""Reader implementation for csv files."""
|
||||
|
||||
async def read_file(self, path: str) -> list[TextDocument]:
|
||||
"""Read a csv file into a DataFrame of documents.
|
||||
"""Read a csv file into a list of documents.
|
||||
|
||||
Args:
|
||||
- path - The path to read the file from.
|
||||
|
||||
Returns
|
||||
-------
|
||||
- output - DataFrame with a row for each document in the file.
|
||||
- output - list with a TextDocument for each row in the file.
|
||||
"""
|
||||
file = await self._storage.get(path)
|
||||
|
||||
|
||||
@ -15,6 +15,8 @@ class InputFileType(StrEnum):
|
||||
"""The text input type."""
|
||||
Json = "json"
|
||||
"""The JSON input type."""
|
||||
JsonLines = "jsonl"
|
||||
"""The JSON Lines input type."""
|
||||
|
||||
def __repr__(self):
|
||||
"""Get a string representation."""
|
||||
|
||||
@ -73,6 +73,10 @@ def create_input_reader(config: InputConfig, storage: Storage) -> InputReader:
|
||||
from graphrag.index.input.json import JSONFileReader
|
||||
|
||||
register_input_reader(InputFileType.Json, JSONFileReader)
|
||||
case InputFileType.JsonLines:
|
||||
from graphrag.index.input.jsonl import JSONLinesFileReader
|
||||
|
||||
register_input_reader(InputFileType.JsonLines, JSONLinesFileReader)
|
||||
case _:
|
||||
msg = f"InputConfig.file_type '{input_strategy}' is not registered in the InputReaderFactory. Registered types: {', '.join(input_reader_factory.keys())}."
|
||||
raise ValueError(msg)
|
||||
|
||||
@ -16,14 +16,14 @@ class JSONFileReader(StructuredFileReader):
|
||||
"""Reader implementation for json files."""
|
||||
|
||||
async def read_file(self, path: str) -> list[TextDocument]:
|
||||
"""Read a JSON file into a DataFrame of documents.
|
||||
"""Read a JSON file into a list of documents.
|
||||
|
||||
Args:
|
||||
- path - The path to read the file from.
|
||||
|
||||
Returns
|
||||
-------
|
||||
- output - DataFrame with a row for each document in the file.
|
||||
- output - list with a TextDocument for each row in the file.
|
||||
"""
|
||||
text = await self._storage.get(path, encoding=self._encoding)
|
||||
as_json = json.loads(text)
|
||||
|
||||
32
packages/graphrag/graphrag/index/input/jsonl.py
Normal file
32
packages/graphrag/graphrag/index/input/jsonl.py
Normal file
@ -0,0 +1,32 @@
|
||||
# Copyright (c) 2024 Microsoft Corporation.
|
||||
# Licensed under the MIT License
|
||||
|
||||
"""A module containing 'JSONLinesFileReader' model."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
|
||||
from graphrag.index.input.structured_file_reader import StructuredFileReader
|
||||
from graphrag.index.input.text_document import TextDocument
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class JSONLinesFileReader(StructuredFileReader):
    """Reader implementation for JSON Lines (jsonl) files."""

    async def read_file(self, path: str) -> list[TextDocument]:
        """Read a JSON lines file into a list of documents.

        This differs from standard JSON files in that each line is a separate JSON object.
        Blank (whitespace-only) lines are skipped instead of being parsed.

        Args:
            - path - The path to read the file from.

        Returns
        -------
            - output - list with a TextDocument for each row in the file.

        Raises
        ------
            - json.JSONDecodeError - if a non-blank line is not valid JSON.
        """
        text = await self._storage.get(path, encoding=self._encoding)
        # Split on "\n" only: str.splitlines() also breaks on characters such as
        # U+2028/U+2029/U+0085, which may legally appear unescaped inside a JSON
        # string and would cut a valid record in two. A trailing "\r" from CRLF
        # files is plain JSON whitespace, so json.loads tolerates it.
        rows = [
            json.loads(line)
            for line in text.split("\n")
            # json.loads("") raises, so drop empty/whitespace-only lines
            # (e.g. the empty tail after a final newline).
            if line.strip()
        ]
        return await self.process_data_columns(rows, path)
|
||||
@ -35,7 +35,7 @@ class StructuredFileReader(InputReader):
|
||||
) -> list[TextDocument]:
|
||||
"""Process configured data columns from a list of loaded dicts."""
|
||||
documents = []
|
||||
for row in rows:
|
||||
for index, row in enumerate(rows):
|
||||
# text column is required - harvest from dict
|
||||
text = row[self._text_column]
|
||||
# id is optional - harvest from dict or generate a hash from text
|
||||
@ -45,7 +45,9 @@ class StructuredFileReader(InputReader):
|
||||
else gen_sha512_hash({"text": text}, ["text"])
|
||||
)
|
||||
# title is optional - harvest from dict or use filename
|
||||
title = row[self._title_column] if self._title_column else str(path)
|
||||
title = (
|
||||
row[self._title_column] if self._title_column else f"{path} ({index})"
|
||||
)
|
||||
creation_date = await self._storage.get_creation_date(path)
|
||||
documents.append(
|
||||
TextDocument(
|
||||
|
||||
3
tests/unit/indexing/input/data/one-jsonl/input.jsonl
Normal file
3
tests/unit/indexing/input/data/one-jsonl/input.jsonl
Normal file
@ -0,0 +1,3 @@
|
||||
{ "title": "Hello", "text": "Hi how are you today?"}
|
||||
{ "title": "Goodbye", "text": "I'm outta here"}
|
||||
{ "title": "Adios", "text": "See you later"}
|
||||
@ -21,7 +21,7 @@ async def test_csv_loader_one_file():
|
||||
reader = create_input_reader(config, storage)
|
||||
documents = await reader.read_files()
|
||||
assert len(documents) == 2
|
||||
assert documents[0].title == "input.csv"
|
||||
assert documents[0].title == "input.csv (0)"
|
||||
assert documents[0].metadata is None
|
||||
|
||||
|
||||
@ -31,7 +31,6 @@ async def test_csv_loader_one_file_with_title():
|
||||
base_dir="tests/unit/indexing/input/data/one-csv",
|
||||
),
|
||||
file_type=InputFileType.Csv,
|
||||
file_pattern=".*\\.csv$",
|
||||
title_column="title",
|
||||
)
|
||||
storage = create_storage(config.storage)
|
||||
@ -47,7 +46,6 @@ async def test_csv_loader_one_file_with_metadata():
|
||||
base_dir="tests/unit/indexing/input/data/one-csv",
|
||||
),
|
||||
file_type=InputFileType.Csv,
|
||||
file_pattern=".*\\.csv$",
|
||||
title_column="title",
|
||||
metadata=["title"],
|
||||
)
|
||||
@ -64,7 +62,6 @@ async def test_csv_loader_multiple_files():
|
||||
base_dir="tests/unit/indexing/input/data/multiple-csvs",
|
||||
),
|
||||
file_type=InputFileType.Csv,
|
||||
file_pattern=".*\\.csv$",
|
||||
)
|
||||
storage = create_storage(config.storage)
|
||||
reader = create_input_reader(config, storage)
|
||||
|
||||
@ -19,7 +19,7 @@ async def test_json_loader_one_file_one_object():
|
||||
reader = create_input_reader(config, storage)
|
||||
documents = await reader.read_files()
|
||||
assert len(documents) == 1
|
||||
assert documents[0].title == "input.json"
|
||||
assert documents[0].title == "input.json (0)"
|
||||
assert documents[0].metadata is None
|
||||
|
||||
|
||||
@ -29,13 +29,13 @@ async def test_json_loader_one_file_multiple_objects():
|
||||
base_dir="tests/unit/indexing/input/data/one-json-multiple-objects",
|
||||
),
|
||||
file_type=InputFileType.Json,
|
||||
file_pattern=".*\\.json$",
|
||||
)
|
||||
storage = create_storage(config.storage)
|
||||
reader = create_input_reader(config, storage)
|
||||
documents = await reader.read_files()
|
||||
assert len(documents) == 3
|
||||
assert documents[0].title == "input.json"
|
||||
assert documents[0].title == "input.json (0)"
|
||||
assert documents[1].title == "input.json (1)"
|
||||
|
||||
|
||||
async def test_json_loader_one_file_with_title():
|
||||
@ -44,7 +44,6 @@ async def test_json_loader_one_file_with_title():
|
||||
base_dir="tests/unit/indexing/input/data/one-json-one-object",
|
||||
),
|
||||
file_type=InputFileType.Json,
|
||||
file_pattern=".*\\.json$",
|
||||
title_column="title",
|
||||
)
|
||||
storage = create_storage(config.storage)
|
||||
@ -60,7 +59,6 @@ async def test_json_loader_one_file_with_metadata():
|
||||
base_dir="tests/unit/indexing/input/data/one-json-one-object",
|
||||
),
|
||||
file_type=InputFileType.Json,
|
||||
file_pattern=".*\\.json$",
|
||||
title_column="title",
|
||||
metadata=["title"],
|
||||
)
|
||||
@ -77,7 +75,6 @@ async def test_json_loader_multiple_files():
|
||||
base_dir="tests/unit/indexing/input/data/multiple-jsons",
|
||||
),
|
||||
file_type=InputFileType.Json,
|
||||
file_pattern=".*\\.json$",
|
||||
)
|
||||
storage = create_storage(config.storage)
|
||||
reader = create_input_reader(config, storage)
|
||||
|
||||
53
tests/unit/indexing/input/test_jsonl_loader.py
Normal file
53
tests/unit/indexing/input/test_jsonl_loader.py
Normal file
@ -0,0 +1,53 @@
|
||||
# Copyright (c) 2024 Microsoft Corporation.
|
||||
# Licensed under the MIT License
|
||||
|
||||
from graphrag.index.input.input_config import InputConfig
|
||||
from graphrag.index.input.input_file_type import InputFileType
|
||||
from graphrag.index.input.input_reader_factory import create_input_reader
|
||||
from graphrag_storage import StorageConfig, create_storage
|
||||
|
||||
|
||||
async def test_jsonl_loader_one_file_multiple_objects():
    """Load the one-jsonl fixture without a title column.

    Expects three documents, the first titled "input.jsonl (0)".
    """
    input_config = InputConfig(
        storage=StorageConfig(
            base_dir="tests/unit/indexing/input/data/one-jsonl",
        ),
        file_type=InputFileType.JsonLines,
        file_pattern=".*\\.jsonl$",
    )
    input_reader = create_input_reader(
        input_config, create_storage(input_config.storage)
    )
    loaded = await input_reader.read_files()
    assert len(loaded) == 3
    assert loaded[0].title == "input.jsonl (0)"
|
||||
|
||||
|
||||
async def test_jsonl_loader_one_file_with_title():
    """Load the one-jsonl fixture with title_column="title".

    Expects three documents, the first titled "Hello".
    """
    input_config = InputConfig(
        storage=StorageConfig(
            base_dir="tests/unit/indexing/input/data/one-jsonl",
        ),
        file_type=InputFileType.JsonLines,
        title_column="title",
    )
    input_reader = create_input_reader(
        input_config, create_storage(input_config.storage)
    )
    loaded = await input_reader.read_files()
    assert len(loaded) == 3
    assert loaded[0].title == "Hello"
|
||||
|
||||
|
||||
async def test_jsonl_loader_one_file_with_metadata():
    """Load the one-jsonl fixture with the title column also requested as metadata.

    Expects three documents, the first carrying metadata {"title": "Hello"}.
    """
    input_config = InputConfig(
        storage=StorageConfig(
            base_dir="tests/unit/indexing/input/data/one-jsonl",
        ),
        file_type=InputFileType.JsonLines,
        title_column="title",
        metadata=["title"],
    )
    input_reader = create_input_reader(
        input_config, create_storage(input_config.storage)
    )
    loaded = await input_reader.read_files()
    assert len(loaded) == 3
    assert loaded[0].metadata == {"title": "Hello"}
|
||||
Loading…
Reference in New Issue
Block a user