Add json lines (jsonl) input support

2026-01-14 00:57:23 +08:00 · 2026-01-06 14:59:49 -08:00 · 2026-01-06 14:59:49 -08:00 · 8b45208ba9
commit 8b45208ba9
parent a03df1b350
10 changed files with 106 additions and 16 deletions
--- a/packages/graphrag/graphrag/index/input/csv.py
+++ b/packages/graphrag/graphrag/index/input/csv.py
@ -16,14 +16,14 @@ class CSVFileReader(StructuredFileReader):
    """Reader implementation for csv files."""

    async def read_file(self, path: str) -> list[TextDocument]:
-        """Read a csv file into a DataFrame of documents.
+        """Read a csv file into a list of documents.

        Args:
            - path - The path to read the file from.

        Returns
        -------
-            - output - DataFrame with a row for each document in the file.
+            - output - list with a TextDocument for each row in the file.
        """
        file = await self._storage.get(path)

--- a/packages/graphrag/graphrag/index/input/input_file_type.py
+++ b/packages/graphrag/graphrag/index/input/input_file_type.py
@ -15,6 +15,8 @@ class InputFileType(StrEnum):
    """The text input type."""
    Json = "json"
    """The JSON input type."""
+    JsonLines = "jsonl"
+    """The JSON Lines input type."""

    def __repr__(self):
        """Get a string representation."""
--- a/packages/graphrag/graphrag/index/input/input_reader_factory.py
+++ b/packages/graphrag/graphrag/index/input/input_reader_factory.py
@ -73,6 +73,10 @@ def create_input_reader(config: InputConfig, storage: Storage) -> InputReader:
                from graphrag.index.input.json import JSONFileReader

                register_input_reader(InputFileType.Json, JSONFileReader)
+            case InputFileType.JsonLines:
+                from graphrag.index.input.jsonl import JSONLinesFileReader
+
+                register_input_reader(InputFileType.JsonLines, JSONLinesFileReader)
            case _:
                msg = f"InputConfig.file_type '{input_strategy}' is not registered in the InputReaderFactory. Registered types: {', '.join(input_reader_factory.keys())}."
                raise ValueError(msg)
--- a/packages/graphrag/graphrag/index/input/json.py
+++ b/packages/graphrag/graphrag/index/input/json.py
@ -16,14 +16,14 @@ class JSONFileReader(StructuredFileReader):
    """Reader implementation for json files."""

    async def read_file(self, path: str) -> list[TextDocument]:
-        """Read a JSON file into a DataFrame of documents.
+        """Read a JSON file into a list of documents.

        Args:
            - path - The path to read the file from.

        Returns
        -------
-            - output - DataFrame with a row for each document in the file.
+            - output - list with a TextDocument for each row in the file.
        """
        text = await self._storage.get(path, encoding=self._encoding)
        as_json = json.loads(text)
--- a/packages/graphrag/graphrag/index/input/jsonl.py
+++ b/packages/graphrag/graphrag/index/input/jsonl.py
@ -0,0 +1,32 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""A module containing 'JSONLinesFileReader' model."""
+
+import json
+import logging
+
+from graphrag.index.input.structured_file_reader import StructuredFileReader
+from graphrag.index.input.text_document import TextDocument
+
+logger = logging.getLogger(__name__)
+
+
+class JSONLinesFileReader(StructuredFileReader):
+    """Reader implementation for JSON Lines (jsonl) files."""
+
+    async def read_file(self, path: str) -> list[TextDocument]:
+        """Read a JSON lines file into a list of documents.
+
+        This differs from standard JSON files in that each line is a separate JSON object.
+
+        Args:
+            - path - The path to read the file from.
+
+        Returns
+        -------
+            - output - list with a TextDocument for each row in the file.
+        """
+        text = await self._storage.get(path, encoding=self._encoding)
+        rows = [json.loads(line) for line in text.splitlines()]
+        return await self.process_data_columns(rows, path)
--- a/packages/graphrag/graphrag/index/input/structured_file_reader.py
+++ b/packages/graphrag/graphrag/index/input/structured_file_reader.py
@ -35,7 +35,7 @@ class StructuredFileReader(InputReader):
    ) -> list[TextDocument]:
        """Process configured data columns from a list of loaded dicts."""
        documents = []
-        for row in rows:
+        for index, row in enumerate(rows):
            # text column is required - harvest from dict
            text = row[self._text_column]
            # id is optional - generate from harvest from dict or hash from text
@ -45,7 +45,9 @@ class StructuredFileReader(InputReader):
                else gen_sha512_hash({"text": text}, ["text"])
            )
            # title is optional - harvest from dict or use filename
-            title = row[self._title_column] if self._title_column else str(path)
+            title = (
+                row[self._title_column] if self._title_column else f"{path} ({index})"
+            )
            creation_date = await self._storage.get_creation_date(path)
            documents.append(
                TextDocument(
--- a/tests/unit/indexing/input/data/one-jsonl/input.jsonl
+++ b/tests/unit/indexing/input/data/one-jsonl/input.jsonl
@ -0,0 +1,3 @@
+{ "title": "Hello", "text": "Hi how are you today?"}
+{ "title": "Goodbye", "text": "I'm outta here"}
+{ "title": "Adios", "text": "See you later"}
--- a/tests/unit/indexing/input/test_csv_loader.py
+++ b/tests/unit/indexing/input/test_csv_loader.py
@ -21,7 +21,7 @@ async def test_csv_loader_one_file():
    reader = create_input_reader(config, storage)
    documents = await reader.read_files()
    assert len(documents) == 2
-    assert documents[0].title == "input.csv"
+    assert documents[0].title == "input.csv (0)"
    assert documents[0].metadata is None


@ -31,7 +31,6 @@ async def test_csv_loader_one_file_with_title():
            base_dir="tests/unit/indexing/input/data/one-csv",
        ),
        file_type=InputFileType.Csv,
-        file_pattern=".*\\.csv$",
        title_column="title",
    )
    storage = create_storage(config.storage)
@ -47,7 +46,6 @@ async def test_csv_loader_one_file_with_metadata():
            base_dir="tests/unit/indexing/input/data/one-csv",
        ),
        file_type=InputFileType.Csv,
-        file_pattern=".*\\.csv$",
        title_column="title",
        metadata=["title"],
    )
@ -64,7 +62,6 @@ async def test_csv_loader_multiple_files():
            base_dir="tests/unit/indexing/input/data/multiple-csvs",
        ),
        file_type=InputFileType.Csv,
-        file_pattern=".*\\.csv$",
    )
    storage = create_storage(config.storage)
    reader = create_input_reader(config, storage)
--- a/tests/unit/indexing/input/test_json_loader.py
+++ b/tests/unit/indexing/input/test_json_loader.py
@ -19,7 +19,7 @@ async def test_json_loader_one_file_one_object():
    reader = create_input_reader(config, storage)
    documents = await reader.read_files()
    assert len(documents) == 1
-    assert documents[0].title == "input.json"
+    assert documents[0].title == "input.json (0)"
    assert documents[0].metadata is None


@ -29,13 +29,13 @@ async def test_json_loader_one_file_multiple_objects():
            base_dir="tests/unit/indexing/input/data/one-json-multiple-objects",
        ),
        file_type=InputFileType.Json,
-        file_pattern=".*\\.json$",
    )
    storage = create_storage(config.storage)
    reader = create_input_reader(config, storage)
    documents = await reader.read_files()
    assert len(documents) == 3
-    assert documents[0].title == "input.json"
+    assert documents[0].title == "input.json (0)"
+    assert documents[1].title == "input.json (1)"


 async def test_json_loader_one_file_with_title():
@ -44,7 +44,6 @@ async def test_json_loader_one_file_with_title():
            base_dir="tests/unit/indexing/input/data/one-json-one-object",
        ),
        file_type=InputFileType.Json,
-        file_pattern=".*\\.json$",
        title_column="title",
    )
    storage = create_storage(config.storage)
@ -60,7 +59,6 @@ async def test_json_loader_one_file_with_metadata():
            base_dir="tests/unit/indexing/input/data/one-json-one-object",
        ),
        file_type=InputFileType.Json,
-        file_pattern=".*\\.json$",
        title_column="title",
        metadata=["title"],
    )
@ -77,7 +75,6 @@ async def test_json_loader_multiple_files():
            base_dir="tests/unit/indexing/input/data/multiple-jsons",
        ),
        file_type=InputFileType.Json,
-        file_pattern=".*\\.json$",
    )
    storage = create_storage(config.storage)
    reader = create_input_reader(config, storage)
--- a/tests/unit/indexing/input/test_jsonl_loader.py
+++ b/tests/unit/indexing/input/test_jsonl_loader.py
@ -0,0 +1,53 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+from graphrag.index.input.input_config import InputConfig
+from graphrag.index.input.input_file_type import InputFileType
+from graphrag.index.input.input_reader_factory import create_input_reader
+from graphrag_storage import StorageConfig, create_storage
+
+
+async def test_jsonl_loader_one_file_multiple_objects():
+    config = InputConfig(
+        storage=StorageConfig(
+            base_dir="tests/unit/indexing/input/data/one-jsonl",
+        ),
+        file_type=InputFileType.JsonLines,
+        file_pattern=".*\\.jsonl$",
+    )
+    storage = create_storage(config.storage)
+    reader = create_input_reader(config, storage)
+    documents = await reader.read_files()
+    assert len(documents) == 3
+    assert documents[0].title == "input.jsonl (0)"
+
+
+async def test_jsonl_loader_one_file_with_title():
+    config = InputConfig(
+        storage=StorageConfig(
+            base_dir="tests/unit/indexing/input/data/one-jsonl",
+        ),
+        file_type=InputFileType.JsonLines,
+        title_column="title",
+    )
+    storage = create_storage(config.storage)
+    reader = create_input_reader(config, storage)
+    documents = await reader.read_files()
+    assert len(documents) == 3
+    assert documents[0].title == "Hello"
+
+
+async def test_jsonl_loader_one_file_with_metadata():
+    config = InputConfig(
+        storage=StorageConfig(
+            base_dir="tests/unit/indexing/input/data/one-jsonl",
+        ),
+        file_type=InputFileType.JsonLines,
+        title_column="title",
+        metadata=["title"],
+    )
+    storage = create_storage(config.storage)
+    reader = create_input_reader(config, storage)
+    documents = await reader.read_files()
+    assert len(documents) == 3
+    assert documents[0].metadata == {"title": "Hello"}