Improve and cleanup logging output of indexing (#1144)

Josh Bradley 2024-09-18 14:38:13 -04:00 committed by GitHub
parent aa5b426f1d
commit 594084f156
8 changed files with 14 additions and 8 deletions

View File

@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Improve logging."
+}

View File

@@ -74,7 +74,7 @@ NODE2VEC_WINDOW_SIZE = 2
NODE2VEC_ITERATIONS = 3
NODE2VEC_RANDOM_SEED = 597832
REPORTING_TYPE = ReportingType.file
-REPORTING_BASE_DIR = "output/${timestamp}/reports"
+REPORTING_BASE_DIR = "output/${timestamp}/logs"
SNAPSHOTS_GRAPHML = False
SNAPSHOTS_RAW_ENTITIES = False
SNAPSHOTS_TOP_LEVEL_NODES = False

View File

@@ -54,15 +54,12 @@ embeddings:
    # max_retry_wait: {defs.LLM_MAX_RETRY_WAIT}
    # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
    # concurrent_requests: {defs.LLM_CONCURRENT_REQUESTS} # the number of parallel inflight requests that may be made

chunks:
  size: {defs.CHUNK_SIZE}
  overlap: {defs.CHUNK_OVERLAP}
  group_by_columns: [{",".join(defs.CHUNK_GROUP_BY_COLUMNS)}] # by default, we don't allow chunks to cross documents

input:
  type: {defs.INPUT_TYPE.value} # or blob
  file_type: {defs.INPUT_FILE_TYPE.value} # or csv
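
The section above appears to be part of a Python f-string template (note the {defs.*} placeholders), so the starter YAML is produced by substituting values from the package defaults when the config file is generated. A minimal sketch of how such a template expands, using made-up placeholder values rather than graphrag's actual defaults:

# Sketch only: _Defs and its values are illustrative assumptions,
# not graphrag's real defaults module.
class _Defs:
    CHUNK_SIZE = 1200
    CHUNK_OVERLAP = 100
    CHUNK_GROUP_BY_COLUMNS = ["id"]
    INPUT_TYPE = "file"
    INPUT_FILE_TYPE = "text"

defs = _Defs()

settings_yaml = f"""\
chunks:
  size: {defs.CHUNK_SIZE}
  overlap: {defs.CHUNK_OVERLAP}
  group_by_columns: [{','.join(defs.CHUNK_GROUP_BY_COLUMNS)}]

input:
  type: {defs.INPUT_TYPE}  # or blob
  file_type: {defs.INPUT_FILE_TYPE}  # or csv
"""
print(settings_yaml)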

View File

@@ -77,7 +77,6 @@ def load_llm_embeddings(
            raise ValueError(msg)
        if cache is not None:
            cache = cache.child(name)
        return loaders[llm_type]["load"](on_error, cache, llm_config or {})

    msg = f"Unknown LLM type {llm_type}"

View File

@@ -78,7 +78,7 @@ class BlobWorkflowCallbacks(NoopWorkflowCallbacks):
        blob_client = self._blob_service_client.get_blob_client(
            self._container_name, self._blob_name
        )
-        blob_client.append_block(json.dumps(log, ensure_ascii=False) + "\n")
+        blob_client.append_block(json.dumps(log, indent=4, ensure_ascii=False) + "\n")

        # update the blob's block count
        self._num_blocks += 1
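
The only change in this hunk is the indent=4 argument, i.e. each log record is now pretty-printed by the standard library before being appended to the blob. A small stdlib-only sketch with a made-up record (the keys are illustrative, not necessarily the exact fields graphrag writes here):

import json

log = {"type": "error", "data": "entity extraction error", "details": {"doc_id": 42}}

print(json.dumps(log, ensure_ascii=False))            # one line: {"type": "error", ...}
print(json.dumps(log, indent=4, ensure_ascii=False))  # one key per line, nested dicts indented further

With indent set, a single record spans several lines, which trades strict one-record-per-line output for readability.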

View File

@@ -42,6 +42,7 @@ class FileWorkflowCallbacks(NoopWorkflowCallbacks):
                    "source": str(cause),
                    "details": details,
                },
+                indent=4,
                ensure_ascii=False,
            )
            + "\n"

View File

@@ -24,7 +24,7 @@ def load_pipeline_reporter(
    config: PipelineReportingConfig | None, root_dir: str | None
) -> WorkflowCallbacks:
    """Create a reporter for the given pipeline config."""
-    config = config or PipelineFileReportingConfig(base_dir="reports")
+    config = config or PipelineFileReportingConfig(base_dir="logs")
    match config.type:
        case ReportingType.file:
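
Together with the REPORTING_BASE_DIR change earlier in this commit, the effect is that when no reporting config is supplied, pipeline reports now land under a logs/ directory rather than reports/. A standalone sketch of the fallback pattern shown in the hunk, where the dataclass below is a stand-in and not graphrag's actual PipelineFileReportingConfig:

from dataclasses import dataclass

@dataclass
class FileReportingConfig:
    base_dir: str

def resolve_reporting_config(config: FileReportingConfig | None) -> FileReportingConfig:
    # Fall back to a file reporter rooted at "logs" (previously "reports")
    # when the pipeline config does not specify reporting.
    return config or FileReportingConfig(base_dir="logs")

print(resolve_reporting_config(None).base_dir)                           # -> logs
print(resolve_reporting_config(FileReportingConfig("custom")).base_dir)  # -> custom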

View File

@@ -7,6 +7,7 @@ import traceback
from abc import ABC, abstractmethod
from typing import Generic, TypeVar

+from openai import RateLimitError
from typing_extensions import Unpack

from graphrag.llm.types import (
@@ -52,6 +53,10 @@ class BaseLLM(ABC, LLM[TIn, TOut], Generic[TIn, TOut]):
        try:
            output = await self._execute_llm(input, **kwargs)
            return LLMOutput(output=output)
+        except RateLimitError:
+            # for improved readability, do not log rate limit exceptions,
+            # they are logged/handled elsewhere
+            raise
        except Exception as e:
            stack_trace = traceback.format_exc()
            if self._on_error:
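
The new handler relies on Python's except-clause ordering: a more specific exception type listed before except Exception intercepts the error first, so rate-limit failures are re-raised for the retry machinery without the stack-trace logging that the generic handler performs. A self-contained sketch of the same pattern, where RateLimitError is a stand-in class and call_model/invoke are made-up names rather than graphrag's API:

import traceback

class RateLimitError(Exception):
    """Stand-in for openai.RateLimitError, for illustration only."""

def call_model(fail_with: type[Exception] | None = None) -> str:
    if fail_with is not None:
        raise fail_with("simulated failure")
    return "ok"

def invoke(fail_with: type[Exception] | None = None) -> str | None:
    try:
        return call_model(fail_with)
    except RateLimitError:
        # Expected and handled by the retry layer; re-raise without logging
        # so the error log stays readable.
        raise
    except Exception:
        # Unexpected errors: record the stack trace, then report None.
        print(traceback.format_exc())
        return None

invoke()                              # -> "ok"
invoke(fail_with=ValueError)          # prints a stack trace, returns None
try:
    invoke(fail_with=RateLimitError)  # re-raised untouched
except RateLimitError:
    pass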