Improve and cleanup logging output of indexing (#1144)

Josh Bradley 2024-09-18 14:38:13 -04:00 committed by GitHub
parent aa5b426f1d
commit 594084f156
8 changed files with 14 additions and 8 deletions

View File

@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Improve logging."
+}

View File

@@ -74,7 +74,7 @@ NODE2VEC_WINDOW_SIZE = 2
NODE2VEC_ITERATIONS = 3
NODE2VEC_RANDOM_SEED = 597832
REPORTING_TYPE = ReportingType.file
-REPORTING_BASE_DIR = "output/${timestamp}/reports"
+REPORTING_BASE_DIR = "output/${timestamp}/logs"
SNAPSHOTS_GRAPHML = False
SNAPSHOTS_RAW_ENTITIES = False
SNAPSHOTS_TOP_LEVEL_NODES = False

View File

@@ -54,15 +54,12 @@ embeddings:
    # max_retry_wait: {defs.LLM_MAX_RETRY_WAIT}
    # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times
    # concurrent_requests: {defs.LLM_CONCURRENT_REQUESTS} # the number of parallel inflight requests that may be made

chunks:
  size: {defs.CHUNK_SIZE}
  overlap: {defs.CHUNK_OVERLAP}
  group_by_columns: [{",".join(defs.CHUNK_GROUP_BY_COLUMNS)}] # by default, we don't allow chunks to cross documents

input:
  type: {defs.INPUT_TYPE.value} # or blob
  file_type: {defs.INPUT_FILE_TYPE.value} # or csv
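
The section above appears to be part of a Python f-string template (note the {defs.*} placeholders), so the starter YAML is produced by substituting values from the package defaults when the config file is generated. A minimal sketch of how such a template expands, using made-up placeholder values rather than graphrag's actual defaults:

# Sketch only: _Defs and its values are illustrative assumptions,
# not graphrag's real defaults module.
class _Defs:
    CHUNK_SIZE = 1200
    CHUNK_OVERLAP = 100
    CHUNK_GROUP_BY_COLUMNS = ["id"]
    INPUT_TYPE = "file"
    INPUT_FILE_TYPE = "text"

defs = _Defs()

settings_yaml = f"""\
chunks:
  size: {defs.CHUNK_SIZE}
  overlap: {defs.CHUNK_OVERLAP}
  group_by_columns: [{','.join(defs.CHUNK_GROUP_BY_COLUMNS)}]

input:
  type: {defs.INPUT_TYPE}  # or blob
  file_type: {defs.INPUT_FILE_TYPE}  # or csv
"""
print(settings_yaml)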

View File

@@ -77,7 +77,6 @@ def load_llm_embeddings(
            raise ValueError(msg)
        if cache is not None:
            cache = cache.child(name)
        return loaders[llm_type]["load"](on_error, cache, llm_config or {})

    msg = f"Unknown LLM type {llm_type}"

View File

@@ -78,7 +78,7 @@ class BlobWorkflowCallbacks(NoopWorkflowCallbacks):
        blob_client = self._blob_service_client.get_blob_client(
            self._container_name, self._blob_name
        )
-        blob_client.append_block(json.dumps(log, ensure_ascii=False) + "\n")
+        blob_client.append_block(json.dumps(log, indent=4, ensure_ascii=False) + "\n")

        # update the blob's block count
        self._num_blocks += 1
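
The only change in this hunk is the indent=4 argument, i.e. each log record is now pretty-printed by the standard library before being appended to the blob. A small stdlib-only sketch with a made-up record (the keys are illustrative, not necessarily the exact fields graphrag writes here):

import json

log = {"type": "error", "data": "entity extraction error", "details": {"doc_id": 42}}

print(json.dumps(log, ensure_ascii=False))            # one line: {"type": "error", ...}
print(json.dumps(log, indent=4, ensure_ascii=False))  # one key per line, nested dicts indented further

With indent set, a single record spans several lines, which trades strict one-record-per-line output for readability.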

View File

@@ -42,6 +42,7 @@ class FileWorkflowCallbacks(NoopWorkflowCallbacks):
                    "source": str(cause),
                    "details": details,
                },
+                indent=4,
                ensure_ascii=False,
            )
            + "\n"

View File

@@ -24,7 +24,7 @@ def load_pipeline_reporter(
    config: PipelineReportingConfig | None, root_dir: str | None
) -> WorkflowCallbacks:
    """Create a reporter for the given pipeline config."""
-    config = config or PipelineFileReportingConfig(base_dir="reports")
+    config = config or PipelineFileReportingConfig(base_dir="logs")
    match config.type:
        case ReportingType.file:
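
Together with the REPORTING_BASE_DIR change earlier in this commit, the effect is that when no reporting config is supplied, pipeline reports now land under a logs/ directory rather than reports/. A standalone sketch of the fallback pattern shown in the hunk, where the dataclass below is a stand-in and not graphrag's actual PipelineFileReportingConfig:

from dataclasses import dataclass

@dataclass
class FileReportingConfig:
    base_dir: str

def resolve_reporting_config(config: FileReportingConfig | None) -> FileReportingConfig:
    # Fall back to a file reporter rooted at "logs" (previously "reports")
    # when the pipeline config does not specify reporting.
    return config or FileReportingConfig(base_dir="logs")

print(resolve_reporting_config(None).base_dir)                           # -> logs
print(resolve_reporting_config(FileReportingConfig("custom")).base_dir)  # -> custom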

View File

@@ -7,6 +7,7 @@ import traceback
from abc import ABC, abstractmethod
from typing import Generic, TypeVar

+from openai import RateLimitError
from typing_extensions import Unpack

from graphrag.llm.types import (
@@ -52,6 +53,10 @@ class BaseLLM(ABC, LLM[TIn, TOut], Generic[TIn, TOut]):
        try:
            output = await self._execute_llm(input, **kwargs)
            return LLMOutput(output=output)
+        except RateLimitError:
+            # for improved readability, do not log rate limit exceptions,
+            # they are logged/handled elsewhere
+            raise
        except Exception as e:
            stack_trace = traceback.format_exc()
            if self._on_error:
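
The new handler relies on Python's except-clause ordering: a more specific exception type listed before except Exception intercepts the error first, so rate-limit failures are re-raised for the retry machinery without the stack-trace logging that the generic handler performs. A self-contained sketch of the same pattern, where RateLimitError is a stand-in class and call_model/invoke are made-up names rather than graphrag's API:

import traceback

class RateLimitError(Exception):
    """Stand-in for openai.RateLimitError, for illustration only."""

def call_model(fail_with: type[Exception] | None = None) -> str:
    if fail_with is not None:
        raise fail_with("simulated failure")
    return "ok"

def invoke(fail_with: type[Exception] | None = None) -> str | None:
    try:
        return call_model(fail_with)
    except RateLimitError:
        # Expected and handled by the retry layer; re-raise without logging
        # so the error log stays readable.
        raise
    except Exception:
        # Unexpected errors: record the stack trace, then report None.
        print(traceback.format_exc())
        return None

invoke()                              # -> "ok"
invoke(fail_with=ValueError)          # prints a stack trace, returns None
try:
    invoke(fail_with=RateLimitError)  # re-raised untouched
except RateLimitError:
    pass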