* Initial plan for issue
* Implement standard logging module and integrate with existing loggers
* Add test cases and improve documentation for standard logging
* Apply ruff formatting and add semversioner file for logging improvements
* Remove custom logger classes and refactor to use standard logging only
* Apply ruff formatting to resolve CI/CD test failures
* Add semversioner file and fix linting issues
* ruff fixes
* fix spelling error
* Remove StandardProgressLogger and refactor to use standard logging
* Remove LoggerFactory and custom loggers, refactor to use standard logging
* Fix pyright error: use logger.info() instead of calling logger as function in cosmosdb_pipeline_storage.py
* ruff fixes
* Remove deprecated logger files that were marked as deprecated placeholders
* Replace custom get_logger with standard Python logging
* Fix linting issues found by ruff check --fix
* apply ruff check fixes
* add word to dictionary
* Fix type checker error in ModelManager.__new__ method
* Refactor multiple logging.getLogger() calls to use single logger per file
* Remove progress_logger parameter from build_index() and logger parameter from generate_indexing_prompts()
* Remove logger parameter from run_pipeline and standardize logger naming
* Replace logger parameter with log_level parameter in CLI commands
* Fix import ordering in notebook files to pass poetry poe check
* Remove --logger parameter from smoke test command
* Fix Windows CI/CD issue with log file cleanup in tests
* Add StreamHandler to root logger in __main__.py for CLI logging
* Only add StreamHandler if root logger doesn't have an existing StreamHandler
* Fix import ordering in notebook files to pass ruff checks
* Replace logging.StreamHandler with colorlog.StreamHandler for colorized log output
* Regenerate poetry.lock file after adding colorlog dependency
* Fix import ordering in notebook files to pass ruff checks
* move printing of dataframes to debug level
* remove colorlog for now
* Refactor workflow callbacks to inherit from logging.Handler
* Fix linting issues in workflow callback handlers
* Fix pyright type errors in blob and file workflow callbacks
* Refactor pipeline logging to use pure logging.Handler subclasses
* Rename workflow callback classes to workflow logger classes and move to logger directory
* update dictionary
* apply ruff fixes
* fix function name
* simplify logger code
* update
* Remove error, warning, and log methods from WorkflowCallbacks and replace with standard logging
* ruff fixes
* Fix pyright errors by removing WorkflowCallbacks from strategy type signatures
* Remove ConsoleWorkflowLogger and apply consistent formatter to all handlers
* apply ruff fixes
* Refactor pipeline_logger.py to use standard FileHandler and remove FileWorkflowLogger
* Remove conditional azure import checks from blob_workflow_logger.py
* Fix pyright type checking errors in mock_provider.py and utils.py
* Run ruff check --fix to fix import ordering in notebooks
* Merge configure_logging and create_pipeline_logger into init_loggers function
* Remove configure_logging and create_pipeline_logger functions, replace all usage with init_loggers
* apply ruff fixes
* cleanup unused code
* Update init_loggers to accept GraphRagConfig instead of ReportingConfig
* apply ruff check fixes
* Fix test failures by providing valid GraphRagConfig with required model configurations
* apply ruff fixes
* remove logging_workflow_callback
* cleanup logging messages
* Add logging to track progress of pandas DataFrame apply operation in create_base_text_units
* cleanup logger logic throughout codebase
* update
* more cleanup of old loggers
* small logger cleanup
* final code cleanup and added loggers to query
* add verbose logging to query
* minor code cleanup
* Fix broken unit tests for chunk_text and standard_logging
* apply ruff fixes
* Fix test_chunk_text by mocking progress_ticker function instead of ProgressTicker class
* remove unnecessary logger
* remove rich and fix type annotation
* revert test formatting changes made by copilot
* promote graphrag logs to root logger
* add correct semversioner file
* revert change to file
* revert formatting changes that have no effect
* fix changes after merge with main
* revert unnecessary copilot changes
* remove whitespace
* cleanup docstring
* simplify some logic with less code
* update poetry lock file
* ruff fixes

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com>
Co-authored-by: Josh Bradley <joshbradley@microsoft.com>
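The through-line of these commits is replacing GraphRAG's custom logger classes (LoggerFactory, StandardProgressLogger, FileWorkflowLogger, and friends) with Python's standard logging module: each log sink becomes a logging.Handler subclass, and each module holds a single logger = logging.getLogger(...). A minimal sketch of that pattern follows; ListWorkflowLogger and its in-memory buffer are hypothetical stand-ins for illustration, not the actual blob or file handlers from this PR.

import logging


class ListWorkflowLogger(logging.Handler):
    """Hypothetical sink: collects formatted pipeline records in memory."""

    def __init__(self) -> None:
        super().__init__()
        self.records: list[str] = []

    def emit(self, record: logging.LogRecord) -> None:
        # self.format() applies whatever Formatter is attached to this handler
        self.records.append(self.format(record))


logger = logging.getLogger("graphrag")
logger.setLevel(logging.INFO)
logger.addHandler(ListWorkflowLogger())
logger.info("Running standard indexing.")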
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Different methods to run the pipeline."""

import json
import logging
import re
import time
from collections.abc import AsyncIterable
from dataclasses import asdict

from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.index.run.utils import create_run_context
from graphrag.index.typing.context import PipelineRunContext
from graphrag.index.typing.pipeline import Pipeline
from graphrag.index.typing.pipeline_run_result import PipelineRunResult
from graphrag.storage.pipeline_storage import PipelineStorage
from graphrag.utils.api import create_cache_from_config, create_storage_from_config
from graphrag.utils.storage import load_table_from_storage, write_table_to_storage

logger = logging.getLogger(__name__)
async def run_pipeline(
    pipeline: Pipeline,
    config: GraphRagConfig,
    callbacks: WorkflowCallbacks,
    is_update_run: bool = False,
) -> AsyncIterable[PipelineRunResult]:
    """Run all workflows using a simplified pipeline."""
    root_dir = config.root_dir

    input_storage = create_storage_from_config(config.input.storage)
    output_storage = create_storage_from_config(config.output)
    cache = create_cache_from_config(config.cache, root_dir)

    # load existing state in case any workflows are stateful
    state_json = await output_storage.get("context.json")
    state = json.loads(state_json) if state_json else {}

    if is_update_run:
        logger.info("Running incremental indexing.")

        update_storage = create_storage_from_config(config.update_index_output)
        # we use this to store the new subset index, and will merge its content with the previous index
        update_timestamp = time.strftime("%Y%m%d-%H%M%S")
        timestamped_storage = update_storage.child(update_timestamp)
        delta_storage = timestamped_storage.child("delta")
        # copy the previous output to a backup folder, so we can replace it with the update
        # we'll read from this later when we merge the old and new indexes
        previous_storage = timestamped_storage.child("previous")
        await _copy_previous_output(output_storage, previous_storage)

        state["update_timestamp"] = update_timestamp

        context = create_run_context(
            input_storage=input_storage,
            output_storage=delta_storage,
            previous_storage=previous_storage,
            cache=cache,
            callbacks=callbacks,
            state=state,
        )
    else:
        logger.info("Running standard indexing.")

        context = create_run_context(
            input_storage=input_storage,
            output_storage=output_storage,
            cache=cache,
            callbacks=callbacks,
            state=state,
        )

    async for table in _run_pipeline(
        pipeline=pipeline,
        config=config,
        context=context,
    ):
        yield table
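# Illustrative storage layout for an update run, derived from the child()
# calls above (assuming a file-based update_index_output directory):
#
#   <update_index_output>/
#       <update_timestamp>/      e.g. 20240101-120000
#           delta/               new subset index written by this run
#           previous/            backup of the prior index, merged in later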
async def _run_pipeline(
    pipeline: Pipeline,
    config: GraphRagConfig,
    context: PipelineRunContext,
) -> AsyncIterable[PipelineRunResult]:
    """Run each workflow in order, yielding a PipelineRunResult as it completes."""
    start_time = time.time()

    last_workflow = "<startup>"

    try:
        await _dump_json(context)

        logger.info("Executing pipeline...")
        for name, workflow_function in pipeline.run():
            last_workflow = name
            context.callbacks.workflow_start(name, None)
            work_time = time.time()
            result = await workflow_function(config, context)
            context.callbacks.workflow_end(name, result)
            yield PipelineRunResult(
                workflow=name, result=result.result, state=context.state, errors=None
            )
            context.stats.workflows[name] = {"overall": time.time() - work_time}
            if result.stop:
                logger.info("Halting pipeline at workflow request")
                break

        context.stats.total_runtime = time.time() - start_time
        logger.info("Indexing pipeline complete.")
        await _dump_json(context)

    except Exception as e:
        logger.exception("error running workflow %s", last_workflow)
        yield PipelineRunResult(
            workflow=last_workflow, result=None, state=context.state, errors=[e]
        )
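# Note: the "context.json" written below is the same file that run_pipeline
# reads back at startup, which is how workflow state survives across runs
# (including incremental update runs).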
async def _dump_json(context: PipelineRunContext) -> None:
    """Dump the stats and context state to the storage."""
    await context.output_storage.set(
        "stats.json", json.dumps(asdict(context.stats), indent=4, ensure_ascii=False)
    )
    await context.output_storage.set(
        "context.json", json.dumps(context.state, indent=4, ensure_ascii=False)
    )
async def _copy_previous_output(
    storage: PipelineStorage,
    copy_storage: PipelineStorage,
):
    """Copy the parquet tables from one storage to another as a backup."""
    for file in storage.find(re.compile(r"\.parquet$")):
        base_name = file[0].replace(".parquet", "")
        table = await load_table_from_storage(base_name, storage)
        await write_table_to_storage(table, base_name, copy_storage)
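Since run_pipeline is an async generator, callers drain it with async for and see one PipelineRunResult per workflow as it finishes; _run_pipeline catches any exception, yields it on the result's errors field, and then stops rather than raising. A minimal consumption sketch, assuming pipeline, config, and callbacks objects are constructed elsewhere (how to build them is out of scope here):

import asyncio


async def consume(pipeline, config, callbacks) -> None:
    # hypothetical consumer showing the intended iteration pattern
    async for result in run_pipeline(pipeline, config, callbacks):
        if result.errors:
            # the failing workflow's result is the last one yielded
            logger.error("workflow %s failed: %s", result.workflow, result.errors)
        else:
            logger.info("workflow %s succeeded", result.workflow)


# asyncio.run(consume(pipeline, config, callbacks))  # with objects built elsewhere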