mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 09:07:20 +08:00
Some checks are pending
gh-pages / build (push) Waiting to run
Python CI / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python CI / python-ci (ubuntu-latest, 3.11) (push) Waiting to run
Python CI / python-ci (windows-latest, 3.10) (push) Waiting to run
Python CI / python-ci (windows-latest, 3.11) (push) Waiting to run
Python Integration Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Integration Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Python Notebook Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Notebook Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Python Publish (pypi) / Upload release to PyPI (push) Waiting to run
Python Smoke Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Smoke Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Spellcheck / spellcheck (push) Waiting to run
* Initial plan for issue * Implement standard logging module and integrate with existing loggers Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Add test cases and improve documentation for standard logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Apply ruff formatting and add semversioner file for logging improvements Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove custom logger classes and refactor to use standard logging only Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Apply ruff formatting to resolve CI/CD test failures Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Add semversioner file and fix linting issues Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * ruff fixes * fix spelling error * Remove StandardProgressLogger and refactor to use standard logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove LoggerFactory and custom loggers, refactor to use standard logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix pyright error: use logger.info() instead of calling logger as function in cosmosdb_pipeline_storage.py Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * ruff fixes * Remove deprecated logger files that were marked as deprecated placeholders Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Replace custom get_logger with standard Python logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix linting issues found by ruff check --fix Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * apply ruff check fixes * add word to dictionary * Fix type checker error in ModelManager.__new__ method Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Refactor multiple logging.getLogger() calls to use single 
logger per file Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove progress_logger parameter from build_index() and logger parameter from generate_indexing_prompts() Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove logger parameter from run_pipeline and standardize logger naming Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Replace logger parameter with log_level parameter in CLI commands Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix import ordering in notebook files to pass poetry poe check Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove --logger parameter from smoke test command Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix Windows CI/CD issue with log file cleanup in tests Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Add StreamHandler to root logger in __main__.py for CLI logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Only add StreamHandler if root logger doesn't have existing StreamHandler Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix import ordering in notebook files to pass ruff checks Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Replace logging.StreamHandler with colorlog.StreamHandler for colorized log output Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Regenerate poetry.lock file after adding colorlog dependency Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix import ordering in notebook files to pass ruff checks Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * move printing of dataframes to debug level * remove colorlog for now * Refactor workflow callbacks to inherit from logging.Handler Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix 
linting issues in workflow callback handlers Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix pyright type errors in blob and file workflow callbacks Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Refactor pipeline logging to use pure logging.Handler subclasses Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Rename workflow callback classes to workflow logger classes and move to logger directory Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * update dictionary * apply ruff fixes * fix function name * simplify logger code * update * Remove error, warning, and log methods from WorkflowCallbacks and replace with standard logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * ruff fixes * Fix pyright errors by removing WorkflowCallbacks from strategy type signatures Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove ConsoleWorkflowLogger and apply consistent formatter to all handlers Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * apply ruff fixes * Refactor pipeline_logger.py to use standard FileHandler and remove FileWorkflowLogger Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove conditional azure import checks from blob_workflow_logger.py Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix pyright type checking errors in mock_provider.py and utils.py Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Run ruff check --fix to fix import ordering in notebooks Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Merge configure_logging and create_pipeline_logger into init_loggers function Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove configure_logging and create_pipeline_logger functions, replace all usage with init_loggers Co-authored-by: 
jgbradley1 <654554+jgbradley1@users.noreply.github.com> * apply ruff fixes * cleanup unused code * Update init_loggers to accept GraphRagConfig instead of ReportingConfig Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * apply ruff check fixes * Fix test failures by providing valid GraphRagConfig with required model configurations Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * apply ruff fixes * remove logging_workflow_callback * cleanup logging messages * Add logging to track progress of pandas DataFrame apply operation in create_base_text_units Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * cleanup logger logic throughout codebase * update * more cleanup of old loggers * small logger cleanup * final code cleanup and added loggers to query * add verbose logging to query * minor code cleanup * Fix broken unit tests for chunk_text and standard_logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * apply ruff fixes * Fix test_chunk_text by mocking progress_ticker function instead of ProgressTicker class Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * remove unnecessary logger * remove rich and fix type annotation * revert test formatting changes my by copilot * promote graphrag logs to root logger * add correct semversioner file * revert change to file * revert formatting changes that have no effect * fix changes after merge with main * revert unnecessary copilot changes * remove whitespace * cleanup docstring * simplify some logic with less code * update poetry lock file * ruff fixes --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> Co-authored-by: Josh Bradley <joshbradley@microsoft.com>
127 lines
4.4 KiB
Python
127 lines
4.4 KiB
Python
# Copyright (c) 2024 Microsoft Corporation.
|
|
# Licensed under the MIT License
|
|
|
|
"""CLI implementation of the prompt-tune subcommand."""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import graphrag.api as api
|
|
from graphrag.config.enums import ReportingType
|
|
from graphrag.config.load_config import load_config
|
|
from graphrag.prompt_tune.generator.community_report_summarization import (
|
|
COMMUNITY_SUMMARIZATION_FILENAME,
|
|
)
|
|
from graphrag.prompt_tune.generator.entity_summarization_prompt import (
|
|
ENTITY_SUMMARIZATION_FILENAME,
|
|
)
|
|
from graphrag.prompt_tune.generator.extract_graph_prompt import (
|
|
EXTRACT_GRAPH_FILENAME,
|
|
)
|
|
from graphrag.utils.cli import redact
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def prompt_tune(
    root: Path,
    config: Path | None,
    domain: str | None,
    verbose: bool,
    selection_method: api.DocSelectionType,
    limit: int,
    max_tokens: int,
    chunk_size: int,
    overlap: int,
    language: str | None,
    discover_entity_types: bool,
    output: Path,
    n_subset_max: int,
    k: int,
    min_examples_required: int,
) -> None:
    """Prompt tune the model.

    Loads the configuration found under ``root``, applies the chunking
    overrides, initializes logging, generates the indexing prompts through
    ``api.generate_indexing_prompts`` and writes the three resulting prompts
    (extract graph, entity summarization, community summarization) into
    ``output``.

    Parameters
    ----------
    - root: The root directory.
    - config: The configuration file.
    - domain: The domain to map the input documents to.
    - verbose: Enable verbose logging.
    - selection_method: The chunk selection method.
    - limit: The limit of chunks to load.
    - max_tokens: The maximum number of tokens to use on entity extraction prompts.
    - chunk_size: The chunk token size to use; replaces the configured chunk size.
    - overlap: The chunk overlap to use; replaces the configured chunk overlap.
    - language: The language to use for the prompts.
    - discover_entity_types: Generate entity types.
    - output: The output folder to store the prompts.
    - n_subset_max: The number of text chunks to embed when using auto selection method.
    - k: The number of documents to select when using auto selection method.
    - min_examples_required: The minimum number of examples required for entity extraction prompts.
    """
    root_path = Path(root).resolve()
    graph_config = load_config(root_path, config)

    # override chunking config in the configuration
    if chunk_size != graph_config.chunks.size:
        graph_config.chunks.size = chunk_size

    if overlap != graph_config.chunks.overlap:
        graph_config.chunks.overlap = overlap

    # Kept as a function-scope import, as in the original.
    # NOTE(review): presumably deferred to avoid an import cycle or import-time
    # cost -- confirm before hoisting to module level.
    from graphrag.logger.standard_logging import init_loggers

    # initialize loggers with config
    init_loggers(
        config=graph_config,
        root_dir=str(root_path),
        verbose=verbose,
    )

    # log the configuration details
    if graph_config.reporting.type == ReportingType.file:
        # root_path is already a resolved Path, so no re-wrapping is needed.
        log_path = root_path / (graph_config.reporting.base_dir or "") / "logs.txt"
        logger.info("Logging enabled at %s", log_path)
    else:
        logger.info(
            "Logging not enabled for config %s",
            redact(graph_config.model_dump()),
        )

    prompts = await api.generate_indexing_prompts(
        config=graph_config,
        chunk_size=chunk_size,
        overlap=overlap,
        limit=limit,
        selection_method=selection_method,
        domain=domain,
        language=language,
        max_tokens=max_tokens,
        discover_entity_types=discover_entity_types,
        min_examples_required=min_examples_required,
        n_subset_max=n_subset_max,
        k=k,
    )

    # A Path object is always truthy, so testing the resolved path can never
    # detect a missing output location. Check the argument itself before
    # resolving it so this branch is actually reachable.
    if output is None:
        logger.error("No output path provided. Skipping writing prompts.")
        return

    output_path = output.resolve()
    logger.info("Writing prompts to %s", output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    # Pair each target filename with its prompt; the order matches the
    # positions used for the prompts returned by generate_indexing_prompts.
    prompt_files = (
        (EXTRACT_GRAPH_FILENAME, prompts[0]),
        (ENTITY_SUMMARIZATION_FILENAME, prompts[1]),
        (COMMUNITY_SUMMARIZATION_FILENAME, prompts[2]),
    )

    # write files to output path
    # Written as bytes (not text mode) so newlines are not translated.
    for filename, prompt in prompt_files:
        (output_path / filename).write_bytes(
            prompt.encode(encoding="utf-8", errors="strict")
        )

    logger.info("Prompts written to %s", output_path)
|