mirror of
https://github.com/microsoft/graphrag.git
synced 2026-02-18 00:35:44 +08:00
Some checks are pending
gh-pages / build (push) Waiting to run
Python CI / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python CI / python-ci (ubuntu-latest, 3.11) (push) Waiting to run
Python CI / python-ci (windows-latest, 3.10) (push) Waiting to run
Python CI / python-ci (windows-latest, 3.11) (push) Waiting to run
Python Integration Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Integration Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Python Notebook Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Notebook Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Python Publish (pypi) / Upload release to PyPI (push) Waiting to run
Python Smoke Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Smoke Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Spellcheck / spellcheck (push) Waiting to run
* Initial plan for issue * Implement standard logging module and integrate with existing loggers Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Add test cases and improve documentation for standard logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Apply ruff formatting and add semversioner file for logging improvements Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove custom logger classes and refactor to use standard logging only Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Apply ruff formatting to resolve CI/CD test failures Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Add semversioner file and fix linting issues Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * ruff fixes * fix spelling error * Remove StandardProgressLogger and refactor to use standard logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove LoggerFactory and custom loggers, refactor to use standard logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix pyright error: use logger.info() instead of calling logger as function in cosmosdb_pipeline_storage.py Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * ruff fixes * Remove deprecated logger files that were marked as deprecated placeholders Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Replace custom get_logger with standard Python logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix linting issues found by ruff check --fix Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * apply ruff check fixes * add word to dictionary * Fix type checker error in ModelManager.__new__ method Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Refactor multiple logging.getLogger() calls to use single 
logger per file Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove progress_logger parameter from build_index() and logger parameter from generate_indexing_prompts() Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove logger parameter from run_pipeline and standardize logger naming Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Replace logger parameter with log_level parameter in CLI commands Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix import ordering in notebook files to pass poetry poe check Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove --logger parameter from smoke test command Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix Windows CI/CD issue with log file cleanup in tests Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Add StreamHandler to root logger in __main__.py for CLI logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Only add StreamHandler if root logger doesn't have existing StreamHandler Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix import ordering in notebook files to pass ruff checks Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Replace logging.StreamHandler with colorlog.StreamHandler for colorized log output Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Regenerate poetry.lock file after adding colorlog dependency Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix import ordering in notebook files to pass ruff checks Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * move printing of dataframes to debug level * remove colorlog for now * Refactor workflow callbacks to inherit from logging.Handler Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix 
linting issues in workflow callback handlers Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix pyright type errors in blob and file workflow callbacks Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Refactor pipeline logging to use pure logging.Handler subclasses Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Rename workflow callback classes to workflow logger classes and move to logger directory Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * update dictionary * apply ruff fixes * fix function name * simplify logger code * update * Remove error, warning, and log methods from WorkflowCallbacks and replace with standard logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * ruff fixes * Fix pyright errors by removing WorkflowCallbacks from strategy type signatures Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove ConsoleWorkflowLogger and apply consistent formatter to all handlers Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * apply ruff fixes * Refactor pipeline_logger.py to use standard FileHandler and remove FileWorkflowLogger Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove conditional azure import checks from blob_workflow_logger.py Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Fix pyright type checking errors in mock_provider.py and utils.py Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Run ruff check --fix to fix import ordering in notebooks Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Merge configure_logging and create_pipeline_logger into init_loggers function Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * Remove configure_logging and create_pipeline_logger functions, replace all usage with init_loggers Co-authored-by: 
jgbradley1 <654554+jgbradley1@users.noreply.github.com> * apply ruff fixes * cleanup unused code * Update init_loggers to accept GraphRagConfig instead of ReportingConfig Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * apply ruff check fixes * Fix test failures by providing valid GraphRagConfig with required model configurations Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * apply ruff fixes * remove logging_workflow_callback * cleanup logging messages * Add logging to track progress of pandas DataFrame apply operation in create_base_text_units Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * cleanup logger logic throughout codebase * update * more cleanup of old loggers * small logger cleanup * final code cleanup and added loggers to query * add verbose logging to query * minor code cleanup * Fix broken unit tests for chunk_text and standard_logging Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * apply ruff fixes * Fix test_chunk_text by mocking progress_ticker function instead of ProgressTicker class Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> * remove unnecessary logger * remove rich and fix type annotation * revert test formatting changes my by copilot * promote graphrag logs to root logger * add correct semversioner file * revert change to file * revert formatting changes that have no effect * fix changes after merge with main * revert unnecessary copilot changes * remove whitespace * cleanup docstring * simplify some logic with less code * update poetry lock file * ruff fixes --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com> Co-authored-by: Josh Bradley <joshbradley@microsoft.com>
112 lines
3.4 KiB
Python
112 lines
3.4 KiB
Python
# Copyright (c) 2024 Microsoft Corporation.
|
|
# Licensed under the MIT License
|
|
|
|
"""Text Utilities for LLM."""
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from collections.abc import Iterator
|
|
from itertools import islice
|
|
|
|
import tiktoken
|
|
from json_repair import repair_json
|
|
|
|
import graphrag.config.defaults as defs
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def num_tokens(text: str, token_encoder: tiktoken.Encoding | None = None) -> int:
    """Count how many tokens `text` encodes to.

    When no encoder is supplied, the encoding named by `defs.ENCODING_MODEL`
    is loaded through `tiktoken.get_encoding`.
    """
    encoder = (
        tiktoken.get_encoding(defs.ENCODING_MODEL)
        if token_encoder is None
        else token_encoder
    )
    encoded = encoder.encode(text)  # type: ignore
    return len(encoded)
|
|
|
|
|
|
def batched(iterable: Iterator, n: int):
    """
    Yield consecutive tuples of up to n items drawn from the iterable.

    Every tuple holds exactly n items except possibly the final one, which
    holds whatever is left over. Because this is a generator, the check on n
    only runs once iteration starts.

    Adapted from the Python docs recipe: https://docs.python.org/3/library/itertools.html#itertools.batched
    """
    # batched('ABCDEFG', 3) --> ABC DEF G
    if n < 1:
        message = "n must be at least one"
        raise ValueError(message)
    source = iter(iterable)
    while True:
        group = tuple(islice(source, n))
        if not group:
            # The source is exhausted: an empty slice means nothing is left.
            return
        yield group
|
|
|
|
|
|
def chunk_text(
    text: str, max_tokens: int, token_encoder: tiktoken.Encoding | None = None
):
    """Yield pieces of `text`, each decoded from at most `max_tokens` tokens.

    The text is encoded once, the token sequence is split into consecutive
    groups with `batched`, and each group is decoded back to a string. When no
    encoder is supplied, the encoding named by `defs.ENCODING_MODEL` is used.
    """
    encoder = token_encoder
    if encoder is None:
        encoder = tiktoken.get_encoding(defs.ENCODING_MODEL)
    token_ids = encoder.encode(text)  # type: ignore
    for group in batched(iter(token_ids), max_tokens):
        # `batched` yields tuples; decode is handed a list, as before.
        yield encoder.decode(list(group))
|
|
|
|
|
|
def try_parse_json_object(input: str, verbose: bool = True) -> tuple[str, dict]:
    """Parse a JSON object out of LLM output, cleaning and repairing it if needed.

    The LLM sometimes returns a JSON string with extra description around it,
    so parsing is attempted in stages: the raw text, then a cleaned-up copy,
    and finally the output of `repair_json`.

    Args:
        input: Raw text expected to contain a JSON object.
        verbose: When true, parse failures are reported through the module
            logger.

    Returns:
        A `(json_text, parsed)` tuple. `json_text` is the text that was
        finally parsed: the untouched input on the fast path, otherwise the
        cleaned or repaired text. `parsed` is `{}` when the repaired text
        still cannot be loaded, or loads to something other than a dict.
    """
    parsed = None
    try:
        # Fast path: the text may already be valid JSON.
        parsed = json.loads(input)
    except json.JSONDecodeError:
        if verbose:
            logger.warning("Error decoding faulty json, attempting repair")

    # NOTE(review): any truthy value is returned here without a dict check,
    # while falsy results (e.g. `{}`) fall through to the cleanup path below.
    if parsed:
        return input, parsed

    # Keep only the span from the first "{" to the last "}" so that any
    # description surrounding the object is dropped.
    match = re.search(r"\{(.*)\}", input, re.DOTALL)
    text = "{" + match.group(1) + "}" if match else input

    # Clean up json string. Literal backslash-n sequences must be replaced
    # before the blanket backslash removal; in the other order they can never
    # match and are left behind as a stray " n".
    text = (
        text.replace("{{", "{")
        .replace("}}", "}")
        .replace('"[{', "[{")
        .replace('}]"', "}]")
        .replace("\\n", " ")
        .replace("\\", " ")
        .replace("\n", " ")
        .replace("\r", "")
        .strip()
    )

    # Remove JSON Markdown Frame
    text = text.removeprefix("```json")
    text = text.removesuffix("```")

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Fixup potentially malformed json string using json_repair.
        text = str(repair_json(json_str=text, return_objects=False))

        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            if verbose:
                logger.exception("error loading json, json=%s", text)
            return text, {}

        if not isinstance(parsed, dict):
            if verbose:
                # No exception is being handled at this point, so use error()
                # instead of exception() to avoid logging an empty traceback.
                logger.error("not expected dict type. type=%s:", type(parsed))
            return text, {}
        return text, parsed

    # NOTE(review): the cleaned text parsed without repair; the value is
    # returned as-is, with no dict check on this path.
    return text, parsed
|