From 3b09df6e078c15d006c225a62f37437b685eb247 Mon Sep 17 00:00:00 2001 From: Derek Worthen Date: Wed, 18 Sep 2024 16:36:50 -0700 Subject: [PATCH] Migrate towards using static output directories (#1113) * Migrate towards using static output directories - Fixes load_config eagerly resolving directories. Directories are only resolved when the output directories are local. - Add support for `--output` and `--reports` flags for index CLI. To achieve previous output structure `index --output run1/artifacts --reports run1/reports`. - Use static output directories when initializing a new project. - Maintains backward compatibility for those using timestamp outputs locally. * fix smoke tests * update query cli to work with static directories * remove eager path resolution from load_config. Support CLI overrides that can be resolved. * add docs and output logs/artifacts to same directory * use match statement * switch back to if statement --------- Co-authored-by: Alonso Guevara --- .../patch-20240910134759384450.json | 4 ++ docsite/posts/index/2-cli.md | 2 + graphrag/config/__init__.py | 3 +- graphrag/config/defaults.py | 4 +- graphrag/config/load_config.py | 20 --------- graphrag/config/resolve_path.py | 41 +++++++++++++++++++ graphrag/index/__main__.py | 8 ++++ graphrag/index/cli.py | 24 +++++++---- graphrag/query/cli.py | 14 +++---- tests/fixtures/min-csv/settings.yml | 13 ++++++ tests/fixtures/text/settings.yml | 13 ++++++ v1-breaking-changes.md | 36 ++++++++++++++++ 12 files changed, 143 insertions(+), 39 deletions(-) create mode 100644 .semversioner/next-release/patch-20240910134759384450.json create mode 100644 v1-breaking-changes.md diff --git a/.semversioner/next-release/patch-20240910134759384450.json b/.semversioner/next-release/patch-20240910134759384450.json new file mode 100644 index 00000000..b22abb18 --- /dev/null +++ b/.semversioner/next-release/patch-20240910134759384450.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Use static output 
directories." +} diff --git a/docsite/posts/index/2-cli.md b/docsite/posts/index/2-cli.md index 97911bbc..79af671c 100644 --- a/docsite/posts/index/2-cli.md +++ b/docsite/posts/index/2-cli.md @@ -24,3 +24,5 @@ python -m graphrag.index --verbose --root --config ` - This will specify the progress reporter to use. The default is `rich`. Valid values are `rich`, `print`, and `none`. - `--emit ` - This specifies the table output formats the pipeline should emit. The default is `parquet`. Valid values are `parquet`, `csv`, and `json`, comma-separated. - `--nocache` - This will disable the caching mechanism. This is useful for debugging and development, but should not be used in production. +- `--output ` - Specify the output directory for pipeline artifacts. +- `--reports ` - Specify the output directory for reporting. diff --git a/graphrag/config/__init__.py b/graphrag/config/__init__.py index fae2ba3a..62708f5d 100644 --- a/graphrag/config/__init__.py +++ b/graphrag/config/__init__.py @@ -68,7 +68,7 @@ from .models import ( UmapConfig, ) from .read_dotenv import read_dotenv -from .resolve_path import resolve_path +from .resolve_path import resolve_path, resolve_paths __all__ = [ "ApiKeyMissingError", @@ -128,5 +128,6 @@ __all__ = [ "load_config_from_file", "read_dotenv", "resolve_path", + "resolve_paths", "search_for_config_in_root_dir", ] diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py index f67518bc..c7880616 100644 --- a/graphrag/config/defaults.py +++ b/graphrag/config/defaults.py @@ -74,11 +74,11 @@ NODE2VEC_WINDOW_SIZE = 2 NODE2VEC_ITERATIONS = 3 NODE2VEC_RANDOM_SEED = 597832 REPORTING_TYPE = ReportingType.file -REPORTING_BASE_DIR = "output/${timestamp}/logs" +REPORTING_BASE_DIR = "output" SNAPSHOTS_GRAPHML = False SNAPSHOTS_RAW_ENTITIES = False SNAPSHOTS_TOP_LEVEL_NODES = False -STORAGE_BASE_DIR = "output/${timestamp}/artifacts" +STORAGE_BASE_DIR = "output" STORAGE_TYPE = StorageType.file SUMMARIZE_DESCRIPTIONS_MAX_LENGTH = 500 
UMAP_ENABLED = False diff --git a/graphrag/config/load_config.py b/graphrag/config/load_config.py index 6d57cc8f..d46f01bd 100644 --- a/graphrag/config/load_config.py +++ b/graphrag/config/load_config.py @@ -7,15 +7,12 @@ from pathlib import Path from .config_file_loader import load_config_from_file, search_for_config_in_root_dir from .create_graphrag_config import create_graphrag_config -from .enums import ReportingType, StorageType from .models.graph_rag_config import GraphRagConfig -from .resolve_path import resolve_path def load_config( root_dir: str | Path, config_filepath: str | None = None, - run_id: str | None = None, ) -> GraphRagConfig: """Load configuration from a file or create a default configuration. @@ -29,8 +26,6 @@ def load_config( The path to the config file. If None, searches for config file in root and if not found creates a default configuration. - run_id : str | None - The run id to use for resolving timestamp_paths. """ root = Path(root_dir).resolve() @@ -48,19 +43,4 @@ def load_config( else: config = create_graphrag_config(root_dir=str(root)) - config.storage.base_dir = str( - resolve_path( - config.storage.base_dir, - root if config.storage.type == StorageType.file else None, - run_id, - ) - ) - config.reporting.base_dir = str( - resolve_path( - config.reporting.base_dir, - root if config.reporting.type == ReportingType.file else None, - run_id, - ) - ) - return config diff --git a/graphrag/config/resolve_path.py b/graphrag/config/resolve_path.py index bc4131b6..abc4312c 100644 --- a/graphrag/config/resolve_path.py +++ b/graphrag/config/resolve_path.py @@ -7,6 +7,9 @@ import re from pathlib import Path from string import Template +from .enums import ReportingType, StorageType +from .models.graph_rag_config import GraphRagConfig + def _resolve_timestamp_path_with_value(path: str | Path, timestamp_value: str) -> Path: """Resolve the timestamp in the path with the given timestamp value. 
@@ -150,3 +153,41 @@ def resolve_path( else: path_to_resolve = Path(path_to_resolve) return _resolve_timestamp_path(path_to_resolve, pattern_or_timestamp_value) + + +def resolve_paths( + config: GraphRagConfig, + pattern_or_timestamp_value: re.Pattern[str] | str | None = None, +) -> None: + """Resolve storage and reporting paths in the configuration for local file handling. + + Resolves any timestamp variables in the configuration paths by either using the provided timestamp value if string or + by looking up the latest available timestamp directory that matches the given pattern. + + Parameters + ---------- + config : GraphRagConfig + The configuration to resolve the paths in. + pattern_or_timestamp_value : re.Pattern[str] | str, default=None + The pattern to use to match the timestamp directories or the timestamp value to use. + If a string is provided, the path will be resolved with the given string value. + Otherwise, the path will be resolved with the latest available timestamp directory + that matches the given pattern. 
+ """ + if config.storage.type == StorageType.file: + config.storage.base_dir = str( + resolve_path( + config.storage.base_dir, + config.root_dir, + pattern_or_timestamp_value, + ) + ) + + if config.reporting.type == ReportingType.file: + config.reporting.base_dir = str( + resolve_path( + config.reporting.base_dir, + config.root_dir, + pattern_or_timestamp_value, + ) + ) diff --git a/graphrag/index/__main__.py b/graphrag/index/__main__.py index d6984a4e..203d9558 100644 --- a/graphrag/index/__main__.py +++ b/graphrag/index/__main__.py @@ -88,6 +88,13 @@ if __name__ == "__main__": default=None, type=str, ) + parser.add_argument( + "--output", + help="The output directory to use for the pipeline.", + required=False, + default=None, + type=str, + ) args = parser.parse_args() if args.resume and args.update_index: @@ -107,4 +114,5 @@ if __name__ == "__main__": dryrun=args.dryrun, init=args.init, skip_validations=args.skip_validations, + output_dir=args.output, ) diff --git a/graphrag/index/cli.py b/graphrag/index/cli.py index 5a5ac39a..7dfae5b2 100644 --- a/graphrag/index/cli.py +++ b/graphrag/index/cli.py @@ -11,7 +11,12 @@ import time import warnings from pathlib import Path -from graphrag.config import CacheType, enable_logging_with_config, load_config +from graphrag.config import ( + CacheType, + enable_logging_with_config, + load_config, + resolve_paths, +) from .api import build_index from .emit.types import TableEmitterType @@ -110,6 +115,7 @@ def index_cli( emit: list[TableEmitterType], dryrun: bool, skip_validations: bool, + output_dir: str | None, ): """Run the pipeline with the given config.""" progress_reporter = load_progress_reporter(reporter) @@ -121,7 +127,11 @@ def index_cli( sys.exit(0) root = Path(root_dir).resolve() - config = load_config(root, config_filepath, run_id) + config = load_config(root, config_filepath) + + config.storage.base_dir = output_dir or config.storage.base_dir + config.reporting.base_dir = output_dir or config.reporting.base_dir 
+ resolve_paths(config, run_id) if nocache: config.cache.type = CacheType.none @@ -188,13 +198,13 @@ def _initialize_project_at(path: str, reporter: ProgressReporter) -> None: msg = f"Project already initialized at {root}" raise ValueError(msg) + with settings_yaml.open("wb") as file: + file.write(INIT_YAML.encode(encoding="utf-8", errors="strict")) + dotenv = root / ".env" if not dotenv.exists(): - with settings_yaml.open("wb") as file: - file.write(INIT_YAML.encode(encoding="utf-8", errors="strict")) - - with dotenv.open("wb") as file: - file.write(INIT_DOTENV.encode(encoding="utf-8", errors="strict")) + with dotenv.open("wb") as file: + file.write(INIT_DOTENV.encode(encoding="utf-8", errors="strict")) prompts_dir = root / "prompts" if not prompts_dir.exists(): diff --git a/graphrag/query/cli.py b/graphrag/query/cli.py index 872a4d09..863f1697 100644 --- a/graphrag/query/cli.py +++ b/graphrag/query/cli.py @@ -9,11 +9,7 @@ from pathlib import Path import pandas as pd -from graphrag.config import ( - GraphRagConfig, - load_config, - resolve_path, -) +from graphrag.config import GraphRagConfig, load_config, resolve_paths from graphrag.index.create_pipeline_config import create_pipeline_config from graphrag.index.progress import PrintProgressReporter from graphrag.utils.storage import _create_storage, _load_table_from_storage @@ -39,8 +35,8 @@ def run_global_search( root = Path(root_dir).resolve() config = load_config(root, config_filepath) - if data_dir: - config.storage.base_dir = str(resolve_path(data_dir, root)) + config.storage.base_dir = data_dir or config.storage.base_dir + resolve_paths(config) dataframe_dict = _resolve_parquet_files( root_dir=root_dir, @@ -119,8 +115,8 @@ def run_local_search( root = Path(root_dir).resolve() config = load_config(root, config_filepath) - if data_dir: - config.storage.base_dir = str(resolve_path(data_dir, root)) + config.storage.base_dir = data_dir or config.storage.base_dir + resolve_paths(config) dataframe_dict = 
_resolve_parquet_files( root_dir=root_dir, diff --git a/tests/fixtures/min-csv/settings.yml b/tests/fixtures/min-csv/settings.yml index 57a00c27..a6393c02 100644 --- a/tests/fixtures/min-csv/settings.yml +++ b/tests/fixtures/min-csv/settings.yml @@ -18,3 +18,16 @@ embeddings: # community_report_title: ... # document_raw_content: ... # text_unit_text: ... + + +storage: + type: file # or blob + base_dir: "output/${timestamp}/artifacts" + # connection_string: + # container_name: + +reporting: + type: file # or console, blob + base_dir: "output/${timestamp}/reports" + # connection_string: + # container_name: \ No newline at end of file diff --git a/tests/fixtures/text/settings.yml b/tests/fixtures/text/settings.yml index 51c07736..4076e8fb 100644 --- a/tests/fixtures/text/settings.yml +++ b/tests/fixtures/text/settings.yml @@ -17,3 +17,16 @@ community_reports: prompt: "prompts/community_report.txt" max_length: 2000 max_input_length: 8000 + + +storage: + type: file # or blob + base_dir: "output/${timestamp}/artifacts" + # connection_string: + # container_name: + +reporting: + type: file # or console, blob + base_dir: "output/${timestamp}/reports" + # connection_string: + # container_name: \ No newline at end of file diff --git a/v1-breaking-changes.md b/v1-breaking-changes.md new file mode 100644 index 00000000..178ed63b --- /dev/null +++ b/v1-breaking-changes.md @@ -0,0 +1,36 @@ +# Config Breaking Changes + +## Deprecate timestamp paths + +### Change + +- Remove support for timestamp paths, those using `${timestamp}` directory nesting. +- Use the same directory for storage output and reporting output. + +### Migration + +- Ensure output directories no longer use `${timestamp}` directory nesting. + +**Using Environment Variables** + +- Ensure `GRAPHRAG_STORAGE_BASE_DIR` is set to a static directory, e.g., `output` instead of `output/${timestamp}/artifacts`. 
+- Ensure `GRAPHRAG_REPORTING_BASE_DIR` is set to a static directory, e.g., `output` instead of `output/${timestamp}/reports`. + +[Full docs on using environment variables for configuration](https://microsoft.github.io/graphrag/posts/config/env_vars/). + +**Using Configuration File** + +```yaml +# rest of settings.yaml file +# ... + +storage: + type: file + base_dir: "output" # changed from "output/${timestamp}/artifacts" + +reporting: + type: file + base_dir: "output" # changed from "output/${timestamp}/reports" +``` + +[Full docs on using JSON or YAML files for configuration](https://microsoft.github.io/graphrag/posts/config/json_yaml/). \ No newline at end of file