Migrate towards using static output directories (#1113)

* Migrate towards using static output directories

- Fixes load_config eagerly resolving directories.
    Directories are only resolved when the output
    directories are local.
- Add support for `--output` and `--reports` flags
    for index CLI. To achieve previous output structure
    `index --output run1/artifacts --reports run1/reports`.
- Use static output directories when initializing
    a new project.
- Maintains backward compatibility for those using
    timestamp outputs locally.

* fix smoke tests

* update query cli to work with static directories

* remove eager path resolution from load_config. Support CLI overrides that can be resolved.

* add docs and output logs/artifacts to same directory

* use match statement

* switch back to if statement

---------

Co-authored-by: Alonso Guevara <alonsog@microsoft.com>
This commit is contained in:
Derek Worthen 2024-09-18 16:36:50 -07:00 committed by GitHub
parent 10910797d0
commit 3b09df6e07
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 143 additions and 39 deletions

View File

@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Use static output directories."
}

View File

@ -24,3 +24,5 @@ python -m graphrag.index --verbose --root </workspace/project/root> --config <cu
- `--reporter <reporter>` - This will specify the progress reporter to use. The default is `rich`. Valid values are `rich`, `print`, and `none`.
- `--emit <types>` - This specifies the table output formats the pipeline should emit. The default is `parquet`. Valid values are `parquet`, `csv`, and `json`, comma-separated.
- `--nocache` - This will disable the caching mechanism. This is useful for debugging and development, but should not be used in production.
- `--output <directory>` - Specify the output directory for pipeline artifacts.
- `--reports <directory>` - Specify the output directory for reporting.

View File

@ -68,7 +68,7 @@ from .models import (
UmapConfig,
)
from .read_dotenv import read_dotenv
from .resolve_path import resolve_path
from .resolve_path import resolve_path, resolve_paths
__all__ = [
"ApiKeyMissingError",
@ -128,5 +128,6 @@ __all__ = [
"load_config_from_file",
"read_dotenv",
"resolve_path",
"resolve_paths",
"search_for_config_in_root_dir",
]

View File

@ -74,11 +74,11 @@ NODE2VEC_WINDOW_SIZE = 2
NODE2VEC_ITERATIONS = 3
NODE2VEC_RANDOM_SEED = 597832
REPORTING_TYPE = ReportingType.file
REPORTING_BASE_DIR = "output/${timestamp}/logs"
REPORTING_BASE_DIR = "output"
SNAPSHOTS_GRAPHML = False
SNAPSHOTS_RAW_ENTITIES = False
SNAPSHOTS_TOP_LEVEL_NODES = False
STORAGE_BASE_DIR = "output/${timestamp}/artifacts"
STORAGE_BASE_DIR = "output"
STORAGE_TYPE = StorageType.file
SUMMARIZE_DESCRIPTIONS_MAX_LENGTH = 500
UMAP_ENABLED = False

View File

@ -7,15 +7,12 @@ from pathlib import Path
from .config_file_loader import load_config_from_file, search_for_config_in_root_dir
from .create_graphrag_config import create_graphrag_config
from .enums import ReportingType, StorageType
from .models.graph_rag_config import GraphRagConfig
from .resolve_path import resolve_path
def load_config(
root_dir: str | Path,
config_filepath: str | None = None,
run_id: str | None = None,
) -> GraphRagConfig:
"""Load configuration from a file or create a default configuration.
@ -29,8 +26,6 @@ def load_config(
The path to the config file.
If None, searches for config file in root and
if not found creates a default configuration.
run_id : str | None
The run id to use for resolving timestamp_paths.
"""
root = Path(root_dir).resolve()
@ -48,19 +43,4 @@ def load_config(
else:
config = create_graphrag_config(root_dir=str(root))
config.storage.base_dir = str(
resolve_path(
config.storage.base_dir,
root if config.storage.type == StorageType.file else None,
run_id,
)
)
config.reporting.base_dir = str(
resolve_path(
config.reporting.base_dir,
root if config.reporting.type == ReportingType.file else None,
run_id,
)
)
return config

View File

@ -7,6 +7,9 @@ import re
from pathlib import Path
from string import Template
from .enums import ReportingType, StorageType
from .models.graph_rag_config import GraphRagConfig
def _resolve_timestamp_path_with_value(path: str | Path, timestamp_value: str) -> Path:
"""Resolve the timestamp in the path with the given timestamp value.
@ -150,3 +153,41 @@ def resolve_path(
else:
path_to_resolve = Path(path_to_resolve)
return _resolve_timestamp_path(path_to_resolve, pattern_or_timestamp_value)
def resolve_paths(
    config: GraphRagConfig,
    pattern_or_timestamp_value: re.Pattern[str] | str | None = None,
) -> None:
    """Resolve storage and reporting paths in the configuration for local file handling.

    Resolves any timestamp variables in the configuration paths by either using the
    provided timestamp value if string or by looking up the latest available timestamp
    directory that matches the given pattern.

    Parameters
    ----------
    config : GraphRagConfig
        The configuration to resolve the paths in.
    pattern_or_timestamp_value : re.Pattern[str] | str, default=None
        The pattern to use to match the timestamp directories or the timestamp value to use.
        If a string is provided, the path will be resolved with the given string value.
        Otherwise, the path will be resolved with the latest available timestamp directory
        that matches the given pattern.
    """
    # Only file-based sections are anchored at root_dir; blob/console-backed
    # sections are left untouched, matching the previous per-section checks.
    sections = (
        (config.storage, StorageType.file),
        (config.reporting, ReportingType.file),
    )
    for section, file_type in sections:
        if section.type == file_type:
            section.base_dir = str(
                resolve_path(
                    section.base_dir,
                    config.root_dir,
                    pattern_or_timestamp_value,
                )
            )

View File

@ -88,6 +88,13 @@ if __name__ == "__main__":
default=None,
type=str,
)
parser.add_argument(
"--output",
help="The output directory to use for the pipeline.",
required=False,
default=None,
type=str,
)
args = parser.parse_args()
if args.resume and args.update_index:
@ -107,4 +114,5 @@ if __name__ == "__main__":
dryrun=args.dryrun,
init=args.init,
skip_validations=args.skip_validations,
output_dir=args.output,
)

View File

@ -11,7 +11,12 @@ import time
import warnings
from pathlib import Path
from graphrag.config import CacheType, enable_logging_with_config, load_config
from graphrag.config import (
CacheType,
enable_logging_with_config,
load_config,
resolve_paths,
)
from .api import build_index
from .emit.types import TableEmitterType
@ -110,6 +115,7 @@ def index_cli(
emit: list[TableEmitterType],
dryrun: bool,
skip_validations: bool,
output_dir: str | None,
):
"""Run the pipeline with the given config."""
progress_reporter = load_progress_reporter(reporter)
@ -121,7 +127,11 @@ def index_cli(
sys.exit(0)
root = Path(root_dir).resolve()
config = load_config(root, config_filepath, run_id)
config = load_config(root, config_filepath)
config.storage.base_dir = output_dir or config.storage.base_dir
config.reporting.base_dir = output_dir or config.reporting.base_dir
resolve_paths(config, run_id)
if nocache:
config.cache.type = CacheType.none
@ -188,13 +198,13 @@ def _initialize_project_at(path: str, reporter: ProgressReporter) -> None:
msg = f"Project already initialized at {root}"
raise ValueError(msg)
with settings_yaml.open("wb") as file:
file.write(INIT_YAML.encode(encoding="utf-8", errors="strict"))
dotenv = root / ".env"
if not dotenv.exists():
with settings_yaml.open("wb") as file:
file.write(INIT_YAML.encode(encoding="utf-8", errors="strict"))
with dotenv.open("wb") as file:
file.write(INIT_DOTENV.encode(encoding="utf-8", errors="strict"))
with dotenv.open("wb") as file:
file.write(INIT_DOTENV.encode(encoding="utf-8", errors="strict"))
prompts_dir = root / "prompts"
if not prompts_dir.exists():

View File

@ -9,11 +9,7 @@ from pathlib import Path
import pandas as pd
from graphrag.config import (
GraphRagConfig,
load_config,
resolve_path,
)
from graphrag.config import GraphRagConfig, load_config, resolve_paths
from graphrag.index.create_pipeline_config import create_pipeline_config
from graphrag.index.progress import PrintProgressReporter
from graphrag.utils.storage import _create_storage, _load_table_from_storage
@ -39,8 +35,8 @@ def run_global_search(
root = Path(root_dir).resolve()
config = load_config(root, config_filepath)
if data_dir:
config.storage.base_dir = str(resolve_path(data_dir, root))
config.storage.base_dir = data_dir or config.storage.base_dir
resolve_paths(config)
dataframe_dict = _resolve_parquet_files(
root_dir=root_dir,
@ -119,8 +115,8 @@ def run_local_search(
root = Path(root_dir).resolve()
config = load_config(root, config_filepath)
if data_dir:
config.storage.base_dir = str(resolve_path(data_dir, root))
config.storage.base_dir = data_dir or config.storage.base_dir
resolve_paths(config)
dataframe_dict = _resolve_parquet_files(
root_dir=root_dir,

View File

@ -18,3 +18,16 @@ embeddings:
# community_report_title: ...
# document_raw_content: ...
# text_unit_text: ...
storage:
type: file # or blob
base_dir: "output/${timestamp}/artifacts"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>
reporting:
type: file # or console, blob
base_dir: "output/${timestamp}/reports"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>

View File

@ -17,3 +17,16 @@ community_reports:
prompt: "prompts/community_report.txt"
max_length: 2000
max_input_length: 8000
storage:
type: file # or blob
base_dir: "output/${timestamp}/artifacts"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>
reporting:
type: file # or console, blob
base_dir: "output/${timestamp}/reports"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>

36
v1-breaking-changes.md Normal file
View File

@ -0,0 +1,36 @@
# Config Breaking Changes
## Deprecate timestamp paths
### Change
- Remove support for timestamp paths, those using `${timestamp}` directory nesting.
- Use the same directory for storage output and reporting output.
### Migration
- Ensure output directories no longer use `${timestamp}` directory nesting.
**Using Environment Variables**
- Ensure `GRAPHRAG_STORAGE_BASE_DIR` is set to a static directory, e.g., `output` instead of `output/${timestamp}/artifacts`.
- Ensure `GRAPHRAG_REPORTING_BASE_DIR` is set to a static directory, e.g., `output` instead of `output/${timestamp}/reports`.
[Full docs on using environment variables for configuration](https://microsoft.github.io/graphrag/posts/config/env_vars/).
**Using Configuration File**
```yaml
# rest of settings.yaml file
# ...
storage:
type: file
base_dir: "output" # changed from "output/${timestamp}/artifacts"
reporting:
type: file
base_dir: "output" # changed from "output/${timestamp}/reports"
```
[Full docs on using JSON or YAML files for configuration](https://microsoft.github.io/graphrag/posts/config/json_yaml/).