Migrate towards using static output directories (#1113)

* Migrate towards using static output directories

- Fixes load_config eagerly resolving directories.
    Directories are only resolved when the output
    directories are local.
- Add support for `--output` and `--reports` flags
    for index CLI. To achieve previous output structure
    `index --output run1/artifacts --reports run1/reports`.
- Use static output directories when initializing
    a new project.
- Maintains backward compatibility for those using
    timestamp outputs locally.

* fix smoke tests

* update query cli to work with static directories

* remove eager path resolution from load_config. Support CLI overrides that can be resolved.

* add docs and output logs/artifacts to same directory

* use match statement

* switch back to if statement

---------

Co-authored-by: Alonso Guevara <alonsog@microsoft.com>
This commit is contained in:
Derek Worthen 2024-09-18 16:36:50 -07:00 committed by GitHub
parent 10910797d0
commit 3b09df6e07
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 143 additions and 39 deletions

View File

@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Use static output directories."
}

View File

@ -24,3 +24,5 @@ python -m graphrag.index --verbose --root </workspace/project/root> --config <cu
- `--reporter <reporter>` - This will specify the progress reporter to use. The default is `rich`. Valid values are `rich`, `print`, and `none`.
- `--emit <types>` - This specifies the table output formats the pipeline should emit. The default is `parquet`. Valid values are `parquet`, `csv`, and `json`, comma-separated.
- `--nocache` - This will disable the caching mechanism. This is useful for debugging and development, but should not be used in production.
- `--output <directory>` - Specify the output directory for pipeline artifacts.
- `--reports <directory>` - Specify the output directory for reporting.

View File

@ -68,7 +68,7 @@ from .models import (
UmapConfig,
)
from .read_dotenv import read_dotenv
from .resolve_path import resolve_path
from .resolve_path import resolve_path, resolve_paths
__all__ = [
"ApiKeyMissingError",
@ -128,5 +128,6 @@ __all__ = [
"load_config_from_file",
"read_dotenv",
"resolve_path",
"resolve_paths",
"search_for_config_in_root_dir",
]

View File

@ -74,11 +74,11 @@ NODE2VEC_WINDOW_SIZE = 2
NODE2VEC_ITERATIONS = 3
NODE2VEC_RANDOM_SEED = 597832
REPORTING_TYPE = ReportingType.file
REPORTING_BASE_DIR = "output/${timestamp}/logs"
REPORTING_BASE_DIR = "output"
SNAPSHOTS_GRAPHML = False
SNAPSHOTS_RAW_ENTITIES = False
SNAPSHOTS_TOP_LEVEL_NODES = False
STORAGE_BASE_DIR = "output/${timestamp}/artifacts"
STORAGE_BASE_DIR = "output"
STORAGE_TYPE = StorageType.file
SUMMARIZE_DESCRIPTIONS_MAX_LENGTH = 500
UMAP_ENABLED = False

View File

@ -7,15 +7,12 @@ from pathlib import Path
from .config_file_loader import load_config_from_file, search_for_config_in_root_dir
from .create_graphrag_config import create_graphrag_config
from .enums import ReportingType, StorageType
from .models.graph_rag_config import GraphRagConfig
from .resolve_path import resolve_path
def load_config(
root_dir: str | Path,
config_filepath: str | None = None,
run_id: str | None = None,
) -> GraphRagConfig:
"""Load configuration from a file or create a default configuration.
@ -29,8 +26,6 @@ def load_config(
The path to the config file.
If None, searches for config file in root and
if not found creates a default configuration.
run_id : str | None
The run id to use for resolving timestamp_paths.
"""
root = Path(root_dir).resolve()
@ -48,19 +43,4 @@ def load_config(
else:
config = create_graphrag_config(root_dir=str(root))
config.storage.base_dir = str(
resolve_path(
config.storage.base_dir,
root if config.storage.type == StorageType.file else None,
run_id,
)
)
config.reporting.base_dir = str(
resolve_path(
config.reporting.base_dir,
root if config.reporting.type == ReportingType.file else None,
run_id,
)
)
return config

View File

@ -7,6 +7,9 @@ import re
from pathlib import Path
from string import Template
from .enums import ReportingType, StorageType
from .models.graph_rag_config import GraphRagConfig
def _resolve_timestamp_path_with_value(path: str | Path, timestamp_value: str) -> Path:
"""Resolve the timestamp in the path with the given timestamp value.
@ -150,3 +153,41 @@ def resolve_path(
else:
path_to_resolve = Path(path_to_resolve)
return _resolve_timestamp_path(path_to_resolve, pattern_or_timestamp_value)
def resolve_paths(
    config: GraphRagConfig,
    pattern_or_timestamp_value: re.Pattern[str] | str | None = None,
) -> None:
    """Resolve storage and reporting paths in the configuration for local file handling.

    Resolves any timestamp variables in the configuration paths by either using the
    provided timestamp value if string or by looking up the latest available timestamp
    directory that matches the given pattern.

    Parameters
    ----------
    config : GraphRagConfig
        The configuration to resolve the paths in.
    pattern_or_timestamp_value : re.Pattern[str] | str, default=None
        The pattern to use to match the timestamp directories or the timestamp value to use.
        If a string is provided, the path will be resolved with the given string value.
        Otherwise, the path will be resolved with the latest available timestamp directory
        that matches the given pattern.
    """
    # Only file-based sections are anchored at root_dir; blob/console-backed
    # sections are left untouched, matching the previous per-section checks.
    sections = (
        (config.storage, StorageType.file),
        (config.reporting, ReportingType.file),
    )
    for section, file_type in sections:
        if section.type == file_type:
            section.base_dir = str(
                resolve_path(
                    section.base_dir,
                    config.root_dir,
                    pattern_or_timestamp_value,
                )
            )

View File

@ -88,6 +88,13 @@ if __name__ == "__main__":
default=None,
type=str,
)
parser.add_argument(
"--output",
help="The output directory to use for the pipeline.",
required=False,
default=None,
type=str,
)
args = parser.parse_args()
if args.resume and args.update_index:
@ -107,4 +114,5 @@ if __name__ == "__main__":
dryrun=args.dryrun,
init=args.init,
skip_validations=args.skip_validations,
output_dir=args.output,
)

View File

@ -11,7 +11,12 @@ import time
import warnings
from pathlib import Path
from graphrag.config import CacheType, enable_logging_with_config, load_config
from graphrag.config import (
CacheType,
enable_logging_with_config,
load_config,
resolve_paths,
)
from .api import build_index
from .emit.types import TableEmitterType
@ -110,6 +115,7 @@ def index_cli(
emit: list[TableEmitterType],
dryrun: bool,
skip_validations: bool,
output_dir: str | None,
):
"""Run the pipeline with the given config."""
progress_reporter = load_progress_reporter(reporter)
@ -121,7 +127,11 @@ def index_cli(
sys.exit(0)
root = Path(root_dir).resolve()
config = load_config(root, config_filepath, run_id)
config = load_config(root, config_filepath)
config.storage.base_dir = output_dir or config.storage.base_dir
config.reporting.base_dir = output_dir or config.reporting.base_dir
resolve_paths(config, run_id)
if nocache:
config.cache.type = CacheType.none
@ -188,13 +198,13 @@ def _initialize_project_at(path: str, reporter: ProgressReporter) -> None:
msg = f"Project already initialized at {root}"
raise ValueError(msg)
with settings_yaml.open("wb") as file:
file.write(INIT_YAML.encode(encoding="utf-8", errors="strict"))
dotenv = root / ".env"
if not dotenv.exists():
with settings_yaml.open("wb") as file:
file.write(INIT_YAML.encode(encoding="utf-8", errors="strict"))
with dotenv.open("wb") as file:
file.write(INIT_DOTENV.encode(encoding="utf-8", errors="strict"))
with dotenv.open("wb") as file:
file.write(INIT_DOTENV.encode(encoding="utf-8", errors="strict"))
prompts_dir = root / "prompts"
if not prompts_dir.exists():

View File

@ -9,11 +9,7 @@ from pathlib import Path
import pandas as pd
from graphrag.config import (
GraphRagConfig,
load_config,
resolve_path,
)
from graphrag.config import GraphRagConfig, load_config, resolve_paths
from graphrag.index.create_pipeline_config import create_pipeline_config
from graphrag.index.progress import PrintProgressReporter
from graphrag.utils.storage import _create_storage, _load_table_from_storage
@ -39,8 +35,8 @@ def run_global_search(
root = Path(root_dir).resolve()
config = load_config(root, config_filepath)
if data_dir:
config.storage.base_dir = str(resolve_path(data_dir, root))
config.storage.base_dir = data_dir or config.storage.base_dir
resolve_paths(config)
dataframe_dict = _resolve_parquet_files(
root_dir=root_dir,
@ -119,8 +115,8 @@ def run_local_search(
root = Path(root_dir).resolve()
config = load_config(root, config_filepath)
if data_dir:
config.storage.base_dir = str(resolve_path(data_dir, root))
config.storage.base_dir = data_dir or config.storage.base_dir
resolve_paths(config)
dataframe_dict = _resolve_parquet_files(
root_dir=root_dir,

View File

@ -18,3 +18,16 @@ embeddings:
# community_report_title: ...
# document_raw_content: ...
# text_unit_text: ...
storage:
type: file # or blob
base_dir: "output/${timestamp}/artifacts"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>
reporting:
type: file # or console, blob
base_dir: "output/${timestamp}/reports"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>

View File

@ -17,3 +17,16 @@ community_reports:
prompt: "prompts/community_report.txt"
max_length: 2000
max_input_length: 8000
storage:
type: file # or blob
base_dir: "output/${timestamp}/artifacts"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>
reporting:
type: file # or console, blob
base_dir: "output/${timestamp}/reports"
# connection_string: <azure_blob_storage_connection_string>
# container_name: <azure_blob_storage_container_name>

36
v1-breaking-changes.md Normal file
View File

@ -0,0 +1,36 @@
# Config Breaking Changes
## Deprecate timestamp paths
### Change
- Remove support for timestamp paths, those using `${timestamp}` directory nesting.
- Use the same directory for storage output and reporting output.
### Migration
- Ensure output directories no longer use `${timestamp}` directory nesting.
**Using Environment Variables**
- Ensure `GRAPHRAG_STORAGE_BASE_DIR` is set to a static directory, e.g., `output` instead of `output/${timestamp}/artifacts`.
- Ensure `GRAPHRAG_REPORTING_BASE_DIR` is set to a static directory, e.g., `output` instead of `output/${timestamp}/reports`.
[Full docs on using environment variables for configuration](https://microsoft.github.io/graphrag/posts/config/env_vars/).
**Using Configuration File**
```yaml
# rest of settings.yaml file
# ...
storage:
type: file
base_dir: "output" # changed from "output/${timestamp}/artifacts"
reporting:
type: file
base_dir: "output" # changed from "output/${timestamp}/reports"
```
[Full docs on using JSON or YAML files for configuration](https://microsoft.github.io/graphrag/posts/config/json_yaml/).