mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 09:07:20 +08:00
Migrate towards using static output directories (#1113)
* Migrate towards using static output directories
- Fixes load_config eagerly resolving directories.
Directories are only resolved when the output
directories are local.
- Add support for `--output` and `--reports` flags
for index CLI. To achieve previous output structure
`index --output run1/artifacts --reports run1/reports`.
- Use static output directories when initializing
a new project.
- Maintains backward compatibility for those using
timestamp outputs locally.
* fix smoke tests
* update query cli to work with static directories
* remove eager path resolution from load_config. Support CLI overrides that can be resolved.
* add docs and output logs/artifacts to same directory
* use match statement
* switch back to if statement
---------
Co-authored-by: Alonso Guevara <alonsog@microsoft.com>
This commit is contained in:
parent
10910797d0
commit
3b09df6e07
@ -0,0 +1,4 @@
|
||||
{
|
||||
"type": "patch",
|
||||
"description": "Use static output directories."
|
||||
}
|
||||
@ -24,3 +24,5 @@ python -m graphrag.index --verbose --root </workspace/project/root> --config <cu
|
||||
- `--reporter <reporter>` - This will specify the progress reporter to use. The default is `rich`. Valid values are `rich`, `print`, and `none`.
|
||||
- `--emit <types>` - This specifies the table output formats the pipeline should emit. The default is `parquet`. Valid values are `parquet`, `csv`, and `json`, comma-separated.
|
||||
- `--nocache` - This will disable the caching mechanism. This is useful for debugging and development, but should not be used in production.
|
||||
- `--output <directory>` - Specify the output directory for pipeline artifacts.
|
||||
- `--reports <directory>` - Specify the output directory for reporting.
|
||||
|
||||
@ -68,7 +68,7 @@ from .models import (
|
||||
UmapConfig,
|
||||
)
|
||||
from .read_dotenv import read_dotenv
|
||||
from .resolve_path import resolve_path
|
||||
from .resolve_path import resolve_path, resolve_paths
|
||||
|
||||
__all__ = [
|
||||
"ApiKeyMissingError",
|
||||
@ -128,5 +128,6 @@ __all__ = [
|
||||
"load_config_from_file",
|
||||
"read_dotenv",
|
||||
"resolve_path",
|
||||
"resolve_paths",
|
||||
"search_for_config_in_root_dir",
|
||||
]
|
||||
|
||||
@ -74,11 +74,11 @@ NODE2VEC_WINDOW_SIZE = 2
|
||||
NODE2VEC_ITERATIONS = 3
|
||||
NODE2VEC_RANDOM_SEED = 597832
|
||||
REPORTING_TYPE = ReportingType.file
|
||||
REPORTING_BASE_DIR = "output/${timestamp}/logs"
|
||||
REPORTING_BASE_DIR = "output"
|
||||
SNAPSHOTS_GRAPHML = False
|
||||
SNAPSHOTS_RAW_ENTITIES = False
|
||||
SNAPSHOTS_TOP_LEVEL_NODES = False
|
||||
STORAGE_BASE_DIR = "output/${timestamp}/artifacts"
|
||||
STORAGE_BASE_DIR = "output"
|
||||
STORAGE_TYPE = StorageType.file
|
||||
SUMMARIZE_DESCRIPTIONS_MAX_LENGTH = 500
|
||||
UMAP_ENABLED = False
|
||||
|
||||
@ -7,15 +7,12 @@ from pathlib import Path
|
||||
|
||||
from .config_file_loader import load_config_from_file, search_for_config_in_root_dir
|
||||
from .create_graphrag_config import create_graphrag_config
|
||||
from .enums import ReportingType, StorageType
|
||||
from .models.graph_rag_config import GraphRagConfig
|
||||
from .resolve_path import resolve_path
|
||||
|
||||
|
||||
def load_config(
|
||||
root_dir: str | Path,
|
||||
config_filepath: str | None = None,
|
||||
run_id: str | None = None,
|
||||
) -> GraphRagConfig:
|
||||
"""Load configuration from a file or create a default configuration.
|
||||
|
||||
@ -29,8 +26,6 @@ def load_config(
|
||||
The path to the config file.
|
||||
If None, searches for config file in root and
|
||||
if not found creates a default configuration.
|
||||
run_id : str | None
|
||||
The run id to use for resolving timestamp_paths.
|
||||
"""
|
||||
root = Path(root_dir).resolve()
|
||||
|
||||
@ -48,19 +43,4 @@ def load_config(
|
||||
else:
|
||||
config = create_graphrag_config(root_dir=str(root))
|
||||
|
||||
config.storage.base_dir = str(
|
||||
resolve_path(
|
||||
config.storage.base_dir,
|
||||
root if config.storage.type == StorageType.file else None,
|
||||
run_id,
|
||||
)
|
||||
)
|
||||
config.reporting.base_dir = str(
|
||||
resolve_path(
|
||||
config.reporting.base_dir,
|
||||
root if config.reporting.type == ReportingType.file else None,
|
||||
run_id,
|
||||
)
|
||||
)
|
||||
|
||||
return config
|
||||
|
||||
@ -7,6 +7,9 @@ import re
|
||||
from pathlib import Path
|
||||
from string import Template
|
||||
|
||||
from .enums import ReportingType, StorageType
|
||||
from .models.graph_rag_config import GraphRagConfig
|
||||
|
||||
|
||||
def _resolve_timestamp_path_with_value(path: str | Path, timestamp_value: str) -> Path:
|
||||
"""Resolve the timestamp in the path with the given timestamp value.
|
||||
@ -150,3 +153,41 @@ def resolve_path(
|
||||
else:
|
||||
path_to_resolve = Path(path_to_resolve)
|
||||
return _resolve_timestamp_path(path_to_resolve, pattern_or_timestamp_value)
|
||||
|
||||
|
||||
def resolve_paths(
|
||||
config: GraphRagConfig,
|
||||
pattern_or_timestamp_value: re.Pattern[str] | str | None = None,
|
||||
) -> None:
|
||||
"""Resolve storage and reporting paths in the configuration for local file handling.
|
||||
|
||||
Resolves any timestamp variables in the configuration paths by either using the provided timestamp value if string or
|
||||
by looking up the latest available timestamp directory that matches the given pattern.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
config : GraphRagConfig
|
||||
The configuration to resolve the paths in.
|
||||
pattern_or_timestamp_value : re.Pattern[str] | str, default=None
|
||||
The pattern to use to match the timestamp directories or the timestamp value to use.
|
||||
If a string is provided, the path will be resolved with the given string value.
|
||||
Otherwise, the path will be resolved with the latest available timestamp directory
|
||||
that matches the given pattern.
|
||||
"""
|
||||
if config.storage.type == StorageType.file:
|
||||
config.storage.base_dir = str(
|
||||
resolve_path(
|
||||
config.storage.base_dir,
|
||||
config.root_dir,
|
||||
pattern_or_timestamp_value,
|
||||
)
|
||||
)
|
||||
|
||||
if config.reporting.type == ReportingType.file:
|
||||
config.reporting.base_dir = str(
|
||||
resolve_path(
|
||||
config.reporting.base_dir,
|
||||
config.root_dir,
|
||||
pattern_or_timestamp_value,
|
||||
)
|
||||
)
|
||||
|
||||
@ -88,6 +88,13 @@ if __name__ == "__main__":
|
||||
default=None,
|
||||
type=str,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
help="The output directory to use for the pipeline.",
|
||||
required=False,
|
||||
default=None,
|
||||
type=str,
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.resume and args.update_index:
|
||||
@ -107,4 +114,5 @@ if __name__ == "__main__":
|
||||
dryrun=args.dryrun,
|
||||
init=args.init,
|
||||
skip_validations=args.skip_validations,
|
||||
output_dir=args.output,
|
||||
)
|
||||
|
||||
@ -11,7 +11,12 @@ import time
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
from graphrag.config import CacheType, enable_logging_with_config, load_config
|
||||
from graphrag.config import (
|
||||
CacheType,
|
||||
enable_logging_with_config,
|
||||
load_config,
|
||||
resolve_paths,
|
||||
)
|
||||
|
||||
from .api import build_index
|
||||
from .emit.types import TableEmitterType
|
||||
@ -110,6 +115,7 @@ def index_cli(
|
||||
emit: list[TableEmitterType],
|
||||
dryrun: bool,
|
||||
skip_validations: bool,
|
||||
output_dir: str | None,
|
||||
):
|
||||
"""Run the pipeline with the given config."""
|
||||
progress_reporter = load_progress_reporter(reporter)
|
||||
@ -121,7 +127,11 @@ def index_cli(
|
||||
sys.exit(0)
|
||||
|
||||
root = Path(root_dir).resolve()
|
||||
config = load_config(root, config_filepath, run_id)
|
||||
config = load_config(root, config_filepath)
|
||||
|
||||
config.storage.base_dir = output_dir or config.storage.base_dir
|
||||
config.reporting.base_dir = output_dir or config.reporting.base_dir
|
||||
resolve_paths(config, run_id)
|
||||
|
||||
if nocache:
|
||||
config.cache.type = CacheType.none
|
||||
@ -188,13 +198,13 @@ def _initialize_project_at(path: str, reporter: ProgressReporter) -> None:
|
||||
msg = f"Project already initialized at {root}"
|
||||
raise ValueError(msg)
|
||||
|
||||
with settings_yaml.open("wb") as file:
|
||||
file.write(INIT_YAML.encode(encoding="utf-8", errors="strict"))
|
||||
|
||||
dotenv = root / ".env"
|
||||
if not dotenv.exists():
|
||||
with settings_yaml.open("wb") as file:
|
||||
file.write(INIT_YAML.encode(encoding="utf-8", errors="strict"))
|
||||
|
||||
with dotenv.open("wb") as file:
|
||||
file.write(INIT_DOTENV.encode(encoding="utf-8", errors="strict"))
|
||||
with dotenv.open("wb") as file:
|
||||
file.write(INIT_DOTENV.encode(encoding="utf-8", errors="strict"))
|
||||
|
||||
prompts_dir = root / "prompts"
|
||||
if not prompts_dir.exists():
|
||||
|
||||
@ -9,11 +9,7 @@ from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from graphrag.config import (
|
||||
GraphRagConfig,
|
||||
load_config,
|
||||
resolve_path,
|
||||
)
|
||||
from graphrag.config import GraphRagConfig, load_config, resolve_paths
|
||||
from graphrag.index.create_pipeline_config import create_pipeline_config
|
||||
from graphrag.index.progress import PrintProgressReporter
|
||||
from graphrag.utils.storage import _create_storage, _load_table_from_storage
|
||||
@ -39,8 +35,8 @@ def run_global_search(
|
||||
root = Path(root_dir).resolve()
|
||||
config = load_config(root, config_filepath)
|
||||
|
||||
if data_dir:
|
||||
config.storage.base_dir = str(resolve_path(data_dir, root))
|
||||
config.storage.base_dir = data_dir or config.storage.base_dir
|
||||
resolve_paths(config)
|
||||
|
||||
dataframe_dict = _resolve_parquet_files(
|
||||
root_dir=root_dir,
|
||||
@ -119,8 +115,8 @@ def run_local_search(
|
||||
root = Path(root_dir).resolve()
|
||||
config = load_config(root, config_filepath)
|
||||
|
||||
if data_dir:
|
||||
config.storage.base_dir = str(resolve_path(data_dir, root))
|
||||
config.storage.base_dir = data_dir or config.storage.base_dir
|
||||
resolve_paths(config)
|
||||
|
||||
dataframe_dict = _resolve_parquet_files(
|
||||
root_dir=root_dir,
|
||||
|
||||
13
tests/fixtures/min-csv/settings.yml
vendored
13
tests/fixtures/min-csv/settings.yml
vendored
@ -18,3 +18,16 @@ embeddings:
|
||||
# community_report_title: ...
|
||||
# document_raw_content: ...
|
||||
# text_unit_text: ...
|
||||
|
||||
|
||||
storage:
|
||||
type: file # or blob
|
||||
base_dir: "output/${timestamp}/artifacts"
|
||||
# connection_string: <azure_blob_storage_connection_string>
|
||||
# container_name: <azure_blob_storage_container_name>
|
||||
|
||||
reporting:
|
||||
type: file # or console, blob
|
||||
base_dir: "output/${timestamp}/reports"
|
||||
# connection_string: <azure_blob_storage_connection_string>
|
||||
# container_name: <azure_blob_storage_container_name>
|
||||
13
tests/fixtures/text/settings.yml
vendored
13
tests/fixtures/text/settings.yml
vendored
@ -17,3 +17,16 @@ community_reports:
|
||||
prompt: "prompts/community_report.txt"
|
||||
max_length: 2000
|
||||
max_input_length: 8000
|
||||
|
||||
|
||||
storage:
|
||||
type: file # or blob
|
||||
base_dir: "output/${timestamp}/artifacts"
|
||||
# connection_string: <azure_blob_storage_connection_string>
|
||||
# container_name: <azure_blob_storage_container_name>
|
||||
|
||||
reporting:
|
||||
type: file # or console, blob
|
||||
base_dir: "output/${timestamp}/reports"
|
||||
# connection_string: <azure_blob_storage_connection_string>
|
||||
# container_name: <azure_blob_storage_container_name>
|
||||
36
v1-breaking-changes.md
Normal file
36
v1-breaking-changes.md
Normal file
@ -0,0 +1,36 @@
|
||||
# Config Breaking Changes
|
||||
|
||||
## Deprecate timestamp paths
|
||||
|
||||
### Change
|
||||
|
||||
- Remove support for timestamp paths, i.e., paths that use `${timestamp}` directory nesting.
|
||||
- Use the same directory for storage output and reporting output.
|
||||
|
||||
### Migration
|
||||
|
||||
- Ensure output directories no longer use `${timestamp}` directory nesting.
|
||||
|
||||
**Using Environment Variables**
|
||||
|
||||
- Ensure `GRAPHRAG_STORAGE_BASE_DIR` is set to a static directory, e.g., `output` instead of `output/${timestamp}/artifacts`.
|
||||
- Ensure `GRAPHRAG_REPORTING_BASE_DIR` is set to a static directory, e.g., `output` instead of `output/${timestamp}/reports`.
|
||||
|
||||
[Full docs on using environment variables for configuration](https://microsoft.github.io/graphrag/posts/config/env_vars/).
|
||||
|
||||
**Using Configuration File**
|
||||
|
||||
```yaml
|
||||
# rest of settings.yaml file
|
||||
# ...
|
||||
|
||||
storage:
|
||||
type: file
|
||||
base_dir: "output" # changed from "output/${timestamp}/artifacts"
|
||||
|
||||
reporting:
|
||||
type: file
|
||||
base_dir: "output" # changed from "output/${timestamp}/reports"
|
||||
```
|
||||
|
||||
[Full docs on using JSON or YAML files for configuration](https://microsoft.github.io/graphrag/posts/config/json_yaml/).
|
||||
Loading…
Reference in New Issue
Block a user