graphrag/graphrag/config/defaults.py
Copilot 7c28c70d5c
Some checks are pending
gh-pages / build (push) Waiting to run
Python CI / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python CI / python-ci (ubuntu-latest, 3.11) (push) Waiting to run
Python CI / python-ci (windows-latest, 3.10) (push) Waiting to run
Python CI / python-ci (windows-latest, 3.11) (push) Waiting to run
Python Integration Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Integration Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Python Notebook Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Notebook Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Python Publish (pypi) / Upload release to PyPI (push) Waiting to run
Python Smoke Tests / python-ci (ubuntu-latest, 3.10) (push) Waiting to run
Python Smoke Tests / python-ci (windows-latest, 3.10) (push) Waiting to run
Spellcheck / spellcheck (push) Waiting to run
Switch from Poetry to uv for package management (#2008)
* Initial plan

* Switch from Poetry to uv for package management

Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com>

* Clean up build artifacts and update gitignore

Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com>

* remove build artifacts

* remove hardcoded version string

* fix calls to pip in cicd

* Update gh-pages.yml workflow to use uv instead of Poetry

Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com>

* ruff formatting fixes

* update cicd workflow with latest uv action

* fix command to retrieve package version

* update development instructions

* remove Poetry references

* Replace deprecated azuright action with npm-based Azurite installation

Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com>

* skip api version check for azurite

* add semversioner file

* update more changes from switching to UV

* Migrate unified-search-app from Poetry to uv package management

Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com>

* minor typo update

* minor Dockerfile update

* update cicd thresholds

* update pytest thresholds

* ruff fixes

* ruff fixes

* remove legacy npm settings that no longer apply

* Update Unified Search App Readme

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: jgbradley1 <654554+jgbradley1@users.noreply.github.com>
Co-authored-by: Josh Bradley <joshbradley@microsoft.com>
Co-authored-by: Alonso Guevara <alonsog@microsoft.com>
2025-08-13 18:57:25 -06:00

444 lines
13 KiB
Python

# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""Common default configuration values."""
from dataclasses import dataclass, field
from pathlib import Path
from typing import ClassVar, Literal
from graphrag.config.embeddings import default_embeddings
from graphrag.config.enums import (
AsyncType,
AuthType,
CacheType,
ChunkStrategyType,
InputFileType,
ModelType,
NounPhraseExtractorType,
ReportingType,
StorageType,
)
from graphrag.index.operations.build_noun_graph.np_extractors.stop_words import (
EN_STOP_WORDS,
)
from graphrag.vector_stores.factory import VectorStoreType
# --- Output location -------------------------------------------------------
# Reused below as the default base directory for storage/output and as the
# parent of the default vector-store path.
DEFAULT_OUTPUT_BASE_DIR = "output"

# --- Chat model ------------------------------------------------------------
DEFAULT_CHAT_MODEL_ID = "default_chat_model"
DEFAULT_CHAT_MODEL_TYPE = ModelType.OpenAIChat
DEFAULT_CHAT_MODEL = "gpt-4-turbo-preview"
DEFAULT_CHAT_MODEL_AUTH_TYPE = AuthType.APIKey

# --- Embedding model -------------------------------------------------------
DEFAULT_EMBEDDING_MODEL_ID = "default_embedding_model"
DEFAULT_EMBEDDING_MODEL_TYPE = ModelType.OpenAIEmbedding
DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small"
DEFAULT_EMBEDDING_MODEL_AUTH_TYPE = AuthType.APIKey

# --- Vector store ----------------------------------------------------------
# Key under which the default vector store is registered in
# GraphRagConfigDefaults.vector_store.
DEFAULT_VECTOR_STORE_ID = "default_vector_store"

# --- Miscellaneous ---------------------------------------------------------
ENCODING_MODEL = "cl100k_base"
COGNITIVE_SERVICES_AUDIENCE = "https://cognitiveservices.azure.com/.default"
@dataclass
class BasicSearchDefaults:
    """Default values for basic search.

    The two model-id fields reuse the module-level default chat and
    embedding model ids.
    """

    # Annotated as ``None``: the default value is literally ``None``.
    prompt: None = None

    k: int = 10
    max_context_tokens: int = 12_000

    # Model ids, taken from the module-level constants.
    chat_model_id: str = DEFAULT_CHAT_MODEL_ID
    embedding_model_id: str = DEFAULT_EMBEDDING_MODEL_ID
@dataclass
class CacheDefaults:
    """Default values for cache.

    ``type`` is declared as a ``ClassVar``, so it is a plain class attribute
    and not a dataclass field (it is absent from ``__init__`` and ``repr``).
    """

    type: ClassVar[CacheType] = CacheType.file
    base_dir: str = "cache"

    # Connection-related settings all default to ``None``.
    connection_string: None = None
    container_name: None = None
    storage_account_blob_url: None = None
    cosmosdb_account_url: None = None
@dataclass
class ChunksDefaults:
    """Default values for chunks.

    ``strategy`` is a ``ClassVar`` (class attribute, not a dataclass field).
    """

    size: int = 1200
    overlap: int = 100

    # default_factory builds a fresh ``["id"]`` list for every instance.
    group_by_columns: list[str] = field(default_factory=lambda: ["id"])

    strategy: ClassVar[ChunkStrategyType] = ChunkStrategyType.tokens
    encoding_model: str = "cl100k_base"

    # Metadata handling flags, both off by default.
    prepend_metadata: bool = False
    chunk_size_includes_metadata: bool = False
@dataclass
class ClusterGraphDefaults:
    """Default values for cluster graph."""

    max_cluster_size: int = 10
    use_lcc: bool = True

    # Fixed integer seed, written in hex (decimal 3735928559).
    seed: int = 0xDEADBEEF
@dataclass
class CommunityReportDefaults:
    """Default values for community report."""

    # Both prompts default to ``None``.
    graph_prompt: None = None
    text_prompt: None = None

    max_length: int = 2000
    max_input_length: int = 8000

    strategy: None = None

    # Reuses the module-level default chat model id.
    model_id: str = DEFAULT_CHAT_MODEL_ID
@dataclass
class DriftSearchDefaults:
    """Default values for drift search.

    Fields are grouped below by name prefix: prompts, ``reduce_*`` settings,
    general drift/primer settings, ``local_search_*`` settings, and model ids.
    """

    # Prompts (unset by default).
    prompt: None = None
    reduce_prompt: None = None

    # Data / reduce settings.
    data_max_tokens: int = 12_000
    reduce_max_tokens: None = None
    reduce_temperature: float = 0
    reduce_max_completion_tokens: None = None

    # General drift and primer settings.
    concurrency: int = 32
    drift_k_followups: int = 20
    primer_folds: int = 5
    primer_llm_max_tokens: int = 12_000
    n_depth: int = 3

    # ``local_search_*`` settings.
    local_search_text_unit_prop: float = 0.9
    local_search_community_prop: float = 0.1
    local_search_top_k_mapped_entities: int = 10
    local_search_top_k_relationships: int = 10
    local_search_max_data_tokens: int = 12_000
    local_search_temperature: float = 0
    local_search_top_p: float = 1
    local_search_n: int = 1
    local_search_llm_max_gen_tokens: int | None = None
    local_search_llm_max_gen_completion_tokens: int | None = None

    # Model ids, taken from the module-level constants.
    chat_model_id: str = DEFAULT_CHAT_MODEL_ID
    embedding_model_id: str = DEFAULT_EMBEDDING_MODEL_ID
@dataclass
class EmbedGraphDefaults:
    """Default values for embedding graph."""

    # Disabled unless explicitly turned on.
    enabled: bool = False

    dimensions: int = 1536
    num_walks: int = 10
    walk_length: int = 40
    window_size: int = 2
    iterations: int = 3

    # Fixed integer seed.
    random_seed: int = 597832

    use_lcc: bool = True
@dataclass
class EmbedTextDefaults:
    """Default values for embedding text.

    ``names`` is populated with a *copy* of the imported
    ``default_embeddings`` collection, so mutating one instance's ``names``
    cannot leak into the shared module-level object or into other instances.
    """

    model: str = "text-embedding-3-small"
    batch_size: int = 16
    batch_max_tokens: int = 8191

    # Reuses the module-level default embedding model id.
    model_id: str = DEFAULT_EMBEDDING_MODEL_ID

    # list(...) gives each instance its own list; returning
    # ``default_embeddings`` directly would share one mutable object.
    names: list[str] = field(default_factory=lambda: list(default_embeddings))

    strategy: None = None

    # Reuses the module-level default vector store id.
    vector_store_id: str = DEFAULT_VECTOR_STORE_ID
@dataclass
class ExtractClaimsDefaults:
    """Default values for claim extraction."""

    # Disabled unless explicitly turned on.
    enabled: bool = False

    prompt: None = None
    description: str = (
        "Any claims or facts that could be relevant to information discovery."
    )
    max_gleanings: int = 1
    strategy: None = None

    # Reuses the module-level default chat model id.
    model_id: str = DEFAULT_CHAT_MODEL_ID
@dataclass
class ExtractGraphDefaults:
    """Default values for extracting graph."""

    prompt: None = None

    # default_factory builds a fresh list of entity types for every instance.
    entity_types: list[str] = field(
        default_factory=lambda: ["organization", "person", "geo", "event"]
    )

    max_gleanings: int = 1
    strategy: None = None

    # Reuses the module-level default chat model id.
    model_id: str = DEFAULT_CHAT_MODEL_ID
@dataclass
class TextAnalyzerDefaults:
    """Default values for text analyzer.

    Every list- or dict-valued field is built through ``default_factory`` so
    each instance owns its own container. ``exclude_nouns`` is seeded with a
    *copy* of the imported ``EN_STOP_WORDS`` so that mutating an instance
    never alters the shared module-level stop-word list.
    """

    # ClassVar: a class attribute, not a dataclass field.
    extractor_type: ClassVar[NounPhraseExtractorType] = (
        NounPhraseExtractorType.RegexEnglish
    )

    model_name: str = "en_core_web_md"
    max_word_length: int = 15
    word_delimiter: str = " "
    include_named_entities: bool = True

    # list(...) gives each instance its own list; returning EN_STOP_WORDS
    # directly would share one mutable object across all instances.
    exclude_nouns: list[str] = field(default_factory=lambda: list(EN_STOP_WORDS))
    exclude_entity_tags: list[str] = field(default_factory=lambda: ["DATE"])
    exclude_pos_tags: list[str] = field(
        default_factory=lambda: ["DET", "PRON", "INTJ", "X"]
    )

    noun_phrase_tags: list[str] = field(default_factory=lambda: ["PROPN", "NOUNS"])

    # Maps a comma-separated pair of tags to a single resulting tag.
    noun_phrase_grammars: dict[str, str] = field(
        default_factory=lambda: {
            "PROPN,PROPN": "PROPN",
            "NOUN,NOUN": "NOUNS",
            "NOUNS,NOUN": "NOUNS",
            "ADJ,ADJ": "ADJ",
            "ADJ,NOUN": "NOUNS",
        }
    )
@dataclass
class ExtractGraphNLPDefaults:
    """Default values for NLP graph extraction."""

    normalize_edge_weights: bool = True

    # A new TextAnalyzerDefaults instance is created for every instance.
    text_analyzer: TextAnalyzerDefaults = field(default_factory=TextAnalyzerDefaults)

    concurrent_requests: int = 25
@dataclass
class GlobalSearchDefaults:
    """Default values for global search.

    Fields are grouped below by name: prompts, token/length limits,
    ``dynamic_search_*`` settings, and the chat model id.
    """

    # Prompts (unset by default).
    map_prompt: None = None
    reduce_prompt: None = None
    knowledge_prompt: None = None

    # Token and length limits.
    max_context_tokens: int = 12_000
    data_max_tokens: int = 12_000
    map_max_length: int = 1000
    reduce_max_length: int = 2000

    # ``dynamic_search_*`` settings.
    dynamic_search_threshold: int = 1
    dynamic_search_keep_parent: bool = False
    dynamic_search_num_repeats: int = 1
    dynamic_search_use_summary: bool = False
    dynamic_search_max_level: int = 2

    # Reuses the module-level default chat model id.
    chat_model_id: str = DEFAULT_CHAT_MODEL_ID
@dataclass
class StorageDefaults:
    """Default values for storage.

    ``type`` is declared as a ``ClassVar``, so it is a plain class attribute
    and not a dataclass field.
    """

    type: ClassVar[StorageType] = StorageType.file

    # Base directory defaults to the module-level output directory name.
    base_dir: str = DEFAULT_OUTPUT_BASE_DIR

    # Connection-related settings all default to ``None``.
    connection_string: None = None
    container_name: None = None
    storage_account_blob_url: None = None
    cosmosdb_account_url: None = None
@dataclass
class InputStorageDefaults(StorageDefaults):
    """Default values for input storage.

    Inherits every field from ``StorageDefaults`` and overrides only
    ``base_dir``.
    """

    base_dir: str = "input"
@dataclass
class InputDefaults:
    """Default values for input.

    ``file_type`` is a ``ClassVar`` (class attribute, not a dataclass field).
    """

    # A new InputStorageDefaults instance is created for every instance.
    storage: InputStorageDefaults = field(default_factory=InputStorageDefaults)

    file_type: ClassVar[InputFileType] = InputFileType.text
    encoding: str = "utf-8"

    # Empty string by default.
    file_pattern: str = ""
    file_filter: None = None

    text_column: str = "text"
    title_column: None = None
    metadata: None = None
@dataclass
class LanguageModelDefaults:
    """Default values for language model.

    ``auth_type`` is a ``ClassVar`` (class attribute, not a dataclass field).
    The remaining attributes are ordinary dataclass fields, kept in their
    original order.
    """

    # Credentials / authentication.
    api_key: None = None
    auth_type: ClassVar[AuthType] = AuthType.APIKey

    # Empty string by default.
    encoding_model: str = ""

    # Request parameters.
    max_tokens: int | None = None
    temperature: float = 0
    max_completion_tokens: int | None = None
    reasoning_effort: str | None = None
    top_p: float = 1
    n: int = 1
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0
    request_timeout: float = 180.0

    # Endpoint / deployment settings (all unset by default).
    api_base: None = None
    api_version: None = None
    deployment_name: None = None
    organization: None = None
    proxy: None = None
    audience: None = None
    model_supports_json: None = None

    # Rate limits: the only value permitted by the annotation is "auto".
    tokens_per_minute: Literal["auto"] = "auto"
    requests_per_minute: Literal["auto"] = "auto"

    # Retry settings.
    retry_strategy: str = "native"
    max_retries: int = 10
    max_retry_wait: float = 10.0

    concurrent_requests: int = 25
    responses: None = None
    async_mode: AsyncType = AsyncType.Threaded
@dataclass
class LocalSearchDefaults:
    """Default values for local search."""

    prompt: None = None

    text_unit_prop: float = 0.5
    community_prop: float = 0.15

    conversation_history_max_turns: int = 5
    top_k_entities: int = 10
    top_k_relationships: int = 10
    max_context_tokens: int = 12_000

    # Model ids, taken from the module-level constants.
    chat_model_id: str = DEFAULT_CHAT_MODEL_ID
    embedding_model_id: str = DEFAULT_EMBEDDING_MODEL_ID
@dataclass
class OutputDefaults(StorageDefaults):
    """Default values for output.

    Inherits every field from ``StorageDefaults``; ``base_dir`` is re-declared
    here with the module-level output directory name as its default.
    """

    base_dir: str = DEFAULT_OUTPUT_BASE_DIR
@dataclass
class PruneGraphDefaults:
    """Default values for pruning graph."""

    # Node-frequency settings; the ``*_std`` value is unset by default.
    min_node_freq: int = 2
    max_node_freq_std: None = None

    # Node-degree settings; the ``*_std`` value is unset by default.
    min_node_degree: int = 1
    max_node_degree_std: None = None

    min_edge_weight_pct: float = 40.0

    remove_ego_nodes: bool = True
    lcc_only: bool = False
@dataclass
class ReportingDefaults:
    """Default values for reporting.

    ``type`` is declared as a ``ClassVar``, so it is a plain class attribute
    and not a dataclass field.
    """

    type: ClassVar[ReportingType] = ReportingType.file
    base_dir: str = "logs"

    # Connection-related settings all default to ``None``.
    connection_string: None = None
    container_name: None = None
    storage_account_blob_url: None = None
@dataclass
class SnapshotsDefaults:
    """Default values for snapshots.

    Every snapshot flag is off by default.
    """

    embeddings: bool = False
    graphml: bool = False
    raw_graph: bool = False
@dataclass
class SummarizeDescriptionsDefaults:
    """Default values for summarizing descriptions."""

    prompt: None = None

    max_length: int = 500
    max_input_tokens: int = 4_000

    strategy: None = None

    # Reuses the module-level default chat model id.
    model_id: str = DEFAULT_CHAT_MODEL_ID
@dataclass
class UmapDefaults:
    """Default values for UMAP."""

    # Disabled unless explicitly turned on.
    enabled: bool = False
@dataclass
class UpdateIndexOutputDefaults(StorageDefaults):
    """Default values for update index output.

    Inherits every field from ``StorageDefaults`` and overrides only
    ``base_dir``.
    """

    base_dir: str = "update_output"
@dataclass
class VectorStoreDefaults:
    """Default values for vector stores.

    ``type`` is a ``ClassVar`` holding the ``.value`` of
    ``VectorStoreType.LanceDB``; it is not a dataclass field.
    """

    type: ClassVar[str] = VectorStoreType.LanceDB.value

    # Evaluated once, when the class body runs: "<output base dir>/lancedb"
    # rendered with the platform's path separator.
    db_uri: str = str(Path(DEFAULT_OUTPUT_BASE_DIR) / "lancedb")

    container_name: str = "default"
    overwrite: bool = True

    # Remote-store settings all default to ``None``.
    url: None = None
    api_key: None = None
    audience: None = None
    database_name: None = None
@dataclass
class GraphRagConfigDefaults:
    """Default values for GraphRAG.

    Aggregates the per-section default dataclasses defined in this module.
    Each nested section uses ``default_factory`` so that every
    ``GraphRagConfigDefaults`` instance gets its own section objects.
    """

    # Empty string by default.
    root_dir: str = ""

    # Starts as an empty dict, created fresh per instance.
    models: dict = field(default_factory=dict)

    # Reporting, storage, output and cache sections.
    reporting: ReportingDefaults = field(default_factory=ReportingDefaults)
    storage: StorageDefaults = field(default_factory=StorageDefaults)
    output: OutputDefaults = field(default_factory=OutputDefaults)
    outputs: None = None
    update_index_output: UpdateIndexOutputDefaults = field(
        default_factory=UpdateIndexOutputDefaults
    )
    cache: CacheDefaults = field(default_factory=CacheDefaults)

    # Input, embedding, chunking and snapshot sections.
    input: InputDefaults = field(default_factory=InputDefaults)
    embed_graph: EmbedGraphDefaults = field(default_factory=EmbedGraphDefaults)
    embed_text: EmbedTextDefaults = field(default_factory=EmbedTextDefaults)
    chunks: ChunksDefaults = field(default_factory=ChunksDefaults)
    snapshots: SnapshotsDefaults = field(default_factory=SnapshotsDefaults)

    # Extraction, summarization, pruning and clustering sections.
    extract_graph: ExtractGraphDefaults = field(default_factory=ExtractGraphDefaults)
    extract_graph_nlp: ExtractGraphNLPDefaults = field(
        default_factory=ExtractGraphNLPDefaults
    )
    summarize_descriptions: SummarizeDescriptionsDefaults = field(
        default_factory=SummarizeDescriptionsDefaults
    )
    community_reports: CommunityReportDefaults = field(
        default_factory=CommunityReportDefaults
    )
    extract_claims: ExtractClaimsDefaults = field(default_factory=ExtractClaimsDefaults)
    prune_graph: PruneGraphDefaults = field(default_factory=PruneGraphDefaults)
    cluster_graph: ClusterGraphDefaults = field(default_factory=ClusterGraphDefaults)
    umap: UmapDefaults = field(default_factory=UmapDefaults)

    # Search sections.
    local_search: LocalSearchDefaults = field(default_factory=LocalSearchDefaults)
    global_search: GlobalSearchDefaults = field(default_factory=GlobalSearchDefaults)
    drift_search: DriftSearchDefaults = field(default_factory=DriftSearchDefaults)
    basic_search: BasicSearchDefaults = field(default_factory=BasicSearchDefaults)

    # One entry, keyed by DEFAULT_VECTOR_STORE_ID, built fresh per instance.
    vector_store: dict[str, VectorStoreDefaults] = field(
        default_factory=lambda: {DEFAULT_VECTOR_STORE_ID: VectorStoreDefaults()}
    )

    workflows: None = None
# Module-level instances of the default dataclasses, each created once when
# this module is imported.
language_model_defaults = LanguageModelDefaults()
vector_store_defaults = VectorStoreDefaults()
graphrag_config_defaults = GraphRagConfigDefaults()