Improve CLI speed with lazy imports (#1319)

Josh Bradley, 2024-11-15 16:41:10 -08:00 (committed by GitHub)
parent 9b4f24ebce
commit 22a57d14c7
237 changed files with 943 additions and 1390 deletions
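
The speedup comes from two recurring changes in the diff below: package `__init__.py` files stop re-exporting their submodules, so callers import directly from the module that defines each symbol, and expensive imports (CLI subcommand implementations, `graspologic`, `umap`) move inside the functions that actually use them. A minimal sketch of the second pattern, with a hypothetical command and `pandas` standing in for a heavy dependency (not the real graphrag entry points):

```python
import typer

app = typer.Typer(help="Example CLI that starts quickly.")


@app.command()
def index(root: str = ".") -> None:
    """Run the indexing step."""
    # The heavy dependency is imported only when this command actually runs,
    # so `--help`, shell completion, and unrelated commands never pay for it.
    import pandas as pd

    typer.echo(f"indexing {root} with pandas {pd.__version__}")


if __name__ == "__main__":
    app()
```

With this shape, starting the program only needs `typer`; `pandas` is loaded on the first call to `index`.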

View File

@ -0,0 +1,4 @@
{
"type": "patch",
"description": "move import statements out of init files"
}
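
Most of the 237 changed files implement this entry: re-exports are deleted from package `__init__.py` files and every caller imports from the defining module instead, so importing one symbol no longer loads the whole package. A sketch of the before/after shape, using the `graphrag.config` package as it appears in the hunks further down:

```python
# Before: graphrag/config/__init__.py re-exported its submodules eagerly,
#
#     from .enums import CacheType
#     from .models import GraphRagConfig
#
# so callers wrote `from graphrag.config import CacheType, GraphRagConfig`
# and any import of graphrag.config loaded every config module.
#
# After: __init__.py keeps only its docstring, and callers import the
# defining modules directly, paying only for what they use.
from graphrag.config.enums import CacheType
from graphrag.config.models.graph_rag_config import GraphRagConfig
```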

View File

@ -0,0 +1,4 @@
{
"type": "patch",
"description": "fix autocompletion of existing files/directory paths."
}

View File

@ -20,9 +20,9 @@ Before running auto tuning, ensure you have already initialized your workspace w
You can run the main script from the command line with various options:
```bash
graphrag prompt-tune [--root ROOT] [--domain DOMAIN] [--method METHOD] [--limit LIMIT] [--language LANGUAGE] \
graphrag prompt-tune [--root ROOT] [--config CONFIG] [--domain DOMAIN] [--selection-method METHOD] [--limit LIMIT] [--language LANGUAGE] \
[--max-tokens MAX_TOKENS] [--chunk-size CHUNK_SIZE] [--n-subset-max N_SUBSET_MAX] [--k K] \
[--min-examples-required MIN_EXAMPLES_REQUIRED] [--no-entity-types] [--output OUTPUT]
[--min-examples-required MIN_EXAMPLES_REQUIRED] [--discover-entity-types] [--output OUTPUT]
```
## Command-Line Options
@ -49,7 +49,7 @@ graphrag prompt-tune [--root ROOT] [--domain DOMAIN] [--method METHOD] [--limit
- `--min-examples-required` (optional): The minimum number of examples required for entity extraction prompts. Default is 2.
- `--no-entity-types` (optional): Use untyped entity extraction generation. We recommend using this when your data covers a lot of topics or it is highly randomized.
- `--discover-entity-types` (optional): Allow the LLM to discover and extract entities automatically. We recommend using this when your data covers a lot of topics or it is highly randomized.
- `--output` (optional): The folder to save the generated prompts. Default is "prompts".

View File

@ -5,7 +5,7 @@ import os
import pandas as pd
from graphrag.index import run_pipeline_with_config
from graphrag.index.run import run_pipeline_with_config
pipeline_file = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "./pipeline.yml"

View File

@ -5,8 +5,8 @@ import os
import pandas as pd
from graphrag.index import run_pipeline, run_pipeline_with_config
from graphrag.index.config import PipelineWorkflowReference
from graphrag.index.config.workflow import PipelineWorkflowReference
from graphrag.index.run import run_pipeline, run_pipeline_with_config
# our fake dataset
dataset = pd.DataFrame([{"col1": 2, "col2": 4}, {"col1": 5, "col2": 10}])

View File

@ -3,9 +3,10 @@
import asyncio
import os
from graphrag.index import run_pipeline, run_pipeline_with_config
from graphrag.index.config import PipelineCSVInputConfig, PipelineWorkflowReference
from graphrag.index.input import load_input
from graphrag.index.config.input import PipelineCSVInputConfig
from graphrag.index.config.workflow import PipelineWorkflowReference
from graphrag.index.input.load_input import load_input
from graphrag.index.run import run_pipeline, run_pipeline_with_config
sample_data_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "../_sample_data/"

View File

@ -3,6 +3,6 @@
"""The GraphRAG package."""
from .cli.main import app
from graphrag.cli.main import app
app(prog_name="graphrag")

View File

@ -8,7 +8,7 @@ Backwards compatibility is not guaranteed at this time.
"""
from graphrag.api.index import build_index
from graphrag.api.prompt_tune import DocSelectionType, generate_indexing_prompts
from graphrag.api.prompt_tune import generate_indexing_prompts
from graphrag.api.query import (
drift_search,
global_search,
@ -16,6 +16,7 @@ from graphrag.api.query import (
local_search,
local_search_streaming,
)
from graphrag.prompt_tune.types import DocSelectionType
__all__ = [ # noqa: RUF022
# index API

View File

@ -10,13 +10,14 @@ Backwards compatibility is not guaranteed at this time.
from pathlib import Path
from graphrag.config import CacheType, GraphRagConfig
from graphrag.config.enums import CacheType
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.index.cache.noop_pipeline_cache import NoopPipelineCache
from graphrag.index.create_pipeline_config import create_pipeline_config
from graphrag.index.emit.types import TableEmitterType
from graphrag.index.run import run_pipeline_with_config
from graphrag.index.typing import PipelineRunResult
from graphrag.logging import ProgressReporter
from graphrag.logging.base import ProgressReporter
from graphrag.vector_stores.factory import VectorStoreType

View File

@ -15,25 +15,32 @@ from datashaper import NoopVerbCallbacks
from pydantic import PositiveInt, validate_call
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.index.llm import load_llm
from graphrag.logging import PrintProgressReporter
from graphrag.prompt_tune.generator import (
MAX_TOKEN_COUNT,
create_community_summarization_prompt,
create_entity_extraction_prompt,
create_entity_summarization_prompt,
detect_language,
from graphrag.index.llm.load_llm import load_llm
from graphrag.logging.print_progress import PrintProgressReporter
from graphrag.prompt_tune.defaults import MAX_TOKEN_COUNT
from graphrag.prompt_tune.generator.community_report_rating import (
generate_community_report_rating,
)
from graphrag.prompt_tune.generator.community_report_summarization import (
create_community_summarization_prompt,
)
from graphrag.prompt_tune.generator.community_reporter_role import (
generate_community_reporter_role,
generate_domain,
)
from graphrag.prompt_tune.generator.domain import generate_domain
from graphrag.prompt_tune.generator.entity_extraction_prompt import (
create_entity_extraction_prompt,
)
from graphrag.prompt_tune.generator.entity_relationship import (
generate_entity_relationship_examples,
generate_entity_types,
generate_persona,
)
from graphrag.prompt_tune.loader import (
MIN_CHUNK_SIZE,
load_docs_in_chunks,
from graphrag.prompt_tune.generator.entity_summarization_prompt import (
create_entity_summarization_prompt,
)
from graphrag.prompt_tune.generator.entity_types import generate_entity_types
from graphrag.prompt_tune.generator.language import detect_language
from graphrag.prompt_tune.generator.persona import generate_persona
from graphrag.prompt_tune.loader.input import MIN_CHUNK_SIZE, load_docs_in_chunks
from graphrag.prompt_tune.types import DocSelectionType

View File

@ -24,12 +24,12 @@ from typing import Any
import pandas as pd
from pydantic import validate_call
from graphrag.config import GraphRagConfig
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.index.config.embeddings import (
community_full_content_embedding,
entity_description_embedding,
)
from graphrag.logging import PrintProgressReporter
from graphrag.logging.print_progress import PrintProgressReporter
from graphrag.query.factories import (
get_drift_search_engine,
get_global_search_engine,
@ -47,8 +47,8 @@ from graphrag.query.indexer_adapters import (
from graphrag.query.structured_search.base import SearchResult # noqa: TCH001
from graphrag.utils.cli import redact
from graphrag.utils.embeddings import create_collection_name
from graphrag.vector_stores import VectorStoreFactory, VectorStoreType
from graphrag.vector_stores.base import BaseVectorStore
from graphrag.vector_stores.factory import VectorStoreFactory, VectorStoreType
reporter = PrintProgressReporter("")

View File

@ -8,17 +8,16 @@ from typing import cast
from datashaper import WorkflowCallbacks
from graphrag.config import ReportingType
from graphrag.index.config import (
from graphrag.callbacks.blob_workflow_callbacks import BlobWorkflowCallbacks
from graphrag.callbacks.console_workflow_callbacks import ConsoleWorkflowCallbacks
from graphrag.callbacks.file_workflow_callbacks import FileWorkflowCallbacks
from graphrag.config.enums import ReportingType
from graphrag.index.config.reporting import (
PipelineBlobReportingConfig,
PipelineFileReportingConfig,
PipelineReportingConfig,
)
from .blob_workflow_callbacks import BlobWorkflowCallbacks
from .console_workflow_callbacks import ConsoleWorkflowCallbacks
from .file_workflow_callbacks import FileWorkflowCallbacks
def create_pipeline_reporter(
config: PipelineReportingConfig | None, root_dir: str | None

View File

@ -3,10 +3,9 @@
"""GlobalSearch LLM Callbacks."""
from graphrag.callbacks.llm_callbacks import BaseLLMCallback
from graphrag.query.structured_search.base import SearchResult
from .llm_callbacks import BaseLLMCallback
class GlobalSearchLLMCallback(BaseLLMCallback):
"""GlobalSearch LLM Callbacks."""

View File

@ -7,7 +7,7 @@ from typing import Any
from datashaper import ExecutionNode, NoopWorkflowCallbacks, Progress, TableContainer
from graphrag.logging import ProgressReporter
from graphrag.logging.base import ProgressReporter
class ProgressWorkflowCallbacks(NoopWorkflowCallbacks):

View File

@ -11,15 +11,15 @@ import warnings
from pathlib import Path
import graphrag.api as api
from graphrag.config import (
CacheType,
enable_logging_with_config,
load_config,
resolve_paths,
)
from graphrag.config.enums import CacheType
from graphrag.config.load_config import load_config
from graphrag.config.logging import enable_logging_with_config
from graphrag.config.resolve_path import resolve_paths
from graphrag.index.emit.types import TableEmitterType
from graphrag.index.validate_config import validate_config_names
from graphrag.logging import ProgressReporter, ReporterType, create_progress_reporter
from graphrag.logging.base import ProgressReporter
from graphrag.logging.factories import create_progress_reporter
from graphrag.logging.types import ReporterType
from graphrag.utils.cli import redact
# Ignore warnings from numba

View File

@ -6,7 +6,8 @@
from pathlib import Path
from graphrag.config.init_content import INIT_DOTENV, INIT_YAML
from graphrag.logging import ReporterType, create_progress_reporter
from graphrag.logging.factories import create_progress_reporter
from graphrag.logging.types import ReporterType
from graphrag.prompts.index.claim_extraction import CLAIM_EXTRACTION_PROMPT
from graphrag.prompts.index.community_report import (
COMMUNITY_REPORT_PROMPT,

View File

@ -3,23 +3,24 @@
"""CLI entrypoint."""
import asyncio
import os
import re
from collections.abc import Callable
from enum import Enum
from pathlib import Path
from typing import Annotated
import typer
from graphrag.api import DocSelectionType
from graphrag.index.emit.types import TableEmitterType
from graphrag.logging import ReporterType
from graphrag.prompt_tune.generator import MAX_TOKEN_COUNT
from graphrag.prompt_tune.loader import MIN_CHUNK_SIZE
from .index import index_cli, update_cli
from .initialize import initialize_project_at
from .prompt_tune import prompt_tune
from .query import run_drift_search, run_global_search, run_local_search
from graphrag.logging.types import ReporterType
from graphrag.prompt_tune.defaults import (
MAX_TOKEN_COUNT,
MIN_CHUNK_SIZE,
N_SUBSET_MAX,
K,
)
from graphrag.prompt_tune.types import DocSelectionType
INVALID_METHOD_ERROR = "Invalid method"
@ -29,6 +30,48 @@ app = typer.Typer(
)
# A workaround for typer's lack of support for proper autocompletion of file/directory paths
# For more detail, watch
# https://github.com/fastapi/typer/discussions/682
# https://github.com/fastapi/typer/issues/951
def path_autocomplete(
file_okay: bool = True,
dir_okay: bool = True,
readable: bool = True,
writable: bool = False,
match_wildcard: str | None = None,
) -> Callable[[str], list[str]]:
"""Autocomplete file and directory paths."""
def wildcard_match(string: str, pattern: str) -> bool:
regex = re.escape(pattern).replace(r"\?", ".").replace(r"\*", ".*")
return re.fullmatch(regex, string) is not None
def completer(incomplete: str) -> list[str]:
items = os.listdir()
completions = []
for item in items:
if not file_okay and Path(item).is_file():
continue
if not dir_okay and Path(item).is_dir():
continue
if readable and not os.access(item, os.R_OK):
continue
if writable and not os.access(item, os.W_OK):
continue
completions.append(item)
if match_wildcard:
completions = filter(
lambda i: wildcard_match(i, match_wildcard)
if match_wildcard
else False,
completions,
)
return [i for i in completions if i.startswith(incomplete)]
return completer
class SearchType(Enum):
"""The type of search to run."""
@ -50,10 +93,15 @@ def _initialize_cli(
dir_okay=True,
writable=True,
resolve_path=True,
autocompletion=path_autocomplete(
file_okay=False, dir_okay=True, writable=True, match_wildcard="*"
),
),
],
):
"""Generate a default configuration file."""
from graphrag.cli.initialize import initialize_project_at
initialize_project_at(path=root)
@ -73,6 +121,9 @@ def _index_cli(
dir_okay=True,
writable=True,
resolve_path=True,
autocompletion=path_autocomplete(
file_okay=False, dir_okay=True, writable=True, match_wildcard="*"
),
),
] = Path(), # set default to current directory
verbose: Annotated[
@ -114,6 +165,8 @@ def _index_cli(
] = None,
):
"""Build a knowledge graph index."""
from graphrag.cli.index import index_cli
index_cli(
root_dir=root,
verbose=verbose,
@ -181,6 +234,8 @@ def _update_cli(
Applies a default storage configuration (if not provided by config), saving the new index to the local file system in the `update_output` folder.
"""
from graphrag.cli.index import update_cli
update_cli(
root_dir=root,
verbose=verbose,
@ -204,12 +259,21 @@ def _prompt_tune_cli(
dir_okay=True,
writable=True,
resolve_path=True,
autocompletion=path_autocomplete(
file_okay=False, dir_okay=True, writable=True, match_wildcard="*"
),
),
] = Path(), # set default to current directory
config: Annotated[
Path | None,
typer.Option(
help="The configuration to use.", exists=True, file_okay=True, readable=True
help="The configuration to use.",
exists=True,
file_okay=True,
readable=True,
autocompletion=path_autocomplete(
file_okay=True, dir_okay=False, match_wildcard="*"
),
),
] = None,
domain: Annotated[
@ -226,13 +290,13 @@ def _prompt_tune_cli(
typer.Option(
help="The number of text chunks to embed when --selection-method=auto."
),
] = 300,
] = N_SUBSET_MAX,
k: Annotated[
int,
typer.Option(
help="The maximum number of documents to select from each centroid when --selection-method=auto."
),
] = 15,
] = K,
limit: Annotated[
int,
typer.Option(
@ -271,6 +335,10 @@ def _prompt_tune_cli(
] = Path("prompts"),
):
"""Generate custom graphrag prompts with your own data (i.e. auto templating)."""
import asyncio
from graphrag.cli.prompt_tune import prompt_tune
loop = asyncio.get_event_loop()
loop.run_until_complete(
prompt_tune(
@ -298,7 +366,13 @@ def _query_cli(
config: Annotated[
Path | None,
typer.Option(
help="The configuration to use.", exists=True, file_okay=True, readable=True
help="The configuration to use.",
exists=True,
file_okay=True,
readable=True,
autocompletion=path_autocomplete(
file_okay=True, dir_okay=False, match_wildcard="*"
),
),
] = None,
data: Annotated[
@ -309,6 +383,9 @@ def _query_cli(
dir_okay=True,
readable=True,
resolve_path=True,
autocompletion=path_autocomplete(
file_okay=False, dir_okay=True, match_wildcard="*"
),
),
] = None,
root: Annotated[
@ -319,6 +396,9 @@ def _query_cli(
dir_okay=True,
writable=True,
resolve_path=True,
autocompletion=path_autocomplete(
file_okay=False, dir_okay=True, match_wildcard="*"
),
),
] = Path(), # set default to current directory
community_level: Annotated[
@ -342,6 +422,8 @@ def _query_cli(
] = False,
):
"""Query a knowledge graph index."""
from graphrag.cli.query import run_drift_search, run_global_search, run_local_search
match method:
case SearchType.LOCAL:
run_local_search(

View File

@ -6,8 +6,8 @@
from pathlib import Path
import graphrag.api as api
from graphrag.config import load_config
from graphrag.logging import PrintProgressReporter
from graphrag.config.load_config import load_config
from graphrag.logging.print_progress import PrintProgressReporter
from graphrag.prompt_tune.generator.community_report_summarization import (
COMMUNITY_SUMMARIZATION_FILENAME,
)

View File

@ -10,9 +10,11 @@ from pathlib import Path
import pandas as pd
import graphrag.api as api
from graphrag.config import GraphRagConfig, load_config, resolve_paths
from graphrag.config.load_config import load_config
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.config.resolve_path import resolve_paths
from graphrag.index.create_pipeline_config import create_pipeline_config
from graphrag.logging import PrintProgressReporter
from graphrag.logging.print_progress import PrintProgressReporter
from graphrag.utils.storage import _create_storage, _load_table_from_storage
reporter = PrintProgressReporter("")

View File

@ -2,134 +2,3 @@
# Licensed under the MIT License
"""The Indexing Engine default config package root."""
from .config_file_loader import load_config_from_file, search_for_config_in_root_dir
from .create_graphrag_config import (
create_graphrag_config,
)
from .enums import (
CacheType,
InputFileType,
InputType,
LLMType,
ReportingType,
StorageType,
TextEmbeddingTarget,
)
from .errors import (
ApiKeyMissingError,
AzureApiBaseMissingError,
AzureDeploymentNameMissingError,
)
from .input_models import (
CacheConfigInput,
ChunkingConfigInput,
ClaimExtractionConfigInput,
ClusterGraphConfigInput,
CommunityReportsConfigInput,
EmbedGraphConfigInput,
EntityExtractionConfigInput,
GlobalSearchConfigInput,
GraphRagConfigInput,
InputConfigInput,
LLMConfigInput,
LLMParametersInput,
LocalSearchConfigInput,
ParallelizationParametersInput,
ReportingConfigInput,
SnapshotsConfigInput,
StorageConfigInput,
SummarizeDescriptionsConfigInput,
TextEmbeddingConfigInput,
UmapConfigInput,
)
from .load_config import load_config
from .logging import enable_logging_with_config
from .models import (
CacheConfig,
ChunkingConfig,
ClaimExtractionConfig,
ClusterGraphConfig,
CommunityReportsConfig,
DRIFTSearchConfig,
EmbedGraphConfig,
EntityExtractionConfig,
GlobalSearchConfig,
GraphRagConfig,
InputConfig,
LLMConfig,
LLMParameters,
LocalSearchConfig,
ParallelizationParameters,
ReportingConfig,
SnapshotsConfig,
StorageConfig,
SummarizeDescriptionsConfig,
TextEmbeddingConfig,
UmapConfig,
)
from .read_dotenv import read_dotenv
from .resolve_path import resolve_path, resolve_paths
__all__ = [
"ApiKeyMissingError",
"AzureApiBaseMissingError",
"AzureDeploymentNameMissingError",
"CacheConfig",
"CacheConfigInput",
"CacheType",
"ChunkingConfig",
"ChunkingConfigInput",
"ClaimExtractionConfig",
"ClaimExtractionConfigInput",
"ClusterGraphConfig",
"ClusterGraphConfigInput",
"CommunityReportsConfig",
"CommunityReportsConfigInput",
"DRIFTSearchConfig",
"EmbedGraphConfig",
"EmbedGraphConfigInput",
"EntityExtractionConfig",
"EntityExtractionConfigInput",
"GlobalSearchConfig",
"GlobalSearchConfigInput",
"GraphRagConfig",
"GraphRagConfigInput",
"InputConfig",
"InputConfigInput",
"InputFileType",
"InputType",
"LLMConfig",
"LLMConfigInput",
"LLMParameters",
"LLMParametersInput",
"LLMType",
"LocalSearchConfig",
"LocalSearchConfigInput",
"ParallelizationParameters",
"ParallelizationParametersInput",
"ReportingConfig",
"ReportingConfigInput",
"ReportingType",
"SnapshotsConfig",
"SnapshotsConfigInput",
"StorageConfig",
"StorageConfigInput",
"StorageType",
"StorageType",
"SummarizeDescriptionsConfig",
"SummarizeDescriptionsConfigInput",
"TextEmbeddingConfig",
"TextEmbeddingConfigInput",
"TextEmbeddingTarget",
"UmapConfig",
"UmapConfigInput",
"create_graphrag_config",
"enable_logging_with_config",
"load_config",
"load_config_from_file",
"read_dotenv",
"resolve_path",
"resolve_paths",
"search_for_config_in_root_dir",
]

View File

@ -9,8 +9,8 @@ from pathlib import Path
import yaml
from .create_graphrag_config import create_graphrag_config
from .models.graph_rag_config import GraphRagConfig
from graphrag.config.create_graphrag_config import create_graphrag_config
from graphrag.config.models.graph_rag_config import GraphRagConfig
_default_config_files = ["settings.yaml", "settings.yml", "settings.json"]

View File

@ -13,8 +13,7 @@ from environs import Env
from pydantic import TypeAdapter
import graphrag.config.defaults as defs
from .enums import (
from graphrag.config.enums import (
CacheType,
InputFileType,
InputType,
@ -23,39 +22,37 @@ from .enums import (
StorageType,
TextEmbeddingTarget,
)
from .environment_reader import EnvironmentReader
from .errors import (
from graphrag.config.environment_reader import EnvironmentReader
from graphrag.config.errors import (
ApiKeyMissingError,
AzureApiBaseMissingError,
AzureDeploymentNameMissingError,
)
from .input_models import (
GraphRagConfigInput,
LLMConfigInput,
)
from .models import (
CacheConfig,
ChunkingConfig,
ClaimExtractionConfig,
ClusterGraphConfig,
CommunityReportsConfig,
DRIFTSearchConfig,
EmbedGraphConfig,
EntityExtractionConfig,
GlobalSearchConfig,
GraphRagConfig,
InputConfig,
LLMParameters,
LocalSearchConfig,
ParallelizationParameters,
ReportingConfig,
SnapshotsConfig,
StorageConfig,
from graphrag.config.input_models.graphrag_config_input import GraphRagConfigInput
from graphrag.config.input_models.llm_config_input import LLMConfigInput
from graphrag.config.models.cache_config import CacheConfig
from graphrag.config.models.chunking_config import ChunkingConfig
from graphrag.config.models.claim_extraction_config import ClaimExtractionConfig
from graphrag.config.models.cluster_graph_config import ClusterGraphConfig
from graphrag.config.models.community_reports_config import CommunityReportsConfig
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.config.models.embed_graph_config import EmbedGraphConfig
from graphrag.config.models.entity_extraction_config import EntityExtractionConfig
from graphrag.config.models.global_search_config import GlobalSearchConfig
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.config.models.input_config import InputConfig
from graphrag.config.models.llm_parameters import LLMParameters
from graphrag.config.models.local_search_config import LocalSearchConfig
from graphrag.config.models.parallelization_parameters import ParallelizationParameters
from graphrag.config.models.reporting_config import ReportingConfig
from graphrag.config.models.snapshots_config import SnapshotsConfig
from graphrag.config.models.storage_config import StorageConfig
from graphrag.config.models.summarize_descriptions_config import (
SummarizeDescriptionsConfig,
TextEmbeddingConfig,
UmapConfig,
)
from .read_dotenv import read_dotenv
from graphrag.config.models.text_embedding_config import TextEmbeddingConfig
from graphrag.config.models.umap_config import UmapConfig
from graphrag.config.read_dotenv import read_dotenv
InputModelValidator = TypeAdapter(GraphRagConfigInput)

View File

@ -7,9 +7,7 @@ from pathlib import Path
from datashaper import AsyncType
from graphrag.vector_stores import VectorStoreType
from .enums import (
from graphrag.config.enums import (
CacheType,
InputFileType,
InputType,
@ -18,6 +16,7 @@ from .enums import (
StorageType,
TextEmbeddingTarget,
)
from graphrag.vector_stores.factory import VectorStoreType
ASYNC_MODE = AsyncType.Threaded
ENCODING_MODEL = "cl100k_base"

View File

@ -2,49 +2,3 @@
# Licensed under the MIT License
"""Interfaces for Default Config parameterization."""
from .cache_config_input import CacheConfigInput
from .chunking_config_input import ChunkingConfigInput
from .claim_extraction_config_input import ClaimExtractionConfigInput
from .cluster_graph_config_input import ClusterGraphConfigInput
from .community_reports_config_input import CommunityReportsConfigInput
from .embed_graph_config_input import EmbedGraphConfigInput
from .entity_extraction_config_input import EntityExtractionConfigInput
from .global_search_config_input import GlobalSearchConfigInput
from .graphrag_config_input import GraphRagConfigInput
from .input_config_input import InputConfigInput
from .llm_config_input import LLMConfigInput
from .llm_parameters_input import LLMParametersInput
from .local_search_config_input import LocalSearchConfigInput
from .parallelization_parameters_input import ParallelizationParametersInput
from .reporting_config_input import ReportingConfigInput
from .snapshots_config_input import SnapshotsConfigInput
from .storage_config_input import StorageConfigInput
from .summarize_descriptions_config_input import (
SummarizeDescriptionsConfigInput,
)
from .text_embedding_config_input import TextEmbeddingConfigInput
from .umap_config_input import UmapConfigInput
__all__ = [
"CacheConfigInput",
"ChunkingConfigInput",
"ClaimExtractionConfigInput",
"ClusterGraphConfigInput",
"CommunityReportsConfigInput",
"EmbedGraphConfigInput",
"EntityExtractionConfigInput",
"GlobalSearchConfigInput",
"GraphRagConfigInput",
"InputConfigInput",
"LLMConfigInput",
"LLMParametersInput",
"LocalSearchConfigInput",
"ParallelizationParametersInput",
"ReportingConfigInput",
"SnapshotsConfigInput",
"StorageConfigInput",
"SummarizeDescriptionsConfigInput",
"TextEmbeddingConfigInput",
"UmapConfigInput",
]

View File

@ -5,7 +5,7 @@
from typing_extensions import NotRequired
from .llm_config_input import LLMConfigInput
from graphrag.config.input_models.llm_config_input import LLMConfigInput
class ClaimExtractionConfigInput(LLMConfigInput):

View File

@ -5,7 +5,7 @@
from typing_extensions import NotRequired
from .llm_config_input import LLMConfigInput
from graphrag.config.input_models.llm_config_input import LLMConfigInput
class CommunityReportsConfigInput(LLMConfigInput):

View File

@ -5,7 +5,7 @@
from typing_extensions import NotRequired
from .llm_config_input import LLMConfigInput
from graphrag.config.input_models.llm_config_input import LLMConfigInput
class EntityExtractionConfigInput(LLMConfigInput):

View File

@ -5,25 +5,39 @@
from typing_extensions import NotRequired
from .cache_config_input import CacheConfigInput
from .chunking_config_input import ChunkingConfigInput
from .claim_extraction_config_input import ClaimExtractionConfigInput
from .cluster_graph_config_input import ClusterGraphConfigInput
from .community_reports_config_input import CommunityReportsConfigInput
from .embed_graph_config_input import EmbedGraphConfigInput
from .entity_extraction_config_input import EntityExtractionConfigInput
from .global_search_config_input import GlobalSearchConfigInput
from .input_config_input import InputConfigInput
from .llm_config_input import LLMConfigInput
from .local_search_config_input import LocalSearchConfigInput
from .reporting_config_input import ReportingConfigInput
from .snapshots_config_input import SnapshotsConfigInput
from .storage_config_input import StorageConfigInput
from .summarize_descriptions_config_input import (
from graphrag.config.input_models.cache_config_input import CacheConfigInput
from graphrag.config.input_models.chunking_config_input import ChunkingConfigInput
from graphrag.config.input_models.claim_extraction_config_input import (
ClaimExtractionConfigInput,
)
from graphrag.config.input_models.cluster_graph_config_input import (
ClusterGraphConfigInput,
)
from graphrag.config.input_models.community_reports_config_input import (
CommunityReportsConfigInput,
)
from graphrag.config.input_models.embed_graph_config_input import EmbedGraphConfigInput
from graphrag.config.input_models.entity_extraction_config_input import (
EntityExtractionConfigInput,
)
from graphrag.config.input_models.global_search_config_input import (
GlobalSearchConfigInput,
)
from graphrag.config.input_models.input_config_input import InputConfigInput
from graphrag.config.input_models.llm_config_input import LLMConfigInput
from graphrag.config.input_models.local_search_config_input import (
LocalSearchConfigInput,
)
from graphrag.config.input_models.reporting_config_input import ReportingConfigInput
from graphrag.config.input_models.snapshots_config_input import SnapshotsConfigInput
from graphrag.config.input_models.storage_config_input import StorageConfigInput
from graphrag.config.input_models.summarize_descriptions_config_input import (
SummarizeDescriptionsConfigInput,
)
from .text_embedding_config_input import TextEmbeddingConfigInput
from .umap_config_input import UmapConfigInput
from graphrag.config.input_models.text_embedding_config_input import (
TextEmbeddingConfigInput,
)
from graphrag.config.input_models.umap_config_input import UmapConfigInput
class GraphRagConfigInput(LLMConfigInput):

View File

@ -6,8 +6,10 @@
from datashaper import AsyncType
from typing_extensions import NotRequired, TypedDict
from .llm_parameters_input import LLMParametersInput
from .parallelization_parameters_input import ParallelizationParametersInput
from graphrag.config.input_models.llm_parameters_input import LLMParametersInput
from graphrag.config.input_models.parallelization_parameters_input import (
ParallelizationParametersInput,
)
class LLMConfigInput(TypedDict):

View File

@ -5,7 +5,7 @@
from typing_extensions import NotRequired
from .llm_config_input import LLMConfigInput
from graphrag.config.input_models.llm_config_input import LLMConfigInput
class SummarizeDescriptionsConfigInput(LLMConfigInput):

View File

@ -8,8 +8,7 @@ from typing_extensions import NotRequired
from graphrag.config.enums import (
TextEmbeddingTarget,
)
from .llm_config_input import LLMConfigInput
from graphrag.config.input_models.llm_config_input import LLMConfigInput
class TextEmbeddingConfigInput(LLMConfigInput):

View File

@ -5,9 +5,12 @@
from pathlib import Path
from .config_file_loader import load_config_from_file, search_for_config_in_root_dir
from .create_graphrag_config import create_graphrag_config
from .models.graph_rag_config import GraphRagConfig
from graphrag.config.config_file_loader import (
load_config_from_file,
search_for_config_in_root_dir,
)
from graphrag.config.create_graphrag_config import create_graphrag_config
from graphrag.config.models.graph_rag_config import GraphRagConfig
def load_config(

View File

@ -6,8 +6,8 @@
import logging
from pathlib import Path
from .enums import ReportingType
from .models.graph_rag_config import GraphRagConfig
from graphrag.config.enums import ReportingType
from graphrag.config.models.graph_rag_config import GraphRagConfig
def enable_logging(log_filepath: str | Path, verbose: bool = False) -> None:

View File

@ -2,49 +2,3 @@
# Licensed under the MIT License
"""Interfaces for Default Config parameterization."""
from .cache_config import CacheConfig
from .chunking_config import ChunkingConfig
from .claim_extraction_config import ClaimExtractionConfig
from .cluster_graph_config import ClusterGraphConfig
from .community_reports_config import CommunityReportsConfig
from .drift_search_config import DRIFTSearchConfig
from .embed_graph_config import EmbedGraphConfig
from .entity_extraction_config import EntityExtractionConfig
from .global_search_config import GlobalSearchConfig
from .graph_rag_config import GraphRagConfig
from .input_config import InputConfig
from .llm_config import LLMConfig
from .llm_parameters import LLMParameters
from .local_search_config import LocalSearchConfig
from .parallelization_parameters import ParallelizationParameters
from .reporting_config import ReportingConfig
from .snapshots_config import SnapshotsConfig
from .storage_config import StorageConfig
from .summarize_descriptions_config import SummarizeDescriptionsConfig
from .text_embedding_config import TextEmbeddingConfig
from .umap_config import UmapConfig
__all__ = [
"CacheConfig",
"ChunkingConfig",
"ClaimExtractionConfig",
"ClusterGraphConfig",
"CommunityReportsConfig",
"DRIFTSearchConfig",
"EmbedGraphConfig",
"EntityExtractionConfig",
"GlobalSearchConfig",
"GraphRagConfig",
"InputConfig",
"LLMConfig",
"LLMParameters",
"LocalSearchConfig",
"ParallelizationParameters",
"ReportingConfig",
"SnapshotsConfig",
"StorageConfig",
"SummarizeDescriptionsConfig",
"TextEmbeddingConfig",
"UmapConfig",
]

View File

@ -8,8 +8,7 @@ from pathlib import Path
from pydantic import Field
import graphrag.config.defaults as defs
from .llm_config import LLMConfig
from graphrag.config.models.llm_config import LLMConfig
class ClaimExtractionConfig(LLMConfig):

View File

@ -8,8 +8,7 @@ from pathlib import Path
from pydantic import Field
import graphrag.config.defaults as defs
from .llm_config import LLMConfig
from graphrag.config.models.llm_config import LLMConfig
class CommunityReportsConfig(LLMConfig):

View File

@ -8,8 +8,7 @@ from pathlib import Path
from pydantic import Field
import graphrag.config.defaults as defs
from .llm_config import LLMConfig
from graphrag.config.models.llm_config import LLMConfig
class EntityExtractionConfig(LLMConfig):

View File

@ -7,27 +7,26 @@ from devtools import pformat
from pydantic import Field
import graphrag.config.defaults as defs
from .cache_config import CacheConfig
from .chunking_config import ChunkingConfig
from .claim_extraction_config import ClaimExtractionConfig
from .cluster_graph_config import ClusterGraphConfig
from .community_reports_config import CommunityReportsConfig
from .drift_search_config import DRIFTSearchConfig
from .embed_graph_config import EmbedGraphConfig
from .entity_extraction_config import EntityExtractionConfig
from .global_search_config import GlobalSearchConfig
from .input_config import InputConfig
from .llm_config import LLMConfig
from .local_search_config import LocalSearchConfig
from .reporting_config import ReportingConfig
from .snapshots_config import SnapshotsConfig
from .storage_config import StorageConfig
from .summarize_descriptions_config import (
from graphrag.config.models.cache_config import CacheConfig
from graphrag.config.models.chunking_config import ChunkingConfig
from graphrag.config.models.claim_extraction_config import ClaimExtractionConfig
from graphrag.config.models.cluster_graph_config import ClusterGraphConfig
from graphrag.config.models.community_reports_config import CommunityReportsConfig
from graphrag.config.models.drift_search_config import DRIFTSearchConfig
from graphrag.config.models.embed_graph_config import EmbedGraphConfig
from graphrag.config.models.entity_extraction_config import EntityExtractionConfig
from graphrag.config.models.global_search_config import GlobalSearchConfig
from graphrag.config.models.input_config import InputConfig
from graphrag.config.models.llm_config import LLMConfig
from graphrag.config.models.local_search_config import LocalSearchConfig
from graphrag.config.models.reporting_config import ReportingConfig
from graphrag.config.models.snapshots_config import SnapshotsConfig
from graphrag.config.models.storage_config import StorageConfig
from graphrag.config.models.summarize_descriptions_config import (
SummarizeDescriptionsConfig,
)
from .text_embedding_config import TextEmbeddingConfig
from .umap_config import UmapConfig
from graphrag.config.models.text_embedding_config import TextEmbeddingConfig
from graphrag.config.models.umap_config import UmapConfig
class GraphRagConfig(LLMConfig):

View File

@ -7,9 +7,8 @@ from datashaper import AsyncType
from pydantic import BaseModel, Field
import graphrag.config.defaults as defs
from .llm_parameters import LLMParameters
from .parallelization_parameters import ParallelizationParameters
from graphrag.config.models.llm_parameters import LLMParameters
from graphrag.config.models.parallelization_parameters import ParallelizationParameters
class LLMConfig(BaseModel):

View File

@ -8,8 +8,7 @@ from pathlib import Path
from pydantic import Field
import graphrag.config.defaults as defs
from .llm_config import LLMConfig
from graphrag.config.models.llm_config import LLMConfig
class SummarizeDescriptionsConfig(LLMConfig):

View File

@ -7,8 +7,7 @@ from pydantic import Field
import graphrag.config.defaults as defs
from graphrag.config.enums import TextEmbeddingTarget
from .llm_config import LLMConfig
from graphrag.config.models.llm_config import LLMConfig
class TextEmbeddingConfig(LLMConfig):

View File

@ -7,8 +7,8 @@ import re
from pathlib import Path
from string import Template
from .enums import ReportingType, StorageType
from .models.graph_rag_config import GraphRagConfig
from graphrag.config.enums import ReportingType, StorageType
from graphrag.config.models.graph_rag_config import GraphRagConfig
def _resolve_timestamp_path_with_value(path: str | Path, timestamp_value: str) -> Path:

View File

@ -1,78 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""The Indexing Engine package root."""
from .cache import PipelineCache
from .config import (
PipelineBlobCacheConfig,
PipelineBlobReportingConfig,
PipelineBlobStorageConfig,
PipelineCacheConfig,
PipelineCacheConfigTypes,
PipelineConfig,
PipelineConsoleReportingConfig,
PipelineCSVInputConfig,
PipelineFileCacheConfig,
PipelineFileReportingConfig,
PipelineFileStorageConfig,
PipelineInputConfig,
PipelineInputConfigTypes,
PipelineMemoryCacheConfig,
PipelineMemoryStorageConfig,
PipelineNoneCacheConfig,
PipelineReportingConfig,
PipelineReportingConfigTypes,
PipelineStorageConfig,
PipelineStorageConfigTypes,
PipelineTextInputConfig,
PipelineWorkflowConfig,
PipelineWorkflowReference,
PipelineWorkflowStep,
)
from .create_pipeline_config import create_pipeline_config
from .errors import (
NoWorkflowsDefinedError,
UndefinedWorkflowError,
UnknownWorkflowError,
)
from .load_pipeline_config import load_pipeline_config
from .run import run_pipeline, run_pipeline_with_config
from .storage import PipelineStorage
__all__ = [
"NoWorkflowsDefinedError",
"PipelineBlobCacheConfig",
"PipelineBlobCacheConfig",
"PipelineBlobReportingConfig",
"PipelineBlobStorageConfig",
"PipelineCSVInputConfig",
"PipelineCache",
"PipelineCacheConfig",
"PipelineCacheConfigTypes",
"PipelineConfig",
"PipelineConsoleReportingConfig",
"PipelineFileCacheConfig",
"PipelineFileReportingConfig",
"PipelineFileStorageConfig",
"PipelineInputConfig",
"PipelineInputConfigTypes",
"PipelineMemoryCacheConfig",
"PipelineMemoryStorageConfig",
"PipelineNoneCacheConfig",
"PipelineReportingConfig",
"PipelineReportingConfigTypes",
"PipelineStorage",
"PipelineStorageConfig",
"PipelineStorageConfigTypes",
"PipelineTextInputConfig",
"PipelineWorkflowConfig",
"PipelineWorkflowReference",
"PipelineWorkflowStep",
"UndefinedWorkflowError",
"UnknownWorkflowError",
"create_pipeline_config",
"load_pipeline_config",
"run_pipeline",
"run_pipeline_with_config",
]
"""The indexing engine package root."""

View File

@ -2,17 +2,3 @@
# Licensed under the MIT License
"""The Indexing Engine cache package root."""
from .json_pipeline_cache import JsonPipelineCache
from .load_cache import load_cache
from .memory_pipeline_cache import InMemoryCache
from .noop_pipeline_cache import NoopPipelineCache
from .pipeline_cache import PipelineCache
__all__ = [
"InMemoryCache",
"JsonPipelineCache",
"NoopPipelineCache",
"PipelineCache",
"load_cache",
]

View File

@ -6,9 +6,8 @@
import json
from typing import Any
from graphrag.index.storage import PipelineStorage
from .pipeline_cache import PipelineCache
from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.storage.pipeline_storage import PipelineStorage
class JsonPipelineCache(PipelineCache):

View File

@ -12,16 +12,17 @@ from graphrag.index.config.cache import (
PipelineBlobCacheConfig,
PipelineFileCacheConfig,
)
from graphrag.index.storage import BlobPipelineStorage, FilePipelineStorage
from graphrag.index.storage.blob_pipeline_storage import BlobPipelineStorage
from graphrag.index.storage.file_pipeline_storage import FilePipelineStorage
if TYPE_CHECKING:
from graphrag.index.config import (
from graphrag.index.config.cache import (
PipelineCacheConfig,
)
from .json_pipeline_cache import JsonPipelineCache
from .memory_pipeline_cache import create_memory_cache
from .noop_pipeline_cache import NoopPipelineCache
from graphrag.index.cache.json_pipeline_cache import JsonPipelineCache
from graphrag.index.cache.memory_pipeline_cache import create_memory_cache
from graphrag.index.cache.noop_pipeline_cache import NoopPipelineCache
def load_cache(config: PipelineCacheConfig | None, root_dir: str | None):

View File

@ -5,7 +5,7 @@
from typing import Any
from .pipeline_cache import PipelineCache
from graphrag.index.cache.pipeline_cache import PipelineCache
class InMemoryCache(PipelineCache):

View File

@ -5,7 +5,7 @@
from typing import Any
from .pipeline_cache import PipelineCache
from graphrag.index.cache.pipeline_cache import PipelineCache
class NoopPipelineCache(PipelineCache):

View File

@ -2,90 +2,3 @@
# Licensed under the MIT License
"""The Indexing Engine config typing package root."""
from .cache import (
PipelineBlobCacheConfig,
PipelineCacheConfig,
PipelineCacheConfigTypes,
PipelineFileCacheConfig,
PipelineMemoryCacheConfig,
PipelineNoneCacheConfig,
)
from .embeddings import (
all_embeddings,
community_full_content_embedding,
community_summary_embedding,
community_title_embedding,
document_text_embedding,
entity_description_embedding,
entity_title_embedding,
relationship_description_embedding,
required_embeddings,
text_unit_text_embedding,
)
from .input import (
PipelineCSVInputConfig,
PipelineInputConfig,
PipelineInputConfigTypes,
PipelineTextInputConfig,
)
from .pipeline import PipelineConfig
from .reporting import (
PipelineBlobReportingConfig,
PipelineConsoleReportingConfig,
PipelineFileReportingConfig,
PipelineReportingConfig,
PipelineReportingConfigTypes,
)
from .storage import (
PipelineBlobStorageConfig,
PipelineFileStorageConfig,
PipelineMemoryStorageConfig,
PipelineStorageConfig,
PipelineStorageConfigTypes,
)
from .workflow import (
PipelineWorkflowConfig,
PipelineWorkflowReference,
PipelineWorkflowStep,
)
__all__ = [
"PipelineBlobCacheConfig",
"PipelineBlobReportingConfig",
"PipelineBlobStorageConfig",
"PipelineCSVInputConfig",
"PipelineCacheConfig",
"PipelineCacheConfigTypes",
"PipelineCacheConfigTypes",
"PipelineCacheConfigTypes",
"PipelineConfig",
"PipelineConsoleReportingConfig",
"PipelineFileCacheConfig",
"PipelineFileReportingConfig",
"PipelineFileStorageConfig",
"PipelineInputConfig",
"PipelineInputConfigTypes",
"PipelineMemoryCacheConfig",
"PipelineMemoryCacheConfig",
"PipelineMemoryStorageConfig",
"PipelineNoneCacheConfig",
"PipelineReportingConfig",
"PipelineReportingConfigTypes",
"PipelineStorageConfig",
"PipelineStorageConfigTypes",
"PipelineTextInputConfig",
"PipelineWorkflowConfig",
"PipelineWorkflowReference",
"PipelineWorkflowStep",
"all_embeddings",
"community_full_content_embedding",
"community_summary_embedding",
"community_title_embedding",
"document_text_embedding",
"entity_description_embedding",
"entity_title_embedding",
"relationship_description_embedding",
"required_embeddings",
"text_unit_text_embedding",
]

View File

@ -11,8 +11,7 @@ from pydantic import BaseModel
from pydantic import Field as pydantic_Field
from graphrag.config.enums import InputFileType, InputType
from .workflow import PipelineWorkflowStep
from graphrag.index.config.workflow import PipelineWorkflowStep
T = TypeVar("T")

View File

@ -9,11 +9,11 @@ from devtools import pformat
from pydantic import BaseModel
from pydantic import Field as pydantic_Field
from .cache import PipelineCacheConfigTypes
from .input import PipelineInputConfigTypes
from .reporting import PipelineReportingConfigTypes
from .storage import PipelineStorageConfigTypes
from .workflow import PipelineWorkflowReference
from graphrag.index.config.cache import PipelineCacheConfigTypes
from graphrag.index.config.input import PipelineInputConfigTypes
from graphrag.index.config.reporting import PipelineReportingConfigTypes
from graphrag.index.config.storage import PipelineStorageConfigTypes
from graphrag.index.config.workflow import PipelineWorkflowReference
class PipelineConfig(BaseModel):

View File

@ -7,8 +7,8 @@
from dataclasses import dataclass as dc_dataclass
from dataclasses import field
from .cache import PipelineCache
from .storage.pipeline_storage import PipelineStorage
from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.storage.pipeline_storage import PipelineStorage
@dc_dataclass

View File

@ -14,7 +14,9 @@ from graphrag.config.enums import (
StorageType,
TextEmbeddingTarget,
)
from graphrag.config.models import GraphRagConfig, StorageConfig, TextEmbeddingConfig
from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.config.models.storage_config import StorageConfig
from graphrag.config.models.text_embedding_config import TextEmbeddingConfig
from graphrag.index.config.cache import (
PipelineBlobCacheConfig,
PipelineCacheConfigTypes,

View File

@ -2,20 +2,3 @@
# Licensed under the MIT License
"""Definitions for emitting pipeline artifacts to storage."""
from .csv_table_emitter import CSVTableEmitter
from .factories import create_table_emitter, create_table_emitters
from .json_table_emitter import JsonTableEmitter
from .parquet_table_emitter import ParquetTableEmitter
from .table_emitter import TableEmitter
from .types import TableEmitterType
__all__ = [
"CSVTableEmitter",
"JsonTableEmitter",
"ParquetTableEmitter",
"TableEmitter",
"TableEmitterType",
"create_table_emitter",
"create_table_emitters",
]

View File

@ -7,9 +7,8 @@ import logging
import pandas as pd
from graphrag.index.storage import PipelineStorage
from .table_emitter import TableEmitter
from graphrag.index.emit.table_emitter import TableEmitter
from graphrag.index.storage.pipeline_storage import PipelineStorage
log = logging.getLogger(__name__)

View File

@ -3,15 +3,14 @@
"""Table Emitter Factories."""
from graphrag.index.storage import PipelineStorage
from graphrag.index.emit.csv_table_emitter import CSVTableEmitter
from graphrag.index.emit.json_table_emitter import JsonTableEmitter
from graphrag.index.emit.parquet_table_emitter import ParquetTableEmitter
from graphrag.index.emit.table_emitter import TableEmitter
from graphrag.index.emit.types import TableEmitterType
from graphrag.index.storage.pipeline_storage import PipelineStorage
from graphrag.index.typing import ErrorHandlerFn
from .csv_table_emitter import CSVTableEmitter
from .json_table_emitter import JsonTableEmitter
from .parquet_table_emitter import ParquetTableEmitter
from .table_emitter import TableEmitter
from .types import TableEmitterType
def create_table_emitter(
emitter_type: TableEmitterType, storage: PipelineStorage, on_error: ErrorHandlerFn

View File

@ -7,9 +7,8 @@ import logging
import pandas as pd
from graphrag.index.storage import PipelineStorage
from .table_emitter import TableEmitter
from graphrag.index.emit.table_emitter import TableEmitter
from graphrag.index.storage.pipeline_storage import PipelineStorage
log = logging.getLogger(__name__)

View File

@ -9,11 +9,10 @@ import traceback
import pandas as pd
from pyarrow.lib import ArrowInvalid, ArrowTypeError
from graphrag.index.storage import PipelineStorage
from graphrag.index.emit.table_emitter import TableEmitter
from graphrag.index.storage.pipeline_storage import PipelineStorage
from graphrag.index.typing import ErrorHandlerFn
from .table_emitter import TableEmitter
log = logging.getLogger(__name__)

View File

@ -11,7 +11,7 @@ from datashaper import (
VerbCallbacks,
)
from graphrag.index.cache import PipelineCache
from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.operations.cluster_graph import cluster_graph
from graphrag.index.operations.embed_graph import embed_graph
from graphrag.index.operations.extract_entities import extract_entities
@ -22,7 +22,7 @@ from graphrag.index.operations.snapshot_rows import snapshot_rows
from graphrag.index.operations.summarize_descriptions import (
summarize_descriptions,
)
from graphrag.index.storage import PipelineStorage
from graphrag.index.storage.pipeline_storage import PipelineStorage
async def create_base_entity_graph(

View File

@ -16,8 +16,8 @@ from datashaper import (
from graphrag.index.operations.chunk_text import chunk_text
from graphrag.index.operations.snapshot import snapshot
from graphrag.index.storage import PipelineStorage
from graphrag.index.utils import gen_md5_hash
from graphrag.index.storage.pipeline_storage import PipelineStorage
from graphrag.index.utils.hashing import gen_md5_hash
async def create_base_text_units(

View File

@ -11,7 +11,7 @@ from datashaper import (
VerbCallbacks,
)
from graphrag.index.cache import PipelineCache
from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.graph.extractors.community_reports.schemas import (
CLAIM_DESCRIPTION,
CLAIM_DETAILS,

View File

@ -12,7 +12,7 @@ from datashaper import (
VerbCallbacks,
)
from graphrag.index.cache import PipelineCache
from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.operations.extract_covariates import (
extract_covariates,
)

View File

@ -13,7 +13,7 @@ from datashaper import (
from graphrag.index.operations.layout_graph import layout_graph
from graphrag.index.operations.snapshot import snapshot
from graphrag.index.operations.unpack_graph import unpack_graph
from graphrag.index.storage import PipelineStorage
from graphrag.index.storage.pipeline_storage import PipelineStorage
async def create_final_nodes(

View File

@ -10,7 +10,7 @@ from datashaper import (
VerbCallbacks,
)
from graphrag.index.cache import PipelineCache
from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.config.embeddings import (
community_full_content_embedding,
community_summary_embedding,
@ -23,7 +23,7 @@ from graphrag.index.config.embeddings import (
)
from graphrag.index.operations.embed_text import embed_text
from graphrag.index.operations.snapshot import snapshot
from graphrag.index.storage import PipelineStorage
from graphrag.index.storage.pipeline_storage import PipelineStorage
log = logging.getLogger(__name__)

View File

@ -3,6 +3,6 @@
"""The Indexing Engine graph embedding package root."""
from .embedding import NodeEmbeddings, embed_nod2vec
from graphrag.index.graph.embedding.embedding import NodeEmbeddings, embed_nod2vec
__all__ = ["NodeEmbeddings", "embed_nod2vec"]

View File

@ -5,7 +5,6 @@
from dataclasses import dataclass
import graspologic as gc
import networkx as nx
import numpy as np
@ -28,6 +27,9 @@ def embed_nod2vec(
random_seed: int = 86,
) -> NodeEmbeddings:
"""Generate node embeddings using Node2Vec."""
# NOTE: This import is done here to reduce the initial import time of the graphrag package
import graspologic as gc
# generate embedding
lcc_tensors = gc.embed.node2vec_embed( # type: ignore
graph=graph,
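
With `graspologic` imported inside `embed_nod2vec` (and `umap` deferred the same way in the layout code further down), importing the module itself is now cheap. A rough way to see what that defers, assuming both libraries are installed in your environment:

```python
# Single-shot timings of the heavy imports that this commit defers; numbers
# are rough and machine-dependent, but show what module import no longer pays.
import importlib
import time


def timed_import(name: str) -> float:
    start = time.perf_counter()
    importlib.import_module(name)
    return time.perf_counter() - start


for module in ("graspologic", "umap"):
    print(f"{module}: {timed_import(module):.2f}s")
```

Running the interpreter with `python -X importtime` gives a more detailed per-module breakdown.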

View File

@ -3,11 +3,11 @@
"""The Indexing Engine graph extractors package root."""
from .claims import ClaimExtractor
from .community_reports import (
from graphrag.index.graph.extractors.claims import ClaimExtractor
from graphrag.index.graph.extractors.community_reports import (
CommunityReportsExtractor,
)
from .graph import GraphExtractionResult, GraphExtractor
from graphrag.index.graph.extractors.graph import GraphExtractionResult, GraphExtractor
__all__ = [
"ClaimExtractor",

View File

@ -3,6 +3,6 @@
"""The Indexing Engine graph extractors claims package root."""
from .claim_extractor import ClaimExtractor
from graphrag.index.graph.extractors.claims.claim_extractor import ClaimExtractor
__all__ = ["ClaimExtractor"]

View File

@ -4,12 +4,17 @@
"""The Indexing Engine community reports package root."""
import graphrag.index.graph.extractors.community_reports.schemas as schemas
from .build_mixed_context import build_mixed_context
from .community_reports_extractor import CommunityReportsExtractor
from .prep_community_report_context import prep_community_report_context
from .sort_context import sort_context
from .utils import (
from graphrag.index.graph.extractors.community_reports.build_mixed_context import (
build_mixed_context,
)
from graphrag.index.graph.extractors.community_reports.community_reports_extractor import (
CommunityReportsExtractor,
)
from graphrag.index.graph.extractors.community_reports.prep_community_report_context import (
prep_community_report_context,
)
from graphrag.index.graph.extractors.community_reports.sort_context import sort_context
from graphrag.index.graph.extractors.community_reports.utils import (
filter_claims_to_nodes,
filter_edges_to_nodes,
filter_nodes_to_level,

View File

@ -5,10 +5,9 @@
import pandas as pd
import graphrag.index.graph.extractors.community_reports.schemas as schemas
from graphrag.index.graph.extractors.community_reports.sort_context import sort_context
from graphrag.query.llm.text_utils import num_tokens
from .sort_context import sort_context
def build_mixed_context(context: list[dict], max_tokens: int) -> str:
"""

View File

@ -9,7 +9,7 @@ from dataclasses import dataclass
from typing import Any
from graphrag.index.typing import ErrorHandlerFn
from graphrag.index.utils import dict_has_keys_with_types
from graphrag.index.utils.dicts import dict_has_keys_with_types
from graphrag.llm import CompletionLLM
from graphrag.prompts.index.community_report import COMMUNITY_REPORT_PROMPT

View File

@ -9,6 +9,11 @@ from typing import cast
import pandas as pd
import graphrag.index.graph.extractors.community_reports.schemas as schemas
from graphrag.index.graph.extractors.community_reports.build_mixed_context import (
build_mixed_context,
)
from graphrag.index.graph.extractors.community_reports.sort_context import sort_context
from graphrag.index.graph.extractors.community_reports.utils import set_context_size
from graphrag.index.utils.dataframes import (
antijoin,
drop_columns,
@ -19,10 +24,6 @@ from graphrag.index.utils.dataframes import (
where_column_equals,
)
from .build_mixed_context import build_mixed_context
from .sort_context import sort_context
from .utils import set_context_size
log = logging.getLogger(__name__)

View File

@ -3,7 +3,7 @@
"""The Indexing Engine unipartite graph package root."""
from .graph_extractor import (
from graphrag.index.graph.extractors.graph.graph_extractor import (
DEFAULT_ENTITY_TYPES,
GraphExtractionResult,
GraphExtractor,

View File

@ -15,7 +15,7 @@ import tiktoken
import graphrag.config.defaults as defs
from graphrag.index.typing import ErrorHandlerFn
from graphrag.index.utils import clean_str
from graphrag.index.utils.string import clean_str
from graphrag.llm import CompletionLLM
from graphrag.prompts.index.entity_extraction import (
CONTINUE_PROMPT,

View File

@ -3,7 +3,7 @@
"""The Indexing Engine unipartite graph package root."""
from .description_summary_extractor import (
from graphrag.index.graph.extractors.summarize.description_summary_extractor import (
SummarizationResult,
SummarizeExtractor,
)

View File

@ -3,7 +3,7 @@
"""The Indexing Engine graph utils package root."""
from .normalize_node_names import normalize_node_names
from .stable_lcc import stable_largest_connected_component
from graphrag.index.graph.utils.normalize_node_names import normalize_node_names
from graphrag.index.graph.utils.stable_lcc import stable_largest_connected_component
__all__ = ["normalize_node_names", "stable_largest_connected_component"]

View File

@ -6,13 +6,15 @@
from typing import Any, cast
import networkx as nx
from graspologic.utils import largest_connected_component
from .normalize_node_names import normalize_node_names
from graphrag.index.graph.utils.normalize_node_names import normalize_node_names
def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph:
"""Return the largest connected component of the graph, with nodes and edges sorted in a stable way."""
# NOTE: The import is done here to reduce the initial import time of the module
from graspologic.utils import largest_connected_component
graph = graph.copy()
graph = cast(nx.Graph, largest_connected_component(graph))
graph = normalize_node_names(graph)

View File

@ -3,8 +3,11 @@
"""The Indexing Engine graph visualization package root."""
from .compute_umap_positions import compute_umap_positions, get_zero_positions
from .typing import GraphLayout, NodePosition
from graphrag.index.graph.visualization.compute_umap_positions import (
compute_umap_positions,
get_zero_positions,
)
from graphrag.index.graph.visualization.typing import GraphLayout, NodePosition
__all__ = [
"GraphLayout",

View File

@ -3,13 +3,11 @@
"""A module containing compute_umap_positions and visualize_embedding method definition."""
import graspologic as gc
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import umap
from .typing import NodePosition
from graphrag.index.graph.visualization.typing import NodePosition
def get_zero_positions(
@ -61,6 +59,9 @@ def compute_umap_positions(
random_state: int = 86,
) -> list[NodePosition]:
"""Project embedding vectors down to 2D/3D using UMAP."""
# NOTE: This import is done here to reduce the initial import time of the graphrag package
import umap
embedding_positions = umap.UMAP(
min_dist=min_dist,
n_neighbors=n_neighbors,
@ -105,6 +106,9 @@ def visualize_embedding(
umap_positions: list[dict],
):
"""Project embedding down to 2D using UMAP and visualize."""
# NOTE: This import is done here to reduce the initial import time of the graphrag package
import graspologic as gc
# rendering
plt.clf()
figure = plt.gcf()
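Both deferrals above (umap and graspologic) target libraries that are slow to import and only needed on visualization paths. One quick way to verify the effect is to time a cold import in a fresh interpreter; a rough sketch follows, where the numbers are entirely machine- and environment-dependent and the package name is simply whichever one is under test:

```python
"""Time a cold package import; run in a fresh interpreter for each measurement."""
import importlib
import time

t0 = time.perf_counter()
importlib.import_module("graphrag")  # any package whose startup cost you want to measure
print(f"import took {time.perf_counter() - t0:.3f}s")
```

CPython's `python -X importtime` flag gives a finer per-module breakdown of the same cost.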

View File

@ -2,7 +2,3 @@
# Licensed under the MIT License
"""The Indexing Engine input package root."""
from .load_input import load_input
__all__ = ["load_input"]
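This hunk shows the commit's other technique: package `__init__.py` files stop re-exporting their submodules, so importing the package no longer eagerly loads everything underneath it. Callers now spell out the defining module, as the rest of this diff does; a minimal illustration of the before/after for this particular symbol:

```python
# Before: the package __init__ re-exported the loader, so merely importing the
# package pulled in load_input and everything it imports.
# from graphrag.index.input import load_input

# After: import from the defining module directly; nothing else in the package
# is imported as a side effect.
from graphrag.index.input.load_input import load_input
```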

View File

@ -10,10 +10,10 @@ from typing import cast
import pandas as pd
from graphrag.index.config import PipelineCSVInputConfig, PipelineInputConfig
from graphrag.index.storage import PipelineStorage
from graphrag.index.utils import gen_md5_hash
from graphrag.logging import ProgressReporter
from graphrag.index.config.input import PipelineCSVInputConfig, PipelineInputConfig
from graphrag.index.storage.pipeline_storage import PipelineStorage
from graphrag.index.utils.hashing import gen_md5_hash
from graphrag.logging.base import ProgressReporter
log = logging.getLogger(__name__)

View File

@ -10,18 +10,17 @@ from typing import cast
import pandas as pd
from graphrag.config import InputConfig, InputType
from graphrag.index.config import PipelineInputConfig
from graphrag.index.storage import (
BlobPipelineStorage,
FilePipelineStorage,
)
from graphrag.logging import NullProgressReporter, ProgressReporter
from .csv import input_type as csv
from .csv import load as load_csv
from .text import input_type as text
from .text import load as load_text
from graphrag.config.enums import InputType
from graphrag.config.models.input_config import InputConfig
from graphrag.index.config.input import PipelineInputConfig
from graphrag.index.input.csv import input_type as csv
from graphrag.index.input.csv import load as load_csv
from graphrag.index.input.text import input_type as text
from graphrag.index.input.text import load as load_text
from graphrag.index.storage.blob_pipeline_storage import BlobPipelineStorage
from graphrag.index.storage.file_pipeline_storage import FilePipelineStorage
from graphrag.logging.base import ProgressReporter
from graphrag.logging.null_progress import NullProgressReporter
log = logging.getLogger(__name__)
loaders: dict[str, Callable[..., Awaitable[pd.DataFrame]]] = {

View File

@ -10,10 +10,10 @@ from typing import Any
import pandas as pd
from graphrag.index.config import PipelineInputConfig
from graphrag.index.storage import PipelineStorage
from graphrag.index.utils import gen_md5_hash
from graphrag.logging import ProgressReporter
from graphrag.index.config.input import PipelineInputConfig
from graphrag.index.storage.pipeline_storage import PipelineStorage
from graphrag.index.utils.hashing import gen_md5_hash
from graphrag.logging.base import ProgressReporter
DEFAULT_FILE_PATTERN = re.compile(
r".*[\\/](?P<source>[^\\/]+)[\\/](?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})_(?P<author>[^_]+)_\d+\.txt"

View File

@ -2,13 +2,3 @@
# Licensed under the MIT License
"""The Indexing Engine LLM package root."""
from .load_llm import load_llm, load_llm_embeddings
from .types import TextListSplitter, TextSplitter
__all__ = [
"TextListSplitter",
"TextSplitter",
"load_llm",
"load_llm_embeddings",
]

View File

@ -27,7 +27,7 @@ from graphrag.llm import (
if TYPE_CHECKING:
from datashaper import VerbCallbacks
from graphrag.index.cache import PipelineCache
from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.typing import ErrorHandlerFn
log = logging.getLogger(__name__)

View File

@ -9,10 +9,9 @@ from pathlib import Path
import yaml
from pyaml_env import parse_config as parse_config_with_env
from graphrag.config import create_graphrag_config, read_dotenv
from graphrag.index.config import PipelineConfig
from .create_pipeline_config import create_pipeline_config
from graphrag.config.create_graphrag_config import create_graphrag_config, read_dotenv
from graphrag.index.config.pipeline import PipelineConfig
from graphrag.index.create_pipeline_config import create_pipeline_config
def load_pipeline_config(config_or_path: str | PipelineConfig) -> PipelineConfig:

View File

@ -3,6 +3,10 @@
"""The Indexing Engine text chunk package root."""
from .chunk_text import ChunkStrategy, ChunkStrategyType, chunk_text
from graphrag.index.operations.chunk_text.chunk_text import (
ChunkStrategy,
ChunkStrategyType,
chunk_text,
)
__all__ = ["ChunkStrategy", "ChunkStrategyType", "chunk_text"]

View File

@ -12,7 +12,11 @@ from datashaper import (
progress_ticker,
)
from .typing import ChunkInput, ChunkStrategy, ChunkStrategyType
from graphrag.index.operations.chunk_text.typing import (
ChunkInput,
ChunkStrategy,
ChunkStrategyType,
)
def chunk_text(
@ -117,14 +121,13 @@ def load_strategy(strategy: ChunkStrategyType) -> ChunkStrategy:
"""Load strategy method definition."""
match strategy:
case ChunkStrategyType.tokens:
from .strategies import run_tokens
from graphrag.index.operations.chunk_text.strategies import run_tokens
return run_tokens
case ChunkStrategyType.sentence:
# NLTK
from graphrag.index.bootstrap import bootstrap
from .strategies import run_sentences
from graphrag.index.operations.chunk_text.strategies import run_sentences
bootstrap()
return run_sentences
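The strategy loader above imports each strategy module only when that strategy is chosen, so unused strategies (and NLTK, in the sentence case) never enter the import graph. A self-contained sketch of the same dispatch shape, using stdlib modules rather than the project's strategies:

```python
from collections.abc import Callable
from enum import Enum


class Format(str, Enum):
    """Hypothetical output formats, used only for illustration."""

    json = "json"
    csv = "csv"


def load_writer(fmt: Format) -> Callable[[list[dict]], str]:
    """Return a writer, importing its backing module only when selected."""
    match fmt:
        case Format.json:
            import json  # imported only if the JSON writer is requested

            return lambda rows: json.dumps(rows)
        case Format.csv:
            import csv  # csv and io are imported only for the CSV path
            import io

            def write_csv(rows: list[dict]) -> str:
                buffer = io.StringIO()
                writer = csv.DictWriter(buffer, fieldnames=rows[0].keys())  # assumes at least one row
                writer.writeheader()
                writer.writerows(rows)
                return buffer.getvalue()

            return write_csv
        case _:
            msg = f"Unknown format: {fmt}"
            raise ValueError(msg)
```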

View File

@ -11,9 +11,8 @@ import tiktoken
from datashaper import ProgressTicker
import graphrag.config.defaults as defs
from graphrag.index.text_splitting import Tokenizer
from .typing import TextChunk
from graphrag.index.operations.chunk_text.typing import TextChunk
from graphrag.index.text_splitting.text_splitting import Tokenizer
def run_tokens(

View File

@ -11,10 +11,9 @@ from typing import Any, cast
import networkx as nx
import pandas as pd
from datashaper import VerbCallbacks, progress_iterable
from graspologic.partition import hierarchical_leiden
from graphrag.index.graph.utils import stable_largest_connected_component
from graphrag.index.utils import gen_uuid
from graphrag.index.utils.uuid import gen_uuid
Communities = list[tuple[int, str, list[str]]]
@ -187,6 +186,9 @@ def _compute_leiden_communities(
seed=0xDEADBEEF,
) -> dict[int, dict[str, int]]:
"""Return Leiden root communities."""
# NOTE: This import is done here to reduce the initial import time of the graphrag package
from graspologic.partition import hierarchical_leiden
if use_lcc:
graph = stable_largest_connected_component(graph)
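As with the earlier graspologic deferral, the only cost of this change is paid on the first call; subsequent executions of the `import` statement resolve to a `sys.modules` lookup. A tiny check, with illustrative timings only and assuming standard CPython:

```python
import sys
import timeit


def lazy_call() -> int:
    import math  # found in sys.modules after the first call

    return math.factorial(10)


print("math" in sys.modules)  # may already be True, depending on the interpreter
print(timeit.timeit(lazy_call, number=100_000))  # repeated in-function imports stay cheap
```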

View File

@ -3,7 +3,10 @@
"""The Indexing Engine graph embed package root."""
from .embed_graph import EmbedGraphStrategyType, embed_graph
from .typing import NodeEmbeddings
from graphrag.index.operations.embed_graph.embed_graph import (
EmbedGraphStrategyType,
embed_graph,
)
from graphrag.index.operations.embed_graph.typing import NodeEmbeddings
__all__ = ["EmbedGraphStrategyType", "NodeEmbeddings", "embed_graph"]

View File

@ -12,9 +12,8 @@ from datashaper import VerbCallbacks, derive_from_rows
from graphrag.index.graph.embedding import embed_nod2vec
from graphrag.index.graph.utils import stable_largest_connected_component
from graphrag.index.utils import load_graph
from .typing import NodeEmbeddings
from graphrag.index.operations.embed_graph.typing import NodeEmbeddings
from graphrag.index.utils.load_graph import load_graph
class EmbedGraphStrategyType(str, Enum):

View File

@ -3,6 +3,9 @@
"""The Indexing Engine text embed package root."""
from .embed_text import TextEmbedStrategyType, embed_text
from graphrag.index.operations.embed_text.embed_text import (
TextEmbedStrategyType,
embed_text,
)
__all__ = ["TextEmbedStrategyType", "embed_text"]

View File

@ -11,15 +11,11 @@ import numpy as np
import pandas as pd
from datashaper import VerbCallbacks
from graphrag.index.cache import PipelineCache
from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.operations.embed_text.strategies.typing import TextEmbeddingStrategy
from graphrag.utils.embeddings import create_collection_name
from graphrag.vector_stores import (
BaseVectorStore,
VectorStoreDocument,
VectorStoreFactory,
)
from .strategies.typing import TextEmbeddingStrategy
from graphrag.vector_stores.base import BaseVectorStore, VectorStoreDocument
from graphrag.vector_stores.factory import VectorStoreFactory
log = logging.getLogger(__name__)
@ -242,11 +238,15 @@ def load_strategy(strategy: TextEmbedStrategyType) -> TextEmbeddingStrategy:
"""Load strategy method definition."""
match strategy:
case TextEmbedStrategyType.openai:
from .strategies.openai import run as run_openai
from graphrag.index.operations.embed_text.strategies.openai import (
run as run_openai,
)
return run_openai
case TextEmbedStrategyType.mock:
from .strategies.mock import run as run_mock
from graphrag.index.operations.embed_text.strategies.mock import (
run as run_mock,
)
return run_mock
case _:

View File

@ -9,9 +9,8 @@ from typing import Any
from datashaper import ProgressTicker, VerbCallbacks, progress_ticker
from graphrag.index.cache import PipelineCache
from .typing import TextEmbeddingResult
from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.operations.embed_text.strategies.typing import TextEmbeddingResult
async def run( # noqa RUF029 async is required for interface

View File

@ -11,14 +11,13 @@ import numpy as np
from datashaper import ProgressTicker, VerbCallbacks, progress_ticker
import graphrag.config.defaults as defs
from graphrag.index.cache import PipelineCache
from graphrag.index.llm import load_llm_embeddings
from graphrag.index.text_splitting import TokenTextSplitter
from graphrag.index.utils import is_null
from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.llm.load_llm import load_llm_embeddings
from graphrag.index.operations.embed_text.strategies.typing import TextEmbeddingResult
from graphrag.index.text_splitting.text_splitting import TokenTextSplitter
from graphrag.index.utils.is_null import is_null
from graphrag.llm import EmbeddingLLM, OpenAIConfiguration
from .typing import TextEmbeddingResult
log = logging.getLogger(__name__)

View File

@ -8,7 +8,7 @@ from dataclasses import dataclass
from datashaper import VerbCallbacks
from graphrag.index.cache import PipelineCache
from graphrag.index.cache.pipeline_cache import PipelineCache
@dataclass

View File

@ -3,6 +3,9 @@
"""The Indexing Engine text extract claims package root."""
from .extract_covariates import ExtractClaimsStrategyType, extract_covariates
from graphrag.index.operations.extract_covariates.extract_covariates import (
ExtractClaimsStrategyType,
extract_covariates,
)
__all__ = ["ExtractClaimsStrategyType", "extract_covariates"]

View File

@ -14,9 +14,12 @@ from datashaper import (
derive_from_rows,
)
from graphrag.index.cache import PipelineCache
from .typing import Covariate, CovariateExtractStrategy, ExtractClaimsStrategyType
from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.operations.extract_covariates.typing import (
Covariate,
CovariateExtractStrategy,
ExtractClaimsStrategyType,
)
log = logging.getLogger(__name__)
@ -72,7 +75,9 @@ def load_strategy(strategy_type: ExtractClaimsStrategyType) -> CovariateExtractS
"""Load strategy method definition."""
match strategy_type:
case ExtractClaimsStrategyType.graph_intelligence:
from .strategies import run_graph_intelligence
from graphrag.index.operations.extract_covariates.strategies import (
run_graph_intelligence,
)
return run_graph_intelligence
case _:

View File

@ -9,15 +9,14 @@ from typing import Any
from datashaper import VerbCallbacks
import graphrag.config.defaults as defs
from graphrag.index.cache import PipelineCache
from graphrag.index.cache.pipeline_cache import PipelineCache
from graphrag.index.graph.extractors.claims import ClaimExtractor
from graphrag.index.llm import load_llm
from graphrag.llm import CompletionLLM
from .typing import (
from graphrag.index.llm.load_llm import load_llm
from graphrag.index.operations.extract_covariates.typing import (
Covariate,
CovariateExtractionResult,
)
from graphrag.llm import CompletionLLM
async def run_graph_intelligence(

Some files were not shown because too many files have changed in this diff.