Restructure project as monorepo. (#2111)

* Restructure project as monorepo.
This commit is contained in:
Derek Worthen 2025-11-04 09:51:56 -08:00 committed by GitHub
parent c43a58c353
commit 619269243d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
404 changed files with 2285 additions and 1863 deletions

View File

@ -31,7 +31,7 @@ jobs:
- name: Install dependencies
shell: bash
run: uv sync
run: uv sync --all-packages
- name: mkdocs build
shell: bash

View File

@ -67,7 +67,7 @@ jobs:
- name: Install dependencies
shell: bash
run: |
uv sync
uv sync --all-packages
uv pip install gensim
- name: Check
@ -76,7 +76,7 @@ jobs:
- name: Build
run: |
uv build
uv build --all-packages
- name: Unit Test
run: |

View File

@ -67,12 +67,12 @@ jobs:
- name: Install dependencies
shell: bash
run: |
uv sync
uv sync --all-packages
uv pip install gensim
- name: Build
run: |
uv build
uv build --all-packages
- name: Install and start Azurite
shell: bash

View File

@ -67,7 +67,7 @@ jobs:
- name: Install dependencies
shell: bash
run: |
uv sync
uv sync --all-packages
uv pip install gensim
- name: Notebook Test

View File

@ -15,7 +15,6 @@ jobs:
runs-on: ubuntu-latest
environment:
name: pypi
url: https://pypi.org/p/graphrag
permissions:
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
@ -35,18 +34,14 @@ jobs:
- name: Install dependencies
shell: bash
run: uv sync
run: uv sync --all-packages
- name: Export Publication Version
run: echo "version=$(uv version --short)" >> $GITHUB_OUTPUT
- name: Build Distributable
shell: bash
run: uv build
run: uv run poe build
- name: Publish package distributions to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
packages-dir: dist
skip-existing: true
verbose: true
run: uv publish

View File

@ -72,12 +72,12 @@ jobs:
- name: Install dependencies
shell: bash
run: |
uv sync
uv sync --all-packages
uv pip install gensim
- name: Build
run: |
uv build
uv build --all-packages
- name: Install and start Azurite
shell: bash

3
.gitignore vendored
View File

@ -58,3 +58,6 @@ docsite/
# Jupyter notebook
.ipynb_checkpoints/
# Root build assets
packages/*/LICENSE

29
.vscode/launch.json vendored
View File

@ -10,7 +10,7 @@
"args": [
"index",
"--root",
"<path_to_index_folder>"
"${input:root_folder}"
],
"console": "integratedTerminal"
},
@ -22,9 +22,9 @@
"args": [
"query",
"--root",
"<path_to_index_folder>",
"--method", "basic",
"--query", "What are the top themes in this story",
"${input:root_folder}",
"--method", "${input:query_method}",
"--query", "${input:query}"
]
},
{
@ -35,7 +35,7 @@
"args": [
"poe", "prompt-tune",
"--config",
"<path_to_ragtest_root_demo>/settings.yaml",
"${input:root_folder}/settings.yaml",
]
},
{
@ -74,5 +74,22 @@
"console": "integratedTerminal",
"justMyCode": false
},
]
],
"inputs": [
{
"id": "root_folder",
"type": "promptString",
"description": "Enter the root folder path"
},
{
"id": "query_method",
"type": "promptString",
"description": "Enter the query method (e.g., 'global', 'local')"
},
{
"id": "query",
"type": "promptString",
"description": "Enter the query text"
}
]
}

View File

@ -28,9 +28,8 @@
"from pathlib import Path\n",
"from pprint import pprint\n",
"\n",
"import pandas as pd\n",
"\n",
"import graphrag.api as api\n",
"import pandas as pd\n",
"from graphrag.config.load_config import load_config\n",
"from graphrag.index.typing.pipeline_run_result import PipelineRunResult"
]

View File

@ -60,7 +60,6 @@
"\n",
"import numpy as np\n",
"import yaml\n",
"\n",
"from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig\n",
"from graphrag.data_model.types import TextEmbedder\n",
"\n",

View File

@ -20,7 +20,6 @@
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"\n",
"from graphrag.config.enums import ModelType\n",
"from graphrag.config.models.drift_search_config import DRIFTSearchConfig\n",
"from graphrag.config.models.language_model_config import LanguageModelConfig\n",

View File

@ -19,7 +19,6 @@
"import os\n",
"\n",
"import pandas as pd\n",
"\n",
"from graphrag.config.enums import ModelType\n",
"from graphrag.config.models.language_model_config import LanguageModelConfig\n",
"from graphrag.language_model.manager import ModelManager\n",

View File

@ -19,7 +19,6 @@
"import os\n",
"\n",
"import pandas as pd\n",
"\n",
"from graphrag.config.enums import ModelType\n",
"from graphrag.config.models.language_model_config import LanguageModelConfig\n",
"from graphrag.language_model.manager import ModelManager\n",

View File

@ -70,7 +70,6 @@
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"from graphrag.utils.storage import (\n",
" delete_table_from_storage,\n",
" load_table_from_storage,\n",

View File

@ -30,9 +30,8 @@
"from pathlib import Path\n",
"from pprint import pprint\n",
"\n",
"import pandas as pd\n",
"\n",
"import graphrag.api as api\n",
"import pandas as pd\n",
"from graphrag.config.load_config import load_config\n",
"from graphrag.index.typing.pipeline_run_result import PipelineRunResult"
]

View File

@ -19,7 +19,6 @@
"import os\n",
"\n",
"import pandas as pd\n",
"\n",
"from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig\n",
"from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey\n",
"from graphrag.query.indexer_adapters import (\n",

View File

@ -0,0 +1,51 @@
# GraphRAG Common
## Factory module
```python
from abc import ABC, abstractmethod

from graphrag_common.factory import Factory


class SampleABC(ABC):
    """Interface implemented by every service the sample factory creates."""

    @abstractmethod
    def get_value(self) -> str:
        """Return the value held by the service."""
        msg = "Subclasses must implement the get_value method."
        raise NotImplementedError(msg)


class ConcreteClass(SampleABC):
    """A SampleABC implementation that stores the string it was built with."""

    def __init__(self, value: str):
        self._value = value

    def get_value(self) -> str:
        """Return the value passed to the constructor."""
        return self._value


class SampleFactory(Factory[SampleABC]):
    """A Factory for SampleABC classes."""


factory = SampleFactory()

# Registering transient services
# A new one is created for every request
factory.register("some_strategy", ConcreteClass)

trans1 = factory.create("some_strategy", {"value": "test1"})
trans2 = factory.create("some_strategy", {"value": "test2"})

assert trans1 is not trans2
assert trans1.get_value() == "test1"
assert trans2.get_value() == "test2"

# Registering singleton services
# After first creation, the same one is returned every time
factory.register("some_other_strategy", ConcreteClass, scope="singleton")

single1 = factory.create("some_other_strategy", {"value": "singleton"})
# The init args of later calls are not used: the cached instance is returned.
single2 = factory.create("some_other_strategy", {"value": "ignored"})

assert single1 is single2
assert single1.get_value() == "singleton"
assert single2.get_value() == "singleton"
```

View File

@ -0,0 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""GraphRAG Common package."""

View File

@ -0,0 +1,8 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""The GraphRAG factory module."""
from graphrag_common.factory.factory import Factory
__all__ = ["Factory"]

View File

@ -5,10 +5,21 @@
from abc import ABC
from collections.abc import Callable
from typing import Any, ClassVar, Generic, TypeVar
from dataclasses import dataclass
from typing import Any, ClassVar, Generic, Literal, TypeVar
T = TypeVar("T", covariant=True)
ServiceScope = Literal["singleton", "transient"]
@dataclass
class _ServiceDescriptor(Generic[T]):
"""Descriptor for a service."""
scope: ServiceScope
initializer: Callable[..., T]
class Factory(ABC, Generic[T]):
"""Abstract base class for factories."""
@ -23,18 +34,24 @@ class Factory(ABC, Generic[T]):
def __init__(self):
if not hasattr(self, "_initialized"):
self._services: dict[str, Callable[..., T]] = {}
self._service_initializers: dict[str, _ServiceDescriptor[T]] = {}
self._initialized_services: dict[str, T] = {}
self._initialized = True
def __contains__(self, strategy: str) -> bool:
"""Check if a strategy is registered."""
return strategy in self._services
return strategy in self._service_initializers
def keys(self) -> list[str]:
"""Get a list of registered strategy names."""
return list(self._services.keys())
return list(self._service_initializers.keys())
def register(self, strategy: str, initializer: Callable[..., T]) -> None:
def register(
self,
strategy: str,
initializer: Callable[..., T],
scope: ServiceScope = "transient",
) -> None:
"""
Register a new service.
@ -42,8 +59,9 @@ class Factory(ABC, Generic[T]):
----
strategy: The name of the strategy.
initializer: A callable that creates an instance of T.
scope: The service scope, either 'singleton' or 'transient'.
"""
self._services[strategy] = initializer
self._service_initializers[strategy] = _ServiceDescriptor(scope, initializer)
def create(self, strategy: str, init_args: dict[str, Any] | None = None) -> T:
"""
@ -62,7 +80,16 @@ class Factory(ABC, Generic[T]):
------
ValueError: If the strategy is not registered.
"""
if strategy not in self._services:
msg = f"Strategy '{strategy}' is not registered."
if strategy not in self._service_initializers:
msg = f"Strategy '{strategy}' is not registered. Registered strategies are: {', '.join(list(self._service_initializers.keys()))}"
raise ValueError(msg)
return self._services[strategy](**(init_args or {}))
service_descriptor = self._service_initializers[strategy]
if service_descriptor.scope == "singleton":
if strategy not in self._initialized_services:
self._initialized_services[strategy] = service_descriptor.initializer(
**(init_args or {})
)
return self._initialized_services[strategy]
return service_descriptor.initializer(**(init_args or {}))

View File

@ -0,0 +1,41 @@
[project]
name = "graphrag-common"
version = "2.7.0"
description = "Common utilities and types for GraphRAG"
authors = [
{name = "Alonso Guevara Fernández", email = "alonsog@microsoft.com"},
{name = "Andrés Morales Esquivel", email = "andresmor@microsoft.com"},
{name = "Chris Trevino", email = "chtrevin@microsoft.com"},
{name = "David Tittsworth", email = "datittsw@microsoft.com"},
{name = "Dayenne de Souza", email = "ddesouza@microsoft.com"},
{name = "Derek Worthen", email = "deworthe@microsoft.com"},
{name = "Gaudy Blanco Meneses", email = "gaudyb@microsoft.com"},
{name = "Ha Trinh", email = "trinhha@microsoft.com"},
{name = "Jonathan Larson", email = "jolarso@microsoft.com"},
{name = "Josh Bradley", email = "joshbradley@microsoft.com"},
{name = "Kate Lytvynets", email = "kalytv@microsoft.com"},
{name = "Kenny Zhang", email = "zhangken@microsoft.com"},
{name = "Mónica Carvajal"},
{name = "Nathan Evans", email = "naevans@microsoft.com"},
{name = "Rodrigo Racanicci", email = "rracanicci@microsoft.com"},
{name = "Sarah Smith", email = "smithsarah@microsoft.com"},
]
license = "MIT"
readme = "README.md"
license-files = ["LICENSE"]
requires-python = ">=3.10,<3.13"
classifiers = [
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
dependencies = []
[project.urls]
Source = "https://github.com/microsoft/graphrag"
[build-system]
requires = ["hatchling>=1.27.0,<2.0.0"]
build-backend = "hatchling.build"

View File

@ -0,0 +1,76 @@
# GraphRAG
👉 [Microsoft Research Blog Post](https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/)<br/>
👉 [Read the docs](https://microsoft.github.io/graphrag)<br/>
👉 [GraphRAG Arxiv](https://arxiv.org/pdf/2404.16130)
<div align="left">
<a href="https://pypi.org/project/graphrag/">
<img alt="PyPI - Version" src="https://img.shields.io/pypi/v/graphrag">
</a>
<a href="https://pypi.org/project/graphrag/">
<img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dm/graphrag">
</a>
<a href="https://github.com/microsoft/graphrag/issues">
<img alt="GitHub Issues" src="https://img.shields.io/github/issues/microsoft/graphrag">
</a>
<a href="https://github.com/microsoft/graphrag/discussions">
<img alt="GitHub Discussions" src="https://img.shields.io/github/discussions/microsoft/graphrag">
</a>
</div>
## Overview
The GraphRAG project is a data pipeline and transformation suite that is designed to extract meaningful, structured data from unstructured text using the power of LLMs.
To learn more about GraphRAG and how it can be used to enhance your LLM's ability to reason about your private data, please visit the <a href="https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/" target="_blank">Microsoft Research Blog Post.</a>
## Quickstart
To get started with the GraphRAG system we recommend trying the [command line quickstart](https://microsoft.github.io/graphrag/get_started/).
## Repository Guidance
This repository presents a methodology for using knowledge graph memory structures to enhance LLM outputs. Please note that the provided code serves as a demonstration and is not an officially supported Microsoft offering.
⚠️ *Warning: GraphRAG indexing can be an expensive operation, please read all of the documentation to understand the process and costs involved, and start small.*
## Diving Deeper
- To learn about our contribution guidelines, see [CONTRIBUTING.md](./CONTRIBUTING.md)
- To start developing _GraphRAG_, see [DEVELOPING.md](./DEVELOPING.md)
- Join the conversation and provide feedback in the [GitHub Discussions tab!](https://github.com/microsoft/graphrag/discussions)
## Prompt Tuning
Using _GraphRAG_ with your data out of the box may not yield the best possible results.
We strongly recommend fine-tuning your prompts by following the [Prompt Tuning Guide](https://microsoft.github.io/graphrag/prompt_tuning/overview/) in our documentation.
## Versioning
Please see the [breaking changes](./breaking-changes.md) document for notes on our approach to versioning the project.
*Always run `graphrag init --root [path] --force` between minor version bumps to ensure you have the latest config format. Run the provided migration notebook between major version bumps if you want to avoid re-indexing prior datasets. Note that this will overwrite your configuration and prompts, so back up if necessary.*
## Responsible AI FAQ
See [RAI_TRANSPARENCY.md](./RAI_TRANSPARENCY.md)
- [What is GraphRAG?](./RAI_TRANSPARENCY.md#what-is-graphrag)
- [What can GraphRAG do?](./RAI_TRANSPARENCY.md#what-can-graphrag-do)
- [What are GraphRAG's intended use(s)?](./RAI_TRANSPARENCY.md#what-are-graphrags-intended-uses)
- [How was GraphRAG evaluated? What metrics are used to measure performance?](./RAI_TRANSPARENCY.md#how-was-graphrag-evaluated-what-metrics-are-used-to-measure-performance)
- [What are the limitations of GraphRAG? How can users minimize the impact of GraphRAG's limitations when using the system?](./RAI_TRANSPARENCY.md#what-are-the-limitations-of-graphrag-how-can-users-minimize-the-impact-of-graphrags-limitations-when-using-the-system)
- [What operational factors and settings allow for effective and responsible use of GraphRAG?](./RAI_TRANSPARENCY.md#what-operational-factors-and-settings-allow-for-effective-and-responsible-use-of-graphrag)
## Trademarks
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
trademarks or logos is subject to and must follow
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
Any use of third-party trademarks or logos is subject to those third parties' policies.
## Privacy
[Microsoft Privacy Statement](https://privacy.microsoft.com/en-us/privacystatement)

View File

@ -5,12 +5,13 @@
from __future__ import annotations
from graphrag_common.factory import Factory
from graphrag.cache.json_pipeline_cache import JsonPipelineCache
from graphrag.cache.memory_pipeline_cache import InMemoryCache
from graphrag.cache.noop_pipeline_cache import NoopPipelineCache
from graphrag.cache.pipeline_cache import PipelineCache
from graphrag.config.enums import CacheType
from graphrag.factory.factory import Factory
from graphrag.storage.blob_pipeline_storage import BlobPipelineStorage
from graphrag.storage.cosmosdb_pipeline_storage import CosmosDBPipelineStorage
from graphrag.storage.file_pipeline_storage import FilePipelineStorage

View File

@ -5,8 +5,9 @@
import logging
from graphrag_common.factory import Factory
from graphrag.config.enums import InputFileType
from graphrag.factory.factory import Factory
from graphrag.index.input.csv import CSVFileReader
from graphrag.index.input.input_reader import InputReader
from graphrag.index.input.json import JSONFileReader

Some files were not shown because too many files have changed in this diff Show More