Merge remote-tracking branch 'origin/main' into custom_vector_store_schema

Gaudy Blanco 2025-09-18 20:05:51 -06:00
commit f6d923ea62
7 changed files with 33 additions and 59 deletions

View File

@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Remove hard-coded community rate limiter."
+}

View File

@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Fix multi-index search."
+}

View File

@@ -65,11 +65,10 @@ deployment_name: <azure_model_deployment_name>
 ```
 #### Using Managed Auth on Azure
-To use managed auth, add an additional value to your model config and comment out or remove the api_key line:
+To use managed auth, edit the auth_type in your model config and *remove* the api_key line:
 ```yaml
 auth_type: azure_managed_identity # Default auth_type is api_key
 # api_key: ${GRAPHRAG_API_KEY}
 ```
 You will also need to login with [az login](https://learn.microsoft.com/en-us/cli/azure/authenticate-azure-cli) and select the subscription with your endpoint.
@@ -116,4 +115,4 @@ Please refer to [Query Engine](query/overview.md) docs for detailed information
 - For more details about configuring GraphRAG, see the [configuration documentation](config/overview.md).
 - To learn more about Initialization, refer to the [Initialization documentation](config/init.md).
 - For more details about using the CLI, refer to the [CLI documentation](cli.md).
-- Check out our [visualization guide](visualization_guide.md) for a more interactive experience in debugging and exploring the knowledge graph.
+- Check out our [visualization guide](visualization_guide.md) for a more interactive experience in debugging and exploring the knowledge graph.
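
For readers unfamiliar with managed auth: the `azure_managed_identity` setting defers credential discovery to Azure's identity chain instead of a static key. A minimal sketch of the equivalent flow outside GraphRAG, assuming the standard `azure-identity` and `openai` packages; the endpoint and API version are placeholders, and this is not GraphRAG's internal code:

```python
# Illustrative only: what managed-identity auth amounts to, using the public
# azure-identity and openai packages. Endpoint/API version are placeholders.
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI

# DefaultAzureCredential tries `az login`, environment variables, managed
# identity, etc., in order -- no api_key needed anywhere.
token_provider = get_bearer_token_provider(
    DefaultAzureCredential(),
    "https://cognitiveservices.azure.com/.default",
)

client = AzureOpenAI(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    azure_ad_token_provider=token_provider,  # takes the place of api_key
    api_version="2024-06-01",  # placeholder
)
```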

View File

@@ -97,9 +97,19 @@ def create_cosmosdb_cache(**kwargs) -> PipelineCache:
     return JsonPipelineCache(storage)
 
 
+def create_noop_cache(**_kwargs) -> PipelineCache:
+    """Create a no-op cache implementation."""
+    return NoopPipelineCache()
+
+
+def create_memory_cache(**kwargs) -> PipelineCache:
+    """Create a memory cache implementation."""
+    return InMemoryCache(**kwargs)
+
+
 # --- register built-in cache implementations ---
-CacheFactory.register(CacheType.none.value, NoopPipelineCache)
-CacheFactory.register(CacheType.memory.value, InMemoryCache)
+CacheFactory.register(CacheType.none.value, create_noop_cache)
+CacheFactory.register(CacheType.memory.value, create_memory_cache)
 CacheFactory.register(CacheType.file.value, create_file_cache)
 CacheFactory.register(CacheType.blob.value, create_blob_cache)
 CacheFactory.register(CacheType.cosmosdb.value, create_cosmosdb_cache)
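
This hunk swaps direct class registration for factory functions, so everything handed to `CacheFactory.register` is now uniformly a callable that accepts config kwargs and returns a `PipelineCache`. A hypothetical sketch of plugging a custom type into the same hook; the `"custom"` type name is invented, and the import paths are assumptions based on the modules this hunk touches:

```python
# Hypothetical sketch: registering a custom cache through the factory hook
# shown above. The "custom" type name is invented; import paths are assumed.
from graphrag.cache.factory import CacheFactory
from graphrag.cache.memory_pipeline_cache import InMemoryCache
from graphrag.cache.pipeline_cache import PipelineCache


def create_custom_cache(**kwargs) -> PipelineCache:
    """Factory callable: receives config kwargs, returns a PipelineCache."""
    # A real implementation would build its own PipelineCache subclass here;
    # we delegate to the built-in in-memory cache purely for illustration.
    return InMemoryCache(**kwargs)


CacheFactory.register("custom", create_custom_cache)
```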

View File

@@ -16,7 +16,6 @@ from graphrag.index.operations.summarize_communities.typing import (
     Finding,
     StrategyConfig,
 )
-from graphrag.index.utils.rate_limiter import RateLimiter
 from graphrag.language_model.manager import ModelManager
 from graphrag.language_model.protocol.base import ChatModel
@@ -51,8 +50,6 @@ async def _run_extractor(
     level: int,
     args: StrategyConfig,
 ) -> CommunityReport | None:
-    # RateLimiter
-    rate_limiter = RateLimiter(rate=1, per=60)
     extractor = CommunityReportsExtractor(
         model,
         extraction_prompt=args.get("extraction_prompt", None),
@@ -63,7 +60,6 @@ async def _run_extractor(
     )
     try:
-        await rate_limiter.acquire()
         results = await extractor(input)
         report = results.structured_output
         if report is None:

View File

@@ -1,40 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""Rate limiter utility."""
-
-import asyncio
-import time
-
-
-class RateLimiter:
-    """
-    The original TpmRpmLLMLimiter strategy did not account for minute-based rate limiting when scheduled.
-
-    The RateLimiter was introduced to ensure that the CommunityReportsExtractor could be scheduled to adhere to rate configurations on a per-minute basis.
-    """
-
-    # TODO: RateLimiter scheduled: using asyncio for async_mode
-
-    def __init__(self, rate: int, per: int):
-        self.rate = rate
-        self.per = per
-        self.allowance = rate
-        self.last_check = time.monotonic()
-
-    async def acquire(self):
-        """Acquire a token from the rate limiter."""
-        current = time.monotonic()
-        elapsed = current - self.last_check
-        self.last_check = current
-        self.allowance += elapsed * (self.rate / self.per)
-        if self.allowance > self.rate:
-            self.allowance = self.rate
-        if self.allowance < 1.0:
-            sleep_time = (1.0 - self.allowance) * (self.per / self.rate)
-            await asyncio.sleep(sleep_time)
-            self.allowance = 0.0
-        else:
-            self.allowance -= 1.0
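
The deleted class is a classic token bucket: `acquire()` refills `allowance` at `rate / per` tokens per second, sleeps when less than one token is available, and otherwise spends a token. With the hard-coded `(rate=1, per=60)` from `_run_extractor` this throttled community reports to roughly one per minute, which is what the changelog entry above removes. A small behavioral sketch, assuming the class body above is in scope and scaling the constants down to `(1, 2)` so it finishes in seconds:

```python
# Behavioral sketch of the deleted RateLimiter (assumed in scope), scaled
# down from the hard-coded rate=1, per=60 to rate=1, per=2 for a quick run.
import asyncio
import time


async def demo() -> None:
    limiter = RateLimiter(rate=1, per=2)
    start = time.monotonic()
    for i in range(3):
        await limiter.acquire()
        print(f"call {i} at t={time.monotonic() - start:.1f}s")
    # Expected output, roughly: t=0.0s, t=2.0s, t=2.0s. The third call is
    # immediate because refill accrued during the forced sleep is credited
    # back on the next acquire() -- a quirk of this implementation.


asyncio.run(demo())
```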

View File

@@ -190,57 +190,58 @@ def update_context_data(
     """
     updated_context_data = {}
     for key in context_data:
+        entries = context_data[key].to_dict(orient="records")
         updated_entry = []
         if key == "reports":
             updated_entry = [
                 dict(
-                    {k: entry[k] for k in entry},
+                    entry,
                     index_name=links["community_reports"][int(entry["id"])][
                         "index_name"
                     ],
                     index_id=links["community_reports"][int(entry["id"])]["id"],
                 )
-                for entry in context_data[key]
+                for entry in entries
             ]
         if key == "entities":
             updated_entry = [
                 dict(
-                    {k: entry[k] for k in entry},
+                    entry,
                     entity=entry["entity"].split("-")[0],
                     index_name=links["entities"][int(entry["id"])]["index_name"],
                     index_id=links["entities"][int(entry["id"])]["id"],
                 )
-                for entry in context_data[key]
+                for entry in entries
             ]
         if key == "relationships":
             updated_entry = [
                 dict(
-                    {k: entry[k] for k in entry},
+                    entry,
                     source=entry["source"].split("-")[0],
                     target=entry["target"].split("-")[0],
                     index_name=links["relationships"][int(entry["id"])]["index_name"],
                     index_id=links["relationships"][int(entry["id"])]["id"],
                 )
-                for entry in context_data[key]
+                for entry in entries
             ]
         if key == "claims":
             updated_entry = [
                 dict(
-                    {k: entry[k] for k in entry},
+                    entry,
                     entity=entry["entity"].split("-")[0],
                     index_name=links["covariates"][int(entry["id"])]["index_name"],
                     index_id=links["covariates"][int(entry["id"])]["id"],
                 )
-                for entry in context_data[key]
+                for entry in entries
             ]
         if key == "sources":
             updated_entry = [
                 dict(
-                    {k: entry[k] for k in entry},
+                    entry,
                     index_name=links["text_units"][int(entry["id"])]["index_name"],
                     index_id=links["text_units"][int(entry["id"])]["id"],
                 )
-                for entry in context_data[key]
+                for entry in entries
             ]
         updated_context_data[key] = updated_entry
     return updated_context_data
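
Two idioms carry this refactor: `to_dict(orient="records")` converts each DataFrame to row dicts once (iterating a DataFrame directly yields column labels, not rows, which is presumably the bug behind the "Fix multi-index search" changelog entry above), and `dict(entry, **extras)` copies a row while adding keys, replacing the `{k: entry[k] for k in entry}` rebuild. A minimal self-contained sketch on invented data:

```python
# Minimal sketch of the two idioms above, on invented data.
import pandas as pd

reports = pd.DataFrame({"id": [0, 1], "title": ["A", "B"]})
links = {0: {"index_name": "idx-a", "id": 7}, 1: {"index_name": "idx-b", "id": 9}}

# Iterating `reports` directly would yield "id", "title" (column labels);
# to_dict(orient="records") yields one plain dict per row instead.
entries = reports.to_dict(orient="records")

# dict(entry, **extras) copies each row dict and appends the link fields.
updated = [
    dict(
        entry,
        index_name=links[int(entry["id"])]["index_name"],
        index_id=links[int(entry["id"])]["id"],
    )
    for entry in entries
]
print(updated)
# [{'id': 0, 'title': 'A', 'index_name': 'idx-a', 'index_id': 7},
#  {'id': 1, 'title': 'B', 'index_name': 'idx-b', 'index_id': 9}]
```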