Change defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS (#499)

* Enlarge the default chunk size, cutting indexing cost and time to roughly 1/4

* Change default gleanings, chunk_size and chunk_overlap

* Update patch-20240710114442871595.json

---------

Co-authored-by: KylinMountain <kose2livs@gmail.com>
Alonso Guevara 2024-07-11 10:22:27 -06:00 committed by GitHub
parent c3852b0749
commit 7a9c9071c1
14 changed files with 46 additions and 32 deletions

View File

@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Modify defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS to reduce time and LLM calls"
}

View File

@@ -134,7 +134,7 @@ These settings control the data input used by the pipeline. Any settings with a
| Parameter | Description | Type | Required or Optional | Default |
| --------------------------- | ------------------------------------------------------------------------------------------- | ----- | -------------------- | ------- |
| `GRAPHRAG_CHUNK_SIZE` | The chunk size in tokens for text-chunk analysis windows. | `str` | optional | 300 |
| `GRAPHRAG_CHUNK_SIZE` | The chunk size in tokens for text-chunk analysis windows. | `str` | optional | 1200 |
| `GRAPHRAG_CHUNK_OVERLAP` | The chunk overlap in tokens for text-chunk analysis windows. | `str` | optional | 100 |
| `GRAPHRAG_CHUNK_BY_COLUMNS` | A comma-separated list of document attributes to group by when performing TextUnit chunking. | `str` | optional | `id` |
@@ -143,14 +143,14 @@ These settings control the data input used by the pipeline. Any settings with a
| Parameter | Description | Type | Required or Optional | Default |
| --------------------------------------------- | ------------------------------------------------------------------------------------------ | -------- | -------------------- | ---------------------------------------------------------------- |
| `GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE` | The path (relative to the root) of an entity extraction prompt template text file. | `str` | optional | `None` |
| `GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting entities in a loop. | `int` | optional | 0 |
| `GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting entities in a loop. | `int` | optional | 1 |
| `GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES` | A comma-separated list of entity types to extract. | `str` | optional | `organization,person,event,geo` |
| `GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE` | The path (relative to the root) of a description summarization prompt template text file. | `str` | optional | `None` |
| `GRAPHRAG_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH` | The maximum number of tokens to generate per description summarization. | `int` | optional | 500 |
| `GRAPHRAG_CLAIM_EXTRACTION_ENABLED` | Whether claim extraction is enabled for this pipeline. | `bool` | optional | `False` |
| `GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION` | The claim_description prompting argument to utilize. | `string` | optional | "Any claims or facts that could be relevant to threat analysis." |
| `GRAPHRAG_CLAIM_EXTRACTION_PROMPT_FILE` | The claim extraction prompt to utilize. | `string` | optional | `None` |
| `GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting claims in a loop. | `int` | optional | 0 |
| `GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting claims in a loop. | `int` | optional | 1 |
| `GRAPHRAG_COMMUNITY_REPORTS_PROMPT_FILE` | The community reports extraction prompt to utilize. | `string` | optional | `None` |
| `GRAPHRAG_COMMUNITY_REPORTS_MAX_LENGTH` | The maximum number of tokens to generate per community reports. | `int` | optional | 1500 |
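
For reference, a minimal sketch of how these environment overrides resolve against the new defaults (the actual graphrag config loader is more elaborate; this only illustrates the fallback order, and the `int(...)` coercion is an assumption of the sketch):

```python
import os

import graphrag.config.defaults as defs

# Sketch only: an unset variable falls back to the shipped default (now 1200 / 100 / 1).
chunk_size = int(os.getenv("GRAPHRAG_CHUNK_SIZE", defs.CHUNK_SIZE))
chunk_overlap = int(os.getenv("GRAPHRAG_CHUNK_OVERLAP", defs.CHUNK_OVERLAP))
max_gleanings = int(
    os.getenv(
        "GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS",
        defs.ENTITY_EXTRACTION_MAX_GLEANINGS,
    )
)
print(chunk_size, chunk_overlap, max_gleanings)  # 1200 100 1 when nothing is set
```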

View File

@@ -117,19 +117,19 @@ GRAPHRAG_EMBEDDING_API_VERSION="api_version" # For Azure OpenAI Users and if GRA
# GRAPHRAG_INPUT_ENCODING=utf-8
# Data Chunking
# GRAPHRAG_CHUNK_SIZE=300
# GRAPHRAG_CHUNK_SIZE=1200
# GRAPHRAG_CHUNK_OVERLAP=100
# GRAPHRAG_CHUNK_BY_COLUMNS=id
# Prompting Overrides
# GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE=None
# GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS=0
# GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS=1
# GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES=organization,person,event,geo
# GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE=None
# GRAPHRAG_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH=500
# GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION="Any claims or facts that could be relevant to threat analysis."
# GRAPHRAG_CLAIM_EXTRACTION_PROMPT_FILE=None
# GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS=0
# GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS=1
# GRAPHRAG_COMMUNITY_REPORT_PROMPT_FILE=None
# GRAPHRAG_COMMUNITY_REPORT_MAX_LENGTH=1500

View File

@@ -45,19 +45,19 @@ EMBEDDING_TARGET = TextEmbeddingTarget.required
CACHE_TYPE = CacheType.file
CACHE_BASE_DIR = "cache"
CHUNK_SIZE = 300
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 100
CHUNK_GROUP_BY_COLUMNS = ["id"]
CLAIM_DESCRIPTION = (
"Any claims or facts that could be relevant to information discovery."
)
CLAIM_MAX_GLEANINGS = 0
CLAIM_MAX_GLEANINGS = 1
CLAIM_EXTRACTION_ENABLED = False
MAX_CLUSTER_SIZE = 10
COMMUNITY_REPORT_MAX_LENGTH = 2000
COMMUNITY_REPORT_MAX_INPUT_LENGTH = 8000
ENTITY_EXTRACTION_ENTITY_TYPES = ["organization", "person", "geo", "event"]
ENTITY_EXTRACTION_MAX_GLEANINGS = 0
ENTITY_EXTRACTION_MAX_GLEANINGS = 1
INPUT_FILE_TYPE = InputFileType.text
INPUT_TYPE = InputType.file
INPUT_BASE_DIR = "input"
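
A rough back-of-envelope (assuming a sliding token window with stride `chunk_size - chunk_overlap`; the corpus size below is a made-up illustration) shows why the larger chunks cut the number of extraction calls so sharply:

```python
import math

def estimate_chunks(corpus_tokens: int, chunk_size: int, chunk_overlap: int) -> int:
    """Approximate chunk count for a sliding token window."""
    stride = chunk_size - chunk_overlap
    return math.ceil(max(corpus_tokens - chunk_overlap, 1) / stride)

corpus = 100_000  # hypothetical corpus size in tokens
print(estimate_chunks(corpus, 300, 100))   # old defaults -> 500 chunks
print(estimate_chunks(corpus, 1200, 100))  # new defaults -> 91 chunks
```

Each chunk drives at least one extraction LLM call, so fewer chunks translate directly into less time and cost; the extra gleaning pass (0 -> 1) spends a little of that saving to recover entities the first pass missed.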

View File

@@ -10,6 +10,7 @@ from typing import Any
import tiktoken
import graphrag.config.defaults as defs
from graphrag.index.typing import ErrorHandlerFn
from graphrag.llm import CompletionLLM
@@ -80,7 +81,9 @@ class ClaimExtractor:
self._input_resolved_entities_key = (
input_resolved_entities_key or "resolved_entities"
)
self._max_gleanings = max_gleanings if max_gleanings is not None else 0
self._max_gleanings = (
max_gleanings if max_gleanings is not None else defs.CLAIM_MAX_GLEANINGS
)
self._on_error = on_error or (lambda _e, _s, _d: None)
# Construct the looping arguments

View File

@@ -14,6 +14,7 @@ from typing import Any
import networkx as nx
import tiktoken
import graphrag.config.defaults as defs
from graphrag.index.typing import ErrorHandlerFn
from graphrag.index.utils import clean_str
from graphrag.llm import CompletionLLM
@@ -78,7 +79,11 @@ class GraphExtractor:
)
self._entity_types_key = entity_types_key or "entity_types"
self._extraction_prompt = prompt or GRAPH_EXTRACTION_PROMPT
self._max_gleanings = max_gleanings if max_gleanings is not None else 0
self._max_gleanings = (
max_gleanings
if max_gleanings is not None
else defs.ENTITY_EXTRACTION_MAX_GLEANINGS
)
self._on_error = on_error or (lambda _e, _s, _d: None)
# Construct the looping arguments
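
Both extractors feed `max_gleanings` into the same looping pattern. A simplified sketch of that loop follows; the prompt strings and the `llm` callable are illustrative stand-ins, not the library's exact prompts or API:

```python
CONTINUE_PROMPT = "MANY entities were missed in the last extraction. Add them below:"
LOOP_PROMPT = "It appears some entities may have still been missed. Answer Y or N."

async def extract_with_gleanings(llm, text: str, max_gleanings: int = 1) -> str:
    """One initial extraction, then up to max_gleanings 'glean' passes."""
    output = await llm(f"Extract entities and relationships from:\n{text}")
    for i in range(max_gleanings):
        output += await llm(CONTINUE_PROMPT)  # ask for anything missed
        if i == max_gleanings - 1:
            break  # last allowed pass: no need to ask whether to continue
        if (await llm(LOOP_PROMPT)).strip().upper() != "Y":
            break  # the model says nothing was missed
    return output
```

With the old default of 0 the loop body never ran; the new default of 1 buys one recall pass per chunk, which stays affordable now that chunks are four times larger.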

View File

@@ -93,7 +93,7 @@ entity_extraction:
## async_mode: override the global async_mode settings for this task
prompt: "prompts/entity_extraction.txt"
entity_types: [{",".join(defs.ENTITY_EXTRACTION_ENTITY_TYPES)}]
max_gleanings: 0
max_gleanings: {defs.ENTITY_EXTRACTION_MAX_GLEANINGS}
summarize_descriptions:
## llm: override the global llm settings for this task

View File

@@ -8,6 +8,7 @@ from typing import Any
from datashaper import VerbCallbacks
import graphrag.config.defaults as defs
from graphrag.config.enums import LLMType
from graphrag.index.cache import PipelineCache
from graphrag.index.graph.extractors.claims import ClaimExtractor
@@ -49,7 +50,7 @@ async def _execute(
strategy_config: dict[str, Any],
) -> CovariateExtractionResult:
extraction_prompt = strategy_config.get("extraction_prompt")
max_gleanings = strategy_config.get("max_gleanings", 0)
max_gleanings = strategy_config.get("max_gleanings", defs.CLAIM_MAX_GLEANINGS)
tuple_delimiter = strategy_config.get("tuple_delimiter")
record_delimiter = strategy_config.get("record_delimiter")
completion_delimiter = strategy_config.get("completion_delimiter")
@@ -60,9 +61,9 @@ async def _execute(
extraction_prompt=extraction_prompt,
max_gleanings=max_gleanings,
encoding_model=encoding_model,
on_error=lambda e, s, d: reporter.error("Claim Extraction Error", e, s, d)
if reporter
else None,
on_error=lambda e, s, d: (
reporter.error("Claim Extraction Error", e, s, d) if reporter else None
),
)
claim_description = strategy_config.get("claim_description")
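
The caller-side effect: a strategy dict that omits `max_gleanings` (the common case for generated settings) now inherits the shared default instead of a hard-coded 0. A minimal illustration with a made-up config:

```python
import graphrag.config.defaults as defs

strategy_config = {"claim_description": "Any claims relevant to the analysis."}
max_gleanings = strategy_config.get("max_gleanings", defs.CLAIM_MAX_GLEANINGS)
print(max_gleanings)  # 1 -- previously this silently fell back to 0
```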

View File

@@ -108,8 +108,8 @@ async def entity_extract(
prechunked: true | false # Optional, If the document is already chunked beforehand, otherwise this will chunk the document into smaller bits. default: false
encoding_name: cl100k_base # Optional, The encoding to use for the LLM, if not already prechunked, default: cl100k_base
chunk_size: 2500 # Optional, The chunk size to use for the LLM, if not already prechunked, default: 2500
chunk_overlap: 300 # Optional, The chunk overlap to use for the LLM, if not already prechunked, default: 300
chunk_size: 1000 # Optional, The chunk size to use for the LLM, if not already prechunked, default: 1200
chunk_overlap: 100 # Optional, The chunk overlap to use for the LLM, if not already prechunked, default: 100
llm: # The configuration for the LLM
type: openai # the type of llm to use, available options are: openai, azure, openai_chat, azure_openai_chat. The last two being chat based LLMs.

View File

@@ -6,6 +6,7 @@
import networkx as nx
from datashaper import VerbCallbacks
import graphrag.config.defaults as defs
from graphrag.config.enums import LLMType
from graphrag.index.cache import PipelineCache
from graphrag.index.graph.extractors.graph import GraphExtractor
@@ -52,8 +53,8 @@ async def run_extract_entities(
# Chunking Arguments
prechunked = args.get("prechunked", False)
chunk_size = args.get("chunk_size", 2500)
chunk_overlap = args.get("chunk_overlap", 300)
chunk_size = args.get("chunk_size", defs.CHUNK_SIZE)
chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)
# Extraction Arguments
tuple_delimiter = args.get("tuple_delimiter", None)
@@ -61,7 +62,7 @@ async def run_extract_entities(
completion_delimiter = args.get("completion_delimiter", None)
extraction_prompt = args.get("extraction_prompt", None)
encoding_model = args.get("encoding_name", None)
max_gleanings = args.get("max_gleanings", None)
max_gleanings = args.get("max_gleanings", defs.ENTITY_EXTRACTION_MAX_GLEANINGS)
# note: We're not using UnipartiteGraphChain.from_params
# because we want to pass "timeout" to the llm_kwargs

View File

@@ -9,20 +9,18 @@ from typing import Any
import tiktoken
from datashaper import ProgressTicker
import graphrag.config.defaults as defs
from graphrag.index.text_splitting import Tokenizer
from graphrag.index.verbs.text.chunk.typing import TextChunk
DEFAULT_CHUNK_SIZE = 2500 # tokens
DEFAULT_CHUNK_OVERLAP = 300 # tokens
def run(
input: list[str], args: dict[str, Any], tick: ProgressTicker
) -> Iterable[TextChunk]:
"""Chunks text into multiple parts. A pipeline verb."""
tokens_per_chunk = args.get("chunk_size", DEFAULT_CHUNK_SIZE)
chunk_overlap = args.get("chunk_overlap", DEFAULT_CHUNK_OVERLAP)
encoding_name = args.get("encoding_name", "cl100k_base")
tokens_per_chunk = args.get("chunk_size", defs.CHUNK_SIZE)
chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)
encoding_name = args.get("encoding_name", defs.ENCODING_MODEL)
enc = tiktoken.get_encoding(encoding_name)
def encode(text: str) -> list[int]:
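
For intuition, a self-contained sketch of the token-window chunking this verb performs under the new defaults (the function name is illustrative; the real verb emits TextChunk objects and reports progress):

```python
import tiktoken

def chunk_by_tokens(
    text: str,
    chunk_size: int = 1200,
    chunk_overlap: int = 100,
    encoding_name: str = "cl100k_base",
) -> list[str]:
    """Slice text into windows of chunk_size tokens, overlapping by chunk_overlap."""
    enc = tiktoken.get_encoding(encoding_name)
    tokens = enc.encode(text)
    stride = chunk_size - chunk_overlap
    chunks: list[str] = []
    for start in range(0, max(len(tokens), 1), stride):
        chunks.append(enc.decode(tokens[start : start + chunk_size]))
        if start + chunk_size >= len(tokens):
            break
    return chunks
```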

View File

@@ -73,8 +73,8 @@ def chunk(
```yaml
strategy:
type: tokens
chunk_size: 1000 # Optional, The chunk size to use, default: 1000
chunk_overlap: 300 # Optional, The chunk overlap to use, default: 300
chunk_size: 1200 # Optional, The chunk size to use, default: 1200
chunk_overlap: 100 # Optional, The chunk overlap to use, default: 100
```
### sentence

View File

@@ -10,6 +10,7 @@ from typing import Any
import numpy as np
from datashaper import ProgressTicker, VerbCallbacks, progress_ticker
import graphrag.config.defaults as defs
from graphrag.index.cache import PipelineCache
from graphrag.index.llm import load_llm_embeddings
from graphrag.index.text_splitting import TokenTextSplitter
@@ -68,7 +69,7 @@ def _get_splitter(
config: OpenAIConfiguration, batch_max_tokens: int
) -> TokenTextSplitter:
return TokenTextSplitter(
encoding_name=config.encoding_model or "cl100k_base",
encoding_name=config.encoding_model or defs.ENCODING_MODEL,
chunk_size=batch_max_tokens,
)

View File

@@ -9,6 +9,7 @@ from typing import Any
from datashaper import VerbCallbacks
import graphrag.config.defaults as defs
from graphrag.config.enums import LLMType
from graphrag.index.cache import PipelineCache
from graphrag.index.llm import load_llm
@@ -40,8 +41,8 @@ async def run(
)
language = args.get("language", "English")
prompt = args.get("prompt")
chunk_size = args.get("chunk_size", 2500)
chunk_overlap = args.get("chunk_overlap", 0)
chunk_size = args.get("chunk_size", defs.CHUNK_SIZE)
chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)
input = [input] if isinstance(input, str) else input
return TextTranslationResult(