Change defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS (#499)

* Enlarge the default chunk size, cutting indexing cost and time to roughly 1/4

* Change default gleanings, chunk_size and chunk_overlap

* Update patch-20240710114442871595.json

---------

Co-authored-by: KylinMountain <kose2livs@gmail.com>
Alonso Guevara 2024-07-11 10:22:27 -06:00 committed by GitHub
parent c3852b0749
commit 7a9c9071c1
14 changed files with 46 additions and 32 deletions

View File

@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Modify defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS to reduce time and LLM calls"
}

View File

@@ -134,7 +134,7 @@ These settings control the data input used by the pipeline. Any settings with a
| Parameter | Description | Type | Required or Optional | Default |
| --------------------------- | ------------------------------------------------------------------------------------------- | ----- | -------------------- | ------- |
| `GRAPHRAG_CHUNK_SIZE` | The chunk size in tokens for text-chunk analysis windows. | `str` | optional | 300 |
| `GRAPHRAG_CHUNK_SIZE` | The chunk size in tokens for text-chunk analysis windows. | `str` | optional | 1200 |
| `GRAPHRAG_CHUNK_OVERLAP` | The chunk overlap in tokens for text-chunk analysis windows. | `str` | optional | 100 |
| `GRAPHRAG_CHUNK_BY_COLUMNS` | A comma-separated list of document attributes to group by when performing TextUnit chunking. | `str` | optional | `id` |
@@ -143,14 +143,14 @@ These settings control the data input used by the pipeline. Any settings with a
| Parameter | Description | Type | Required or Optional | Default |
| --------------------------------------------- | ------------------------------------------------------------------------------------------ | -------- | -------------------- | ---------------------------------------------------------------- |
| `GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE` | The path (relative to the root) of an entity extraction prompt template text file. | `str` | optional | `None` |
| `GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting entities in a loop. | `int` | optional | 0 |
| `GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting entities in a loop. | `int` | optional | 1 |
| `GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES` | A comma-separated list of entity types to extract. | `str` | optional | `organization,person,event,geo` |
| `GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE` | The path (relative to the root) of a description summarization prompt template text file. | `str` | optional | `None` |
| `GRAPHRAG_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH` | The maximum number of tokens to generate per description summarization. | `int` | optional | 500 |
| `GRAPHRAG_CLAIM_EXTRACTION_ENABLED` | Whether claim extraction is enabled for this pipeline. | `bool` | optional | `False` |
| `GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION` | The claim_description prompting argument to utilize. | `string` | optional | "Any claims or facts that could be relevant to threat analysis." |
| `GRAPHRAG_CLAIM_EXTRACTION_PROMPT_FILE` | The claim extraction prompt to utilize. | `string` | optional | `None` |
| `GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting claims in a loop. | `int` | optional | 0 |
| `GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting claims in a loop. | `int` | optional | 1 |
| `GRAPHRAG_COMMUNITY_REPORTS_PROMPT_FILE` | The community reports extraction prompt to utilize. | `string` | optional | `None` |
| `GRAPHRAG_COMMUNITY_REPORTS_MAX_LENGTH` | The maximum number of tokens to generate per community reports. | `int` | optional | 1500 |
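
For reference, a minimal sketch of how these environment overrides resolve against the new defaults (the actual graphrag config loader is more elaborate; this only illustrates the fallback order, and the `int(...)` coercion is an assumption of the sketch):

```python
import os

import graphrag.config.defaults as defs

# Sketch only: an unset variable falls back to the shipped default (now 1200 / 100 / 1).
chunk_size = int(os.getenv("GRAPHRAG_CHUNK_SIZE", defs.CHUNK_SIZE))
chunk_overlap = int(os.getenv("GRAPHRAG_CHUNK_OVERLAP", defs.CHUNK_OVERLAP))
max_gleanings = int(
    os.getenv(
        "GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS",
        defs.ENTITY_EXTRACTION_MAX_GLEANINGS,
    )
)
print(chunk_size, chunk_overlap, max_gleanings)  # 1200 100 1 when nothing is set
```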

View File

@@ -117,19 +117,19 @@ GRAPHRAG_EMBEDDING_API_VERSION="api_version" # For Azure OpenAI Users and if GRA
# GRAPHRAG_INPUT_ENCODING=utf-8
# Data Chunking
# GRAPHRAG_CHUNK_SIZE=300
# GRAPHRAG_CHUNK_SIZE=1200
# GRAPHRAG_CHUNK_OVERLAP=100
# GRAPHRAG_CHUNK_BY_COLUMNS=id
# Prompting Overrides
# GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE=None
# GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS=0
# GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS=1
# GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES=organization,person,event,geo
# GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE=None
# GRAPHRAG_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH=500
# GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION="Any claims or facts that could be relevant to threat analysis."
# GRAPHRAG_CLAIM_EXTRACTION_PROMPT_FILE=None
# GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS=0
# GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS=1
# GRAPHRAG_COMMUNITY_REPORT_PROMPT_FILE=None
# GRAPHRAG_COMMUNITY_REPORT_MAX_LENGTH=1500

View File

@@ -45,19 +45,19 @@ EMBEDDING_TARGET = TextEmbeddingTarget.required
CACHE_TYPE = CacheType.file
CACHE_BASE_DIR = "cache"
CHUNK_SIZE = 300
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 100
CHUNK_GROUP_BY_COLUMNS = ["id"]
CLAIM_DESCRIPTION = (
"Any claims or facts that could be relevant to information discovery."
)
CLAIM_MAX_GLEANINGS = 0
CLAIM_MAX_GLEANINGS = 1
CLAIM_EXTRACTION_ENABLED = False
MAX_CLUSTER_SIZE = 10
COMMUNITY_REPORT_MAX_LENGTH = 2000
COMMUNITY_REPORT_MAX_INPUT_LENGTH = 8000
ENTITY_EXTRACTION_ENTITY_TYPES = ["organization", "person", "geo", "event"]
ENTITY_EXTRACTION_MAX_GLEANINGS = 0
ENTITY_EXTRACTION_MAX_GLEANINGS = 1
INPUT_FILE_TYPE = InputFileType.text
INPUT_TYPE = InputType.file
INPUT_BASE_DIR = "input"
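
A rough back-of-envelope (assuming a sliding token window with stride `chunk_size - chunk_overlap`; the corpus size below is a made-up illustration) shows why the larger chunks cut the number of extraction calls so sharply:

```python
import math

def estimate_chunks(corpus_tokens: int, chunk_size: int, chunk_overlap: int) -> int:
    """Approximate chunk count for a sliding token window."""
    stride = chunk_size - chunk_overlap
    return math.ceil(max(corpus_tokens - chunk_overlap, 1) / stride)

corpus = 100_000  # hypothetical corpus size in tokens
print(estimate_chunks(corpus, 300, 100))   # old defaults -> 500 chunks
print(estimate_chunks(corpus, 1200, 100))  # new defaults -> 91 chunks
```

Each chunk drives at least one extraction LLM call, so fewer chunks translate directly into less time and cost; the extra gleaning pass (0 -> 1) spends a little of that saving to recover entities the first pass missed.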

View File

@@ -10,6 +10,7 @@ from typing import Any
import tiktoken
import graphrag.config.defaults as defs
from graphrag.index.typing import ErrorHandlerFn
from graphrag.llm import CompletionLLM
@@ -80,7 +81,9 @@ class ClaimExtractor:
self._input_resolved_entities_key = (
input_resolved_entities_key or "resolved_entities"
)
self._max_gleanings = max_gleanings if max_gleanings is not None else 0
self._max_gleanings = (
max_gleanings if max_gleanings is not None else defs.CLAIM_MAX_GLEANINGS
)
self._on_error = on_error or (lambda _e, _s, _d: None)
# Construct the looping arguments

View File

@@ -14,6 +14,7 @@ from typing import Any
import networkx as nx
import tiktoken
import graphrag.config.defaults as defs
from graphrag.index.typing import ErrorHandlerFn
from graphrag.index.utils import clean_str
from graphrag.llm import CompletionLLM
@@ -78,7 +79,11 @@ class GraphExtractor:
)
self._entity_types_key = entity_types_key or "entity_types"
self._extraction_prompt = prompt or GRAPH_EXTRACTION_PROMPT
self._max_gleanings = max_gleanings if max_gleanings is not None else 0
self._max_gleanings = (
max_gleanings
if max_gleanings is not None
else defs.ENTITY_EXTRACTION_MAX_GLEANINGS
)
self._on_error = on_error or (lambda _e, _s, _d: None)
# Construct the looping arguments
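
Both extractors feed `max_gleanings` into the same looping pattern. A simplified sketch of that loop follows; the prompt strings and the `llm` callable are illustrative stand-ins, not the library's exact prompts or API:

```python
CONTINUE_PROMPT = "MANY entities were missed in the last extraction. Add them below:"
LOOP_PROMPT = "It appears some entities may have still been missed. Answer Y or N."

async def extract_with_gleanings(llm, text: str, max_gleanings: int = 1) -> str:
    """One initial extraction, then up to max_gleanings 'glean' passes."""
    output = await llm(f"Extract entities and relationships from:\n{text}")
    for i in range(max_gleanings):
        output += await llm(CONTINUE_PROMPT)  # ask for anything missed
        if i == max_gleanings - 1:
            break  # last allowed pass: no need to ask whether to continue
        if (await llm(LOOP_PROMPT)).strip().upper() != "Y":
            break  # the model says nothing was missed
    return output
```

With the old default of 0 the loop body never ran; the new default of 1 buys one recall pass per chunk, which stays affordable now that chunks are four times larger.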

View File

@@ -93,7 +93,7 @@ entity_extraction:
## async_mode: override the global async_mode settings for this task
prompt: "prompts/entity_extraction.txt"
entity_types: [{",".join(defs.ENTITY_EXTRACTION_ENTITY_TYPES)}]
max_gleanings: 0
max_gleanings: {defs.ENTITY_EXTRACTION_MAX_GLEANINGS}
summarize_descriptions:
## llm: override the global llm settings for this task

View File

@@ -8,6 +8,7 @@ from typing import Any
from datashaper import VerbCallbacks
import graphrag.config.defaults as defs
from graphrag.config.enums import LLMType
from graphrag.index.cache import PipelineCache
from graphrag.index.graph.extractors.claims import ClaimExtractor
@@ -49,7 +50,7 @@ async def _execute(
strategy_config: dict[str, Any],
) -> CovariateExtractionResult:
extraction_prompt = strategy_config.get("extraction_prompt")
max_gleanings = strategy_config.get("max_gleanings", 0)
max_gleanings = strategy_config.get("max_gleanings", defs.CLAIM_MAX_GLEANINGS)
tuple_delimiter = strategy_config.get("tuple_delimiter")
record_delimiter = strategy_config.get("record_delimiter")
completion_delimiter = strategy_config.get("completion_delimiter")
@@ -60,9 +61,9 @@ async def _execute(
extraction_prompt=extraction_prompt,
max_gleanings=max_gleanings,
encoding_model=encoding_model,
on_error=lambda e, s, d: reporter.error("Claim Extraction Error", e, s, d)
if reporter
else None,
on_error=lambda e, s, d: (
reporter.error("Claim Extraction Error", e, s, d) if reporter else None
),
)
claim_description = strategy_config.get("claim_description")
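
The caller-side effect: a strategy dict that omits `max_gleanings` (the common case for generated settings) now inherits the shared default instead of a hard-coded 0. A minimal illustration with a made-up config:

```python
import graphrag.config.defaults as defs

strategy_config = {"claim_description": "Any claims relevant to the analysis."}
max_gleanings = strategy_config.get("max_gleanings", defs.CLAIM_MAX_GLEANINGS)
print(max_gleanings)  # 1 -- previously this silently fell back to 0
```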

View File

@@ -108,8 +108,8 @@ async def entity_extract(
prechunked: true | false # Optional, If the document is already chunked beforehand, otherwise this will chunk the document into smaller bits. default: false
encoding_name: cl100k_base # Optional, The encoding to use for the LLM, if not already prechunked, default: cl100k_base
chunk_size: 2500 # Optional, The chunk size to use for the LLM, if not already prechunked, default: 2500
chunk_overlap: 300 # Optional, The chunk overlap to use for the LLM, if not already prechunked, default: 300
chunk_size: 1000 # Optional, The chunk size to use for the LLM, if not already prechunked, default: 1200
chunk_overlap: 100 # Optional, The chunk overlap to use for the LLM, if not already prechunked, default: 100
llm: # The configuration for the LLM
type: openai # the type of llm to use, available options are: openai, azure, openai_chat, azure_openai_chat. The last two being chat based LLMs.

View File

@@ -6,6 +6,7 @@
import networkx as nx
from datashaper import VerbCallbacks
import graphrag.config.defaults as defs
from graphrag.config.enums import LLMType
from graphrag.index.cache import PipelineCache
from graphrag.index.graph.extractors.graph import GraphExtractor
@@ -52,8 +53,8 @@ async def run_extract_entities(
# Chunking Arguments
prechunked = args.get("prechunked", False)
chunk_size = args.get("chunk_size", 2500)
chunk_overlap = args.get("chunk_overlap", 300)
chunk_size = args.get("chunk_size", defs.CHUNK_SIZE)
chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)
# Extraction Arguments
tuple_delimiter = args.get("tuple_delimiter", None)
@@ -61,7 +62,7 @@ async def run_extract_entities(
completion_delimiter = args.get("completion_delimiter", None)
extraction_prompt = args.get("extraction_prompt", None)
encoding_model = args.get("encoding_name", None)
max_gleanings = args.get("max_gleanings", None)
max_gleanings = args.get("max_gleanings", defs.ENTITY_EXTRACTION_MAX_GLEANINGS)
# note: We're not using UnipartiteGraphChain.from_params
# because we want to pass "timeout" to the llm_kwargs

View File

@@ -9,20 +9,18 @@ from typing import Any
import tiktoken
from datashaper import ProgressTicker
import graphrag.config.defaults as defs
from graphrag.index.text_splitting import Tokenizer
from graphrag.index.verbs.text.chunk.typing import TextChunk
DEFAULT_CHUNK_SIZE = 2500 # tokens
DEFAULT_CHUNK_OVERLAP = 300 # tokens
def run(
input: list[str], args: dict[str, Any], tick: ProgressTicker
) -> Iterable[TextChunk]:
"""Chunks text into multiple parts. A pipeline verb."""
tokens_per_chunk = args.get("chunk_size", DEFAULT_CHUNK_SIZE)
chunk_overlap = args.get("chunk_overlap", DEFAULT_CHUNK_OVERLAP)
encoding_name = args.get("encoding_name", "cl100k_base")
tokens_per_chunk = args.get("chunk_size", defs.CHUNK_SIZE)
chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)
encoding_name = args.get("encoding_name", defs.ENCODING_MODEL)
enc = tiktoken.get_encoding(encoding_name)
def encode(text: str) -> list[int]:
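
For intuition, a self-contained sketch of the token-window chunking this verb performs under the new defaults (the function name is illustrative; the real verb emits TextChunk objects and reports progress):

```python
import tiktoken

def chunk_by_tokens(
    text: str,
    chunk_size: int = 1200,
    chunk_overlap: int = 100,
    encoding_name: str = "cl100k_base",
) -> list[str]:
    """Slice text into windows of chunk_size tokens, overlapping by chunk_overlap."""
    enc = tiktoken.get_encoding(encoding_name)
    tokens = enc.encode(text)
    stride = chunk_size - chunk_overlap
    chunks: list[str] = []
    for start in range(0, max(len(tokens), 1), stride):
        chunks.append(enc.decode(tokens[start : start + chunk_size]))
        if start + chunk_size >= len(tokens):
            break
    return chunks
```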

View File

@@ -73,8 +73,8 @@ def chunk(
```yaml
strategy:
type: tokens
chunk_size: 1000 # Optional, The chunk size to use, default: 1000
chunk_overlap: 300 # Optional, The chunk overlap to use, default: 300
chunk_size: 1200 # Optional, The chunk size to use, default: 1200
chunk_overlap: 100 # Optional, The chunk overlap to use, default: 100
```
### sentence

View File

@@ -10,6 +10,7 @@ from typing import Any
import numpy as np
from datashaper import ProgressTicker, VerbCallbacks, progress_ticker
import graphrag.config.defaults as defs
from graphrag.index.cache import PipelineCache
from graphrag.index.llm import load_llm_embeddings
from graphrag.index.text_splitting import TokenTextSplitter
@@ -68,7 +69,7 @@ def _get_splitter(
config: OpenAIConfiguration, batch_max_tokens: int
) -> TokenTextSplitter:
return TokenTextSplitter(
encoding_name=config.encoding_model or "cl100k_base",
encoding_name=config.encoding_model or defs.ENCODING_MODEL,
chunk_size=batch_max_tokens,
)

View File

@@ -9,6 +9,7 @@ from typing import Any
from datashaper import VerbCallbacks
import graphrag.config.defaults as defs
from graphrag.config.enums import LLMType
from graphrag.index.cache import PipelineCache
from graphrag.index.llm import load_llm
@@ -40,8 +41,8 @@ async def run(
)
language = args.get("language", "English")
prompt = args.get("prompt")
chunk_size = args.get("chunk_size", 2500)
chunk_overlap = args.get("chunk_overlap", 0)
chunk_size = args.get("chunk_size", defs.CHUNK_SIZE)
chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)
input = [input] if isinstance(input, str) else input
return TextTranslationResult(