mirror of https://github.com/microsoft/graphrag.git
synced 2026-01-14 09:07:20 +08:00

Change defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS (#499)

* Enlarge the chunk size, greatly reducing cost and time (to roughly 1/4)
* Change the default gleanings, chunk_size and chunk_overlap
* Update patch-20240710114442871595.json

Co-authored-by: KylinMountain <kose2livs@gmail.com>
parent c3852b0749
commit 7a9c9071c1
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Modify defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS to reduce time and LLM calls"
+}
@@ -134,7 +134,7 @@ These settings control the data input used by the pipeline. Any settings with a

 | Parameter | Description | Type | Required or Optional | Default |
 | --------------------------- | --------------------------------------------------------------------------------------------- | ----- | -------------------- | ------- |
-| `GRAPHRAG_CHUNK_SIZE` | The chunk size in tokens for text-chunk analysis windows. | `str` | optional | 300 |
+| `GRAPHRAG_CHUNK_SIZE` | The chunk size in tokens for text-chunk analysis windows. | `str` | optional | 1200 |
 | `GRAPHRAG_CHUNK_OVERLAP` | The chunk overlap in tokens for text-chunk analysis windows. | `str` | optional | 100 |
 | `GRAPHRAG_CHUNK_BY_COLUMNS` | A comma-separated list of document attributes to group by when performing TextUnit chunking. | `str` | optional | `id` |
@@ -143,14 +143,14 @@ These settings control the data input used by the pipeline. Any settings with a

 | Parameter | Description | Type | Required or Optional | Default |
 | --------------------------------------------- | ------------------------------------------------------------------------------------------ | -------- | -------------------- | ---------------------------------------------------------------- |
 | `GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE` | The path (relative to the root) of an entity extraction prompt template text file. | `str` | optional | `None` |
-| `GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting entities in a loop. | `int` | optional | 0 |
+| `GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting entities in a loop. | `int` | optional | 1 |
 | `GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES` | A comma-separated list of entity types to extract. | `str` | optional | `organization,person,event,geo` |
 | `GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE` | The path (relative to the root) of a description summarization prompt template text file. | `str` | optional | `None` |
 | `GRAPHRAG_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH` | The maximum number of tokens to generate per description summarization. | `int` | optional | 500 |
 | `GRAPHRAG_CLAIM_EXTRACTION_ENABLED` | Whether claim extraction is enabled for this pipeline. | `bool` | optional | `False` |
 | `GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION` | The claim_description prompting argument to utilize. | `string` | optional | "Any claims or facts that could be relevant to threat analysis." |
 | `GRAPHRAG_CLAIM_EXTRACTION_PROMPT_FILE` | The claim extraction prompt to utilize. | `string` | optional | `None` |
-| `GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting claims in a loop. | `int` | optional | 0 |
+| `GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting claims in a loop. | `int` | optional | 1 |
 | `GRAPHRAG_COMMUNITY_REPORTS_PROMPT_FILE` | The community reports extraction prompt to utilize. | `string` | optional | `None` |
 | `GRAPHRAG_COMMUNITY_REPORTS_MAX_LENGTH` | The maximum number of tokens to generate per community report. | `int` | optional | 1500 |
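For readers unfamiliar with the term: a "gleaning" is a redrive pass in which the extractor asks the model whether the previous pass missed anything, up to `MAX_GLEANINGS` extra rounds. A minimal sketch of the loop shape (`call_llm` and the prompt strings here are hypothetical stand-ins, not GraphRAG's actual prompts):

```python
from typing import Callable

def extract_with_gleanings(
    call_llm: Callable[[str], str], text: str, max_gleanings: int = 1
) -> str:
    """Sketch of a gleaning loop: one base pass plus up to N redrives."""
    results = call_llm(f"Extract entities and relationships from:\n{text}")
    for i in range(max_gleanings):
        # Redrive: ask the model to append anything missed last round.
        results += call_llm("Some entities may have been missed. Add them now.")
        if i == max_gleanings - 1:
            break  # no point asking whether to continue on the final round
        # Loop check: one extra cheap call decides whether to keep going.
        if call_llm("Were any entities missed? Answer YES or NO.") != "YES":
            break
    return results
```

Under this shape, raising the default from 0 to 1 costs at most one extra extraction call per text unit, while the 4x larger chunks reduce the number of text units by much more, which is where the claimed net savings come from.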
@@ -117,19 +117,19 @@ GRAPHRAG_EMBEDDING_API_VERSION="api_version" # For Azure OpenAI Users and if GRA
 # GRAPHRAG_INPUT_ENCODING=utf-8

 # Data Chunking
-# GRAPHRAG_CHUNK_SIZE=300
+# GRAPHRAG_CHUNK_SIZE=1200
 # GRAPHRAG_CHUNK_OVERLAP=100
 # GRAPHRAG_CHUNK_BY_COLUMNS=id

 # Prompting Overrides
 # GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE=None
-# GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS=0
+# GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS=1
 # GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES=organization,person,event,geo
 # GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE=None
 # GRAPHRAG_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH=500
 # GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION="Any claims or facts that could be relevant to threat analysis."
 # GRAPHRAG_CLAIM_EXTRACTION_PROMPT_FILE=None
-# GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS=0
+# GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS=1
 # GRAPHRAG_COMMUNITY_REPORT_PROMPT_FILE=None
 # GRAPHRAG_COMMUNITY_REPORT_MAX_LENGTH=1500
@@ -45,19 +45,19 @@ EMBEDDING_TARGET = TextEmbeddingTarget.required

 CACHE_TYPE = CacheType.file
 CACHE_BASE_DIR = "cache"
-CHUNK_SIZE = 300
+CHUNK_SIZE = 1200
 CHUNK_OVERLAP = 100
 CHUNK_GROUP_BY_COLUMNS = ["id"]
 CLAIM_DESCRIPTION = (
     "Any claims or facts that could be relevant to information discovery."
 )
-CLAIM_MAX_GLEANINGS = 0
+CLAIM_MAX_GLEANINGS = 1
 CLAIM_EXTRACTION_ENABLED = False
 MAX_CLUSTER_SIZE = 10
 COMMUNITY_REPORT_MAX_LENGTH = 2000
 COMMUNITY_REPORT_MAX_INPUT_LENGTH = 8000
 ENTITY_EXTRACTION_ENTITY_TYPES = ["organization", "person", "geo", "event"]
-ENTITY_EXTRACTION_MAX_GLEANINGS = 0
+ENTITY_EXTRACTION_MAX_GLEANINGS = 1
 INPUT_FILE_TYPE = InputFileType.text
 INPUT_TYPE = InputType.file
 INPUT_BASE_DIR = "input"
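The headline claim (cost and time down to roughly 1/4) follows directly from these constants: the sliding-window stride is `CHUNK_SIZE - CHUNK_OVERLAP`, so quadrupling the chunk size cuts the chunk count, and with it the number of per-chunk LLM calls, by more than 4x once overlap is accounted for. A quick back-of-envelope check (the 100,000-token corpus is a made-up figure):

```python
def num_chunks(corpus_tokens: int, chunk_size: int, overlap: int) -> int:
    """Approximate sliding-window chunk count (ceiling division)."""
    stride = chunk_size - overlap
    return -(-(corpus_tokens - overlap) // stride)

corpus = 100_000  # hypothetical corpus size in tokens
print(num_chunks(corpus, 300, 100))   # old defaults -> 500 chunks
print(num_chunks(corpus, 1200, 100))  # new defaults -> 91 chunks, ~5.5x fewer
```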
@@ -10,6 +10,7 @@ from typing import Any

 import tiktoken

+import graphrag.config.defaults as defs
 from graphrag.index.typing import ErrorHandlerFn
 from graphrag.llm import CompletionLLM
@@ -80,7 +81,9 @@ class ClaimExtractor:
         self._input_resolved_entities_key = (
             input_resolved_entities_key or "resolved_entities"
         )
-        self._max_gleanings = max_gleanings if max_gleanings is not None else 0
+        self._max_gleanings = (
+            max_gleanings if max_gleanings is not None else defs.CLAIM_MAX_GLEANINGS
+        )
         self._on_error = on_error or (lambda _e, _s, _d: None)

         # Construct the looping arguments
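Worth noting (my annotation, not part of the commit): the `is not None` guard matters because `0` is a legitimate value for `max_gleanings`. A fallback written with `or` would silently discard an explicit zero, since `0` is falsy in Python:

```python
def resolve(max_gleanings: int | None, default: int = 1) -> tuple[int, int]:
    wrong = max_gleanings or default  # treats 0 as "unset"
    right = max_gleanings if max_gleanings is not None else default
    return wrong, right

print(resolve(0))     # (1, 0) -- `or` overrides the caller's explicit 0
print(resolve(None))  # (1, 1) -- both correctly fall back to the default
```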
@@ -14,6 +14,7 @@ from typing import Any

 import networkx as nx
 import tiktoken

+import graphrag.config.defaults as defs
 from graphrag.index.typing import ErrorHandlerFn
 from graphrag.index.utils import clean_str
 from graphrag.llm import CompletionLLM
@@ -78,7 +79,11 @@ class GraphExtractor:
         )
         self._entity_types_key = entity_types_key or "entity_types"
         self._extraction_prompt = prompt or GRAPH_EXTRACTION_PROMPT
-        self._max_gleanings = max_gleanings if max_gleanings is not None else 0
+        self._max_gleanings = (
+            max_gleanings
+            if max_gleanings is not None
+            else defs.ENTITY_EXTRACTION_MAX_GLEANINGS
+        )
         self._on_error = on_error or (lambda _e, _s, _d: None)

         # Construct the looping arguments
@@ -93,7 +93,7 @@ entity_extraction:
   ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/entity_extraction.txt"
   entity_types: [{",".join(defs.ENTITY_EXTRACTION_ENTITY_TYPES)}]
-  max_gleanings: 0
+  max_gleanings: {defs.ENTITY_EXTRACTION_MAX_GLEANINGS}

 summarize_descriptions:
   ## llm: override the global llm settings for this task
@@ -8,6 +8,7 @@ from typing import Any

 from datashaper import VerbCallbacks

+import graphrag.config.defaults as defs
 from graphrag.config.enums import LLMType
 from graphrag.index.cache import PipelineCache
 from graphrag.index.graph.extractors.claims import ClaimExtractor
@@ -49,7 +50,7 @@ async def _execute(
     strategy_config: dict[str, Any],
 ) -> CovariateExtractionResult:
     extraction_prompt = strategy_config.get("extraction_prompt")
-    max_gleanings = strategy_config.get("max_gleanings", 0)
+    max_gleanings = strategy_config.get("max_gleanings", defs.CLAIM_MAX_GLEANINGS)
     tuple_delimiter = strategy_config.get("tuple_delimiter")
     record_delimiter = strategy_config.get("record_delimiter")
     completion_delimiter = strategy_config.get("completion_delimiter")
@@ -60,9 +61,9 @@ async def _execute(
         extraction_prompt=extraction_prompt,
         max_gleanings=max_gleanings,
         encoding_model=encoding_model,
-        on_error=lambda e, s, d: reporter.error("Claim Extraction Error", e, s, d)
-        if reporter
-        else None,
+        on_error=lambda e, s, d: (
+            reporter.error("Claim Extraction Error", e, s, d) if reporter else None
+        ),
     )

     claim_description = strategy_config.get("claim_description")
@@ -108,8 +108,8 @@ async def entity_extract(

         prechunked: true | false # Optional, if the document is already chunked beforehand; otherwise this will chunk the document into smaller bits. default: false
         encoding_name: cl100k_base # Optional, the encoding to use for the LLM if not already prechunked, default: cl100k_base
-        chunk_size: 2500 # Optional, the chunk size to use for the LLM if not already prechunked, default: 2500
-        chunk_overlap: 300 # Optional, the chunk overlap to use for the LLM if not already prechunked, default: 300
+        chunk_size: 1200 # Optional, the chunk size to use for the LLM if not already prechunked, default: 1200
+        chunk_overlap: 100 # Optional, the chunk overlap to use for the LLM if not already prechunked, default: 100

     llm: # The configuration for the LLM
         type: openai # the type of llm to use, available options are: openai, azure, openai_chat, azure_openai_chat. The last two are chat-based LLMs.
||||
@ -6,6 +6,7 @@
|
||||
import networkx as nx
|
||||
from datashaper import VerbCallbacks
|
||||
|
||||
import graphrag.config.defaults as defs
|
||||
from graphrag.config.enums import LLMType
|
||||
from graphrag.index.cache import PipelineCache
|
||||
from graphrag.index.graph.extractors.graph import GraphExtractor
|
||||
@@ -52,8 +53,8 @@ async def run_extract_entities(

     # Chunking Arguments
     prechunked = args.get("prechunked", False)
-    chunk_size = args.get("chunk_size", 2500)
-    chunk_overlap = args.get("chunk_overlap", 300)
+    chunk_size = args.get("chunk_size", defs.CHUNK_SIZE)
+    chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)

     # Extraction Arguments
     tuple_delimiter = args.get("tuple_delimiter", None)
@@ -61,7 +62,7 @@ async def run_extract_entities(
     completion_delimiter = args.get("completion_delimiter", None)
     extraction_prompt = args.get("extraction_prompt", None)
     encoding_model = args.get("encoding_name", None)
-    max_gleanings = args.get("max_gleanings", None)
+    max_gleanings = args.get("max_gleanings", defs.ENTITY_EXTRACTION_MAX_GLEANINGS)

     # note: We're not using UnipartiteGraphChain.from_params
     # because we want to pass "timeout" to the llm_kwargs
@@ -9,20 +9,18 @@ from typing import Any
 import tiktoken
 from datashaper import ProgressTicker

+import graphrag.config.defaults as defs
 from graphrag.index.text_splitting import Tokenizer
 from graphrag.index.verbs.text.chunk.typing import TextChunk

-DEFAULT_CHUNK_SIZE = 2500  # tokens
-DEFAULT_CHUNK_OVERLAP = 300  # tokens
-

 def run(
     input: list[str], args: dict[str, Any], tick: ProgressTicker
 ) -> Iterable[TextChunk]:
     """Chunks text into multiple parts. A pipeline verb."""
-    tokens_per_chunk = args.get("chunk_size", DEFAULT_CHUNK_SIZE)
-    chunk_overlap = args.get("chunk_overlap", DEFAULT_CHUNK_OVERLAP)
-    encoding_name = args.get("encoding_name", "cl100k_base")
+    tokens_per_chunk = args.get("chunk_size", defs.CHUNK_SIZE)
+    chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)
+    encoding_name = args.get("encoding_name", defs.ENCODING_MODEL)
     enc = tiktoken.get_encoding(encoding_name)

     def encode(text: str) -> list[int]:
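As a standalone illustration of the token-window scheme this verb implements, here is a minimal version using only `tiktoken` and the new defaults (the loop is my own simplification, not GraphRAG's `Tokenizer`):

```python
import tiktoken

def chunk_tokens(
    text: str,
    chunk_size: int = 1200,
    overlap: int = 100,
    encoding_name: str = "cl100k_base",
) -> list[str]:
    """Split text into windows of `chunk_size` tokens, overlapping by `overlap`."""
    enc = tiktoken.get_encoding(encoding_name)
    tokens = enc.encode(text)
    chunks: list[str] = []
    for start in range(0, len(tokens), chunk_size - overlap):
        chunks.append(enc.decode(tokens[start : start + chunk_size]))
        if start + chunk_size >= len(tokens):
            break  # this window already reaches the end of the text
    return chunks
```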
@@ -73,8 +73,8 @@ def chunk(
     ```yaml
     strategy:
         type: tokens
-        chunk_size: 1000 # Optional, The chunk size to use, default: 1000
-        chunk_overlap: 300 # Optional, The chunk overlap to use, default: 300
+        chunk_size: 1200 # Optional, The chunk size to use, default: 1200
+        chunk_overlap: 100 # Optional, The chunk overlap to use, default: 100
     ```

     ### sentence
@@ -10,6 +10,7 @@ from typing import Any
 import numpy as np
 from datashaper import ProgressTicker, VerbCallbacks, progress_ticker

+import graphrag.config.defaults as defs
 from graphrag.index.cache import PipelineCache
 from graphrag.index.llm import load_llm_embeddings
 from graphrag.index.text_splitting import TokenTextSplitter
@@ -68,7 +69,7 @@ def _get_splitter(
     config: OpenAIConfiguration, batch_max_tokens: int
 ) -> TokenTextSplitter:
     return TokenTextSplitter(
-        encoding_name=config.encoding_model or "cl100k_base",
+        encoding_name=config.encoding_model or defs.ENCODING_MODEL,
         chunk_size=batch_max_tokens,
     )
@@ -9,6 +9,7 @@ from typing import Any

 from datashaper import VerbCallbacks

+import graphrag.config.defaults as defs
 from graphrag.config.enums import LLMType
 from graphrag.index.cache import PipelineCache
 from graphrag.index.llm import load_llm
@@ -40,8 +41,8 @@ async def run(
     )
     language = args.get("language", "English")
     prompt = args.get("prompt")
-    chunk_size = args.get("chunk_size", 2500)
-    chunk_overlap = args.get("chunk_overlap", 0)
+    chunk_size = args.get("chunk_size", defs.CHUNK_SIZE)
+    chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)

     input = [input] if isinstance(input, str) else input
     return TextTranslationResult(