diff --git a/.semversioner/next-release/patch-20240710114442871595.json b/.semversioner/next-release/patch-20240710114442871595.json
new file mode 100644
index 00000000..ccdf7d9a
--- /dev/null
+++ b/.semversioner/next-release/patch-20240710114442871595.json
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "Modify defaults for CHUNK_SIZE, CHUNK_OVERLAP and GLEANINGS to reduce time and LLM calls"
+}
diff --git a/docsite/posts/config/env_vars.md b/docsite/posts/config/env_vars.md
index 2667ae55..0dd133d6 100644
--- a/docsite/posts/config/env_vars.md
+++ b/docsite/posts/config/env_vars.md
@@ -134,7 +134,7 @@ These settings control the data input used by the pipeline. Any settings with a

 | Parameter | Description | Type | Required or Optional | Default |
 | --------------------------- | ------------------------------------------------------------------------------------------- | ----- | -------------------- | ------- |
-| `GRAPHRAG_CHUNK_SIZE` | The chunk size in tokens for text-chunk analysis windows. | `str` | optional | 300 |
+| `GRAPHRAG_CHUNK_SIZE` | The chunk size in tokens for text-chunk analysis windows. | `str` | optional | 1200 |
 | `GRAPHRAG_CHUNK_OVERLAP` | The chunk overlap in tokens for text-chunk analysis windows. | `str` | optional | 100 |
 | `GRAPHRAG_CHUNK_BY_COLUMNS` | A comma-separated list of document attributes to group by when performing TextUnit chunking. | `str` | optional | `id` |

@@ -143,14 +143,14 @@ These settings control the data input used by the pipeline. Any settings with a

 | Parameter | Description | Type | Required or Optional | Default |
 | --------------------------------------------- | ------------------------------------------------------------------------------------------ | -------- | -------------------- | ---------------------------------------------------------------- |
 | `GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE` | The path (relative to the root) of an entity extraction prompt template text file. | `str` | optional | `None` |
-| `GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting entities in a loop. | `int` | optional | 0 |
+| `GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting entities in a loop. | `int` | optional | 1 |
 | `GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES` | A comma-separated list of entity types to extract. | `str` | optional | `organization,person,event,geo` |
 | `GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE` | The path (relative to the root) of a description summarization prompt template text file. | `str` | optional | `None` |
 | `GRAPHRAG_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH` | The maximum number of tokens to generate per description summarization. | `int` | optional | 500 |
 | `GRAPHRAG_CLAIM_EXTRACTION_ENABLED` | Whether claim extraction is enabled for this pipeline. | `bool` | optional | `False` |
 | `GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION` | The claim_description prompting argument to utilize. | `string` | optional | "Any claims or facts that could be relevant to threat analysis." |
 | `GRAPHRAG_CLAIM_EXTRACTION_PROMPT_FILE` | The claim extraction prompt to utilize. | `string` | optional | `None` |
-| `GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting claims in a loop. | `int` | optional | 0 |
+| `GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS` | The maximum number of redrives (gleanings) to invoke when extracting claims in a loop. | `int` | optional | 1 |
 | `GRAPHRAG_COMMUNITY_REPORTS_PROMPT_FILE` | The community reports extraction prompt to utilize. | `string` | optional | `None` |
 | `GRAPHRAG_COMMUNITY_REPORTS_MAX_LENGTH` | The maximum number of tokens to generate per community report. | `int` | optional | 1500 |
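Taken together, the chunking changes above (300 -> 1200 tokens per chunk, overlap unchanged at 100) cut the number of text units, and therefore extraction calls, by roughly a factor of five. A back-of-the-envelope sketch, not taken from the codebase, that models the sliding window as stepping by `chunk_size - chunk_overlap`:

```python
import math

def approx_chunk_count(n_tokens: int, chunk_size: int, chunk_overlap: int) -> int:
    """Rough chunk count for a sliding token window stepping by size - overlap."""
    stride = chunk_size - chunk_overlap
    return max(1, math.ceil((n_tokens - chunk_overlap) / stride))

print(approx_chunk_count(10_000, 300, 100))   # old defaults -> 50 chunks
print(approx_chunk_count(10_000, 1200, 100))  # new defaults -> 9 chunks
```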
diff --git a/docsite/posts/config/template.md b/docsite/posts/config/template.md
index 80173b40..d3ff14d5 100644
--- a/docsite/posts/config/template.md
+++ b/docsite/posts/config/template.md
@@ -117,19 +117,19 @@ GRAPHRAG_EMBEDDING_API_VERSION="api_version" # For Azure OpenAI Users and if GRA
 # GRAPHRAG_INPUT_ENCODING=utf-8

 # Data Chunking
-# GRAPHRAG_CHUNK_SIZE=300
+# GRAPHRAG_CHUNK_SIZE=1200
 # GRAPHRAG_CHUNK_OVERLAP=100
 # GRAPHRAG_CHUNK_BY_COLUMNS=id

 # Prompting Overrides
 # GRAPHRAG_ENTITY_EXTRACTION_PROMPT_FILE=None
-# GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS=0
+# GRAPHRAG_ENTITY_EXTRACTION_MAX_GLEANINGS=1
 # GRAPHRAG_ENTITY_EXTRACTION_ENTITY_TYPES=organization,person,event,geo
 # GRAPHRAG_SUMMARIZE_DESCRIPTIONS_PROMPT_FILE=None
 # GRAPHRAG_SUMMARIZE_DESCRIPTIONS_MAX_LENGTH=500
 # GRAPHRAG_CLAIM_EXTRACTION_DESCRIPTION="Any claims or facts that could be relevant to threat analysis."
 # GRAPHRAG_CLAIM_EXTRACTION_PROMPT_FILE=None
-# GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS=0
+# GRAPHRAG_CLAIM_EXTRACTION_MAX_GLEANINGS=1
 # GRAPHRAG_COMMUNITY_REPORT_PROMPT_FILE=None
 # GRAPHRAG_COMMUNITY_REPORT_MAX_LENGTH=1500
diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py
index 62dfacdb..a2a23e80 100644
--- a/graphrag/config/defaults.py
+++ b/graphrag/config/defaults.py
@@ -45,19 +45,19 @@ EMBEDDING_TARGET = TextEmbeddingTarget.required

 CACHE_TYPE = CacheType.file
 CACHE_BASE_DIR = "cache"
-CHUNK_SIZE = 300
+CHUNK_SIZE = 1200
 CHUNK_OVERLAP = 100
 CHUNK_GROUP_BY_COLUMNS = ["id"]
 CLAIM_DESCRIPTION = (
     "Any claims or facts that could be relevant to information discovery."
 )
-CLAIM_MAX_GLEANINGS = 0
+CLAIM_MAX_GLEANINGS = 1
 CLAIM_EXTRACTION_ENABLED = False
 MAX_CLUSTER_SIZE = 10
 COMMUNITY_REPORT_MAX_LENGTH = 2000
 COMMUNITY_REPORT_MAX_INPUT_LENGTH = 8000
 ENTITY_EXTRACTION_ENTITY_TYPES = ["organization", "person", "geo", "event"]
-ENTITY_EXTRACTION_MAX_GLEANINGS = 0
+ENTITY_EXTRACTION_MAX_GLEANINGS = 1
 INPUT_FILE_TYPE = InputFileType.text
 INPUT_TYPE = InputType.file
 INPUT_BASE_DIR = "input"
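Everything below this point swaps hard-coded literals for lookups against this `defaults` module, so a future default change touches exactly one file. A minimal sketch of the pattern, with a hypothetical `resolve_chunking_args` helper standing in for the real call sites (`defs.CHUNK_SIZE` and `defs.CHUNK_OVERLAP` are the actual module attributes):

```python
from typing import Any

import graphrag.config.defaults as defs

def resolve_chunking_args(args: dict[str, Any]) -> tuple[int, int]:
    """Hypothetical helper: prefer caller-supplied values, else shared defaults."""
    chunk_size = args.get("chunk_size", defs.CHUNK_SIZE)           # 1200 after this patch
    chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)  # 100
    return chunk_size, chunk_overlap
```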
diff --git a/graphrag/index/graph/extractors/claims/claim_extractor.py b/graphrag/index/graph/extractors/claims/claim_extractor.py
index 26bb6e3f..e26d2069 100644
--- a/graphrag/index/graph/extractors/claims/claim_extractor.py
+++ b/graphrag/index/graph/extractors/claims/claim_extractor.py
@@ -10,6 +10,7 @@ from typing import Any

 import tiktoken

+import graphrag.config.defaults as defs
 from graphrag.index.typing import ErrorHandlerFn
 from graphrag.llm import CompletionLLM
@@ -80,7 +81,9 @@ class ClaimExtractor:
         self._input_resolved_entities_key = (
             input_resolved_entities_key or "resolved_entities"
         )
-        self._max_gleanings = max_gleanings if max_gleanings is not None else 0
+        self._max_gleanings = (
+            max_gleanings if max_gleanings is not None else defs.CLAIM_MAX_GLEANINGS
+        )
         self._on_error = on_error or (lambda _e, _s, _d: None)

         # Construct the looping arguments
diff --git a/graphrag/index/graph/extractors/graph/graph_extractor.py b/graphrag/index/graph/extractors/graph/graph_extractor.py
index ccd2b729..1daa2e9f 100644
--- a/graphrag/index/graph/extractors/graph/graph_extractor.py
+++ b/graphrag/index/graph/extractors/graph/graph_extractor.py
@@ -14,6 +14,7 @@ from typing import Any
 import networkx as nx
 import tiktoken

+import graphrag.config.defaults as defs
 from graphrag.index.typing import ErrorHandlerFn
 from graphrag.index.utils import clean_str
 from graphrag.llm import CompletionLLM
@@ -78,7 +79,11 @@ class GraphExtractor:
         )
         self._entity_types_key = entity_types_key or "entity_types"
         self._extraction_prompt = prompt or GRAPH_EXTRACTION_PROMPT
-        self._max_gleanings = max_gleanings if max_gleanings is not None else 0
+        self._max_gleanings = (
+            max_gleanings
+            if max_gleanings is not None
+            else defs.ENTITY_EXTRACTION_MAX_GLEANINGS
+        )
         self._on_error = on_error or (lambda _e, _s, _d: None)

         # Construct the looping arguments
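For context on what these defaults control: a "gleaning" is a redrive, a follow-up prompt asking the model whether the previous pass missed anything, so `max_gleanings=0` meant a single extraction call per text unit and the new default of 1 allows at most one follow-up. A schematic of the loop, paraphrased rather than copied from `GraphExtractor` (the prompt strings here are placeholders, and the real extractor additionally runs a yes/no continuation check to stop early):

```python
EXTRACTION_PROMPT = "Extract entities and relationships from:\n{input_text}"
CONTINUE_PROMPT = "Some items may have been missed in the last pass. Add them now."

async def extract_with_gleanings(llm, text: str, max_gleanings: int) -> list[str]:
    """One initial extraction pass plus up to max_gleanings redrives."""
    responses = [await llm(EXTRACTION_PROMPT.format(input_text=text))]
    for _ in range(max_gleanings):
        # Each gleaning re-prompts the model to pick up items it missed.
        responses.append(await llm(CONTINUE_PROMPT))
    return responses
```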
diff --git a/graphrag/index/init_content.py b/graphrag/index/init_content.py
index e210f964..13df1828 100644
--- a/graphrag/index/init_content.py
+++ b/graphrag/index/init_content.py
@@ -93,7 +93,7 @@ entity_extraction:
   ## async_mode: override the global async_mode settings for this task
   prompt: "prompts/entity_extraction.txt"
   entity_types: [{",".join(defs.ENTITY_EXTRACTION_ENTITY_TYPES)}]
-  max_gleanings: 0
+  max_gleanings: {defs.ENTITY_EXTRACTION_MAX_GLEANINGS}

 summarize_descriptions:
   ## llm: override the global llm settings for this task
diff --git a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/run_gi_extract_claims.py b/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/run_gi_extract_claims.py
index f047c44a..1c9f0588 100644
--- a/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/run_gi_extract_claims.py
+++ b/graphrag/index/verbs/covariates/extract_covariates/strategies/graph_intelligence/run_gi_extract_claims.py
@@ -8,6 +8,7 @@ from typing import Any

 from datashaper import VerbCallbacks

+import graphrag.config.defaults as defs
 from graphrag.config.enums import LLMType
 from graphrag.index.cache import PipelineCache
 from graphrag.index.graph.extractors.claims import ClaimExtractor
@@ -49,7 +50,7 @@ async def _execute(
     strategy_config: dict[str, Any],
 ) -> CovariateExtractionResult:
     extraction_prompt = strategy_config.get("extraction_prompt")
-    max_gleanings = strategy_config.get("max_gleanings", 0)
+    max_gleanings = strategy_config.get("max_gleanings", defs.CLAIM_MAX_GLEANINGS)
     tuple_delimiter = strategy_config.get("tuple_delimiter")
     record_delimiter = strategy_config.get("record_delimiter")
     completion_delimiter = strategy_config.get("completion_delimiter")
@@ -60,9 +61,9 @@ async def _execute(
         extraction_prompt=extraction_prompt,
         max_gleanings=max_gleanings,
         encoding_model=encoding_model,
-        on_error=lambda e, s, d: reporter.error("Claim Extraction Error", e, s, d)
-        if reporter
-        else None,
+        on_error=lambda e, s, d: (
+            reporter.error("Claim Extraction Error", e, s, d) if reporter else None
+        ),
     )

     claim_description = strategy_config.get("claim_description")
diff --git a/graphrag/index/verbs/entities/extraction/entity_extract.py b/graphrag/index/verbs/entities/extraction/entity_extract.py
index 9be302ae..4e961f67 100644
--- a/graphrag/index/verbs/entities/extraction/entity_extract.py
+++ b/graphrag/index/verbs/entities/extraction/entity_extract.py
@@ -108,8 +108,8 @@ async def entity_extract(
             prechunked: true | false # Optional, If the document is already chunked beforehand, otherwise this will chunk the document into smaller bits. default: false
             encoding_name: cl100k_base # Optional, The encoding to use for the LLM, if not already prechunked, default: cl100k_base
-            chunk_size: 2500 # Optional ,The chunk size to use for the LLM, if not already prechunked, default: 2500
-            chunk_overlap: 300 # Optional, The chunk overlap to use for the LLM, if not already prechunked, default: 300
+            chunk_size: 1200 # Optional, The chunk size to use for the LLM, if not already prechunked, default: 1200
+            chunk_overlap: 100 # Optional, The chunk overlap to use for the LLM, if not already prechunked, default: 100
             llm: # The configuration for the LLM
                 type: openai # the type of llm to use, available options are: openai, azure, openai_chat, azure_openai_chat. The last two being chat based LLMs.
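As a concrete check of the `init_content.py` template change, the `entity_extraction` stanza of the generated settings file would now render roughly as follows, assuming the new default of 1 (surrounding keys elided):

```yaml
entity_extraction:
  prompt: "prompts/entity_extraction.txt"
  entity_types: [organization,person,geo,event]
  max_gleanings: 1
```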
diff --git a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/run_graph_intelligence.py b/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/run_graph_intelligence.py
index 997808f4..06284879 100644
--- a/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/run_graph_intelligence.py
+++ b/graphrag/index/verbs/entities/extraction/strategies/graph_intelligence/run_graph_intelligence.py
@@ -6,6 +6,7 @@ import networkx as nx
 from datashaper import VerbCallbacks

+import graphrag.config.defaults as defs
 from graphrag.config.enums import LLMType
 from graphrag.index.cache import PipelineCache
 from graphrag.index.graph.extractors.graph import GraphExtractor
@@ -52,8 +53,8 @@ async def run_extract_entities(

     # Chunking Arguments
     prechunked = args.get("prechunked", False)
-    chunk_size = args.get("chunk_size", 2500)
-    chunk_overlap = args.get("chunk_overlap", 300)
+    chunk_size = args.get("chunk_size", defs.CHUNK_SIZE)
+    chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)

     # Extraction Arguments
     tuple_delimiter = args.get("tuple_delimiter", None)
@@ -61,7 +62,7 @@ async def run_extract_entities(
     completion_delimiter = args.get("completion_delimiter", None)
     extraction_prompt = args.get("extraction_prompt", None)
     encoding_model = args.get("encoding_name", None)
-    max_gleanings = args.get("max_gleanings", None)
+    max_gleanings = args.get("max_gleanings", defs.ENTITY_EXTRACTION_MAX_GLEANINGS)

     # note: We're not using UnipartiteGraphChain.from_params
     # because we want to pass "timeout" to the llm_kwargs
diff --git a/graphrag/index/verbs/text/chunk/strategies/tokens.py b/graphrag/index/verbs/text/chunk/strategies/tokens.py
index 6dab135e..6426c783 100644
--- a/graphrag/index/verbs/text/chunk/strategies/tokens.py
+++ b/graphrag/index/verbs/text/chunk/strategies/tokens.py
@@ -9,20 +9,18 @@ from typing import Any
 import tiktoken
 from datashaper import ProgressTicker

+import graphrag.config.defaults as defs
 from graphrag.index.text_splitting import Tokenizer
 from graphrag.index.verbs.text.chunk.typing import TextChunk

-DEFAULT_CHUNK_SIZE = 2500  # tokens
-DEFAULT_CHUNK_OVERLAP = 300  # tokens
-

 def run(
     input: list[str], args: dict[str, Any], tick: ProgressTicker
 ) -> Iterable[TextChunk]:
     """Chunks text into multiple parts. A pipeline verb."""
-    tokens_per_chunk = args.get("chunk_size", DEFAULT_CHUNK_SIZE)
-    chunk_overlap = args.get("chunk_overlap", DEFAULT_CHUNK_OVERLAP)
-    encoding_name = args.get("encoding_name", "cl100k_base")
+    tokens_per_chunk = args.get("chunk_size", defs.CHUNK_SIZE)
+    chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)
+    encoding_name = args.get("encoding_name", defs.ENCODING_MODEL)
     enc = tiktoken.get_encoding(encoding_name)

     def encode(text: str) -> list[int]:
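The `run` verb above hands the actual splitting to `Tokenizer`, but the underlying sliding window is simple enough to sketch standalone. This is a minimal loop over `tiktoken`, written for illustration rather than copied from the library's splitter:

```python
import tiktoken

def split_on_tokens(
    text: str,
    chunk_size: int = 1200,
    chunk_overlap: int = 100,
    encoding_name: str = "cl100k_base",
) -> list[str]:
    """Slide a chunk_size-token window over text, stepping by size - overlap."""
    enc = tiktoken.get_encoding(encoding_name)
    tokens = enc.encode(text)
    chunks: list[str] = []
    start = 0
    while start < len(tokens):
        chunks.append(enc.decode(tokens[start : start + chunk_size]))
        start += chunk_size - chunk_overlap
    return chunks
```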
diff --git a/graphrag/index/verbs/text/chunk/text_chunk.py b/graphrag/index/verbs/text/chunk/text_chunk.py
index d8fab44f..40c5578a 100644
--- a/graphrag/index/verbs/text/chunk/text_chunk.py
+++ b/graphrag/index/verbs/text/chunk/text_chunk.py
@@ -73,8 +73,8 @@ def chunk(
     ```yaml
     strategy:
         type: tokens
-        chunk_size: 1000 # Optional, The chunk size to use, default: 1000
-        chunk_overlap: 300 # Optional, The chunk overlap to use, default: 300
+        chunk_size: 1200 # Optional, The chunk size to use, default: 1200
+        chunk_overlap: 100 # Optional, The chunk overlap to use, default: 100
     ```

     ### sentence
diff --git a/graphrag/index/verbs/text/embed/strategies/openai.py b/graphrag/index/verbs/text/embed/strategies/openai.py
index 0658d604..fb443ec8 100644
--- a/graphrag/index/verbs/text/embed/strategies/openai.py
+++ b/graphrag/index/verbs/text/embed/strategies/openai.py
@@ -10,6 +10,7 @@ from typing import Any
 import numpy as np
 from datashaper import ProgressTicker, VerbCallbacks, progress_ticker

+import graphrag.config.defaults as defs
 from graphrag.index.cache import PipelineCache
 from graphrag.index.llm import load_llm_embeddings
 from graphrag.index.text_splitting import TokenTextSplitter
@@ -68,7 +69,7 @@ def _get_splitter(
     config: OpenAIConfiguration, batch_max_tokens: int
 ) -> TokenTextSplitter:
     return TokenTextSplitter(
-        encoding_name=config.encoding_model or "cl100k_base",
+        encoding_name=config.encoding_model or defs.ENCODING_MODEL,
         chunk_size=batch_max_tokens,
     )
diff --git a/graphrag/index/verbs/text/translate/strategies/openai.py b/graphrag/index/verbs/text/translate/strategies/openai.py
index cf0628ff..49c47b34 100644
--- a/graphrag/index/verbs/text/translate/strategies/openai.py
+++ b/graphrag/index/verbs/text/translate/strategies/openai.py
@@ -9,6 +9,7 @@ from typing import Any

 from datashaper import VerbCallbacks

+import graphrag.config.defaults as defs
 from graphrag.config.enums import LLMType
 from graphrag.index.cache import PipelineCache
 from graphrag.index.llm import load_llm
@@ -40,8 +41,8 @@ async def run(
     )
     language = args.get("language", "English")
     prompt = args.get("prompt")
-    chunk_size = args.get("chunk_size", 2500)
-    chunk_overlap = args.get("chunk_overlap", 0)
+    chunk_size = args.get("chunk_size", defs.CHUNK_SIZE)
+    chunk_overlap = args.get("chunk_overlap", defs.CHUNK_OVERLAP)

     input = [input] if isinstance(input, str) else input

     return TextTranslationResult(