Compare commits


No commits in common. "main" and "v1.2.0" have entirely different histories.
main ... v1.2.0

747 changed files with 25132 additions and 33829 deletions

View File

@ -6,7 +6,8 @@ permissions:
contents: write
env:
PYTHON_VERSION: "3.11"
POETRY_VERSION: '1.8.3'
PYTHON_VERSION: '3.11'
jobs:
build:
@ -14,7 +15,9 @@ jobs:
env:
GH_PAGES: 1
DEBUG: 1
GRAPHRAG_API_KEY: ${{ secrets.GRAPHRAG_API_KEY }}
GRAPHRAG_API_KEY: ${{ secrets.OPENAI_NOTEBOOK_KEY }}
GRAPHRAG_LLM_MODEL: ${{ secrets.GRAPHRAG_LLM_MODEL }}
GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.GRAPHRAG_EMBEDDING_MODEL }}
steps:
- uses: actions/checkout@v4
@ -26,16 +29,18 @@ jobs:
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install uv
uses: astral-sh/setup-uv@v6
- name: Install Poetry ${{ env.POETRY_VERSION }}
uses: abatilo/actions-poetry@v3.0.0
with:
poetry-version: ${{ env.POETRY_VERSION }}
- name: Install dependencies
- name: poetry install
shell: bash
run: uv sync
run: poetry install
- name: mkdocs build
shell: bash
run: uv run poe build_docs
run: poetry run poe build_docs
- name: List Docsite Contents
run: find site
@ -45,4 +50,4 @@ jobs:
with:
branch: gh-pages
folder: site
clean: true
clean: true

View File

@ -26,6 +26,9 @@ concurrency:
# Only run for the latest commit
cancel-in-progress: true
env:
POETRY_VERSION: 1.8.3
jobs:
python-ci:
# skip draft PRs
@ -48,7 +51,7 @@ jobs:
filters: |
python:
- 'graphrag/**/*'
- 'uv.lock'
- 'poetry.lock'
- 'pyproject.toml'
- '**/*.py'
- '**/*.toml'
@ -61,27 +64,26 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
- name: Install uv
uses: astral-sh/setup-uv@v6
- name: Install Poetry
uses: abatilo/actions-poetry@v3.0.0
with:
poetry-version: $POETRY_VERSION
- name: Install dependencies
shell: bash
run: |
uv sync
uv pip install gensim
poetry self add setuptools wheel
poetry run python -m pip install gensim
poetry install
- name: Check
run: |
uv run poe check
poetry run poe check
- name: Build
run: |
uv build
poetry build
- name: Unit Test
run: |
uv run poe test_unit
- name: Verb Test
run: |
uv run poe test_verbs
poetry run poe test_unit

View File

@ -26,6 +26,9 @@ concurrency:
# only run for the latest commit
cancel-in-progress: true
env:
POETRY_VERSION: 1.8.3
jobs:
python-ci:
# skip draft PRs
@ -48,7 +51,7 @@ jobs:
filters: |
python:
- 'graphrag/**/*'
- 'uv.lock'
- 'poetry.lock'
- 'pyproject.toml'
- '**/*.py'
- '**/*.toml'
@ -61,24 +64,25 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
- name: Install uv
uses: astral-sh/setup-uv@v6
- name: Install Poetry
uses: abatilo/actions-poetry@v3.0.0
with:
poetry-version: $POETRY_VERSION
- name: Install dependencies
shell: bash
run: |
uv sync
uv pip install gensim
poetry self add setuptools wheel
poetry run python -m pip install gensim
poetry install
- name: Build
run: |
uv build
poetry build
- name: Install and start Azurite
shell: bash
run: |
npm install -g azurite
azurite --silent --skipApiVersionCheck --location /tmp/azurite --debug /tmp/azurite-debug.log &
- name: Install Azurite
id: azuright
uses: potatoqualitee/azuright@v1.1
# For more information on installation/setup of Azure Cosmos DB Emulator
# https://learn.microsoft.com/en-us/azure/cosmos-db/how-to-develop-emulator?tabs=docker-linux%2Cpython&pivots=api-nosql
@ -93,4 +97,4 @@ jobs:
- name: Integration Test
run: |
uv run poe test_integration
poetry run poe test_integration

View File

@ -26,6 +26,9 @@ concurrency:
# Only run for the latest commit
cancel-in-progress: true
env:
POETRY_VERSION: 1.8.3
jobs:
python-ci:
# skip draft PRs
@ -38,6 +41,8 @@ jobs:
env:
DEBUG: 1
GRAPHRAG_API_KEY: ${{ secrets.OPENAI_NOTEBOOK_KEY }}
GRAPHRAG_LLM_MODEL: ${{ secrets.GRAPHRAG_LLM_MODEL }}
GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.GRAPHRAG_EMBEDDING_MODEL }}
runs-on: ${{ matrix.os }}
steps:
@ -49,7 +54,7 @@ jobs:
filters: |
python:
- 'graphrag/**/*'
- 'uv.lock'
- 'poetry.lock'
- 'pyproject.toml'
- '**/*.py'
- '**/*.toml'
@ -61,15 +66,18 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
- name: Install uv
uses: astral-sh/setup-uv@v6
- name: Install Poetry
uses: abatilo/actions-poetry@v3.0.0
with:
poetry-version: $POETRY_VERSION
- name: Install dependencies
shell: bash
run: |
uv sync
uv pip install gensim
poetry self add setuptools wheel
poetry run python -m pip install gensim
poetry install
- name: Notebook Test
run: |
uv run poe test_notebook
poetry run poe test_notebook

View File

@ -6,6 +6,7 @@ on:
branches: [main]
env:
POETRY_VERSION: "1.8.3"
PYTHON_VERSION: "3.10"
jobs:
@ -30,19 +31,21 @@ jobs:
with:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install uv
uses: astral-sh/setup-uv@v6
- name: Install Poetry
uses: abatilo/actions-poetry@v3.0.0
with:
poetry-version: ${{ env.POETRY_VERSION }}
- name: Install dependencies
shell: bash
run: uv sync
run: poetry install
- name: Export Publication Version
run: echo "version=$(uv version --short)" >> $GITHUB_OUTPUT
run: echo "version=`poetry version --short`" >> $GITHUB_OUTPUT
- name: Build Distributable
shell: bash
run: uv build
run: poetry build
- name: Publish package distributions to PyPI
uses: pypa/gh-action-pypi-publish@release/v1

View File

@ -26,6 +26,9 @@ concurrency:
# Only run for the latest commit
cancel-in-progress: true
env:
POETRY_VERSION: 1.8.3
jobs:
python-ci:
# skip draft PRs
@ -37,8 +40,25 @@ jobs:
fail-fast: false # Continue running all jobs even if one fails
env:
DEBUG: 1
GRAPHRAG_LLM_TYPE: "azure_openai_chat"
GRAPHRAG_EMBEDDING_TYPE: "azure_openai_embedding"
GRAPHRAG_API_KEY: ${{ secrets.OPENAI_API_KEY }}
GRAPHRAG_API_BASE: ${{ secrets.GRAPHRAG_API_BASE }}
GRAPHRAG_API_VERSION: ${{ secrets.GRAPHRAG_API_VERSION }}
GRAPHRAG_LLM_DEPLOYMENT_NAME: ${{ secrets.GRAPHRAG_LLM_DEPLOYMENT_NAME }}
GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME: ${{ secrets.GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME }}
GRAPHRAG_CACHE_CONTAINER_NAME: "cicache"
GRAPHRAG_CACHE_BASE_DIR": "cache"
GRAPHRAG_LLM_MODEL: ${{ secrets.GRAPHRAG_LLM_MODEL }}
GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.GRAPHRAG_EMBEDDING_MODEL }}
GRAPHRAG_ENTITY_EXTRACTION_ENCODING_MODEL: ${{ secrets.GRAPHRAG_ENTITY_EXTRACTION_ENCODING_MODEL }}
# We have Windows + Linux runners in 3.10 and 3.11, so we need to divide the rate limits by 4
GRAPHRAG_LLM_TPM: 45_000 # 180,000 / 4
GRAPHRAG_LLM_RPM: 270 # 1,080 / 4
GRAPHRAG_EMBEDDING_TPM: 87_500 # 350,000 / 4
GRAPHRAG_EMBEDDING_RPM: 525 # 2,100 / 4
GRAPHRAG_CHUNK_SIZE: 1200
GRAPHRAG_CHUNK_OVERLAP: 0
# Azure AI Search config
AZURE_AI_SEARCH_URL_ENDPOINT: ${{ secrets.AZURE_AI_SEARCH_URL_ENDPOINT }}
AZURE_AI_SEARCH_API_KEY: ${{ secrets.AZURE_AI_SEARCH_API_KEY }}
@ -53,7 +73,7 @@ jobs:
filters: |
python:
- 'graphrag/**/*'
- 'uv.lock'
- 'poetry.lock'
- 'pyproject.toml'
- '**/*.py'
- '**/*.toml'
@ -66,32 +86,37 @@ jobs:
with:
python-version: ${{ matrix.python-version }}
- name: Install uv
uses: astral-sh/setup-uv@v6
- name: Install Poetry
uses: abatilo/actions-poetry@v3.0.0
with:
poetry-version: $POETRY_VERSION
- name: Install dependencies
shell: bash
run: |
uv sync
uv pip install gensim
poetry self add setuptools wheel
poetry run python -m pip install gensim
poetry install
- name: Build
run: |
uv build
poetry build
- name: Install and start Azurite
shell: bash
- name: Verb Test
run: |
npm install -g azurite
azurite --silent --skipApiVersionCheck --location /tmp/azurite --debug /tmp/azurite-debug.log &
poetry run poe test_verbs
- name: Install Azurite
id: azuright
uses: potatoqualitee/azuright@v1.1
- name: Smoke Test
if: steps.changes.outputs.python == 'true'
run: |
uv run poe test_smoke
poetry run poe test_smoke
- uses: actions/upload-artifact@v4
if: always()
with:
name: smoke-test-artifacts-${{ matrix.python-version }}-${{ runner.os }}
path: tests/fixtures/*
name: smoke-test-artifacts-${{ matrix.python-version }}-${{ matrix.poetry-version }}-${{ runner.os }}
path: tests/fixtures/*/output

.gitignore vendored
View File

@ -1,8 +1,6 @@
# Python Artifacts
python/*/lib/
dist/
build/
*.egg-info/
# Test Output
.coverage

View File

@ -1,146 +0,0 @@
{
"changes": [
{
"description": "Add children to communities to avoid re-compute.",
"type": "major"
},
{
"description": "Reorganize and rename workflows and their outputs.",
"type": "major"
},
{
"description": "Rework API to accept callbacks.",
"type": "major"
},
{
"description": "Add LMM Manager and Factory, to support provider registration",
"type": "minor"
},
{
"description": "Add NLP graph extraction.",
"type": "minor"
},
{
"description": "Add pipeline_start and pipeline_end callbacks.",
"type": "minor"
},
{
"description": "Move embeddings snapshots to the workflow runner.",
"type": "minor"
},
{
"description": "Remove config inheritance, hydration, and automatic env var overlays.",
"type": "minor"
},
{
"description": "Rework the update output storage structure.",
"type": "minor"
},
{
"description": "Add caching to NLP extractor.",
"type": "patch"
},
{
"description": "Add vector store id reference to embeddings config.",
"type": "patch"
},
{
"description": "Export NLP community reports prompt.",
"type": "patch"
},
{
"description": "Fix DRIFT search on Azure AI Search.",
"type": "patch"
},
{
"description": "Fix StopAsyncIteration catch.",
"type": "patch"
},
{
"description": "Fix missing embeddings workflow in FastGraphRAG.",
"type": "patch"
},
{
"description": "Fix proper use of n_depth for drift search",
"type": "patch"
},
{
"description": "Fix report generation recursion.",
"type": "patch"
},
{
"description": "Fix summarization over large datasets for inc indexing. Fix relationship summarization",
"type": "patch"
},
{
"description": "Optimize data iteration by removing some iterrows from code",
"type": "patch"
},
{
"description": "Patch json mode for community reports",
"type": "patch"
},
{
"description": "Properly increment text unit IDs during updates.",
"type": "patch"
},
{
"description": "Refactor config defaults from constants to type-safe, hierarchical dataclass.",
"type": "patch"
},
{
"description": "Require explicit azure auth settings when using AOI.",
"type": "patch"
},
{
"description": "Separates graph pruning for differential usage.",
"type": "patch"
},
{
"description": "Tuck flow functions under their workflow modules.",
"type": "patch"
},
{
"description": "Update fnllm. Remove unused libs.",
"type": "patch"
},
{
"description": "Use ModelProvider for query module",
"type": "patch"
},
{
"description": "Use shared schema for final outputs.",
"type": "patch"
},
{
"description": "add dynamic retry logic.",
"type": "patch"
},
{
"description": "add option to prepend metadata into chunks",
"type": "patch"
},
{
"description": "cleanup query code duplication.",
"type": "patch"
},
{
"description": "implemented multi-index querying for api layer",
"type": "patch"
},
{
"description": "multi index query cli support",
"type": "patch"
},
{
"description": "remove unused columns and change property document_attribute_columns to metadata",
"type": "patch"
},
{
"description": "update multi-index query to support new workflows",
"type": "patch"
}
],
"created_at": "2025-02-25T23:30:50+00:00",
"version": "2.0.0"
}

View File

@ -1,22 +0,0 @@
{
"changes": [
{
"description": "Add support for JSON input files.",
"type": "minor"
},
{
"description": "Updated the prompt tunning client to support csv-metadata injection and updated output file types to match the new naming convention.",
"type": "minor"
},
{
"description": "Add check for custom model types while config loading",
"type": "patch"
},
{
"description": "Adds general-purpose pipeline run state object.",
"type": "patch"
}
],
"created_at": "2025-03-11T23:53:00+00:00",
"version": "2.1.0"
}

View File

@ -1,46 +0,0 @@
{
"changes": [
{
"description": "Support OpenAI reasoning models.",
"type": "minor"
},
{
"description": "Add option to snapshot raw extracted graph tables.",
"type": "patch"
},
{
"description": "Added batching logic to the prompt tuning autoselection embeddings workflow",
"type": "patch"
},
{
"description": "Align config classes and docs better.",
"type": "patch"
},
{
"description": "Align embeddings table loading with configured fields.",
"type": "patch"
},
{
"description": "Brings parity with our latest NLP extraction approaches.",
"type": "patch"
},
{
"description": "Fix fnllm to 0.2.3",
"type": "patch"
},
{
"description": "Fixes to basic search.",
"type": "patch"
},
{
"description": "Update llm args for consistency.",
"type": "patch"
},
{
"description": "add vector store integration tests",
"type": "patch"
}
],
"created_at": "2025-04-25T23:30:57+00:00",
"version": "2.2.0"
}

View File

@ -1,18 +0,0 @@
{
"changes": [
{
"description": "Fix Community Report prompt tuning response",
"type": "patch"
},
{
"description": "Fix graph creation missing edge weights.",
"type": "patch"
},
{
"description": "Update as workflows",
"type": "patch"
}
],
"created_at": "2025-04-30T23:50:31+00:00",
"version": "2.2.1"
}

View File

@ -1,34 +0,0 @@
{
"changes": [
{
"description": "Remove Dynamic Max Retries support. Refactor typer typing in cli interface",
"type": "minor"
},
{
"description": "Update fnllm to latest. Update default graphrag configuration",
"type": "minor"
},
{
"description": "A few fixes and enhancements for better reuse and flow.",
"type": "patch"
},
{
"description": "Add full llm response to LLM PRovider output",
"type": "patch"
},
{
"description": "Fix Drift Reduce Response for non streaming calls",
"type": "patch"
},
{
"description": "Fix global search prompt to include missing formatting key",
"type": "patch"
},
{
"description": "Upgrade pyarrow dependency to >=17.0.0 to fix CVE-2024-52338",
"type": "patch"
}
],
"created_at": "2025-05-23T21:02:47+00:00",
"version": "2.3.0"
}

View File

@ -1,26 +0,0 @@
{
"changes": [
{
"description": "Allow injection of custom pipelines.",
"type": "minor"
},
{
"description": "Refactored StorageFactory to use a registration-based approach",
"type": "minor"
},
{
"description": "Fix default values for tpm and rpm limiters on embeddings",
"type": "patch"
},
{
"description": "Update typer.",
"type": "patch"
},
{
"description": "cleaned up logging to follow python standards.",
"type": "patch"
}
],
"created_at": "2025-07-15T00:04:15+00:00",
"version": "2.4.0"
}

View File

@ -1,14 +0,0 @@
{
"changes": [
{
"description": "Add additional context variable to build index signature for custom parameter bag",
"type": "minor"
},
{
"description": "swap package management from Poetry -> UV",
"type": "minor"
}
],
"created_at": "2025-08-14T00:59:46+00:00",
"version": "2.5.0"
}

View File

@ -1,54 +0,0 @@
{
"changes": [
{
"description": "Add LiteLLM chat and embedding model providers.",
"type": "minor"
},
{
"description": "Add LoggerFactory and clean up related API.",
"type": "minor"
},
{
"description": "Add config for NLP async mode.",
"type": "minor"
},
{
"description": "Add optional input documents to indexing API.",
"type": "minor"
},
{
"description": "add customization to vector store",
"type": "minor"
},
{
"description": "Add gpt-5 support by updating fnllm dependency.",
"type": "patch"
},
{
"description": "Fix all human_readable_id fields to be 0-based.",
"type": "patch"
},
{
"description": "Fix multi-index search.",
"type": "patch"
},
{
"description": "Improve upon recent logging refactor",
"type": "patch"
},
{
"description": "Make cache, storage, and vector_store factories consistent with similar registration support",
"type": "patch"
},
{
"description": "Remove hard-coded community rate limiter.",
"type": "patch"
},
{
"description": "generate_text_embeddings only loads tables if embedding field is specified.",
"type": "patch"
}
],
"created_at": "2025-09-22T21:44:51+00:00",
"version": "2.6.0"
}

View File

@ -1,18 +0,0 @@
{
"changes": [
{
"description": "Set LiteLLM as default in init_content.",
"type": "minor"
},
{
"description": "Fix Azure auth scope issue with LiteLLM.",
"type": "patch"
},
{
"description": "Housekeeping toward 2.7.",
"type": "patch"
}
],
"created_at": "2025-10-08T22:39:42+00:00",
"version": "2.7.0"
}

.vscode/launch.json vendored
View File

@ -6,24 +6,21 @@
"name": "Indexer",
"type": "debugpy",
"request": "launch",
"module": "graphrag",
"module": "poetry",
"args": [
"index",
"--root",
"<path_to_index_folder>"
"poe", "index",
"--root", "<path_to_ragtest_root_demo>"
],
"console": "integratedTerminal"
},
{
"name": "Query",
"type": "debugpy",
"request": "launch",
"module": "graphrag",
"module": "poetry",
"args": [
"query",
"--root",
"<path_to_index_folder>",
"--method", "basic",
"poe", "query",
"--root", "<path_to_ragtest_root_demo>",
"--method", "global",
"--query", "What are the top themes in this story",
]
},
@ -31,48 +28,12 @@
"name": "Prompt Tuning",
"type": "debugpy",
"request": "launch",
"module": "uv",
"module": "poetry",
"args": [
"poe", "prompt-tune",
"--config",
"<path_to_ragtest_root_demo>/settings.yaml",
]
},
{
"name": "Debug Integration Pytest",
"type": "debugpy",
"request": "launch",
"module": "pytest",
"args": [
"./tests/integration/vector_stores",
"-k", "test_azure_ai_search"
],
"console": "integratedTerminal",
"justMyCode": false
},
{
"name": "Debug Verbs Pytest",
"type": "debugpy",
"request": "launch",
"module": "pytest",
"args": [
"./tests/verbs",
"-k", "test_generate_text_embeddings"
],
"console": "integratedTerminal",
"justMyCode": false
},
{
"name": "Debug Smoke Pytest",
"type": "debugpy",
"request": "launch",
"module": "pytest",
"args": [
"./tests/smoke",
"-k", "test_fixtures"
],
"console": "integratedTerminal",
"justMyCode": false
},
}
]
}

.vscode/settings.json vendored
View File

@ -1,8 +1,43 @@
{
"search.exclude": {
"**/.yarn": true,
"**/.pnp.*": true
},
"editor.formatOnSave": false,
"eslint.nodePath": ".yarn/sdks",
"typescript.tsdk": ".yarn/sdks/typescript/lib",
"typescript.enablePromptUseWorkspaceTsdk": true,
"javascript.preferences.importModuleSpecifier": "relative",
"javascript.preferences.importModuleSpecifierEnding": "js",
"typescript.preferences.importModuleSpecifier": "relative",
"typescript.preferences.importModuleSpecifierEnding": "js",
"explorer.fileNesting.enabled": true,
"explorer.fileNesting.patterns": {
"*.ts": "${capture}.ts, ${capture}.hooks.ts, ${capture}.hooks.tsx, ${capture}.contexts.ts, ${capture}.stories.tsx, ${capture}.story.tsx, ${capture}.spec.tsx, ${capture}.base.ts, ${capture}.base.tsx, ${capture}.types.ts, ${capture}.styles.ts, ${capture}.styles.tsx, ${capture}.utils.ts, ${capture}.utils.tsx, ${capture}.constants.ts, ${capture}.module.scss, ${capture}.module.css, ${capture}.md",
"*.js": "${capture}.js.map, ${capture}.min.js, ${capture}.d.ts",
"*.jsx": "${capture}.js",
"*.tsx": "${capture}.ts, ${capture}.hooks.ts, ${capture}.hooks.tsx, ${capture}.contexts.ts, ${capture}.stories.tsx, ${capture}.story.tsx, ${capture}.spec.tsx, ${capture}.base.ts, ${capture}.base.tsx, ${capture}.types.ts, ${capture}.styles.ts, ${capture}.styles.tsx, ${capture}.utils.ts, ${capture}.utils.tsx, ${capture}.constants.ts, ${capture}.module.scss, ${capture}.module.css, ${capture}.md, ${capture}.css",
"tsconfig.json": "tsconfig.*.json",
"package.json": "package-lock.json, turbo.json, tsconfig.json, rome.json, biome.json, .npmignore, dictionary.txt, cspell.config.yaml",
"README.md": "*.md, LICENSE, CODEOWNERS",
".eslintrc": ".eslintignore",
".prettierrc": ".prettierignore",
".gitattributes": ".gitignore",
".yarnrc.yml": "yarn.lock, .pnp.*",
"jest.config.js": "jest.setup.mjs",
"pyproject.toml": "poetry.lock, poetry.toml, mkdocs.yaml",
"cspell.config.yaml": "dictionary.txt"
},
"azureFunctions.postDeployTask": "npm install (functions)",
"azureFunctions.projectLanguage": "TypeScript",
"azureFunctions.projectRuntime": "~4",
"debug.internalConsoleOptions": "neverOpen",
"python.defaultInterpreterPath": "${workspaceRoot}/.venv/bin/python",
"azureFunctions.preDeployTask": "npm prune (functions)",
"appService.zipIgnorePattern": [
"node_modules{,/**}",
".vscode{,/**}"
],
"python.defaultInterpreterPath": "python/services/.venv/bin/python",
"python.languageServer": "Pylance",
"cSpell.customDictionaries": {
"project-words": {

View File

@ -1,114 +1,6 @@
# Changelog
Note: version releases in the 0.x.y range may introduce breaking changes.
## 2.7.0
- minor: Set LiteLLM as default in init_content.
- patch: Fix Azure auth scope issue with LiteLLM.
- patch: Housekeeping toward 2.7.
## 2.6.0
- minor: Add LiteLLM chat and embedding model providers.
- minor: Add LoggerFactory and clean up related API.
- minor: Add config for NLP async mode.
- minor: Add optional input documents to indexing API.
- minor: add customization to vector store
- patch: Add gpt-5 support by updating fnllm dependency.
- patch: Fix all human_readable_id fields to be 0-based.
- patch: Fix multi-index search.
- patch: Improve upon recent logging refactor
- patch: Make cache, storage, and vector_store factories consistent with similar registration support
- patch: Remove hard-coded community rate limiter.
- patch: generate_text_embeddings only loads tables if embedding field is specified.
## 2.5.0
- minor: Add additional context variable to build index signature for custom parameter bag
- minor: swap package management from Poetry -> UV
## 2.4.0
- minor: Allow injection of custom pipelines.
- minor: Refactored StorageFactory to use a registration-based approach
- patch: Fix default values for tpm and rpm limiters on embeddings
- patch: Update typer.
- patch: cleaned up logging to follow python standards.
## 2.3.0
- minor: Remove Dynamic Max Retries support. Refactor typer typing in cli interface
- minor: Update fnllm to latest. Update default graphrag configuration
- patch: A few fixes and enhancements for better reuse and flow.
- patch: Add full llm response to LLM Provider output
- patch: Fix Drift Reduce Response for non streaming calls
- patch: Fix global search prompt to include missing formatting key
- patch: Upgrade pyarrow dependency to >=17.0.0 to fix CVE-2024-52338
## 2.2.1
- patch: Fix Community Report prompt tuning response
- patch: Fix graph creation missing edge weights.
- patch: Update as workflows
## 2.2.0
- minor: Support OpenAI reasoning models.
- patch: Add option to snapshot raw extracted graph tables.
- patch: Added batching logic to the prompt tuning autoselection embeddings workflow
- patch: Align config classes and docs better.
- patch: Align embeddings table loading with configured fields.
- patch: Brings parity with our latest NLP extraction approaches.
- patch: Fix fnllm to 0.2.3
- patch: Fixes to basic search.
- patch: Update llm args for consistency.
- patch: add vector store integration tests
## 2.1.0
- minor: Add support for JSON input files.
- minor: Updated the prompt tuning client to support csv-metadata injection and updated output file types to match the new naming convention.
- patch: Add check for custom model types while config loading
- patch: Adds general-purpose pipeline run state object.
## 2.0.0
- major: Add children to communities to avoid re-compute.
- major: Reorganize and rename workflows and their outputs.
- major: Rework API to accept callbacks.
- minor: Add LLM Manager and Factory, to support provider registration
- minor: Add NLP graph extraction.
- minor: Add pipeline_start and pipeline_end callbacks.
- minor: Move embeddings snapshots to the workflow runner.
- minor: Remove config inheritance, hydration, and automatic env var overlays.
- minor: Rework the update output storage structure.
- patch: Add caching to NLP extractor.
- patch: Add vector store id reference to embeddings config.
- patch: Export NLP community reports prompt.
- patch: Fix DRIFT search on Azure AI Search.
- patch: Fix StopAsyncIteration catch.
- patch: Fix missing embeddings workflow in FastGraphRAG.
- patch: Fix proper use of n_depth for drift search
- patch: Fix report generation recursion.
- patch: Fix summarization over large datasets for inc indexing. Fix relationship summarization
- patch: Optimize data iteration by removing some iterrows from code
- patch: Patch json mode for community reports
- patch: Properly increment text unit IDs during updates.
- patch: Refactor config defaults from constants to type-safe, hierarchical dataclass.
- patch: Require explicit azure auth settings when using AOI.
- patch: Separates graph pruning for differential usage.
- patch: Tuck flow functions under their workflow modules.
- patch: Update fnllm. Remove unused libs.
- patch: Use ModelProvider for query module
- patch: Use shared schema for final outputs.
- patch: add dynamic retry logic.
- patch: add option to prepend metadata into chunks
- patch: cleanup query code duplication.
- patch: implemented multi-index querying for api layer
- patch: multi index query cli support
- patch: remove unused columns and change property document_attribute_columns to metadata
- patch: update multi-index query to support new workflows
## 1.2.0
- minor: Add Drift Reduce response and streaming endpoint

View File

@ -22,7 +22,7 @@ or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any addi
2. Create a new branch for your contribution: `git checkout -b my-contribution`.
3. Make your changes and ensure that the code passes all tests.
4. Commit your changes: `git commit -m "Add my contribution"`.
5. Create and commit a semver impact document by running `uv run semversioner add-change -t <major|minor|patch> -d <description>`.
5. Create and commit a semver impact document by running `poetry run semversioner add-change -t <major|minor|patch> -d <description>`.
6. Push your changes to your forked repository: `git push origin my-contribution`.
7. Open a pull request to the main repository.

View File

@ -5,29 +5,29 @@
| Name | Installation | Purpose |
| ------------------- | ------------------------------------------------------------ | ----------------------------------------------------------------------------------- |
| Python 3.10 or 3.11 | [Download](https://www.python.org/downloads/) | The library is Python-based. |
| uv | [Instructions](https://docs.astral.sh/uv/) | uv is used for package management and virtualenv management in Python codebases |
| Poetry | [Instructions](https://python-poetry.org/docs/#installation) | Poetry is used for package management and virtualenv management in Python codebases |
# Getting Started
## Install Dependencies
```shell
# install python dependencies
uv sync
poetry install
```
## Execute the indexing engine
```shell
uv run poe index <...args>
poetry run poe index <...args>
```
## Execute prompt tuning
```shell
uv run poe prompt_tune <...args>
poetry run poe prompt_tune <...args>
```
## Execute Queries
```shell
uv run poe query <...args>
poetry run poe query <...args>
```
## Repository Structure
@ -63,7 +63,7 @@ Where appropriate, the factories expose a registration method for users to provi
We use [semversioner](https://github.com/raulgomis/semversioner) to automate and enforce semantic versioning in the release process. Our CI/CD pipeline checks that all PR's include a json file generated by semversioner. When submitting a PR, please run:
```shell
uv run semversioner add-change -t patch -d "<a small sentence describing changes made>."
poetry run semversioner add-change -t patch -d "<a small sentence describing changes made>."
```
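For reference, this command records your change as a small JSON file that is committed alongside your PR; a sketch of its shape is below (the `.semversioner/next-release/` location and timestamped filename are semversioner conventions assumed here, and the description is illustrative):

```json
{
  "type": "patch",
  "description": "A small sentence describing changes made."
}
```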
# Azurite
@ -78,29 +78,29 @@ or by simply running `azurite` in the terminal if already installed globally. Se
# Lifecycle Scripts
Our Python package utilizes uv to manage dependencies and [poethepoet](https://pypi.org/project/poethepoet/) to manage custom build scripts.
Our Python package utilizes Poetry to manage dependencies and [poethepoet](https://pypi.org/project/poethepoet/) to manage custom build scripts.
Available scripts are:
- `uv run poe index` - Run the Indexing CLI
- `uv run poe query` - Run the Query CLI
- `uv build` - This invokes `uv build`, which will build a wheel file and other distributable artifacts.
- `uv run poe test` - This will execute all tests.
- `uv run poe test_unit` - This will execute unit tests.
- `uv run poe test_integration` - This will execute integration tests.
- `uv run poe test_smoke` - This will execute smoke tests.
- `uv run poe check` - This will perform a suite of static checks across the package, including:
- `poetry run poe index` - Run the Indexing CLI
- `poetry run poe query` - Run the Query CLI
- `poetry build` - This invokes `poetry build`, which will build a wheel file and other distributable artifacts.
- `poetry run poe test` - This will execute all tests.
- `poetry run poe test_unit` - This will execute unit tests.
- `poetry run poe test_integration` - This will execute integration tests.
- `poetry run poe test_smoke` - This will execute smoke tests.
- `poetry run poe check` - This will perform a suite of static checks across the package, including:
- formatting
- documentation formatting
- linting
- security patterns
- type-checking
- `uv run poe fix` - This will apply any available auto-fixes to the package. Usually this is just formatting fixes.
- `uv run poe fix_unsafe` - This will apply any available auto-fixes to the package, including those that may be unsafe.
- `uv run poe format` - Explicitly run the formatter across the package.
- `poetry run poe fix` - This will apply any available auto-fixes to the package. Usually this is just formatting fixes.
- `poetry run poe fix_unsafe` - This will apply any available auto-fixes to the package, including those that may be unsafe.
- `poetry run poe format` - Explicitly run the formatter across the package.
## Troubleshooting
### "RuntimeError: llvm-config failed executing, please point LLVM_CONFIG to the path for llvm-config" when running uv sync
### "RuntimeError: llvm-config failed executing, please point LLVM_CONFIG to the path for llvm-config" when running poetry install
Make sure llvm-9 and llvm-9-dev are installed:
@ -110,8 +110,13 @@ and then in your bashrc, add
`export LLVM_CONFIG=/usr/bin/llvm-config-9`
### "numba/\_pymodule.h:6:10: fatal error: Python.h: No such file or directory" when running uv sync
### "numba/\_pymodule.h:6:10: fatal error: Python.h: No such file or directory" when running poetry install
Make sure you have python3.10-dev installed or more generally `python<version>-dev`
`sudo apt-get install python3.10-dev`
### LLM call constantly exceeds TPM, RPM or time limits
`GRAPHRAG_LLM_THREAD_COUNT` and `GRAPHRAG_EMBEDDING_THREAD_COUNT` are both set to 50 by default. You can modify these values
to reduce concurrency. Please refer to the [Configuration Documents](https://microsoft.github.io/graphrag/config/overview/)
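A minimal sketch of dialing that concurrency down before a run, assuming you set the variables directly in your shell (the values shown are illustrative):

```shell
# Halve the default concurrency of 50 to ease TPM/RPM pressure (values are illustrative)
export GRAPHRAG_LLM_THREAD_COUNT=25
export GRAPHRAG_EMBEDDING_THREAD_COUNT=25
```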

View File

@ -1,5 +1,6 @@
# GraphRAG
👉 [Use the GraphRAG Accelerator solution](https://github.com/Azure-Samples/graphrag-accelerator) <br/>
👉 [Microsoft Research Blog Post](https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/)<br/>
👉 [Read the docs](https://microsoft.github.io/graphrag)<br/>
👉 [GraphRAG Arxiv](https://arxiv.org/pdf/2404.16130)
@ -27,7 +28,7 @@ To learn more about GraphRAG and how it can be used to enhance your LLM's abilit
## Quickstart
To get started with the GraphRAG system we recommend trying the [command line quickstart](https://microsoft.github.io/graphrag/get_started/).
To get started with the GraphRAG system we recommend trying the [Solution Accelerator](https://github.com/Azure-Samples/graphrag-accelerator) package. This provides a user-friendly end-to-end experience with Azure resources.
## Repository Guidance
@ -46,12 +47,6 @@ This repository presents a methodology for using knowledge graph memory structur
Using _GraphRAG_ with your data out of the box may not yield the best possible results.
We strongly recommend fine-tuning your prompts following the [Prompt Tuning Guide](https://microsoft.github.io/graphrag/prompt_tuning/overview/) in our documentation.
## Versioning
Please see the [breaking changes](./breaking-changes.md) document for notes on our approach to versioning the project.
*Always run `graphrag init --root [path] --force` between minor version bumps to ensure you have the latest config format. Run the provided migration notebook between major version bumps if you want to avoid re-indexing prior datasets. Note that this will overwrite your configuration and prompts, so backup if necessary.*
## Responsible AI FAQ
See [RAI_TRANSPARENCY.md](./RAI_TRANSPARENCY.md)

View File

@ -1,98 +0,0 @@
# GraphRAG Data Model and Config Breaking Changes
This document contains notes about our versioning approach and a log of changes over time that may result in breakage. As of version 1.0 we are aligning more closely with standard [semantic versioning](https://semver.org/) practices. However, this is an ongoing research project that needs to balance experimental progress with stakeholder communication about big feature releases, so there may be times when we don't adhere perfectly to the spec.
There are five surface areas that may be impacted on any given release. They are:
- [CLI](https://microsoft.github.io/graphrag/cli/) - The CLI is the interface most project consumers are using. **Changes to the CLI will conform to standard semver.**
- [API](https://github.com/microsoft/graphrag/tree/main/graphrag/api) - The API layer is the primary interface we expect developers to use if they are consuming the project as a library in their own codebases. **Changes to the API layer modules will conform to standard semver.**
- Internals - Any code modules behind the CLI and API layers are considered "internal" and may change at any time without conforming to strict semver. This is intended to give the research team high flexibility to change our underlying implementation rapidly. We are not enforcing access via tightly controlled `__init__.py` files, so please understand that if you utilize modules other than the index or query API, they may break between releases in a non-semver-compliant manner.
- [settings.yaml](https://microsoft.github.io/graphrag/config/yaml/) - The settings.yaml file may have changes made to it as we adjust configurability. **Changes that affect the settings.yml will result in a minor version bump**. `graphrag init` will always emit compatible starter config, so we recommend always running the command when updating GraphRAG between minor versions, and copying your endpoint information or other customizations over to the new file.
- [Data model](https://microsoft.github.io/graphrag/index/outputs/) - The output data model may change over time as we adjust our approach. **Changes to the data model will conform to standard semver.** Any changes to the output tables will be shimmed for backwards compatibility between major releases, and we'll provide a migration notebook for folks to upgrade without requiring a re-index.
> TL;DR: Always run `graphrag init --root [path] --force` between minor version bumps to ensure you have the latest config format. Run the provided migration notebook between major version bumps if you want to avoid re-indexing prior datasets. Note that this will overwrite your configuration and prompts, so backup if necessary.
# v2
Run the [migration notebook](./docs/examples_notebooks/index_migration_to_v2.ipynb) to convert older tables to the v2 format.
The v2 release renamed all of our index tables to simply name the items each table contains. The previous naming was a leftover requirement of our use of DataShaper, which is no longer necessary.
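For orientation, a couple of illustrative examples of that renaming, using the pre-2.0 `create_final_*` table names referenced later in this document (this is not the exhaustive list):

```text
create_final_entities.parquet       -> entities.parquet
create_final_relationships.parquet  -> relationships.parquet
create_final_communities.parquet    -> communities.parquet
```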
# v1
Run the [migration notebook](./docs/examples_notebooks/index_migration_to_v1.ipynb) to convert older tables to the v1 format.
Note that one of the new requirements is that we write embeddings to a vector store during indexing. By default, this uses a local lancedb instance. When you re-generate the default config, a block will be added to reflect this. If you need to write to Azure AI Search instead, we recommend updating these settings before you index, so you don't need to do a separate vector ingest.
All of the breaking changes listed below are accounted for in the four steps above.
## Updated data model
- We have streamlined the data model of the index in a few small ways to align tables more consistently and remove redundant content. Notably:
- Consistent use of `id` and `human_readable_id` across all tables; this also ensures all int IDs are actually saved as ints and never strings
- Alignment of fields from `create_final_entities` (such as name -> title) with `create_final_nodes`, and removal of redundant content across these tables
- Rename of `document.raw_content` to `document.text`
- Rename of `entity.name` to `entity.title`
- Rename `rank` to `combined_degree` in `create_final_relationships` and removal of `source_degree` and `target_degree` fields
- Fixed community tables to use a proper UUID for the `id` field, and retain `community` and `human_readable_id` for the short IDs
- Removal of all embeddings columns from parquet files in favor of direct vector store writes
### Migration
- Run the migration notebook (some recent changes may invalidate existing caches, so migrating the format is cheaper than re-indexing).
## New required Embeddings
### Change
- Added new required embeddings for `DRIFTSearch` and base RAG capabilities.
### Migration
- Run a new index, leveraging existing cache.
## Vector Store required by default
### Change
- Vector store is now required by default for all search methods.
### Migration
- Run `graphrag init` command to generate a new settings.yaml file with the vector store configuration.
- Run a new index, leveraging existing cache.
## Deprecate timestamp paths
### Change
- Remove support for timestamp paths, those using `${timestamp}` directory nesting.
- Use the same directory for storage output and reporting output.
### Migration
- Ensure output directories no longer use `${timestamp}` directory nesting.
**Using Environment Variables**
- Ensure `GRAPHRAG_STORAGE_BASE_DIR` is set to a static directory, e.g., `output` instead of `output/${timestamp}/artifacts`.
- Ensure `GRAPHRAG_REPORTING_BASE_DIR` is set to a static directory, e.g., `output` instead of `output/${timestamp}/reports`
[Full docs on using environment variables for configuration](https://microsoft.github.io/graphrag/config/env_vars/).
**Using Configuration File**
```yaml
# rest of settings.yaml file
# ...
storage:
type: file
base_dir: "output" # changed from "output/${timestamp}/artifacts"
reporting:
type: file
base_dir: "output" # changed from "output/${timestamp}/reports"
```
[Full docs on using YAML files for configuration](https://microsoft.github.io/graphrag/config/yaml/).

View File

@ -33,17 +33,11 @@ cosmosdb
Hnsw
odata
# NLP Terms
# NLTK Terms
chunker
wordnet
maxent
punkt
punct
lemmatizer
PROPN
Syntatic
ents
INTJ
# Libraries
Langchain
@ -78,10 +72,6 @@ semversioner
mkdocs
fnllm
typer
spacy
kwargs
ollama
litellm
# Library Methods
iterrows
@ -103,9 +93,6 @@ itertuples
isin
nocache
nbconvert
levelno
acompletion
aembedding
# HTML
nbsp
@ -127,7 +114,10 @@ unhot
groupby
retryer
agenerate
aembed
dedupe
dropna
dtypes
notna
# LLM Terms
@ -135,8 +125,6 @@ AOAI
embedder
llm
llms
achat
aembed
# Galaxy-Brain Terms
Unipartite
@ -190,26 +178,13 @@ Verdantis's
# English
skippable
upvote
unconfigured
# Misc
Arxiv
kwds
jsons
txts
byog
# Dulce
astrotechnician
epitheg
unspooled
unnavigated
# Names
Hochul
Ashish
#unified-search
apos
dearmor
venv

View File

@ -1,18 +1,12 @@
# Default Configuration Mode (using Env Vars)
As of version 1.3, GraphRAG no longer supports a full complement of pre-built environment variables. Instead, we support variable replacement within the [settings.yml file](yaml.md) so you can specify any environment variables you like.
The only standard environment variable we expect, and include in the default settings.yml, is `GRAPHRAG_API_KEY`. If you are already using a number of the previous GRAPHRAG_* environment variables, you can insert them with template syntax into settings.yml and they will be adopted.
> **The environment variables below are documented as an aid for migration, but they WILL NOT be read unless you use template syntax in your settings.yml. We also WILL NOT be updating this page as the main config object changes.**
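As a minimal sketch of that template syntax (the `llm` block and key names follow the default settings.yml of this era; treat the exact placement as illustrative):

```yaml
llm:
  api_key: ${GRAPHRAG_API_KEY}   # the one standard variable included in the default settings.yml
  api_base: ${GRAPHRAG_API_BASE} # any other environment variable can be referenced the same way
```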
---
### Text-Embeddings Customization
## Text-Embeddings Customization
By default, the GraphRAG indexer will only export embeddings required for our query methods. However, the model has embeddings defined for all plaintext fields, and these can be generated by setting the `GRAPHRAG_EMBEDDING_TARGET` environment variable to `all`.
#### Embedded Fields
If the embedding target is `all`, and you want to only embed a subset of these fields, you may specify which embeddings to skip using the `GRAPHRAG_EMBEDDING_SKIP` argument described below.
### Embedded Fields
- `text_unit.text`
- `document.text`
@ -23,23 +17,23 @@ By default, the GraphRAG indexer will only export embeddings required for our qu
- `community.summary`
- `community.full_content`
### Input Data
## Input Data
Our pipeline can ingest .csv or .txt data from an input folder. These files can be nested within subfolders. To configure how input data is handled, what fields are mapped over, and how timestamps are parsed, look for configuration values starting with `GRAPHRAG_INPUT_` below. In general, CSV-based data provides the most customizability. Each CSV should at least contain a `text` field (which can be mapped with environment variables), but it's helpful if they also have `title`, `timestamp`, and `source` fields. Additional fields can be included as well, which will land as extra fields on the `Document` table.
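For example, a tiny illustrative CSV using the fields mentioned above (the values are made up):

```csv
title,text,timestamp,source
"Launch notes","The rollout completed without incident...","2024-01-05T10:00:00Z","ops-wiki"
```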
### Base LLM Settings
## Base LLM Settings
These are the primary settings for configuring LLM connectivity.
| Parameter | Required? | Description | Type | Default Value |
| --------------------------- | ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- | ----- | ------------- |
| `GRAPHRAG_API_KEY` | **Yes for OpenAI. Optional for AOAI** | The API key. (Note: `OPENAI_API_KEY` is also used as a fallback). If not defined when using AOAI, managed identity will be used. | `str` | `None` |
| `GRAPHRAG_API_BASE` | **For AOAI** | The API Base URL | `str` | `None` |
| `GRAPHRAG_API_VERSION` | **For AOAI** | The AOAI API version. | `str` | `None` |
| `GRAPHRAG_API_ORGANIZATION` | | The AOAI organization. | `str` | `None` |
| `GRAPHRAG_API_PROXY` | | The AOAI proxy. | `str` | `None` |
| Parameter | Required? | Description | Type | Default Value |
| --------------------------- | ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | ----- | ------------- |
| `GRAPHRAG_API_KEY` | **Yes for OpenAI. Optional for AOAI** | The API key. (Note: `OPENAI_API_KEY` is also used as a fallback). If not defined when using AOAI, managed identity will be used. | `str` | `None` |
| `GRAPHRAG_API_BASE` | **For AOAI** | The API Base URL | `str` | `None` |
| `GRAPHRAG_API_VERSION` | **For AOAI** | The AOAI API version. | `str` | `None` |
| `GRAPHRAG_API_ORGANIZATION` | | The AOAI organization. | `str` | `None` |
| `GRAPHRAG_API_PROXY` | | The AOAI proxy. | `str` | `None` |
### Text Generation Settings
## Text Generation Settings
These settings control the text generation model used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.
@ -68,7 +62,7 @@ These settings control the text generation model used by the pipeline. Any setti
| `GRAPHRAG_LLM_TOP_P` | | The top_p to use for sampling. | `float` | 1 |
| `GRAPHRAG_LLM_N` | | The number of responses to generate. | `int` | 1 |
### Text Embedding Settings
## Text Embedding Settings
These settings control the text embedding model used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.
@ -84,7 +78,8 @@ These settings control the text embedding model used by the pipeline. Any settin
| `GRAPHRAG_EMBEDDING_MODEL` | | The model to use for the embedding client. | `str` | `text-embedding-3-small` |
| `GRAPHRAG_EMBEDDING_BATCH_SIZE` | | The number of texts to embed at once. [(Azure limit is 16)](https://learn.microsoft.com/en-us/azure/ai-ce) | `int` | 16 |
| `GRAPHRAG_EMBEDDING_BATCH_MAX_TOKENS` | | The maximum tokens per batch [(Azure limit is 8191)](https://learn.microsoft.com/en-us/azure/ai-services/openai/reference) | `int` | 8191 |
| `GRAPHRAG_EMBEDDING_TARGET` | | The target fields to embed. Either `required` or `all`. | `str` | `required` | |
| `GRAPHRAG_EMBEDDING_TARGET` | | The target fields to embed. Either `required` or `all`. | `str` | `required` |
| `GRAPHRAG_EMBEDDING_SKIP` | | A comma-separated list of fields to skip embeddings for (e.g. 'relationship.description'). | `str` | `None` |
| `GRAPHRAG_EMBEDDING_THREAD_COUNT` | | The number of threads to use for parallelization for embeddings. | `int` | |
| `GRAPHRAG_EMBEDDING_THREAD_STAGGER` | | The time to wait (in seconds) between starting each thread for embeddings. | `float` | 50 |
| `GRAPHRAG_EMBEDDING_CONCURRENT_REQUESTS` | | The number of concurrent requests to allow for the embedding client. | `int` | 25 |
@ -94,38 +89,41 @@ These settings control the text embedding model used by the pipeline. Any settin
| `GRAPHRAG_EMBEDDING_MAX_RETRY_WAIT` | | The maximum number of seconds to wait between retries. | `int` | 10 |
| `GRAPHRAG_EMBEDDING_SLEEP_ON_RATE_LIMIT_RECOMMENDATION` | | Whether to sleep on rate limit recommendation. (Azure Only) | `bool` | `True` |
### Input Settings
## Input Settings
These settings control the data input used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.
#### Plaintext Input Data (`GRAPHRAG_INPUT_FILE_TYPE`=text)
### Plaintext Input Data (`GRAPHRAG_INPUT_FILE_TYPE`=text)
| Parameter | Description | Type | Required or Optional | Default |
| ----------------------------- | --------------------------------------------------------------------------------- | ----- | -------------------- | ---------- |
| `GRAPHRAG_INPUT_FILE_PATTERN` | The file pattern regexp to use when reading input files from the input directory. | `str` | optional | `.*\.txt$` |
#### CSV Input Data (`GRAPHRAG_INPUT_FILE_TYPE`=csv)
### CSV Input Data (`GRAPHRAG_INPUT_FILE_TYPE`=csv)
| Parameter | Description | Type | Required or Optional | Default |
| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- | -------------------- | ---------- |
| `GRAPHRAG_INPUT_TYPE` | The input storage type to use when reading files. (`file` or `blob`) | `str` | optional | `file` |
| `GRAPHRAG_INPUT_FILE_PATTERN` | The file pattern regexp to use when reading input files from the input directory. | `str` | optional | `.*\.txt$` |
| `GRAPHRAG_INPUT_SOURCE_COLUMN` | The 'source' column to use when reading CSV input files. | `str` | optional | `source` |
| `GRAPHRAG_INPUT_TIMESTAMP_COLUMN` | The 'timestamp' column to use when reading CSV input files. | `str` | optional | `None` |
| `GRAPHRAG_INPUT_TIMESTAMP_FORMAT` | The timestamp format to use when parsing timestamps in the timestamp column. | `str` | optional | `None` |
| `GRAPHRAG_INPUT_TEXT_COLUMN` | The 'text' column to use when reading CSV input files. | `str` | optional | `text` |
| `GRAPHRAG_INPUT_METADATA` | A list of CSV columns, comma-separated, to incorporate as JSON in a metadata column. | `str` | optional | `None` |
| `GRAPHRAG_INPUT_DOCUMENT_ATTRIBUTE_COLUMNS` | A list of CSV columns, comma-separated, to incorporate as document fields. | `str` | optional | `id` |
| `GRAPHRAG_INPUT_TITLE_COLUMN` | The 'title' column to use when reading CSV input files. | `str` | optional | `title` |
| `GRAPHRAG_INPUT_STORAGE_ACCOUNT_BLOB_URL` | The Azure Storage blob endpoint to use when in `blob` mode and using managed identity. Will have the format `https://<storage_account_name>.blob.core.windows.net` | `str` | optional | `None` |
| `GRAPHRAG_INPUT_CONNECTION_STRING` | The connection string to use when reading CSV input files from Azure Blob Storage. | `str` | optional | `None` |
| `GRAPHRAG_INPUT_CONTAINER_NAME` | The container name to use when reading CSV input files from Azure Blob Storage. | `str` | optional | `None` |
| `GRAPHRAG_INPUT_BASE_DIR` | The base directory to read input files from. | `str` | optional | `None` |
### Data Mapping Settings
## Data Mapping Settings
| Parameter | Description | Type | Required or Optional | Default |
| -------------------------- | -------------------------------------------------------- | ----- | -------------------- | ------- |
| `GRAPHRAG_INPUT_FILE_TYPE` | The type of input data, `csv` or `text` | `str` | optional | `text` |
| `GRAPHRAG_INPUT_ENCODING` | The encoding to apply when reading CSV/text input files. | `str` | optional | `utf-8` |
### Data Chunking
## Data Chunking
| Parameter | Description | Type | Required or Optional | Default |
| ------------------------------- | ------------------------------------------------------------------------------------------- | ----- | -------------------- | ----------------------------- |
@ -134,7 +132,7 @@ These settings control the data input used by the pipeline. Any settings with a
| `GRAPHRAG_CHUNK_BY_COLUMNS` | A comma-separated list of document attributes to groupby when performing TextUnit chunking. | `str` | optional | `id` |
| `GRAPHRAG_CHUNK_ENCODING_MODEL` | The encoding model to use for chunking. | `str` | optional | The top-level encoding model. |
### Prompting Overrides
## Prompting Overrides
| Parameter | Description | Type | Required or Optional | Default |
| --------------------------------------------- | ------------------------------------------------------------------------------------------ | -------- | -------------------- | ---------------------------------------------------------------- |
@ -152,7 +150,7 @@ These settings control the data input used by the pipeline. Any settings with a
| `GRAPHRAG_COMMUNITY_REPORTS_PROMPT_FILE` | The community reports extraction prompt to utilize. | `string` | optional | `None` |
| `GRAPHRAG_COMMUNITY_REPORTS_MAX_LENGTH` | The maximum number of tokens to generate per community reports. | `int` | optional | 1500 |
### Storage
## Storage
This section controls the storage mechanism used by the pipeline used for exporting output tables.
@ -164,7 +162,7 @@ This section controls the storage mechanism used by the pipeline used for export
| `GRAPHRAG_STORAGE_CONTAINER_NAME` | The Azure Storage container name to use when in `blob` mode. | `str` | optional | None |
| `GRAPHRAG_STORAGE_BASE_DIR` | The base path to data outputs. | `str` | optional | None |
### Cache
## Cache
This section controls the cache mechanism used by the pipeline. This is used to cache LLM invocation results.
@ -176,19 +174,19 @@ This section controls the cache mechanism used by the pipeline. This is used to
| `GRAPHRAG_CACHE_CONTAINER_NAME` | The Azure Storage container name to use when in `blob` mode. | `str` | optional | None |
| `GRAPHRAG_CACHE_BASE_DIR` | The base path to the cache files. | `str` | optional | None |
### Reporting
## Reporting
This section controls the reporting mechanism used by the pipeline, for common events and error messages. The default is to write reports to a file in the output directory. However, you can also choose to write reports to an Azure Blob Storage container.
This section controls the reporting mechanism used by the pipeline, for common events and error messages. The default is to write reports to a file in the output directory. However, you can also choose to write reports to the console or to an Azure Blob Storage container.
| Parameter | Description | Type | Required or Optional | Default |
| --------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- | -------------------- | ------- |
| `GRAPHRAG_REPORTING_TYPE` | The type of reporter to use. Options are `file` or `blob` | `str` | optional | `file` |
| `GRAPHRAG_REPORTING_TYPE` | The type of reporter to use. Options are `file`, `console`, or `blob` | `str` | optional | `file` |
| `GRAPHRAG_REPORTING_STORAGE_ACCOUNT_BLOB_URL` | The Azure Storage blob endpoint to use when in `blob` mode and using managed identity. Will have the format `https://<storage_account_name>.blob.core.windows.net` | `str` | optional | None |
| `GRAPHRAG_REPORTING_CONNECTION_STRING` | The Azure Storage connection string to use when in `blob` mode. | `str` | optional | None |
| `GRAPHRAG_REPORTING_CONTAINER_NAME` | The Azure Storage container name to use when in `blob` mode. | `str` | optional | None |
| `GRAPHRAG_REPORTING_BASE_DIR` | The base path to the reporting outputs. | `str` | optional | None |
### Node2Vec Parameters
## Node2Vec Parameters
| Parameter | Description | Type | Required or Optional | Default |
| ------------------------------- | ---------------------------------------- | ------ | -------------------- | ------- |
@ -199,7 +197,7 @@ This section controls the reporting mechanism used by the pipeline, for common e
| `GRAPHRAG_NODE2VEC_ITERATIONS` | The number of iterations to run node2vec | `int` | optional | 3 |
| `GRAPHRAG_NODE2VEC_RANDOM_SEED` | The random seed to use for node2vec | `int` | optional | 597832 |
### Data Snapshotting
## Data Snapshotting
| Parameter | Description | Type | Required or Optional | Default |
| -------------------------------------- | ----------------------------------------------- | ------ | -------------------- | ------- |
@ -216,4 +214,5 @@ This section controls the reporting mechanism used by the pipeline, for common e
| `GRAPHRAG_ASYNC_MODE` | Which async mode to use. Either `asyncio` or `threaded`. | `str` | optional | `asyncio` |
| `GRAPHRAG_ENCODING_MODEL` | The text encoding model, used in tiktoken, to encode text. | `str` | optional | `cl100k_base` |
| `GRAPHRAG_MAX_CLUSTER_SIZE` | The maximum number of entities to include in a single Leiden cluster. | `int` | optional | 10 |
| `GRAPHRAG_SKIP_WORKFLOWS` | A comma-separated list of workflow names to skip. | `str` | optional | `None` |
| `GRAPHRAG_UMAP_ENABLED` | Whether to enable UMAP layouts | `bool` | optional | False |

View File

@ -5,13 +5,12 @@ To start using GraphRAG, you must generate a configuration file. The `init` comm
## Usage
```sh
graphrag init [--root PATH] [--force, --no-force]
graphrag init [--root PATH]
```
## Options
- `--root PATH` - The project root directory to initialize graphrag at. Default is the current directory.
- `--force`, `--no-force` - Optional, default is --no-force. Overwrite existing configuration and prompt files if they exist.
## Example
@ -29,4 +28,4 @@ The `init` command will create the following files in the specified directory:
## Next Steps
After initializing your workspace, you can either run the [Prompt Tuning](../prompt_tuning/auto_prompt_tuning.md) command to adapt the prompts to your data or even start running the [Indexing Pipeline](../index/overview.md) to index your data. For more information on configuration options available, see the [YAML details page](yaml.md).
After initializing your workspace, you can either run the [Prompt Tuning](../prompt_tuning/auto_prompt_tuning.md) command to adapt the prompts to your data or even start running the [Indexing Pipeline](../index/overview.md) to index your data. For more information on configuring GraphRAG, see the [Configuration](overview.md) documentation.

View File

@ -1,130 +0,0 @@
# Language Model Selection and Overriding
This page contains information on selecting a model to use and options to supply your own model for GraphRAG. Note that this is not a guide to finding the right model for your use case.
## Default Model Support
GraphRAG was built and tested using OpenAI models, so this is the default model set we support. This is not intended to be a limiter or statement of quality or fitness for your use case, only that it's the set we are most familiar with for prompting, tuning, and debugging.
GraphRAG also utilizes a language model wrapper library used by several projects within our team, called fnllm. fnllm provides two important functions for GraphRAG: rate limiting configuration to help us maximize throughput for large indexing jobs, and robust caching of API calls to minimize consumption on repeated indexes for testing, experimentation, or incremental ingest. fnllm uses the OpenAI Python SDK under the covers, so OpenAI-compliant endpoints are a base requirement out-of-the-box.
Starting with version 2.6.0, GraphRAG supports using [LiteLLM](https://docs.litellm.ai/) instead of fnllm for calling language models. LiteLLM provides support for 100+ models, though note that whichever model you choose must support returning [structured outputs](https://openai.com/index/introducing-structured-outputs-in-the-api/) that adhere to a [JSON schema](https://docs.litellm.ai/docs/completion/json_mode).
Example using LiteLLM as the language model tool for GraphRAG:
```yaml
models:
default_chat_model:
type: chat
auth_type: api_key
api_key: ${GEMINI_API_KEY}
model_provider: gemini
model: gemini-2.5-flash-lite
default_embedding_model:
type: embedding
auth_type: api_key
api_key: ${GEMINI_API_KEY}
model_provider: gemini
model: gemini-embedding-001
```
To use LiteLLM, one must:
- Set `type` to either `chat` or `embedding`.
- Provide a `model_provider`, e.g., `openai`, `azure`, `gemini`, etc.
- Set the `model` to one supported by the `model_provider`'s API.
- Provide a `deployment_name` if using `azure` as the `model_provider`.
See [Detailed Configuration](yaml.md) for more details on configuration. [View LiteLLM basic usage](https://docs.litellm.ai/docs/#basic-usage) for details on how models are called (the `model_provider` is the portion prior to `/`, while the `model` is the portion following the `/`).
## Model Selection Considerations
GraphRAG has been most thoroughly tested with the gpt-4 series of models from OpenAI, including gpt-4, gpt-4-turbo, gpt-4o, and gpt-4o-mini. Our [arXiv paper](https://arxiv.org/abs/2404.16130), for example, performed quality evaluation using gpt-4-turbo. As stated above, non-OpenAI models are supported from GraphRAG 2.6.0 onwards through the use of LiteLLM, but the gpt-4 series from OpenAI remains the most tested and supported set of models for GraphRAG.
Versions of GraphRAG before 2.2.0 made extensive use of `max_tokens` and `logit_bias` to control generated response length or content. The introduction of the o-series of models added new, non-compatible parameters because these models include a reasoning component that has different consumption patterns and response generation attributes than non-reasoning models. GraphRAG 2.2.0 now supports these models, but there are important differences that need to be understood before you switch.
- Previously, GraphRAG used `max_tokens` to limit responses in a few locations. This is done so that we can have predictable content sizes when building downstream context windows for summarization. We have now switched from using `max_tokens` to use a prompted approach, which is working well in our tests. We suggest using `max_tokens` in your language model config only for budgetary reasons if you want to limit consumption, and not for expected response length control. We now also support the o-series equivalent `max_completion_tokens`, but if you use this keep in mind that there may be some unknown fixed reasoning consumption amount in addition to the response tokens, so it is not a good technique for response control.
- Previously, GraphRAG used a combination of `max_tokens` and `logit_bias` to strictly control a binary yes/no question during gleanings. This is not possible with reasoning models, so again we have switched to a prompted approach. Our tests with gpt-4o, gpt-4o-mini, and o1 show that this works consistently, but could have issues if you have an older or smaller model.
- The o-series models are much slower and more expensive. It may be useful to use an asymmetric approach to model use in your config: you can define as many models as you like in the `models` block of your settings.yaml and reference them by key for every workflow that requires a language model. You could use gpt-4o for indexing and o1 for query, for example. Experiment to find the right balance of cost, speed, and quality for your use case.
- The o-series models contain a form of native chain-of-thought reasoning that is absent in the non-o-series models. GraphRAG's prompts sometimes contain CoT because it was an effective technique with the gpt-4* series. It may be counterproductive with the o-series, so you may want to tune or even re-write large portions of the prompt templates (particularly for graph and claim extraction).
Example config with asymmetric model use:
```yaml
models:
extraction_chat_model:
api_key: ${GRAPHRAG_API_KEY}
type: openai_chat
auth_type: api_key
model: gpt-4o
model_supports_json: true
query_chat_model:
api_key: ${GRAPHRAG_API_KEY}
type: openai_chat
auth_type: api_key
model: o1
model_supports_json: true
...
extract_graph:
model_id: extraction_chat_model
prompt: "prompts/extract_graph.txt"
entity_types: [organization,person,geo,event]
max_gleanings: 1
...
global_search:
chat_model_id: query_chat_model
map_prompt: "prompts/global_search_map_system_prompt.txt"
reduce_prompt: "prompts/global_search_reduce_system_prompt.txt"
knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt"
```
Another option would be to avoid using a language model at all for the graph extraction, instead using the `fast` [indexing method](../index/methods.md) that uses NLP for portions of the indexing phase in lieu of LLM APIs.
## Using Non-OpenAI Models
As shown above, non-OpenAI models may be used via LiteLLM starting with GraphRAG version 2.6.0, but you may still wish to use a model that LiteLLM does not support. There are two approaches you can use to connect to unsupported models:
### Proxy APIs
Many users have used platforms such as [ollama](https://ollama.com/) and [LiteLLM Proxy Server](https://docs.litellm.ai/docs/simple_proxy) to proxy the underlying model HTTP calls to a different model provider. This seems to work reasonably well, but we frequently see issues with malformed responses (especially JSON), so if you do this please understand that your model needs to reliably return the specific response formats that GraphRAG expects. If you're having trouble with a model, you may need to try prompting to coax the format, or intercepting the response within your proxy to try and handle malformed responses.
### Model Protocol
As of GraphRAG 2.0.0, we support model injection through the use of a standard chat and embedding Protocol and an accompanying ModelFactory that you can use to register your model implementation. This is not supported with the CLI, so you'll need to use GraphRAG as a library.
- Our Protocol is [defined here](https://github.com/microsoft/graphrag/blob/main/graphrag/language_model/protocol/base.py)
- Our base implementation, which wraps fnllm, [is here](https://github.com/microsoft/graphrag/blob/main/graphrag/language_model/providers/fnllm/models.py)
- We have a simple mock implementation in our tests that you can [reference here](https://github.com/microsoft/graphrag/blob/main/tests/mock_provider.py)
Once you have a model implementation, you need to register it with our ModelFactory:
```python
class MyCustomModel:
...
# implementation
# elsewhere...
ModelFactory.register_chat("my-custom-chat-model", lambda **kwargs: MyCustomModel(**kwargs))
```
Then in your config you can reference the type name you used:
```yaml
models:
default_chat_model:
type: my-custom-chat-model
extract_graph:
model_id: default_chat_model
prompt: "prompts/extract_graph.txt"
entity_types: [organization,person,geo,event]
max_gleanings: 1
```
Note that your custom model will be passed the same params for init and method calls that we use throughout GraphRAG. There is not currently any ability to define custom parameters, so you may need to use closure scope or a factory pattern within your implementation to get custom config values.

View File

@ -4,8 +4,8 @@ The GraphRAG system is highly configurable. This page provides an overview of th
## Default Configuration Mode
The default configuration mode is the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. The main ways to set up GraphRAG in Default Configuration mode are via:
The default configuration mode is the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. The primary configuration sections for the Indexing Engine pipelines are described below. The main ways to set up GraphRAG in Default Configuration mode are via:
- [Init command](init.md) (recommended first step)
- [Edit settings.yaml for deeper control](yaml.md)
- [Purely using environment variables](env_vars.md) (not recommended)
- [Init command](init.md) (recommended)
- [Using YAML for deeper control](yaml.md)
- [Purely using environment variables](env_vars.md)

View File

@ -17,285 +17,200 @@ llm:
# Config Sections
## Language Model Setup
## Indexing
### models
### llm
This is a dict of model configurations. The dict key is used to reference this configuration elsewhere when a model instance is desired. In this way, you can specify as many different models as you need, and reference them individually in the workflow steps.
For example:
```yml
models:
default_chat_model:
api_key: ${GRAPHRAG_API_KEY}
type: openai_chat
model: gpt-4o
model_supports_json: true
default_embedding_model:
api_key: ${GRAPHRAG_API_KEY}
type: openai_embedding
model: text-embedding-ada-002
```
This is the base LLM configuration section. Other steps may override this configuration with their own LLM configuration.
#### Fields
- `api_key` **str** - The OpenAI API key to use.
- `auth_type` **api_key|azure_managed_identity** - Indicate how you want to authenticate requests.
- `type` **chat**|**embedding**|**openai_chat|azure_openai_chat|openai_embedding|azure_openai_embedding|mock_chat|mock_embeddings** - The type of LLM to use.
- `model_provider` **str|None** - The model provider to use, e.g., openai, azure, anthropic, etc. Required when `type == chat|embedding`. When `type == chat|embedding`, [LiteLLM](https://docs.litellm.ai/) is used under the hood which has support for calling 100+ models. [View LiteLLm basic usage](https://docs.litellm.ai/docs/#basic-usage) for details on how models are called (The `model_provider` is the portion prior to `/` while the `model` is the portion following the `/`). [View Language Model Selection](models.md) for more details and examples on using LiteLLM.
- `type` **openai_chat|azure_openai_chat|openai_embedding|azure_openai_embedding** - The type of LLM to use.
- `model` **str** - The model name.
- `encoding_model` **str** - The text encoding model to use. Default is to use the encoding model aligned with the language model (i.e., it is retrieved from tiktoken if unset).
- `max_tokens` **int** - The maximum number of output tokens.
- `request_timeout` **float** - The per-request timeout.
- `api_base` **str** - The API base url to use.
- `api_version` **str** - The API version.
- `deployment_name` **str** - The deployment name to use (Azure).
- `api_version` **str** - The API version
- `organization` **str** - The client organization.
- `proxy` **str** - The proxy URL to use.
- `audience` **str** - (Azure OpenAI only) The URI of the target Azure resource/service for which a managed identity token is requested. Used if `api_key` is not defined. Default=`https://cognitiveservices.azure.com/.default`
- `deployment_name` **str** - The deployment name to use (Azure).
- `model_supports_json` **bool** - Whether the model supports JSON-mode output.
- `request_timeout` **float** - The per-request timeout.
- `tokens_per_minute` **int** - Set a leaky-bucket throttle on tokens-per-minute.
- `requests_per_minute` **int** - Set a leaky-bucket throttle on requests-per-minute.
- `retry_strategy` **str** - Retry strategy to use, "native" is the default and uses the strategy built into the OpenAI SDK. Other allowable values include "exponential_backoff", "random_wait", and "incremental_wait".
- `max_retries` **int** - The maximum number of retries to use.
- `max_retry_wait` **float** - The maximum backoff time.
- `sleep_on_rate_limit_recommendation` **bool** - Whether to adhere to sleep recommendations (Azure).
- `concurrent_requests` **int** The number of open requests to allow at once.
- `async_mode` **asyncio|threaded** The async mode to use. Either `asyncio` or `threaded`.
- `responses` **list[str]** - If this model type is mock, this is a list of response strings to return.
- `temperature` **float** - The temperature to use.
- `top_p` **float** - The top-p value to use.
- `n` **int** - The number of completions to generate.
- `max_tokens` **int** - The maximum number of output tokens. Not valid for o-series models.
- `temperature` **float** - The temperature to use. Not valid for o-series models.
- `top_p` **float** - The top-p value to use. Not valid for o-series models.
- `frequency_penalty` **float** - Frequency penalty for token generation. Not valid for o-series models.
- `presence_penalty` **float** - Presence penalty for token generation. Not valid for o-series models.
- `max_completion_tokens` **int** - Max number of tokens to consume for chat completion. Must be large enough to include an unknown amount for "reasoning" by the model. o-series models only.
- `reasoning_effort` **low|medium|high** - Amount of "thought" for the model to expend reasoning about a response. o-series models only.
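As a minimal sketch of how the o-series-only fields fit together, the following model entry (the key name and numeric values are illustrative assumptions, using only fields documented above) budgets completion tokens and reasoning effort in place of the sampling parameters that o-series models do not accept:

```yml
models:
  reasoning_chat_model:
    api_key: ${GRAPHRAG_API_KEY}
    type: openai_chat
    auth_type: api_key
    model: o1
    model_supports_json: true
    max_completion_tokens: 8000
    reasoning_effort: medium
    concurrent_requests: 25
```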
## Input Files and Chunking
### input
Our pipeline can ingest .csv, .txt, or .json data from an input location. See the [inputs page](../index/inputs.md) for more details and examples.
### parallelization
#### Fields
- `storage` **StorageConfig**
- `type` **file|blob|cosmosdb** - The storage type to use. Default=`file`
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string.
- `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
- `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.
- `file_type` **text|csv|json** - The type of input data to load. Default is `text`
- `encoding` **str** - The encoding of the input file. Default is `utf-8`
- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$`, `.*\.txt$`, or `.*\.json$` depending on the specified `file_type`, but you can customize it if needed.
- `stagger` **float** - The threading stagger value.
- `num_threads` **int** - The maximum number of work threads.
### async_mode
**asyncio|threaded** The async mode to use. Either `asyncio` or `threaded`.
### embeddings
#### Fields
- `llm` (see LLM top-level config)
- `parallelization` (see Parallelization top-level config)
- `async_mode` (see Async Mode top-level config)
- `batch_size` **int** - The maximum batch size to use.
- `batch_max_tokens` **int** - The maximum batch # of tokens.
- `target` **required|all|none** - Determines which set of embeddings to export.
- `skip` **list[str]** - Which embeddings to skip. Only useful if target=all to customize the list.
- `vector_store` **dict** - The vector store to use. Configured for lancedb by default.
- `type` **str** - `lancedb` or `azure_ai_search`. Default=`lancedb`
- `db_uri` **str** (only for lancedb) - The database uri. Default=`storage.base_dir/lancedb`
- `url` **str** (only for AI Search) - AI Search endpoint
- `api_key` **str** (optional - only for AI Search) - The AI Search api key to use.
- `audience` **str** (only for AI Search) - Audience for managed identity token if managed identity authentication is used.
- `overwrite` **bool** (only used at index creation time) - Overwrite collection if it exists. Default=`True`
- `container_name` **str** - The name of a vector container. This stores all indexes (tables) for a given dataset ingest. Default=`default`
- `strategy` **dict** - Fully override the text-embedding strategy.
### input
#### Fields
- `type` **file|blob** - The input type to use. Default=`file`
- `file_type` **text|csv** - The type of input data to load. Either `text` or `csv`. Default is `text`
- `base_dir` **str** - The base directory to read input from, relative to the root.
- `connection_string` **str** - (blob only) The Azure Storage connection string.
- `storage_account_blob_url` **str** - The storage account blob URL to use.
- `container_name` **str** - (blob only) The Azure Storage container name.
- `file_encoding` **str** - The encoding of the input file. Default is `utf-8`
- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$` if in csv mode and `.*\.txt$` if in text mode.
- `file_filter` **dict** - Key/value pairs to filter. Default is None.
- `text_column` **str** - (CSV/JSON only) The text column name. If unset we expect a column named `text`.
- `title_column` **str** - (CSV/JSON only) The title column name, filename will be used if unset.
- `metadata` **list[str]** - (CSV/JSON only) The additional document attributes fields to keep.
- `source_column` **str** - (CSV Mode Only) The source column name.
- `timestamp_column` **str** - (CSV Mode Only) The timestamp column name.
- `timestamp_format` **str** - (CSV Mode Only) The source format.
- `text_column` **str** - (CSV Mode Only) The text column name.
- `title_column` **str** - (CSV Mode Only) The title column name.
- `document_attribute_columns` **list[str]** - (CSV Mode Only) The additional document attributes to include.
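As a minimal sketch, assuming a folder of CSV files with `text` and `title` columns (the values are illustrative, and the exact field nesting may differ slightly between GraphRAG versions):

```yml
input:
  type: file
  file_type: csv
  base_dir: input
  file_encoding: utf-8
  file_pattern: '.*\.csv$'
  text_column: text
  title_column: title
```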
### chunks
These settings configure how we parse documents into text chunks. This is necessary because very large documents may not fit into a single context window, and graph extraction accuracy can be modulated. Also note the `metadata` setting in the input document config, which will replicate document metadata into each chunk.
#### Fields
- `size` **int** - The max chunk size in tokens.
- `overlap` **int** - The chunk overlap in tokens.
- `group_by_columns` **list[str]** - Group documents by these fields before chunking.
- `strategy` **tokens|sentences** - How to chunk the text.
- `encoding_model` **str** - The text encoding model to use for splitting on token boundaries.
- `prepend_metadata` **bool** - Determines if metadata values should be added at the beginning of each chunk. Default=`False`.
- `chunk_size_includes_metadata` **bool** - Specifies whether the chunk size calculation should include metadata tokens. Default=`False`.
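A minimal sketch of a chunking block using the fields above (the token sizes are illustrative):

```yml
chunks:
  size: 1200
  overlap: 100
  group_by_columns: [id]
  strategy: tokens
  prepend_metadata: false
  chunk_size_includes_metadata: false
```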
## Outputs and Storage
### output
This section controls the storage mechanism the pipeline uses to export output tables.
#### Fields
- `type` **file|memory|blob|cosmosdb** - The storage type to use. Default=`file`
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string.
- `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
- `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.
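A minimal sketch of a blob-backed output block; the account and container names are placeholders, and `storage_account_blob_url` is shown here with `connection_string` as the alternative listed above:

```yml
output:
  type: blob
  base_dir: output
  container_name: graphrag-output
  storage_account_blob_url: https://<storage-account>.blob.core.windows.net
```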
### update_index_output
This section defines a secondary storage location used when running incremental indexing, so that your original outputs are preserved.
#### Fields
- `type` **file|memory|blob|cosmosdb** - The storage type to use. Default=`file`
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string.
- `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
- `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.
- `group_by_columns` **list[str]** - group documents by fields before chunking.
- `encoding_model` **str** - The text encoding model to use. Default is to use the top-level encoding model.
- `strategy` **dict** - Fully override the chunking strategy.
### cache
This section controls the cache mechanism used by the pipeline. This is used to cache LLM invocation results for faster performance when re-running the indexing process.
#### Fields
- `type` **file|memory|none|blob** - The cache type to use. Default=`file`
- `connection_string` **str** - (blob only) The Azure Storage connection string.
- `container_name` **str** - (blob only) The Azure Storage container name.
- `base_dir` **str** - The base directory to write cache to, relative to the root.
- `storage_account_blob_url` **str** - The storage account blob URL to use.
### storage
#### Fields
- `type` **file|memory|blob|cosmosdb** - The storage type to use. Default=`file`
- `type` **file|memory|blob** - The storage type to use. Default=`file`
- `connection_string` **str** - (blob only) The Azure Storage connection string.
- `container_name` **str** - (blob only) The Azure Storage container name.
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string.
- `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
- `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.
- `storage_account_blob_url` **str** - The storage account blob URL to use.
### update_index_storage
#### Fields
- `type` **file|memory|blob** - The storage type to use. Default=`file`
- `connection_string` **str** - (blob only) The Azure Storage connection string.
- `container_name` **str** - (blob only) The Azure Storage container name.
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `storage_account_blob_url` **str** - The storage account blob URL to use.
### reporting
This section controls the reporting mechanism used by the pipeline, for common events and error messages. The default is to write reports to a file in the output directory. However, you can also choose to write reports to an Azure Blob Storage container.
#### Fields
- `type` **file|blob** - The reporting type to use. Default=`file`
- `base_dir` **str** - The base directory to write reports to, relative to the root.
- `type` **file|console|blob** - The reporting type to use. Default=`file`
- `connection_string` **str** - (blob only) The Azure Storage connection string.
- `container_name` **str** - (blob only) The Azure Storage container name.
- `base_dir` **str** - The base directory to write reports to, relative to the root.
- `storage_account_blob_url` **str** - The storage account blob URL to use.
### vector_store
Where to put all vectors for the system. Configured for lancedb by default. This is a dict, with the key used to identify individual store parameters (e.g., for text embedding).
### entity_extraction
#### Fields
- `type` **lancedb|azure_ai_search|cosmosdb** - Type of vector store. Default=`lancedb`
- `db_uri` **str** (only for lancedb) - The database uri. Default=`storage.base_dir/lancedb`
- `url` **str** (only for AI Search) - AI Search endpoint
- `api_key` **str** (optional - only for AI Search) - The AI Search api key to use.
- `audience` **str** (only for AI Search) - Audience for managed identity token if managed identity authentication is used.
- `container_name` **str** - The name of a vector container. This stores all indexes (tables) for a given dataset ingest. Default=`default`
- `database_name` **str** - (cosmosdb only) Name of the database.
- `overwrite` **bool** (only used at index creation time) - Overwrite collection if it exists. Default=`True`
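A minimal sketch of the default lancedb setup under this dict (the `default_vector_store` key mirrors the name used in the example notebooks later in this document; the uri is illustrative):

```yml
vector_store:
  default_vector_store:
    type: lancedb
    db_uri: output/lancedb
    container_name: default
    overwrite: true
```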
## Workflow Configurations
These settings control each individual workflow as they execute.
### workflows
**list[str]** - This is a list of workflow names to run, in order. GraphRAG has built-in pipelines to configure this, but you can run exactly and only what you want by specifying the list here. Useful if you have done part of the processing yourself.
### embed_text
By default, the GraphRAG indexer will only export embeddings required for our query methods. However, the model has embeddings defined for all plaintext fields, and these can be customized by setting the `target` and `names` fields.
Supported embeddings names are:
- `text_unit.text`
- `document.text`
- `entity.title`
- `entity.description`
- `relationship.description`
- `community.title`
- `community.summary`
- `community.full_content`
#### Fields
- `model_id` **str** - Name of the model definition to use for text embedding.
- `vector_store_id` **str** - Name of vector store definition to write to.
- `batch_size` **int** - The maximum batch size to use.
- `batch_max_tokens` **int** - The maximum batch # of tokens.
- `names` **list[str]** - List of the embeddings names to run (must be in supported list).
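A minimal sketch that points text embedding at model and vector store definitions named earlier on this page and requests an illustrative subset of the supported embedding names:

```yml
embed_text:
  model_id: default_embedding_model
  vector_store_id: default_vector_store
  names:
    - entity.description
    - community.summary
    - text_unit.text
```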
### extract_graph
Tune the language model-based graph extraction process.
#### Fields
- `model_id` **str** - Name of the model definition to use for API calls.
- `llm` (see LLM top-level config)
- `parallelization` (see Parallelization top-level config)
- `async_mode` (see Async Mode top-level config)
- `prompt` **str** - The prompt file to use.
- `entity_types` **list[str]** - The entity types to identify.
- `max_gleanings` **int** - The maximum number of gleaning cycles to use.
- `encoding_model` **str** - The text encoding model to use. By default, this will use the top-level encoding model.
- `strategy` **dict** - Fully override the entity extraction strategy.
### summarize_descriptions
#### Fields
- `model_id` **str** - Name of the model definition to use for API calls.
- `llm` (see LLM top-level config)
- `parallelization` (see Parallelization top-level config)
- `async_mode` (see Async Mode top-level config)
- `prompt` **str** - The prompt file to use.
- `max_length` **int** - The maximum number of output tokens per summarization.
- `max_input_length` **int** - The maximum number of tokens to collect for summarization (this will limit how many descriptions you send to be summarized for a given entity or relationship).
- `strategy` **dict** - Fully override the summarize description strategy.
### extract_graph_nlp
Defines settings for NLP-based graph extraction methods.
#### Fields
- `normalize_edge_weights` **bool** - Whether to normalize the edge weights during graph construction. Default=`True`.
- `text_analyzer` **dict** - Parameters for the NLP model.
- `extractor_type` **regex_english|syntactic_parser|cfg** - Default=`regex_english`.
- `model_name` **str** - Name of NLP model (for SpaCy-based models).
- `max_word_length` **int** - Longest word to allow. Default=`15`.
- `word_delimiter` **str** - Delimiter to split words. Default=' '.
- `include_named_entities` **bool** - Whether to include named entities in noun phrases. Default=`True`.
- `exclude_nouns` **list[str] | None** - List of nouns to exclude. If `None`, we use an internal stopword list.
- `exclude_entity_tags` **list[str]** - List of entity tags to ignore.
- `exclude_pos_tags` **list[str]** - List of part-of-speech tags to ignore.
- `noun_phrase_tags` **list[str]** - List of noun phrase tags to ignore.
- `noun_phrase_grammars` **dict[str, str]** - Noun phrase grammars for the model (cfg-only).
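A minimal sketch of an NLP extraction block using the default regex-based analyzer (the values are illustrative):

```yml
extract_graph_nlp:
  normalize_edge_weights: true
  text_analyzer:
    extractor_type: regex_english
    max_word_length: 15
    include_named_entities: true
```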
### prune_graph
Parameters for manual graph pruning. This can be used to optimize the modularity of your graph clusters, by removing overly-connected or rare nodes.
#### Fields
- `min_node_freq` **int** - The minimum node frequency to allow.
- `max_node_freq_std` **float | None** - The maximum standard deviation of node frequency to allow.
- `min_node_degree` **int** - The minimum node degree to allow.
- `max_node_degree_std` **float | None** - The maximum standard deviation of node degree to allow.
- `min_edge_weight_pct` **float** - The minimum edge weight percentile to allow.
- `remove_ego_nodes` **bool** - Remove ego nodes.
- `lcc_only` **bool** - Only use largest connected component.
### cluster_graph
These are the settings used for Leiden hierarchical clustering of the graph to create communities.
#### Fields
- `max_cluster_size` **int** - The maximum cluster size to export.
- `use_lcc` **bool** - Whether to only use the largest connected component.
- `seed` **int** - A randomization seed to provide if consistent run-to-run results are desired. We do provide a default in order to guarantee clustering stability.
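A minimal sketch combining pruning and clustering settings; the thresholds and seed are illustrative, while `max_cluster_size` mirrors the default of 10 noted elsewhere in these docs:

```yml
prune_graph:
  min_node_freq: 2
  min_node_degree: 1
  min_edge_weight_pct: 40
  remove_ego_nodes: false
  lcc_only: false

cluster_graph:
  max_cluster_size: 10
  use_lcc: true
  seed: 42
```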
### extract_claims
### claim_extraction
#### Fields
- `enabled` **bool** - Whether to enable claim extraction. Off by default, because claim prompts really need user tuning.
- `model_id` **str** - Name of the model definition to use for API calls.
- `llm` (see LLM top-level config)
- `parallelization` (see Parallelization top-level config)
- `async_mode` (see Async Mode top-level config)
- `prompt` **str** - The prompt file to use.
- `description` **str** - Describes the types of claims we want to extract.
- `max_gleanings` **int** - The maximum number of gleaning cycles to use.
- `encoding_model` **str** - The text encoding model to use. By default, this will use the top-level encoding model.
- `strategy` **dict** - Fully override the claim extraction strategy.
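A minimal sketch that switches claim extraction on; the prompt path and description string are illustrative assumptions rather than documented defaults:

```yml
extract_claims:
  enabled: true
  model_id: default_chat_model
  prompt: "prompts/extract_claims.txt"
  description: "Any claims or facts that could be relevant to information discovery."
  max_gleanings: 1
```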
### community_reports
#### Fields
- `model_id` **str** - Name of the model definition to use for API calls.
- `llm` (see LLM top-level config)
- `parallelization` (see Parallelization top-level config)
- `async_mode` (see Async Mode top-level config)
- `prompt` **str** - The prompt file to use.
- `max_length` **int** - The maximum number of output tokens per report.
- `max_input_length` **int** - The maximum number of input tokens to use when generating reports.
- `strategy` **dict** - Fully override the community reports strategy.
### cluster_graph
#### Fields
- `max_cluster_size` **int** - The maximum cluster size to export.
- `strategy` **dict** - Fully override the cluster_graph strategy.
### embed_graph
We use node2vec to embed the graph. This is primarily used for visualization, so it is not turned on by default.
#### Fields
- `enabled` **bool** - Whether to enable graph embeddings.
- `dimensions` **int** - Number of vector dimensions to produce.
- `num_walks` **int** - The node2vec number of walks.
- `walk_length` **int** - The node2vec walk length.
- `window_size` **int** - The node2vec window size.
@ -305,8 +220,6 @@ We use node2vec to embed the graph. This is primarily used for visualization, so
### umap
Indicates whether we should run UMAP dimensionality reduction. This is used to provide an x/y coordinate to each graph node, suitable for visualization. If this is not enabled, nodes will receive a 0/0 x/y coordinate. If this is enabled, you *must* enable graph embedding as well.
#### Fields
- `enabled` **bool** - Whether to enable UMAP layouts.
@ -317,6 +230,15 @@ Indicates whether we should run UMAP dimensionality reduction. This is used to p
- `embeddings` **bool** - Export embeddings snapshots to parquet.
- `graphml` **bool** - Export graph snapshots to GraphML.
- `transient` **bool** - Export transient workflow tables snapshots to parquet.
### encoding_model
**str** - The text encoding model to use. Default=`cl100k_base`.
### skip_workflows
**list[str]** - Which workflow names to skip.
## Query
@ -324,48 +246,54 @@ Indicates whether we should run UMAP dimensionality reduction. This is used to p
#### Fields
- `chat_model_id` **str** - Name of the model definition to use for Chat Completion calls.
- `embedding_model_id` **str** - Name of the model definition to use for Embedding calls.
- `prompt` **str** - The prompt file to use.
- `text_unit_prop` **float** - The text unit proportion.
- `community_prop` **float** - The community proportion.
- `conversation_history_max_turns` **int** - The conversation history maximum turns.
- `top_k_entities` **int** - The top k mapped entities.
- `top_k_relationships` **int** - The top k mapped relations.
- `max_context_tokens` **int** - The maximum tokens to use building the request context.
- `temperature` **float | None** - The temperature to use for token generation.
- `top_p` **float | None** - The top-p value to use for token generation.
- `n` **int | None** - The number of completions to generate.
- `max_tokens` **int** - The maximum tokens.
- `llm_max_tokens` **int** - The LLM maximum tokens.
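A minimal sketch, assuming these fields sit under a `local_search` key as with the other query sections (the proportions and limits are illustrative):

```yml
local_search:
  chat_model_id: default_chat_model
  embedding_model_id: default_embedding_model
  text_unit_prop: 0.5
  community_prop: 0.1
  top_k_entities: 10
  top_k_relationships: 10
  max_context_tokens: 12000
```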
### global_search
#### Fields
- `chat_model_id` **str** - Name of the model definition to use for Chat Completion calls.
- `map_prompt` **str** - The mapper prompt file to use.
- `reduce_prompt` **str** - The reducer prompt file to use.
- `knowledge_prompt` **str** - The knowledge prompt file to use.
- `map_prompt` **str | None** - The global search mapper prompt to use.
- `reduce_prompt` **str | None** - The global search reducer to use.
- `knowledge_prompt` **str | None** - The global search general prompt to use.
- `max_context_tokens` **int** - The maximum context size to create, in tokens.
- `data_max_tokens` **int** - The maximum tokens to use when constructing the final response from the reduced responses.
- `map_max_length` **int** - The maximum length to request for map responses, in words.
- `reduce_max_length` **int** - The maximum length to request for reduce responses, in words.
- `temperature` **float | None** - The temperature to use for token generation.
- `top_p` **float | None** - The top-p value to use for token generation.
- `n` **int | None** - The number of completions to generate.
- `max_tokens` **int** - The maximum context size in tokens.
- `data_max_tokens` **int** - The data llm maximum tokens.
- `map_max_tokens` **int** - The map llm maximum tokens.
- `reduce_max_tokens` **int** - The reduce llm maximum tokens.
- `concurrency` **int** - The number of concurrent requests.
- `dynamic_search_llm` **str** - LLM model to use for dynamic community selection.
- `dynamic_search_threshold` **int** - Rating threshold to include a community report.
- `dynamic_search_keep_parent` **bool** - Keep parent community if any of the child communities are relevant.
- `dynamic_search_num_repeats` **int** - Number of times to rate the same community report.
- `dynamic_search_use_summary` **bool** - Use community summary instead of full_context.
- `dynamic_search_concurrent_coroutines` **int** - Number of concurrent coroutines to rate community reports.
- `dynamic_search_max_level` **int** - The maximum level of community hierarchy to consider if none of the processed communities are relevant.
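A minimal sketch of a global search block with dynamic community selection; the prompt file names mirror those used in the model selection example earlier in this document, and the dynamic selection values are illustrative:

```yml
global_search:
  chat_model_id: default_chat_model
  map_prompt: "prompts/global_search_map_system_prompt.txt"
  reduce_prompt: "prompts/global_search_reduce_system_prompt.txt"
  knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt"
  max_context_tokens: 12000
  dynamic_search_threshold: 1
  dynamic_search_keep_parent: true
  dynamic_search_max_level: 2
```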
### drift_search
#### Fields
- `chat_model_id` **str** - Name of the model definition to use for Chat Completion calls.
- `embedding_model_id` **str** - Name of the model definition to use for Embedding calls.
- `prompt` **str** - The prompt file to use.
- `reduce_prompt` **str** - The reducer prompt file to use.
- `temperature` **float** - The temperature to use for token generation.
- `top_p` **float** - The top-p value to use for token generation.
- `n` **int** - The number of completions to generate.
- `max_tokens` **int** - The maximum context size in tokens.
- `data_max_tokens` **int** - The data llm maximum tokens.
- `reduce_max_tokens` **int** - The maximum tokens for the reduce phase. Only use if a non-o-series model.
- `reduce_max_completion_tokens` **int** - The maximum tokens for the reduce phase. Only use for o-series models.
- `concurrency` **int** - The number of concurrent requests.
- `drift_k_followups` **int** - The number of top global results to retrieve.
- `primer_folds` **int** - The number of folds for search priming.
@ -379,14 +307,4 @@ Indicates whether we should run UMAP dimensionality reduction. This is used to p
- `local_search_temperature` **float** - The temperature to use for token generation in local search.
- `local_search_top_p` **float** - The top-p value to use for token generation in local search.
- `local_search_n` **int** - The number of completions to generate in local search.
- `local_search_llm_max_gen_tokens` **int** - The maximum number of generated tokens for the LLM in local search. Only use if a non-o-series model.
- `local_search_llm_max_gen_completion_tokens` **int** - The maximum number of generated tokens for the LLM in local search. Only use for o-series models.
### basic_search
#### Fields
- `chat_model_id` **str** - Name of the model definition to use for Chat Completion calls.
- `embedding_model_id` **str** - Name of the model definition to use for Embedding calls.
- `prompt` **str** - The prompt file to use.
- `k` **int | None** - Number of text units to retrieve from the vector store for context building.
- `local_search_llm_max_gen_tokens` **int** - The maximum number of generated tokens for the LLM in local search.

View File

@ -5,27 +5,27 @@
| Name | Installation | Purpose |
| ------------------- | ------------------------------------------------------------ | ----------------------------------------------------------------------------------- |
| Python 3.10-3.12 | [Download](https://www.python.org/downloads/) | The library is Python-based. |
| uv | [Instructions](https://docs.astral.sh/uv/) | uv is used for package management and virtualenv management in Python codebases |
| Poetry | [Instructions](https://python-poetry.org/docs/#installation) | Poetry is used for package management and virtualenv management in Python codebases |
# Getting Started
## Install Dependencies
```sh
# install python dependencies
uv sync
# Install Python dependencies.
poetry install
```
## Execute the Indexing Engine
```sh
uv run poe index <...args>
poetry run poe index <...args>
```
## Executing Queries
```sh
uv run poe query <...args>
poetry run poe query <...args>
```
# Azurite
@ -40,31 +40,31 @@ or by simply running `azurite` in the terminal if already installed globally. Se
# Lifecycle Scripts
Our Python package utilizes uv to manage dependencies and [poethepoet](https://pypi.org/project/poethepoet/) to manage build scripts.
Our Python package utilizes Poetry to manage dependencies and [poethepoet](https://pypi.org/project/poethepoet/) to manage build scripts.
Available scripts are:
- `uv run poe index` - Run the Indexing CLI
- `uv run poe query` - Run the Query CLI
- `uv build` - This will build a wheel file and other distributable artifacts.
- `uv run poe test` - This will execute all tests.
- `uv run poe test_unit` - This will execute unit tests.
- `uv run poe test_integration` - This will execute integration tests.
- `uv run poe test_smoke` - This will execute smoke tests.
- `uv run poe test_verbs` - This will execute tests of the basic workflows.
- `uv run poe check` - This will perform a suite of static checks across the package, including:
- `poetry run poe index` - Run the Indexing CLI
- `poetry run poe query` - Run the Query CLI
- `poetry build` - This invokes `poetry build`, which will build a wheel file and other distributable artifacts.
- `poetry run poe test` - This will execute all tests.
- `poetry run poe test_unit` - This will execute unit tests.
- `poetry run poe test_integration` - This will execute integration tests.
- `poetry run poe test_smoke` - This will execute smoke tests.
- `poetry run poe test_verbs` - This will execute tests of the basic workflows.
- `poetry run poe check` - This will perform a suite of static checks across the package, including:
- formatting
- documentation formatting
- linting
- security patterns
- type-checking
- `uv run poe fix` - This will apply any available auto-fixes to the package. Usually this is just formatting fixes.
- `uv run poe fix_unsafe` - This will apply any available auto-fixes to the package, including those that may be unsafe.
- `uv run poe format` - Explicitly run the formatter across the package.
- `poetry run poe fix` - This will apply any available auto-fixes to the package. Usually this is just formatting fixes.
- `poetry run poe fix_unsafe` - This will apply any available auto-fixes to the package, including those that may be unsafe.
- `poetry run poe format` - Explicitly run the formatter across the package.
## Troubleshooting
### "RuntimeError: llvm-config failed executing, please point LLVM_CONFIG to the path for llvm-config" when running uv install
### "RuntimeError: llvm-config failed executing, please point LLVM_CONFIG to the path for llvm-config" when running poetry install
Make sure llvm-9 and llvm-9-dev are installed:
@ -73,3 +73,14 @@ Make sure llvm-9 and llvm-9-dev are installed:
and then in your bashrc, add
`export LLVM_CONFIG=/usr/bin/llvm-config-9`
### "numba/\_pymodule.h:6:10: fatal error: Python.h: No such file or directory" when running poetry install
Make sure you have python3.10-dev installed or more generally `python<version>-dev`
`sudo apt-get install python3.10-dev`
### LLM call constantly exceeds TPM, RPM or time limits
`GRAPHRAG_LLM_THREAD_COUNT` and `GRAPHRAG_EMBEDDING_THREAD_COUNT` are both set to 50 by default. You can modify these values
to reduce concurrency. Please refer to the [Configuration Documents](config/overview.md).

View File

@ -25,23 +25,8 @@
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"from pprint import pprint\n",
"\n",
"import pandas as pd\n",
"\n",
"import graphrag.api as api\n",
"from graphrag.config.load_config import load_config\n",
"from graphrag.index.typing.pipeline_run_result import PipelineRunResult"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"PROJECT_DIRECTORY = \"<your project directory>\""
"from graphrag.index.typing import PipelineRunResult"
]
},
{
@ -51,7 +36,27 @@
"## Prerequisite\n",
"As a prerequisite to all API operations, a `GraphRagConfig` object is required. It is the primary means to control the behavior of graphrag and can be instantiated from a `settings.yaml` configuration file.\n",
"\n",
"Please refer to the [CLI docs](https://microsoft.github.io/graphrag/cli/#init) for more detailed information on how to generate the `settings.yaml` file."
"Please refer to the [CLI docs](https://microsoft.github.io/graphrag/cli/#init) for more detailed information on how to generate the `settings.yaml` file.\n",
"\n",
"#### Load `settings.yaml` configuration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"\n",
"settings = yaml.safe_load(open(\"<project_directory>/settings.yaml\")) # noqa: PTH123, SIM115"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"At this point, you can modify the imported settings to align with your application's requirements. For example, if building a UI application, the application might need to change the input and/or storage destinations dynamically in order to enable users to build and query different indexes."
]
},
{
@ -67,9 +72,11 @@
"metadata": {},
"outputs": [],
"source": [
"# note that we expect this to fail on the deployed docs because the PROJECT_DIRECTORY is not set to a real location.\n",
"# if you run this notebook locally, make sure to point at a location containing your settings.yaml\n",
"graphrag_config = load_config(Path(PROJECT_DIRECTORY))"
"from graphrag.config.create_graphrag_config import create_graphrag_config\n",
"\n",
"graphrag_config = create_graphrag_config(\n",
" values=settings, root_dir=\"<project_directory>\"\n",
")"
]
},
{
@ -117,17 +124,25 @@
"metadata": {},
"outputs": [],
"source": [
"entities = pd.read_parquet(f\"{PROJECT_DIRECTORY}/output/entities.parquet\")\n",
"communities = pd.read_parquet(f\"{PROJECT_DIRECTORY}/output/communities.parquet\")\n",
"community_reports = pd.read_parquet(\n",
" f\"{PROJECT_DIRECTORY}/output/community_reports.parquet\"\n",
"import pandas as pd\n",
"\n",
"final_nodes = pd.read_parquet(\"<project_directory>/output/create_final_nodes.parquet\")\n",
"final_entities = pd.read_parquet(\n",
" \"<project_directory>/output/create_final_entities.parquet\"\n",
")\n",
"final_communities = pd.read_parquet(\n",
" \"<project_directory>/output/create_final_communities.parquet\"\n",
")\n",
"final_community_reports = pd.read_parquet(\n",
" \"<project_directory>/output/create_final_community_reports.parquet\"\n",
")\n",
"\n",
"response, context = await api.global_search(\n",
" config=graphrag_config,\n",
" entities=entities,\n",
" communities=communities,\n",
" community_reports=community_reports,\n",
" nodes=final_nodes,\n",
" entities=final_entities,\n",
" communities=final_communities,\n",
" community_reports=final_community_reports,\n",
" community_level=2,\n",
" dynamic_community_selection=False,\n",
" response_type=\"Multiple Paragraphs\",\n",
@ -164,13 +179,15 @@
"metadata": {},
"outputs": [],
"source": [
"from pprint import pprint\n",
"\n",
"pprint(context) # noqa: T203"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"display_name": "graphrag-venv",
"language": "python",
"name": "python3"
},
@ -184,7 +201,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.10.15"
}
},
"nbformat": 4,

View File

@ -1,680 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Copyright (c) 2024 Microsoft Corporation.\n",
"# Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Bring-Your-Own Vector Store\n",
"\n",
"This notebook demonstrates how to implement a custom vector store and register for usage with GraphRAG.\n",
"\n",
"## Overview\n",
"\n",
"GraphRAG uses a plug-and-play architecture that allow for easy integration of custom vector stores (outside of what is natively supported) by following a factory design pattern. This allows you to:\n",
"\n",
"- **Extend functionality**: Add support for new vector database backends\n",
"- **Customize behavior**: Implement specialized search logic or data structures\n",
"- **Integrate existing systems**: Connect GraphRAG to your existing vector database infrastructure\n",
"\n",
"### What You'll Learn\n",
"\n",
"1. Understanding the `BaseVectorStore` interface\n",
"2. Implementing a custom vector store class\n",
"3. Registering your vector store with the `VectorStoreFactory`\n",
"4. Testing and validating your implementation\n",
"5. Configuring GraphRAG to use your custom vector store\n",
"\n",
"Let's get started!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 1: Import Required Dependencies\n",
"\n",
"First, let's import the necessary GraphRAG components and other dependencies we'll need.\n",
"\n",
"```bash\n",
"pip install graphrag\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from typing import Any\n",
"\n",
"import numpy as np\n",
"import yaml\n",
"\n",
"from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig\n",
"from graphrag.data_model.types import TextEmbedder\n",
"\n",
"# GraphRAG vector store components\n",
"from graphrag.vector_stores.base import (\n",
" BaseVectorStore,\n",
" VectorStoreDocument,\n",
" VectorStoreSearchResult,\n",
")\n",
"from graphrag.vector_stores.factory import VectorStoreFactory"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 2: Understand the BaseVectorStore Interface\n",
"\n",
"Before using a custom vector store, let's examine the `BaseVectorStore` interface to understand what methods need to be implemented."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let's inspect the BaseVectorStore class to understand the required methods\n",
"import inspect\n",
"\n",
"print(\"BaseVectorStore Abstract Methods:\")\n",
"print(\"=\" * 40)\n",
"\n",
"abstract_methods = []\n",
"for name, method in inspect.getmembers(BaseVectorStore, predicate=inspect.isfunction):\n",
" if getattr(method, \"__isabstractmethod__\", False):\n",
" signature = inspect.signature(method)\n",
" abstract_methods.append(f\"• {name}{signature}\")\n",
" print(f\"• {name}{signature}\")\n",
"\n",
"print(f\"\\nTotal abstract methods to implement: {len(abstract_methods)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 3: Implement a Custom Vector Store\n",
"\n",
"Now let's implement a simple in-memory vector store as an example. This vector store will:\n",
"\n",
"- Store documents and vectors in memory using Python data structures\n",
"- Support all required BaseVectorStore methods\n",
"\n",
"**Note**: This is a simplified example for demonstration. Production vector stores would typically use optimized libraries like FAISS, more sophisticated indexing, and persistent storage."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class SimpleInMemoryVectorStore(BaseVectorStore):\n",
" \"\"\"A simple in-memory vector store implementation for demonstration purposes.\n",
"\n",
" This vector store stores documents and their embeddings in memory and provides\n",
" basic similarity search functionality using cosine similarity.\n",
"\n",
" WARNING: This is for demonstration only - not suitable for production use.\n",
" For production, consider using optimized vector databases like LanceDB,\n",
" Azure AI Search, or other specialized vector stores.\n",
" \"\"\"\n",
"\n",
" # Internal storage for documents and vectors\n",
" documents: dict[str, VectorStoreDocument]\n",
" vectors: dict[str, np.ndarray]\n",
" connected: bool\n",
"\n",
" def __init__(self, **kwargs: Any):\n",
" \"\"\"Initialize the in-memory vector store.\"\"\"\n",
" super().__init__(**kwargs)\n",
"\n",
" self.documents: dict[str, VectorStoreDocument] = {}\n",
" self.vectors: dict[str, np.ndarray] = {}\n",
" self.connected = False\n",
"\n",
" print(f\"🚀 SimpleInMemoryVectorStore initialized for index: {self.index_name}\")\n",
"\n",
" def connect(self, **kwargs: Any) -> None:\n",
" \"\"\"Connect to the vector storage (no-op for in-memory store).\"\"\"\n",
" self.connected = True\n",
" print(f\"✅ Connected to in-memory vector store: {self.index_name}\")\n",
"\n",
" def load_documents(\n",
" self, documents: list[VectorStoreDocument], overwrite: bool = True\n",
" ) -> None:\n",
" \"\"\"Load documents into the vector store.\"\"\"\n",
" if not self.connected:\n",
" msg = \"Vector store not connected. Call connect() first.\"\n",
" raise RuntimeError(msg)\n",
"\n",
" if overwrite:\n",
" self.documents.clear()\n",
" self.vectors.clear()\n",
"\n",
" loaded_count = 0\n",
" for doc in documents:\n",
" if doc.vector is not None:\n",
" doc_id = str(doc.id)\n",
" self.documents[doc_id] = doc\n",
" self.vectors[doc_id] = np.array(doc.vector, dtype=np.float32)\n",
" loaded_count += 1\n",
"\n",
" print(f\"📚 Loaded {loaded_count} documents into vector store\")\n",
"\n",
" def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:\n",
" \"\"\"Calculate cosine similarity between two vectors.\"\"\"\n",
" # Normalize vectors\n",
" norm1 = np.linalg.norm(vec1)\n",
" norm2 = np.linalg.norm(vec2)\n",
"\n",
" if norm1 == 0 or norm2 == 0:\n",
" return 0.0\n",
"\n",
" return float(np.dot(vec1, vec2) / (norm1 * norm2))\n",
"\n",
" def similarity_search_by_vector(\n",
" self, query_embedding: list[float], k: int = 10, **kwargs: Any\n",
" ) -> list[VectorStoreSearchResult]:\n",
" \"\"\"Perform similarity search using a query vector.\"\"\"\n",
" if not self.connected:\n",
" msg = \"Vector store not connected. Call connect() first.\"\n",
" raise RuntimeError(msg)\n",
"\n",
" if not self.vectors:\n",
" return []\n",
"\n",
" query_vec = np.array(query_embedding, dtype=np.float32)\n",
" similarities = []\n",
"\n",
" # Calculate similarity with all stored vectors\n",
" for doc_id, stored_vec in self.vectors.items():\n",
" similarity = self._cosine_similarity(query_vec, stored_vec)\n",
" similarities.append((doc_id, similarity))\n",
"\n",
" # Sort by similarity (descending) and take top k\n",
" similarities.sort(key=lambda x: x[1], reverse=True)\n",
" top_k = similarities[:k]\n",
"\n",
" # Create search results\n",
" results = []\n",
" for doc_id, score in top_k:\n",
" document = self.documents[doc_id]\n",
" result = VectorStoreSearchResult(document=document, score=score)\n",
" results.append(result)\n",
"\n",
" return results\n",
"\n",
" def similarity_search_by_text(\n",
" self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any\n",
" ) -> list[VectorStoreSearchResult]:\n",
" \"\"\"Perform similarity search using text (which gets embedded first).\"\"\"\n",
" # Embed the text first\n",
" query_embedding = text_embedder(text)\n",
"\n",
" # Use vector search with the embedding\n",
" return self.similarity_search_by_vector(query_embedding, k, **kwargs)\n",
"\n",
" def filter_by_id(self, include_ids: list[str] | list[int]) -> Any:\n",
" \"\"\"Build a query filter to filter documents by id.\n",
"\n",
" For this simple implementation, we return the list of IDs as the filter.\n",
" \"\"\"\n",
" return [str(id_) for id_ in include_ids]\n",
"\n",
" def search_by_id(self, id: str) -> VectorStoreDocument:\n",
" \"\"\"Search for a document by id.\"\"\"\n",
" doc_id = str(id)\n",
" if doc_id not in self.documents:\n",
" msg = f\"Document with id '{id}' not found\"\n",
" raise KeyError(msg)\n",
"\n",
" return self.documents[doc_id]\n",
"\n",
" def get_stats(self) -> dict[str, Any]:\n",
" \"\"\"Get statistics about the vector store (custom method).\"\"\"\n",
" return {\n",
" \"index_name\": self.index_name,\n",
" \"document_count\": len(self.documents),\n",
" \"vector_count\": len(self.vectors),\n",
" \"connected\": self.connected,\n",
" \"vector_dimension\": len(next(iter(self.vectors.values())))\n",
" if self.vectors\n",
" else 0,\n",
" }\n",
"\n",
"\n",
"print(\"✅ SimpleInMemoryVectorStore class defined!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 4: Register the Custom Vector Store\n",
"\n",
"Now let's register our custom vector store with the `VectorStoreFactory` so it can be used throughout GraphRAG."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Register our custom vector store with a unique identifier\n",
"CUSTOM_VECTOR_STORE_TYPE = \"simple_memory\"\n",
"\n",
"# Register the vector store class\n",
"VectorStoreFactory.register(CUSTOM_VECTOR_STORE_TYPE, SimpleInMemoryVectorStore)\n",
"\n",
"print(f\"✅ Registered custom vector store with type: '{CUSTOM_VECTOR_STORE_TYPE}'\")\n",
"\n",
"# Verify registration\n",
"available_types = VectorStoreFactory.get_vector_store_types()\n",
"print(f\"\\n📋 Available vector store types: {available_types}\")\n",
"print(\n",
" f\"🔍 Is our custom type supported? {VectorStoreFactory.is_supported_type(CUSTOM_VECTOR_STORE_TYPE)}\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 5: Test the Custom Vector Store\n",
"\n",
"Let's create some sample data and test our custom vector store implementation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create sample documents with mock embeddings\n",
"def create_mock_embedding(dimension: int = 384) -> list[float]:\n",
" \"\"\"Create a random embedding vector for testing.\"\"\"\n",
" return np.random.normal(0, 1, dimension).tolist()\n",
"\n",
"\n",
"# Sample documents\n",
"sample_documents = [\n",
" VectorStoreDocument(\n",
" id=\"doc_1\",\n",
" text=\"GraphRAG is a powerful knowledge graph extraction and reasoning framework.\",\n",
" vector=create_mock_embedding(),\n",
" attributes={\"category\": \"technology\", \"source\": \"documentation\"},\n",
" ),\n",
" VectorStoreDocument(\n",
" id=\"doc_2\",\n",
" text=\"Vector stores enable efficient similarity search over high-dimensional data.\",\n",
" vector=create_mock_embedding(),\n",
" attributes={\"category\": \"technology\", \"source\": \"research\"},\n",
" ),\n",
" VectorStoreDocument(\n",
" id=\"doc_3\",\n",
" text=\"Machine learning models can process and understand natural language text.\",\n",
" vector=create_mock_embedding(),\n",
" attributes={\"category\": \"AI\", \"source\": \"article\"},\n",
" ),\n",
" VectorStoreDocument(\n",
" id=\"doc_4\",\n",
" text=\"Custom implementations allow for specialized behavior and integration.\",\n",
" vector=create_mock_embedding(),\n",
" attributes={\"category\": \"development\", \"source\": \"tutorial\"},\n",
" ),\n",
"]\n",
"\n",
"print(f\"📝 Created {len(sample_documents)} sample documents\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test creating vector store using the factory\n",
"schema = VectorStoreSchemaConfig(index_name=\"test_collection\")\n",
"\n",
"# Create vector store instance using factory\n",
"vector_store = VectorStoreFactory.create_vector_store(\n",
" CUSTOM_VECTOR_STORE_TYPE, vector_store_schema_config=schema\n",
")\n",
"\n",
"print(f\"✅ Created vector store instance: {type(vector_store).__name__}\")\n",
"print(f\"📊 Initial stats: {vector_store.get_stats()}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Connect and load documents\n",
"vector_store.connect()\n",
"vector_store.load_documents(sample_documents)\n",
"\n",
"print(f\"📊 Updated stats: {vector_store.get_stats()}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test similarity search\n",
"query_vector = create_mock_embedding() # Random query vector for testing\n",
"\n",
"search_results = vector_store.similarity_search_by_vector(\n",
" query_vector,\n",
" k=3, # Get top 3 similar documents\n",
")\n",
"\n",
"print(f\"🔍 Found {len(search_results)} similar documents:\\n\")\n",
"\n",
"for i, result in enumerate(search_results, 1):\n",
" doc = result.document\n",
" print(f\"{i}. ID: {doc.id}\")\n",
" print(f\" Text: {doc.text[:60]}...\")\n",
" print(f\" Similarity Score: {result.score:.4f}\")\n",
" print(f\" Category: {doc.attributes.get('category', 'N/A')}\")\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test search by ID\n",
"try:\n",
" found_doc = vector_store.search_by_id(\"doc_2\")\n",
" print(\"✅ Found document by ID:\")\n",
" print(f\" ID: {found_doc.id}\")\n",
" print(f\" Text: {found_doc.text}\")\n",
" print(f\" Attributes: {found_doc.attributes}\")\n",
"except KeyError as e:\n",
" print(f\"❌ Error: {e}\")\n",
"\n",
"# Test filter by ID\n",
"id_filter = vector_store.filter_by_id([\"doc_1\", \"doc_3\"])\n",
"print(f\"\\n🔧 ID filter result: {id_filter}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 6: Configuration for GraphRAG\n",
"\n",
"Now let's see how you would configure GraphRAG to use your custom vector store in a settings file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Example GraphRAG yaml settings\n",
"example_settings = {\n",
" \"vector_store\": {\n",
" \"default_vector_store\": {\n",
" \"type\": CUSTOM_VECTOR_STORE_TYPE, # \"simple_memory\"\n",
" \"collection_name\": \"graphrag_entities\",\n",
" # Add any custom parameters your vector store needs\n",
" \"custom_parameter\": \"custom_value\",\n",
" }\n",
" },\n",
" # Other GraphRAG configuration...\n",
" \"models\": {\n",
" \"default_embedding_model\": {\n",
" \"type\": \"openai_embedding\",\n",
" \"model\": \"text-embedding-3-small\",\n",
" }\n",
" },\n",
"}\n",
"\n",
"# Convert to YAML format for settings.yml\n",
"yaml_config = yaml.dump(example_settings, default_flow_style=False, indent=2)\n",
"\n",
"print(\"📄 Example settings.yml configuration:\")\n",
"print(\"=\" * 40)\n",
"print(yaml_config)"
]
},
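{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the printed output of the cell above should look roughly like the following `settings.yml` snippet. This is only a sketch of what `yaml.dump` produces from the dictionary above (keys are sorted alphabetically by default, and `custom_parameter` is just the illustrative placeholder defined earlier):\n",
"\n",
"```yaml\n",
"models:\n",
"  default_embedding_model:\n",
"    model: text-embedding-3-small\n",
"    type: openai_embedding\n",
"vector_store:\n",
"  default_vector_store:\n",
"    collection_name: graphrag_entities\n",
"    custom_parameter: custom_value\n",
"    type: simple_memory\n",
"```"
]
},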
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 7: Integration with GraphRAG Pipeline\n",
"\n",
"Here's how your custom vector store would be used in a typical GraphRAG pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Example of how GraphRAG would use your custom vector store\n",
"def simulate_graphrag_pipeline():\n",
" \"\"\"Simulate how GraphRAG would use the custom vector store.\"\"\"\n",
" print(\"🚀 Simulating GraphRAG pipeline with custom vector store...\\n\")\n",
"\n",
" # 1. GraphRAG creates vector store using factory\n",
" schema = VectorStoreSchemaConfig(index_name=\"graphrag_entities\")\n",
"\n",
" store = VectorStoreFactory.create_vector_store(\n",
" CUSTOM_VECTOR_STORE_TYPE,\n",
" vector_store_schema_config=schema,\n",
" similarity_threshold=0.3,\n",
" )\n",
" store.connect()\n",
"\n",
" print(\"✅ Step 1: Vector store created and connected\")\n",
"\n",
" # 2. During indexing, GraphRAG loads extracted entities\n",
" entity_documents = [\n",
" VectorStoreDocument(\n",
" id=f\"entity_{i}\",\n",
" text=f\"Entity {i} description: Important concept in the knowledge graph\",\n",
" vector=create_mock_embedding(),\n",
" attributes={\"type\": \"entity\", \"importance\": i % 3 + 1},\n",
" )\n",
" for i in range(10)\n",
" ]\n",
"\n",
" store.load_documents(entity_documents)\n",
" print(f\"✅ Step 2: Loaded {len(entity_documents)} entity documents\")\n",
"\n",
" # 3. During query time, GraphRAG searches for relevant entities\n",
" query_embedding = create_mock_embedding()\n",
" relevant_entities = store.similarity_search_by_vector(query_embedding, k=5)\n",
"\n",
" print(f\"✅ Step 3: Found {len(relevant_entities)} relevant entities for query\")\n",
"\n",
" # 4. GraphRAG uses these entities for context building\n",
" context_entities = [result.document for result in relevant_entities]\n",
"\n",
" print(\"✅ Step 4: Context built using retrieved entities\")\n",
" print(f\"📊 Final stats: {store.get_stats()}\")\n",
"\n",
" return context_entities\n",
"\n",
"\n",
"# Run the simulation\n",
"context = simulate_graphrag_pipeline()\n",
"print(f\"\\n🎯 Retrieved {len(context)} entities for context building\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 8: Testing and Validation\n",
"\n",
"Let's create a comprehensive test suite to ensure our vector store works correctly."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def test_custom_vector_store():\n",
" \"\"\"Comprehensive test suite for the custom vector store.\"\"\"\n",
" print(\"🧪 Running comprehensive vector store tests...\\n\")\n",
"\n",
" # Test 1: Basic functionality\n",
" print(\"Test 1: Basic functionality\")\n",
" store = VectorStoreFactory.create_vector_store(\n",
" CUSTOM_VECTOR_STORE_TYPE,\n",
" vector_store_schema_config=VectorStoreSchemaConfig(index_name=\"test\"),\n",
" )\n",
" store.connect()\n",
"\n",
" # Load test documents\n",
" test_docs = sample_documents[:2]\n",
" store.load_documents(test_docs)\n",
"\n",
" assert len(store.documents) == 2, \"Should have 2 documents\"\n",
" assert len(store.vectors) == 2, \"Should have 2 vectors\"\n",
" print(\"✅ Basic functionality test passed\")\n",
"\n",
" # Test 2: Search functionality\n",
" print(\"\\nTest 2: Search functionality\")\n",
" query_vec = create_mock_embedding()\n",
" results = store.similarity_search_by_vector(query_vec, k=5)\n",
"\n",
" assert len(results) <= 2, \"Should not return more results than documents\"\n",
" assert all(isinstance(r, VectorStoreSearchResult) for r in results), (\n",
" \"Should return VectorStoreSearchResult objects\"\n",
" )\n",
" assert all(-1 <= r.score <= 1 for r in results), (\n",
" \"Similarity scores should be between -1 and 1\"\n",
" )\n",
" print(\"✅ Search functionality test passed\")\n",
"\n",
" # Test 3: Search by ID\n",
" print(\"\\nTest 3: Search by ID\")\n",
" found_doc = store.search_by_id(\"doc_1\")\n",
" assert found_doc.id == \"doc_1\", \"Should find correct document\"\n",
"\n",
" try:\n",
" store.search_by_id(\"nonexistent\")\n",
" assert False, \"Should raise KeyError for nonexistent ID\"\n",
" except KeyError:\n",
" pass # Expected\n",
"\n",
" print(\"✅ Search by ID test passed\")\n",
"\n",
" # Test 4: Filter functionality\n",
" print(\"\\nTest 4: Filter functionality\")\n",
" filter_result = store.filter_by_id([\"doc_1\", \"doc_2\"])\n",
" assert filter_result == [\"doc_1\", \"doc_2\"], \"Should return filtered IDs\"\n",
" print(\"✅ Filter functionality test passed\")\n",
"\n",
" # Test 5: Error handling\n",
" print(\"\\nTest 5: Error handling\")\n",
" disconnected_store = VectorStoreFactory.create_vector_store(\n",
" CUSTOM_VECTOR_STORE_TYPE,\n",
" vector_store_schema_config=VectorStoreSchemaConfig(index_name=\"test2\"),\n",
" )\n",
"\n",
" try:\n",
" disconnected_store.load_documents(test_docs)\n",
" assert False, \"Should raise error when not connected\"\n",
" except RuntimeError:\n",
" pass # Expected\n",
"\n",
" try:\n",
" disconnected_store.similarity_search_by_vector(query_vec)\n",
" assert False, \"Should raise error when not connected\"\n",
" except RuntimeError:\n",
" pass # Expected\n",
"\n",
" print(\"✅ Error handling test passed\")\n",
"\n",
" print(\"\\n🎉 All tests passed! Your custom vector store is working correctly.\")\n",
"\n",
"\n",
"# Run the tests\n",
"test_custom_vector_store()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary and Next Steps\n",
"\n",
"Congratulations! You've successfully learned how to implement and register a custom vector store with GraphRAG. Here's what you accomplished:\n",
"\n",
"### What You Built\n",
"- ✅ **Custom Vector Store Class**: Implemented `SimpleInMemoryVectorStore` with all required methods\n",
"- ✅ **Factory Integration**: Registered your vector store with `VectorStoreFactory`\n",
"- ✅ **Comprehensive Testing**: Validated functionality with a full test suite\n",
"- ✅ **Configuration Examples**: Learned how to configure GraphRAG to use your vector store\n",
"\n",
"### Key Takeaways\n",
"1. **Interface Compliance**: Always implement all methods from `BaseVectorStore`\n",
"2. **Factory Pattern**: Use `VectorStoreFactory.register()` to make your vector store available\n",
"3. **Configuration**: Vector stores are configured in GraphRAG settings files\n",
"4. **Testing**: Thoroughly test all functionality before deploying\n",
"\n",
"### Next Steps\n",
"Check out the API Overview notebook to learn how to index and query data via the graphrag API.\n",
"\n",
"### Resources\n",
"- [GraphRAG Documentation](https://microsoft.github.io/graphrag/)\n",
"\n",
"Happy building! 🚀"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "graphrag",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -12,22 +12,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import pandas as pd\n",
"import tiktoken\n",
"\n",
"from graphrag.config.enums import ModelType\n",
"from graphrag.config.models.language_model_config import LanguageModelConfig\n",
"from graphrag.language_model.manager import ModelManager\n",
"from graphrag.query.indexer_adapters import (\n",
" read_indexer_communities,\n",
" read_indexer_entities,\n",
" read_indexer_reports,\n",
")\n",
"from graphrag.query.llm.oai.chat_openai import ChatOpenAI\n",
"from graphrag.query.llm.oai.typing import OpenaiApiType\n",
"from graphrag.query.structured_search.global_search.community_context import (\n",
" GlobalCommunityContext,\n",
")\n",
@ -52,28 +52,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from graphrag.tokenizer.get_tokenizer import get_tokenizer\n",
"\n",
"api_key = os.environ[\"GRAPHRAG_API_KEY\"]\n",
"llm_model = os.environ[\"GRAPHRAG_LLM_MODEL\"]\n",
"\n",
"config = LanguageModelConfig(\n",
"llm = ChatOpenAI(\n",
" api_key=api_key,\n",
" type=ModelType.Chat,\n",
" model_provider=\"openai\",\n",
" model=\"gpt-4.1\",\n",
" model=llm_model,\n",
" api_type=OpenaiApiType.OpenAI, # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI\n",
" max_retries=20,\n",
")\n",
"model = ModelManager().get_or_create_chat_model(\n",
" name=\"global_search\",\n",
" model_type=ModelType.Chat,\n",
" config=config,\n",
")\n",
"\n",
"tokenizer = get_tokenizer(config)"
"token_encoder = tiktoken.encoding_for_model(llm_model)"
]
},
{
@ -82,22 +75,23 @@
"source": [
"### Load community reports as context for global search\n",
"\n",
"- Load all community reports in the `community_reports` table from the indexing engine, to be used as context data for global search.\n",
"- Load entities from the `entities` tables from the indexing engine, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n",
"- Load all communities in the `communities` table from the indexing engine, to be used to reconstruct the community graph hierarchy for dynamic community selection."
"- Load all community reports in the `create_final_community_reports` table from the ire-indexing engine, to be used as context data for global search.\n",
"- Load entities from the `create_final_nodes` and `create_final_entities` tables from the ire-indexing engine, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n",
"- Load all communities in the `create_final_communites` table from the ire-indexing engine, to be used to reconstruct the community graph hierarchy for dynamic community selection."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# parquet files generated from indexing pipeline\n",
"INPUT_DIR = \"./inputs/operation dulce\"\n",
"COMMUNITY_TABLE = \"communities\"\n",
"COMMUNITY_REPORT_TABLE = \"community_reports\"\n",
"ENTITY_TABLE = \"entities\"\n",
"COMMUNITY_TABLE = \"create_final_communities\"\n",
"COMMUNITY_REPORT_TABLE = \"create_final_community_reports\"\n",
"ENTITY_TABLE = \"create_final_nodes\"\n",
"ENTITY_EMBEDDING_TABLE = \"create_final_entities\"\n",
"\n",
"# we don't fix a specific community level but instead use an agent to dynamicially\n",
"# search through all the community reports to check if they are relevant.\n",
@ -106,23 +100,191 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total report count: 20\n",
"Report count after filtering by community level None: 20\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>community</th>\n",
" <th>full_content</th>\n",
" <th>level</th>\n",
" <th>rank</th>\n",
" <th>title</th>\n",
" <th>rank_explanation</th>\n",
" <th>summary</th>\n",
" <th>findings</th>\n",
" <th>full_content_json</th>\n",
" <th>id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10</td>\n",
" <td># Paranormal Military Squad at Dulce Base: Dec...</td>\n",
" <td>1</td>\n",
" <td>8.5</td>\n",
" <td>Paranormal Military Squad at Dulce Base: Decod...</td>\n",
" <td>The impact severity rating is high due to the ...</td>\n",
" <td>The Paranormal Military Squad, stationed at Du...</td>\n",
" <td>[{'explanation': 'Jordan is a central figure i...</td>\n",
" <td>{\\n \"title\": \"Paranormal Military Squad at ...</td>\n",
" <td>1ba2d200-dd26-4693-affe-a5539d0a0e0d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>11</td>\n",
" <td># Dulce and Paranormal Military Squad Operatio...</td>\n",
" <td>1</td>\n",
" <td>8.5</td>\n",
" <td>Dulce and Paranormal Military Squad Operations</td>\n",
" <td>The impact severity rating is high due to the ...</td>\n",
" <td>The community centers around Dulce, a secretiv...</td>\n",
" <td>[{'explanation': 'Dulce is described as a top-...</td>\n",
" <td>{\\n \"title\": \"Dulce and Paranormal Military...</td>\n",
" <td>a8a530b0-ae6b-44ea-b11c-9f70d138298d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>12</td>\n",
" <td># Paranormal Military Squad and Dulce Base Ope...</td>\n",
" <td>1</td>\n",
" <td>7.5</td>\n",
" <td>Paranormal Military Squad and Dulce Base Opera...</td>\n",
" <td>The impact severity rating is relatively high ...</td>\n",
" <td>The community centers around the Paranormal Mi...</td>\n",
" <td>[{'explanation': 'Taylor is a central figure w...</td>\n",
" <td>{\\n \"title\": \"Paranormal Military Squad and...</td>\n",
" <td>0478975b-c805-4cc1-b746-82f3e689e2f3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13</td>\n",
" <td># Mission Dynamics and Leadership: Cruz and Wa...</td>\n",
" <td>1</td>\n",
" <td>7.5</td>\n",
" <td>Mission Dynamics and Leadership: Cruz and Wash...</td>\n",
" <td>The impact severity rating is relatively high ...</td>\n",
" <td>This report explores the intricate dynamics of...</td>\n",
" <td>[{'explanation': 'Cruz is a central figure in ...</td>\n",
" <td>{\\n \"title\": \"Mission Dynamics and Leadersh...</td>\n",
" <td>b56f6e68-3951-4f07-8760-63700944a375</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>14</td>\n",
" <td># Dulce Base and Paranormal Military Squad: Br...</td>\n",
" <td>1</td>\n",
" <td>8.5</td>\n",
" <td>Dulce Base and Paranormal Military Squad: Brid...</td>\n",
" <td>The impact severity rating is high due to the ...</td>\n",
" <td>The community centers around the Dulce Base, a...</td>\n",
" <td>[{'explanation': 'Sam Rivera, a member of the ...</td>\n",
" <td>{\\n \"title\": \"Dulce Base and Paranormal Mil...</td>\n",
" <td>736e7006-d050-4abb-a122-00febf3f540f</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" community full_content level rank \\\n",
"0 10 # Paranormal Military Squad at Dulce Base: Dec... 1 8.5 \n",
"1 11 # Dulce and Paranormal Military Squad Operatio... 1 8.5 \n",
"2 12 # Paranormal Military Squad and Dulce Base Ope... 1 7.5 \n",
"3 13 # Mission Dynamics and Leadership: Cruz and Wa... 1 7.5 \n",
"4 14 # Dulce Base and Paranormal Military Squad: Br... 1 8.5 \n",
"\n",
" title \\\n",
"0 Paranormal Military Squad at Dulce Base: Decod... \n",
"1 Dulce and Paranormal Military Squad Operations \n",
"2 Paranormal Military Squad and Dulce Base Opera... \n",
"3 Mission Dynamics and Leadership: Cruz and Wash... \n",
"4 Dulce Base and Paranormal Military Squad: Brid... \n",
"\n",
" rank_explanation \\\n",
"0 The impact severity rating is high due to the ... \n",
"1 The impact severity rating is high due to the ... \n",
"2 The impact severity rating is relatively high ... \n",
"3 The impact severity rating is relatively high ... \n",
"4 The impact severity rating is high due to the ... \n",
"\n",
" summary \\\n",
"0 The Paranormal Military Squad, stationed at Du... \n",
"1 The community centers around Dulce, a secretiv... \n",
"2 The community centers around the Paranormal Mi... \n",
"3 This report explores the intricate dynamics of... \n",
"4 The community centers around the Dulce Base, a... \n",
"\n",
" findings \\\n",
"0 [{'explanation': 'Jordan is a central figure i... \n",
"1 [{'explanation': 'Dulce is described as a top-... \n",
"2 [{'explanation': 'Taylor is a central figure w... \n",
"3 [{'explanation': 'Cruz is a central figure in ... \n",
"4 [{'explanation': 'Sam Rivera, a member of the ... \n",
"\n",
" full_content_json \\\n",
"0 {\\n \"title\": \"Paranormal Military Squad at ... \n",
"1 {\\n \"title\": \"Dulce and Paranormal Military... \n",
"2 {\\n \"title\": \"Paranormal Military Squad and... \n",
"3 {\\n \"title\": \"Mission Dynamics and Leadersh... \n",
"4 {\\n \"title\": \"Dulce Base and Paranormal Mil... \n",
"\n",
" id \n",
"0 1ba2d200-dd26-4693-affe-a5539d0a0e0d \n",
"1 a8a530b0-ae6b-44ea-b11c-9f70d138298d \n",
"2 0478975b-c805-4cc1-b746-82f3e689e2f3 \n",
"3 b56f6e68-3951-4f07-8760-63700944a375 \n",
"4 736e7006-d050-4abb-a122-00febf3f540f "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"community_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet\")\n",
"entity_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_TABLE}.parquet\")\n",
"report_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet\")\n",
"entity_embedding_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet\")\n",
"\n",
"communities = read_indexer_communities(community_df, report_df)\n",
"communities = read_indexer_communities(community_df, entity_df, report_df)\n",
"reports = read_indexer_reports(\n",
" report_df,\n",
" community_df,\n",
" entity_df,\n",
" community_level=COMMUNITY_LEVEL,\n",
" dynamic_community_selection=True,\n",
")\n",
"entities = read_indexer_entities(\n",
" entity_df, community_df, community_level=COMMUNITY_LEVEL\n",
" entity_df, entity_embedding_df, community_level=COMMUNITY_LEVEL\n",
")\n",
"\n",
"print(f\"Total report count: {len(report_df)}\")\n",
@ -148,19 +310,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"mini_llm = ChatOpenAI(\n",
" api_key=api_key,\n",
" model=\"gpt-4o-mini\",\n",
" api_type=OpenaiApiType.OpenAI, # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI\n",
" max_retries=20,\n",
")\n",
"mini_token_encoder = tiktoken.encoding_for_model(mini_llm.model)\n",
"\n",
"context_builder = GlobalCommunityContext(\n",
" community_reports=reports,\n",
" communities=communities,\n",
" entities=entities, # default to None if you don't want to use community weights for ranking\n",
" tokenizer=tokenizer,\n",
" token_encoder=token_encoder,\n",
" dynamic_community_selection=True,\n",
" dynamic_community_selection_kwargs={\n",
" \"model\": model,\n",
" \"tokenizer\": tokenizer,\n",
" \"llm\": mini_llm,\n",
" \"token_encoder\": mini_token_encoder,\n",
" },\n",
")"
]
@ -174,7 +344,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@ -205,14 +375,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"search_engine = GlobalSearch(\n",
" model=model,\n",
" llm=llm,\n",
" context_builder=context_builder,\n",
" tokenizer=tokenizer,\n",
" token_encoder=token_encoder,\n",
" max_data_tokens=12_000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)\n",
" map_llm_params=map_llm_params,\n",
" reduce_llm_params=reduce_llm_params,\n",
@ -226,20 +396,158 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"### Overview of Cosmic Vocalization\n",
"\n",
"Cosmic Vocalization is a phenomenon that has captured the attention of various individuals and groups, becoming a focal point for community interest. It is perceived as a significant cosmic event, with interpretations ranging from a strategic security concern to a metaphorical interstellar duet [Data: Reports (6)].\n",
"\n",
"### Key Stakeholders and Perspectives\n",
"\n",
"1. **Paranormal Military Squad**: This group is actively engaged with Cosmic Vocalization, treating it as a strategic element in their security measures. Their involvement underscores the importance of Cosmic Vocalization in broader security contexts. They metaphorically view the Universe as a concert hall, suggesting a unique perspective on cosmic events and their implications for human entities [Data: Reports (6)].\n",
"\n",
"2. **Alex Mercer**: Alex Mercer perceives Cosmic Vocalization as part of an interstellar duet, indicating a responsive and perhaps artistic approach to understanding these cosmic phenomena. This perspective highlights the diverse interpretations and cultural significance attributed to Cosmic Vocalization [Data: Reports (6)].\n",
"\n",
"3. **Taylor Cruz**: Taylor Cruz expresses concerns about Cosmic Vocalization, fearing it might serve as a homing tune. This perspective introduces a layer of urgency and potential threat, suggesting that Cosmic Vocalization could have implications beyond mere observation, possibly affecting security or existential considerations [Data: Reports (6)].\n",
"\n",
"### Implications\n",
"\n",
"The involvement of these stakeholders and their varied perspectives on Cosmic Vocalization illustrate the complexity and multifaceted nature of this phenomenon. It is not only a subject of scientific and strategic interest but also a cultural and existential topic that prompts diverse interpretations and responses. The strategic engagement by the Paranormal Military Squad and the concerns raised by individuals like Taylor Cruz highlight the potential significance of Cosmic Vocalization in both security and broader cosmic contexts [Data: Reports (6)].\n"
]
}
],
"source": [
"result = await search_engine.search(\"What is operation dulce?\")\n",
"result = await search_engine.asearch(\n",
" \"What is Cosmic Vocalization and who are involved in it?\"\n",
")\n",
"\n",
"print(result.response)"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>occurrence weight</th>\n",
" <th>content</th>\n",
" <th>rank</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>15</td>\n",
" <td>Dulce Base and the Paranormal Military Squad: ...</td>\n",
" <td>1.00</td>\n",
" <td># Dulce Base and the Paranormal Military Squad...</td>\n",
" <td>9.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Earth's Interstellar Communication Initiative</td>\n",
" <td>0.16</td>\n",
" <td># Earth's Interstellar Communication Initiativ...</td>\n",
" <td>8.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>16</td>\n",
" <td>Dulce Military Base and Alien Intelligence Com...</td>\n",
" <td>0.08</td>\n",
" <td># Dulce Military Base and Alien Intelligence C...</td>\n",
" <td>8.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>18</td>\n",
" <td>Paranormal Military Squad Team and Dulce Base'...</td>\n",
" <td>0.04</td>\n",
" <td># Paranormal Military Squad Team and Dulce Bas...</td>\n",
" <td>8.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>19</td>\n",
" <td>Central Terminal and Viewing Monitors at Dulce...</td>\n",
" <td>0.02</td>\n",
" <td># Central Terminal and Viewing Monitors at Dul...</td>\n",
" <td>8.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>4</td>\n",
" <td>Dulce Facility and Control Room of Dulce: Extr...</td>\n",
" <td>0.02</td>\n",
" <td># Dulce Facility and Control Room of Dulce: Ex...</td>\n",
" <td>8.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>6</td>\n",
" <td>Cosmic Vocalization and Universe Interactions</td>\n",
" <td>0.02</td>\n",
" <td># Cosmic Vocalization and Universe Interaction...</td>\n",
" <td>7.5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id title occurrence weight \\\n",
"0 15 Dulce Base and the Paranormal Military Squad: ... 1.00 \n",
"1 1 Earth's Interstellar Communication Initiative 0.16 \n",
"2 16 Dulce Military Base and Alien Intelligence Com... 0.08 \n",
"3 18 Paranormal Military Squad Team and Dulce Base'... 0.04 \n",
"4 19 Central Terminal and Viewing Monitors at Dulce... 0.02 \n",
"5 4 Dulce Facility and Control Room of Dulce: Extr... 0.02 \n",
"6 6 Cosmic Vocalization and Universe Interactions 0.02 \n",
"\n",
" content rank \n",
"0 # Dulce Base and the Paranormal Military Squad... 9.5 \n",
"1 # Earth's Interstellar Communication Initiativ... 8.5 \n",
"2 # Dulce Military Base and Alien Intelligence C... 8.5 \n",
"3 # Paranormal Military Squad Team and Dulce Bas... 8.5 \n",
"4 # Central Terminal and Viewing Monitors at Dul... 8.5 \n",
"5 # Dulce Facility and Control Room of Dulce: Ex... 8.5 \n",
"6 # Cosmic Vocalization and Universe Interaction... 7.5 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# inspect the data used to build the context for the LLM responses\n",
"result.context_data[\"reports\"]"
@ -247,16 +555,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Build context (gpt-4o-mini)\n",
"LLM calls: 12. Prompt tokens: 8565. Output tokens: 1091.\n",
"Map-reduce (gpt-4o)\n",
"LLM calls: 2. Prompt tokens: 5771. Output tokens: 600.\n"
]
}
],
"source": [
"# inspect number of LLM calls and tokens in dynamic community selection\n",
"llm_calls = result.llm_calls_categories[\"build_context\"]\n",
"prompt_tokens = result.prompt_tokens_categories[\"build_context\"]\n",
"output_tokens = result.output_tokens_categories[\"build_context\"]\n",
"print(\n",
" f\"Build context LLM calls: {llm_calls}. Prompt tokens: {prompt_tokens}. Output tokens: {output_tokens}.\"\n",
" f\"Build context ({mini_llm.model})\\nLLM calls: {llm_calls}. Prompt tokens: {prompt_tokens}. Output tokens: {output_tokens}.\"\n",
")\n",
"# inspect number of LLM calls and tokens in map-reduce\n",
"llm_calls = result.llm_calls_categories[\"map\"] + result.llm_calls_categories[\"reduce\"]\n",
@ -267,7 +586,7 @@
" result.output_tokens_categories[\"map\"] + result.output_tokens_categories[\"reduce\"]\n",
")\n",
"print(\n",
" f\"Map-reduce LLM calls: {llm_calls}. Prompt tokens: {prompt_tokens}. Output tokens: {output_tokens}.\"\n",
" f\"Map-reduce ({llm.model})\\nLLM calls: {llm_calls}. Prompt tokens: {prompt_tokens}. Output tokens: {output_tokens}.\"\n",
")"
]
}
@ -288,7 +607,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
"version": "3.12.5"
}
},
"nbformat": 4,

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -14,9 +14,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Index Migration (pre-v1 to v1)\n",
"## Index Migration\n",
"\n",
"This notebook is used to maintain data model parity with older indexes for version 1.0 of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.\n",
"This notebook is used to maintain data model parity with older indexes for the latest versions of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.\n",
"\n",
"NOTE: we recommend regenerating your settings.yml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. This also ensures that you have default vector store config, which is now required or indexing will fail.\n",
"\n",
@ -25,38 +25,39 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# This is the directory that has your settings.yaml\n",
"# This is the directory that has your settings.yml\n",
"# NOTE: much older indexes may have been output with a timestamped directory\n",
"# if this is the case, you will need to make sure the storage.base_dir in settings.yaml points to it correctly\n",
"PROJECT_DIRECTORY = \"<your project directory\""
"# if this is the case, you will need to make sure the storage.base_dir in settings.yml points to it correctly\n",
"PROJECT_DIRECTORY = \"<your project directory>\""
]
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from graphrag.config.load_config import load_config\n",
"from graphrag.storage.factory import StorageFactory\n",
"from graphrag.config.resolve_path import resolve_paths\n",
"from graphrag.index.create_pipeline_config import create_pipeline_config\n",
"from graphrag.storage.factory import create_storage\n",
"\n",
"# This first block does some config loading, path resolution, and translation that is normally done by the CLI/API when running a full workflow\n",
"config = load_config(Path(PROJECT_DIRECTORY))\n",
"storage_config = config.output.model_dump()\n",
"storage = StorageFactory().create_storage(\n",
" storage_type=storage_config[\"type\"],\n",
" kwargs=storage_config,\n",
")"
"resolve_paths(config)\n",
"pipeline_config = create_pipeline_config(config)\n",
"storage = create_storage(pipeline_config.storage)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -67,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 66,
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
@ -96,7 +97,7 @@
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
@ -107,16 +108,22 @@
"# First we'll go through any parquet files that had model changes and update them\n",
"# The new data model may have removed excess columns as well, but we will only make the minimal changes required for compatibility\n",
"\n",
"final_documents = await load_table_from_storage(\"create_final_documents\", storage)\n",
"final_text_units = await load_table_from_storage(\"create_final_text_units\", storage)\n",
"final_entities = await load_table_from_storage(\"create_final_entities\", storage)\n",
"final_nodes = await load_table_from_storage(\"create_final_nodes\", storage)\n",
"final_relationships = await load_table_from_storage(\n",
" \"create_final_relationships\", storage\n",
"final_documents = await load_table_from_storage(\n",
" \"create_final_documents.parquet\", storage\n",
")\n",
"final_text_units = await load_table_from_storage(\n",
" \"create_final_text_units.parquet\", storage\n",
")\n",
"final_entities = await load_table_from_storage(\"create_final_entities.parquet\", storage)\n",
"final_nodes = await load_table_from_storage(\"create_final_nodes.parquet\", storage)\n",
"final_relationships = await load_table_from_storage(\n",
" \"create_final_relationships.parquet\", storage\n",
")\n",
"final_communities = await load_table_from_storage(\n",
" \"create_final_communities.parquet\", storage\n",
")\n",
"final_communities = await load_table_from_storage(\"create_final_communities\", storage)\n",
"final_community_reports = await load_table_from_storage(\n",
" \"create_final_community_reports\", storage\n",
" \"create_final_community_reports.parquet\", storage\n",
")\n",
"\n",
"\n",
@ -132,7 +139,7 @@
"if \"name\" in final_entities.columns:\n",
" final_entities.rename(columns={\"name\": \"title\"}, inplace=True)\n",
"remove_columns(\n",
" final_entities, [\"name_embedding\", \"graph_embedding\", \"description_embedding\"]\n",
" final_entities, [\"mname_embedding\", \"graph_embedding\", \"description_embedding\"]\n",
")\n",
"\n",
"# Final nodes uses community for joins, which is now an int everywhere\n",
@ -166,15 +173,6 @@
" final_communities[\"id\"] = [str(uuid4()) for _ in range(len(final_communities))]\n",
"if \"parent\" not in final_communities.columns:\n",
" final_communities = final_communities.merge(parent_df, on=\"community\", how=\"left\")\n",
"if \"entity_ids\" not in final_communities.columns:\n",
" node_mapping = (\n",
" final_nodes.loc[:, [\"community\", \"id\"]]\n",
" .groupby(\"community\")\n",
" .agg(entity_ids=(\"id\", list))\n",
" )\n",
" final_communities = final_communities.merge(\n",
" node_mapping, on=\"community\", how=\"left\"\n",
" )\n",
"remove_columns(final_communities, [\"raw_community\"])\n",
"\n",
"# We need int for community and the human_readable_id copy for consistency\n",
@ -185,42 +183,44 @@
" parent_df, on=\"community\", how=\"left\"\n",
" )\n",
"\n",
"await write_table_to_storage(final_documents, \"create_final_documents\", storage)\n",
"await write_table_to_storage(final_text_units, \"create_final_text_units\", storage)\n",
"await write_table_to_storage(final_entities, \"create_final_entities\", storage)\n",
"await write_table_to_storage(final_nodes, \"create_final_nodes\", storage)\n",
"await write_table_to_storage(final_relationships, \"create_final_relationships\", storage)\n",
"await write_table_to_storage(final_communities, \"create_final_communities\", storage)\n",
"await write_table_to_storage(final_documents, \"create_final_documents.parquet\", storage)\n",
"await write_table_to_storage(\n",
" final_community_reports, \"create_final_community_reports\", storage\n",
" final_text_units, \"create_final_text_units.parquet\", storage\n",
")\n",
"await write_table_to_storage(final_entities, \"create_final_entities.parquet\", storage)\n",
"await write_table_to_storage(final_nodes, \"create_final_nodes.parquet\", storage)\n",
"await write_table_to_storage(\n",
" final_relationships, \"create_final_relationships.parquet\", storage\n",
")\n",
"await write_table_to_storage(\n",
" final_communities, \"create_final_communities.parquet\", storage\n",
")\n",
"await write_table_to_storage(\n",
" final_community_reports, \"create_final_community_reports.parquet\", storage\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings\n",
"\n",
"from graphrag.cache.factory import CacheFactory\n",
"from graphrag.cache.factory import create_cache\n",
"from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks\n",
"from graphrag.config.embeddings import get_embedded_fields, get_embedding_settings\n",
"from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings\n",
"\n",
"# We only need to re-run the embeddings workflow, to ensure that embeddings for all required search fields are in place\n",
"# We'll construct the context and run this function flow directly to avoid everything else\n",
"\n",
"\n",
"embedded_fields = get_embedded_fields(config)\n",
"text_embed = get_embedding_settings(config)\n",
"callbacks = NoopWorkflowCallbacks()\n",
"cache_config = config.cache.model_dump() # type: ignore\n",
"cache = CacheFactory().create_cache(\n",
" cache_type=cache_config[\"type\"], # type: ignore\n",
" root_dir=PROJECT_DIRECTORY,\n",
" kwargs=cache_config,\n",
"workflow = next(\n",
" (x for x in pipeline_config.workflows if x.name == \"generate_text_embeddings\"), None\n",
")\n",
"config = workflow.config\n",
"text_embed = config.get(\"text_embed\", {})\n",
"embedded_fields = config.get(\"embedded_fields\", {})\n",
"callbacks = NoopWorkflowCallbacks()\n",
"cache = create_cache(pipeline_config.cache, PROJECT_DIRECTORY)\n",
"\n",
"await generate_text_embeddings(\n",
" final_documents=None,\n",

View File

@ -1,171 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Copyright (c) 2024 Microsoft Corporation.\n",
"# Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Index Migration (v1 to v2)\n",
"\n",
"This notebook is used to maintain data model parity with older indexes for version 2.0 of GraphRAG. If you have a pre-2.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment. If you have a pre-1.0 index, please run the v1 migration notebook first!\n",
"\n",
"NOTE: we recommend regenerating your settings.yml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. This also ensures that you have default vector store config, which is now required or indexing will fail.\n",
"\n",
"WARNING: This will overwrite your parquet files, you may want to make a backup!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This is the directory that has your settings.yaml\n",
"PROJECT_DIRECTORY = \"<your project directory>\""
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from graphrag.config.load_config import load_config\n",
"from graphrag.storage.factory import StorageFactory\n",
"\n",
"config = load_config(Path(PROJECT_DIRECTORY))\n",
"storage_config = config.output.model_dump()\n",
"storage = StorageFactory().create_storage(\n",
" storage_type=storage_config[\"type\"],\n",
" kwargs=storage_config,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"def remove_columns(df, columns):\n",
" \"\"\"Remove columns from a DataFrame, suppressing errors.\"\"\"\n",
" df.drop(labels=columns, axis=1, errors=\"ignore\", inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"from graphrag.utils.storage import (\n",
" delete_table_from_storage,\n",
" load_table_from_storage,\n",
" write_table_to_storage,\n",
")\n",
"\n",
"final_documents = await load_table_from_storage(\"create_final_documents\", storage)\n",
"final_text_units = await load_table_from_storage(\"create_final_text_units\", storage)\n",
"final_entities = await load_table_from_storage(\"create_final_entities\", storage)\n",
"final_covariates = await load_table_from_storage(\"create_final_covariates\", storage)\n",
"final_nodes = await load_table_from_storage(\"create_final_nodes\", storage)\n",
"final_relationships = await load_table_from_storage(\n",
" \"create_final_relationships\", storage\n",
")\n",
"final_communities = await load_table_from_storage(\"create_final_communities\", storage)\n",
"final_community_reports = await load_table_from_storage(\n",
" \"create_final_community_reports\", storage\n",
")\n",
"\n",
"# we've renamed document attributes as metadata\n",
"if \"attributes\" in final_documents.columns:\n",
" final_documents.rename(columns={\"attributes\": \"metadata\"}, inplace=True)\n",
"\n",
"# we're removing the nodes table, so we need to copy the graph columns into entities\n",
"graph_props = (\n",
" final_nodes.loc[:, [\"id\", \"degree\", \"x\", \"y\"]].groupby(\"id\").first().reset_index()\n",
")\n",
"final_entities = final_entities.merge(graph_props, on=\"id\", how=\"left\")\n",
"# we're also persisting the frequency column\n",
"final_entities[\"frequency\"] = final_entities[\"text_unit_ids\"].count()\n",
"\n",
"\n",
"# we added children to communities to eliminate query-time reconstruction\n",
"parent_grouped = final_communities.groupby(\"parent\").agg(\n",
" children=(\"community\", \"unique\")\n",
")\n",
"final_communities = final_communities.merge(\n",
" parent_grouped,\n",
" left_on=\"community\",\n",
" right_on=\"parent\",\n",
" how=\"left\",\n",
")\n",
"# replace NaN children with empty list\n",
"final_communities[\"children\"] = final_communities[\"children\"].apply(\n",
" lambda x: x if isinstance(x, np.ndarray) else [] # type: ignore\n",
")\n",
"\n",
"# add children to the reports as well\n",
"final_community_reports = final_community_reports.merge(\n",
" parent_grouped,\n",
" left_on=\"community\",\n",
" right_on=\"parent\",\n",
" how=\"left\",\n",
")\n",
"\n",
"# we renamed all the output files for better clarity now that we don't have workflow naming constraints from DataShaper\n",
"await write_table_to_storage(final_documents, \"documents\", storage)\n",
"await write_table_to_storage(final_text_units, \"text_units\", storage)\n",
"await write_table_to_storage(final_entities, \"entities\", storage)\n",
"await write_table_to_storage(final_relationships, \"relationships\", storage)\n",
"await write_table_to_storage(final_covariates, \"covariates\", storage)\n",
"await write_table_to_storage(final_communities, \"communities\", storage)\n",
"await write_table_to_storage(final_community_reports, \"community_reports\", storage)\n",
"\n",
"# delete all the old versions\n",
"await delete_table_from_storage(\"create_final_documents\", storage)\n",
"await delete_table_from_storage(\"create_final_text_units\", storage)\n",
"await delete_table_from_storage(\"create_final_entities\", storage)\n",
"await delete_table_from_storage(\"create_final_nodes\", storage)\n",
"await delete_table_from_storage(\"create_final_relationships\", storage)\n",
"await delete_table_from_storage(\"create_final_covariates\", storage)\n",
"await delete_table_from_storage(\"create_final_communities\", storage)\n",
"await delete_table_from_storage(\"create_final_community_reports\", storage)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,194 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Copyright (c) 2024 Microsoft Corporation.\n",
"# Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example of indexing from an existing in-memory dataframe\n",
"\n",
"Newer versions of GraphRAG let you submit a dataframe directly instead of running through the input processing step. This notebook demonstrates with regular or update runs.\n",
"\n",
"If performing an update, the assumption is that your dataframe contains only the new documents to add to the index."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"from pprint import pprint\n",
"\n",
"import pandas as pd\n",
"\n",
"import graphrag.api as api\n",
"from graphrag.config.load_config import load_config\n",
"from graphrag.index.typing.pipeline_run_result import PipelineRunResult"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"PROJECT_DIRECTORY = \"<your project directory>\"\n",
"UPDATE = False\n",
"FILENAME = \"new_documents.parquet\" if UPDATE else \"<original_documents>.parquet\"\n",
"inputs = pd.read_parquet(f\"{PROJECT_DIRECTORY}/input/{FILENAME}\")\n",
"# Only the bare minimum for input. These are the same fields that would be present after the load_input_documents workflow\n",
"inputs = inputs.loc[:, [\"id\", \"title\", \"text\", \"creation_date\"]]"
]
},
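{
"cell_type": "markdown",
"metadata": {},
"source": [
"If your documents are not already on disk, you can build the same minimal dataframe directly in memory. The sketch below assumes the four columns above are all that is needed from the input; the ids, titles, dates, and text are placeholders:\n",
"\n",
"```python\n",
"import pandas as pd\n",
"\n",
"inputs = pd.DataFrame([\n",
"    {\n",
"        \"id\": \"doc-1\",  # placeholder unique id\n",
"        \"title\": \"example.txt\",  # placeholder title\n",
"        \"text\": \"Full text of the first document...\",  # placeholder content\n",
"        \"creation_date\": \"2024-01-01\",  # placeholder date\n",
"    },\n",
"])\n",
"```"
]
},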
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate a `GraphRagConfig` object"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"graphrag_config = load_config(Path(PROJECT_DIRECTORY))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Indexing API\n",
"\n",
"*Indexing* is the process of ingesting raw text data and constructing a knowledge graph. GraphRAG currently supports plaintext (`.txt`) and `.csv` file formats."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Build an index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"index_result: list[PipelineRunResult] = await api.build_index(\n",
" config=graphrag_config, input_documents=inputs, is_update_run=UPDATE\n",
")\n",
"\n",
"# index_result is a list of workflows that make up the indexing pipeline that was run\n",
"for workflow_result in index_result:\n",
" status = f\"error\\n{workflow_result.errors}\" if workflow_result.errors else \"success\"\n",
" print(f\"Workflow Name: {workflow_result.workflow}\\tStatus: {status}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Query an index\n",
"\n",
"To query an index, several index files must first be read into memory and passed to the query API. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"entities = pd.read_parquet(f\"{PROJECT_DIRECTORY}/output/entities.parquet\")\n",
"communities = pd.read_parquet(f\"{PROJECT_DIRECTORY}/output/communities.parquet\")\n",
"community_reports = pd.read_parquet(\n",
" f\"{PROJECT_DIRECTORY}/output/community_reports.parquet\"\n",
")\n",
"\n",
"response, context = await api.global_search(\n",
" config=graphrag_config,\n",
" entities=entities,\n",
" communities=communities,\n",
" community_reports=community_reports,\n",
" community_level=2,\n",
" dynamic_community_selection=False,\n",
" response_type=\"Multiple Paragraphs\",\n",
" query=\"What are the top five themes of the dataset?\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The response object is the official reponse from graphrag while the context object holds various metadata regarding the querying process used to obtain the final response."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Digging into the context a bit more provides users with extremely granular information such as what sources of data (down to the level of text chunks) were ultimately retrieved and used as part of the context sent to the LLM model)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pprint(context) # noqa: T203"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "graphrag",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

(Several binary vector-store data files changed in this range; each encodes id, text, vector (fixed_size_list:float:1536), and attributes fields. The raw binary diffs are omitted as they are not human-readable.)
Some files were not shown because too many files have changed in this diff.