Mirror of https://github.com/microsoft/graphrag.git (synced 2026-01-14 00:57:23 +08:00)

Compare commits: 108 commits
Commits (SHA1):

fdb7e3835b, ac8a7f5eef, 6c86b0a7bb, 2bd3922d8d, 7f996cf584, 9bc899fe95,
2b70e4a4f3, 82cd3b7df2, 075cadd59a, 6d7a50b7f0, 2bf7e7c018, 6c66b7c30f,
a398cc38bb, ac95c917d3, 1cb20b66f5, 2030f94eb4, 69ad36e735, 30bdb35cc8,
77fb7d9d7d, 469ee8568f, 7c28c70d5c, 5713205210, 1da1380615, dce02563eb,
13bf315a35, e84df28e64, 27c6de846f, 1df89727c3, 17e431cf42, 4a42ac81af,
f1e2041f07, 7fba9522d4, fb4fe72a73, f5a472ab14, 24018c6155, 36948b8d2e,
ee1b2db4a0, 56a865bff0, 8fb95a6209, 8c81cc1563, 832abf1e0c, 25bbae8642,
c8621477ed, fbf11f3a7b, 56e0fad218, 25b605b6cd, e2a448170a, ad4cdd685f,
74ad1d4a0c, 89381296c3, 66aab4267e, 0e1a6e3770, 61769dd47e, ffd8db7104,
b7b2b562ce, 3b1e70c06b, 813b4de99f, ddc6541ab6, 321d479ab6, 0d363e6957,
53950f8442, e39d869bed, 66c2cfb3ce, bcb74789f1, bd06d8b4f0, a15942629b,
b4b8b81c0a, 716f93dd8b, facf68148a, ede6a74546, e40476153d, 61a309b182,
0144b3fd88, 5dd9fc53cd, e0d233fe10, faa05b691f, a932b2d342, 54885b8ab1,
7bdeaee94a, a42772d368, efcaf9636d, 7f020826be, 96219a2182, 981fd31963,
35b639399b, 5ef2399a6f, f14cda2b6d, b8b949f3bb, fe461417b5, b94290ec2b,
b9dc7b90d5, a6a78d5897, c02ab0984a, 83cc2daf91, 0805924a35, a4d35bc66f,
30f36316af, ad5b5120ec, 907d271f4e, 53b06aa2ac, 94bd2bb816, d31750f44d,
eeee84e9d9, 1bbce33f42, 053bf60162, 6b33977360, c644338bae, 47adfe16f0
.github/workflows/gh-pages.yml (vendored, 21 lines changed)

@@ -6,8 +6,7 @@ permissions:
   contents: write
 
 env:
-  POETRY_VERSION: '1.8.3'
-  PYTHON_VERSION: '3.11'
+  PYTHON_VERSION: "3.11"
 
 jobs:
   build:
@@ -15,9 +14,7 @@ jobs:
     env:
       GH_PAGES: 1
       DEBUG: 1
-      GRAPHRAG_API_KEY: ${{ secrets.OPENAI_NOTEBOOK_KEY }}
-      GRAPHRAG_LLM_MODEL: ${{ secrets.GRAPHRAG_LLM_MODEL }}
-      GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.GRAPHRAG_EMBEDDING_MODEL }}
+      GRAPHRAG_API_KEY: ${{ secrets.GRAPHRAG_API_KEY }}
 
     steps:
       - uses: actions/checkout@v4
@@ -29,18 +26,16 @@ jobs:
         with:
           python-version: ${{ env.PYTHON_VERSION }}
 
-      - name: Install Poetry ${{ env.POETRY_VERSION }}
-        uses: abatilo/actions-poetry@v3.0.0
-        with:
-          poetry-version: ${{ env.POETRY_VERSION }}
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
 
-      - name: poetry intsall
+      - name: Install dependencies
        shell: bash
-        run: poetry install
+        run: uv sync
 
       - name: mkdocs build
         shell: bash
-        run: poetry run poe build_docs
+        run: uv run poe build_docs
 
       - name: List Docsite Contents
         run: find site
@@ -50,4 +45,4 @@ jobs:
         with:
           branch: gh-pages
           folder: site
-          clean: true
+          clean: true
.github/workflows/python-ci.yml (vendored, 26 lines changed)

@@ -26,9 +26,6 @@ concurrency:
   # Only run the for the latest commit
   cancel-in-progress: true
 
-env:
-  POETRY_VERSION: 1.8.3
-
 jobs:
   python-ci:
     # skip draft PRs
@@ -51,7 +48,7 @@ jobs:
           filters: |
             python:
               - 'graphrag/**/*'
-              - 'poetry.lock'
+              - 'uv.lock'
               - 'pyproject.toml'
               - '**/*.py'
               - '**/*.toml'
@@ -64,26 +61,27 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
-      - name: Install Poetry
-        uses: abatilo/actions-poetry@v3.0.0
-        with:
-          poetry-version: $POETRY_VERSION
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
 
       - name: Install dependencies
         shell: bash
         run: |
-          poetry self add setuptools wheel
-          poetry run python -m pip install gensim
-          poetry install
+          uv sync
+          uv pip install gensim
 
       - name: Check
         run: |
-          poetry run poe check
+          uv run poe check
 
       - name: Build
         run: |
-          poetry build
+          uv build
 
       - name: Unit Test
         run: |
-          poetry run poe test_unit
+          uv run poe test_unit
+
+      - name: Verb Test
+        run: |
+          uv run poe test_verbs
.github/workflows/python-integration-tests.yml (vendored, 28 lines changed)

@@ -26,9 +26,6 @@ concurrency:
   # only run the for the latest commit
   cancel-in-progress: true
 
-env:
-  POETRY_VERSION: 1.8.3
-
 jobs:
   python-ci:
     # skip draft PRs
@@ -51,7 +48,7 @@ jobs:
           filters: |
             python:
               - 'graphrag/**/*'
-              - 'poetry.lock'
+              - 'uv.lock'
               - 'pyproject.toml'
               - '**/*.py'
               - '**/*.toml'
@@ -64,25 +61,24 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
-      - name: Install Poetry
-        uses: abatilo/actions-poetry@v3.0.0
-        with:
-          poetry-version: $POETRY_VERSION
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
 
       - name: Install dependencies
         shell: bash
         run: |
-          poetry self add setuptools wheel
-          poetry run python -m pip install gensim
-          poetry install
+          uv sync
+          uv pip install gensim
 
       - name: Build
         run: |
-          poetry build
+          uv build
 
-      - name: Install Azurite
-        id: azuright
-        uses: potatoqualitee/azuright@v1.1
+      - name: Install and start Azurite
+        shell: bash
+        run: |
+          npm install -g azurite
+          azurite --silent --skipApiVersionCheck --location /tmp/azurite --debug /tmp/azurite-debug.log &
 
       # For more information on installation/setup of Azure Cosmos DB Emulator
       # https://learn.microsoft.com/en-us/azure/cosmos-db/how-to-develop-emulator?tabs=docker-linux%2Cpython&pivots=api-nosql
@@ -97,4 +93,4 @@ jobs:
 
       - name: Integration Test
         run: |
-          poetry run poe test_integration
+          uv run poe test_integration
.github/workflows/python-notebook-tests.yml (vendored, 20 lines changed)

@@ -26,9 +26,6 @@ concurrency:
   # Only run the for the latest commit
   cancel-in-progress: true
 
-env:
-  POETRY_VERSION: 1.8.3
-
 jobs:
   python-ci:
     # skip draft PRs
@@ -41,8 +38,6 @@ jobs:
     env:
       DEBUG: 1
       GRAPHRAG_API_KEY: ${{ secrets.OPENAI_NOTEBOOK_KEY }}
-      GRAPHRAG_LLM_MODEL: ${{ secrets.GRAPHRAG_LLM_MODEL }}
-      GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.GRAPHRAG_EMBEDDING_MODEL }}
 
     runs-on: ${{ matrix.os }}
     steps:
@@ -54,7 +49,7 @@ jobs:
           filters: |
             python:
               - 'graphrag/**/*'
-              - 'poetry.lock'
+              - 'uv.lock'
               - 'pyproject.toml'
               - '**/*.py'
               - '**/*.toml'
@@ -66,18 +61,15 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
-      - name: Install Poetry
-        uses: abatilo/actions-poetry@v3.0.0
-        with:
-          poetry-version: $POETRY_VERSION
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
 
       - name: Install dependencies
         shell: bash
         run: |
-          poetry self add setuptools wheel
-          poetry run python -m pip install gensim
-          poetry install
+          uv sync
+          uv pip install gensim
 
       - name: Notebook Test
         run: |
-          poetry run poe test_notebook
+          uv run poe test_notebook
.github/workflows/python-publish.yml (vendored, 13 lines changed)

@@ -6,7 +6,6 @@ on:
     branches: [main]
 
 env:
-  POETRY_VERSION: "1.8.3"
   PYTHON_VERSION: "3.10"
 
 jobs:
@@ -31,21 +30,19 @@ jobs:
         with:
           python-version: ${{ env.PYTHON_VERSION }}
 
-      - name: Install Poetry
-        uses: abatilo/actions-poetry@v3.0.0
-        with:
-          poetry-version: ${{ env.POETRY_VERSION }}
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
 
       - name: Install dependencies
         shell: bash
-        run: poetry install
+        run: uv sync
 
       - name: Export Publication Version
-        run: echo "version=`poetry version --short`" >> $GITHUB_OUTPUT
+        run: echo "version=$(uv version --short)" >> $GITHUB_OUTPUT
 
       - name: Build Distributable
         shell: bash
-        run: poetry build
+        run: uv build
 
       - name: Publish package distributions to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
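The rewritten version-export step replaces backticks with `$(...)` command substitution. Conceptually it just appends a `key=value` line to the step-outputs file that GitHub Actions exposes via `$GITHUB_OUTPUT`; a sketch (this file only exists on Actions runners, and downstream steps would read the value back as `steps.<step-id>.outputs.version`):

```shell
# Requires uv and a checked-out project with a version in pyproject.toml.
echo "version=$(uv version --short)" >> "$GITHUB_OUTPUT"
```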
.github/workflows/python-smoke-tests.yml (vendored, 51 lines changed)

@@ -26,9 +26,6 @@ concurrency:
   # Only run the for the latest commit
   cancel-in-progress: true
 
-env:
-  POETRY_VERSION: 1.8.3
-
 jobs:
   python-ci:
     # skip draft PRs
@@ -40,25 +37,8 @@ jobs:
       fail-fast: false # Continue running all jobs even if one fails
     env:
       DEBUG: 1
-      GRAPHRAG_LLM_TYPE: "azure_openai_chat"
-      GRAPHRAG_EMBEDDING_TYPE: "azure_openai_embedding"
-      GRAPHRAG_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-      GRAPHRAG_API_BASE: ${{ secrets.GRAPHRAG_API_BASE }}
-      GRAPHRAG_API_VERSION: ${{ secrets.GRAPHRAG_API_VERSION }}
-      GRAPHRAG_LLM_DEPLOYMENT_NAME: ${{ secrets.GRAPHRAG_LLM_DEPLOYMENT_NAME }}
-      GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME: ${{ secrets.GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME }}
-      GRAPHRAG_CACHE_CONTAINER_NAME: "cicache"
-      GRAPHRAG_CACHE_BASE_DIR": "cache"
-      GRAPHRAG_LLM_MODEL: ${{ secrets.GRAPHRAG_LLM_MODEL }}
-      GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.GRAPHRAG_EMBEDDING_MODEL }}
-      GRAPHRAG_ENTITY_EXTRACTION_ENCODING_MODEL: ${{ secrets.GRAPHRAG_ENTITY_EXTRACTION_ENCODING_MODEL }}
-      # We have Windows + Linux runners in 3.10 and 3.11, so we need to divide the rate limits by 4
-      GRAPHRAG_LLM_TPM: 45_000 # 180,000 / 4
-      GRAPHRAG_LLM_RPM: 270 # 1,080 / 4
-      GRAPHRAG_EMBEDDING_TPM: 87_500 # 350,000 / 4
-      GRAPHRAG_EMBEDDING_RPM: 525 # 2,100 / 4
-      GRAPHRAG_CHUNK_SIZE: 1200
-      GRAPHRAG_CHUNK_OVERLAP: 0
       # Azure AI Search config
       AZURE_AI_SEARCH_URL_ENDPOINT: ${{ secrets.AZURE_AI_SEARCH_URL_ENDPOINT }}
       AZURE_AI_SEARCH_API_KEY: ${{ secrets.AZURE_AI_SEARCH_API_KEY }}
@@ -73,7 +53,7 @@ jobs:
           filters: |
             python:
               - 'graphrag/**/*'
-              - 'poetry.lock'
+              - 'uv.lock'
               - 'pyproject.toml'
               - '**/*.py'
               - '**/*.toml'
@@ -86,37 +66,32 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
 
-      - name: Install Poetry
-        uses: abatilo/actions-poetry@v3.0.0
-        with:
-          poetry-version: $POETRY_VERSION
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
 
       - name: Install dependencies
         shell: bash
         run: |
-          poetry self add setuptools wheel
-          poetry run python -m pip install gensim
-          poetry install
+          uv sync
+          uv pip install gensim
 
       - name: Build
         run: |
-          poetry build
+          uv build
 
-      - name: Verb Test
-        run: |
-          poetry run poe test_verbs
-
-      - name: Install Azurite
-        id: azuright
-        uses: potatoqualitee/azuright@v1.1
+      - name: Install and start Azurite
+        shell: bash
+        run: |
+          npm install -g azurite
+          azurite --silent --skipApiVersionCheck --location /tmp/azurite --debug /tmp/azurite-debug.log &
 
       - name: Smoke Test
         if: steps.changes.outputs.python == 'true'
         run: |
-          poetry run poe test_smoke
+          uv run poe test_smoke
 
       - uses: actions/upload-artifact@v4
         if: always()
         with:
-          name: smoke-test-artifacts-${{ matrix.python-version }}-${{ matrix.poetry-version }}-${{ runner.os }}
-          path: tests/fixtures/*/output
+          name: smoke-test-artifacts-${{ matrix.python-version }}-${{ runner.os }}
+          path: tests/fixtures/*
.gitignore (vendored, 2 lines changed)

@@ -1,6 +1,8 @@
 # Python Artifacts
 python/*/lib/
+dist/
+build/
 *.egg-info/
 
 # Test Output
 .coverage
.semversioner/2.0.0.json (new file, 146 lines)

{
  "changes": [
    { "description": "Add children to communities to avoid re-compute.", "type": "major" },
    { "description": "Reorganize and rename workflows and their outputs.", "type": "major" },
    { "description": "Rework API to accept callbacks.", "type": "major" },
    { "description": "Add LMM Manager and Factory, to support provider registration", "type": "minor" },
    { "description": "Add NLP graph extraction.", "type": "minor" },
    { "description": "Add pipeline_start and pipeline_end callbacks.", "type": "minor" },
    { "description": "Move embeddings snapshots to the workflow runner.", "type": "minor" },
    { "description": "Remove config inheritance, hydration, and automatic env var overlays.", "type": "minor" },
    { "description": "Rework the update output storage structure.", "type": "minor" },
    { "description": "Add caching to NLP extractor.", "type": "patch" },
    { "description": "Add vector store id reference to embeddings config.", "type": "patch" },
    { "description": "Export NLP community reports prompt.", "type": "patch" },
    { "description": "Fix DRIFT search on Azure AI Search.", "type": "patch" },
    { "description": "Fix StopAsyncIteration catch.", "type": "patch" },
    { "description": "Fix missing embeddings workflow in FastGraphRAG.", "type": "patch" },
    { "description": "Fix proper use of n_depth for drift search", "type": "patch" },
    { "description": "Fix report generation recursion.", "type": "patch" },
    { "description": "Fix summarization over large datasets for inc indexing. Fix relationship summarization", "type": "patch" },
    { "description": "Optimize data iteration by removing some iterrows from code", "type": "patch" },
    { "description": "Patch json mode for community reports", "type": "patch" },
    { "description": "Properly increment text unit IDs during updates.", "type": "patch" },
    { "description": "Refactor config defaults from constants to type-safe, hierarchical dataclass.", "type": "patch" },
    { "description": "Require explicit azure auth settings when using AOI.", "type": "patch" },
    { "description": "Separates graph pruning for differential usage.", "type": "patch" },
    { "description": "Tuck flow functions under their workflow modules.", "type": "patch" },
    { "description": "Update fnllm. Remove unused libs.", "type": "patch" },
    { "description": "Use ModelProvider for query module", "type": "patch" },
    { "description": "Use shared schema for final outputs.", "type": "patch" },
    { "description": "add dynamic retry logic.", "type": "patch" },
    { "description": "add option to prepend metadata into chunks", "type": "patch" },
    { "description": "cleanup query code duplication.", "type": "patch" },
    { "description": "implemented multi-index querying for api layer", "type": "patch" },
    { "description": "multi index query cli support", "type": "patch" },
    { "description": "remove unused columns and change property document_attribute_columns to metadata", "type": "patch" },
    { "description": "update multi-index query to support new workflows", "type": "patch" }
  ],
  "created_at": "2025-02-25T23:30:50+00:00",
  "version": "2.0.0"
}
.semversioner/2.1.0.json (new file, 22 lines)

{
  "changes": [
    { "description": "Add support for JSON input files.", "type": "minor" },
    { "description": "Updated the prompt tunning client to support csv-metadata injection and updated output file types to match the new naming convention.", "type": "minor" },
    { "description": "Add check for custom model types while config loading", "type": "patch" },
    { "description": "Adds general-purpose pipeline run state object.", "type": "patch" }
  ],
  "created_at": "2025-03-11T23:53:00+00:00",
  "version": "2.1.0"
}
.semversioner/2.2.0.json (new file, 46 lines)

{
  "changes": [
    { "description": "Support OpenAI reasoning models.", "type": "minor" },
    { "description": "Add option to snapshot raw extracted graph tables.", "type": "patch" },
    { "description": "Added batching logic to the prompt tuning autoselection embeddings workflow", "type": "patch" },
    { "description": "Align config classes and docs better.", "type": "patch" },
    { "description": "Align embeddings table loading with configured fields.", "type": "patch" },
    { "description": "Brings parity with our latest NLP extraction approaches.", "type": "patch" },
    { "description": "Fix fnllm to 0.2.3", "type": "patch" },
    { "description": "Fixes to basic search.", "type": "patch" },
    { "description": "Update llm args for consistency.", "type": "patch" },
    { "description": "add vector store integration tests", "type": "patch" }
  ],
  "created_at": "2025-04-25T23:30:57+00:00",
  "version": "2.2.0"
}
.semversioner/2.2.1.json (new file, 18 lines)

{
  "changes": [
    { "description": "Fix Community Report prompt tuning response", "type": "patch" },
    { "description": "Fix graph creation missing edge weights.", "type": "patch" },
    { "description": "Update as workflows", "type": "patch" }
  ],
  "created_at": "2025-04-30T23:50:31+00:00",
  "version": "2.2.1"
}
.semversioner/2.3.0.json (new file, 34 lines)

{
  "changes": [
    { "description": "Remove Dynamic Max Retries support. Refactor typer typing in cli interface", "type": "minor" },
    { "description": "Update fnllm to latest. Update default graphrag configuration", "type": "minor" },
    { "description": "A few fixes and enhancements for better reuse and flow.", "type": "patch" },
    { "description": "Add full llm response to LLM PRovider output", "type": "patch" },
    { "description": "Fix Drift Reduce Response for non streaming calls", "type": "patch" },
    { "description": "Fix global search prompt to include missing formatting key", "type": "patch" },
    { "description": "Upgrade pyarrow dependency to >=17.0.0 to fix CVE-2024-52338", "type": "patch" }
  ],
  "created_at": "2025-05-23T21:02:47+00:00",
  "version": "2.3.0"
}
.semversioner/2.4.0.json (new file, 26 lines)

{
  "changes": [
    { "description": "Allow injection of custom pipelines.", "type": "minor" },
    { "description": "Refactored StorageFactory to use a registration-based approach", "type": "minor" },
    { "description": "Fix default values for tpm and rpm limiters on embeddings", "type": "patch" },
    { "description": "Update typer.", "type": "patch" },
    { "description": "cleaned up logging to follow python standards.", "type": "patch" }
  ],
  "created_at": "2025-07-15T00:04:15+00:00",
  "version": "2.4.0"
}
.semversioner/2.5.0.json (new file, 14 lines)

{
  "changes": [
    { "description": "Add additional context variable to build index signature for custom parameter bag", "type": "minor" },
    { "description": "swap package management from Poetry -> UV", "type": "minor" }
  ],
  "created_at": "2025-08-14T00:59:46+00:00",
  "version": "2.5.0"
}
.semversioner/2.6.0.json (new file, 54 lines)

{
  "changes": [
    { "description": "Add LiteLLM chat and embedding model providers.", "type": "minor" },
    { "description": "Add LoggerFactory and clean up related API.", "type": "minor" },
    { "description": "Add config for NLP async mode.", "type": "minor" },
    { "description": "Add optional input documents to indexing API.", "type": "minor" },
    { "description": "add customization to vector store", "type": "minor" },
    { "description": "Add gpt-5 support by updating fnllm dependency.", "type": "patch" },
    { "description": "Fix all human_readable_id fields to be 0-based.", "type": "patch" },
    { "description": "Fix multi-index search.", "type": "patch" },
    { "description": "Improve upon recent logging refactor", "type": "patch" },
    { "description": "Make cache, storage, and vector_store factories consistent with similar registration support", "type": "patch" },
    { "description": "Remove hard-coded community rate limiter.", "type": "patch" },
    { "description": "generate_text_embeddings only loads tables if embedding field is specified.", "type": "patch" }
  ],
  "created_at": "2025-09-22T21:44:51+00:00",
  "version": "2.6.0"
}
.semversioner/2.7.0.json (new file, 18 lines)

{
  "changes": [
    { "description": "Set LiteLLM as default in init_content.", "type": "minor" },
    { "description": "Fix Azure auth scope issue with LiteLLM.", "type": "patch" },
    { "description": "Housekeeping toward 2.7.", "type": "patch" }
  ],
  "created_at": "2025-10-08T22:39:42+00:00",
  "version": "2.7.0"
}
.vscode/launch.json (vendored, 57 lines changed)

@@ -6,21 +6,24 @@
       "name": "Indexer",
       "type": "debugpy",
       "request": "launch",
-      "module": "poetry",
+      "module": "graphrag",
       "args": [
-        "poe", "index",
-        "--root", "<path_to_ragtest_root_demo>"
+        "index",
+        "--root",
+        "<path_to_index_folder>"
       ],
       "console": "integratedTerminal"
     },
     {
       "name": "Query",
       "type": "debugpy",
       "request": "launch",
-      "module": "poetry",
+      "module": "graphrag",
       "args": [
-        "poe", "query",
-        "--root", "<path_to_ragtest_root_demo>",
-        "--method", "global",
+        "query",
+        "--root",
+        "<path_to_index_folder>",
+        "--method", "basic",
         "--query", "What are the top themes in this story",
       ]
     },
@@ -28,12 +31,48 @@
       "name": "Prompt Tuning",
       "type": "debugpy",
       "request": "launch",
-      "module": "poetry",
+      "module": "uv",
       "args": [
         "poe", "prompt-tune",
         "--config",
         "<path_to_ragtest_root_demo>/settings.yaml",
       ]
-    }
+    },
+    {
+      "name": "Debug Integration Pytest",
+      "type": "debugpy",
+      "request": "launch",
+      "module": "pytest",
+      "args": [
+        "./tests/integration/vector_stores",
+        "-k", "test_azure_ai_search"
+      ],
+      "console": "integratedTerminal",
+      "justMyCode": false
+    },
+    {
+      "name": "Debug Verbs Pytest",
+      "type": "debugpy",
+      "request": "launch",
+      "module": "pytest",
+      "args": [
+        "./tests/verbs",
+        "-k", "test_generate_text_embeddings"
+      ],
+      "console": "integratedTerminal",
+      "justMyCode": false
+    },
+    {
+      "name": "Debug Smoke Pytest",
+      "type": "debugpy",
+      "request": "launch",
+      "module": "pytest",
+      "args": [
+        "./tests/smoke",
+        "-k", "test_fixtures"
+      ],
+      "console": "integratedTerminal",
+      "justMyCode": false
+    },
   ]
 }
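The three new debug configurations map directly onto pytest invocations, so outside VS Code the same test selections can be run from a shell. A sketch using the `uv run` pattern adopted throughout this changeset (assuming pytest is installed as a dev dependency, which the test suites imply):

```shell
# Command-line equivalents of the new launch configs; -k selects tests by name.
uv run pytest ./tests/integration/vector_stores -k test_azure_ai_search
uv run pytest ./tests/verbs -k test_generate_text_embeddings
uv run pytest ./tests/smoke -k test_fixtures
```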
.vscode/settings.json (vendored, 37 lines changed)

@@ -1,43 +1,8 @@
 {
-  "search.exclude": {
-    "**/.yarn": true,
-    "**/.pnp.*": true
-  },
   "editor.formatOnSave": false,
-  "eslint.nodePath": ".yarn/sdks",
-  "typescript.tsdk": ".yarn/sdks/typescript/lib",
-  "typescript.enablePromptUseWorkspaceTsdk": true,
-  "javascript.preferences.importModuleSpecifier": "relative",
-  "javascript.preferences.importModuleSpecifierEnding": "js",
-  "typescript.preferences.importModuleSpecifier": "relative",
-  "typescript.preferences.importModuleSpecifierEnding": "js",
-  "explorer.fileNesting.enabled": true,
-  "explorer.fileNesting.patterns": {
-    "*.ts": "${capture}.ts, ${capture}.hooks.ts, ${capture}.hooks.tsx, ${capture}.contexts.ts, ${capture}.stories.tsx, ${capture}.story.tsx, ${capture}.spec.tsx, ${capture}.base.ts, ${capture}.base.tsx, ${capture}.types.ts, ${capture}.styles.ts, ${capture}.styles.tsx, ${capture}.utils.ts, ${capture}.utils.tsx, ${capture}.constants.ts, ${capture}.module.scss, ${capture}.module.css, ${capture}.md",
-    "*.js": "${capture}.js.map, ${capture}.min.js, ${capture}.d.ts",
-    "*.jsx": "${capture}.js",
-    "*.tsx": "${capture}.ts, ${capture}.hooks.ts, ${capture}.hooks.tsx, ${capture}.contexts.ts, ${capture}.stories.tsx, ${capture}.story.tsx, ${capture}.spec.tsx, ${capture}.base.ts, ${capture}.base.tsx, ${capture}.types.ts, ${capture}.styles.ts, ${capture}.styles.tsx, ${capture}.utils.ts, ${capture}.utils.tsx, ${capture}.constants.ts, ${capture}.module.scss, ${capture}.module.css, ${capture}.md, ${capture}.css",
-    "tsconfig.json": "tsconfig.*.json",
-    "package.json": "package-lock.json, turbo.json, tsconfig.json, rome.json, biome.json, .npmignore, dictionary.txt, cspell.config.yaml",
-    "README.md": "*.md, LICENSE, CODEOWNERS",
-    ".eslintrc": ".eslintignore",
-    ".prettierrc": ".prettierignore",
-    ".gitattributes": ".gitignore",
-    ".yarnrc.yml": "yarn.lock, .pnp.*",
-    "jest.config.js": "jest.setup.mjs",
-    "pyproject.toml": "poetry.lock, poetry.toml, mkdocs.yaml",
-    "cspell.config.yaml": "dictionary.txt"
-  },
-  "azureFunctions.postDeployTask": "npm install (functions)",
-  "azureFunctions.projectLanguage": "TypeScript",
-  "azureFunctions.projectRuntime": "~4",
   "debug.internalConsoleOptions": "neverOpen",
-  "azureFunctions.preDeployTask": "npm prune (functions)",
-  "appService.zipIgnorePattern": [
-    "node_modules{,/**}",
-    ".vscode{,/**}"
-  ],
-  "python.defaultInterpreterPath": "python/services/.venv/bin/python",
+  "python.defaultInterpreterPath": "${workspaceRoot}/.venv/bin/python",
   "python.languageServer": "Pylance",
   "cSpell.customDictionaries": {
     "project-words": {
CHANGELOG.md (108 lines changed)

@@ -1,6 +1,114 @@
 # Changelog
 Note: version releases in the 0.x.y range may introduce breaking changes.
 
+## 2.7.0
+
+- minor: Set LiteLLM as default in init_content.
+- patch: Fix Azure auth scope issue with LiteLLM.
+- patch: Housekeeping toward 2.7.
+
+## 2.6.0
+
+- minor: Add LiteLLM chat and embedding model providers.
+- minor: Add LoggerFactory and clean up related API.
+- minor: Add config for NLP async mode.
+- minor: Add optional input documents to indexing API.
+- minor: add customization to vector store
+- patch: Add gpt-5 support by updating fnllm dependency.
+- patch: Fix all human_readable_id fields to be 0-based.
+- patch: Fix multi-index search.
+- patch: Improve upon recent logging refactor
+- patch: Make cache, storage, and vector_store factories consistent with similar registration support
+- patch: Remove hard-coded community rate limiter.
+- patch: generate_text_embeddings only loads tables if embedding field is specified.
+
+## 2.5.0
+
+- minor: Add additional context variable to build index signature for custom parameter bag
+- minor: swap package management from Poetry -> UV
+
+## 2.4.0
+
+- minor: Allow injection of custom pipelines.
+- minor: Refactored StorageFactory to use a registration-based approach
+- patch: Fix default values for tpm and rpm limiters on embeddings
+- patch: Update typer.
+- patch: cleaned up logging to follow python standards.
+
+## 2.3.0
+
+- minor: Remove Dynamic Max Retries support. Refactor typer typing in cli interface
+- minor: Update fnllm to latest. Update default graphrag configuration
+- patch: A few fixes and enhancements for better reuse and flow.
+- patch: Add full llm response to LLM PRovider output
+- patch: Fix Drift Reduce Response for non streaming calls
+- patch: Fix global search prompt to include missing formatting key
+- patch: Upgrade pyarrow dependency to >=17.0.0 to fix CVE-2024-52338
+
+## 2.2.1
+
+- patch: Fix Community Report prompt tuning response
+- patch: Fix graph creation missing edge weights.
+- patch: Update as workflows
+
+## 2.2.0
+
+- minor: Support OpenAI reasoning models.
+- patch: Add option to snapshot raw extracted graph tables.
+- patch: Added batching logic to the prompt tuning autoselection embeddings workflow
+- patch: Align config classes and docs better.
+- patch: Align embeddings table loading with configured fields.
+- patch: Brings parity with our latest NLP extraction approaches.
+- patch: Fix fnllm to 0.2.3
+- patch: Fixes to basic search.
+- patch: Update llm args for consistency.
+- patch: add vector store integration tests
+
+## 2.1.0
+
+- minor: Add support for JSON input files.
+- minor: Updated the prompt tunning client to support csv-metadata injection and updated output file types to match the new naming convention.
+- patch: Add check for custom model types while config loading
+- patch: Adds general-purpose pipeline run state object.
+
+## 2.0.0
+
+- major: Add children to communities to avoid re-compute.
+- major: Reorganize and rename workflows and their outputs.
+- major: Rework API to accept callbacks.
+- minor: Add LMM Manager and Factory, to support provider registration
+- minor: Add NLP graph extraction.
+- minor: Add pipeline_start and pipeline_end callbacks.
+- minor: Move embeddings snapshots to the workflow runner.
+- minor: Remove config inheritance, hydration, and automatic env var overlays.
+- minor: Rework the update output storage structure.
+- patch: Add caching to NLP extractor.
+- patch: Add vector store id reference to embeddings config.
+- patch: Export NLP community reports prompt.
+- patch: Fix DRIFT search on Azure AI Search.
+- patch: Fix StopAsyncIteration catch.
+- patch: Fix missing embeddings workflow in FastGraphRAG.
+- patch: Fix proper use of n_depth for drift search
+- patch: Fix report generation recursion.
+- patch: Fix summarization over large datasets for inc indexing. Fix relationship summarization
+- patch: Optimize data iteration by removing some iterrows from code
+- patch: Patch json mode for community reports
+- patch: Properly increment text unit IDs during updates.
+- patch: Refactor config defaults from constants to type-safe, hierarchical dataclass.
+- patch: Require explicit azure auth settings when using AOI.
+- patch: Separates graph pruning for differential usage.
+- patch: Tuck flow functions under their workflow modules.
+- patch: Update fnllm. Remove unused libs.
+- patch: Use ModelProvider for query module
+- patch: Use shared schema for final outputs.
+- patch: add dynamic retry logic.
+- patch: add option to prepend metadata into chunks
+- patch: cleanup query code duplication.
+- patch: implemented multi-index querying for api layer
+- patch: multi index query cli support
+- patch: remove unused columns and change property document_attribute_columns to metadata
+- patch: update multi-index query to support new workflows
+
 ## 1.2.0
 
 - minor: Add Drift Reduce response and streaming endpoint
@@ -22,7 +22,7 @@ or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any addi
 2. Create a new branch for your contribution: `git checkout -b my-contribution`.
 3. Make your changes and ensure that the code passes all tests.
 4. Commit your changes: `git commit -m "Add my contribution"`.
-5. Create and commit a semver impact document by running `poetry run semversioner add-change -t <major|minor|patch> -d <description>`.
+5. Create and commit a semver impact document by running `uv run semversioner add-change -t <major|minor|patch> -d <description>`.
 6. Push your changes to your forked repository: `git push origin my-contribution`.
 7. Open a pull request to the main repository.
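For reference, each `semversioner add-change` run writes a small JSON change file (under `.semversioner/next-release/` in semversioner's default layout, an assumption here) that release tooling later aggregates into the versioned files shown above. A sketch of its shape, reusing a real entry from this changeset:

```json
{
  "type": "patch",
  "description": "Fix graph creation missing edge weights."
}
```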
@@ -5,29 +5,29 @@
 | Name                | Installation                                                 | Purpose                                                                              |
 | ------------------- | ------------------------------------------------------------ | ------------------------------------------------------------------------------------ |
 | Python 3.10 or 3.11 | [Download](https://www.python.org/downloads/)                | The library is Python-based.                                                         |
-| Poetry              | [Instructions](https://python-poetry.org/docs/#installation) | Poetry is used for package management and virtualenv management in Python codebases  |
+| uv                  | [Instructions](https://docs.astral.sh/uv/)                   | uv is used for package management and virtualenv management in Python codebases      |
 
 # Getting Started
 
 ## Install Dependencies
 ```shell
 # install python dependencies
-poetry install
+uv sync
 ```
 
 ## Execute the indexing engine
 ```shell
-poetry run poe index <...args>
+uv run poe index <...args>
 ```
 
 ## Execute prompt tuning
 ```shell
-poetry run poe prompt_tune <...args>
+uv run poe prompt_tune <...args>
 ```
 
 ## Execute Queries
 ```shell
-poetry run poe query <...args>
+uv run poe query <...args>
 ```
 
 ## Repository Structure
@@ -63,7 +63,7 @@ Where appropriate, the factories expose a registration method for users to provi
 
 We use [semversioner](https://github.com/raulgomis/semversioner) to automate and enforce semantic versioning in the release process. Our CI/CD pipeline checks that all PR's include a json file generated by semversioner. When submitting a PR, please run:
 ```shell
-poetry run semversioner add-change -t patch -d "<a small sentence describing changes made>."
+uv run semversioner add-change -t patch -d "<a small sentence describing changes made>."
 ```
 
 # Azurite
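For reference, the updated CI workflows above install and start Azurite like this (the `/tmp` paths are the CI's choice; any writable location works):

```shell
# Install the Azurite storage emulator globally, then run it in the background.
npm install -g azurite
azurite --silent --skipApiVersionCheck --location /tmp/azurite --debug /tmp/azurite-debug.log &
```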
@@ -78,29 +78,29 @@ or by simply running `azurite` in the terminal if already installed globally. Se
 
 # Lifecycle Scripts
 
-Our Python package utilizes Poetry to manage dependencies and [poethepoet](https://pypi.org/project/poethepoet/) to manage custom build scripts.
+Our Python package utilizes uv to manage dependencies and [poethepoet](https://pypi.org/project/poethepoet/) to manage custom build scripts.
 
 Available scripts are:
-- `poetry run poe index` - Run the Indexing CLI
-- `poetry run poe query` - Run the Query CLI
-- `poetry build` - This invokes `poetry build`, which will build a wheel file and other distributable artifacts.
-- `poetry run poe test` - This will execute all tests.
-- `poetry run poe test_unit` - This will execute unit tests.
-- `poetry run poe test_integration` - This will execute integration tests.
-- `poetry run poe test_smoke` - This will execute smoke tests.
-- `poetry run poe check` - This will perform a suite of static checks across the package, including:
+- `uv run poe index` - Run the Indexing CLI
+- `uv run poe query` - Run the Query CLI
+- `uv build` - This invokes `uv build`, which will build a wheel file and other distributable artifacts.
+- `uv run poe test` - This will execute all tests.
+- `uv run poe test_unit` - This will execute unit tests.
+- `uv run poe test_integration` - This will execute integration tests.
+- `uv run poe test_smoke` - This will execute smoke tests.
+- `uv run poe check` - This will perform a suite of static checks across the package, including:
   - formatting
   - documentation formatting
   - linting
   - security patterns
   - type-checking
-- `poetry run poe fix` - This will apply any available auto-fixes to the package. Usually this is just formatting fixes.
-- `poetry run poe fix_unsafe` - This will apply any available auto-fixes to the package, including those that may be unsafe.
-- `poetry run poe format` - Explicitly run the formatter across the package.
+- `uv run poe fix` - This will apply any available auto-fixes to the package. Usually this is just formatting fixes.
+- `uv run poe fix_unsafe` - This will apply any available auto-fixes to the package, including those that may be unsafe.
+- `uv run poe format` - Explicitly run the formatter across the package.
 
 ## Troubleshooting
 
-### "RuntimeError: llvm-config failed executing, please point LLVM_CONFIG to the path for llvm-config" when running poetry install
+### "RuntimeError: llvm-config failed executing, please point LLVM_CONFIG to the path for llvm-config" when running uv sync
 
 Make sure llvm-9 and llvm-9-dev are installed:
 
@@ -110,13 +110,8 @@ and then in your bashrc, add
 
 `export LLVM_CONFIG=/usr/bin/llvm-config-9`
 
-### "numba/\_pymodule.h:6:10: fatal error: Python.h: No such file or directory" when running poetry install
+### "numba/\_pymodule.h:6:10: fatal error: Python.h: No such file or directory" when running uv sync
 
 Make sure you have python3.10-dev installed or more generally `python<version>-dev`
 
 `sudo apt-get install python3.10-dev`
 
 ### LLM call constantly exceeds TPM, RPM or time limits
 
 `GRAPHRAG_LLM_THREAD_COUNT` and `GRAPHRAG_EMBEDDING_THREAD_COUNT` are both set to 50 by default. You can modify these values
 to reduce concurrency. Please refer to the [Configuration Documents](https://microsoft.github.io/graphrag/config/overview/)
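For illustration, halving the default concurrency described above might look like the following. Note that these `GRAPHRAG_*` variables belong to the legacy env-var configuration mode; per the config notes later in this page, newer versions only read such values through template syntax in settings.yaml, so treat this as a sketch rather than a universal recipe:

```shell
export GRAPHRAG_LLM_THREAD_COUNT=25        # default is 50
export GRAPHRAG_EMBEDDING_THREAD_COUNT=25  # default is 50
```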
@@ -1,6 +1,5 @@
 # GraphRAG
 
-👉 [Use the GraphRAG Accelerator solution](https://github.com/Azure-Samples/graphrag-accelerator) <br/>
 👉 [Microsoft Research Blog Post](https://www.microsoft.com/en-us/research/blog/graphrag-unlocking-llm-discovery-on-narrative-private-data/)<br/>
 👉 [Read the docs](https://microsoft.github.io/graphrag)<br/>
 👉 [GraphRAG Arxiv](https://arxiv.org/pdf/2404.16130)
@@ -28,7 +27,7 @@ To learn more about GraphRAG and how it can be used to enhance your LLM's abilit
 
 ## Quickstart
 
-To get started with the GraphRAG system we recommend trying the [Solution Accelerator](https://github.com/Azure-Samples/graphrag-accelerator) package. This provides a user-friendly end-to-end experience with Azure resources.
+To get started with the GraphRAG system we recommend trying the [command line quickstart](https://microsoft.github.io/graphrag/get_started/).
 
 ## Repository Guidance
 
@@ -47,6 +46,12 @@ This repository presents a methodology for using knowledge graph memory structur
 Using _GraphRAG_ with your data out of the box may not yield the best possible results.
 We strongly recommend to fine-tune your prompts following the [Prompt Tuning Guide](https://microsoft.github.io/graphrag/prompt_tuning/overview/) in our documentation.
 
+## Versioning
+
+Please see the [breaking changes](./breaking-changes.md) document for notes on our approach to versioning the project.
+
+*Always run `graphrag init --root [path] --force` between minor version bumps to ensure you have the latest config format. Run the provided migration notebook between major version bumps if you want to avoid re-indexing prior datasets. Note that this will overwrite your configuration and prompts, so backup if necessary.*
+
 ## Responsible AI FAQ
 
 See [RAI_TRANSPARENCY.md](./RAI_TRANSPARENCY.md)
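In practice, the upgrade step called out in the new Versioning section is a single command (the `./ragtest` path is illustrative; as noted above, this overwrites config and prompts, so back them up first):

```shell
graphrag init --root ./ragtest --force
```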
breaking-changes.md (new file, 98 lines)

# GraphRAG Data Model and Config Breaking Changes

This document contains notes about our versioning approach and a log of changes over time that may result in breakage. As of version 1.0 we are aligning more closely with standard [semantic versioning](https://semver.org/) practices. However, this is an ongoing research project that needs to balance experimental progress with stakeholder communication about big feature releases, so there may be times when we don't adhere perfectly to the spec.

There are five surface areas that may be impacted on any given release. They are:

- [CLI](https://microsoft.github.io/graphrag/cli/) - The CLI is the interface most project consumers are using. **Changes to the CLI will conform to standard semver.**
- [API](https://github.com/microsoft/graphrag/tree/main/graphrag/api) - The API layer is the primary interface we expect developers to use if they are consuming the project as a library in their own codebases. **Changes to the API layer modules will conform to standard semver.**
- Internals - Any code modules behind the CLI and API layers are considered "internal" and may change at any time without conforming to strict semver. This is intended to give the research team high flexibility to change our underlying implementation rapidly. We are not enforcing access via tightly controlled `__init__.py` files, so please understand that if you utilize modules other than the index or query API, they may break between releases in a non-semver-compliant manner.
- [settings.yaml](https://microsoft.github.io/graphrag/config/yaml/) - The settings.yaml file may have changes made to it as we adjust configurability. **Changes that affect the settings.yml will result in a minor version bump**. `graphrag init` will always emit compatible starter config, so we recommend always running the command when updating GraphRAG between minor versions, and copying your endpoint information or other customizations over to the new file.
- [Data model](https://microsoft.github.io/graphrag/index/outputs/) - The output data model may change over time as we adjust our approach. **Changes to the data model will conform to standard semver.** Any changes to the output tables will be shimmed for backwards compatibility between major releases, and we'll provide a migration notebook for folks to upgrade without requiring a re-index.

> TL;DR: Always run `graphrag init --path [path] --force` between minor version bumps to ensure you have the latest config format. Run the provided migration notebook between major version bumps if you want to avoid re-indexing prior datasets. Note that this will overwrite your configuration and prompts, so backup if necessary.

# v2

Run the [migration notebook](./docs/examples_notebooks/index_migration_to_v2.ipynb) to convert older tables to the v2 format.

The v2 release renamed all of our index tables to simply name the items each table contains. The previous naming was a leftover requirement of our use of DataShaper, which is no longer necessary.

# v1

Run the [migration notebook](./docs/examples_notebooks/index_migration_to_v1.ipynb) to convert older tables to the v1 format.

Note that one of the new requirements is that we write embeddings to a vector store during indexing. By default, this uses a local lancedb instance. When you re-generate the default config, a block will be added to reflect this. If you need to write to Azure AI Search instead, we recommend updating these settings before you index, so you don't need to do a separate vector ingest.

All of the breaking changes listed below are accounted for in the steps above.

## Updated data model

- We have streamlined the data model of the index in a few small ways to align tables more consistently and remove redundant content. Notably:
  - Consistent use of `id` and `human_readable_id` across all tables; this also ensures all int IDs are actually saved as ints and never strings
  - Alignment of fields from `create_final_entities` (such as name -> title) with `create_final_nodes`, and removal of redundant content across these tables
  - Rename of `document.raw_content` to `document.text`
  - Rename of `entity.name` to `entity.title`
  - Rename `rank` to `combined_degree` in `create_final_relationships` and removal of `source_degree` and `target_degree` fields
  - Fixed community tables to use a proper UUID for the `id` field, and retain `community` and `human_readable_id` for the short IDs
  - Removal of all embeddings columns from parquet files in favor of direct vector store writes

### Migration

- Run the migration notebook (some recent changes may invalidate existing caches, so migrating the format is cheaper than re-indexing).

## New required Embeddings

### Change

- Added new required embeddings for `DRIFTSearch` and base RAG capabilities.

### Migration

- Run a new index, leveraging existing cache.

## Vector Store required by default

### Change

- Vector store is now required by default for all search methods.

### Migration

- Run `graphrag init` command to generate a new settings.yaml file with the vector store configuration.
- Run a new index, leveraging existing cache.

## Deprecate timestamp paths

### Change

- Remove support for timestamp paths, those using `${timestamp}` directory nesting.
- Use the same directory for storage output and reporting output.

### Migration

- Ensure output directories no longer use `${timestamp}` directory nesting.

**Using Environment Variables**

- Ensure `GRAPHRAG_STORAGE_BASE_DIR` is set to a static directory, e.g., `output` instead of `output/${timestamp}/artifacts`.
- Ensure `GRAPHRAG_REPORTING_BASE_DIR` is set to a static directory, e.g., `output` instead of `output/${timestamp}/reports`.

[Full docs on using environment variables for configuration](https://microsoft.github.io/graphrag/config/env_vars/).

**Using Configuration File**

```yaml
# rest of settings.yaml file
# ...

storage:
  type: file
  base_dir: "output" # changed from "output/${timestamp}/artifacts"

reporting:
  type: file
  base_dir: "output" # changed from "output/${timestamp}/reports"
```

[Full docs on using YAML files for configuration](https://microsoft.github.io/graphrag/config/yaml/).
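As a conceptual sketch of the v1 column renames listed under "Updated data model" above (the shipped migration notebooks remain the supported path, and the parquet file names here assume the pre-v2 table naming):

```python
import pandas as pd

# Pre-v2 table names; the real migration notebook also handles IDs,
# community tables, and the removed degree/embedding columns.
entities = pd.read_parquet("output/create_final_entities.parquet")
entities = entities.rename(columns={"name": "title"})  # entity.name -> entity.title

documents = pd.read_parquet("output/create_final_documents.parquet")
documents = documents.rename(columns={"raw_content": "text"})  # document.raw_content -> document.text

relationships = pd.read_parquet("output/create_final_relationships.parquet")
relationships = relationships.rename(columns={"rank": "combined_degree"})  # rank -> combined_degree
```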
@@ -33,11 +33,17 @@ cosmosdb
 Hnsw
 odata
 
-# NLTK Terms
+# NLP Terms
 chunker
 wordnet
 maxent
 punkt
+punct
+lemmatizer
+PROPN
+Syntatic
+ents
+INTJ
 
 # Libraries
 Langchain
@@ -72,6 +78,10 @@ semversioner
 mkdocs
 fnllm
 typer
+spacy
+kwargs
+ollama
+litellm
 
 # Library Methods
 iterrows
@@ -93,6 +103,9 @@ itertuples
 isin
 nocache
 nbconvert
+levelno
+acompletion
+aembedding
 
 # HTML
 nbsp
@@ -114,10 +127,7 @@ unhot
 groupby
 retryer
-agenerate
-aembed
-dedupe
 dropna
 dtypes
 notna
 
 # LLM Terms
@@ -125,6 +135,8 @@ AOAI
 embedder
 llm
 llms
+achat
+aembed
 
 # Galaxy-Brain Terms
 Unipartite
@@ -178,13 +190,26 @@ Verdantis's
 # English
 skippable
 upvote
+unconfigured
 
 # Misc
 Arxiv
 kwds
+jsons
+txts
+byog
 
 # Dulce
 astrotechnician
+epitheg
+unspooled
+unnavigated
 
 # Names
 Hochul
+Ashish
+
+#unified-search
+apos
+dearmor
+venv
@ -1,12 +1,18 @@
|
||||
# Default Configuration Mode (using Env Vars)
|
||||
|
||||
## Text-Embeddings Customization
|
||||
As of version 1.3, GraphRAG no longer supports a full complement of pre-built environment variables. Instead, we support variable replacement within the [settings.yml file](yaml.md) so you can specify any environment variables you like.
|
||||
|
||||
The only standard environment variable we expect, and include in the default settings.yml, is `GRAPHRAG_API_KEY`. If you are already using a number of the previous GRAPHRAG_* environment variables, you can insert them with template syntax into settings.yml and they will be adopted.
|
||||
|
||||
> **The environment variables below are documented as an aid for migration, but they WILL NOT be read unless you use template syntax in your settings.yml. We also WILL NOT be updating this page as the main config object changes.**
|
||||
|
||||
---
|
||||
|
||||
### Text-Embeddings Customization
|
||||
|
||||
By default, the GraphRAG indexer will only export embeddings required for our query methods. However, the model has embeddings defined for all plaintext fields, and these can be generated by setting the `GRAPHRAG_EMBEDDING_TARGET` environment variable to `all`.
|
||||
|
||||
If the embedding target is `all`, and you want to only embed a subset of these fields, you may specify which embeddings to skip using the `GRAPHRAG_EMBEDDING_SKIP` argument described below.
|
||||
|
||||
### Embedded Fields
|
||||
#### Embedded Fields
|
||||
|
||||
- `text_unit.text`
|
||||
- `document.text`
|
||||
@ -17,23 +23,23 @@ If the embedding target is `all`, and you want to only embed a subset of these f
|
||||
- `community.summary`
|
||||
- `community.full_content`
|
||||
|
||||
## Input Data
|
||||
### Input Data
|
||||
|
||||
Our pipeline can ingest .csv or .txt data from an input folder. These files can be nested within subfolders. To configure how input data is handled, what fields are mapped over, and how timestamps are parsed, look for configuration values starting with `GRAPHRAG_INPUT_` below. In general, CSV-based data provides the most customizability. Each CSV should at least contain a `text` field (which can be mapped with environment variables), but it's helpful if they also have `title`, `timestamp`, and `source` fields. Additional fields can be included as well, which will land as extra fields on the `Document` table.
|
||||
|
||||
## Base LLM Settings
|
||||
### Base LLM Settings
|
||||
|
||||
These are the primary settings for configuring LLM connectivity.
|
||||
|
||||
| Parameter | Required? | Description | Type | Default Value |
|
||||
| --------------------------- | ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- | ----- | ------------- |
|
||||
| `GRAPHRAG_API_KEY` | **Yes for OpenAI. Optional for AOAI** | The API key. (Note: `OPENAI_API_KEY is also used as a fallback). If not defined when using AOAI, managed identity will be used. | `str` | `None` |
|
||||
| `GRAPHRAG_API_BASE` | **For AOAI** | The API Base URL | `str` | `None` |
|
||||
| `GRAPHRAG_API_VERSION` | **For AOAI** | The AOAI API version. | `str` | `None` |
|
||||
| `GRAPHRAG_API_ORGANIZATION` | | The AOAI organization. | `str` | `None` |
|
||||
| `GRAPHRAG_API_PROXY` | | The AOAI proxy. | `str` | `None` |
|
||||
| Parameter | Required? | Description | Type | Default Value |
|
||||
| --------------------------- | ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- | ----- | ------------- |
|
||||
| `GRAPHRAG_API_KEY` | **Yes for OpenAI. Optional for AOAI** | The API key. (Note: `OPENAI_API_KEY` is also used as a fallback). If not defined when using AOAI, managed identity will be used. | `str` | `None` |
|
||||
| `GRAPHRAG_API_BASE` | **For AOAI** | The API Base URL | `str` | `None` |
|
||||
| `GRAPHRAG_API_VERSION` | **For AOAI** | The AOAI API version. | `str` | `None` |
|
||||
| `GRAPHRAG_API_ORGANIZATION` | | The AOAI organization. | `str` | `None` |
|
||||
| `GRAPHRAG_API_PROXY` | | The AOAI proxy. | `str` | `None` |

## Text Generation Settings
### Text Generation Settings

These settings control the text generation model used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.

@@ -62,7 +68,7 @@ These settings control the text generation model used by the pipeline. Any setti
| `GRAPHRAG_LLM_TOP_P` | | The top_p to use for sampling. | `float` | 1 |
| `GRAPHRAG_LLM_N` | | The number of responses to generate. | `int` | 1 |

## Text Embedding Settings
### Text Embedding Settings

These settings control the text embedding model used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.

@@ -78,8 +84,7 @@ These settings control the text embedding model used by the pipeline. Any settin
| `GRAPHRAG_EMBEDDING_MODEL` | | The model to use for the embedding client. | `str` | `text-embedding-3-small` |
| `GRAPHRAG_EMBEDDING_BATCH_SIZE` | | The number of texts to embed at once. [(Azure limit is 16)](https://learn.microsoft.com/en-us/azure/ai-ce) | `int` | 16 |
| `GRAPHRAG_EMBEDDING_BATCH_MAX_TOKENS` | | The maximum tokens per batch [(Azure limit is 8191)](https://learn.microsoft.com/en-us/azure/ai-services/openai/reference) | `int` | 8191 |
| `GRAPHRAG_EMBEDDING_TARGET` | | The target fields to embed. Either `required` or `all`. | `str` | `required` |
| `GRAPHRAG_EMBEDDING_SKIP` | | A comma-separated list of fields to skip embeddings for (e.g. 'relationship.description') | `str` | `None` |
| `GRAPHRAG_EMBEDDING_TARGET` | | The target fields to embed. Either `required` or `all`. | `str` | `required` | |
| `GRAPHRAG_EMBEDDING_THREAD_COUNT` | | The number of threads to use for parallelization for embeddings. | `int` | |
| `GRAPHRAG_EMBEDDING_THREAD_STAGGER` | | The time to wait (in seconds) between starting each thread for embeddings. | `float` | 50 |
| `GRAPHRAG_EMBEDDING_CONCURRENT_REQUESTS` | | The number of concurrent requests to allow for the embedding client. | `int` | 25 |
@@ -89,41 +94,38 @@ These settings control the text embedding model used by the pipeline. Any settin
| `GRAPHRAG_EMBEDDING_MAX_RETRY_WAIT` | | The maximum number of seconds to wait between retries. | `int` | 10 |
| `GRAPHRAG_EMBEDDING_SLEEP_ON_RATE_LIMIT_RECOMMENDATION` | | Whether to sleep on rate limit recommendation. (Azure Only) | `bool` | `True` |

## Input Settings
### Input Settings

These settings control the data input used by the pipeline. Any settings with a fallback will use the base LLM settings, if available.

### Plaintext Input Data (`GRAPHRAG_INPUT_FILE_TYPE`=text)
#### Plaintext Input Data (`GRAPHRAG_INPUT_FILE_TYPE`=text)

| Parameter | Description | Type | Required or Optional | Default |
| --- | --- | --- | --- | --- |
| `GRAPHRAG_INPUT_FILE_PATTERN` | The file pattern regexp to use when reading input files from the input directory. | `str` | optional | `.*\.txt$` |

### CSV Input Data (`GRAPHRAG_INPUT_FILE_TYPE`=csv)
#### CSV Input Data (`GRAPHRAG_INPUT_FILE_TYPE`=csv)

| Parameter | Description | Type | Required or Optional | Default |
| --- | --- | --- | --- | --- |
| `GRAPHRAG_INPUT_TYPE` | The input storage type to use when reading files. (`file` or `blob`) | `str` | optional | `file` |
| `GRAPHRAG_INPUT_FILE_PATTERN` | The file pattern regexp to use when reading input files from the input directory. | `str` | optional | `.*\.csv$` |
| `GRAPHRAG_INPUT_SOURCE_COLUMN` | The 'source' column to use when reading CSV input files. | `str` | optional | `source` |
| `GRAPHRAG_INPUT_TIMESTAMP_COLUMN` | The 'timestamp' column to use when reading CSV input files. | `str` | optional | `None` |
| `GRAPHRAG_INPUT_TIMESTAMP_FORMAT` | The timestamp format to use when parsing timestamps in the timestamp column. | `str` | optional | `None` |
| `GRAPHRAG_INPUT_TEXT_COLUMN` | The 'text' column to use when reading CSV input files. | `str` | optional | `text` |
| `GRAPHRAG_INPUT_DOCUMENT_ATTRIBUTE_COLUMNS` | A list of CSV columns, comma-separated, to incorporate as document fields. | `str` | optional | `id` |
| `GRAPHRAG_INPUT_METADATA` | A list of CSV columns, comma-separated, to incorporate as JSON in a metadata column. | `str` | optional | `None` |
| `GRAPHRAG_INPUT_TITLE_COLUMN` | The 'title' column to use when reading CSV input files. | `str` | optional | `title` |
| `GRAPHRAG_INPUT_STORAGE_ACCOUNT_BLOB_URL` | The Azure Storage blob endpoint to use when in `blob` mode and using managed identity. Will have the format `https://<storage_account_name>.blob.core.windows.net` | `str` | optional | `None` |
| `GRAPHRAG_INPUT_CONNECTION_STRING` | The connection string to use when reading CSV input files from Azure Blob Storage. | `str` | optional | `None` |
| `GRAPHRAG_INPUT_CONTAINER_NAME` | The container name to use when reading CSV input files from Azure Blob Storage. | `str` | optional | `None` |
| `GRAPHRAG_INPUT_BASE_DIR` | The base directory to read input files from. | `str` | optional | `None` |
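
As an illustration, a local CSV ingest might be configured roughly as follows (the column names are placeholders for your own schema; `GRAPHRAG_INPUT_FILE_TYPE` is described under Data Mapping Settings below):

```sh
# Hypothetical CSV mapping; adapt the columns to your data.
GRAPHRAG_INPUT_FILE_TYPE="csv"
GRAPHRAG_INPUT_FILE_PATTERN=".*\.csv$"
GRAPHRAG_INPUT_TEXT_COLUMN="body"
GRAPHRAG_INPUT_TITLE_COLUMN="headline"
GRAPHRAG_INPUT_TIMESTAMP_COLUMN="published_at"
GRAPHRAG_INPUT_TIMESTAMP_FORMAT="%Y-%m-%d %H:%M:%S"
GRAPHRAG_INPUT_BASE_DIR="input"
```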

## Data Mapping Settings
### Data Mapping Settings

| Parameter | Description | Type | Required or Optional | Default |
| --- | --- | --- | --- | --- |
| `GRAPHRAG_INPUT_FILE_TYPE` | The type of input data, `csv` or `text` | `str` | optional | `text` |
| `GRAPHRAG_INPUT_ENCODING` | The encoding to apply when reading CSV/text input files. | `str` | optional | `utf-8` |

## Data Chunking
### Data Chunking

| Parameter | Description | Type | Required or Optional | Default |
| --- | --- | --- | --- | --- |
@@ -132,7 +134,7 @@ These settings control the data input used by the pipeline. Any settings with a
| `GRAPHRAG_CHUNK_BY_COLUMNS` | A comma-separated list of document attributes to group by when performing TextUnit chunking. | `str` | optional | `id` |
| `GRAPHRAG_CHUNK_ENCODING_MODEL` | The encoding model to use for chunking. | `str` | optional | The top-level encoding model. |

## Prompting Overrides
### Prompting Overrides

| Parameter | Description | Type | Required or Optional | Default |
| --- | --- | --- | --- | --- |
@@ -150,7 +152,7 @@ These settings control the data input used by the pipeline. Any settings with a
| `GRAPHRAG_COMMUNITY_REPORTS_PROMPT_FILE` | The community reports extraction prompt to utilize. | `string` | optional | `None` |
| `GRAPHRAG_COMMUNITY_REPORTS_MAX_LENGTH` | The maximum number of tokens to generate per community report. | `int` | optional | 1500 |

## Storage
### Storage

This section controls the storage mechanism used by the pipeline for exporting output tables.

@@ -162,7 +164,7 @@ This section controls the storage mechanism used by the pipeline used for export
| `GRAPHRAG_STORAGE_CONTAINER_NAME` | The Azure Storage container name to use when in `blob` mode. | `str` | optional | None |
| `GRAPHRAG_STORAGE_BASE_DIR` | The base path to data outputs. | `str` | optional | None |

## Cache
### Cache

This section controls the cache mechanism used by the pipeline. This is used to cache LLM invocation results.

@@ -174,19 +176,19 @@ This section controls the cache mechanism used by the pipeline. This is used to
| `GRAPHRAG_CACHE_CONTAINER_NAME` | The Azure Storage container name to use when in `blob` mode. | `str` | optional | None |
| `GRAPHRAG_CACHE_BASE_DIR` | The base path to the cache files. | `str` | optional | None |

## Reporting
### Reporting

This section controls the reporting mechanism used by the pipeline, for common events and error messages. The default is to write reports to a file in the output directory. However, you can also choose to write reports to the console or to an Azure Blob Storage container.
This section controls the reporting mechanism used by the pipeline, for common events and error messages. The default is to write reports to a file in the output directory. However, you can also choose to write reports to an Azure Blob Storage container.

| Parameter | Description | Type | Required or Optional | Default |
| --- | --- | --- | --- | --- |
| `GRAPHRAG_REPORTING_TYPE` | The type of reporter to use. Options are `file`, `console`, or `blob` | `str` | optional | `file` |
| `GRAPHRAG_REPORTING_TYPE` | The type of reporter to use. Options are `file` or `blob` | `str` | optional | `file` |
| `GRAPHRAG_REPORTING_STORAGE_ACCOUNT_BLOB_URL` | The Azure Storage blob endpoint to use when in `blob` mode and using managed identity. Will have the format `https://<storage_account_name>.blob.core.windows.net` | `str` | optional | None |
| `GRAPHRAG_REPORTING_CONNECTION_STRING` | The Azure Storage connection string to use when in `blob` mode. | `str` | optional | None |
| `GRAPHRAG_REPORTING_CONTAINER_NAME` | The Azure Storage container name to use when in `blob` mode. | `str` | optional | None |
| `GRAPHRAG_REPORTING_BASE_DIR` | The base path to the reporting outputs. | `str` | optional | None |

## Node2Vec Parameters
### Node2Vec Parameters

| Parameter | Description | Type | Required or Optional | Default |
| --- | --- | --- | --- | --- |
@@ -197,7 +199,7 @@ This section controls the reporting mechanism used by the pipeline, for common e
| `GRAPHRAG_NODE2VEC_ITERATIONS` | The number of iterations to run node2vec | `int` | optional | 3 |
| `GRAPHRAG_NODE2VEC_RANDOM_SEED` | The random seed to use for node2vec | `int` | optional | 597832 |

## Data Snapshotting
### Data Snapshotting

| Parameter | Description | Type | Required or Optional | Default |
| --- | --- | --- | --- | --- |
@@ -214,5 +216,4 @@ This section controls the reporting mechanism used by the pipeline, for common e
| `GRAPHRAG_ASYNC_MODE` | Which async mode to use. Either `asyncio` or `threaded`. | `str` | optional | `asyncio` |
| `GRAPHRAG_ENCODING_MODEL` | The text encoding model, used in tiktoken, to encode text. | `str` | optional | `cl100k_base` |
| `GRAPHRAG_MAX_CLUSTER_SIZE` | The maximum number of entities to include in a single Leiden cluster. | `int` | optional | 10 |
| `GRAPHRAG_SKIP_WORKFLOWS` | A comma-separated list of workflow names to skip. | `str` | optional | `None` |
| `GRAPHRAG_UMAP_ENABLED` | Whether to enable UMAP layouts | `bool` | optional | False |

@@ -5,12 +5,13 @@ To start using GraphRAG, you must generate a configuration file. The `init` comm
## Usage

```sh
graphrag init [--root PATH]
graphrag init [--root PATH] [--force, --no-force]
```

## Options

- `--root PATH` - The project root directory to initialize graphrag at. Default is the current directory.
- `--force`, `--no-force` - Optional, default is --no-force. Overwrite existing configuration and prompt files if they exist.

## Example

@@ -28,4 +29,4 @@ The `init` command will create the following files in the specified directory:

## Next Steps

After initializing your workspace, you can either run the [Prompt Tuning](../prompt_tuning/auto_prompt_tuning.md) command to adapt the prompts to your data or even start running the [Indexing Pipeline](../index/overview.md) to index your data. For more information on configuring GraphRAG, see the [Configuration](overview.md) documentation.
After initializing your workspace, you can either run the [Prompt Tuning](../prompt_tuning/auto_prompt_tuning.md) command to adapt the prompts to your data or even start running the [Indexing Pipeline](../index/overview.md) to index your data. For more information on configuration options available, see the [YAML details page](yaml.md).

docs/config/models.md (new file, 130 lines)
@@ -0,0 +1,130 @@
# Language Model Selection and Overriding

This page contains information on selecting a model to use and options to supply your own model for GraphRAG. Note that this is not a guide to finding the right model for your use case.

## Default Model Support

GraphRAG was built and tested using OpenAI models, so this is the default model set we support. This is not intended to be a limiter or statement of quality or fitness for your use case, only that it's the set we are most familiar with for prompting, tuning, and debugging.

GraphRAG also utilizes a language model wrapper library used by several projects within our team, called fnllm. fnllm provides two important functions for GraphRAG: rate limiting configuration to help us maximize throughput for large indexing jobs, and robust caching of API calls to minimize consumption on repeated indexes for testing, experimentation, or incremental ingest. fnllm uses the OpenAI Python SDK under the covers, so OpenAI-compliant endpoints are a base requirement out-of-the-box.

Starting with version 2.6.0, GraphRAG supports using [LiteLLM](https://docs.litellm.ai/) instead of fnllm for calling language models. LiteLLM provides support for 100+ models, though it is important to note that when choosing a model it must support returning [structured outputs](https://openai.com/index/introducing-structured-outputs-in-the-api/) adhering to a [JSON schema](https://docs.litellm.ai/docs/completion/json_mode).

Example using LiteLLM as the language model tool for GraphRAG:

```yaml
models:
  default_chat_model:
    type: chat
    auth_type: api_key
    api_key: ${GEMINI_API_KEY}
    model_provider: gemini
    model: gemini-2.5-flash-lite
  default_embedding_model:
    type: embedding
    auth_type: api_key
    api_key: ${GEMINI_API_KEY}
    model_provider: gemini
    model: gemini-embedding-001
```

To use LiteLLM, one must:

- Set `type` to either `chat` or `embedding`.
- Provide a `model_provider`, e.g., `openai`, `azure`, `gemini`, etc.
- Set the `model` to one supported by the `model_provider`'s API.
- Provide a `deployment_name` if using `azure` as the `model_provider`.

See [Detailed Configuration](yaml.md) for more details on configuration. [View LiteLLM basic usage](https://docs.litellm.ai/docs/#basic-usage) for details on how models are called (the `model_provider` is the portion prior to the `/` while the `model` is the portion following the `/`).

## Model Selection Considerations

GraphRAG has been most thoroughly tested with the gpt-4 series of models from OpenAI, including gpt-4, gpt-4-turbo, gpt-4o, and gpt-4o-mini. Our [arXiv paper](https://arxiv.org/abs/2404.16130), for example, performed quality evaluation using gpt-4-turbo. As stated above, non-OpenAI models are now supported with GraphRAG 2.6.0 and onwards through the use of LiteLLM, but the gpt-4 series of models from OpenAI remains the most tested and supported suite of models for GraphRAG.

Versions of GraphRAG before 2.2.0 made extensive use of `max_tokens` and `logit_bias` to control generated response length or content. The introduction of the o-series of models added new, non-compatible parameters because these models include a reasoning component that has different consumption patterns and response generation attributes than non-reasoning models. GraphRAG 2.2.0 now supports these models, but there are important differences that need to be understood before you switch.

- Previously, GraphRAG used `max_tokens` to limit responses in a few locations. This was done so that we can have predictable content sizes when building downstream context windows for summarization. We have now switched from using `max_tokens` to use a prompted approach, which is working well in our tests. We suggest using `max_tokens` in your language model config only for budgetary reasons if you want to limit consumption, and not for expected response length control. We now also support the o-series equivalent `max_completion_tokens`, but if you use this keep in mind that there may be some unknown fixed reasoning consumption amount in addition to the response tokens, so it is not a good technique for response control.
- Previously, GraphRAG used a combination of `max_tokens` and `logit_bias` to strictly control a binary yes/no question during gleanings. This is not possible with reasoning models, so again we have switched to a prompted approach. Our tests with gpt-4o, gpt-4o-mini, and o1 show that this works consistently, but could have issues if you have an older or smaller model.
- The o-series models are much slower and more expensive. It may be useful to use an asymmetric approach to model use in your config: you can define as many models as you like in the `models` block of your settings.yaml and reference them by key for every workflow that requires a language model. You could use gpt-4o for indexing and o1 for query, for example. Experiment to find the right balance of cost, speed, and quality for your use case.
- The o-series models contain a form of native chain-of-thought reasoning that is absent in the non-o-series models. GraphRAG's prompts sometimes contain CoT because it was an effective technique with the gpt-4* series. It may be counterproductive with the o-series, so you may want to tune or even re-write large portions of the prompt templates (particularly for graph and claim extraction).

Example config with asymmetric model use:

```yaml
models:
  extraction_chat_model:
    api_key: ${GRAPHRAG_API_KEY}
    type: openai_chat
    auth_type: api_key
    model: gpt-4o
    model_supports_json: true
  query_chat_model:
    api_key: ${GRAPHRAG_API_KEY}
    type: openai_chat
    auth_type: api_key
    model: o1
    model_supports_json: true

...

extract_graph:
  model_id: extraction_chat_model
  prompt: "prompts/extract_graph.txt"
  entity_types: [organization,person,geo,event]
  max_gleanings: 1

...

global_search:
  chat_model_id: query_chat_model
  map_prompt: "prompts/global_search_map_system_prompt.txt"
  reduce_prompt: "prompts/global_search_reduce_system_prompt.txt"
  knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt"
```

Another option would be to avoid using a language model at all for the graph extraction, instead using the `fast` [indexing method](../index/methods.md) that uses NLP for portions of the indexing phase in lieu of LLM APIs.

## Using Non-OpenAI Models

As shown above, non-OpenAI models may be used via LiteLLM starting with GraphRAG version 2.6.0, but cases may still exist in which some users wish to use models not supported by LiteLLM. There are two approaches one can use to connect to unsupported models:

### Proxy APIs

Many users have used platforms such as [ollama](https://ollama.com/) and [LiteLLM Proxy Server](https://docs.litellm.ai/docs/simple_proxy) to proxy the underlying model HTTP calls to a different model provider. This seems to work reasonably well, but we frequently see issues with malformed responses (especially JSON), so if you do this please understand that your model needs to reliably return the specific response formats that GraphRAG expects. If you're having trouble with a model, you may need to try prompting to coax the format, or intercepting the response within your proxy to try and handle malformed responses.

### Model Protocol

As of GraphRAG 2.0.0, we support model injection through the use of a standard chat and embedding Protocol and an accompanying ModelFactory that you can use to register your model implementation. This is not supported with the CLI, so you'll need to use GraphRAG as a library.

- Our Protocol is [defined here](https://github.com/microsoft/graphrag/blob/main/graphrag/language_model/protocol/base.py)
- Our base implementation, which wraps fnllm, [is here](https://github.com/microsoft/graphrag/blob/main/graphrag/language_model/providers/fnllm/models.py)
- We have a simple mock implementation in our tests that you can [reference here](https://github.com/microsoft/graphrag/blob/main/tests/mock_provider.py)

Once you have a model implementation, you need to register it with our ModelFactory:

```python
class MyCustomModel:
    ...
    # implementation

# elsewhere...
ModelFactory.register_chat("my-custom-chat-model", lambda **kwargs: MyCustomModel(**kwargs))
```

Then in your config you can reference the type name you used:

```yaml
models:
  default_chat_model:
    type: my-custom-chat-model

extract_graph:
  model_id: default_chat_model
  prompt: "prompts/extract_graph.txt"
  entity_types: [organization,person,geo,event]
  max_gleanings: 1
```

Note that your custom model will be passed the same params for init and method calls that we use throughout GraphRAG. There is not currently any ability to define custom parameters, so you may need to use closure scope or a factory pattern within your implementation to get custom config values.
@@ -4,8 +4,8 @@ The GraphRAG system is highly configurable. This page provides an overview of th

## Default Configuration Mode

The default configuration mode is the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. The primary configuration sections for the Indexing Engine pipelines are described below. The main ways to set up GraphRAG in Default Configuration mode are via:
The default configuration mode is the simplest way to get started with the GraphRAG system. It is designed to work out-of-the-box with minimal configuration. The main ways to set up GraphRAG in Default Configuration mode are via:

- [Init command](init.md) (recommended)
- [Using YAML for deeper control](yaml.md)
- [Purely using environment variables](env_vars.md)
- [Init command](init.md) (recommended first step)
- [Edit settings.yaml for deeper control](yaml.md)
- [Purely using environment variables](env_vars.md) (not recommended)

@@ -17,200 +17,285 @@ llm:

# Config Sections

## Indexing
## Language Model Setup

### llm
### models

This is the base LLM configuration section. Other steps may override this configuration with their own LLM configuration.
This is a dict of model configurations. The dict key is used to reference this configuration elsewhere when a model instance is desired. In this way, you can specify as many different models as you need, and reference them differentially in the workflow steps.

For example:
```yml
models:
  default_chat_model:
    api_key: ${GRAPHRAG_API_KEY}
    type: openai_chat
    model: gpt-4o
    model_supports_json: true
  default_embedding_model:
    api_key: ${GRAPHRAG_API_KEY}
    type: openai_embedding
    model: text-embedding-ada-002
```

#### Fields

- `api_key` **str** - The OpenAI API key to use.
- `type` **openai_chat|azure_openai_chat|openai_embedding|azure_openai_embedding** - The type of LLM to use.
- `auth_type` **api_key|azure_managed_identity** - Indicate how you want to authenticate requests.
- `type` **chat**|**embedding**|**openai_chat|azure_openai_chat|openai_embedding|azure_openai_embedding|mock_chat|mock_embeddings** - The type of LLM to use.
- `model_provider` **str|None** - The model provider to use, e.g., openai, azure, anthropic, etc. Required when `type == chat|embedding`. When `type == chat|embedding`, [LiteLLM](https://docs.litellm.ai/) is used under the hood, which has support for calling 100+ models. [View LiteLLM basic usage](https://docs.litellm.ai/docs/#basic-usage) for details on how models are called (the `model_provider` is the portion prior to the `/` while the `model` is the portion following the `/`). [View Language Model Selection](models.md) for more details and examples on using LiteLLM.
- `model` **str** - The model name.
- `max_tokens` **int** - The maximum number of output tokens.
- `request_timeout` **float** - The per-request timeout.
- `encoding_model` **str** - The text encoding model to use. Default is to use the encoding model aligned with the language model (i.e., it is retrieved from tiktoken if unset).
- `api_base` **str** - The API base url to use.
- `api_version` **str** - The API version
- `api_version` **str** - The API version.
- `deployment_name` **str** - The deployment name to use (Azure).
- `organization` **str** - The client organization.
- `proxy` **str** - The proxy URL to use.
- `audience` **str** - (Azure OpenAI only) The URI of the target Azure resource/service for which a managed identity token is requested. Used if `api_key` is not defined. Default=`https://cognitiveservices.azure.com/.default`
- `deployment_name` **str** - The deployment name to use (Azure).
- `model_supports_json` **bool** - Whether the model supports JSON-mode output.
- `request_timeout` **float** - The per-request timeout.
- `tokens_per_minute` **int** - Set a leaky-bucket throttle on tokens-per-minute.
- `requests_per_minute` **int** - Set a leaky-bucket throttle on requests-per-minute.
- `retry_strategy` **str** - Retry strategy to use, "native" is the default and uses the strategy built into the OpenAI SDK. Other allowable values include "exponential_backoff", "random_wait", and "incremental_wait".
- `max_retries` **int** - The maximum number of retries to use.
- `max_retry_wait` **float** - The maximum backoff time.
- `sleep_on_rate_limit_recommendation` **bool** - Whether to adhere to sleep recommendations (Azure).
- `concurrent_requests` **int** - The number of open requests to allow at once.
- `temperature` **float** - The temperature to use.
- `top_p` **float** - The top-p value to use.
- `async_mode` **asyncio|threaded** - The async mode to use. Either `asyncio` or `threaded`.
- `responses` **list[str]** - If this model type is mock, this is a list of response strings to return.
- `n` **int** - The number of completions to generate.
- `max_tokens` **int** - The maximum number of output tokens. Not valid for o-series models.
- `temperature` **float** - The temperature to use. Not valid for o-series models.
- `top_p` **float** - The top-p value to use. Not valid for o-series models.
- `frequency_penalty` **float** - Frequency penalty for token generation. Not valid for o-series models.
- `presence_penalty` **float** - Presence penalty for token generation. Not valid for o-series models.
- `max_completion_tokens` **int** - Max number of tokens to consume for chat completion. Must be large enough to include an unknown amount for "reasoning" by the model. o-series models only.
- `reasoning_effort` **low|medium|high** - Amount of "thought" for the model to expend reasoning about a response. o-series models only.

### parallelization

#### Fields

- `stagger` **float** - The threading stagger value.
- `num_threads` **int** - The maximum number of work threads.

### async_mode

**asyncio|threaded** - The async mode to use. Either `asyncio` or `threaded`.

### embeddings

#### Fields

- `llm` (see LLM top-level config)
- `parallelization` (see Parallelization top-level config)
- `async_mode` (see Async Mode top-level config)
- `batch_size` **int** - The maximum batch size to use.
- `batch_max_tokens` **int** - The maximum batch # of tokens.
- `target` **required|all|none** - Determines which set of embeddings to export.
- `skip` **list[str]** - Which embeddings to skip. Only useful if target=all to customize the list.
- `vector_store` **dict** - The vector store to use. Configured for lancedb by default.
  - `type` **str** - `lancedb` or `azure_ai_search`. Default=`lancedb`
  - `db_uri` **str** (only for lancedb) - The database uri. Default=`storage.base_dir/lancedb`
  - `url` **str** (only for AI Search) - AI Search endpoint
  - `api_key` **str** (optional - only for AI Search) - The AI Search api key to use.
  - `audience` **str** (only for AI Search) - Audience for managed identity token if managed identity authentication is used.
  - `overwrite` **bool** (only used at index creation time) - Overwrite collection if it exists. Default=`True`
  - `container_name` **str** - The name of a vector container. This stores all indexes (tables) for a given dataset ingest. Default=`default`
- `strategy` **dict** - Fully override the text-embedding strategy.
## Input Files and Chunking

### input

Our pipeline can ingest .csv, .txt, or .json data from an input location. See the [inputs page](../index/inputs.md) for more details and examples.

#### Fields

- `type` **file|blob** - The input type to use. Default=`file`
- `file_type` **text|csv** - The type of input data to load. Either `text` or `csv`. Default is `text`
- `base_dir` **str** - The base directory to read input from, relative to the root.
- `connection_string` **str** - (blob only) The Azure Storage connection string.
- `storage_account_blob_url` **str** - The storage account blob URL to use.
- `container_name` **str** - (blob only) The Azure Storage container name.
- `file_encoding` **str** - The encoding of the input file. Default is `utf-8`
- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$` if in csv mode and `.*\.txt$` if in text mode.
- `storage` **StorageConfig**
  - `type` **file|blob|cosmosdb** - The storage type to use. Default=`file`
  - `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
  - `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string.
  - `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
  - `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
  - `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.
- `file_type` **text|csv|json** - The type of input data to load. Default is `text`
- `encoding` **str** - The encoding of the input file. Default is `utf-8`
- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$`, `.*\.txt$`, or `.*\.json$` depending on the specified `file_type`, but you can customize it if needed.
- `file_filter` **dict** - Key/value pairs to filter. Default is None.
- `source_column` **str** - (CSV Mode Only) The source column name.
- `timestamp_column` **str** - (CSV Mode Only) The timestamp column name.
- `timestamp_format` **str** - (CSV Mode Only) The source format.
- `text_column` **str** - (CSV Mode Only) The text column name.
- `title_column` **str** - (CSV Mode Only) The title column name.
- `document_attribute_columns` **list[str]** - (CSV Mode Only) The additional document attributes to include.
- `text_column` **str** - (CSV/JSON only) The text column name. If unset we expect a column named `text`.
- `title_column` **str** - (CSV/JSON only) The title column name, filename will be used if unset.
- `metadata` **list[str]** - (CSV/JSON only) The additional document attribute fields to keep.

### chunks

These settings configure how we parse documents into text chunks. This is necessary because very large documents may not fit into a single context window, and graph extraction accuracy can be modulated. Also note the `metadata` setting in the input document config, which will replicate document metadata into each chunk. A minimal sketch follows the field list below.

#### Fields

- `size` **int** - The max chunk size in tokens.
- `overlap` **int** - The chunk overlap in tokens.
- `group_by_columns` **list[str]** - group documents by fields before chunking.
- `encoding_model` **str** - The text encoding model to use. Default is to use the top-level encoding model.
- `strategy` **dict** - Fully override the chunking strategy.
- `group_by_columns` **list[str]** - Group documents by these fields before chunking.
- `strategy` **str**[tokens|sentences] - How to chunk the text.
- `encoding_model` **str** - The text encoding model to use for splitting on token boundaries.
- `prepend_metadata` **bool** - Determines if metadata values should be added at the beginning of each chunk. Default=`False`.
- `chunk_size_includes_metadata` **bool** - Specifies whether the chunk size calculation should include metadata tokens. Default=`False`.
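
A minimal `settings.yaml` sketch of this section; the values mirror common defaults and are illustrative rather than prescriptive:

```yaml
chunks:
  size: 1200              # max tokens per chunk
  overlap: 100            # tokens shared between consecutive chunks
  group_by_columns: [id]  # chunk within each source document
```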

## Outputs and Storage

### output

This section controls the storage mechanism used by the pipeline for exporting output tables. A blob-backed sketch follows the field list below.

#### Fields

- `type` **file|memory|blob|cosmosdb** - The storage type to use. Default=`file`
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string.
- `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
- `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.
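
For instance, a blob-backed output section might look roughly like this (the environment variable name is a placeholder of your own choosing):

```yaml
output:
  type: blob
  connection_string: ${STORAGE_CONNECTION_STRING} # hypothetical env var
  container_name: graphrag
  base_dir: output
```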

### update_index_output

This section defines a secondary storage location for running incremental indexing, to preserve your original outputs.

#### Fields

- `type` **file|memory|blob|cosmosdb** - The storage type to use. Default=`file`
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string.
- `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
- `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.

### cache

#### Fields

- `type` **file|memory|none|blob** - The cache type to use. Default=`file`
- `connection_string` **str** - (blob only) The Azure Storage connection string.
- `container_name` **str** - (blob only) The Azure Storage container name.
- `base_dir` **str** - The base directory to write cache to, relative to the root.
- `storage_account_blob_url` **str** - The storage account blob URL to use.

### storage
This section controls the cache mechanism used by the pipeline. This is used to cache LLM invocation results for faster performance when re-running the indexing process.

#### Fields

- `type` **file|memory|blob** - The storage type to use. Default=`file`
- `connection_string` **str** - (blob only) The Azure Storage connection string.
- `container_name` **str** - (blob only) The Azure Storage container name.
- `type` **file|memory|blob|cosmosdb** - The storage type to use. Default=`file`
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `storage_account_blob_url` **str** - The storage account blob URL to use.

### update_index_storage

#### Fields

- `type` **file|memory|blob** - The storage type to use. Default=`file`
- `connection_string` **str** - (blob only) The Azure Storage connection string.
- `container_name` **str** - (blob only) The Azure Storage container name.
- `base_dir` **str** - The base directory to write output artifacts to, relative to the root.
- `storage_account_blob_url` **str** - The storage account blob URL to use.
- `connection_string` **str** - (blob/cosmosdb only) The Azure Storage connection string.
- `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
- `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
- `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.

### reporting

This section controls the reporting mechanism used by the pipeline, for common events and error messages. The default is to write reports to a file in the output directory. However, you can also choose to write reports to an Azure Blob Storage container.

#### Fields

- `type` **file|console|blob** - The reporting type to use. Default=`file`
- `type` **file|blob** - The reporting type to use. Default=`file`
- `base_dir` **str** - The base directory to write reports to, relative to the root.
- `connection_string` **str** - (blob only) The Azure Storage connection string.
- `container_name` **str** - (blob only) The Azure Storage container name.
- `base_dir` **str** - The base directory to write reports to, relative to the root.
- `storage_account_blob_url` **str** - The storage account blob URL to use.

### entity_extraction
### vector_store

Where to put all vectors for the system. Configured for lancedb by default. This is a dict, with the key used to identify individual store parameters (e.g., for text embedding). A lancedb sketch follows the field list below.

#### Fields

- `llm` (see LLM top-level config)
- `parallelization` (see Parallelization top-level config)
- `async_mode` (see Async Mode top-level config)
- `type` **lancedb|azure_ai_search|cosmosdb** - Type of vector store. Default=`lancedb`
- `db_uri` **str** (only for lancedb) - The database uri. Default=`storage.base_dir/lancedb`
- `url` **str** (only for AI Search) - AI Search endpoint
- `api_key` **str** (optional - only for AI Search) - The AI Search api key to use.
- `audience` **str** (only for AI Search) - Audience for managed identity token if managed identity authentication is used.
- `container_name` **str** - The name of a vector container. This stores all indexes (tables) for a given dataset ingest. Default=`default`
- `database_name` **str** - (cosmosdb only) Name of the database.
- `overwrite` **bool** (only used at index creation time) - Overwrite collection if it exists. Default=`True`
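
A sketch of a lancedb-backed store, assuming a dict key named `default_vector_store` that workflow settings such as `embed_text.vector_store_id` can reference:

```yaml
vector_store:
  default_vector_store:
    type: lancedb
    db_uri: output/lancedb
    container_name: default
    overwrite: true
```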

## Workflow Configurations

These settings control each individual workflow as they execute.

### workflows

**list[str]** - This is a list of workflow names to run, in order. GraphRAG has built-in pipelines to configure this, but you can run exactly and only what you want by specifying the list here. Useful if you have done part of the processing yourself.
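
For example, to run only a hand-picked subset (the workflow names here are illustrative; check the built-in pipeline for the exact names available in your version):

```yaml
workflows: [create_base_text_units, extract_graph, create_communities]
```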

### embed_text

By default, the GraphRAG indexer will only export embeddings required for our query methods. However, the model has embeddings defined for all plaintext fields, and these can be customized by setting the `target` and `names` fields; a sketch follows the field list below.

Supported embedding names are:

- `text_unit.text`
- `document.text`
- `entity.title`
- `entity.description`
- `relationship.description`
- `community.title`
- `community.summary`
- `community.full_content`

#### Fields

- `model_id` **str** - Name of the model definition to use for text embedding.
- `vector_store_id` **str** - Name of vector store definition to write to.
- `batch_size` **int** - The maximum batch size to use.
- `batch_max_tokens` **int** - The maximum batch # of tokens.
- `names` **list[str]** - List of the embedding names to run (must be in supported list).
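
A sketch of a customized run, reusing the model and vector store keys assumed in earlier examples and listing an explicit subset of the supported embedding names:

```yaml
embed_text:
  model_id: default_embedding_model      # assumed key in the models block
  vector_store_id: default_vector_store  # assumed key in the vector_store block
  names:
    - text_unit.text
    - entity.description
    - community.full_content
```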

### extract_graph

Tune the language model-based graph extraction process.

#### Fields

- `model_id` **str** - Name of the model definition to use for API calls.
- `prompt` **str** - The prompt file to use.
- `entity_types` **list[str]** - The entity types to identify.
- `max_gleanings` **int** - The maximum number of gleaning cycles to use.
- `encoding_model` **str** - The text encoding model to use. By default, this will use the top-level encoding model.
- `strategy` **dict** - Fully override the entity extraction strategy.

### summarize_descriptions

#### Fields

- `llm` (see LLM top-level config)
- `parallelization` (see Parallelization top-level config)
- `async_mode` (see Async Mode top-level config)
- `model_id` **str** - Name of the model definition to use for API calls.
- `prompt` **str** - The prompt file to use.
- `max_length` **int** - The maximum number of output tokens per summarization.
- `strategy` **dict** - Fully override the summarize description strategy.
- `max_input_length` **int** - The maximum number of tokens to collect for summarization (this will limit how many descriptions you send to be summarized for a given entity or relationship).

### claim_extraction
### extract_graph_nlp

Defines settings for NLP-based graph extraction methods. An illustrative snippet follows the field list below.

#### Fields

- `normalize_edge_weights` **bool** - Whether to normalize the edge weights during graph construction. Default=`True`.
- `text_analyzer` **dict** - Parameters for the NLP model.
  - `extractor_type` **regex_english|syntactic_parser|cfg** - Default=`regex_english`.
  - `model_name` **str** - Name of NLP model (for SpaCy-based models)
  - `max_word_length` **int** - Longest word to allow. Default=`15`.
  - `word_delimiter` **str** - Delimiter to split words. Default ' '.
  - `include_named_entities` **bool** - Whether to include named entities in noun phrases. Default=`True`.
  - `exclude_nouns` **list[str] | None** - List of nouns to exclude. If `None`, we use an internal stopword list.
  - `exclude_entity_tags` **list[str]** - List of entity tags to ignore.
  - `exclude_pos_tags` **list[str]** - List of part-of-speech tags to ignore.
  - `noun_phrase_tags` **list[str]** - List of noun phrase tags to ignore.
  - `noun_phrase_grammars` **dict[str, str]** - Noun phrase grammars for the model (cfg-only).
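
An illustrative snippet that simply spells out the defaults listed above:

```yaml
extract_graph_nlp:
  normalize_edge_weights: true
  text_analyzer:
    extractor_type: regex_english
    max_word_length: 15
    include_named_entities: true
```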

### prune_graph

Parameters for manual graph pruning. This can be used to optimize the modularity of your graph clusters by removing overly-connected or rare nodes. An illustrative snippet follows the field list below.

#### Fields

- `min_node_freq` **int** - The minimum node frequency to allow.
- `max_node_freq_std` **float | None** - The maximum standard deviation of node frequency to allow.
- `min_node_degree` **int** - The minimum node degree to allow.
- `max_node_degree_std` **float | None** - The maximum standard deviation of node degree to allow.
- `min_edge_weight_pct` **float** - The minimum edge weight percentile to allow.
- `remove_ego_nodes` **bool** - Remove ego nodes.
- `lcc_only` **bool** - Only use the largest connected component.
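
An illustrative snippet (the threshold values are placeholders to show the shape of the section, not tuned recommendations):

```yaml
prune_graph:
  min_node_freq: 2
  min_node_degree: 1
  min_edge_weight_pct: 40
  remove_ego_nodes: false
  lcc_only: false
```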

### cluster_graph

These are the settings used for Leiden hierarchical clustering of the graph to create communities.

#### Fields

- `max_cluster_size` **int** - The maximum cluster size to export.
- `use_lcc` **bool** - Whether to only use the largest connected component.
- `seed` **int** - A randomization seed to provide if consistent run-to-run results are desired. We do provide a default in order to guarantee clustering stability.

### extract_claims

#### Fields

- `enabled` **bool** - Whether to enable claim extraction. Off by default, because claim prompts really need user tuning.
- `llm` (see LLM top-level config)
- `parallelization` (see Parallelization top-level config)
- `async_mode` (see Async Mode top-level config)
- `model_id` **str** - Name of the model definition to use for API calls.
- `prompt` **str** - The prompt file to use.
- `description` **str** - Describes the types of claims we want to extract.
- `max_gleanings` **int** - The maximum number of gleaning cycles to use.
- `encoding_model` **str** - The text encoding model to use. By default, this will use the top-level encoding model.
- `strategy` **dict** - Fully override the claim extraction strategy.

### community_reports

#### Fields

- `llm` (see LLM top-level config)
- `parallelization` (see Parallelization top-level config)
- `async_mode` (see Async Mode top-level config)
- `model_id` **str** - Name of the model definition to use for API calls.
- `prompt` **str** - The prompt file to use.
- `max_length` **int** - The maximum number of output tokens per report.
- `max_input_length` **int** - The maximum number of input tokens to use when generating reports.
- `strategy` **dict** - Fully override the community reports strategy.

### cluster_graph

#### Fields

- `max_cluster_size` **int** - The maximum cluster size to export.
- `strategy` **dict** - Fully override the cluster_graph strategy.

### embed_graph

We use node2vec to embed the graph. This is primarily used for visualization, so it is not turned on by default.

#### Fields

- `enabled` **bool** - Whether to enable graph embeddings.
- `dimensions` **int** - Number of vector dimensions to produce.
- `num_walks` **int** - The node2vec number of walks.
- `walk_length` **int** - The node2vec walk length.
- `window_size` **int** - The node2vec window size.
@@ -220,6 +305,8 @@ This is the base LLM configuration section. Other steps may override this config

### umap

Indicates whether we should run UMAP dimensionality reduction. This is used to provide an x/y coordinate to each graph node, suitable for visualization. If this is not enabled, nodes will receive a 0/0 x/y coordinate. If this is enabled, you *must* enable graph embedding as well.

#### Fields

- `enabled` **bool** - Whether to enable UMAP layouts.
@@ -230,15 +317,6 @@ This is the base LLM configuration section. Other steps may override this config

- `embeddings` **bool** - Export embeddings snapshots to parquet.
- `graphml` **bool** - Export graph snapshots to GraphML.
- `transient` **bool** - Export transient workflow tables snapshots to parquet.

### encoding_model

**str** - The text encoding model to use. Default=`cl100k_base`.

### skip_workflows

**list[str]** - Which workflow names to skip.

## Query

@@ -246,54 +324,48 @@ This is the base LLM configuration section. Other steps may override this config

#### Fields

- `chat_model_id` **str** - Name of the model definition to use for Chat Completion calls.
- `embedding_model_id` **str** - Name of the model definition to use for Embedding calls.
- `prompt` **str** - The prompt file to use.
- `text_unit_prop` **float** - The text unit proportion.
- `community_prop` **float** - The community proportion.
- `conversation_history_max_turns` **int** - The conversation history maximum turns.
- `top_k_entities` **int** - The top k mapped entities.
- `top_k_relationships` **int** - The top k mapped relations.
- `temperature` **float | None** - The temperature to use for token generation.
- `top_p` **float | None** - The top-p value to use for token generation.
- `n` **int | None** - The number of completions to generate.
- `max_tokens` **int** - The maximum tokens.
- `llm_max_tokens` **int** - The LLM maximum tokens.
- `max_context_tokens` **int** - The maximum tokens to use building the request context.

### global_search

#### Fields

- `chat_model_id` **str** - Name of the model definition to use for Chat Completion calls.
- `map_prompt` **str** - The mapper prompt file to use.
- `reduce_prompt` **str** - The reducer prompt file to use.
- `knowledge_prompt` **str** - The knowledge prompt file to use.
- `map_prompt` **str | None** - The global search mapper prompt to use.
- `reduce_prompt` **str | None** - The global search reducer prompt to use.
- `knowledge_prompt` **str | None** - The global search general prompt to use.
- `temperature` **float | None** - The temperature to use for token generation.
- `top_p` **float | None** - The top-p value to use for token generation.
- `n` **int | None** - The number of completions to generate.
- `max_tokens` **int** - The maximum context size in tokens.
- `data_max_tokens` **int** - The data llm maximum tokens.
- `map_max_tokens` **int** - The map llm maximum tokens.
- `reduce_max_tokens` **int** - The reduce llm maximum tokens.
- `concurrency` **int** - The number of concurrent requests.
- `dynamic_search_llm` **str** - LLM model to use for dynamic community selection.
- `max_context_tokens` **int** - The maximum context size to create, in tokens.
- `data_max_tokens` **int** - The maximum tokens to use constructing the final response from the reduce responses.
- `map_max_length` **int** - The maximum length to request for map responses, in words.
- `reduce_max_length` **int** - The maximum length to request for reduce responses, in words.
- `dynamic_search_threshold` **int** - Rating threshold to include a community report.
- `dynamic_search_keep_parent` **bool** - Keep parent community if any of the child communities are relevant.
- `dynamic_search_num_repeats` **int** - Number of times to rate the same community report.
- `dynamic_search_use_summary` **bool** - Use community summary instead of full_context.
- `dynamic_search_concurrent_coroutines` **int** - Number of concurrent coroutines to rate community reports.
- `dynamic_search_max_level` **int** - The maximum level of community hierarchy to consider if none of the processed communities are relevant.

### drift_search

#### Fields

- `chat_model_id` **str** - Name of the model definition to use for Chat Completion calls.
- `embedding_model_id` **str** - Name of the model definition to use for Embedding calls.
- `prompt` **str** - The prompt file to use.
- `temperature` **float** - The temperature to use for token generation.
- `top_p` **float** - The top-p value to use for token generation.
- `n` **int** - The number of completions to generate.
- `max_tokens` **int** - The maximum context size in tokens.
- `reduce_prompt` **str** - The reducer prompt file to use.
- `data_max_tokens` **int** - The data llm maximum tokens.
- `reduce_max_tokens` **int** - The maximum tokens for the reduce phase. Only use with non-o-series models.
- `reduce_max_completion_tokens` **int** - The maximum tokens for the reduce phase. Only use for o-series models.
- `concurrency` **int** - The number of concurrent requests.
- `drift_k_followups` **int** - The number of top global results to retrieve.
- `primer_folds` **int** - The number of folds for search priming.
@@ -307,4 +379,14 @@ This is the base LLM configuration section. Other steps may override this config
- `local_search_temperature` **float** - The temperature to use for token generation in local search.
- `local_search_top_p` **float** - The top-p value to use for token generation in local search.
- `local_search_n` **int** - The number of completions to generate in local search.
- `local_search_llm_max_gen_tokens` **int** - The maximum number of generated tokens for the LLM in local search.
- `local_search_llm_max_gen_tokens` **int** - The maximum number of generated tokens for the LLM in local search. Only use with non-o-series models.
- `local_search_llm_max_gen_completion_tokens` **int** - The maximum number of generated tokens for the LLM in local search. Only use for o-series models.

### basic_search

#### Fields

- `chat_model_id` **str** - Name of the model definition to use for Chat Completion calls.
- `embedding_model_id` **str** - Name of the model definition to use for Embedding calls.
- `prompt` **str** - The prompt file to use.
- `k` **int | None** - Number of text units to retrieve from the vector store for context building.
|
||||
|
||||
@ -5,27 +5,27 @@
|
||||
| Name | Installation | Purpose |
|
||||
| ------------------- | ------------------------------------------------------------ | ----------------------------------------------------------------------------------- |
|
||||
| Python 3.10-3.12 | [Download](https://www.python.org/downloads/) | The library is Python-based. |
|
||||
| Poetry | [Instructions](https://python-poetry.org/docs/#installation) | Poetry is used for package management and virtualenv management in Python codebases |
|
||||
| uv | [Instructions](https://docs.astral.sh/uv/) | uv is used for package management and virtualenv management in Python codebases |
|
||||
|
||||
# Getting Started
|
||||
|
||||
## Install Dependencies
|
||||
|
||||
```sh
|
||||
# Install Python dependencies.
|
||||
poetry install
|
||||
# install python dependencies
|
||||
uv sync
|
||||
```
|
||||
|
||||
## Execute the Indexing Engine
|
||||
|
||||
```sh
|
||||
poetry run poe index <...args>
|
||||
uv run poe index <...args>
|
||||
```
|
||||
|
||||
## Executing Queries
|
||||
|
||||
```sh
|
||||
poetry run poe query <...args>
|
||||
uv run poe query <...args>
|
||||
```
|
||||
|
||||
# Azurite
|
||||
@ -40,31 +40,31 @@ or by simply running `azurite` in the terminal if already installed globally. Se
|
||||
|
||||
# Lifecycle Scripts
|
||||
|
||||
Our Python package utilizes Poetry to manage dependencies and [poethepoet](https://pypi.org/project/poethepoet/) to manage build scripts.
|
||||
Our Python package utilize uv to manage dependencies and [poethepoet](https://pypi.org/project/poethepoet/) to manage build scripts.
|
||||
|
||||
Available scripts are:
|
||||
|
||||
- `poetry run poe index` - Run the Indexing CLI
|
||||
- `poetry run poe query` - Run the Query CLI
|
||||
- `poetry build` - This invokes `poetry build`, which will build a wheel file and other distributable artifacts.
|
||||
- `poetry run poe test` - This will execute all tests.
|
||||
- `poetry run poe test_unit` - This will execute unit tests.
|
||||
- `poetry run poe test_integration` - This will execute integration tests.
|
||||
- `poetry run poe test_smoke` - This will execute smoke tests.
|
||||
- `poetry run poe test_verbs` - This will execute tests of the basic workflows.
|
||||
- `poetry run poe check` - This will perform a suite of static checks across the package, including:
|
||||
- `uv run poe index` - Run the Indexing CLI
|
||||
- `uv run poe query` - Run the Query CLI
|
||||
- `uv build` - This will build a wheel file and other distributable artifacts.
|
||||
- `uv run poe test` - This will execute all tests.
|
||||
- `uv run poe test_unit` - This will execute unit tests.
|
||||
- `uv run poe test_integration` - This will execute integration tests.
|
||||
- `uv run poe test_smoke` - This will execute smoke tests.
|
||||
- `uv run poe test_verbs` - This will execute tests of the basic workflows.
|
||||
- `uv run poe check` - This will perform a suite of static checks across the package, including:
|
||||
- formatting
|
||||
- documentation formatting
|
||||
- linting
|
||||
- security patterns
|
||||
- type-checking
|
||||
- `poetry run poe fix` - This will apply any available auto-fixes to the package. Usually this is just formatting fixes.
|
||||
- `poetry run poe fix_unsafe` - This will apply any available auto-fixes to the package, including those that may be unsafe.
|
||||
- `poetry run poe format` - Explicitly run the formatter across the package.
|
||||
- `uv run poe fix` - This will apply any available auto-fixes to the package. Usually this is just formatting fixes.
|
||||
- `uv run poe fix_unsafe` - This will apply any available auto-fixes to the package, including those that may be unsafe.
|
||||
- `uv run poe format` - Explicitly run the formatter across the package.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "RuntimeError: llvm-config failed executing, please point LLVM_CONFIG to the path for llvm-config" when running poetry install
|
||||
### "RuntimeError: llvm-config failed executing, please point LLVM_CONFIG to the path for llvm-config" when running uv install
|
||||
|
||||
Make sure llvm-9 and llvm-9-dev are installed:
|
||||
|
||||
@ -73,14 +73,3 @@ Make sure llvm-9 and llvm-9-dev are installed:
|
||||
and then in your bashrc, add
|
||||
|
||||
`export LLVM_CONFIG=/usr/bin/llvm-config-9`
|
||||
|
||||
### "numba/\_pymodule.h:6:10: fatal error: Python.h: No such file or directory" when running poetry install
|
||||
|
||||
Make sure you have python3.10-dev installed or more generally `python<version>-dev`
|
||||
|
||||
`sudo apt-get install python3.10-dev`
|
||||
|
||||
### LLM call constantly exceeds TPM, RPM or time limits
|
||||
|
||||
`GRAPHRAG_LLM_THREAD_COUNT` and `GRAPHRAG_EMBEDDING_THREAD_COUNT` are both set to 50 by default. You can modify these values
|
||||
to reduce concurrency. Please refer to the [Configuration Documents](config/overview.md) for details.
|
||||
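For example, one hedged way to halve the default concurrency before launching a run (the variable names are those documented above; the value is illustrative):

```python
# Lower GraphRAG's request concurrency via environment variables.
import os

os.environ["GRAPHRAG_LLM_THREAD_COUNT"] = "25"        # default: 50
os.environ["GRAPHRAG_EMBEDDING_THREAD_COUNT"] = "25"  # default: 50
```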
|
||||
@ -25,8 +25,23 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pathlib import Path\n",
|
||||
"from pprint import pprint\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"import graphrag.api as api\n",
|
||||
"from graphrag.index.typing import PipelineRunResult"
|
||||
"from graphrag.config.load_config import load_config\n",
|
||||
"from graphrag.index.typing.pipeline_run_result import PipelineRunResult"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PROJECT_DIRECTORY = \"<your project directory>\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -36,27 +51,7 @@
|
||||
"## Prerequisite\n",
|
||||
"As a prerequisite to all API operations, a `GraphRagConfig` object is required. It is the primary means to control the behavior of graphrag and can be instantiated from a `settings.yaml` configuration file.\n",
|
||||
"\n",
|
||||
"Please refer to the [CLI docs](https://microsoft.github.io/graphrag/cli/#init) for more detailed information on how to generate the `settings.yaml` file.\n",
|
||||
"\n",
|
||||
"#### Load `settings.yaml` configuration"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import yaml\n",
|
||||
"\n",
|
||||
"settings = yaml.safe_load(open(\"<project_directory>/settings.yaml\")) # noqa: PTH123, SIM115"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"At this point, you can modify the imported settings to align with your application's requirements. For example, if building a UI application, the application might need to change the input and/or storage destinations dynamically in order to enable users to build and query different indexes."
|
||||
"Please refer to the [CLI docs](https://microsoft.github.io/graphrag/cli/#init) for more detailed information on how to generate the `settings.yaml` file."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -72,11 +67,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from graphrag.config.create_graphrag_config import create_graphrag_config\n",
|
||||
"\n",
|
||||
"graphrag_config = create_graphrag_config(\n",
|
||||
" values=settings, root_dir=\"<project_directory>\"\n",
|
||||
")"
|
||||
"# note that we expect this to fail on the deployed docs because the PROJECT_DIRECTORY is not set to a real location.\n",
|
||||
"# if you run this notebook locally, make sure to point at a location containing your settings.yaml\n",
|
||||
"graphrag_config = load_config(Path(PROJECT_DIRECTORY))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -124,25 +117,17 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"final_nodes = pd.read_parquet(\"<project_directory>/output/create_final_nodes.parquet\")\n",
|
||||
"final_entities = pd.read_parquet(\n",
|
||||
" \"<project_directory>/output/create_final_entities.parquet\"\n",
|
||||
")\n",
|
||||
"final_communities = pd.read_parquet(\n",
|
||||
" \"<project_directory>/output/create_final_communities.parquet\"\n",
|
||||
")\n",
|
||||
"final_community_reports = pd.read_parquet(\n",
|
||||
" \"<project_directory>/output/create_final_community_reports.parquet\"\n",
|
||||
"entities = pd.read_parquet(f\"{PROJECT_DIRECTORY}/output/entities.parquet\")\n",
|
||||
"communities = pd.read_parquet(f\"{PROJECT_DIRECTORY}/output/communities.parquet\")\n",
|
||||
"community_reports = pd.read_parquet(\n",
|
||||
" f\"{PROJECT_DIRECTORY}/output/community_reports.parquet\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"response, context = await api.global_search(\n",
|
||||
" config=graphrag_config,\n",
|
||||
" nodes=final_nodes,\n",
|
||||
" entities=final_entities,\n",
|
||||
" communities=final_communities,\n",
|
||||
" community_reports=final_community_reports,\n",
|
||||
" entities=entities,\n",
|
||||
" communities=communities,\n",
|
||||
" community_reports=community_reports,\n",
|
||||
" community_level=2,\n",
|
||||
" dynamic_community_selection=False,\n",
|
||||
" response_type=\"Multiple Paragraphs\",\n",
|
||||
@ -179,15 +164,13 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pprint import pprint\n",
|
||||
"\n",
|
||||
"pprint(context) # noqa: T203"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "graphrag-venv",
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -201,7 +184,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
680
docs/examples_notebooks/custom_vector_store.ipynb
Normal file
@ -0,0 +1,680 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Copyright (c) 2024 Microsoft Corporation.\n",
|
||||
"# Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Bring-Your-Own Vector Store\n",
|
||||
"\n",
|
||||
"This notebook demonstrates how to implement a custom vector store and register for usage with GraphRAG.\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"\n",
|
||||
"GraphRAG uses a plug-and-play architecture that allow for easy integration of custom vector stores (outside of what is natively supported) by following a factory design pattern. This allows you to:\n",
|
||||
"\n",
|
||||
"- **Extend functionality**: Add support for new vector database backends\n",
|
||||
"- **Customize behavior**: Implement specialized search logic or data structures\n",
|
||||
"- **Integrate existing systems**: Connect GraphRAG to your existing vector database infrastructure\n",
|
||||
"\n",
|
||||
"### What You'll Learn\n",
|
||||
"\n",
|
||||
"1. Understanding the `BaseVectorStore` interface\n",
|
||||
"2. Implementing a custom vector store class\n",
|
||||
"3. Registering your vector store with the `VectorStoreFactory`\n",
|
||||
"4. Testing and validating your implementation\n",
|
||||
"5. Configuring GraphRAG to use your custom vector store\n",
|
||||
"\n",
|
||||
"Let's get started!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 1: Import Required Dependencies\n",
|
||||
"\n",
|
||||
"First, let's import the necessary GraphRAG components and other dependencies we'll need.\n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"pip install graphrag\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import Any\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import yaml\n",
|
||||
"\n",
|
||||
"from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig\n",
|
||||
"from graphrag.data_model.types import TextEmbedder\n",
|
||||
"\n",
|
||||
"# GraphRAG vector store components\n",
|
||||
"from graphrag.vector_stores.base import (\n",
|
||||
" BaseVectorStore,\n",
|
||||
" VectorStoreDocument,\n",
|
||||
" VectorStoreSearchResult,\n",
|
||||
")\n",
|
||||
"from graphrag.vector_stores.factory import VectorStoreFactory"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 2: Understand the BaseVectorStore Interface\n",
|
||||
"\n",
|
||||
"Before using a custom vector store, let's examine the `BaseVectorStore` interface to understand what methods need to be implemented."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's inspect the BaseVectorStore class to understand the required methods\n",
|
||||
"import inspect\n",
|
||||
"\n",
|
||||
"print(\"BaseVectorStore Abstract Methods:\")\n",
|
||||
"print(\"=\" * 40)\n",
|
||||
"\n",
|
||||
"abstract_methods = []\n",
|
||||
"for name, method in inspect.getmembers(BaseVectorStore, predicate=inspect.isfunction):\n",
|
||||
" if getattr(method, \"__isabstractmethod__\", False):\n",
|
||||
" signature = inspect.signature(method)\n",
|
||||
" abstract_methods.append(f\"• {name}{signature}\")\n",
|
||||
" print(f\"• {name}{signature}\")\n",
|
||||
"\n",
|
||||
"print(f\"\\nTotal abstract methods to implement: {len(abstract_methods)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 3: Implement a Custom Vector Store\n",
|
||||
"\n",
|
||||
"Now let's implement a simple in-memory vector store as an example. This vector store will:\n",
|
||||
"\n",
|
||||
"- Store documents and vectors in memory using Python data structures\n",
|
||||
"- Support all required BaseVectorStore methods\n",
|
||||
"\n",
|
||||
"**Note**: This is a simplified example for demonstration. Production vector stores would typically use optimized libraries like FAISS, more sophisticated indexing, and persistent storage."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class SimpleInMemoryVectorStore(BaseVectorStore):\n",
|
||||
" \"\"\"A simple in-memory vector store implementation for demonstration purposes.\n",
|
||||
"\n",
|
||||
" This vector store stores documents and their embeddings in memory and provides\n",
|
||||
" basic similarity search functionality using cosine similarity.\n",
|
||||
"\n",
|
||||
" WARNING: This is for demonstration only - not suitable for production use.\n",
|
||||
" For production, consider using optimized vector databases like LanceDB,\n",
|
||||
" Azure AI Search, or other specialized vector stores.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" # Internal storage for documents and vectors\n",
|
||||
" documents: dict[str, VectorStoreDocument]\n",
|
||||
" vectors: dict[str, np.ndarray]\n",
|
||||
" connected: bool\n",
|
||||
"\n",
|
||||
" def __init__(self, **kwargs: Any):\n",
|
||||
" \"\"\"Initialize the in-memory vector store.\"\"\"\n",
|
||||
" super().__init__(**kwargs)\n",
|
||||
"\n",
|
||||
" self.documents: dict[str, VectorStoreDocument] = {}\n",
|
||||
" self.vectors: dict[str, np.ndarray] = {}\n",
|
||||
" self.connected = False\n",
|
||||
"\n",
|
||||
" print(f\"🚀 SimpleInMemoryVectorStore initialized for index: {self.index_name}\")\n",
|
||||
"\n",
|
||||
" def connect(self, **kwargs: Any) -> None:\n",
|
||||
" \"\"\"Connect to the vector storage (no-op for in-memory store).\"\"\"\n",
|
||||
" self.connected = True\n",
|
||||
" print(f\"✅ Connected to in-memory vector store: {self.index_name}\")\n",
|
||||
"\n",
|
||||
" def load_documents(\n",
|
||||
" self, documents: list[VectorStoreDocument], overwrite: bool = True\n",
|
||||
" ) -> None:\n",
|
||||
" \"\"\"Load documents into the vector store.\"\"\"\n",
|
||||
" if not self.connected:\n",
|
||||
" msg = \"Vector store not connected. Call connect() first.\"\n",
|
||||
" raise RuntimeError(msg)\n",
|
||||
"\n",
|
||||
" if overwrite:\n",
|
||||
" self.documents.clear()\n",
|
||||
" self.vectors.clear()\n",
|
||||
"\n",
|
||||
" loaded_count = 0\n",
|
||||
" for doc in documents:\n",
|
||||
" if doc.vector is not None:\n",
|
||||
" doc_id = str(doc.id)\n",
|
||||
" self.documents[doc_id] = doc\n",
|
||||
" self.vectors[doc_id] = np.array(doc.vector, dtype=np.float32)\n",
|
||||
" loaded_count += 1\n",
|
||||
"\n",
|
||||
" print(f\"📚 Loaded {loaded_count} documents into vector store\")\n",
|
||||
"\n",
|
||||
" def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:\n",
|
||||
" \"\"\"Calculate cosine similarity between two vectors.\"\"\"\n",
|
||||
" # Normalize vectors\n",
|
||||
" norm1 = np.linalg.norm(vec1)\n",
|
||||
" norm2 = np.linalg.norm(vec2)\n",
|
||||
"\n",
|
||||
" if norm1 == 0 or norm2 == 0:\n",
|
||||
" return 0.0\n",
|
||||
"\n",
|
||||
" return float(np.dot(vec1, vec2) / (norm1 * norm2))\n",
|
||||
"\n",
|
||||
" def similarity_search_by_vector(\n",
|
||||
" self, query_embedding: list[float], k: int = 10, **kwargs: Any\n",
|
||||
" ) -> list[VectorStoreSearchResult]:\n",
|
||||
" \"\"\"Perform similarity search using a query vector.\"\"\"\n",
|
||||
" if not self.connected:\n",
|
||||
" msg = \"Vector store not connected. Call connect() first.\"\n",
|
||||
" raise RuntimeError(msg)\n",
|
||||
"\n",
|
||||
" if not self.vectors:\n",
|
||||
" return []\n",
|
||||
"\n",
|
||||
" query_vec = np.array(query_embedding, dtype=np.float32)\n",
|
||||
" similarities = []\n",
|
||||
"\n",
|
||||
" # Calculate similarity with all stored vectors\n",
|
||||
" for doc_id, stored_vec in self.vectors.items():\n",
|
||||
" similarity = self._cosine_similarity(query_vec, stored_vec)\n",
|
||||
" similarities.append((doc_id, similarity))\n",
|
||||
"\n",
|
||||
" # Sort by similarity (descending) and take top k\n",
|
||||
" similarities.sort(key=lambda x: x[1], reverse=True)\n",
|
||||
" top_k = similarities[:k]\n",
|
||||
"\n",
|
||||
" # Create search results\n",
|
||||
" results = []\n",
|
||||
" for doc_id, score in top_k:\n",
|
||||
" document = self.documents[doc_id]\n",
|
||||
" result = VectorStoreSearchResult(document=document, score=score)\n",
|
||||
" results.append(result)\n",
|
||||
"\n",
|
||||
" return results\n",
|
||||
"\n",
|
||||
" def similarity_search_by_text(\n",
|
||||
" self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any\n",
|
||||
" ) -> list[VectorStoreSearchResult]:\n",
|
||||
" \"\"\"Perform similarity search using text (which gets embedded first).\"\"\"\n",
|
||||
" # Embed the text first\n",
|
||||
" query_embedding = text_embedder(text)\n",
|
||||
"\n",
|
||||
" # Use vector search with the embedding\n",
|
||||
" return self.similarity_search_by_vector(query_embedding, k, **kwargs)\n",
|
||||
"\n",
|
||||
" def filter_by_id(self, include_ids: list[str] | list[int]) -> Any:\n",
|
||||
" \"\"\"Build a query filter to filter documents by id.\n",
|
||||
"\n",
|
||||
" For this simple implementation, we return the list of IDs as the filter.\n",
|
||||
" \"\"\"\n",
|
||||
" return [str(id_) for id_ in include_ids]\n",
|
||||
"\n",
|
||||
" def search_by_id(self, id: str) -> VectorStoreDocument:\n",
|
||||
" \"\"\"Search for a document by id.\"\"\"\n",
|
||||
" doc_id = str(id)\n",
|
||||
" if doc_id not in self.documents:\n",
|
||||
" msg = f\"Document with id '{id}' not found\"\n",
|
||||
" raise KeyError(msg)\n",
|
||||
"\n",
|
||||
" return self.documents[doc_id]\n",
|
||||
"\n",
|
||||
" def get_stats(self) -> dict[str, Any]:\n",
|
||||
" \"\"\"Get statistics about the vector store (custom method).\"\"\"\n",
|
||||
" return {\n",
|
||||
" \"index_name\": self.index_name,\n",
|
||||
" \"document_count\": len(self.documents),\n",
|
||||
" \"vector_count\": len(self.vectors),\n",
|
||||
" \"connected\": self.connected,\n",
|
||||
" \"vector_dimension\": len(next(iter(self.vectors.values())))\n",
|
||||
" if self.vectors\n",
|
||||
" else 0,\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(\"✅ SimpleInMemoryVectorStore class defined!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 4: Register the Custom Vector Store\n",
|
||||
"\n",
|
||||
"Now let's register our custom vector store with the `VectorStoreFactory` so it can be used throughout GraphRAG."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Register our custom vector store with a unique identifier\n",
|
||||
"CUSTOM_VECTOR_STORE_TYPE = \"simple_memory\"\n",
|
||||
"\n",
|
||||
"# Register the vector store class\n",
|
||||
"VectorStoreFactory.register(CUSTOM_VECTOR_STORE_TYPE, SimpleInMemoryVectorStore)\n",
|
||||
"\n",
|
||||
"print(f\"✅ Registered custom vector store with type: '{CUSTOM_VECTOR_STORE_TYPE}'\")\n",
|
||||
"\n",
|
||||
"# Verify registration\n",
|
||||
"available_types = VectorStoreFactory.get_vector_store_types()\n",
|
||||
"print(f\"\\n📋 Available vector store types: {available_types}\")\n",
|
||||
"print(\n",
|
||||
" f\"🔍 Is our custom type supported? {VectorStoreFactory.is_supported_type(CUSTOM_VECTOR_STORE_TYPE)}\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 5: Test the Custom Vector Store\n",
|
||||
"\n",
|
||||
"Let's create some sample data and test our custom vector store implementation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create sample documents with mock embeddings\n",
|
||||
"def create_mock_embedding(dimension: int = 384) -> list[float]:\n",
|
||||
" \"\"\"Create a random embedding vector for testing.\"\"\"\n",
|
||||
" return np.random.normal(0, 1, dimension).tolist()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Sample documents\n",
|
||||
"sample_documents = [\n",
|
||||
" VectorStoreDocument(\n",
|
||||
" id=\"doc_1\",\n",
|
||||
" text=\"GraphRAG is a powerful knowledge graph extraction and reasoning framework.\",\n",
|
||||
" vector=create_mock_embedding(),\n",
|
||||
" attributes={\"category\": \"technology\", \"source\": \"documentation\"},\n",
|
||||
" ),\n",
|
||||
" VectorStoreDocument(\n",
|
||||
" id=\"doc_2\",\n",
|
||||
" text=\"Vector stores enable efficient similarity search over high-dimensional data.\",\n",
|
||||
" vector=create_mock_embedding(),\n",
|
||||
" attributes={\"category\": \"technology\", \"source\": \"research\"},\n",
|
||||
" ),\n",
|
||||
" VectorStoreDocument(\n",
|
||||
" id=\"doc_3\",\n",
|
||||
" text=\"Machine learning models can process and understand natural language text.\",\n",
|
||||
" vector=create_mock_embedding(),\n",
|
||||
" attributes={\"category\": \"AI\", \"source\": \"article\"},\n",
|
||||
" ),\n",
|
||||
" VectorStoreDocument(\n",
|
||||
" id=\"doc_4\",\n",
|
||||
" text=\"Custom implementations allow for specialized behavior and integration.\",\n",
|
||||
" vector=create_mock_embedding(),\n",
|
||||
" attributes={\"category\": \"development\", \"source\": \"tutorial\"},\n",
|
||||
" ),\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"print(f\"📝 Created {len(sample_documents)} sample documents\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Test creating vector store using the factory\n",
|
||||
"schema = VectorStoreSchemaConfig(index_name=\"test_collection\")\n",
|
||||
"\n",
|
||||
"# Create vector store instance using factory\n",
|
||||
"vector_store = VectorStoreFactory.create_vector_store(\n",
|
||||
" CUSTOM_VECTOR_STORE_TYPE, vector_store_schema_config=schema\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f\"✅ Created vector store instance: {type(vector_store).__name__}\")\n",
|
||||
"print(f\"📊 Initial stats: {vector_store.get_stats()}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Connect and load documents\n",
|
||||
"vector_store.connect()\n",
|
||||
"vector_store.load_documents(sample_documents)\n",
|
||||
"\n",
|
||||
"print(f\"📊 Updated stats: {vector_store.get_stats()}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Test similarity search\n",
|
||||
"query_vector = create_mock_embedding() # Random query vector for testing\n",
|
||||
"\n",
|
||||
"search_results = vector_store.similarity_search_by_vector(\n",
|
||||
" query_vector,\n",
|
||||
" k=3, # Get top 3 similar documents\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f\"🔍 Found {len(search_results)} similar documents:\\n\")\n",
|
||||
"\n",
|
||||
"for i, result in enumerate(search_results, 1):\n",
|
||||
" doc = result.document\n",
|
||||
" print(f\"{i}. ID: {doc.id}\")\n",
|
||||
" print(f\" Text: {doc.text[:60]}...\")\n",
|
||||
" print(f\" Similarity Score: {result.score:.4f}\")\n",
|
||||
" print(f\" Category: {doc.attributes.get('category', 'N/A')}\")\n",
|
||||
" print()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Test search by ID\n",
|
||||
"try:\n",
|
||||
" found_doc = vector_store.search_by_id(\"doc_2\")\n",
|
||||
" print(\"✅ Found document by ID:\")\n",
|
||||
" print(f\" ID: {found_doc.id}\")\n",
|
||||
" print(f\" Text: {found_doc.text}\")\n",
|
||||
" print(f\" Attributes: {found_doc.attributes}\")\n",
|
||||
"except KeyError as e:\n",
|
||||
" print(f\"❌ Error: {e}\")\n",
|
||||
"\n",
|
||||
"# Test filter by ID\n",
|
||||
"id_filter = vector_store.filter_by_id([\"doc_1\", \"doc_3\"])\n",
|
||||
"print(f\"\\n🔧 ID filter result: {id_filter}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 6: Configuration for GraphRAG\n",
|
||||
"\n",
|
||||
"Now let's see how you would configure GraphRAG to use your custom vector store in a settings file."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Example GraphRAG yaml settings\n",
|
||||
"example_settings = {\n",
|
||||
" \"vector_store\": {\n",
|
||||
" \"default_vector_store\": {\n",
|
||||
" \"type\": CUSTOM_VECTOR_STORE_TYPE, # \"simple_memory\"\n",
|
||||
" \"collection_name\": \"graphrag_entities\",\n",
|
||||
" # Add any custom parameters your vector store needs\n",
|
||||
" \"custom_parameter\": \"custom_value\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" # Other GraphRAG configuration...\n",
|
||||
" \"models\": {\n",
|
||||
" \"default_embedding_model\": {\n",
|
||||
" \"type\": \"openai_embedding\",\n",
|
||||
" \"model\": \"text-embedding-3-small\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Convert to YAML format for settings.yml\n",
|
||||
"yaml_config = yaml.dump(example_settings, default_flow_style=False, indent=2)\n",
|
||||
"\n",
|
||||
"print(\"📄 Example settings.yml configuration:\")\n",
|
||||
"print(\"=\" * 40)\n",
|
||||
"print(yaml_config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 7: Integration with GraphRAG Pipeline\n",
|
||||
"\n",
|
||||
"Here's how your custom vector store would be used in a typical GraphRAG pipeline."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Example of how GraphRAG would use your custom vector store\n",
|
||||
"def simulate_graphrag_pipeline():\n",
|
||||
" \"\"\"Simulate how GraphRAG would use the custom vector store.\"\"\"\n",
|
||||
" print(\"🚀 Simulating GraphRAG pipeline with custom vector store...\\n\")\n",
|
||||
"\n",
|
||||
" # 1. GraphRAG creates vector store using factory\n",
|
||||
" schema = VectorStoreSchemaConfig(index_name=\"graphrag_entities\")\n",
|
||||
"\n",
|
||||
" store = VectorStoreFactory.create_vector_store(\n",
|
||||
" CUSTOM_VECTOR_STORE_TYPE,\n",
|
||||
" vector_store_schema_config=schema,\n",
|
||||
" similarity_threshold=0.3,\n",
|
||||
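" # similarity_threshold is illustrative; extra kwargs reach the store via **kwargs\n",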
" )\n",
|
||||
" store.connect()\n",
|
||||
"\n",
|
||||
" print(\"✅ Step 1: Vector store created and connected\")\n",
|
||||
"\n",
|
||||
" # 2. During indexing, GraphRAG loads extracted entities\n",
|
||||
" entity_documents = [\n",
|
||||
" VectorStoreDocument(\n",
|
||||
" id=f\"entity_{i}\",\n",
|
||||
" text=f\"Entity {i} description: Important concept in the knowledge graph\",\n",
|
||||
" vector=create_mock_embedding(),\n",
|
||||
" attributes={\"type\": \"entity\", \"importance\": i % 3 + 1},\n",
|
||||
" )\n",
|
||||
" for i in range(10)\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" store.load_documents(entity_documents)\n",
|
||||
" print(f\"✅ Step 2: Loaded {len(entity_documents)} entity documents\")\n",
|
||||
"\n",
|
||||
" # 3. During query time, GraphRAG searches for relevant entities\n",
|
||||
" query_embedding = create_mock_embedding()\n",
|
||||
" relevant_entities = store.similarity_search_by_vector(query_embedding, k=5)\n",
|
||||
"\n",
|
||||
" print(f\"✅ Step 3: Found {len(relevant_entities)} relevant entities for query\")\n",
|
||||
"\n",
|
||||
" # 4. GraphRAG uses these entities for context building\n",
|
||||
" context_entities = [result.document for result in relevant_entities]\n",
|
||||
"\n",
|
||||
" print(\"✅ Step 4: Context built using retrieved entities\")\n",
|
||||
" print(f\"📊 Final stats: {store.get_stats()}\")\n",
|
||||
"\n",
|
||||
" return context_entities\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Run the simulation\n",
|
||||
"context = simulate_graphrag_pipeline()\n",
|
||||
"print(f\"\\n🎯 Retrieved {len(context)} entities for context building\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 8: Testing and Validation\n",
|
||||
"\n",
|
||||
"Let's create a comprehensive test suite to ensure our vector store works correctly."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def test_custom_vector_store():\n",
|
||||
" \"\"\"Comprehensive test suite for the custom vector store.\"\"\"\n",
|
||||
" print(\"🧪 Running comprehensive vector store tests...\\n\")\n",
|
||||
"\n",
|
||||
" # Test 1: Basic functionality\n",
|
||||
" print(\"Test 1: Basic functionality\")\n",
|
||||
" store = VectorStoreFactory.create_vector_store(\n",
|
||||
" CUSTOM_VECTOR_STORE_TYPE,\n",
|
||||
" vector_store_schema_config=VectorStoreSchemaConfig(index_name=\"test\"),\n",
|
||||
" )\n",
|
||||
" store.connect()\n",
|
||||
"\n",
|
||||
" # Load test documents\n",
|
||||
" test_docs = sample_documents[:2]\n",
|
||||
" store.load_documents(test_docs)\n",
|
||||
"\n",
|
||||
" assert len(store.documents) == 2, \"Should have 2 documents\"\n",
|
||||
" assert len(store.vectors) == 2, \"Should have 2 vectors\"\n",
|
||||
" print(\"✅ Basic functionality test passed\")\n",
|
||||
"\n",
|
||||
" # Test 2: Search functionality\n",
|
||||
" print(\"\\nTest 2: Search functionality\")\n",
|
||||
" query_vec = create_mock_embedding()\n",
|
||||
" results = store.similarity_search_by_vector(query_vec, k=5)\n",
|
||||
"\n",
|
||||
" assert len(results) <= 2, \"Should not return more results than documents\"\n",
|
||||
" assert all(isinstance(r, VectorStoreSearchResult) for r in results), (\n",
|
||||
" \"Should return VectorStoreSearchResult objects\"\n",
|
||||
" )\n",
|
||||
" assert all(-1 <= r.score <= 1 for r in results), (\n",
|
||||
" \"Similarity scores should be between -1 and 1\"\n",
|
||||
" )\n",
|
||||
" print(\"✅ Search functionality test passed\")\n",
|
||||
"\n",
|
||||
" # Test 3: Search by ID\n",
|
||||
" print(\"\\nTest 3: Search by ID\")\n",
|
||||
" found_doc = store.search_by_id(\"doc_1\")\n",
|
||||
" assert found_doc.id == \"doc_1\", \"Should find correct document\"\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" store.search_by_id(\"nonexistent\")\n",
|
||||
" assert False, \"Should raise KeyError for nonexistent ID\"\n",
|
||||
" except KeyError:\n",
|
||||
" pass # Expected\n",
|
||||
"\n",
|
||||
" print(\"✅ Search by ID test passed\")\n",
|
||||
"\n",
|
||||
" # Test 4: Filter functionality\n",
|
||||
" print(\"\\nTest 4: Filter functionality\")\n",
|
||||
" filter_result = store.filter_by_id([\"doc_1\", \"doc_2\"])\n",
|
||||
" assert filter_result == [\"doc_1\", \"doc_2\"], \"Should return filtered IDs\"\n",
|
||||
" print(\"✅ Filter functionality test passed\")\n",
|
||||
"\n",
|
||||
" # Test 5: Error handling\n",
|
||||
" print(\"\\nTest 5: Error handling\")\n",
|
||||
" disconnected_store = VectorStoreFactory.create_vector_store(\n",
|
||||
" CUSTOM_VECTOR_STORE_TYPE,\n",
|
||||
" vector_store_schema_config=VectorStoreSchemaConfig(index_name=\"test2\"),\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" disconnected_store.load_documents(test_docs)\n",
|
||||
" assert False, \"Should raise error when not connected\"\n",
|
||||
" except RuntimeError:\n",
|
||||
" pass # Expected\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" disconnected_store.similarity_search_by_vector(query_vec)\n",
|
||||
" assert False, \"Should raise error when not connected\"\n",
|
||||
" except RuntimeError:\n",
|
||||
" pass # Expected\n",
|
||||
"\n",
|
||||
" print(\"✅ Error handling test passed\")\n",
|
||||
"\n",
|
||||
" print(\"\\n🎉 All tests passed! Your custom vector store is working correctly.\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Run the tests\n",
|
||||
"test_custom_vector_store()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Summary and Next Steps\n",
|
||||
"\n",
|
||||
"Congratulations! You've successfully learned how to implement and register a custom vector store with GraphRAG. Here's what you accomplished:\n",
|
||||
"\n",
|
||||
"### What You Built\n",
|
||||
"- ✅ **Custom Vector Store Class**: Implemented `SimpleInMemoryVectorStore` with all required methods\n",
|
||||
"- ✅ **Factory Integration**: Registered your vector store with `VectorStoreFactory`\n",
|
||||
"- ✅ **Comprehensive Testing**: Validated functionality with a full test suite\n",
|
||||
"- ✅ **Configuration Examples**: Learned how to configure GraphRAG to use your vector store\n",
|
||||
"\n",
|
||||
"### Key Takeaways\n",
|
||||
"1. **Interface Compliance**: Always implement all methods from `BaseVectorStore`\n",
|
||||
"2. **Factory Pattern**: Use `VectorStoreFactory.register()` to make your vector store available\n",
|
||||
"3. **Configuration**: Vector stores are configured in GraphRAG settings files\n",
|
||||
"4. **Testing**: Thoroughly test all functionality before deploying\n",
|
||||
"\n",
|
||||
"### Next Steps\n",
|
||||
"Check out the API Overview notebook to learn how to index and query data via the graphrag API.\n",
|
||||
"\n",
|
||||
"### Resources\n",
|
||||
"- [GraphRAG Documentation](https://microsoft.github.io/graphrag/)\n",
|
||||
"\n",
|
||||
"Happy building! 🚀"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "graphrag",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -12,22 +12,22 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"import tiktoken\n",
|
||||
"\n",
|
||||
"from graphrag.config.enums import ModelType\n",
|
||||
"from graphrag.config.models.language_model_config import LanguageModelConfig\n",
|
||||
"from graphrag.language_model.manager import ModelManager\n",
|
||||
"from graphrag.query.indexer_adapters import (\n",
|
||||
" read_indexer_communities,\n",
|
||||
" read_indexer_entities,\n",
|
||||
" read_indexer_reports,\n",
|
||||
")\n",
|
||||
"from graphrag.query.llm.oai.chat_openai import ChatOpenAI\n",
|
||||
"from graphrag.query.llm.oai.typing import OpenaiApiType\n",
|
||||
"from graphrag.query.structured_search.global_search.community_context import (\n",
|
||||
" GlobalCommunityContext,\n",
|
||||
")\n",
|
||||
@ -52,21 +52,28 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"api_key = os.environ[\"GRAPHRAG_API_KEY\"]\n",
|
||||
"llm_model = os.environ[\"GRAPHRAG_LLM_MODEL\"]\n",
|
||||
"from graphrag.tokenizer.get_tokenizer import get_tokenizer\n",
|
||||
"\n",
|
||||
"llm = ChatOpenAI(\n",
|
||||
"api_key = os.environ[\"GRAPHRAG_API_KEY\"]\n",
|
||||
"\n",
|
||||
"config = LanguageModelConfig(\n",
|
||||
" api_key=api_key,\n",
|
||||
" model=llm_model,\n",
|
||||
" api_type=OpenaiApiType.OpenAI, # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI\n",
|
||||
" type=ModelType.Chat,\n",
|
||||
" model_provider=\"openai\",\n",
|
||||
" model=\"gpt-4.1\",\n",
|
||||
" max_retries=20,\n",
|
||||
")\n",
|
||||
"model = ModelManager().get_or_create_chat_model(\n",
|
||||
" name=\"global_search\",\n",
|
||||
" model_type=ModelType.Chat,\n",
|
||||
" config=config,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"token_encoder = tiktoken.encoding_for_model(llm_model)"
|
||||
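"# derive a tokenizer from the model config, used for token counting during search\n",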
"tokenizer = get_tokenizer(config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -75,23 +82,22 @@
|
||||
"source": [
|
||||
"### Load community reports as context for global search\n",
|
||||
"\n",
|
||||
"- Load all community reports in the `create_final_community_reports` table from the ire-indexing engine, to be used as context data for global search.\n",
|
||||
"- Load entities from the `create_final_nodes` and `create_final_entities` tables from the ire-indexing engine, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n",
|
||||
"- Load all communities in the `create_final_communites` table from the ire-indexing engine, to be used to reconstruct the community graph hierarchy for dynamic community selection."
|
||||
"- Load all community reports in the `community_reports` table from the indexing engine, to be used as context data for global search.\n",
|
||||
"- Load entities from the `entities` tables from the indexing engine, to be used for calculating community weights for context ranking. Note that this is optional (if no entities are provided, we will not calculate community weights and only use the rank attribute in the community reports table for context ranking)\n",
|
||||
"- Load all communities in the `communities` table from the indexing engine, to be used to reconstruct the community graph hierarchy for dynamic community selection."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# parquet files generated from indexing pipeline\n",
|
||||
"INPUT_DIR = \"./inputs/operation dulce\"\n",
|
||||
"COMMUNITY_TABLE = \"create_final_communities\"\n",
|
||||
"COMMUNITY_REPORT_TABLE = \"create_final_community_reports\"\n",
|
||||
"ENTITY_TABLE = \"create_final_nodes\"\n",
|
||||
"ENTITY_EMBEDDING_TABLE = \"create_final_entities\"\n",
|
||||
"COMMUNITY_TABLE = \"communities\"\n",
|
||||
"COMMUNITY_REPORT_TABLE = \"community_reports\"\n",
|
||||
"ENTITY_TABLE = \"entities\"\n",
|
||||
"\n",
|
||||
"# we don't fix a specific community level but instead use an agent to dynamicially\n",
|
||||
"# search through all the community reports to check if they are relevant.\n",
|
||||
@ -100,191 +106,23 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Total report count: 20\n",
|
||||
"Report count after filtering by community level None: 20\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>community</th>\n",
|
||||
" <th>full_content</th>\n",
|
||||
" <th>level</th>\n",
|
||||
" <th>rank</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>rank_explanation</th>\n",
|
||||
" <th>summary</th>\n",
|
||||
" <th>findings</th>\n",
|
||||
" <th>full_content_json</th>\n",
|
||||
" <th>id</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>10</td>\n",
|
||||
" <td># Paranormal Military Squad at Dulce Base: Dec...</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>8.5</td>\n",
|
||||
" <td>Paranormal Military Squad at Dulce Base: Decod...</td>\n",
|
||||
" <td>The impact severity rating is high due to the ...</td>\n",
|
||||
" <td>The Paranormal Military Squad, stationed at Du...</td>\n",
|
||||
" <td>[{'explanation': 'Jordan is a central figure i...</td>\n",
|
||||
" <td>{\\n \"title\": \"Paranormal Military Squad at ...</td>\n",
|
||||
" <td>1ba2d200-dd26-4693-affe-a5539d0a0e0d</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>11</td>\n",
|
||||
" <td># Dulce and Paranormal Military Squad Operatio...</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>8.5</td>\n",
|
||||
" <td>Dulce and Paranormal Military Squad Operations</td>\n",
|
||||
" <td>The impact severity rating is high due to the ...</td>\n",
|
||||
" <td>The community centers around Dulce, a secretiv...</td>\n",
|
||||
" <td>[{'explanation': 'Dulce is described as a top-...</td>\n",
|
||||
" <td>{\\n \"title\": \"Dulce and Paranormal Military...</td>\n",
|
||||
" <td>a8a530b0-ae6b-44ea-b11c-9f70d138298d</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>12</td>\n",
|
||||
" <td># Paranormal Military Squad and Dulce Base Ope...</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.5</td>\n",
|
||||
" <td>Paranormal Military Squad and Dulce Base Opera...</td>\n",
|
||||
" <td>The impact severity rating is relatively high ...</td>\n",
|
||||
" <td>The community centers around the Paranormal Mi...</td>\n",
|
||||
" <td>[{'explanation': 'Taylor is a central figure w...</td>\n",
|
||||
" <td>{\\n \"title\": \"Paranormal Military Squad and...</td>\n",
|
||||
" <td>0478975b-c805-4cc1-b746-82f3e689e2f3</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>13</td>\n",
|
||||
" <td># Mission Dynamics and Leadership: Cruz and Wa...</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.5</td>\n",
|
||||
" <td>Mission Dynamics and Leadership: Cruz and Wash...</td>\n",
|
||||
" <td>The impact severity rating is relatively high ...</td>\n",
|
||||
" <td>This report explores the intricate dynamics of...</td>\n",
|
||||
" <td>[{'explanation': 'Cruz is a central figure in ...</td>\n",
|
||||
" <td>{\\n \"title\": \"Mission Dynamics and Leadersh...</td>\n",
|
||||
" <td>b56f6e68-3951-4f07-8760-63700944a375</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>14</td>\n",
|
||||
" <td># Dulce Base and Paranormal Military Squad: Br...</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>8.5</td>\n",
|
||||
" <td>Dulce Base and Paranormal Military Squad: Brid...</td>\n",
|
||||
" <td>The impact severity rating is high due to the ...</td>\n",
|
||||
" <td>The community centers around the Dulce Base, a...</td>\n",
|
||||
" <td>[{'explanation': 'Sam Rivera, a member of the ...</td>\n",
|
||||
" <td>{\\n \"title\": \"Dulce Base and Paranormal Mil...</td>\n",
|
||||
" <td>736e7006-d050-4abb-a122-00febf3f540f</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" community full_content level rank \\\n",
|
||||
"0 10 # Paranormal Military Squad at Dulce Base: Dec... 1 8.5 \n",
|
||||
"1 11 # Dulce and Paranormal Military Squad Operatio... 1 8.5 \n",
|
||||
"2 12 # Paranormal Military Squad and Dulce Base Ope... 1 7.5 \n",
|
||||
"3 13 # Mission Dynamics and Leadership: Cruz and Wa... 1 7.5 \n",
|
||||
"4 14 # Dulce Base and Paranormal Military Squad: Br... 1 8.5 \n",
|
||||
"\n",
|
||||
" title \\\n",
|
||||
"0 Paranormal Military Squad at Dulce Base: Decod... \n",
|
||||
"1 Dulce and Paranormal Military Squad Operations \n",
|
||||
"2 Paranormal Military Squad and Dulce Base Opera... \n",
|
||||
"3 Mission Dynamics and Leadership: Cruz and Wash... \n",
|
||||
"4 Dulce Base and Paranormal Military Squad: Brid... \n",
|
||||
"\n",
|
||||
" rank_explanation \\\n",
|
||||
"0 The impact severity rating is high due to the ... \n",
|
||||
"1 The impact severity rating is high due to the ... \n",
|
||||
"2 The impact severity rating is relatively high ... \n",
|
||||
"3 The impact severity rating is relatively high ... \n",
|
||||
"4 The impact severity rating is high due to the ... \n",
|
||||
"\n",
|
||||
" summary \\\n",
|
||||
"0 The Paranormal Military Squad, stationed at Du... \n",
|
||||
"1 The community centers around Dulce, a secretiv... \n",
|
||||
"2 The community centers around the Paranormal Mi... \n",
|
||||
"3 This report explores the intricate dynamics of... \n",
|
||||
"4 The community centers around the Dulce Base, a... \n",
|
||||
"\n",
|
||||
" findings \\\n",
|
||||
"0 [{'explanation': 'Jordan is a central figure i... \n",
|
||||
"1 [{'explanation': 'Dulce is described as a top-... \n",
|
||||
"2 [{'explanation': 'Taylor is a central figure w... \n",
|
||||
"3 [{'explanation': 'Cruz is a central figure in ... \n",
|
||||
"4 [{'explanation': 'Sam Rivera, a member of the ... \n",
|
||||
"\n",
|
||||
" full_content_json \\\n",
|
||||
"0 {\\n \"title\": \"Paranormal Military Squad at ... \n",
|
||||
"1 {\\n \"title\": \"Dulce and Paranormal Military... \n",
|
||||
"2 {\\n \"title\": \"Paranormal Military Squad and... \n",
|
||||
"3 {\\n \"title\": \"Mission Dynamics and Leadersh... \n",
|
||||
"4 {\\n \"title\": \"Dulce Base and Paranormal Mil... \n",
|
||||
"\n",
|
||||
" id \n",
|
||||
"0 1ba2d200-dd26-4693-affe-a5539d0a0e0d \n",
|
||||
"1 a8a530b0-ae6b-44ea-b11c-9f70d138298d \n",
|
||||
"2 0478975b-c805-4cc1-b746-82f3e689e2f3 \n",
|
||||
"3 b56f6e68-3951-4f07-8760-63700944a375 \n",
|
||||
"4 736e7006-d050-4abb-a122-00febf3f540f "
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"community_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet\")\n",
|
||||
"entity_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_TABLE}.parquet\")\n",
|
||||
"report_df = pd.read_parquet(f\"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet\")\n",
|
||||
"entity_embedding_df = pd.read_parquet(f\"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet\")\n",
|
||||
"\n",
|
||||
"communities = read_indexer_communities(community_df, entity_df, report_df)\n",
|
||||
"communities = read_indexer_communities(community_df, report_df)\n",
|
||||
"reports = read_indexer_reports(\n",
|
||||
" report_df,\n",
|
||||
" entity_df,\n",
|
||||
" community_df,\n",
|
||||
" community_level=COMMUNITY_LEVEL,\n",
|
||||
" dynamic_community_selection=True,\n",
|
||||
")\n",
|
||||
"entities = read_indexer_entities(\n",
|
||||
" entity_df, entity_embedding_df, community_level=COMMUNITY_LEVEL\n",
|
||||
" entity_df, community_df, community_level=COMMUNITY_LEVEL\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(f\"Total report count: {len(report_df)}\")\n",
|
||||
@ -310,27 +148,19 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mini_llm = ChatOpenAI(\n",
|
||||
" api_key=api_key,\n",
|
||||
" model=\"gpt-4o-mini\",\n",
|
||||
" api_type=OpenaiApiType.OpenAI, # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI\n",
|
||||
" max_retries=20,\n",
|
||||
")\n",
|
||||
"mini_token_encoder = tiktoken.encoding_for_model(mini_llm.model)\n",
|
||||
"\n",
|
||||
"context_builder = GlobalCommunityContext(\n",
|
||||
" community_reports=reports,\n",
|
||||
" communities=communities,\n",
|
||||
" entities=entities, # default to None if you don't want to use community weights for ranking\n",
|
||||
" token_encoder=token_encoder,\n",
|
||||
" tokenizer=tokenizer,\n",
|
||||
" dynamic_community_selection=True,\n",
|
||||
" dynamic_community_selection_kwargs={\n",
|
||||
" \"llm\": mini_llm,\n",
|
||||
" \"token_encoder\": mini_token_encoder,\n",
|
||||
" \"model\": model,\n",
|
||||
" \"tokenizer\": tokenizer,\n",
|
||||
" },\n",
|
||||
")"
|
||||
]
|
||||
@ -344,7 +174,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -375,14 +205,14 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"search_engine = GlobalSearch(\n",
|
||||
" llm=llm,\n",
|
||||
" model=model,\n",
|
||||
" context_builder=context_builder,\n",
|
||||
" token_encoder=token_encoder,\n",
|
||||
" tokenizer=tokenizer,\n",
|
||||
" max_data_tokens=12_000, # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)\n",
|
||||
" map_llm_params=map_llm_params,\n",
|
||||
" reduce_llm_params=reduce_llm_params,\n",
|
||||
@ -396,158 +226,20 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"### Overview of Cosmic Vocalization\n",
|
||||
"\n",
|
||||
"Cosmic Vocalization is a phenomenon that has captured the attention of various individuals and groups, becoming a focal point for community interest. It is perceived as a significant cosmic event, with interpretations ranging from a strategic security concern to a metaphorical interstellar duet [Data: Reports (6)].\n",
|
||||
"\n",
|
||||
"### Key Stakeholders and Perspectives\n",
|
||||
"\n",
|
||||
"1. **Paranormal Military Squad**: This group is actively engaged with Cosmic Vocalization, treating it as a strategic element in their security measures. Their involvement underscores the importance of Cosmic Vocalization in broader security contexts. They metaphorically view the Universe as a concert hall, suggesting a unique perspective on cosmic events and their implications for human entities [Data: Reports (6)].\n",
|
||||
"\n",
|
||||
"2. **Alex Mercer**: Alex Mercer perceives Cosmic Vocalization as part of an interstellar duet, indicating a responsive and perhaps artistic approach to understanding these cosmic phenomena. This perspective highlights the diverse interpretations and cultural significance attributed to Cosmic Vocalization [Data: Reports (6)].\n",
|
||||
"\n",
|
||||
"3. **Taylor Cruz**: Taylor Cruz expresses concerns about Cosmic Vocalization, fearing it might serve as a homing tune. This perspective introduces a layer of urgency and potential threat, suggesting that Cosmic Vocalization could have implications beyond mere observation, possibly affecting security or existential considerations [Data: Reports (6)].\n",
|
||||
"\n",
|
||||
"### Implications\n",
|
||||
"\n",
|
||||
"The involvement of these stakeholders and their varied perspectives on Cosmic Vocalization illustrate the complexity and multifaceted nature of this phenomenon. It is not only a subject of scientific and strategic interest but also a cultural and existential topic that prompts diverse interpretations and responses. The strategic engagement by the Paranormal Military Squad and the concerns raised by individuals like Taylor Cruz highlight the potential significance of Cosmic Vocalization in both security and broader cosmic contexts [Data: Reports (6)].\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"result = await search_engine.asearch(\n",
|
||||
" \"What is Cosmic Vocalization and who are involved in it?\"\n",
|
||||
")\n",
|
||||
"result = await search_engine.search(\"What is operation dulce?\")\n",
|
||||
"\n",
|
||||
"print(result.response)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>id</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>occurrence weight</th>\n",
|
||||
" <th>content</th>\n",
|
||||
" <th>rank</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>15</td>\n",
|
||||
" <td>Dulce Base and the Paranormal Military Squad: ...</td>\n",
|
||||
" <td>1.00</td>\n",
|
||||
" <td># Dulce Base and the Paranormal Military Squad...</td>\n",
|
||||
" <td>9.5</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>Earth's Interstellar Communication Initiative</td>\n",
|
||||
" <td>0.16</td>\n",
|
||||
" <td># Earth's Interstellar Communication Initiativ...</td>\n",
|
||||
" <td>8.5</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>16</td>\n",
|
||||
" <td>Dulce Military Base and Alien Intelligence Com...</td>\n",
|
||||
" <td>0.08</td>\n",
|
||||
" <td># Dulce Military Base and Alien Intelligence C...</td>\n",
|
||||
" <td>8.5</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>18</td>\n",
|
||||
" <td>Paranormal Military Squad Team and Dulce Base'...</td>\n",
|
||||
" <td>0.04</td>\n",
|
||||
" <td># Paranormal Military Squad Team and Dulce Bas...</td>\n",
|
||||
" <td>8.5</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>19</td>\n",
|
||||
" <td>Central Terminal and Viewing Monitors at Dulce...</td>\n",
|
||||
" <td>0.02</td>\n",
|
||||
" <td># Central Terminal and Viewing Monitors at Dul...</td>\n",
|
||||
" <td>8.5</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>Dulce Facility and Control Room of Dulce: Extr...</td>\n",
|
||||
" <td>0.02</td>\n",
|
||||
" <td># Dulce Facility and Control Room of Dulce: Ex...</td>\n",
|
||||
" <td>8.5</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>Cosmic Vocalization and Universe Interactions</td>\n",
|
||||
" <td>0.02</td>\n",
|
||||
" <td># Cosmic Vocalization and Universe Interaction...</td>\n",
|
||||
" <td>7.5</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" id title occurrence weight \\\n",
|
||||
"0 15 Dulce Base and the Paranormal Military Squad: ... 1.00 \n",
|
||||
"1 1 Earth's Interstellar Communication Initiative 0.16 \n",
|
||||
"2 16 Dulce Military Base and Alien Intelligence Com... 0.08 \n",
|
||||
"3 18 Paranormal Military Squad Team and Dulce Base'... 0.04 \n",
|
||||
"4 19 Central Terminal and Viewing Monitors at Dulce... 0.02 \n",
|
||||
"5 4 Dulce Facility and Control Room of Dulce: Extr... 0.02 \n",
|
||||
"6 6 Cosmic Vocalization and Universe Interactions 0.02 \n",
|
||||
"\n",
|
||||
" content rank \n",
|
||||
"0 # Dulce Base and the Paranormal Military Squad... 9.5 \n",
|
||||
"1 # Earth's Interstellar Communication Initiativ... 8.5 \n",
|
||||
"2 # Dulce Military Base and Alien Intelligence C... 8.5 \n",
|
||||
"3 # Paranormal Military Squad Team and Dulce Bas... 8.5 \n",
|
||||
"4 # Central Terminal and Viewing Monitors at Dul... 8.5 \n",
|
||||
"5 # Dulce Facility and Control Room of Dulce: Ex... 8.5 \n",
|
||||
"6 # Cosmic Vocalization and Universe Interaction... 7.5 "
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# inspect the data used to build the context for the LLM responses\n",
|
||||
"result.context_data[\"reports\"]"
|
||||
@ -555,27 +247,16 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Build context (gpt-4o-mini)\n",
|
||||
"LLM calls: 12. Prompt tokens: 8565. Output tokens: 1091.\n",
|
||||
"Map-reduce (gpt-4o)\n",
|
||||
"LLM calls: 2. Prompt tokens: 5771. Output tokens: 600.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# inspect number of LLM calls and tokens in dynamic community selection\n",
|
||||
"llm_calls = result.llm_calls_categories[\"build_context\"]\n",
|
||||
"prompt_tokens = result.prompt_tokens_categories[\"build_context\"]\n",
|
||||
"output_tokens = result.output_tokens_categories[\"build_context\"]\n",
|
||||
"print(\n",
|
||||
" f\"Build context ({mini_llm.model})\\nLLM calls: {llm_calls}. Prompt tokens: {prompt_tokens}. Output tokens: {output_tokens}.\"\n",
|
||||
" f\"Build context LLM calls: {llm_calls}. Prompt tokens: {prompt_tokens}. Output tokens: {output_tokens}.\"\n",
|
||||
")\n",
|
||||
"# inspect number of LLM calls and tokens in map-reduce\n",
|
||||
"llm_calls = result.llm_calls_categories[\"map\"] + result.llm_calls_categories[\"reduce\"]\n",
|
||||
@ -586,7 +267,7 @@
|
||||
" result.output_tokens_categories[\"map\"] + result.output_tokens_categories[\"reduce\"]\n",
|
||||
")\n",
|
||||
"print(\n",
|
||||
" f\"Map-reduce ({llm.model})\\nLLM calls: {llm_calls}. Prompt tokens: {prompt_tokens}. Output tokens: {output_tokens}.\"\n",
|
||||
" f\"Map-reduce LLM calls: {llm_calls}. Prompt tokens: {prompt_tokens}. Output tokens: {output_tokens}.\"\n",
|
||||
")"
|
||||
]
|
||||
}
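The per-category counters used in the cell above also make it easy to total usage for the whole query. A minimal sketch (assuming `result.llm_calls_categories`, `result.prompt_tokens_categories`, and `result.output_tokens_categories` are plain dicts keyed by category such as `build_context`, `map`, and `reduce`, as the cell implies):

```python
# Sum LLM usage across every category reported for this query.
total_calls = sum(result.llm_calls_categories.values())
total_prompt_tokens = sum(result.prompt_tokens_categories.values())
total_output_tokens = sum(result.output_tokens_categories.values())
print(
    f"Total LLM calls: {total_calls}. "
    f"Prompt tokens: {total_prompt_tokens}. Output tokens: {total_output_tokens}."
)
```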
@ -607,7 +288,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
"version": "3.12.10"
}
},
"nbformat": 4,
@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
@ -14,9 +14,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Index Migration\n",
"## Index Migration (pre-v1 to v1)\n",
"\n",
"This notebook is used to maintain data model parity with older indexes for the latest versions of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.\n",
"This notebook is used to maintain data model parity with older indexes for version 1.0 of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.\n",
"\n",
"NOTE: we recommend regenerating your settings.yml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. This also ensures that you have default vector store config, which is now required or indexing will fail.\n",
"\n",
@ -25,39 +25,38 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"# This is the directory that has your settings.yml\n",
"# This is the directory that has your settings.yaml\n",
"# NOTE: much older indexes may have been output with a timestamped directory\n",
"# if this is the case, you will need to make sure the storage.base_dir in settings.yml points to it correctly\n",
"PROJECT_DIRECTORY = \"<your project directory>\""
"# if this is the case, you will need to make sure the storage.base_dir in settings.yaml points to it correctly\n",
"PROJECT_DIRECTORY = \"<your project directory\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from graphrag.config.load_config import load_config\n",
"from graphrag.config.resolve_path import resolve_paths\n",
"from graphrag.index.create_pipeline_config import create_pipeline_config\n",
"from graphrag.storage.factory import create_storage\n",
"from graphrag.storage.factory import StorageFactory\n",
"\n",
"# This first block does some config loading, path resolution, and translation that is normally done by the CLI/API when running a full workflow\n",
"config = load_config(Path(PROJECT_DIRECTORY))\n",
"resolve_paths(config)\n",
"pipeline_config = create_pipeline_config(config)\n",
"storage = create_storage(pipeline_config.storage)"
"storage_config = config.output.model_dump()\n",
"storage = StorageFactory().create_storage(\n",
" storage_type=storage_config[\"type\"],\n",
" kwargs=storage_config,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
@ -68,7 +67,7 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
@ -97,7 +96,7 @@
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
@ -108,22 +107,16 @@
"# First we'll go through any parquet files that had model changes and update them\n",
"# The new data model may have removed excess columns as well, but we will only make the minimal changes required for compatibility\n",
"\n",
"final_documents = await load_table_from_storage(\n",
" \"create_final_documents.parquet\", storage\n",
")\n",
"final_text_units = await load_table_from_storage(\n",
" \"create_final_text_units.parquet\", storage\n",
")\n",
"final_entities = await load_table_from_storage(\"create_final_entities.parquet\", storage)\n",
"final_nodes = await load_table_from_storage(\"create_final_nodes.parquet\", storage)\n",
"final_documents = await load_table_from_storage(\"create_final_documents\", storage)\n",
"final_text_units = await load_table_from_storage(\"create_final_text_units\", storage)\n",
"final_entities = await load_table_from_storage(\"create_final_entities\", storage)\n",
"final_nodes = await load_table_from_storage(\"create_final_nodes\", storage)\n",
"final_relationships = await load_table_from_storage(\n",
" \"create_final_relationships.parquet\", storage\n",
")\n",
"final_communities = await load_table_from_storage(\n",
" \"create_final_communities.parquet\", storage\n",
" \"create_final_relationships\", storage\n",
")\n",
"final_communities = await load_table_from_storage(\"create_final_communities\", storage)\n",
"final_community_reports = await load_table_from_storage(\n",
" \"create_final_community_reports.parquet\", storage\n",
" \"create_final_community_reports\", storage\n",
")\n",
"\n",
"\n",
@ -139,7 +132,7 @@
"if \"name\" in final_entities.columns:\n",
" final_entities.rename(columns={\"name\": \"title\"}, inplace=True)\n",
"remove_columns(\n",
" final_entities, [\"mname_embedding\", \"graph_embedding\", \"description_embedding\"]\n",
" final_entities, [\"name_embedding\", \"graph_embedding\", \"description_embedding\"]\n",
")\n",
"\n",
"# Final nodes uses community for joins, which is now an int everywhere\n",
@ -173,6 +166,15 @@
" final_communities[\"id\"] = [str(uuid4()) for _ in range(len(final_communities))]\n",
"if \"parent\" not in final_communities.columns:\n",
" final_communities = final_communities.merge(parent_df, on=\"community\", how=\"left\")\n",
"if \"entity_ids\" not in final_communities.columns:\n",
" node_mapping = (\n",
" final_nodes.loc[:, [\"community\", \"id\"]]\n",
" .groupby(\"community\")\n",
" .agg(entity_ids=(\"id\", list))\n",
" )\n",
" final_communities = final_communities.merge(\n",
" node_mapping, on=\"community\", how=\"left\"\n",
" )\n",
"remove_columns(final_communities, [\"raw_community\"])\n",
"\n",
"# We need int for community and the human_readable_id copy for consistency\n",
@ -183,44 +185,42 @@
" parent_df, on=\"community\", how=\"left\"\n",
" )\n",
"\n",
"await write_table_to_storage(final_documents, \"create_final_documents.parquet\", storage)\n",
"await write_table_to_storage(final_documents, \"create_final_documents\", storage)\n",
"await write_table_to_storage(final_text_units, \"create_final_text_units\", storage)\n",
"await write_table_to_storage(final_entities, \"create_final_entities\", storage)\n",
"await write_table_to_storage(final_nodes, \"create_final_nodes\", storage)\n",
"await write_table_to_storage(final_relationships, \"create_final_relationships\", storage)\n",
"await write_table_to_storage(final_communities, \"create_final_communities\", storage)\n",
"await write_table_to_storage(\n",
" final_text_units, \"create_final_text_units.parquet\", storage\n",
")\n",
"await write_table_to_storage(final_entities, \"create_final_entities.parquet\", storage)\n",
"await write_table_to_storage(final_nodes, \"create_final_nodes.parquet\", storage)\n",
"await write_table_to_storage(\n",
" final_relationships, \"create_final_relationships.parquet\", storage\n",
")\n",
"await write_table_to_storage(\n",
" final_communities, \"create_final_communities.parquet\", storage\n",
")\n",
"await write_table_to_storage(\n",
" final_community_reports, \"create_final_community_reports.parquet\", storage\n",
" final_community_reports, \"create_final_community_reports\", storage\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from graphrag.cache.factory import create_cache\n",
"from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks\n",
"from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings\n",
"\n",
"from graphrag.cache.factory import CacheFactory\n",
"from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks\n",
"from graphrag.config.embeddings import get_embedded_fields, get_embedding_settings\n",
"\n",
"# We only need to re-run the embeddings workflow, to ensure that embeddings for all required search fields are in place\n",
"# We'll construct the context and run this function flow directly to avoid everything else\n",
"\n",
"workflow = next(\n",
" (x for x in pipeline_config.workflows if x.name == \"generate_text_embeddings\"), None\n",
")\n",
"config = workflow.config\n",
"text_embed = config.get(\"text_embed\", {})\n",
"embedded_fields = config.get(\"embedded_fields\", {})\n",
"\n",
"embedded_fields = get_embedded_fields(config)\n",
"text_embed = get_embedding_settings(config)\n",
"callbacks = NoopWorkflowCallbacks()\n",
"cache = create_cache(pipeline_config.cache, PROJECT_DIRECTORY)\n",
"cache_config = config.cache.model_dump() # type: ignore\n",
"cache = CacheFactory().create_cache(\n",
" cache_type=cache_config[\"type\"], # type: ignore\n",
" root_dir=PROJECT_DIRECTORY,\n",
" kwargs=cache_config,\n",
")\n",
"\n",
"await generate_text_embeddings(\n",
" final_documents=None,\n",
171
docs/examples_notebooks/index_migration_to_v2.ipynb
Normal file
@ -0,0 +1,171 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Copyright (c) 2024 Microsoft Corporation.\n",
"# Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Index Migration (v1 to v2)\n",
"\n",
"This notebook is used to maintain data model parity with older indexes for version 2.0 of GraphRAG. If you have a pre-2.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment. If you have a pre-1.0 index, please run the v1 migration notebook first!\n",
"\n",
"NOTE: we recommend regenerating your settings.yml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. This also ensures that you have default vector store config, which is now required or indexing will fail.\n",
"\n",
"WARNING: This will overwrite your parquet files, you may want to make a backup!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This is the directory that has your settings.yaml\n",
"PROJECT_DIRECTORY = \"<your project directory>\""
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from graphrag.config.load_config import load_config\n",
"from graphrag.storage.factory import StorageFactory\n",
"\n",
"config = load_config(Path(PROJECT_DIRECTORY))\n",
"storage_config = config.output.model_dump()\n",
"storage = StorageFactory().create_storage(\n",
" storage_type=storage_config[\"type\"],\n",
" kwargs=storage_config,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"def remove_columns(df, columns):\n",
" \"\"\"Remove columns from a DataFrame, suppressing errors.\"\"\"\n",
" df.drop(labels=columns, axis=1, errors=\"ignore\", inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"from graphrag.utils.storage import (\n",
" delete_table_from_storage,\n",
" load_table_from_storage,\n",
" write_table_to_storage,\n",
")\n",
"\n",
"final_documents = await load_table_from_storage(\"create_final_documents\", storage)\n",
"final_text_units = await load_table_from_storage(\"create_final_text_units\", storage)\n",
"final_entities = await load_table_from_storage(\"create_final_entities\", storage)\n",
"final_covariates = await load_table_from_storage(\"create_final_covariates\", storage)\n",
"final_nodes = await load_table_from_storage(\"create_final_nodes\", storage)\n",
"final_relationships = await load_table_from_storage(\n",
" \"create_final_relationships\", storage\n",
")\n",
"final_communities = await load_table_from_storage(\"create_final_communities\", storage)\n",
"final_community_reports = await load_table_from_storage(\n",
" \"create_final_community_reports\", storage\n",
")\n",
"\n",
"# we've renamed document attributes as metadata\n",
"if \"attributes\" in final_documents.columns:\n",
" final_documents.rename(columns={\"attributes\": \"metadata\"}, inplace=True)\n",
"\n",
"# we're removing the nodes table, so we need to copy the graph columns into entities\n",
"graph_props = (\n",
" final_nodes.loc[:, [\"id\", \"degree\", \"x\", \"y\"]].groupby(\"id\").first().reset_index()\n",
")\n",
"final_entities = final_entities.merge(graph_props, on=\"id\", how=\"left\")\n",
"# we're also persisting the frequency column\n",
|
||||
"final_entities[\"frequency\"] = final_entities[\"text_unit_ids\"].count()\n",
"\n",
"\n",
"# we added children to communities to eliminate query-time reconstruction\n",
"parent_grouped = final_communities.groupby(\"parent\").agg(\n",
" children=(\"community\", \"unique\")\n",
")\n",
"final_communities = final_communities.merge(\n",
" parent_grouped,\n",
" left_on=\"community\",\n",
" right_on=\"parent\",\n",
" how=\"left\",\n",
")\n",
"# replace NaN children with empty list\n",
"final_communities[\"children\"] = final_communities[\"children\"].apply(\n",
" lambda x: x if isinstance(x, np.ndarray) else [] # type: ignore\n",
")\n",
"\n",
"# add children to the reports as well\n",
"final_community_reports = final_community_reports.merge(\n",
" parent_grouped,\n",
" left_on=\"community\",\n",
" right_on=\"parent\",\n",
" how=\"left\",\n",
")\n",
"\n",
"# we renamed all the output files for better clarity now that we don't have workflow naming constraints from DataShaper\n",
"await write_table_to_storage(final_documents, \"documents\", storage)\n",
"await write_table_to_storage(final_text_units, \"text_units\", storage)\n",
"await write_table_to_storage(final_entities, \"entities\", storage)\n",
"await write_table_to_storage(final_relationships, \"relationships\", storage)\n",
"await write_table_to_storage(final_covariates, \"covariates\", storage)\n",
"await write_table_to_storage(final_communities, \"communities\", storage)\n",
"await write_table_to_storage(final_community_reports, \"community_reports\", storage)\n",
"\n",
"# delete all the old versions\n",
"await delete_table_from_storage(\"create_final_documents\", storage)\n",
"await delete_table_from_storage(\"create_final_text_units\", storage)\n",
"await delete_table_from_storage(\"create_final_entities\", storage)\n",
"await delete_table_from_storage(\"create_final_nodes\", storage)\n",
"await delete_table_from_storage(\"create_final_relationships\", storage)\n",
"await delete_table_from_storage(\"create_final_covariates\", storage)\n",
"await delete_table_from_storage(\"create_final_communities\", storage)\n",
"await delete_table_from_storage(\"create_final_community_reports\", storage)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
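Since this notebook warns that it overwrites your parquet files, here is a minimal backup sketch (an assumption-laden helper, not part of the notebook: it presumes file-based storage with outputs under `output/` and `PROJECT_DIRECTORY` as defined above):

```python
import shutil
from pathlib import Path

# Copy the pipeline outputs aside before migrating, so the originals survive
# if the migration needs to be re-run.
output_dir = Path(PROJECT_DIRECTORY) / "output"
backup_dir = Path(PROJECT_DIRECTORY) / "output_backup"
shutil.copytree(output_dir, backup_dir, dirs_exist_ok=True)
```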
194
docs/examples_notebooks/input_documents.ipynb
Normal file
@ -0,0 +1,194 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Copyright (c) 2024 Microsoft Corporation.\n",
"# Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example of indexing from an existing in-memory dataframe\n",
"\n",
"Newer versions of GraphRAG let you submit a dataframe directly instead of running through the input processing step. This notebook demonstrates both regular and update runs.\n",
"\n",
"If performing an update, the assumption is that your dataframe contains only the new documents to add to the index."
]
},
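A minimal sketch of building such a dataframe directly in memory (the values are illustrative; the required columns match the ones selected from parquet in the next cell):

```python
import pandas as pd

# The bare-minimum input fields: id, title, text, creation_date.
inputs = pd.DataFrame([
    {
        "id": "doc-1",
        "title": "example.txt",
        "text": "Full text of the first document...",
        "creation_date": "2024-01-01",
    },
])
```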
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"from pprint import pprint\n",
"\n",
"import pandas as pd\n",
"\n",
"import graphrag.api as api\n",
"from graphrag.config.load_config import load_config\n",
"from graphrag.index.typing.pipeline_run_result import PipelineRunResult"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"PROJECT_DIRECTORY = \"<your project directory>\"\n",
"UPDATE = False\n",
"FILENAME = \"new_documents.parquet\" if UPDATE else \"<original_documents>.parquet\"\n",
"inputs = pd.read_parquet(f\"{PROJECT_DIRECTORY}/input/{FILENAME}\")\n",
"# Only the bare minimum for input. These are the same fields that would be present after the load_input_documents workflow\n",
"inputs = inputs.loc[:, [\"id\", \"title\", \"text\", \"creation_date\"]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate a `GraphRagConfig` object"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"graphrag_config = load_config(Path(PROJECT_DIRECTORY))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Indexing API\n",
"\n",
"*Indexing* is the process of ingesting raw text data and constructing a knowledge graph. GraphRAG currently supports plaintext (`.txt`) and `.csv` file formats."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Build an index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"index_result: list[PipelineRunResult] = await api.build_index(\n",
" config=graphrag_config, input_documents=inputs, is_update_run=UPDATE\n",
")\n",
"\n",
"# index_result is a list of workflows that make up the indexing pipeline that was run\n",
"for workflow_result in index_result:\n",
" status = f\"error\\n{workflow_result.errors}\" if workflow_result.errors else \"success\"\n",
" print(f\"Workflow Name: {workflow_result.workflow}\\tStatus: {status}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Query an index\n",
"\n",
"To query an index, several index files must first be read into memory and passed to the query API. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"entities = pd.read_parquet(f\"{PROJECT_DIRECTORY}/output/entities.parquet\")\n",
"communities = pd.read_parquet(f\"{PROJECT_DIRECTORY}/output/communities.parquet\")\n",
"community_reports = pd.read_parquet(\n",
" f\"{PROJECT_DIRECTORY}/output/community_reports.parquet\"\n",
")\n",
"\n",
"response, context = await api.global_search(\n",
" config=graphrag_config,\n",
" entities=entities,\n",
" communities=communities,\n",
" community_reports=community_reports,\n",
" community_level=2,\n",
" dynamic_community_selection=False,\n",
" response_type=\"Multiple Paragraphs\",\n",
" query=\"What are the top five themes of the dataset?\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The response object is the official reponse from graphrag while the context object holds various metadata regarding the querying process used to obtain the final response."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Digging into the context a bit more provides users with extremely granular information such as what sources of data (down to the level of text chunks) were ultimately retrieved and used as part of the context sent to the LLM model)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pprint(context) # noqa: T203"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "graphrag",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
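For a more targeted look than `pprint(context)`, a hedged sketch of drilling into one slice of the context (assuming `context` is a dict keyed by context type, with a `reports` entry like the `result.context_data["reports"]` table inspected earlier in this diff):

```python
# List the available context categories, then peek at the community
# reports that were packed into the prompt.
print(list(context.keys()))
reports = context.get("reports")
if reports is not None:
    print(reports)
```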
BIN
docs/examples_notebooks/inputs/operation dulce/documents.parquet
Normal file
Binary file not shown.
BIN
docs/examples_notebooks/inputs/operation dulce/entities.parquet
Normal file
Binary file not shown.
[The remaining changes in this comparison are binary LanceDB vector-store files (parquet tables and .manifest files) under docs/examples_notebooks/inputs/; their contents cannot be rendered as text.]