graphrag/tests/integration/vector_stores/test_lancedb.py
Derek Worthen 619269243d
Restructure project as monorepo. (#2111)
* Restructure project as monorepo.
2025-11-04 09:51:56 -08:00

233 lines
8.3 KiB
Python

# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""Integration tests for LanceDB vector store implementation."""
import shutil
import tempfile
import numpy as np
import pytest
from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig
from graphrag.vector_stores.base import VectorStoreDocument
from graphrag.vector_stores.lancedb import LanceDBVectorStore
class TestLanceDBVectorStore:
    """Integration tests for the LanceDB vector store.

    Each test connects a ``LanceDBVectorStore`` to its own temporary
    directory, which is removed when the test finishes (pass or fail).
    """

    @pytest.fixture
    def sample_documents(self) -> list[VectorStoreDocument]:
        """Return three documents with ids "1".."3" and 5-dimensional vectors."""
        return [
            VectorStoreDocument(
                id="1",
                vector=[0.1, 0.2, 0.3, 0.4, 0.5],
            ),
            VectorStoreDocument(
                id="2",
                vector=[0.2, 0.3, 0.4, 0.5, 0.6],
            ),
            VectorStoreDocument(
                id="3",
                vector=[0.3, 0.4, 0.5, 0.6, 0.7],
            ),
        ]

    @pytest.fixture
    def sample_documents_categories(self) -> list[VectorStoreDocument]:
        """Return the documents used by ``test_filter_search``.

        NOTE: despite the fixture name, the documents carry no category
        information; they hold the same ids and vectors as
        ``sample_documents``.
        """
        return [
            VectorStoreDocument(
                id="1",
                vector=[0.1, 0.2, 0.3, 0.4, 0.5],
            ),
            VectorStoreDocument(
                id="2",
                vector=[0.2, 0.3, 0.4, 0.5, 0.6],
            ),
            VectorStoreDocument(
                id="3",
                vector=[0.3, 0.4, 0.5, 0.6, 0.7],
            ),
        ]

    def _assert_index_table_exists(self, vector_store) -> None:
        """Assert that the store's index name is listed among the database tables."""
        # Check the name first so a missing index name fails the test loudly
        # instead of silently skipping the table check.
        assert vector_store.index_name
        assert vector_store.index_name in vector_store.db_connection.table_names()

    def _check_basic_operations(self, vector_store, documents) -> None:
        """Run the load / lookup / search scenario shared by several tests.

        Args:
            vector_store: A ``LanceDBVectorStore`` that is already connected.
            documents: Three documents with ids "1", "2" and "3"; the first
                must have the vector ``[0.1, 0.2, 0.3, 0.4, 0.5]``.
        """
        vector_store.create_index()
        vector_store.load_documents(documents[:2])
        self._assert_index_table_exists(vector_store)

        # Look up a loaded document by id and check its vector round-trips.
        doc = vector_store.search_by_id("1")
        assert doc.id == "1"
        assert doc.vector is not None
        assert np.allclose(doc.vector, [0.1, 0.2, 0.3, 0.4, 0.5])

        # Similarity search by vector.
        results = vector_store.similarity_search_by_vector(
            [0.1, 0.2, 0.3, 0.4, 0.5], k=2
        )
        assert 1 <= len(results) <= 2
        assert isinstance(results[0].score, float)

        # Call create_index() again, load the third document, and look it up.
        vector_store.create_index()
        vector_store.load_documents([documents[2]])
        result = vector_store.search_by_id("3")
        assert result.id == "3"

        # Stand-in embedder: returns the same fixed vector for any text.
        def mock_embedder(text: str) -> list[float]:
            return [0.1, 0.2, 0.3, 0.4, 0.5]

        # Similarity search by text, embedded through the stand-in above.
        text_results = vector_store.similarity_search_by_text(
            "test query", mock_embedder, k=2
        )
        assert 1 <= len(text_results) <= 2
        assert isinstance(text_results[0].score, float)

        # An unknown id yields a document with that id and no vector.
        non_existent = vector_store.search_by_id("nonexistent")
        assert non_existent.id == "nonexistent"
        assert non_existent.vector is None

    def test_vector_store_operations(self, sample_documents) -> None:
        """Test basic vector store operations with LanceDB."""
        # The context manager removes the database directory even on failure.
        with tempfile.TemporaryDirectory() as temp_dir:
            vector_store = LanceDBVectorStore(
                vector_store_schema_config=VectorStoreSchemaConfig(
                    index_name="test_collection", vector_size=5
                )
            )
            vector_store.connect(db_uri=temp_dir)
            self._check_basic_operations(vector_store, sample_documents)

    def test_empty_collection(self) -> None:
        """Test that a collection survives being emptied and accepts new documents."""
        with tempfile.TemporaryDirectory() as temp_dir:
            vector_store = LanceDBVectorStore(
                vector_store_schema_config=VectorStoreSchemaConfig(
                    index_name="empty_collection", vector_size=5
                )
            )
            vector_store.connect(db_uri=temp_dir)

            # Load the vector store with a document, then delete it.
            sample_doc = VectorStoreDocument(
                id="tmp",
                vector=[0.1, 0.2, 0.3, 0.4, 0.5],
            )
            vector_store.create_index()
            vector_store.load_documents([sample_doc])
            # Fail here with a clear assertion rather than trying to open a
            # table named "" when no index name is set.
            assert vector_store.index_name
            vector_store.db_connection.open_table(vector_store.index_name).delete(
                "id = 'tmp'"
            )

            # The table should still exist after its only row was deleted.
            self._assert_index_table_exists(vector_store)

            # Add a document after the collection has been emptied.
            doc = VectorStoreDocument(
                id="1",
                vector=[0.1, 0.2, 0.3, 0.4, 0.5],
            )
            vector_store.create_index()
            vector_store.load_documents([doc])
            result = vector_store.search_by_id("1")
            assert result.id == "1"

    def test_filter_search(self, sample_documents_categories) -> None:
        """Test similarity search over a fully loaded collection.

        NOTE: despite the test name, no filter is passed to the search; the
        test only checks that the returned ids come from the loaded documents.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            vector_store = LanceDBVectorStore(
                vector_store_schema_config=VectorStoreSchemaConfig(
                    index_name="filter_collection", vector_size=5
                )
            )
            vector_store.connect(db_uri=temp_dir)
            vector_store.create_index()
            vector_store.load_documents(sample_documents_categories)

            results = vector_store.similarity_search_by_vector(
                [0.1, 0.2, 0.3, 0.4, 0.5], k=3
            )

            # At most k=3 results, all drawn from the loaded documents.
            assert len(results) <= 3
            ids = {result.document.id for result in results}
            assert ids.issubset({"1", "2", "3"})

    def test_vector_store_customization(self, sample_documents) -> None:
        """Test the basic operations with custom id and vector field names."""
        with tempfile.TemporaryDirectory() as temp_dir:
            vector_store = LanceDBVectorStore(
                vector_store_schema_config=VectorStoreSchemaConfig(
                    index_name="text-embeddings",
                    id_field="id_custom",
                    vector_field="vector_custom",
                    vector_size=5,
                ),
            )
            vector_store.connect(db_uri=temp_dir)
            self._check_basic_operations(vector_store, sample_documents)