From 9b14b9da3614195a3d8f40f54ea7afbe10755a31 Mon Sep 17 00:00:00 2001
From: Gaudy Blanco <gaudy-microsoft@MacBook-Pro-m4-Gaudy-For-Work.local>
Date: Sat, 13 Sep 2025 00:27:55 -0600
Subject: [PATCH] Fix LanceDB vector storage: write vectors as fixed-size float32 lists

---
 .vscode/launch.json               | 17 +++++----
 graphrag/vector_stores/lancedb.py | 58 +++++++++++++++++++------------
 2 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 944b77dd..641fb933 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -6,21 +6,24 @@
 			"name": "Indexer",
 			"type": "debugpy",
 			"request": "launch",
-			"module": "uv",
+			"module": "graphrag",
 			"args": [
-				"poe", "index",
-				"--root", "<path_to_ragtest_root_demo>"
+				"index",
+				"--root", 
+				"<path_to_index_folder>"
 			],
+			"console": "integratedTerminal"
 		},
 		{
 			"name": "Query",
 			"type": "debugpy",
 			"request": "launch",
-			"module": "uv",
+			"module": "graphrag",
 			"args": [
-				"poe", "query",
-				"--root", "<path_to_ragtest_root_demo>",
-				"--method", "global",
+				"query",
+				"--root", 
+				"<path_to_index_folder>",
+				"--method", "basic",
 				"--query", "What are the top themes in this story",
 			]
 		},
diff --git a/graphrag/vector_stores/lancedb.py b/graphrag/vector_stores/lancedb.py
index 0da2df50..f8c8c468 100644
--- a/graphrag/vector_stores/lancedb.py
+++ b/graphrag/vector_stores/lancedb.py
@@ -5,9 +5,8 @@
 
 import json  # noqa: I001
 from typing import Any
-
 import pyarrow as pa
-
+import numpy as np
 from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig
 from graphrag.data_model.types import TextEmbedder
 
@@ -37,42 +36,54 @@ class LanceDBVectorStore(BaseVectorStore):
                 self.index_name
             )
 
+
     def load_documents(
         self, documents: list[VectorStoreDocument], overwrite: bool = True
     ) -> None:
         """Load documents into vector storage."""
-        data = [
-            {
-                self.id_field: document.id,
-                self.text_field: document.text,
-                self.vector_field: document.vector,
-                self.attributes_field: json.dumps(document.attributes),
-            }
-            for document in documents
-            if document.vector is not None
-        ]
+        # Step 1: Prepare data columns manually (skip documents whose vector is missing or not vector_size long)
+        ids = []
+        texts = []
+        vectors = []
+        attributes = []
 
-        if len(data) == 0:
+        for document in documents:
+            if document.vector is not None and len(document.vector) == self.vector_size:
+                ids.append(document.id)
+                texts.append(document.text)
+                vectors.append(np.array(document.vector, dtype=np.float32))
+                attributes.append(json.dumps(document.attributes))
+
+        # Step 2: Handle empty case
+        if len(ids) == 0:
             data = None
+        else:
+            # Step 3: Flatten the vectors and build FixedSizeListArray manually
+            flat_vector = np.concatenate(vectors).astype(np.float32)
+            flat_array = pa.array(flat_vector, type=pa.float32())
+            vector_column = pa.FixedSizeListArray.from_arrays(flat_array, self.vector_size)
+
+            # Step 4: Create PyArrow table (schema comes from the explicitly typed arrays)
+            data = pa.table({
+                self.id_field: pa.array(ids, type=pa.string()),
+                self.text_field: pa.array(texts, type=pa.string()),
+                self.vector_field: vector_column,
+                self.attributes_field: pa.array(attributes, type=pa.string())
+            })
 
-        schema = pa.schema([
-            pa.field(self.id_field, pa.string()),
-            pa.field(self.text_field, pa.string()),
-            pa.field(self.vector_field, pa.list_(pa.float64())),
-            pa.field(self.attributes_field, pa.string()),
-        ])
         # NOTE: If modifying the next section of code, ensure that the schema remains the same.
         #       The pyarrow format of the 'vector' field may change if the order of operations is changed
         #       and will break vector search.
         if overwrite:
             if data:
                 self.document_collection = self.db_connection.create_table(
-                    self.index_name, data=data, mode="overwrite"
+                    self.index_name, data=data, mode="overwrite", schema=data.schema
                 )
             else:
                 self.document_collection = self.db_connection.create_table(
-                    self.index_name, schema=schema, mode="overwrite"
+                    self.index_name, mode="overwrite"
                 )
+            self.document_collection.create_index(vector_column_name=self.vector_field, index_type="IVF_FLAT")
         else:
             # add data to existing table
             self.document_collection = self.db_connection.open_table(
@@ -80,7 +91,6 @@ class LanceDBVectorStore(BaseVectorStore):
             )
             if data:
                 self.document_collection.add(data)
-                self.document_collection.create_index(vector_column_name=self.vector_field)
 
     def filter_by_id(self, include_ids: list[str] | list[int]) -> Any:
         """Build a query filter to filter documents by id."""
@@ -97,7 +107,7 @@ class LanceDBVectorStore(BaseVectorStore):
         return self.query_filter
 
     def similarity_search_by_vector(
-        self, query_embedding: list[float], k: int = 10, **kwargs: Any
+        self, query_embedding: list[float] | np.ndarray, k: int = 10, **kwargs: Any
     ) -> list[VectorStoreSearchResult]:
         """Perform a vector-based similarity search."""
         if self.query_filter:
@@ -110,6 +120,8 @@ class LanceDBVectorStore(BaseVectorStore):
                 .to_list()
             )
         else:
+            query_embedding = np.array(query_embedding, dtype=np.float32)
+
             docs = (
                 self.document_collection.search(
                     query=query_embedding, vector_column_name=self.vector_field