Pandas-ify Create Final Entities (#1225)

2026-01-14 09:07:20 +08:00 · 2024-09-26 15:09:40 -06:00 · 2024-09-26 15:09:40 -06:00 · 737a471d18
commit 737a471d18
parent ce71bcf7fb
2 changed files with 31 additions and 36 deletions
--- a/.semversioner/next-release/patch-20240926192015519738.json
+++ b/.semversioner/next-release/patch-20240926192015519738.json
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Apply pandas optimizations to create final entities"
+}
--- a/graphrag/index/workflows/v1/subflows/create_final_entities.py
+++ b/graphrag/index/workflows/v1/subflows/create_final_entities.py
@@ -37,12 +37,12 @@ async def create_final_entities(
    """All the steps to transform final entities."""
    table = cast(pd.DataFrame, input.get_input())

-    nodes = unpack_graph_df(table, callbacks, "clustered_graph", "nodes")
-    nodes.rename(columns={"label": "name"}, inplace=True)
-
-    nodes = cast(
-        pd.DataFrame,
-        nodes[
+    # Process nodes
+    nodes = (
+        unpack_graph_df(table, callbacks, "clustered_graph", "nodes")
+        .rename(columns={"label": "name"})
+        .loc[
+            :,
            [
                "id",
                "name",
@@ -51,27 +51,22 @@ async def create_final_entities(
                "human_readable_id",
                "graph_embedding",
                "source_id",
-            ]
-        ],
+            ],
+        ]
+        .drop_duplicates(subset="id")
    )

-    # create_base_entity_graph has multiple levels of clustering, which means there are multiple graphs with the same entities
-    # this dedupes the entities so that there is only one of each entity
-    nodes.drop_duplicates(subset="id", inplace=True)
+    nodes = nodes.loc[nodes["name"].notna()]

-    # eliminate empty names
-    filtered = cast(pd.DataFrame, nodes[nodes["name"].notna()].reset_index(drop=True))
-
-    with_ids = text_split_df(
-        filtered, column="source_id", separator=",", to="text_unit_ids"
-    )
-    with_ids.drop(columns=["source_id"], inplace=True)
-
-    embedded = with_ids
+    # Split 'source_id' column into 'text_unit_ids'
+    nodes = text_split_df(
+        nodes, column="source_id", separator=",", to="text_unit_ids"
+    ).drop(columns=["source_id"])

+    # Embed name if not skipped
    if not skip_name_embedding:
-        embedded = await text_embed_df(
-            embedded,
+        nodes = await text_embed_df(
+            nodes,
            callbacks,
            cache,
            column="name",
@@ -80,11 +75,11 @@ async def create_final_entities(
            embedding_name="entity_name",
        )

+    # Embed description if not skipped
    if not skip_description_embedding:
-        # description embedding is a concat of the name + description, so we'll create a temporary column
-        embedded["name_description"] = embedded["name"] + ":" + embedded["description"]
-        embedded = await text_embed_df(
-            embedded,
+        # Concatenate 'name' and 'description' and embed
+        nodes = await text_embed_df(
+            nodes.assign(name_description=nodes["name"] + ":" + nodes["description"]),
            callbacks,
            cache,
            column="name_description",
@@ -92,14 +87,10 @@ async def create_final_entities(
            to="description_embedding",
            embedding_name="entity_name_description",
        )
-        embedded.drop(columns=["name_description"], inplace=True)
-        is_using_vector_store = (
-            description_text_embed.get("strategy", {}).get("vector_store", None)
-            is not None
-        )
-        if not is_using_vector_store:
-            embedded = embedded[embedded["description_embedding"].notna()].reset_index(
-                drop=True
-            )

-    return create_verb_result(cast(Table, embedded))
+        # Drop rows with NaN 'description_embedding' if not using vector store
+        if not description_text_embed.get("strategy", {}).get("vector_store"):
+            nodes = nodes.loc[nodes["description_embedding"].notna()]
+
+    # Return final result
+    return create_verb_result(cast(Table, nodes))