Pandas-ify Create Final Entities (#1225)

This commit is contained in:
Alonso Guevara 2024-09-26 15:09:40 -06:00 committed by GitHub
parent ce71bcf7fb
commit 737a471d18
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 31 additions and 36 deletions

View File

@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Apply pandas optimizations to create final entities"
}

View File

@@ -37,12 +37,12 @@ async def create_final_entities(
"""All the steps to transform final entities."""
table = cast(pd.DataFrame, input.get_input())
nodes = unpack_graph_df(table, callbacks, "clustered_graph", "nodes")
nodes.rename(columns={"label": "name"}, inplace=True)
nodes = cast(
pd.DataFrame,
nodes[
# Process nodes
nodes = (
unpack_graph_df(table, callbacks, "clustered_graph", "nodes")
.rename(columns={"label": "name"})
.loc[
:,
[
"id",
"name",
@@ -51,27 +51,22 @@ async def create_final_entities(
"human_readable_id",
"graph_embedding",
"source_id",
]
],
],
]
.drop_duplicates(subset="id")
)
# create_base_entity_graph has multiple levels of clustering, which means there are multiple graphs with the same entities
# this dedupes the entities so that there is only one of each entity
nodes.drop_duplicates(subset="id", inplace=True)
nodes = nodes.loc[nodes["name"].notna()]
# eliminate empty names
filtered = cast(pd.DataFrame, nodes[nodes["name"].notna()].reset_index(drop=True))
with_ids = text_split_df(
filtered, column="source_id", separator=",", to="text_unit_ids"
)
with_ids.drop(columns=["source_id"], inplace=True)
embedded = with_ids
# Split 'source_id' column into 'text_unit_ids'
nodes = text_split_df(
nodes, column="source_id", separator=",", to="text_unit_ids"
).drop(columns=["source_id"])
# Embed name if not skipped
if not skip_name_embedding:
embedded = await text_embed_df(
embedded,
nodes = await text_embed_df(
nodes,
callbacks,
cache,
column="name",
@@ -80,11 +75,11 @@ async def create_final_entities(
embedding_name="entity_name",
)
# Embed description if not skipped
if not skip_description_embedding:
# description embedding is a concat of the name + description, so we'll create a temporary column
embedded["name_description"] = embedded["name"] + ":" + embedded["description"]
embedded = await text_embed_df(
embedded,
# Concatenate 'name' and 'description' and embed
nodes = await text_embed_df(
nodes.assign(name_description=nodes["name"] + ":" + nodes["description"]),
callbacks,
cache,
column="name_description",
@@ -92,14 +87,10 @@ async def create_final_entities(
to="description_embedding",
embedding_name="entity_name_description",
)
embedded.drop(columns=["name_description"], inplace=True)
is_using_vector_store = (
description_text_embed.get("strategy", {}).get("vector_store", None)
is not None
)
if not is_using_vector_store:
embedded = embedded[embedded["description_embedding"].notna()].reset_index(
drop=True
)
return create_verb_result(cast(Table, embedded))
# Drop rows with NaN 'description_embedding' if not using vector store
if not description_text_embed.get("strategy", {}).get("vector_store"):
nodes = nodes.loc[nodes["description_embedding"].notna()]
# Return final result
return create_verb_result(cast(Table, nodes))