mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 09:07:20 +08:00
Pandas-ify Create Final Entities (#1225)
This commit is contained in:
parent
ce71bcf7fb
commit
737a471d18
@ -0,0 +1,4 @@
|
||||
{
|
||||
"type": "patch",
|
||||
"description": "Apply pandas optimizations to create final entities"
|
||||
}
|
||||
@ -37,12 +37,12 @@ async def create_final_entities(
|
||||
"""All the steps to transform final entities."""
|
||||
table = cast(pd.DataFrame, input.get_input())
|
||||
|
||||
nodes = unpack_graph_df(table, callbacks, "clustered_graph", "nodes")
|
||||
nodes.rename(columns={"label": "name"}, inplace=True)
|
||||
|
||||
nodes = cast(
|
||||
pd.DataFrame,
|
||||
nodes[
|
||||
# Process nodes
|
||||
nodes = (
|
||||
unpack_graph_df(table, callbacks, "clustered_graph", "nodes")
|
||||
.rename(columns={"label": "name"})
|
||||
.loc[
|
||||
:,
|
||||
[
|
||||
"id",
|
||||
"name",
|
||||
@ -51,27 +51,22 @@ async def create_final_entities(
|
||||
"human_readable_id",
|
||||
"graph_embedding",
|
||||
"source_id",
|
||||
]
|
||||
],
|
||||
],
|
||||
]
|
||||
.drop_duplicates(subset="id")
|
||||
)
|
||||
|
||||
# create_base_entity_graph has multiple levels of clustering, which means there are multiple graphs with the same entities
|
||||
# this dedupes the entities so that there is only one of each entity
|
||||
nodes.drop_duplicates(subset="id", inplace=True)
|
||||
nodes = nodes.loc[nodes["name"].notna()]
|
||||
|
||||
# eliminate empty names
|
||||
filtered = cast(pd.DataFrame, nodes[nodes["name"].notna()].reset_index(drop=True))
|
||||
|
||||
with_ids = text_split_df(
|
||||
filtered, column="source_id", separator=",", to="text_unit_ids"
|
||||
)
|
||||
with_ids.drop(columns=["source_id"], inplace=True)
|
||||
|
||||
embedded = with_ids
|
||||
# Split 'source_id' column into 'text_unit_ids'
|
||||
nodes = text_split_df(
|
||||
nodes, column="source_id", separator=",", to="text_unit_ids"
|
||||
).drop(columns=["source_id"])
|
||||
|
||||
# Embed name if not skipped
|
||||
if not skip_name_embedding:
|
||||
embedded = await text_embed_df(
|
||||
embedded,
|
||||
nodes = await text_embed_df(
|
||||
nodes,
|
||||
callbacks,
|
||||
cache,
|
||||
column="name",
|
||||
@ -80,11 +75,11 @@ async def create_final_entities(
|
||||
embedding_name="entity_name",
|
||||
)
|
||||
|
||||
# Embed description if not skipped
|
||||
if not skip_description_embedding:
|
||||
# description embedding is a concat of the name + description, so we'll create a temporary column
|
||||
embedded["name_description"] = embedded["name"] + ":" + embedded["description"]
|
||||
embedded = await text_embed_df(
|
||||
embedded,
|
||||
# Concatenate 'name' and 'description' and embed
|
||||
nodes = await text_embed_df(
|
||||
nodes.assign(name_description=nodes["name"] + ":" + nodes["description"]),
|
||||
callbacks,
|
||||
cache,
|
||||
column="name_description",
|
||||
@ -92,14 +87,10 @@ async def create_final_entities(
|
||||
to="description_embedding",
|
||||
embedding_name="entity_name_description",
|
||||
)
|
||||
embedded.drop(columns=["name_description"], inplace=True)
|
||||
is_using_vector_store = (
|
||||
description_text_embed.get("strategy", {}).get("vector_store", None)
|
||||
is not None
|
||||
)
|
||||
if not is_using_vector_store:
|
||||
embedded = embedded[embedded["description_embedding"].notna()].reset_index(
|
||||
drop=True
|
||||
)
|
||||
|
||||
return create_verb_result(cast(Table, embedded))
|
||||
# Drop rows with NaN 'description_embedding' if not using vector store
|
||||
if not description_text_embed.get("strategy", {}).get("vector_store"):
|
||||
nodes = nodes.loc[nodes["description_embedding"].notna()]
|
||||
|
||||
# Return final result
|
||||
return create_verb_result(cast(Table, nodes))
|
||||
|
||||
Loading…
Reference in New Issue
Block a user