mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 00:57:23 +08:00
205 lines
5.0 KiB
Plaintext
205 lines
5.0 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2b5f42c2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import time\n",
|
|
"\n",
|
|
"import pandas as pd"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "6b2aff68",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"PROJECT_DIRECTORY = \"<PROJECT_DIRECTORY>\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "7abffd6f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Read entities.parquet from the project's output folder and report its row count.\n",
"entities = pd.read_parquet(\n",
"    f\"{PROJECT_DIRECTORY}/output/entities.parquet\"\n",
")\n",
"print(entities.shape[0])\n",
"entities.head()"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "6a8d5897",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Read relationships.parquet from the project's output folder and report its row count.\n",
"relationships = pd.read_parquet(\n",
"    f\"{PROJECT_DIRECTORY}/output/relationships.parquet\"\n",
")\n",
"print(relationships.shape[0])\n",
"relationships.head()"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "00236b6b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Read communities.parquet from the project's output folder and report its row count.\n",
"communities = pd.read_parquet(\n",
"    f\"{PROJECT_DIRECTORY}/output/communities.parquet\"\n",
")\n",
"print(communities.shape[0])\n",
"communities.head()"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "dc324f92",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"from graphrag.index.operations.create_graph import create_graph\n",
"\n",
"# Build the graph from the relationships table, passing \"weight\" as the\n",
"# only edge attribute, then list the resulting nodes.\n",
"graph = create_graph(\n",
"    relationships,\n",
"    edge_attr=[\"weight\"],\n",
")\n",
"print(graph.nodes)"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8eb41087",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"from graphrag.index.operations.embed_graph.embed_node2vec import embed_node2vec\n",
"from graphrag.index.operations.layout_graph.umap import run as run_umap\n",
"\n",
"# Time the node2vec embedding with a monotonic, high-resolution clock:\n",
"# time.time() follows the wall clock and can jump if the system time changes.\n",
"start = time.perf_counter()\n",
"n2v = embed_node2vec(graph)\n",
"end = time.perf_counter()\n",
"print(\"n2v time:\", end - start)\n",
"\n",
"# Pair each node with its embedding so the layout step can look it up.\n",
"n_embeddings = dict(zip(n2v.nodes, n2v.embeddings))\n",
"\n",
"# Run the UMAP layout over the embeddings (the third argument is an identity\n",
"# function) and keep each point's label and x/y position.\n",
"n_umap = run_umap(graph, n_embeddings, lambda x: x)\n",
"n_umap_list = [{\"title\": p.label, \"x_n2v\": p.x, \"y_n2v\": p.y} for p in n_umap]\n",
"\n",
"n_df = pd.DataFrame(n_umap_list)\n",
"n_df.head()"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ae7b6da1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"from graphrag.config.models.embed_graph_config import EmbedGraphConfig\n",
"from graphrag.index.operations.embed_graph.embed_graph import embed_graph\n",
"\n",
"# Time embed_graph (called with a default EmbedGraphConfig) using a monotonic,\n",
"# high-resolution clock: time.time() follows the wall clock and can jump if\n",
"# the system time changes.\n",
"start = time.perf_counter()\n",
"pipeline_embeddings = embed_graph(graph, entities, communities, EmbedGraphConfig())\n",
"end = time.perf_counter()\n",
"print(\"gee time:\", end - start)\n",
"\n",
"# Run the same UMAP layout used for the node2vec embeddings and keep each\n",
"# point's label and x/y position under distinct column names.\n",
"p_umap = run_umap(graph, pipeline_embeddings, lambda x: x)\n",
"p_umap_list = [{\"title\": p.label, \"x_gee_p\": p.x, \"y_gee_p\": p.y} for p in p_umap]\n",
"\n",
"p_df = pd.DataFrame(p_umap_list)\n",
"p_df.head()"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ba9ab829",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Attach both layouts to the entity table, matching rows on the entity title.\n",
"merged_entities = entities.merge(n_df, on=\"title\", how=\"left\").merge(\n",
"    p_df, on=\"title\", how=\"left\"\n",
")\n",
"\n",
"# Expand each community's entity_ids list into one row per entity id.\n",
"community_labels = communities[[\"community\", \"entity_ids\", \"level\"]].explode(\n",
"    \"entity_ids\"\n",
")\n",
"\n",
"# Label every entity with its communities, then keep only the level-0 rows.\n",
"merged_entities = merged_entities.merge(\n",
"    community_labels, left_on=\"id\", right_on=\"entity_ids\", how=\"left\"\n",
")\n",
"merged_entities = merged_entities.loc[merged_entities[\"level\"] == 0]"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "fde33384",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
"# Draw one scatter plot per layout with identical styling so the two can be\n",
"# compared side by side; points are coloured by their \"community\" value.\n",
"# The settings live in a single call so a styling change applies to both plots.\n",
"for x_col, y_col, plot_title in [\n",
"    (\"x_n2v\", \"y_n2v\", \"n2v\"),\n",
"    (\"x_gee_p\", \"y_gee_p\", \"workflow\"),\n",
"]:\n",
"    merged_entities.plot(\n",
"        x=x_col,\n",
"        y=y_col,\n",
"        s=5,\n",
"        kind=\"scatter\",\n",
"        c=\"community\",\n",
"        cmap=\"tab20\",\n",
"        title=plot_title,\n",
"        figsize=(12, 10),\n",
"        xticks=[],\n",
"        yticks=[],\n",
"        xlabel=\"\",\n",
"        ylabel=\"\",\n",
"    )"
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "graphrag",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.10"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|