graphrag/docs/examples_notebooks/gee.ipynb
Nathan Evans db36524a18 Timing
2025-09-10 14:21:23 -07:00

205 lines
5.0 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "2b5f42c2",
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b2aff68",
"metadata": {},
"outputs": [],
"source": [
"# Placeholder value: set this to the project directory whose output/ folder\n",
"# contains the entities, relationships and communities parquet files read below.\n",
"PROJECT_DIRECTORY = \"<PROJECT_DIRECTORY>\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7abffd6f",
"metadata": {},
"outputs": [],
"source": [
"# Load the entities table from the project's output directory.\n",
"entities = pd.read_parquet(\n",
"    f\"{PROJECT_DIRECTORY}/output/entities.parquet\"\n",
")\n",
"print(entities.shape[0])  # number of rows\n",
"entities.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a8d5897",
"metadata": {},
"outputs": [],
"source": [
"# Load the relationships table from the project's output directory.\n",
"relationships = pd.read_parquet(\n",
"    f\"{PROJECT_DIRECTORY}/output/relationships.parquet\"\n",
")\n",
"print(relationships.shape[0])  # number of rows\n",
"relationships.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "00236b6b",
"metadata": {},
"outputs": [],
"source": [
"# Load the communities table from the project's output directory.\n",
"communities = pd.read_parquet(\n",
"    f\"{PROJECT_DIRECTORY}/output/communities.parquet\"\n",
")\n",
"print(communities.shape[0])  # number of rows\n",
"communities.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc324f92",
"metadata": {},
"outputs": [],
"source": [
"from graphrag.index.operations.create_graph import create_graph\n",
"\n",
"# Build the graph from the relationships table, keeping \"weight\" as an edge attribute.\n",
"graph = create_graph(relationships, edge_attr=[\"weight\"])\n",
"\n",
"# Print a size summary instead of the full node listing, which floods the cell\n",
"# output on anything but a tiny graph.\n",
"print(f\"{graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8eb41087",
"metadata": {},
"outputs": [],
"source": [
"from graphrag.index.operations.embed_graph.embed_node2vec import embed_node2vec\n",
"from graphrag.index.operations.layout_graph.umap import run as run_umap\n",
"\n",
"# Time only the node2vec embedding call. perf_counter() is the clock intended\n",
"# for measuring intervals; time.time() is wall-clock time and can jump if the\n",
"# system clock is adjusted mid-run.\n",
"start = time.perf_counter()\n",
"n2v = embed_node2vec(graph)\n",
"end = time.perf_counter()\n",
"print(\"n2v time:\", end - start)\n",
"\n",
"# Pair each entry of n2v.nodes with the embedding at the same position.\n",
"n_embeddings = dict(zip(n2v.nodes, n2v.embeddings))\n",
"\n",
"# Lay the nodes out from their embeddings; each returned position exposes\n",
"# label, x and y.\n",
"# NOTE(review): the lambda is presumably an error callback - confirm against\n",
"# run_umap's signature.\n",
"n_umap = run_umap(graph, n_embeddings, lambda x: x)\n",
"n_umap_list = [{\"title\": p.label, \"x_n2v\": p.x, \"y_n2v\": p.y} for p in n_umap]\n",
"\n",
"# One row per laid-out node, keyed by title for the later merge onto entities.\n",
"n_df = pd.DataFrame(n_umap_list)\n",
"\n",
"n_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae7b6da1",
"metadata": {},
"outputs": [],
"source": [
"from graphrag.config.models.embed_graph_config import EmbedGraphConfig\n",
"from graphrag.index.operations.embed_graph.embed_graph import embed_graph\n",
"\n",
"# Time only the embed_graph call. perf_counter() is the clock intended for\n",
"# measuring intervals; time.time() is wall-clock time and can jump if the\n",
"# system clock is adjusted mid-run.\n",
"start = time.perf_counter()\n",
"pipeline_embeddings = embed_graph(graph, entities, communities, EmbedGraphConfig())\n",
"end = time.perf_counter()\n",
"print(\"gee time:\", end - start)\n",
"\n",
"# run_umap is imported in the node2vec cell above, so that cell must run first.\n",
"# NOTE(review): the lambda is presumably an error callback - confirm against\n",
"# run_umap's signature.\n",
"p_umap = run_umap(graph, pipeline_embeddings, lambda x: x)\n",
"\n",
"p_umap_list = [{\"title\": p.label, \"x_gee_p\": p.x, \"y_gee_p\": p.y} for p in p_umap]\n",
"\n",
"# One row per laid-out node, keyed by title for the later merge onto entities.\n",
"p_df = pd.DataFrame(p_umap_list)\n",
"\n",
"p_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba9ab829",
"metadata": {},
"outputs": [],
"source": [
"# One row per (community, entity id) pair, keeping the community's level.\n",
"community_labels = communities.explode(\"entity_ids\")[\n",
"    [\"community\", \"entity_ids\", \"level\"]\n",
"]\n",
"\n",
"# Attach both sets of layout coordinates by title, then the community labels\n",
"# by entity id, and keep only the rows whose community level is 0.\n",
"merged_entities = (\n",
"    entities.merge(n_df, on=\"title\", how=\"left\")\n",
"    .merge(p_df, on=\"title\", how=\"left\")\n",
"    .merge(community_labels, left_on=\"id\", right_on=\"entity_ids\", how=\"left\")\n",
"    .loc[lambda frame: frame[\"level\"] == 0]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fde33384",
"metadata": {},
"outputs": [],
"source": [
"# Styling shared by both scatter plots; only the coordinate columns and the\n",
"# title differ between them.\n",
"scatter_style = {\n",
"    \"kind\": \"scatter\",\n",
"    \"s\": 5,\n",
"    \"c\": \"community\",\n",
"    \"cmap\": \"tab20\",\n",
"    \"figsize\": (12, 10),\n",
"    \"xticks\": [],\n",
"    \"yticks\": [],\n",
"    \"xlabel\": \"\",\n",
"    \"ylabel\": \"\",\n",
"}\n",
"\n",
"# Layout from the node2vec embeddings, colored by community.\n",
"merged_entities.plot(x=\"x_n2v\", y=\"y_n2v\", title=\"n2v\", **scatter_style)\n",
"# Layout from the embed_graph embeddings, colored by community.\n",
"merged_entities.plot(x=\"x_gee_p\", y=\"y_gee_p\", title=\"workflow\", **scatter_style)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "graphrag",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}