mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 09:07:20 +08:00
text and graph embeddings extraction scripts
This commit is contained in:
parent
f5c5876dde
commit
0acea3e737
116
migration_scripts/extract_graph_embeddings.ipynb
Normal file
116
migration_scripts/extract_graph_embeddings.ipynb
Normal file
@ -0,0 +1,116 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## SCRIPT TO EXTRACT EXISTING GRAPH EMBEDDINGS INTO A NEW WORKFLOW WITH NEW LOOKUP TABLES"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.insert(1, '../../')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### SET VALUES FOR THE INDEX FOLDER TO BE EXTRACTED"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# set local folder where the index data is located\n",
|
||||
"LOCAL_ROOT = \"/Users/gaudy-microsoft/Repositories/unified-copilot/app/data/CHRISTMAS-CAROL\"\n",
|
||||
"\n",
|
||||
"# value to decide if the original file should maintain or remove the embedding column\n",
|
||||
"REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True\n",
|
||||
"\n",
|
||||
"#identifier field\n",
|
||||
"STANDARD_IDENTIFIER_FIELD = \"id\"\n",
|
||||
"\n",
|
||||
"#new embedding field name\n",
|
||||
"NEW_STANDARD_EMBEDDING_FIELD = \"embedding\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_graph_embedding_from_table(input_path: str, embedding_field: str, embeddings_parquet_output_field: str):\n",
|
||||
" \"\"\"Migrate table for embeddings.\"\"\"\n",
|
||||
" original_df = pd.read_parquet(input_path)\n",
|
||||
" no_embeddings_df = original_df.drop(columns=[embedding_field])\n",
|
||||
" \n",
|
||||
" embeddings_df = original_df[[STANDARD_IDENTIFIER_FIELD, embedding_field]]\n",
|
||||
" embeddings_df = embeddings_df.rename(columns={embedding_field: NEW_STANDARD_EMBEDDING_FIELD}) # type: ignore\n",
|
||||
" embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)\n",
|
||||
"\n",
|
||||
" if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True:\n",
|
||||
" no_embeddings_df.to_parquet(input_path, index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EMBEDDINGS TO MIGRATE IN FILE: `create_base_entity_graph.parquet`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#input file with the embedding column\n",
|
||||
"INPUT_PATH = f\"{LOCAL_ROOT}/create_base_entity_graph.parquet\"\n",
|
||||
"\n",
|
||||
"#output file for embeddings\n",
|
||||
"EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_base_entity_graph_embeddings.parquet\"\n",
|
||||
"\n",
|
||||
"#output file without embeddings\n",
|
||||
"NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_base_entity_graph.parquet\"\n",
|
||||
"\n",
|
||||
"#embedding field\n",
|
||||
"EMBEDDING_FIELD = \"embeddings\"\n",
|
||||
"\n",
|
||||
"extract_graph_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"version": "3.11.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
271
migration_scripts/extract_text_embeddings.ipynb
Normal file
271
migration_scripts/extract_text_embeddings.ipynb
Normal file
@ -0,0 +1,271 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## SCRIPT TO EXTRACT EXISTING TEXT EMBEDDINGS INTO A NEW WORKFLOW WITH NEW LOOKUP TABLES"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.insert(1, '../../')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### SET VALUES FOR THE INDEX FOLDER TO BE EXTRACTED"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# set local folder where the index data is located\n",
|
||||
"LOCAL_ROOT = \"/Users/gaudy-microsoft/Repositories/unified-copilot/app/data/CHRISTMAS-CAROL\"\n",
|
||||
"\n",
|
||||
"# value to decide if the original file should maintain or remove the embedding column\n",
|
||||
"REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True\n",
|
||||
"\n",
|
||||
"#identifier field\n",
|
||||
"STANDARD_IDENTIFIER_FIELD = \"id\"\n",
|
||||
"\n",
|
||||
"#new embedding field name\n",
|
||||
"NEW_STANDARD_EMBEDDING_FIELD = \"embedding\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_text_embedding_from_table(input_path: str, embedding_field: str, embeddings_parquet_output_field: str):\n",
|
||||
" \"\"\"Migrate table for embeddings.\"\"\"\n",
|
||||
" original_df = pd.read_parquet(input_path)\n",
|
||||
" no_embeddings_df = original_df.drop(columns=[embedding_field])\n",
|
||||
" \n",
|
||||
" embeddings_df = original_df[[STANDARD_IDENTIFIER_FIELD, embedding_field]]\n",
|
||||
" embeddings_df = embeddings_df.rename(columns={embedding_field: NEW_STANDARD_EMBEDDING_FIELD}) # type: ignore\n",
|
||||
" embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)\n",
|
||||
"\n",
|
||||
" if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True:\n",
|
||||
" no_embeddings_df.to_parquet(input_path, index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EMBEDDINGS TO MIGRATE IN FILE: `create_final_community_reports.parquet`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#input file with the embedding column\n",
|
||||
"INPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports.parquet\"\n",
|
||||
"\n",
|
||||
"#output file for embeddings\n",
|
||||
"EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports_embeddings.parquet\"\n",
|
||||
"\n",
|
||||
"#output file without embeddings\n",
|
||||
"NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports.parquet\"\n",
|
||||
"\n",
|
||||
"#embedding field\n",
|
||||
"EMBEDDING_FIELD = \"full_content_embedding\"\n",
|
||||
"\n",
|
||||
"extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#input file with the embedding column\n",
|
||||
"INPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports.parquet\"\n",
|
||||
"\n",
|
||||
"#output file for embeddings\n",
|
||||
"EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports_embeddings.parquet\"\n",
|
||||
"\n",
|
||||
"#output file without embeddings\n",
|
||||
"NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports.parquet\"\n",
|
||||
"\n",
|
||||
"#embedding field\n",
|
||||
"EMBEDDING_FIELD = \"summary_embedding\"\n",
|
||||
"\n",
|
||||
"extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#input file with the embedding column\n",
|
||||
"INPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports.parquet\"\n",
|
||||
"\n",
|
||||
"#output file for embeddings\n",
|
||||
"EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports_embeddings.parquet\"\n",
|
||||
"\n",
|
||||
"#output file without embeddings\n",
|
||||
"NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_community_reports.parquet\"\n",
|
||||
"\n",
|
||||
"#embedding field\n",
|
||||
"EMBEDDING_FIELD = \"title_embedding\"\n",
|
||||
"\n",
|
||||
"extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EMBEDDINGS TO MIGRATE IN FILE: `create_final_documents.parquet`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#input file with the embedding column\n",
|
||||
"INPUT_PATH = f\"{LOCAL_ROOT}/create_final_documents.parquet\"\n",
|
||||
"\n",
|
||||
"#output file for embeddings\n",
|
||||
"EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_documents_embeddings.parquet\"\n",
|
||||
"\n",
|
||||
"#output file without embeddings\n",
|
||||
"NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_documents.parquet\"\n",
|
||||
"\n",
|
||||
"#embedding field\n",
|
||||
"EMBEDDING_FIELD = \"raw_content_embedding\"\n",
|
||||
"\n",
|
||||
"extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EMBEDDINGS TO MIGRATE IN FILE: `create_final_entities.parquet`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#input file with the embedding column\n",
|
||||
"INPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities.parquet\"\n",
|
||||
"\n",
|
||||
"#output file for embeddings\n",
|
||||
"EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities_embeddings.parquet\"\n",
|
||||
"\n",
|
||||
"#output file without embeddings\n",
|
||||
"NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities.parquet\"\n",
|
||||
"\n",
|
||||
"#embedding field\n",
|
||||
"EMBEDDING_FIELD = \"name_embedding\"\n",
|
||||
"\n",
|
||||
"extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#input file with the embedding column\n",
|
||||
"INPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities.parquet\"\n",
|
||||
"\n",
|
||||
"#output file for embeddings\n",
|
||||
"EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities_embeddings.parquet\"\n",
|
||||
"\n",
|
||||
"#output file without embeddings\n",
|
||||
"NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_entities.parquet\"\n",
|
||||
"\n",
|
||||
"#embedding field\n",
|
||||
"EMBEDDING_FIELD = \"description_embedding\"\n",
|
||||
"\n",
|
||||
"extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### EMBEDDINGS TO MIGRATE IN FILE: `create_final_text_units.parquet`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#input file with the embedding column\n",
|
||||
"INPUT_PATH = f\"{LOCAL_ROOT}/create_final_text_units.parquet\"\n",
|
||||
"\n",
|
||||
"#output file for embeddings\n",
|
||||
"EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_text_units_embeddings.parquet\"\n",
|
||||
"\n",
|
||||
"#output file without embeddings\n",
|
||||
"NO_EMBEDDINGS_PARQUET_OUTPUT_PATH = f\"{LOCAL_ROOT}/create_final_text_units.parquet\"\n",
|
||||
"\n",
|
||||
"#embedding field\n",
|
||||
"EMBEDDING_FIELD = \"text_embedding\"\n",
|
||||
"\n",
|
||||
"extract_text_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user