# Script to extract existing graph embeddings into a new workflow with new lookup tables.

import os
import sys

sys.path.insert(1, '../../')

import pandas as pd

# --- Set values for the index folder to be extracted ---

# set local folder where the index data is located
LOCAL_ROOT = "/Users/gaudy-microsoft/Repositories/unified-copilot/app/data/CHRISTMAS-CAROL"

# value to decide if the original file should maintain or remove the embedding column
REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True

# identifier field
STANDARD_IDENTIFIER_FIELD = "id"

# new embedding field name
NEW_STANDARD_EMBEDDING_FIELD = "embedding"


def extract_graph_embedding_from_table(
    input_path: str,
    embedding_field: str,
    embeddings_parquet_output_field: str,
) -> None:
    """Split one embedding column of a parquet table out into its own parquet file.

    The table at ``input_path`` is read, and a two-column table made of
    ``STANDARD_IDENTIFIER_FIELD`` plus ``embedding_field`` (renamed to
    ``NEW_STANDARD_EMBEDDING_FIELD``) is written to
    ``embeddings_parquet_output_field``. When
    ``REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE`` is truthy, the table at
    ``input_path`` is then rewritten in place without ``embedding_field``.

    Args:
        input_path: Parquet file that holds the identifier and embedding columns.
        embedding_field: Name of the embedding column to extract.
        embeddings_parquet_output_field: Path of the parquet file to write the
            extracted embeddings to (a file path, despite the parameter name).

    Raises:
        ValueError: If the output path resolves to the same file as ``input_path``.
        KeyError: If the identifier or embedding column is missing from the
            table; pandas raises this before any file is written.
    """
    # Writing the embeddings over the source would either drop every other
    # column of the source table or, when the source is rewritten afterwards,
    # discard the embeddings that were just extracted.
    if os.path.abspath(embeddings_parquet_output_field) == os.path.abspath(input_path):
        msg = f"embeddings output path must differ from input path: {input_path}"
        raise ValueError(msg)

    original_df = pd.read_parquet(input_path)

    # Build both frames before writing anything, so a missing column fails
    # without touching the files on disk.
    no_embeddings_df = original_df.drop(columns=[embedding_field])
    embeddings_df = original_df[[STANDARD_IDENTIFIER_FIELD, embedding_field]]
    embeddings_df = embeddings_df.rename(columns={embedding_field: NEW_STANDARD_EMBEDDING_FIELD})  # type: ignore

    embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)

    if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE:
        no_embeddings_df.to_parquet(input_path, index=False)


# --- Embeddings to migrate in file: `create_base_entity_graph.parquet` ---

# `__name__` is "__main__" both when run as a script and inside a Jupyter
# kernel, so the migration still runs there; it is only skipped on import.
if __name__ == "__main__":
    # input file with the embedding column; when the remove flag is set, the
    # table without embeddings is written back to this same path
    INPUT_PATH = f"{LOCAL_ROOT}/create_base_entity_graph.parquet"

    # output file for embeddings
    EMBEDDINGS_PARQUET_OUTPUT_PATH = f"{LOCAL_ROOT}/create_base_entity_graph_embeddings.parquet"

    # embedding field
    EMBEDDING_FIELD = "embeddings"

    extract_graph_embedding_from_table(INPUT_PATH, EMBEDDING_FIELD, EMBEDDINGS_PARQUET_OUTPUT_PATH)
# Script to extract existing text embeddings into a new workflow with new lookup tables.

import os

import pandas as pd

# --- Set values for the index folder to be extracted ---

# set local folder where the index data is located
LOCAL_ROOT = "/Users/gaudy-microsoft/Repositories/unified-copilot/app/data/CHRISTMAS-CAROL"

# value to decide if the original file should maintain or remove the embedding column
REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True

# identifier field
STANDARD_IDENTIFIER_FIELD = "id"

# new embedding field name
NEW_STANDARD_EMBEDDING_FIELD = "embedding"


def extract_text_embedding_from_table(
    input_path: str,
    embedding_field: str,
    embeddings_parquet_output_field: str,
) -> None:
    """Split one embedding column of a parquet table out into its own parquet file.

    The table at ``input_path`` is read, and a two-column table made of
    ``STANDARD_IDENTIFIER_FIELD`` plus ``embedding_field`` (renamed to
    ``NEW_STANDARD_EMBEDDING_FIELD``) is written to
    ``embeddings_parquet_output_field``. When
    ``REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE`` is truthy, the table at
    ``input_path`` is then rewritten in place without ``embedding_field``.

    Args:
        input_path: Parquet file that holds the identifier and embedding columns.
        embedding_field: Name of the embedding column to extract.
        embeddings_parquet_output_field: Path of the parquet file to write the
            extracted embeddings to (a file path, despite the parameter name).

    Raises:
        ValueError: If the output path resolves to the same file as ``input_path``.
        KeyError: If the identifier or embedding column is missing from the
            table; pandas raises this before any file is written.
    """
    # Writing the embeddings over the source would either drop every other
    # column of the source table or, when the source is rewritten afterwards,
    # discard the embeddings that were just extracted.
    if os.path.abspath(embeddings_parquet_output_field) == os.path.abspath(input_path):
        msg = f"embeddings output path must differ from input path: {input_path}"
        raise ValueError(msg)

    original_df = pd.read_parquet(input_path)

    # Build both frames before writing anything, so a missing column fails
    # without touching the files on disk.
    no_embeddings_df = original_df.drop(columns=[embedding_field])
    embeddings_df = original_df[[STANDARD_IDENTIFIER_FIELD, embedding_field]]
    embeddings_df = embeddings_df.rename(columns={embedding_field: NEW_STANDARD_EMBEDDING_FIELD})  # type: ignore

    embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)

    if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE:
        no_embeddings_df.to_parquet(input_path, index=False)


# --- Embeddings to migrate ---
#
# Each entry is (source table, embedding column, embeddings output file), all
# file names relative to LOCAL_ROOT and without the ".parquet" suffix.
#
# Every output file holds exactly one embedding column (renamed to
# NEW_STANDARD_EMBEDDING_FIELD), so each column needs its own output file.
# Tables with several embedding columns previously reused a single
# `<table>_embeddings` output, so each extraction overwrote the previous one,
# and with the remove flag set the overwritten embeddings were already gone
# from the source table. Tables with a single embedding column keep their
# original `<table>_embeddings` output name.
# NOTE(review): confirm that whatever reads these files expects the per-column
# names used for community reports and entities.
EMBEDDING_MIGRATIONS = (
    # `create_final_community_reports.parquet`
    (
        "create_final_community_reports",
        "full_content_embedding",
        "create_final_community_reports_full_content_embedding",
    ),
    (
        "create_final_community_reports",
        "summary_embedding",
        "create_final_community_reports_summary_embedding",
    ),
    (
        "create_final_community_reports",
        "title_embedding",
        "create_final_community_reports_title_embedding",
    ),
    # `create_final_documents.parquet`
    (
        "create_final_documents",
        "raw_content_embedding",
        "create_final_documents_embeddings",
    ),
    # `create_final_entities.parquet`
    (
        "create_final_entities",
        "name_embedding",
        "create_final_entities_name_embedding",
    ),
    (
        "create_final_entities",
        "description_embedding",
        "create_final_entities_description_embedding",
    ),
    # `create_final_text_units.parquet`
    (
        "create_final_text_units",
        "text_embedding",
        "create_final_text_units_embeddings",
    ),
)

# `__name__` is "__main__" both when run as a script and inside a Jupyter
# kernel, so the migration still runs there; it is only skipped on import.
if __name__ == "__main__":
    for table_name, embedding_field, output_name in EMBEDDING_MIGRATIONS:
        extract_text_embedding_from_table(
            f"{LOCAL_ROOT}/{table_name}.parquet",
            embedding_field,
            f"{LOCAL_ROOT}/{output_name}.parquet",
        )