lint checks fixed

This commit is contained in:
gaudyb 2024-10-10 15:39:40 -06:00
parent e859260d9d
commit a456e01864

View File

@ -27,7 +27,7 @@
"import re\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n"
"import pandas as pd"
]
},
{
@ -49,10 +49,10 @@
"# value to decide if the original file should maintain or remove the embedding column\n",
"REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True\n",
"\n",
"#identifier field\n",
"# identifier field\n",
"STANDARD_IDENTIFIER_FIELD = \"id\"\n",
"\n",
"#new embedding field name\n",
"# new embedding field name\n",
"NEW_STANDARD_EMBEDDING_FIELD = \"embedding\""
]
},
@ -69,13 +69,17 @@
"metadata": {},
"outputs": [],
"source": [
"def extract_text_embedding_from_table(input_path: str, original_embedding_field: str, embeddings_parquet_output_field: str):\n",
"def extract_text_embedding_from_table(\n",
" input_path: str, original_embedding_field: str, embeddings_parquet_output_field: str\n",
"):\n",
" \"\"\"Migrate table for embeddings.\"\"\"\n",
" original_df = pd.read_parquet(input_path)\n",
" no_embeddings_df = original_df.drop(columns=[original_embedding_field])\n",
" \n",
"\n",
" embeddings_df = original_df[[STANDARD_IDENTIFIER_FIELD, original_embedding_field]]\n",
" embeddings_df = embeddings_df.rename(columns={original_embedding_field: NEW_STANDARD_EMBEDDING_FIELD}) # type: ignore\n",
" embeddings_df = embeddings_df.rename(\n",
" columns={original_embedding_field: NEW_STANDARD_EMBEDDING_FIELD}\n",
" ) # type: ignore\n",
" embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)\n",
"\n",
" if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True:\n",
@ -95,7 +99,7 @@
"metadata": {},
"outputs": [],
"source": [
"#READ ENTIRE DATA FOLDER LOOKING FOR EMBEDDING COLUMNS IN EACH FILE\n",
"# READ ENTIRE DATA FOLDER LOOKING FOR EMBEDDING COLUMNS IN EACH FILE\n",
"folder_path = Path(LOCAL_ROOT)\n",
"pattern = r\"^(.*?)(_embedding)$\"\n",
"\n",
@ -109,8 +113,12 @@
" if match:\n",
" print(f\"Reading {file_path}\")\n",
" filename_without_extension = str(file_path.with_suffix(\"\").as_posix())\n",
" embedding_file_name = f\"{filename_without_extension}_{column}s{file_path.suffix}\"\n",
" extract_text_embedding_from_table(str(file_path), column, embedding_file_name)\n"
" embedding_file_name = (\n",
" f\"{filename_without_extension}_{column}s{file_path.suffix}\"\n",
" )\n",
" extract_text_embedding_from_table(\n",
" str(file_path), column, embedding_file_name\n",
" )"
]
}
],