From a456e018642ee58545784d32d111615b0fdeeddb Mon Sep 17 00:00:00 2001 From: gaudyb Date: Thu, 10 Oct 2024 15:39:40 -0600 Subject: [PATCH] lint checks fixed --- .../extract_text_embeddings.ipynb | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/migration_scripts/extract_text_embeddings.ipynb b/migration_scripts/extract_text_embeddings.ipynb index 7ae059e1..73376878 100644 --- a/migration_scripts/extract_text_embeddings.ipynb +++ b/migration_scripts/extract_text_embeddings.ipynb @@ -27,7 +27,7 @@ "import re\n", "from pathlib import Path\n", "\n", - "import pandas as pd\n" + "import pandas as pd" ] }, { @@ -49,10 +49,10 @@ "# value to decide if the original file should maintain or remove the embedding column\n", "REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True\n", "\n", - "#identifier field\n", + "# identifier field\n", "STANDARD_IDENTIFIER_FIELD = \"id\"\n", "\n", - "#new embedding field name\n", + "# new embedding field name\n", "NEW_STANDARD_EMBEDDING_FIELD = \"embedding\"" ] }, @@ -69,13 +69,17 @@ "metadata": {}, "outputs": [], "source": [ - "def extract_text_embedding_from_table(input_path: str, original_embedding_field: str, embeddings_parquet_output_field: str):\n", + "def extract_text_embedding_from_table(\n", + " input_path: str, original_embedding_field: str, embeddings_parquet_output_field: str\n", + "):\n", " \"\"\"Migrate table for embeddings.\"\"\"\n", " original_df = pd.read_parquet(input_path)\n", " no_embeddings_df = original_df.drop(columns=[original_embedding_field])\n", - " \n", + "\n", " embeddings_df = original_df[[STANDARD_IDENTIFIER_FIELD, original_embedding_field]]\n", - " embeddings_df = embeddings_df.rename(columns={original_embedding_field: NEW_STANDARD_EMBEDDING_FIELD}) # type: ignore\n", + " embeddings_df = embeddings_df.rename(\n", + " columns={original_embedding_field: NEW_STANDARD_EMBEDDING_FIELD}\n", + " ) # type: ignore\n", " embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)\n", "\n", " if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True:\n", @@ -95,7 +99,7 @@ "metadata": {}, "outputs": [], "source": [ - "#READ ENTIRE DATA FOLDER LOOKING FOR EMBEDDING COLUMNS IN EACH FILE\n", + "# READ ENTIRE DATA FOLDER LOOKING FOR EMBEDDING COLUMNS IN EACH FILE\n", "folder_path = Path(LOCAL_ROOT)\n", "pattern = r\"^(.*?)(_embedding)$\"\n", "\n", @@ -109,8 +113,12 @@ " if match:\n", " print(f\"Reading {file_path}\")\n", " filename_without_extension = str(file_path.with_suffix(\"\").as_posix())\n", - " embedding_file_name = f\"{filename_without_extension}_{column}s{file_path.suffix}\"\n", - " extract_text_embedding_from_table(str(file_path), column, embedding_file_name)\n" + " embedding_file_name = (\n", + " f\"{filename_without_extension}_{column}s{file_path.suffix}\"\n", + " )\n", + " extract_text_embedding_from_table(\n", + " str(file_path), column, embedding_file_name\n", + " )" ] } ],