mirror of
https://github.com/microsoft/graphrag.git
synced 2026-01-14 09:07:20 +08:00
lint checks fixed
This commit is contained in:
parent
e859260d9d
commit
a456e01864
@ -27,7 +27,7 @@
|
||||
"import re\n",
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"import pandas as pd\n"
|
||||
"import pandas as pd"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -49,10 +49,10 @@
|
||||
"# value to decide if the original file should maintain or remove the embedding column\n",
|
||||
"REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE = True\n",
|
||||
"\n",
|
||||
"#identifier field\n",
|
||||
"# identifier field\n",
|
||||
"STANDARD_IDENTIFIER_FIELD = \"id\"\n",
|
||||
"\n",
|
||||
"#new embedding field name\n",
|
||||
"# new embedding field name\n",
|
||||
"NEW_STANDARD_EMBEDDING_FIELD = \"embedding\""
|
||||
]
|
||||
},
|
||||
@ -69,13 +69,17 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_text_embedding_from_table(input_path: str, original_embedding_field: str, embeddings_parquet_output_field: str):\n",
|
||||
"def extract_text_embedding_from_table(\n",
|
||||
" input_path: str, original_embedding_field: str, embeddings_parquet_output_field: str\n",
|
||||
"):\n",
|
||||
" \"\"\"Migrate table for embeddings.\"\"\"\n",
|
||||
" original_df = pd.read_parquet(input_path)\n",
|
||||
" no_embeddings_df = original_df.drop(columns=[original_embedding_field])\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" embeddings_df = original_df[[STANDARD_IDENTIFIER_FIELD, original_embedding_field]]\n",
|
||||
" embeddings_df = embeddings_df.rename(columns={original_embedding_field: NEW_STANDARD_EMBEDDING_FIELD}) # type: ignore\n",
|
||||
" embeddings_df = embeddings_df.rename(\n",
|
||||
" columns={original_embedding_field: NEW_STANDARD_EMBEDDING_FIELD}\n",
|
||||
" ) # type: ignore\n",
|
||||
" embeddings_df.to_parquet(embeddings_parquet_output_field, index=False)\n",
|
||||
"\n",
|
||||
" if REMOVE_ORIGINAL_EMBEDDING_COLUMN_IN_SOURCE_FILE is True:\n",
|
||||
@ -95,7 +99,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#READ ENTIRE DATA FOLDER LOOKING FOR EMBEDDING COLUMNS IN EACH FILE\n",
|
||||
"# READ ENTIRE DATA FOLDER LOOKING FOR EMBEDDING COLUMNS IN EACH FILE\n",
|
||||
"folder_path = Path(LOCAL_ROOT)\n",
|
||||
"pattern = r\"^(.*?)(_embedding)$\"\n",
|
||||
"\n",
|
||||
@ -109,8 +113,12 @@
|
||||
" if match:\n",
|
||||
" print(f\"Reading {file_path}\")\n",
|
||||
" filename_without_extension = str(file_path.with_suffix(\"\").as_posix())\n",
|
||||
" embedding_file_name = f\"{filename_without_extension}_{column}s{file_path.suffix}\"\n",
|
||||
" extract_text_embedding_from_table(str(file_path), column, embedding_file_name)\n"
|
||||
" embedding_file_name = (\n",
|
||||
" f\"{filename_without_extension}_{column}s{file_path.suffix}\"\n",
|
||||
" )\n",
|
||||
" extract_text_embedding_from_table(\n",
|
||||
" str(file_path), column, embedding_file_name\n",
|
||||
" )"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
Loading…
Reference in New Issue
Block a user