This commit is contained in:
Wang Bojun 2024-04-11 10:40:32 +08:00
parent 667baea09b
commit ddb0d8a2fa
5 changed files with 997 additions and 0 deletions

72
emb_search.ipynb Normal file
View File

@ -0,0 +1,72 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"id": "bcda7dd6-621a-4018-806a-e3a4b7c7b5b2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Vector dimension: 384\n",
"First element of the vector: -0.027828959748148918\n"
]
}
],
"source": [
"import redis\n",
"import numpy as np\n",
"\n",
"# 初始化Redis客户端\n",
"r = redis.Redis(host='localhost', port=6379, db=3) # 假设向量保存在db=3中\n",
"\n",
"# 获取一个向量的键\n",
"keys = r.keys('*')\n",
"if not keys:\n",
" print(\"No keys found in Redis.\")\n",
"else:\n",
" # 读取第一个键的向量\n",
" key = keys[0]\n",
" vector_data = r.get(key)\n",
"\n",
" # 将二进制数据转换为numpy数组\n",
" vector = np.frombuffer(vector_data, dtype=np.float32)\n",
"\n",
" # 打印向量维度\n",
" print(f\"Vector dimension: {vector.shape[0]}\")\n",
" print(f\"First element of the vector: {vector[0]}\") # 举例打印第一个元素\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d55ce965-c121-45dc-8570-a34f1edbefa1",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

277
embedding.ipynb Normal file

File diff suppressed because one or more lines are too long

115
pipline.ipynb Normal file
View File

@ -0,0 +1,115 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e03ab29a-7cfb-44e9-a045-34d68f1e94bb",
"metadata": {},
"source": [
"# 假设用户指定请求某篇文章。\n",
"## 我们通过ID 获取该文献的简介与PDF文件。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "81b9f218-85c2-4982-8f19-ee3849a9e31a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'title': 'Constraining effective equation of state in $f(Q,T)$ gravity', 'summary': \" New high-precision observations are now possible to constrain different\\ngravity theories. To examine the accelerated expansion of the Universe, we used\\nthe newly proposed $f(Q,T)$ gravity, where $Q$ is the non-metricity, and $T$ is\\nthe trace of the energy-momentum tensor. The investigation is carried out using\\na parameterized effective equation of state with two parameters, $m$ and $n$.\\nWe have also considered the linear form of $f(Q,T)= Q+bT$, where $b$ is\\nconstant. By constraining the model with the recently published 1048 Pantheon\\nsample, we were able to find the best fitting values for the parameters $b$,\\n$m$, and $n$. The model appears to be in good agreement with the observations.\\nFinally, we analyzed the behavior of the deceleration parameter and equation of\\nstate parameter. The results support the feasibility of $f(Q,T)$ as a promising\\ntheory of gravity, illuminating a new direction towards explaining the\\nUniverse's dark sector.\\n\", 'pdf_url': 'http://arxiv.org/pdf/2104.00001v2.pdf'}\n"
]
}
],
"source": [
"import requests\n",
"from xml.etree import ElementTree\n",
"\n",
"def get_arxiv_paper_info(arxiv_id):\n",
" # 构建请求URL\n",
" url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'\n",
" \n",
" # 发送GET请求\n",
" response = requests.get(url)\n",
" \n",
" # 解析返回的XML数据\n",
" root = ElementTree.fromstring(response.content)\n",
" \n",
" # 初始化结果字典\n",
" paper_info = {'title': '', 'summary': '', 'pdf_url': ''}\n",
" \n",
" # 提取文章信息\n",
" for entry in root.findall('{http://www.w3.org/2005/Atom}entry'):\n",
" paper_info['title'] = entry.find('{http://www.w3.org/2005/Atom}title').text\n",
" paper_info['summary'] = entry.find('{http://www.w3.org/2005/Atom}summary').text\n",
" # 查找并提取PDF链接\n",
" for link in entry.findall('{http://www.w3.org/2005/Atom}link'):\n",
" if link.attrib.get('title') == 'pdf':\n",
" paper_info['pdf_url'] = link.attrib.get('href') + '.pdf'\n",
" \n",
" return paper_info\n",
"\n",
"# 示例使用arXiv ID获取文章信息\n",
"arxiv_id = '2104.00001' # 这里替换成实际的arXiv ID\n",
"paper_info = get_arxiv_paper_info(arxiv_id)\n",
"print(paper_info)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a4e3fe2-f965-4030-b175-14ef02a6863b",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import fitz # PyMuPDF\n",
"\n",
"def download_pdf(url, filename):\n",
" response = requests.get(url)\n",
" with open(filename, 'wb') as f:\n",
" f.write(response.content)\n",
"\n",
"def parse_pdf(filename):\n",
" # 打开PDF文件\n",
" doc = fitz.open(filename)\n",
" \n",
" # 遍历每一页\n",
" for page_num in range(len(doc)):\n",
" page = doc.load_page(page_num)\n",
" print(page.get_text())\n",
"\n",
"# 使用上面获得的PDF URL下载PDF\n",
"pdf_url = paper_info['pdf_url']\n",
"filename = 'downloaded_paper.pdf'\n",
"download_pdf(pdf_url, filename)\n",
"\n",
"# 解析并打印PDF内容\n",
"parse_pdf(filename)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

50
step1/check_ids.py Normal file
View File

@ -0,0 +1,50 @@
import requests
from datetime import datetime
from pymongo import MongoClient
def is_paper_exists(arxiv_id, timeout=30):
    """Check whether arXiv serves an abstract page for the given identifier.

    :param arxiv_id: Identifier appended to ``https://arxiv.org/abs/``.
    :param timeout: Seconds to wait for the server before giving up, so a
        stalled connection cannot block the caller indefinitely.
    :return: ``False`` when the server answers 404, ``True`` when the request
        succeeds.
    :raises requests.HTTPError: If the server answers with any other 4xx/5xx
        status (for example rate limiting or a server error).
    :raises requests.RequestException: On timeouts and connection failures.
    """
    url = f"https://arxiv.org/abs/{arxiv_id}"
    response = requests.get(url, timeout=timeout)
    if response.status_code == 404:
        return False
    # Other error statuses say nothing about whether the paper exists;
    # reporting them as "exists" would silently skew any search built on this
    # check, so surface them to the caller instead.
    response.raise_for_status()
    return True
def find_paper_count(year, month, lower_bound, upper_bound, exists=None):
    """Binary-search the highest existing arXiv sequence number in a month.

    Identifiers are built as ``YYMM.NNNN`` (through December 2014) or
    ``YYMM.NNNNN`` (from January 2015 on) and probed with ``exists``.

    NOTE(review): the search assumes existence is monotonic in the sequence
    number (every number up to the last one resolves) -- confirm this holds
    for the months being scanned.

    :param year: Four-digit calendar year.
    :param month: Month number, 1-12.
    :param lower_bound: Smallest sequence number to consider.
    :param upper_bound: Largest sequence number to consider.
    :param exists: Optional callable taking an arXiv ID string and returning a
        truthy value when the paper exists. Defaults to ``is_paper_exists``.
    :return: The largest sequence number for which ``exists`` was truthy, or
        ``lower_bound - 1`` when none in the range exists.
    """
    if exists is None:
        exists = is_paper_exists

    # arXiv sequence numbers are zero-padded to 4 digits through 1412 and to
    # 5 digits starting with 1501 (January 2015 included).
    width = 5 if year >= 2015 else 4
    prefix = f"{year % 100:02}{month:02}"

    left, right = lower_bound, upper_bound
    while left <= right:
        mid = (left + right) // 2
        arxiv_id = f"{prefix}.{mid:0{width}}"
        if exists(arxiv_id):
            left = mid + 1
        else:
            right = mid - 1
    return right
# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client.arxiv
collection = db.papers_per_month

# Loop over each month and store the results in MongoDB
for year in range(2008, 2025):
    for month in range(1, 13):
        # Stop after April 2024, the last month covered by this scan
        if year == 2024 and month > 4:
            break
        lower_bound = 1
        # Sequence numbers have 4 digits through 1412 and 5 digits from 1501
        # on, so January 2015 already needs the larger search range.
        upper_bound = 9999 if year < 2015 else 99999
        count = find_paper_count(year, month, lower_bound, upper_bound)
        print(f"Number of papers uploaded in {year}-{month:02}: {count}")
        # Store the data in MongoDB
        document = {
            "year": year,
            "month": month,
            "papers_uploaded": count
        }
        # Upsert keyed on (year, month) so re-running the script refreshes the
        # month's record instead of piling up duplicate documents.
        collection.update_one(
            {"year": year, "month": month},
            {"$set": document},
            upsert=True,
        )
        # Optional: Add a delay to avoid overloading the server
        # (enabling this also requires `import time` at the top of the file)
        # time.sleep(1)
print("Data has been stored to MongoDB.")

483
unzip.ipynb Normal file
View File

@ -0,0 +1,483 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "d73cfd74-d726-465c-9a2b-d459a148fe49",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracted all files to /4t/arxiv/1907\n",
"Deleted zip file /4t/zip/1907.zip\n",
"Extracted all files to /4t/arxiv/1906\n",
"Deleted zip file /4t/zip/1906.zip\n",
"Extracted all files to /4t/arxiv/1905\n",
"Deleted zip file /4t/zip/1905.zip\n",
"Extracted all files to /4t/arxiv/1904\n",
"Deleted zip file /4t/zip/1904.zip\n",
"Extracted all files to /4t/arxiv/1903\n",
"Deleted zip file /4t/zip/1903.zip\n",
"Extracted all files to /4t/arxiv/1902\n",
"Deleted zip file /4t/zip/1902.zip\n",
"Extracted all files to /4t/arxiv/1901\n",
"Deleted zip file /4t/zip/1901.zip\n",
"Extracted all files to /4t/arxiv/1812\n",
"Deleted zip file /4t/zip/1812.zip\n",
"Extracted all files to /4t/arxiv/1811\n",
"Deleted zip file /4t/zip/1811.zip\n",
"Extracted all files to /4t/arxiv/1810\n",
"Deleted zip file /4t/zip/1810.zip\n",
"Extracted all files to /4t/arxiv/1809\n",
"Deleted zip file /4t/zip/1809.zip\n",
"Extracted all files to /4t/arxiv/1808\n",
"Deleted zip file /4t/zip/1808.zip\n",
"Extracted all files to /4t/arxiv/1807\n",
"Deleted zip file /4t/zip/1807.zip\n",
"Extracted all files to /4t/arxiv/1806\n",
"Deleted zip file /4t/zip/1806.zip\n",
"Extracted all files to /4t/arxiv/1805\n",
"Deleted zip file /4t/zip/1805.zip\n",
"Extracted all files to /4t/arxiv/1804\n",
"Deleted zip file /4t/zip/1804.zip\n",
"Extracted all files to /4t/arxiv/1803\n",
"Deleted zip file /4t/zip/1803.zip\n",
"Extracted all files to /4t/arxiv/1802\n",
"Deleted zip file /4t/zip/1802.zip\n",
"Extracted all files to /4t/arxiv/1801\n",
"Deleted zip file /4t/zip/1801.zip\n",
"Extracted all files to /4t/arxiv/1712\n",
"Deleted zip file /4t/zip/1712.zip\n",
"Extracted all files to /4t/arxiv/1711\n",
"Deleted zip file /4t/zip/1711.zip\n",
"Extracted all files to /4t/arxiv/1710\n",
"Deleted zip file /4t/zip/1710.zip\n",
"Extracted all files to /4t/arxiv/1709\n",
"Deleted zip file /4t/zip/1709.zip\n",
"Extracted all files to /4t/arxiv/1708\n",
"Deleted zip file /4t/zip/1708.zip\n",
"Extracted all files to /4t/arxiv/1707\n",
"Deleted zip file /4t/zip/1707.zip\n",
"Extracted all files to /4t/arxiv/1706\n",
"Deleted zip file /4t/zip/1706.zip\n",
"Extracted all files to /4t/arxiv/1705\n",
"Deleted zip file /4t/zip/1705.zip\n",
"Extracted all files to /4t/arxiv/1704\n",
"Deleted zip file /4t/zip/1704.zip\n",
"Extracted all files to /4t/arxiv/1703\n",
"Deleted zip file /4t/zip/1703.zip\n",
"Extracted all files to /4t/arxiv/1702\n",
"Deleted zip file /4t/zip/1702.zip\n",
"Extracted all files to /4t/arxiv/1701\n",
"Deleted zip file /4t/zip/1701.zip\n",
"Extracted all files to /4t/arxiv/1612\n",
"Deleted zip file /4t/zip/1612.zip\n",
"Extracted all files to /4t/arxiv/1611\n",
"Deleted zip file /4t/zip/1611.zip\n",
"Extracted all files to /4t/arxiv/1610\n",
"Deleted zip file /4t/zip/1610.zip\n",
"Extracted all files to /4t/arxiv/1609\n",
"Deleted zip file /4t/zip/1609.zip\n",
"Extracted all files to /4t/arxiv/1608\n",
"Deleted zip file /4t/zip/1608.zip\n",
"Extracted all files to /4t/arxiv/1607\n",
"Deleted zip file /4t/zip/1607.zip\n",
"Extracted all files to /4t/arxiv/1606\n",
"Deleted zip file /4t/zip/1606.zip\n",
"Extracted all files to /4t/arxiv/1605\n",
"Deleted zip file /4t/zip/1605.zip\n",
"Extracted all files to /4t/arxiv/1604\n",
"Deleted zip file /4t/zip/1604.zip\n",
"Extracted all files to /4t/arxiv/1603\n",
"Deleted zip file /4t/zip/1603.zip\n",
"Extracted all files to /4t/arxiv/1602\n",
"Deleted zip file /4t/zip/1602.zip\n",
"Extracted all files to /4t/arxiv/1601\n",
"Deleted zip file /4t/zip/1601.zip\n",
"Extracted all files to /4t/arxiv/1512\n",
"Deleted zip file /4t/zip/1512.zip\n",
"Extracted all files to /4t/arxiv/1511\n",
"Deleted zip file /4t/zip/1511.zip\n",
"Extracted all files to /4t/arxiv/1510\n",
"Deleted zip file /4t/zip/1510.zip\n",
"Extracted all files to /4t/arxiv/1509\n",
"Deleted zip file /4t/zip/1509.zip\n",
"Extracted all files to /4t/arxiv/1508\n",
"Deleted zip file /4t/zip/1508.zip\n",
"Extracted all files to /4t/arxiv/1507\n",
"Deleted zip file /4t/zip/1507.zip\n",
"Extracted all files to /4t/arxiv/1506\n",
"Deleted zip file /4t/zip/1506.zip\n",
"Extracted all files to /4t/arxiv/1505\n",
"Deleted zip file /4t/zip/1505.zip\n",
"Extracted all files to /4t/arxiv/1504\n",
"Deleted zip file /4t/zip/1504.zip\n",
"Extracted all files to /4t/arxiv/1503\n",
"Deleted zip file /4t/zip/1503.zip\n",
"Extracted all files to /4t/arxiv/1502\n",
"Deleted zip file /4t/zip/1502.zip\n",
"Extracted all files to /4t/arxiv/1501\n",
"Deleted zip file /4t/zip/1501.zip\n",
"Extracted all files to /4t/arxiv/1412\n",
"Deleted zip file /4t/zip/1412.zip\n",
"Extracted all files to /4t/arxiv/1411\n",
"Deleted zip file /4t/zip/1411.zip\n",
"Extracted all files to /4t/arxiv/1410\n",
"Deleted zip file /4t/zip/1410.zip\n",
"Extracted all files to /4t/arxiv/1409\n",
"Deleted zip file /4t/zip/1409.zip\n",
"Extracted all files to /4t/arxiv/1408\n",
"Deleted zip file /4t/zip/1408.zip\n",
"Extracted all files to /4t/arxiv/1407\n",
"Deleted zip file /4t/zip/1407.zip\n",
"Extracted all files to /4t/arxiv/1406\n",
"Deleted zip file /4t/zip/1406.zip\n",
"Extracted all files to /4t/arxiv/1405\n",
"Deleted zip file /4t/zip/1405.zip\n",
"Extracted all files to /4t/arxiv/1404\n",
"Deleted zip file /4t/zip/1404.zip\n",
"Extracted all files to /4t/arxiv/1403\n",
"Deleted zip file /4t/zip/1403.zip\n",
"Extracted all files to /4t/arxiv/1402\n",
"Deleted zip file /4t/zip/1402.zip\n",
"Extracted all files to /4t/arxiv/1401\n",
"Deleted zip file /4t/zip/1401.zip\n",
"Extracted all files to /4t/arxiv/1312\n",
"Deleted zip file /4t/zip/1312.zip\n",
"Extracted all files to /4t/arxiv/1311\n",
"Deleted zip file /4t/zip/1311.zip\n",
"Extracted all files to /4t/arxiv/1310\n",
"Deleted zip file /4t/zip/1310.zip\n",
"Extracted all files to /4t/arxiv/1309\n",
"Deleted zip file /4t/zip/1309.zip\n",
"Extracted all files to /4t/arxiv/1308\n",
"Deleted zip file /4t/zip/1308.zip\n",
"Extracted all files to /4t/arxiv/1307\n",
"Deleted zip file /4t/zip/1307.zip\n",
"Extracted all files to /4t/arxiv/1306\n",
"Deleted zip file /4t/zip/1306.zip\n",
"Extracted all files to /4t/arxiv/1305\n",
"Deleted zip file /4t/zip/1305.zip\n",
"Extracted all files to /4t/arxiv/1304\n",
"Deleted zip file /4t/zip/1304.zip\n",
"Extracted all files to /4t/arxiv/1303\n",
"Deleted zip file /4t/zip/1303.zip\n",
"Extracted all files to /4t/arxiv/1302\n",
"Deleted zip file /4t/zip/1302.zip\n",
"Extracted all files to /4t/arxiv/1301\n",
"Deleted zip file /4t/zip/1301.zip\n",
"Extracted all files to /4t/arxiv/1212\n",
"Deleted zip file /4t/zip/1212.zip\n",
"Extracted all files to /4t/arxiv/1211\n",
"Deleted zip file /4t/zip/1211.zip\n",
"Extracted all files to /4t/arxiv/1210\n",
"Deleted zip file /4t/zip/1210.zip\n",
"Extracted all files to /4t/arxiv/1209\n",
"Deleted zip file /4t/zip/1209.zip\n",
"Extracted all files to /4t/arxiv/1208\n",
"Deleted zip file /4t/zip/1208.zip\n",
"Extracted all files to /4t/arxiv/1207\n",
"Deleted zip file /4t/zip/1207.zip\n",
"Extracted all files to /4t/arxiv/1206\n",
"Deleted zip file /4t/zip/1206.zip\n",
"Extracted all files to /4t/arxiv/1205\n",
"Deleted zip file /4t/zip/1205.zip\n",
"Extracted all files to /4t/arxiv/1204\n",
"Deleted zip file /4t/zip/1204.zip\n",
"Extracted all files to /4t/arxiv/1203\n",
"Deleted zip file /4t/zip/1203.zip\n",
"Extracted all files to /4t/arxiv/1202\n",
"Deleted zip file /4t/zip/1202.zip\n",
"Extracted all files to /4t/arxiv/1201\n",
"Deleted zip file /4t/zip/1201.zip\n",
"Extracted all files to /4t/arxiv/1112\n",
"Deleted zip file /4t/zip/1112.zip\n",
"Extracted all files to /4t/arxiv/1111\n",
"Deleted zip file /4t/zip/1111.zip\n",
"Extracted all files to /4t/arxiv/1110\n",
"Deleted zip file /4t/zip/1110.zip\n",
"Extracted all files to /4t/arxiv/1109\n",
"Deleted zip file /4t/zip/1109.zip\n",
"Extracted all files to /4t/arxiv/1108\n",
"Deleted zip file /4t/zip/1108.zip\n",
"Extracted all files to /4t/arxiv/1107\n",
"Deleted zip file /4t/zip/1107.zip\n",
"Extracted all files to /4t/arxiv/1106\n",
"Deleted zip file /4t/zip/1106.zip\n",
"Extracted all files to /4t/arxiv/1105\n",
"Deleted zip file /4t/zip/1105.zip\n",
"Extracted all files to /4t/arxiv/1104\n",
"Deleted zip file /4t/zip/1104.zip\n",
"Extracted all files to /4t/arxiv/1103\n",
"Deleted zip file /4t/zip/1103.zip\n",
"Extracted all files to /4t/arxiv/1102\n",
"Deleted zip file /4t/zip/1102.zip\n",
"Extracted all files to /4t/arxiv/1101\n",
"Deleted zip file /4t/zip/1101.zip\n",
"Extracted all files to /4t/arxiv/1012\n",
"Deleted zip file /4t/zip/1012.zip\n",
"Extracted all files to /4t/arxiv/1011\n",
"Deleted zip file /4t/zip/1011.zip\n",
"Extracted all files to /4t/arxiv/1010\n",
"Deleted zip file /4t/zip/1010.zip\n",
"Extracted all files to /4t/arxiv/1009\n",
"Deleted zip file /4t/zip/1009.zip\n",
"Extracted all files to /4t/arxiv/1008\n",
"Deleted zip file /4t/zip/1008.zip\n",
"Extracted all files to /4t/arxiv/1007\n",
"Deleted zip file /4t/zip/1007.zip\n",
"Extracted all files to /4t/arxiv/1006\n",
"Deleted zip file /4t/zip/1006.zip\n",
"Extracted all files to /4t/arxiv/1005\n",
"Deleted zip file /4t/zip/1005.zip\n",
"Extracted all files to /4t/arxiv/1004\n",
"Deleted zip file /4t/zip/1004.zip\n",
"Extracted all files to /4t/arxiv/1003\n",
"Deleted zip file /4t/zip/1003.zip\n",
"Extracted all files to /4t/arxiv/1002\n",
"Deleted zip file /4t/zip/1002.zip\n",
"Extracted all files to /4t/arxiv/1001\n",
"Deleted zip file /4t/zip/1001.zip\n",
"Extracted all files to /4t/arxiv/0912\n",
"Deleted zip file /4t/zip/0912.zip\n",
"Extracted all files to /4t/arxiv/0911\n",
"Deleted zip file /4t/zip/0911.zip\n",
"Extracted all files to /4t/arxiv/0910\n",
"Deleted zip file /4t/zip/0910.zip\n",
"Extracted all files to /4t/arxiv/0909\n",
"Deleted zip file /4t/zip/0909.zip\n",
"Extracted all files to /4t/arxiv/0908\n",
"Deleted zip file /4t/zip/0908.zip\n",
"Extracted all files to /4t/arxiv/0907\n",
"Deleted zip file /4t/zip/0907.zip\n",
"Extracted all files to /4t/arxiv/0906\n",
"Deleted zip file /4t/zip/0906.zip\n",
"Extracted all files to /4t/arxiv/0905\n",
"Deleted zip file /4t/zip/0905.zip\n",
"Extracted all files to /4t/arxiv/0904\n",
"Deleted zip file /4t/zip/0904.zip\n",
"Extracted all files to /4t/arxiv/0903\n",
"Deleted zip file /4t/zip/0903.zip\n",
"Extracted all files to /4t/arxiv/0902\n",
"Deleted zip file /4t/zip/0902.zip\n",
"Extracted all files to /4t/arxiv/0901\n",
"Deleted zip file /4t/zip/0901.zip\n",
"Extracted all files to /4t/arxiv/0812\n",
"Deleted zip file /4t/zip/0812.zip\n",
"Extracted all files to /4t/arxiv/0811\n",
"Deleted zip file /4t/zip/0811.zip\n",
"Extracted all files to /4t/arxiv/0810\n",
"Deleted zip file /4t/zip/0810.zip\n",
"Extracted all files to /4t/arxiv/0809\n",
"Deleted zip file /4t/zip/0809.zip\n",
"Extracted all files to /4t/arxiv/0808\n",
"Deleted zip file /4t/zip/0808.zip\n",
"Extracted all files to /4t/arxiv/0807\n",
"Deleted zip file /4t/zip/0807.zip\n",
"Extracted all files to /4t/arxiv/0806\n",
"Deleted zip file /4t/zip/0806.zip\n",
"Extracted all files to /4t/arxiv/0805\n",
"Deleted zip file /4t/zip/0805.zip\n",
"Extracted all files to /4t/arxiv/0804\n",
"Deleted zip file /4t/zip/0804.zip\n",
"Extracted all files to /4t/arxiv/0803\n",
"Deleted zip file /4t/zip/0803.zip\n",
"Extracted all files to /4t/arxiv/0802\n",
"Deleted zip file /4t/zip/0802.zip\n",
"Extracted all files to /4t/arxiv/0801\n",
"Deleted zip file /4t/zip/0801.zip\n",
"Extracted all files to /4t/arxiv/1908\n",
"Deleted zip file /4t/zip/1908.zip\n",
"Extracted all files to /4t/arxiv/1909\n",
"Deleted zip file /4t/zip/1909.zip\n",
"Extracted all files to /4t/arxiv/1910\n",
"Deleted zip file /4t/zip/1910.zip\n",
"Extracted all files to /4t/arxiv/1911\n",
"Deleted zip file /4t/zip/1911.zip\n",
"Extracted all files to /4t/arxiv/1912\n",
"Deleted zip file /4t/zip/1912.zip\n",
"Extracted all files to /4t/arxiv/2001\n",
"Deleted zip file /4t/zip/2001.zip\n",
"Extracted all files to /4t/arxiv/2002\n",
"Deleted zip file /4t/zip/2002.zip\n",
"Extracted all files to /4t/arxiv/2003\n",
"Deleted zip file /4t/zip/2003.zip\n",
"Extracted all files to /4t/arxiv/2004\n",
"Deleted zip file /4t/zip/2004.zip\n",
"Extracted all files to /4t/arxiv/2005\n",
"Deleted zip file /4t/zip/2005.zip\n",
"Extracted all files to /4t/arxiv/2006\n",
"Deleted zip file /4t/zip/2006.zip\n",
"Extracted all files to /4t/arxiv/2007\n",
"Deleted zip file /4t/zip/2007.zip\n",
"Extracted all files to /4t/arxiv/2008\n",
"Deleted zip file /4t/zip/2008.zip\n",
"Extracted all files to /4t/arxiv/2009\n",
"Deleted zip file /4t/zip/2009.zip\n",
"Extracted all files to /4t/arxiv/2010\n",
"Deleted zip file /4t/zip/2010.zip\n",
"Extracted all files to /4t/arxiv/2011\n",
"Deleted zip file /4t/zip/2011.zip\n",
"Extracted all files to /4t/arxiv/2012\n",
"Deleted zip file /4t/zip/2012.zip\n",
"Extracted all files to /4t/arxiv/2101\n",
"Deleted zip file /4t/zip/2101.zip\n",
"Extracted all files to /4t/arxiv/2102\n",
"Deleted zip file /4t/zip/2102.zip\n",
"Extracted all files to /4t/arxiv/2103\n",
"Deleted zip file /4t/zip/2103.zip\n",
"Extracted all files to /4t/arxiv/2104\n",
"Deleted zip file /4t/zip/2104.zip\n",
"Extracted all files to /4t/arxiv/2105\n",
"Deleted zip file /4t/zip/2105.zip\n",
"Extracted all files to /4t/arxiv/2106\n",
"Deleted zip file /4t/zip/2106.zip\n",
"Extracted all files to /4t/arxiv/2107\n",
"Deleted zip file /4t/zip/2107.zip\n",
"Extracted all files to /4t/arxiv/2108\n",
"Deleted zip file /4t/zip/2108.zip\n",
"Extracted all files to /4t/arxiv/2109\n",
"Deleted zip file /4t/zip/2109.zip\n",
"Extracted all files to /4t/arxiv/2110\n",
"Deleted zip file /4t/zip/2110.zip\n",
"Extracted all files to /4t/arxiv/2111\n",
"Deleted zip file /4t/zip/2111.zip\n",
"Extracted all files to /4t/arxiv/2112\n",
"Deleted zip file /4t/zip/2112.zip\n",
"Extracted all files to /4t/arxiv/2201\n",
"Deleted zip file /4t/zip/2201.zip\n",
"Extracted all files to /4t/arxiv/2202\n",
"Deleted zip file /4t/zip/2202.zip\n",
"Extracted all files to /4t/arxiv/2203\n",
"Deleted zip file /4t/zip/2203.zip\n",
"Extracted all files to /4t/arxiv/2204\n",
"Deleted zip file /4t/zip/2204.zip\n",
"Extracted all files to /4t/arxiv/2205\n",
"Deleted zip file /4t/zip/2205.zip\n",
"Extracted all files to /4t/arxiv/2206\n",
"Deleted zip file /4t/zip/2206.zip\n",
"Extracted all files to /4t/arxiv/2207\n",
"Deleted zip file /4t/zip/2207.zip\n",
"Extracted all files to /4t/arxiv/2208\n",
"Deleted zip file /4t/zip/2208.zip\n",
"Extracted all files to /4t/arxiv/2209\n",
"Deleted zip file /4t/zip/2209.zip\n",
"Extracted all files to /4t/arxiv/2210\n",
"Deleted zip file /4t/zip/2210.zip\n",
"Extracted all files to /4t/arxiv/2211\n",
"Deleted zip file /4t/zip/2211.zip\n",
"Extracted all files to /4t/arxiv/2212\n",
"Deleted zip file /4t/zip/2212.zip\n",
"Extracted all files to /4t/arxiv/2301\n",
"Deleted zip file /4t/zip/2301.zip\n",
"Extracted all files to /4t/arxiv/2302\n",
"Deleted zip file /4t/zip/2302.zip\n",
"Extracted all files to /4t/arxiv/2303\n",
"Deleted zip file /4t/zip/2303.zip\n",
"Extracted all files to /4t/arxiv/2304\n",
"Deleted zip file /4t/zip/2304.zip\n",
"Extracted all files to /4t/arxiv/2305\n",
"Deleted zip file /4t/zip/2305.zip\n",
"Extracted all files to /4t/arxiv/2306\n",
"Deleted zip file /4t/zip/2306.zip\n",
"Extracted all files to /4t/arxiv/2307\n",
"Deleted zip file /4t/zip/2307.zip\n",
"Extracted all files to /4t/arxiv/2308\n",
"Deleted zip file /4t/zip/2308.zip\n",
"Extracted all files to /4t/arxiv/2309\n",
"Deleted zip file /4t/zip/2309.zip\n",
"Extracted all files to /4t/arxiv/2310\n",
"Deleted zip file /4t/zip/2310.zip\n",
"Extracted all files to /4t/arxiv/2311\n",
"Deleted zip file /4t/zip/2311.zip\n",
"Extracted all files to /4t/arxiv/2312\n",
"Deleted zip file /4t/zip/2312.zip\n",
"Extracted all files to /4t/arxiv/2401\n",
"Deleted zip file /4t/zip/2401.zip\n"
]
}
],
"source": [
"import zipfile\n",
"import os\n",
"\n",
"def unzip_file(zip_path, extract_to):\n",
" \"\"\"\n",
" Unzip a zip file to the specified extract path and delete the zip file.\n",
"\n",
" :param zip_path: The path to the zip file.\n",
" :param extract_to: The directory to extract the files to.\n",
" \"\"\"\n",
" # Ensure the target directory exists\n",
" os.makedirs(extract_to, exist_ok=True)\n",
" \n",
" # Open the zip file\n",
" with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n",
" # Extract all the contents into the directory\n",
" zip_ref.extractall(extract_to)\n",
" print(f\"Extracted all files to {extract_to}\")\n",
"\n",
" # Delete the zip file after extracting\n",
" os.remove(zip_path)\n",
" print(f\"Deleted zip file {zip_path}\")\n",
"\n",
"def unzip_all_in_directory(zip_dir, extract_root):\n",
" \"\"\"\n",
" Unzip all zip files in the given directory and delete them after extraction.\n",
"\n",
" :param zip_dir: The directory containing zip files.\n",
" :param extract_root: The root directory to extract the zip files into.\n",
" \"\"\"\n",
" # List all the files in the zip directory\n",
" for file in os.listdir(zip_dir):\n",
" # Check if the file is a zip file\n",
" if file.endswith('.zip'):\n",
" # Construct full zip file path\n",
" zip_file_path = os.path.join(zip_dir, file)\n",
" \n",
" # Construct the destination directory path\n",
" # Assuming that the directory name should be the same as the zip file without the extension\n",
" destination_dir = os.path.join(extract_root, os.path.splitext(file)[0])\n",
" # Unzip the file to the destination directory\n",
" unzip_file(zip_file_path, destination_dir)\n",
"\n",
"# Example usage\n",
"zip_files_directory = '/4t/zip/' # Directory containing zip files\n",
"extract_to_root_directory = '/4t/arxiv/' # Root directory to extract contents\n",
"\n",
"unzip_all_in_directory(zip_files_directory, extract_to_root_directory)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43ab5bfd-0c36-40c0-b688-4eb3489ed551",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}