paper_dataset/pipline.ipynb
Wang Bojun ddb0d8a2fa init
2024-04-11 10:40:32 +08:00

116 lines
4.3 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "e03ab29a-7cfb-44e9-a045-34d68f1e94bb",
"metadata": {},
"source": [
"# 假设用户指定请求某篇文章。\n",
"## 我们通过ID 获取该文献的简介与PDF文件。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "81b9f218-85c2-4982-8f19-ee3849a9e31a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'title': 'Constraining effective equation of state in $f(Q,T)$ gravity', 'summary': \" New high-precision observations are now possible to constrain different\\ngravity theories. To examine the accelerated expansion of the Universe, we used\\nthe newly proposed $f(Q,T)$ gravity, where $Q$ is the non-metricity, and $T$ is\\nthe trace of the energy-momentum tensor. The investigation is carried out using\\na parameterized effective equation of state with two parameters, $m$ and $n$.\\nWe have also considered the linear form of $f(Q,T)= Q+bT$, where $b$ is\\nconstant. By constraining the model with the recently published 1048 Pantheon\\nsample, we were able to find the best fitting values for the parameters $b$,\\n$m$, and $n$. The model appears to be in good agreement with the observations.\\nFinally, we analyzed the behavior of the deceleration parameter and equation of\\nstate parameter. The results support the feasibility of $f(Q,T)$ as a promising\\ntheory of gravity, illuminating a new direction towards explaining the\\nUniverse's dark sector.\\n\", 'pdf_url': 'http://arxiv.org/pdf/2104.00001v2.pdf'}\n"
]
}
],
"source": [
"import requests\n",
"from xml.etree import ElementTree\n",
"\n",
"def get_arxiv_paper_info(arxiv_id, timeout=30):\n",
"    \"\"\"Fetch the title, abstract and PDF link of an arXiv paper.\n",
"\n",
"    Args:\n",
"        arxiv_id: arXiv identifier passed as ``id_list``, e.g. '2104.00001'.\n",
"        timeout: Seconds to wait for the arXiv API before giving up.\n",
"\n",
"    Returns:\n",
"        dict with the keys 'title', 'summary' and 'pdf_url'. A value stays\n",
"        an empty string when the feed has no entry or the field is missing.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: if the API answers with an error status.\n",
"    \"\"\"\n",
"    atom = '{http://www.w3.org/2005/Atom}'\n",
"\n",
"    # Build the request URL\n",
"    url = f'http://export.arxiv.org/api/query?id_list={arxiv_id}'\n",
"\n",
"    # A timeout keeps the notebook from hanging on an unresponsive server.\n",
"    response = requests.get(url, timeout=timeout)\n",
"    # Fail loudly instead of feeding an error page to the XML parser.\n",
"    response.raise_for_status()\n",
"\n",
"    # Parse the returned Atom XML\n",
"    root = ElementTree.fromstring(response.content)\n",
"\n",
"    # Result dictionary; defaults are returned when nothing is found\n",
"    paper_info = {'title': '', 'summary': '', 'pdf_url': ''}\n",
"\n",
"    # Extract the paper information (a later entry overwrites an earlier one)\n",
"    for entry in root.findall(atom + 'entry'):\n",
"        # findtext with a default avoids AttributeError on a missing element.\n",
"        paper_info['title'] = entry.findtext(atom + 'title', default='')\n",
"        paper_info['summary'] = entry.findtext(atom + 'summary', default='')\n",
"        # Locate the link marked as the PDF\n",
"        for link in entry.findall(atom + 'link'):\n",
"            href = link.attrib.get('href')\n",
"            if link.attrib.get('title') == 'pdf' and href:\n",
"                # Append '.pdf' only when the link does not already end with it.\n",
"                paper_info['pdf_url'] = href if href.endswith('.pdf') else href + '.pdf'\n",
"\n",
"    return paper_info\n",
"\n",
"# 示例使用arXiv ID获取文章信息\n",
"arxiv_id = '2104.00001' # 这里替换成实际的arXiv ID\n",
"paper_info = get_arxiv_paper_info(arxiv_id)\n",
"print(paper_info)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a4e3fe2-f965-4030-b175-14ef02a6863b",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import fitz # PyMuPDF\n",
"\n",
"def download_pdf(url, filename, timeout=60):\n",
"    \"\"\"Download `url` and write the response body to `filename`.\n",
"\n",
"    Args:\n",
"        url: Address of the file to download.\n",
"        filename: Path of the local file to (over)write in binary mode.\n",
"        timeout: Seconds to wait for the server before giving up.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: if the server answers with an error status, so\n",
"            that an error page is never saved under `filename`.\n",
"    \"\"\"\n",
"    response = requests.get(url, timeout=timeout)\n",
"    # Check the status before touching the disk.\n",
"    response.raise_for_status()\n",
"    with open(filename, 'wb') as f:\n",
"        f.write(response.content)\n",
"\n",
"def parse_pdf(filename):\n",
"    \"\"\"Print the extracted text of every page of the PDF at `filename`.\n",
"\n",
"    Args:\n",
"        filename: Path of the PDF file to open with PyMuPDF.\n",
"    \"\"\"\n",
"    # The context manager closes the document even if extraction fails.\n",
"    with fitz.open(filename) as doc:\n",
"        # Walk through the pages in order\n",
"        for page in doc:\n",
"            print(page.get_text())\n",
"\n",
"# 使用上面获得的PDF URL下载PDF\n",
"pdf_url = paper_info['pdf_url']\n",
"filename = 'downloaded_paper.pdf'\n",
"download_pdf(pdf_url, filename)\n",
"\n",
"# 解析并打印PDF内容\n",
"parse_pdf(filename)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}