add thumbnails for papers, which apparently ppl like

This commit is contained in:
Andrej Karpathy 2022-02-13 18:30:14 -08:00
parent f980c7947a
commit d7a303b410
4 changed files with 111 additions and 2 deletions

View File

@ -87,6 +87,8 @@ def render_pid(pid):
# render a single paper with just the information we need for the UI
pdb = get_papers()
tags = get_tags()
thumb_path = 'static/thumb/' + pid + '.jpg'
thumb_url = thumb_path if os.path.isfile(thumb_path) else ''
d = pdb[pid]
return dict(
weight = 0.0,
@ -97,6 +99,7 @@ def render_pid(pid):
tags = ', '.join(t['term'] for t in d['tags']),
utags = [t for t, pids in tags.items() if pid in pids],
summary = d['summary'],
thumb_url = thumb_url,
)
def random_rank():

View File

@ -22,7 +22,7 @@ const Paper = props => {
const utags = p.utags.map((utxt, ix) => <UTag key={ix} tag={utxt} />);
const similar_url = "/?rank=pid&pid=" + p.id;
const inspect_url = "/inspect?pid=" + p.id;
const thumb_img = p.thumb_url === '' ? null : <div class='rel_img'><img src={p.thumb_url} /></div>;
// if the user is logged in then we can show add/sub buttons
let utag_controls = null;
if(user) {
@ -43,6 +43,7 @@ const Paper = props => {
<div class="rel_time">{p.time}</div>
<div class='rel_tags'>{p.tags}</div>
{utag_controls}
{thumb_img}
<div class='rel_abs'>{p.summary}</div>
<div class='rel_more'><a href={similar_url}>similar</a></div>
<div class='rel_inspect'><a href={inspect_url}>inspect</a></div>

View File

@ -3,6 +3,7 @@ body {
padding: 0;
font-family: sans-serif;
line-height: 1.2;
background-color: #eee;
}
#header {
height: 24px;
@ -50,7 +51,7 @@ body {
.rel_paper {
margin-bottom: 10px;
padding: 10px;
background-color: #eee;
background-color: white;
border-radius: 5px;
}

104
thumb_daemon.py Normal file
View File

@ -0,0 +1,104 @@
"""
Iterates over the current database and makes best effort to download the papers,
convert them to thumbnail images and save them to disk, for display in the UI.
Currently only processes the 5,000 most recent papers. Intended to be run as a
daily cron job or similar.
"""
import os
import time
import random
import requests
from subprocess import Popen
from aslite.db import get_papers_db, get_metas_db
# create the tmp directory if it does not exist, where we will do temporary work
TMP_DIR = 'tmp'
os.makedirs(TMP_DIR, exist_ok=True)

# create the thumb directory, where we will store the paper thumbnails
THUMB_DIR = os.path.join('static', 'thumb')
os.makedirs(THUMB_DIR, exist_ok=True)

# scratch pdf that every loop iteration below overwrites
TMP_PDF = os.path.join(TMP_DIR, 'paper.pdf')
# convert is asked for pages 0..MAX_PAGES-1, i.e. thumb-0.png ... thumb-7.png
MAX_PAGES = 8
# how long convert may run before we give up on a paper, in seconds
CONVERT_TIMEOUT = 20


def _wait_for_exit(proc, deadline_sec):
    """Poll `proc` until it exits or `deadline_sec` seconds elapse.

    Returns True if the process has exited (and has been reaped by poll()),
    False if it is still running when the deadline passes.
    """
    t0 = time.time()
    while time.time() - t0 < deadline_sec:
        if proc.poll() is not None:
            return True
        time.sleep(0.1)
    return proc.poll() is not None


# open the database, determine which papers we'll try to get thumbs for
pdb = get_papers_db()
n = len(pdb)  # total number of papers in the db; only used in the progress print
mdb = get_metas_db()
metas = list(mdb.items())
metas.sort(key=lambda kv: kv[1]['_time'], reverse=True)  # most recent papers first
keys = [k for k, v in metas[:5000]]  # only the most recent papers

for i, key in enumerate(keys):
    time.sleep(0.01)  # for safety

    # the path where we would store the thumbnail for this key
    thumb_path = os.path.join(THUMB_DIR, key + '.jpg')
    if os.path.exists(thumb_path):
        continue  # already have a thumbnail for this paper

    # fetch the paper
    p = pdb[key]
    print("%d/%d: paper to process: %s" % (i, n, key))

    # get the link to the pdf
    url = p['link'].replace('abs', 'pdf')

    # attempt to download the pdf
    print("attempting to download pdf from: ", url)
    try:
        x = requests.get(url, timeout=10, allow_redirects=True)
        # without this an HTTP error page would be saved as if it were the pdf
        x.raise_for_status()
        with open(TMP_PDF, 'wb') as f:
            f.write(x.content)
        print("OK")
    except Exception as e:
        # best effort: any failure here just skips this paper
        print("error downloading the pdf at url", url)
        print(e)
        continue

    time.sleep(5 + random.uniform(0, 5))  # take a breather

    # mv away the previous temporary files if they exist, so that pages left
    # over from the previous paper do not end up in this paper's montage
    if os.path.isfile(os.path.join(TMP_DIR, 'thumb-0.png')):
        for page in range(MAX_PAGES):
            f1 = os.path.join(TMP_DIR, 'thumb-%d.png' % (page,))
            f2 = os.path.join(TMP_DIR, 'thumbbuf-%d.png' % (page,))
            if os.path.isfile(f1):
                os.replace(f1, f2)  # overwrites f2 if present, like `mv`

    # convert pdf to png images per page. spawn async because convert can unfortunately enter an infinite loop, have to handle this.
    # this command will generate 8 independent images thumb-0.png ... thumb-7.png of the thumbnails
    print("converting the pdf to png images")
    pp = Popen([
        'convert',
        '%s[0-%d]' % (TMP_PDF, MAX_PAGES - 1),
        '-thumbnail', 'x156',
        os.path.join(TMP_DIR, 'thumb.png'),
    ])
    if not _wait_for_exit(pp, CONVERT_TIMEOUT):
        print("convert command did not terminate in 20 seconds, terminating.")
        pp.terminate()  # give up
        # make sure the process is really gone (and reaped) before moving on,
        # otherwise it could keep writing thumb-*.png while the next paper runs
        if not _wait_for_exit(pp, 2):
            pp.kill()
        pp.wait()
        continue

    if not os.path.isfile(os.path.join(TMP_DIR, 'thumb-0.png')):
        # failed to render pdf. no placeholder image is written, so thumb_path
        # stays missing and this paper is attempted again on the next run
        print("could not render pdf, skipping")
        continue

    # otherwise concatenate the 8 images into one
    cmd = "montage -mode concatenate -quality 80 -tile x1 %s %s" \
        % (os.path.join(TMP_DIR, 'thumb-*.png'), thumb_path)
    print(cmd)
    os.system(cmd)

    # remove the temporary paper.pdf file
    if os.path.isfile(TMP_PDF):
        os.remove(TMP_PDF)