From d7a303b410b0246fbd19087e37f1885f7ca8a9dc Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 13 Feb 2022 18:30:14 -0800 Subject: [PATCH] add thumbnails for papers, which apparently ppl like --- serve.py | 3 ++ static/paper_list.js | 3 +- static/style.css | 3 +- thumb_daemon.py | 104 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 thumb_daemon.py diff --git a/serve.py b/serve.py index c3373e4..6aecb8e 100644 --- a/serve.py +++ b/serve.py @@ -87,6 +87,8 @@ def render_pid(pid): # render a single paper with just the information we need for the UI pdb = get_papers() tags = get_tags() + thumb_path = 'static/thumb/' + pid + '.jpg' + thumb_url = thumb_path if os.path.isfile(thumb_path) else '' d = pdb[pid] return dict( weight = 0.0, @@ -97,6 +99,7 @@ def render_pid(pid): tags = ', '.join(t['term'] for t in d['tags']), utags = [t for t, pids in tags.items() if pid in pids], summary = d['summary'], + thumb_url = thumb_url, ) def random_rank(): diff --git a/static/paper_list.js b/static/paper_list.js index 7434911..e325ffa 100644 --- a/static/paper_list.js +++ b/static/paper_list.js @@ -22,7 +22,7 @@ const Paper = props => { const utags = p.utags.map((utxt, ix) => ); const similar_url = "/?rank=pid&pid=" + p.id; const inspect_url = "/inspect?pid=" + p.id; - + const thumb_img = p.thumb_url === '' ? null :
; // if the user is logged in then we can show add/sub buttons let utag_controls = null; if(user) { @@ -43,6 +43,7 @@ const Paper = props => {
{p.time}
{p.tags}
{utag_controls} + {thumb_img}
{p.summary}
diff --git a/static/style.css b/static/style.css
index 3ac3ff6..29efaa3 100644
--- a/static/style.css
+++ b/static/style.css
@@ -3,6 +3,7 @@ body {
     padding: 0;
     font-family: sans-serif;
     line-height: 1.2;
+    background-color: #eee;
 }
 #header {
     height: 24px;
@@ -50,7 +51,7 @@ body {
 .rel_paper {
     margin-bottom: 10px;
     padding: 10px;
-    background-color: #eee;
+    background-color: white;
     border-radius: 5px;
 }
 
diff --git a/thumb_daemon.py b/thumb_daemon.py
new file mode 100644
index 0000000..2a7b010
--- /dev/null
+++ b/thumb_daemon.py
@@ -0,0 +1,104 @@
+"""
+Iterates over the current database and makes best effort to download the papers,
+convert them to thumbnail images and save them to disk, for display in the UI.
+Atm only runs the most recent 5K papers. Intended to be run as a cron job daily
+or something like that.
+"""
+
+import os
+import time
+import random
+import requests
+from subprocess import Popen, TimeoutExpired
+from aslite.db import get_papers_db, get_metas_db
+
+# create the tmp directory if it does not exist, where we will do temporary work
+TMP_DIR = 'tmp'
+os.makedirs(TMP_DIR, exist_ok=True)
+# create the thumb directory, where we will store the paper thumbnails
+THUMB_DIR = os.path.join('static', 'thumb')
+os.makedirs(THUMB_DIR, exist_ok=True)
+# scratch file holding the pdf that is currently being processed
+TMP_PDF = os.path.join(TMP_DIR, 'paper.pdf')
+# convert renders at most this many pages: thumb-0.png ... thumb-7.png
+NUM_PAGES = 8
+
+# open the database, determine which papers we'll try to get thumbs for
+pdb = get_papers_db()
+mdb = get_metas_db()
+metas = list(mdb.items())
+metas.sort(key=lambda kv: kv[1]['_time'], reverse=True) # most recent papers first
+keys = [k for k,v in metas[:5000]] # only the most recent papers
+n = len(keys) # report progress against the papers we actually iterate over
+
+for i, key in enumerate(keys):
+    time.sleep(0.01) # for safety
+
+    # the path where we would store the thumbnail for this key
+    thumb_path = os.path.join(THUMB_DIR, key + '.jpg')
+    if os.path.exists(thumb_path):
+        continue
+
+    # fetch the paper
+    p = pdb[key]
+    print("%d/%d: paper to process: %s" % (i, n, key))
+
+    # get the link to the pdf; only rewrite the /abs/ path segment of the link
+    url = p['link'].replace('/abs/', '/pdf/', 1)
+
+    # attempt to download the pdf
+    print("attempting to download pdf from: ", url)
+    downloaded = False
+    try:
+        x = requests.get(url, timeout=10, allow_redirects=True)
+        x.raise_for_status() # an HTTP error page must not be saved as the pdf
+        with open(TMP_PDF, 'wb') as f:
+            f.write(x.content)
+        downloaded = True
+        print("OK")
+    except Exception as e:
+        print("error downloading the pdf at url", url)
+        print(e)
+    time.sleep(5 + random.uniform(0, 5)) # take a breather, after failed requests too
+    if not downloaded:
+        continue
+
+    # move away the previous temporary files if they exist
+    if os.path.isfile(os.path.join(TMP_DIR, 'thumb-0.png')):
+        for j in range(NUM_PAGES): # j, so that the paper index i is not clobbered
+            f1 = os.path.join(TMP_DIR, 'thumb-%d.png' % (j,))
+            f2 = os.path.join(TMP_DIR, 'thumbbuf-%d.png' % (j,))
+            if os.path.isfile(f1):
+                os.replace(f1, f2)
+
+    # convert pdf to png images per page. convert can unfortunately enter an infinite loop,
+    # so it is spawned as a child process and given a deadline.
+    # this command will generate 8 independent images thumb-0.png ... thumb-7.png of the thumbnails
+    print("converting the pdf to png images")
+    pp = Popen(['convert', '%s[0-%d]' % (TMP_PDF, NUM_PAGES - 1), '-thumbnail', 'x156', os.path.join(TMP_DIR, 'thumb.png')])
+    try:
+        pp.wait(timeout=20) # give it 20 seconds deadline
+    except TimeoutExpired:
+        print("convert command did not terminate in 20 seconds, terminating.")
+        pp.terminate() # give up
+        try:
+            pp.wait(timeout=5) # reap the child so it does not linger as a zombie
+        except TimeoutExpired:
+            pp.kill() # it ignored the polite request
+            pp.wait()
+        continue
+
+    if not os.path.isfile(os.path.join(TMP_DIR, 'thumb-0.png')):
+        # failed to render the pdf. no placeholder is written, so a later run retries this paper
+        print("could not render pdf, skipping")
+        continue
+
+    # otherwise concatenate the 8 images into one
+    cmd = "montage -mode concatenate -quality 80 -tile x1 %s %s" \
+          % (os.path.join(TMP_DIR, 'thumb-*.png'), thumb_path)
+    print(cmd)
+    os.system(cmd)
+
+    # remove the temporary paper.pdf file
+    if os.path.isfile(TMP_PDF):
+        os.remove(TMP_PDF)