add thumbnails for papers, which apparently people like

2022-02-13 18:30:14 -08:00 · 2022-02-13 18:30:14 -08:00 · d7a303b410
commit d7a303b410
parent f980c7947a
4 changed files with 111 additions and 2 deletions
--- a/serve.py
+++ b/serve.py
@ -87,6 +87,8 @@ def render_pid(pid):
    # render a single paper with just the information we need for the UI
    pdb = get_papers()
    tags = get_tags()
+    thumb_path = 'static/thumb/' + pid + '.jpg'
+    thumb_url = thumb_path if os.path.isfile(thumb_path) else ''
    d = pdb[pid]
    return dict(
        weight = 0.0,
@ -97,6 +99,7 @@ def render_pid(pid):
        tags = ', '.join(t['term'] for t in d['tags']),
        utags = [t for t, pids in tags.items() if pid in pids],
        summary = d['summary'],
+        thumb_url = thumb_url,
    )

 def random_rank():
--- a/static/paper_list.js
+++ b/static/paper_list.js
@ -22,7 +22,7 @@ const Paper = props => {
    const utags = p.utags.map((utxt, ix) => <UTag key={ix} tag={utxt} />);
    const similar_url = "/?rank=pid&pid=" + p.id;
    const inspect_url = "/inspect?pid=" + p.id;
-
+    const thumb_img = p.thumb_url === '' ? null : <div class='rel_img'><img src={p.thumb_url} /></div>;
    // if the user is logged in then we can show add/sub buttons
    let utag_controls = null;
    if(user) {
@ -43,6 +43,7 @@ const Paper = props => {
        <div class="rel_time">{p.time}</div>
        <div class='rel_tags'>{p.tags}</div>
        {utag_controls}
+        {thumb_img}
        <div class='rel_abs'>{p.summary}</div>
        <div class='rel_more'><a href={similar_url}>similar</a></div>
        <div class='rel_inspect'><a href={inspect_url}>inspect</a></div>
--- a/static/style.css
+++ b/static/style.css
@ -3,6 +3,7 @@ body {
    padding: 0;
    font-family: sans-serif;
    line-height: 1.2;
+    background-color: #eee;
 }
 #header {
    height: 24px;
@ -50,7 +51,7 @@ body {
 .rel_paper {
    margin-bottom: 10px;
    padding: 10px;
-    background-color: #eee;
+    background-color: white;
    border-radius: 5px;
 }

--- a/thumb_daemon.py
+++ b/thumb_daemon.py
@ -0,0 +1,104 @@
+"""
+Iterates over the current database and makes a best effort to download the papers,
+convert them to thumbnail images and save them to disk, for display in the UI.
+At the moment only processes the most recent 5K papers. Intended to be run as a
+daily cron job or similar. Needs ImageMagick's convert and montage on the PATH.
+"""
+
+import os
+import time
+import random
+import requests
+from subprocess import Popen, TimeoutExpired
+from aslite.db import get_papers_db, get_metas_db
+
+# scratch directory for the downloaded pdf and the per-page png images
+TMP_DIR = 'tmp'
+os.makedirs(TMP_DIR, exist_ok=True)
+# directory where the finished paper thumbnails are stored
+THUMB_DIR = os.path.join('static', 'thumb')
+os.makedirs(THUMB_DIR, exist_ok=True)
+
+PDF_PATH = os.path.join(TMP_DIR, 'paper.pdf') # every download reuses this one file
+NUM_PAGES = 8 # how many leading pages of a paper go into its thumbnail
+
+# open the database, determine which papers we'll try to get thumbs for
+pdb = get_papers_db()
+mdb = get_metas_db()
+metas = list(mdb.items())
+metas.sort(key=lambda kv: kv[1]['_time'], reverse=True) # most recent papers first
+keys = [k for k, v in metas[:5000]] # only the most recent papers
+n = len(keys) # progress is reported relative to the papers we actually visit
+
+for i, key in enumerate(keys):
+    time.sleep(0.01) # for safety
+
+    # the path where we would store the thumbnail for this key
+    thumb_path = os.path.join(THUMB_DIR, key + '.jpg')
+    if os.path.exists(thumb_path):
+        continue
+
+    # fetch the paper
+    p = pdb[key]
+    print("%d/%d: paper to process: %s" % (i, n, key))
+
+    # get the link to the pdf. only the first '/abs/' path segment is swapped, so
+    # an 'abs' that happens to occur elsewhere in the link is left untouched
+    url = p['link'].replace('/abs/', '/pdf/', 1)
+
+    # take a breather before every request, including the ones after a failure,
+    # so that a run of errors cannot turn into a burst of back-to-back downloads
+    time.sleep(5 + random.uniform(0, 5))
+
+    # attempt to download the pdf
+    print("attempting to download pdf from: ", url)
+    try:
+        x = requests.get(url, timeout=10, allow_redirects=True)
+        x.raise_for_status() # do not save an http error page as if it were a pdf
+        with open(PDF_PATH, 'wb') as f:
+            f.write(x.content)
+        print("OK")
+    except Exception as e:
+        print("error downloading the pdf at url", url)
+        print(e)
+        continue
+
+    # mv away the page images of the previous paper, otherwise the thumb-*.png
+    # glob given to montage below would also pick up its stale pages
+    for page in range(NUM_PAGES):
+        f1 = os.path.join(TMP_DIR, 'thumb-%d.png' % (page,))
+        f2 = os.path.join(TMP_DIR, 'thumbbuf-%d.png' % (page,))
+        if os.path.isfile(f1):
+            os.replace(f1, f2)
+
+    # convert pdf to png images per page. spawned as a child process with a deadline
+    # because convert can unfortunately enter an infinite loop, have to handle this.
+    # this command will generate up to 8 images thumb-0.png ... thumb-7.png
+    print("converting the pdf to png images")
+    pp = Popen(['convert', '%s[0-%d]' % (PDF_PATH, NUM_PAGES - 1),
+                '-thumbnail', 'x156', os.path.join(TMP_DIR, 'thumb.png')])
+    try:
+        pp.wait(timeout=20) # give it 20 seconds deadline
+    except TimeoutExpired:
+        print("convert command did not terminate in 20 seconds, terminating.")
+        pp.kill() # give up, and make sure a wedged convert really goes away
+        pp.wait() # reap the child so it does not linger as a zombie
+        continue
+
+    if not os.path.isfile(os.path.join(TMP_DIR, 'thumb-0.png')):
+        # failed to render pdf. no thumbnail is written, so a later run will retry
+        print("could not render pdf, skipping")
+        continue
+
+    # otherwise concatenate the page images into one. this goes through the shell
+    # on purpose: the shell is what expands the thumb-*.png glob for montage
+    cmd = "montage -mode concatenate -quality 80 -tile x1 %s %s" \
+          % (os.path.join(TMP_DIR, 'thumb-*.png'), thumb_path)
+    print(cmd)
+    if os.system(cmd) != 0:
+        print("montage command failed for paper", key)
+
+    # remove the temporary paper.pdf file
+    if os.path.isfile(PDF_PATH):
+        os.remove(PDF_PATH)
+