arxiv-sanity-lite/thumb_daemon.py

"""
Iterates over the current database and makes best effort to download the papers,
convert them to thumbnail images and save them to disk, for display in the UI.
Currently only processes the most recent 5K papers. Intended to be run as a daily
cron job or similar.
"""

import os
import time
import random
import requests
from subprocess import Popen
from aslite.db import get_papers_db, get_metas_db

# create the tmp directory if it does not exist, where we will do temporary work
TMP_DIR = 'tmp'
# exist_ok=True avoids the check-then-create race when two runs start together
os.makedirs(TMP_DIR, exist_ok=True)
# create the thumb directory, where we will store the paper thumbnails
THUMB_DIR = os.path.join('static', 'thumb')
os.makedirs(THUMB_DIR, exist_ok=True)

# open the papers and metas databases and pick the papers we will try to thumbnail
pdb = get_papers_db()
n = len(pdb)
mdb = get_metas_db()
# order the (key, meta) pairs by the meta's '_time' field, newest first
metas = sorted(mdb.items(), key=lambda item: item[1]['_time'], reverse=True)
# keep just the keys of the first 5000 entries, i.e. the most recent papers
keys = [paper_key for paper_key, _ in metas[:5000]]

# Walk the selected papers in order and, for every paper that has no thumbnail
# on disk yet: download its pdf, render the first pages to png with `convert`,
# and stitch them into a single jpg with `montage`. Every failure is best
# effort: it is reported and the paper is skipped.
tmp_pdf = os.path.join(TMP_DIR, 'paper.pdf') # scratch location of the downloaded pdf
num_keys = len(keys) # progress is reported against the papers we actually iterate over
for i, key in enumerate(keys):
    time.sleep(0.01) # for safety

    # the path where we would store the thumbnail for this key
    thumb_path = os.path.join(THUMB_DIR, key + '.jpg')
    if os.path.exists(thumb_path):
        continue

    # fetch the paper
    p = pdb[key]
    print("%d/%d: paper to process: %s" % (i, num_keys, key))

    # get the link to the pdf
    url = p['link'].replace('abs', 'pdf')

    # attempt to download the pdf
    print("attempting to download pdf from: ", url)
    downloaded = False
    try:
        x = requests.get(url, timeout=10, allow_redirects=True)
        # an HTTP error response carries an error page, not a pdf; do not save it
        x.raise_for_status()
        with open(tmp_pdf, 'wb') as f:
            f.write(x.content)
        print("OK")
        downloaded = True
    except Exception as e:
        print("error downloading the pdf at url", url)
        print(e)
    # take a breather after every attempt, successful or not, so that a run of
    # failures does not turn into rapid-fire requests against the server
    time.sleep(5 + random.uniform(0, 5))
    if not downloaded:
        continue

    # move away the previous temporary files if they exist, so that pages left
    # over from an earlier paper cannot be picked up for this paper's thumbnail
    for page in range(8):
        f1 = os.path.join(TMP_DIR, 'thumb-%d.png' % (page,))
        f2 = os.path.join(TMP_DIR, 'thumbbuf-%d.png' % (page,))
        if os.path.isfile(f1):
            os.replace(f1, f2) # overwrites an existing destination, like mv

    # convert pdf to png images per page. spawn async because convert can unfortunately enter an infinite loop, have to handle this.
    # this command will generate 8 independent images thumb-0.png ... thumb-7.png of the thumbnails
    print("converting the pdf to png images")
    pp = Popen(['convert', '%s[0-7]' % (tmp_pdf, ), '-thumbnail', 'x156', os.path.join(TMP_DIR, 'thumb.png')])
    t0 = time.time()
    while time.time() - t0 < 20: # give it 20 seconds deadline
        if pp.poll() is not None:
            # process terminated
            break
        time.sleep(0.1)
    if pp.poll() is None:
        print("convert command did not terminate in 20 seconds, terminating.")
        pp.terminate() # give up
        # allow a short grace period, force-kill if it is still alive, and
        # always reap the child so it does not linger as a zombie process
        t1 = time.time()
        while pp.poll() is None and time.time() - t1 < 2:
            time.sleep(0.1)
        if pp.poll() is None:
            pp.kill()
        pp.wait()
        continue

    if not os.path.isfile(os.path.join(TMP_DIR, 'thumb-0.png')):
        # failed to render pdf
        print("could not render pdf, skipping")
        continue

    # otherwise concatenate the page images into one. The file list is built
    # here and passed as an argument list, so nothing goes through a shell.
    pages = sorted(fn for fn in os.listdir(TMP_DIR)
                   if fn.startswith('thumb-') and fn.endswith('.png'))
    cmd = ['montage', '-mode', 'concatenate', '-quality', '80', '-tile', 'x1']
    cmd += [os.path.join(TMP_DIR, fn) for fn in pages]
    cmd.append(thumb_path)
    print(' '.join(cmd))
    ret = Popen(cmd).wait()
    if ret != 0:
        print("montage exited with status %d for %s" % (ret, key))

    # remove the temporary paper.pdf file
    if os.path.isfile(tmp_pdf):
        os.remove(tmp_pdf)