add email sending script to repo

2021-11-27 11:19:55 -08:00 · 2021-11-27 11:19:55 -08:00 · 5eece3f992
commit 5eece3f992
parent 49995465df
2 changed files with 222 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -35,6 +35,8 @@ export FLASK_APP=serve.py; flask run

 All of the database will be stored inside the `data` directory. Finally, if you'd like to run your own instance on the interwebs I recommend simply running the above on a [Linode](https://www.linode.com), e.g. I am running this code currently on the smallest "Nanode 1 GB" instance indexing about 30K papers, which costs $5/month.

+Finally, if you'd like to send periodic emails to users about new papers, see the `send_emails.py` script. I run this script in a daily cron job.
+
 #### todos

 - I need a proper requirements.txt and such
--- a/send_emails.py
+++ b/send_emails.py
@ -0,0 +1,220 @@
+"""
+Compose and send recommendation emails to arxiv-sanity-lite users!
+
+I run this script in a cron job to send out emails to the users with their
+recommendations. There's a bit of copy paste code here but I expect that
+the recommendations may become more complex in the future, so this is ok for now.
+
+You'll notice that the file sendgrid_api_key.txt is not in the repo, you'd have
+to manually register with sendgrid yourself, get an API key and put it in the file.
+"""
+
+import os
+import time
+import numpy as np
+from sklearn import svm
+
+import sendgrid
+from sendgrid.helpers.mail import Email, To, Content, Mail
+
+from aslite.db import load_features
+from aslite.db import get_tags_db
+from aslite.db import get_metas_db
+from aslite.db import get_papers_db
+from aslite.db import get_email_db
+
+# -----------------------------------------------------------------------------
+# the html template for the email
+
+template = """
+<!DOCTYPE HTML>
+<html>
+
+<head>
+<style>
+body {
+    font-family: Arial, sans-serif;
+}
+.s {
+    font-weight: bold;
+    margin-right: 10px;
+}
+.a {
+    color: #333;
+}
+.u {
+    font-size: 12px;
+    color: #333;
+    margin-bottom: 10px;
+}
+</style>
+</head>
+
+<body>
+
+<br><br>
+<div>Good morning! Here are your daily <a href="https://arxiv-sanity-lite.com">arxiv-sanity-lite</a> recommendations of very recent papers:</div>
+<br><br>
+
+<div>
+    __CONTENT__
+</div>
+
+<br><br>
+<div>
+To stop these emails remove your email in your <a href="https://arxiv-sanity-lite.com/profile">account</a> settings.
+</div>
+<br><br>
+
+</body>
+</html>
+"""
+
+# -----------------------------------------------------------------------------
+
+def calculate_recommendation(
+    tags,
+    time_delta = 3, # how recent papers are we recommending? in days
+    ):
+
+    # a bit of preprocessing
+    x, pids = features['x'], features['pids']
+    n, d = x.shape
+    ptoi, itop = {}, {}
+    for i, p in enumerate(pids):
+        ptoi[p] = i
+        itop[i] = p
+
+    # construct the positive set via simple union of all tags
+    y = np.zeros(n, dtype=np.float32)
+    for tag, pids in tags.items():
+        for pid in pids:
+            y[ptoi[pid]] = 1.0
+
+    # classify
+    clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
+    clf.fit(x, y)
+    s = clf.decision_function(x)
+    sortix = np.argsort(-s)
+    pids = [itop[ix] for ix in sortix]
+    scores = [100*float(s[ix]) for ix in sortix]
+
+    # filter by time to only recent papers
+    deltat = time_delta*60*60*24 # allowed time delta in seconds
+    keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat]
+    pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
+
+    # finally exclude the papers we already have tagged
+    have = set().union(*tags.values())
+    keep = [i for i,pid in enumerate(pids) if pid not in have]
+    pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
+
+    return pids, scores
+
+# -----------------------------------------------------------------------------
+
+def render_recommendations(pids, scores, num_recommendations = 10):
+    # render the paper recommendations into the html template
+
+    parts = []
+    n = min(len(scores), num_recommendations)
+    for score, pid in zip(scores[:n], pids[:n]):
+        p = pdb[pid]
+        authors = ', '.join(a['name'] for a in p['authors'])
+        # crop the abstract
+        summary = p['summary']
+        summary = summary[:min(500, len(summary))]
+        if len(summary) == 500:
+            summary += '...'
+        parts.append(
+"""
+<tr>
+<td valign="top"><div class="s">%.2f</div></td>
+<td>
+<a href="%s">%s</a>
+<div class="a">%s</div>
+<div class="u">%s</div>
+</td>
+</tr>
+""" % (score, p['link'], p['title'], authors, summary)
+        )
+
+    final = '<table>' + ''.join(parts) + '</table>'
+    out = template.replace('__CONTENT__', final)
+    return out
+
+# -----------------------------------------------------------------------------
+# send the actual html via sendgrid
+
+def send_email(to, html):
+
+    # init the api
+    assert os.path.isfile('sendgrid_api_key.txt')
+    api_key = open('sendgrid_api_key.txt', 'r').read().strip()
+    sg = sendgrid.SendGridAPIClient(api_key=api_key)
+
+    # construct the email
+    from_email = Email("arxiv-sanity-lite-admin@arxiv-sanity-lite.com")
+    to_email = To(to)
+    subject = tnow_str + " Arxiv Sanity Lite recommendations"
+    content = Content("text/html", html)
+    mail = Mail(from_email, to_email, subject, content)
+
+    # hope for the best :)
+    response = sg.client.mail.send.post(request_body=mail.get())
+    print(response.status_code)
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+
+    TIME_DELTA = 3 # how recent papers are we recommending? in days
+    NUM_RECCOMENDATIONS = 20 # how many papers to recommend?
+
+    tnow = time.time()
+    tnow_str = time.strftime('%b %d', time.localtime(tnow)) # e.g. "Nov 27"
+
+    # read entire db simply into RAM
+    with get_tags_db() as tags_db:
+        tags = {k:v for k,v in tags_db.items()}
+
+    # read entire db simply into RAM
+    with get_metas_db() as mdb:
+        metas = {k:v for k,v in mdb.items()}
+
+    # read entire db simply into RAM
+    with get_email_db() as edb:
+        emails = {k:v for k,v in edb.items()}
+
+    # read tfidf features into RAM
+    features = load_features()
+
+    # keep the papers as only a handle, since this can be larger
+    pdb = get_papers_db()
+
+    # iterate all users, create recommendations, send emails
+    for user, tags in tags.items():
+
+        # verify that we have an email for this user
+        email = emails.get(user, None)
+        if not email:
+            print("skipping user %s, no email" % (user, ))
+            continue
+
+        # calculate the recommendations
+        pids, scores = calculate_recommendation(tags, time_delta=TIME_DELTA)
+        print("user %s has %d recommendations over last %d days" % (user, len(pids), TIME_DELTA))
+
+        # render the html
+        print("rendering top %d recommendations into a report..." % (NUM_RECCOMENDATIONS, ))
+        html = render_recommendations(pids, scores, num_recommendations=NUM_RECCOMENDATIONS)
+        # temporarily for debugging write recommendations to disk for manual inspection
+        with open('recco/%s.html' % (user, ), 'w') as f:
+            f.write(html)
+
+        # actually send the email
+        print("sending email...")
+        send_email(email, html)
+
+
+    print("done.")