change the way we recommend papers. we'll do it per tag instead of just mixing everything together. i believe this should produce higher quality results

2021-12-18 14:53:57 -08:00 · 2021-12-18 14:53:57 -08:00 · 6e85778b24
commit 6e85778b24
parent 65d4bb8415
1 changed files with 55 additions and 23 deletions
--- a/send_emails.py
+++ b/send_emails.py
@ -50,6 +50,10 @@ body {
    color: #333;
    margin-bottom: 10px;
 }
+.f {
+    color: #933;
+    display: inline-block;
+}
 </style>
 </head>

@ -88,37 +92,62 @@ def calculate_recommendation(
        ptoi[p] = i
        itop[i] = p

-    # construct the positive set via simple union of all tags
-    y = np.zeros(n, dtype=np.float32)
+    # loop over all the tags
+    all_pids, all_scores = {}, {}
    for tag, pids in tags.items():
+
+        if len(pids) == 0:
+            continue
+
+        # construct the positive set for this tag
+        y = np.zeros(n, dtype=np.float32)
        for pid in pids:
            y[ptoi[pid]] = 1.0

-    # classify
-    clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
-    clf.fit(x, y)
-    s = clf.decision_function(x)
-    sortix = np.argsort(-s)
-    pids = [itop[ix] for ix in sortix]
-    scores = [100*float(s[ix]) for ix in sortix]
+        # classify
+        clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.01)
+        clf.fit(x, y)
+        s = clf.decision_function(x)
+        sortix = np.argsort(-s)
+        pids = [itop[ix] for ix in sortix]
+        scores = [100*float(s[ix]) for ix in sortix]

-    # filter by time to only recent papers
-    deltat = time_delta*60*60*24 # allowed time delta in seconds
-    keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat]
-    pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
+        # filter by time to only recent papers
+        deltat = time_delta*60*60*24 # allowed time delta in seconds
+        keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat]
+        pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]

-    # finally exclude the papers we already have tagged
-    have = set().union(*tags.values())
-    keep = [i for i,pid in enumerate(pids) if pid not in have]
-    pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
+        # finally exclude the papers we already have tagged
+        have = set().union(*tags.values())
+        keep = [i for i,pid in enumerate(pids) if pid not in have]
+        pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]

-    return pids, scores
+        # store results
+        all_pids[tag] = pids
+        all_scores[tag] = scores
+
+
+    return all_pids, all_scores

 # -----------------------------------------------------------------------------

-def render_recommendations(user, tags, pids, scores):
+def render_recommendations(user, tags, tag_pids, tag_scores):
    # render the paper recommendations into the html template

+    # first we are going to merge all of the papers / scores together using a MAX
+    max_score = {}
+    max_source_tag = {}
+    for tag in tag_pids:
+        for pid, score in zip(tag_pids[tag], tag_scores[tag]):
+            max_score[pid] = max(max_score.get(pid, -99999), score) # lol
+            if max_score[pid] == score:
+                max_source_tag[pid] = tag
+
+    # now we have a dict of pid -> max score. sort by score
+    max_score_list = sorted(max_score.items(), key=lambda x: x[1], reverse=True)
+    pids, scores = zip(*max_score_list)
+
+    # now render the html for each individual recommendation
    parts = []
    n = min(len(scores), args.num_recommendations)
    for score, pid in zip(scores[:n], pids[:n]):
@ -134,12 +163,12 @@ def render_recommendations(user, tags, pids, scores):
 <tr>
 <td valign="top"><div class="s">%.2f</div></td>
 <td>
-<a href="%s">%s</a>
+<a href="%s">%s</a> <div class="f">(%s)</div>
 <div class="a">%s</div>
 <div class="u">%s</div>
 </td>
 </tr>
-""" % (score, p['link'], p['title'], authors, summary)
+""" % (score, p['link'], p['title'], max_source_tag[pid], authors, summary)
        )

    # render the final html
@ -239,6 +268,9 @@ if __name__ == "__main__":
            print("skipping user %s, no papers tagged" % (user, ))
            continue

+        # insert a fake entry in tags for the special "all" tag, which is the union of all papers
+        # tags['all'] = set().union(*tags.values())
+
        # calculate the recommendations
        pids, scores = calculate_recommendation(tags, time_delta=args.time_delta)
        print("user %s has %d recommendations over last %d days" % (user, len(pids), args.time_delta))
@ -259,8 +291,8 @@ if __name__ == "__main__":
        send_email(email, html)
        num_sent += 1

-        # zzz
-        time.sleep(1 + random.uniform(0, 2))
+        # zzz?
+        # time.sleep(1 + random.uniform(0, 2))

    print("done.")
    print("sent %d emails" % (num_sent, ))