change the way we recommend papers. we'll do it per tag instead of just mixing everything together. i believe this should produce higher quality results

This commit is contained in:
Andrej Karpathy 2021-12-18 14:53:57 -08:00
parent 65d4bb8415
commit 6e85778b24

View File

@ -50,6 +50,10 @@ body {
color: #333; color: #333;
margin-bottom: 10px; margin-bottom: 10px;
} }
.f {
color: #933;
display: inline-block;
}
</style> </style>
</head> </head>
@ -88,37 +92,62 @@ def calculate_recommendation(
ptoi[p] = i ptoi[p] = i
itop[i] = p itop[i] = p
# construct the positive set via simple union of all tags # loop over all the tags
y = np.zeros(n, dtype=np.float32) all_pids, all_scores = {}, {}
for tag, pids in tags.items(): for tag, pids in tags.items():
if len(pids) == 0:
continue
# construct the positive set for this tag
y = np.zeros(n, dtype=np.float32)
for pid in pids: for pid in pids:
y[ptoi[pid]] = 1.0 y[ptoi[pid]] = 1.0
# classify # classify
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1) clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.01)
clf.fit(x, y) clf.fit(x, y)
s = clf.decision_function(x) s = clf.decision_function(x)
sortix = np.argsort(-s) sortix = np.argsort(-s)
pids = [itop[ix] for ix in sortix] pids = [itop[ix] for ix in sortix]
scores = [100*float(s[ix]) for ix in sortix] scores = [100*float(s[ix]) for ix in sortix]
# filter by time to only recent papers # filter by time to only recent papers
deltat = time_delta*60*60*24 # allowed time delta in seconds deltat = time_delta*60*60*24 # allowed time delta in seconds
keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat] keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat]
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep] pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
# finally exclude the papers we already have tagged # finally exclude the papers we already have tagged
have = set().union(*tags.values()) have = set().union(*tags.values())
keep = [i for i,pid in enumerate(pids) if pid not in have] keep = [i for i,pid in enumerate(pids) if pid not in have]
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep] pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
return pids, scores # store results
all_pids[tag] = pids
all_scores[tag] = scores
return all_pids, all_scores
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
def render_recommendations(user, tags, pids, scores): def render_recommendations(user, tags, tag_pids, tag_scores):
# render the paper recommendations into the html template # render the paper recommendations into the html template
# first we are going to merge all of the papers / scores together using a MAX
max_score = {}
max_source_tag = {}
for tag in tag_pids:
for pid, score in zip(tag_pids[tag], tag_scores[tag]):
max_score[pid] = max(max_score.get(pid, -99999), score) # lol
if max_score[pid] == score:
max_source_tag[pid] = tag
# now we have a dict of pid -> max score. sort by score
max_score_list = sorted(max_score.items(), key=lambda x: x[1], reverse=True)
pids, scores = zip(*max_score_list)
# now render the html for each individual recommendation
parts = [] parts = []
n = min(len(scores), args.num_recommendations) n = min(len(scores), args.num_recommendations)
for score, pid in zip(scores[:n], pids[:n]): for score, pid in zip(scores[:n], pids[:n]):
@ -134,12 +163,12 @@ def render_recommendations(user, tags, pids, scores):
<tr> <tr>
<td valign="top"><div class="s">%.2f</div></td> <td valign="top"><div class="s">%.2f</div></td>
<td> <td>
<a href="%s">%s</a> <a href="%s">%s</a> <div class="f">(%s)</div>
<div class="a">%s</div> <div class="a">%s</div>
<div class="u">%s</div> <div class="u">%s</div>
</td> </td>
</tr> </tr>
""" % (score, p['link'], p['title'], authors, summary) """ % (score, p['link'], p['title'], max_source_tag[pid], authors, summary)
) )
# render the final html # render the final html
@ -239,6 +268,9 @@ if __name__ == "__main__":
print("skipping user %s, no papers tagged" % (user, )) print("skipping user %s, no papers tagged" % (user, ))
continue continue
# insert a fake entry in tags for the special "all" tag, which is the union of all papers
# tags['all'] = set().union(*tags.values())
# calculate the recommendations # calculate the recommendations
pids, scores = calculate_recommendation(tags, time_delta=args.time_delta) pids, scores = calculate_recommendation(tags, time_delta=args.time_delta)
print("user %s has %d recommendations over last %d days" % (user, len(pids), args.time_delta)) print("user %s has %d recommendations over last %d days" % (user, len(pids), args.time_delta))
@ -259,8 +291,8 @@ if __name__ == "__main__":
send_email(email, html) send_email(email, html)
num_sent += 1 num_sent += 1
# zzz # zzz?
time.sleep(1 + random.uniform(0, 2)) # time.sleep(1 + random.uniform(0, 2))
print("done.") print("done.")
print("sent %d emails" % (num_sent, )) print("sent %d emails" % (num_sent, ))