change the way we recommend papers. we'll do it per tag instead of just mixing everything together. i believe this should produce higher quality results

This commit is contained in:
Andrej Karpathy 2021-12-18 14:53:57 -08:00
parent 65d4bb8415
commit 6e85778b24

View File

@ -50,6 +50,10 @@ body {
color: #333;
margin-bottom: 10px;
}
.f {
color: #933;
display: inline-block;
}
</style>
</head>
@ -88,37 +92,62 @@ def calculate_recommendation(
ptoi[p] = i
itop[i] = p
# construct the positive set via simple union of all tags
y = np.zeros(n, dtype=np.float32)
# loop over all the tags
all_pids, all_scores = {}, {}
for tag, pids in tags.items():
if len(pids) == 0:
continue
# construct the positive set for this tag
y = np.zeros(n, dtype=np.float32)
for pid in pids:
y[ptoi[pid]] = 1.0
# classify
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
clf.fit(x, y)
s = clf.decision_function(x)
sortix = np.argsort(-s)
pids = [itop[ix] for ix in sortix]
scores = [100*float(s[ix]) for ix in sortix]
# classify
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.01)
clf.fit(x, y)
s = clf.decision_function(x)
sortix = np.argsort(-s)
pids = [itop[ix] for ix in sortix]
scores = [100*float(s[ix]) for ix in sortix]
# filter by time to only recent papers
deltat = time_delta*60*60*24 # allowed time delta in seconds
keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat]
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
# filter by time to only recent papers
deltat = time_delta*60*60*24 # allowed time delta in seconds
keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat]
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
# finally exclude the papers we already have tagged
have = set().union(*tags.values())
keep = [i for i,pid in enumerate(pids) if pid not in have]
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
# finally exclude the papers we already have tagged
have = set().union(*tags.values())
keep = [i for i,pid in enumerate(pids) if pid not in have]
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
return pids, scores
# store results
all_pids[tag] = pids
all_scores[tag] = scores
return all_pids, all_scores
# -----------------------------------------------------------------------------
def render_recommendations(user, tags, pids, scores):
def render_recommendations(user, tags, tag_pids, tag_scores):
# render the paper recommendations into the html template
# first we are going to merge all of the papers / scores together using a MAX
max_score = {}
max_source_tag = {}
for tag in tag_pids:
for pid, score in zip(tag_pids[tag], tag_scores[tag]):
max_score[pid] = max(max_score.get(pid, -99999), score) # lol
if max_score[pid] == score:
max_source_tag[pid] = tag
# now we have a dict of pid -> max score. sort by score
max_score_list = sorted(max_score.items(), key=lambda x: x[1], reverse=True)
pids, scores = zip(*max_score_list)
# now render the html for each individual recommendation
parts = []
n = min(len(scores), args.num_recommendations)
for score, pid in zip(scores[:n], pids[:n]):
@ -134,12 +163,12 @@ def render_recommendations(user, tags, pids, scores):
<tr>
<td valign="top"><div class="s">%.2f</div></td>
<td>
<a href="%s">%s</a>
<a href="%s">%s</a> <div class="f">(%s)</div>
<div class="a">%s</div>
<div class="u">%s</div>
</td>
</tr>
""" % (score, p['link'], p['title'], authors, summary)
""" % (score, p['link'], p['title'], max_source_tag[pid], authors, summary)
)
# render the final html
@ -239,6 +268,9 @@ if __name__ == "__main__":
print("skipping user %s, no papers tagged" % (user, ))
continue
# insert a fake entry in tags for the special "all" tag, which is the union of all papers
# tags['all'] = set().union(*tags.values())
# calculate the recommendations
pids, scores = calculate_recommendation(tags, time_delta=args.time_delta)
print("user %s has %d recommendations over last %d days" % (user, len(pids), args.time_delta))
@ -259,8 +291,8 @@ if __name__ == "__main__":
send_email(email, html)
num_sent += 1
# zzz
time.sleep(1 + random.uniform(0, 2))
# zzz?
# time.sleep(1 + random.uniform(0, 2))
print("done.")
print("sent %d emails" % (num_sent, ))