change the way we recommend papers. we'll do it per tag instead of just mixing everything together. i believe this should produce higher quality results
This commit is contained in:
parent
65d4bb8415
commit
6e85778b24
@ -50,6 +50,10 @@ body {
|
||||
color: #333;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
.f {
|
||||
color: #933;
|
||||
display: inline-block;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
@ -88,37 +92,62 @@ def calculate_recommendation(
|
||||
ptoi[p] = i
|
||||
itop[i] = p
|
||||
|
||||
# construct the positive set via simple union of all tags
|
||||
y = np.zeros(n, dtype=np.float32)
|
||||
# loop over all the tags
|
||||
all_pids, all_scores = {}, {}
|
||||
for tag, pids in tags.items():
|
||||
|
||||
if len(pids) == 0:
|
||||
continue
|
||||
|
||||
# construct the positive set for this tag
|
||||
y = np.zeros(n, dtype=np.float32)
|
||||
for pid in pids:
|
||||
y[ptoi[pid]] = 1.0
|
||||
|
||||
# classify
|
||||
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
|
||||
clf.fit(x, y)
|
||||
s = clf.decision_function(x)
|
||||
sortix = np.argsort(-s)
|
||||
pids = [itop[ix] for ix in sortix]
|
||||
scores = [100*float(s[ix]) for ix in sortix]
|
||||
# classify
|
||||
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.01)
|
||||
clf.fit(x, y)
|
||||
s = clf.decision_function(x)
|
||||
sortix = np.argsort(-s)
|
||||
pids = [itop[ix] for ix in sortix]
|
||||
scores = [100*float(s[ix]) for ix in sortix]
|
||||
|
||||
# filter by time to only recent papers
|
||||
deltat = time_delta*60*60*24 # allowed time delta in seconds
|
||||
keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat]
|
||||
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
|
||||
# filter by time to only recent papers
|
||||
deltat = time_delta*60*60*24 # allowed time delta in seconds
|
||||
keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat]
|
||||
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
|
||||
|
||||
# finally exclude the papers we already have tagged
|
||||
have = set().union(*tags.values())
|
||||
keep = [i for i,pid in enumerate(pids) if pid not in have]
|
||||
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
|
||||
# finally exclude the papers we already have tagged
|
||||
have = set().union(*tags.values())
|
||||
keep = [i for i,pid in enumerate(pids) if pid not in have]
|
||||
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
|
||||
|
||||
return pids, scores
|
||||
# store results
|
||||
all_pids[tag] = pids
|
||||
all_scores[tag] = scores
|
||||
|
||||
|
||||
return all_pids, all_scores
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def render_recommendations(user, tags, pids, scores):
|
||||
def render_recommendations(user, tags, tag_pids, tag_scores):
|
||||
# render the paper recommendations into the html template
|
||||
|
||||
# first we are going to merge all of the papers / scores together using a MAX
|
||||
max_score = {}
|
||||
max_source_tag = {}
|
||||
for tag in tag_pids:
|
||||
for pid, score in zip(tag_pids[tag], tag_scores[tag]):
|
||||
max_score[pid] = max(max_score.get(pid, -99999), score) # lol
|
||||
if max_score[pid] == score:
|
||||
max_source_tag[pid] = tag
|
||||
|
||||
# now we have a dict of pid -> max score. sort by score
|
||||
max_score_list = sorted(max_score.items(), key=lambda x: x[1], reverse=True)
|
||||
pids, scores = zip(*max_score_list)
|
||||
|
||||
# now render the html for each individual recommendation
|
||||
parts = []
|
||||
n = min(len(scores), args.num_recommendations)
|
||||
for score, pid in zip(scores[:n], pids[:n]):
|
||||
@ -134,12 +163,12 @@ def render_recommendations(user, tags, pids, scores):
|
||||
<tr>
|
||||
<td valign="top"><div class="s">%.2f</div></td>
|
||||
<td>
|
||||
<a href="%s">%s</a>
|
||||
<a href="%s">%s</a> <div class="f">(%s)</div>
|
||||
<div class="a">%s</div>
|
||||
<div class="u">%s</div>
|
||||
</td>
|
||||
</tr>
|
||||
""" % (score, p['link'], p['title'], authors, summary)
|
||||
""" % (score, p['link'], p['title'], max_source_tag[pid], authors, summary)
|
||||
)
|
||||
|
||||
# render the final html
|
||||
@ -239,6 +268,9 @@ if __name__ == "__main__":
|
||||
print("skipping user %s, no papers tagged" % (user, ))
|
||||
continue
|
||||
|
||||
# insert a fake entry in tags for the special "all" tag, which is the union of all papers
|
||||
# tags['all'] = set().union(*tags.values())
|
||||
|
||||
# calculate the recommendations
|
||||
pids, scores = calculate_recommendation(tags, time_delta=args.time_delta)
|
||||
print("user %s has %d recommendations over last %d days" % (user, len(pids), args.time_delta))
|
||||
@ -259,8 +291,8 @@ if __name__ == "__main__":
|
||||
send_email(email, html)
|
||||
num_sent += 1
|
||||
|
||||
# zzz
|
||||
time.sleep(1 + random.uniform(0, 2))
|
||||
# zzz?
|
||||
# time.sleep(1 + random.uniform(0, 2))
|
||||
|
||||
print("done.")
|
||||
print("sent %d emails" % (num_sent, ))
|
||||
|
||||
Loading…
Reference in New Issue
Block a user