Change the way we recommend papers: we'll compute recommendations per tag instead of just mixing everything together. I believe this should produce higher-quality results.
This commit is contained in:
parent
65d4bb8415
commit
6e85778b24
@ -50,6 +50,10 @@ body {
|
|||||||
color: #333;
|
color: #333;
|
||||||
margin-bottom: 10px;
|
margin-bottom: 10px;
|
||||||
}
|
}
|
||||||
|
.f {
|
||||||
|
color: #933;
|
||||||
|
display: inline-block;
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
|
|
||||||
@ -88,37 +92,62 @@ def calculate_recommendation(
|
|||||||
ptoi[p] = i
|
ptoi[p] = i
|
||||||
itop[i] = p
|
itop[i] = p
|
||||||
|
|
||||||
# construct the positive set via simple union of all tags
|
# loop over all the tags
|
||||||
y = np.zeros(n, dtype=np.float32)
|
all_pids, all_scores = {}, {}
|
||||||
for tag, pids in tags.items():
|
for tag, pids in tags.items():
|
||||||
|
|
||||||
|
if len(pids) == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# construct the positive set for this tag
|
||||||
|
y = np.zeros(n, dtype=np.float32)
|
||||||
for pid in pids:
|
for pid in pids:
|
||||||
y[ptoi[pid]] = 1.0
|
y[ptoi[pid]] = 1.0
|
||||||
|
|
||||||
# classify
|
# classify
|
||||||
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
|
clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.01)
|
||||||
clf.fit(x, y)
|
clf.fit(x, y)
|
||||||
s = clf.decision_function(x)
|
s = clf.decision_function(x)
|
||||||
sortix = np.argsort(-s)
|
sortix = np.argsort(-s)
|
||||||
pids = [itop[ix] for ix in sortix]
|
pids = [itop[ix] for ix in sortix]
|
||||||
scores = [100*float(s[ix]) for ix in sortix]
|
scores = [100*float(s[ix]) for ix in sortix]
|
||||||
|
|
||||||
# filter by time to only recent papers
|
# filter by time to only recent papers
|
||||||
deltat = time_delta*60*60*24 # allowed time delta in seconds
|
deltat = time_delta*60*60*24 # allowed time delta in seconds
|
||||||
keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat]
|
keep = [i for i,pid in enumerate(pids) if (tnow - metas[pid]['_time']) < deltat]
|
||||||
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
|
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
|
||||||
|
|
||||||
# finally exclude the papers we already have tagged
|
# finally exclude the papers we already have tagged
|
||||||
have = set().union(*tags.values())
|
have = set().union(*tags.values())
|
||||||
keep = [i for i,pid in enumerate(pids) if pid not in have]
|
keep = [i for i,pid in enumerate(pids) if pid not in have]
|
||||||
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
|
pids, scores = [pids[i] for i in keep], [scores[i] for i in keep]
|
||||||
|
|
||||||
return pids, scores
|
# store results
|
||||||
|
all_pids[tag] = pids
|
||||||
|
all_scores[tag] = scores
|
||||||
|
|
||||||
|
|
||||||
|
return all_pids, all_scores
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
def render_recommendations(user, tags, pids, scores):
|
def render_recommendations(user, tags, tag_pids, tag_scores):
|
||||||
# render the paper recommendations into the html template
|
# render the paper recommendations into the html template
|
||||||
|
|
||||||
|
# first we are going to merge all of the papers / scores together using a MAX
|
||||||
|
max_score = {}
|
||||||
|
max_source_tag = {}
|
||||||
|
for tag in tag_pids:
|
||||||
|
for pid, score in zip(tag_pids[tag], tag_scores[tag]):
|
||||||
|
max_score[pid] = max(max_score.get(pid, -99999), score) # lol
|
||||||
|
if max_score[pid] == score:
|
||||||
|
max_source_tag[pid] = tag
|
||||||
|
|
||||||
|
# now we have a dict of pid -> max score. sort by score
|
||||||
|
max_score_list = sorted(max_score.items(), key=lambda x: x[1], reverse=True)
|
||||||
|
pids, scores = zip(*max_score_list)
|
||||||
|
|
||||||
|
# now render the html for each individual recommendation
|
||||||
parts = []
|
parts = []
|
||||||
n = min(len(scores), args.num_recommendations)
|
n = min(len(scores), args.num_recommendations)
|
||||||
for score, pid in zip(scores[:n], pids[:n]):
|
for score, pid in zip(scores[:n], pids[:n]):
|
||||||
@ -134,12 +163,12 @@ def render_recommendations(user, tags, pids, scores):
|
|||||||
<tr>
|
<tr>
|
||||||
<td valign="top"><div class="s">%.2f</div></td>
|
<td valign="top"><div class="s">%.2f</div></td>
|
||||||
<td>
|
<td>
|
||||||
<a href="%s">%s</a>
|
<a href="%s">%s</a> <div class="f">(%s)</div>
|
||||||
<div class="a">%s</div>
|
<div class="a">%s</div>
|
||||||
<div class="u">%s</div>
|
<div class="u">%s</div>
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
""" % (score, p['link'], p['title'], authors, summary)
|
""" % (score, p['link'], p['title'], max_source_tag[pid], authors, summary)
|
||||||
)
|
)
|
||||||
|
|
||||||
# render the final html
|
# render the final html
|
||||||
@ -239,6 +268,9 @@ if __name__ == "__main__":
|
|||||||
print("skipping user %s, no papers tagged" % (user, ))
|
print("skipping user %s, no papers tagged" % (user, ))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# insert a fake entry in tags for the special "all" tag, which is the union of all papers
|
||||||
|
# tags['all'] = set().union(*tags.values())
|
||||||
|
|
||||||
# calculate the recommendations
|
# calculate the recommendations
|
||||||
pids, scores = calculate_recommendation(tags, time_delta=args.time_delta)
|
pids, scores = calculate_recommendation(tags, time_delta=args.time_delta)
|
||||||
print("user %s has %d recommendations over last %d days" % (user, len(pids), args.time_delta))
|
print("user %s has %d recommendations over last %d days" % (user, len(pids), args.time_delta))
|
||||||
@ -259,8 +291,8 @@ if __name__ == "__main__":
|
|||||||
send_email(email, html)
|
send_email(email, html)
|
||||||
num_sent += 1
|
num_sent += 1
|
||||||
|
|
||||||
# zzz
|
# zzz?
|
||||||
time.sleep(1 + random.uniform(0, 2))
|
# time.sleep(1 + random.uniform(0, 2))
|
||||||
|
|
||||||
print("done.")
|
print("done.")
|
||||||
print("sent %d emails" % (num_sent, ))
|
print("sent %d emails" % (num_sent, ))
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user