Add twitter functionality similar to original site

This commit is contained in:
Ben Rush 2022-05-05 20:08:33 -07:00
parent d7a303b410
commit bea33f6d71
9 changed files with 304 additions and 4 deletions

4
.gitignore vendored
View File

@ -3,4 +3,6 @@
__pycache__
data
*.ipynb
secret_key.txt
secret_key.txt
env/
twitter.txt

View File

@ -129,6 +129,10 @@ def get_email_db(flag='r', autocommit=True):
edb = SqliteDict(DICT_DB_FILE, tablename='email', flag=flag, autocommit=autocommit)
return edb
def get_tweets_db(flag='r', autocommit=True):
    """
    Open the 'tweets' table stored in the shared SqliteDict database file.

    flag: SqliteDict open mode; only 'r' and 'c' are accepted here.
    autocommit: passed through unchanged to SqliteDict.
    """
    assert flag in ['r', 'c']
    return SqliteDict(DICT_DB_FILE, tablename='tweets', flag=flag, autocommit=autocommit)
# -----------------------------------------------------------------------------
"""
our "feature store" is currently just a pickle file, may want to consider hdf5 in the future

91
aslite/twitter.py Normal file
View File

@ -0,0 +1,91 @@
"""
Helpers that query the Twitter API for recent tweets linking to arxiv.org
and flatten them into plain dicts. Persisting the parsed tweets (the "tweets"
table of the sqlite database) is done by the caller, twitter_daemon.py.
"""
import os
import re
import time
import math
import pickle
import datetime
import tweepy
import logging
# settings
# -----------------------------------------------------------------------------
sleep_time = 60*10 # in seconds; how long get_latest_or_loop waits after a failed twitter API call before retrying. Default rate limit is 180 per 15 minutes
# convenience functions
# -----------------------------------------------------------------------------
def extract_arxiv_pids(r):
    """
    Collect the arxiv paper ids linked from a tweet.

    r: mapping-like tweet (must support .get) whose "entities" -> "urls"
       entries are dicts carrying an 'unwound_url' and/or 'expanded_url'.
    Returns a list with one id per arxiv abs/pdf link, in link order, e.g.
    'https://arxiv.org/pdf/2205.01234v2.pdf' -> '2205.01234v2'.
    """
    pids = []
    entities = r.get("entities") or {}  # the field can be present but None
    for u in entities.get("urls") or []:
        url = u.get('unwound_url') or u.get('expanded_url') or ''
        # stop at '?' / '#' so query strings and fragments do not leak into the id
        m = re.search(r'arxiv\.org/(?:abs|pdf)/([^?#\s]+)', url)
        if not m:
            continue
        rawid = m.group(1)
        # remove only a literal ".pdf" suffix: str.strip(".pdf") would eat any
        # leading/trailing '.', 'p', 'd' or 'f' character (e.g. "physics/...")
        if rawid.endswith('.pdf'):
            rawid = rawid[:-len('.pdf')]
        if rawid:
            pids.append(rawid)
    return pids
def get_latest_or_loop(q, start_datetime=None):
    """
    Fetch recent tweets linking to arxiv.org, retrying until every page arrives.

    q: search query string. Anything that is not a non-empty string selects
       the default arxiv query (an existing caller passes a non-query value
       positionally, so such values are deliberately ignored, as before).
    start_datetime: UTC datetime to search from (naive values are taken to be
       UTC). Defaults to just under 7 days ago and is clamped into the range
       [just under 7 days ago, one minute ago].
    Returns the list of tweepy responses, one per result page.

    A failed API call is printed and retried after sleep_time seconds, so this
    function blocks until the whole result set has been fetched.
    """
    if not (isinstance(q, str) and q):
        q = "url:arxiv.org lang:en -is:retweet"

    # the recent-search endpoint only covers about the last week and rejects a
    # start_time outside of it; an out-of-range value would otherwise fail on
    # every attempt and keep the retry loop below spinning forever
    now = datetime.datetime.utcnow()
    earliest = now - datetime.timedelta(days=6, hours=23)
    newest = now - datetime.timedelta(minutes=1)
    if start_datetime is None:
        start_datetime = earliest
    else:
        if start_datetime.tzinfo is not None:
            # normalise aware datetimes to naive UTC so the comparisons are valid
            start_datetime = start_datetime.astimezone(datetime.timezone.utc).replace(tzinfo=None)
        start_datetime = min(max(start_datetime, earliest), newest)
    start = start_datetime.strftime('%Y-%m-%dT%H:%M:%SZ')

    # the bearer token is the first line of twitter.txt
    with open('twitter.txt', 'r') as f:
        bearer = f.read().splitlines()[0].strip()
    client = tweepy.Client(bearer)

    results = []
    next_token = None
    while True:
        try:
            resp = client.search_recent_tweets(q, expansions=['author_id'],
                                               max_results=100,
                                               next_token=next_token,
                                               start_time=start,
                                               tweet_fields=['id', 'created_at', 'author_id', 'entities', 'lang', 'public_metrics'],
                                               user_fields=['public_metrics'])
        except Exception as e:
            # best effort: wait and retry the same page
            print('there was some problem (waiting some time and trying again):')
            print(e)
            time.sleep(sleep_time)
            continue
        results.append(resp)
        next_token = resp.meta.get('next_token', None)
        if next_token is None:
            break  # that was the last page
    return results
def parse_tweets(results):
    """
    Flatten tweepy search responses into plain tweet dicts.

    results: iterable of responses as returned by get_latest_or_loop; each has
        .data (the tweets, possibly None when a page is empty) and
        .includes['users'] (the expanded author objects).
    Returns a list of flat dicts, one per tweet that links to at least one
    arxiv paper and whose author is known and not filtered out as a bot.
    """
    tweets = []
    for result in results:
        # index the expanded authors by id once instead of scanning per tweet
        authors = {}
        for a in (result.includes or {}).get('users') or []:
            authors.setdefault(a.id, a)
        for r in result.data or []:
            arxiv_pids = extract_arxiv_pids(r)
            if not arxiv_pids: continue # nothing we know about here, lets move on
            author = authors.get(r.author_id)
            if author is None: continue # author expansion missing, cannot attribute the tweet
            if "arxiv" in author.username.lower(): continue # banned user, very likely a bot
            metrics = r.public_metrics or {}
            user_metrics = getattr(author, 'public_metrics', None) or {}
            # create the tweet. intentionally making it flat here without user nesting
            tweet = {}
            tweet['id'] = r.id
            tweet['pids'] = arxiv_pids # arxiv paper ids mentioned in this tweet
            tweet['inserted_at_date'] = datetime.datetime.utcnow().isoformat()
            tweet['created_at_date'] = r.created_at.isoformat()
            # timestamp() honours the datetime's own timezone, whereas
            # time.mktime(timetuple()) would reinterpret the fields as local time
            tweet['created_at_time'] = int(r.created_at.timestamp())
            tweet['lang'] = r.lang
            tweet['text'] = r.text
            tweet['retweet_count'] = metrics.get('retweet_count',0)
            tweet['reply_count'] = metrics.get('reply_count',0)
            tweet['like_count'] = metrics.get('like_count',0)
            tweet['quote_count'] = metrics.get('quote_count',0)
            tweet['user_screen_name'] = author.username
            tweet['user_followers_count'] = user_metrics.get('followers_count',0)
            tweet['user_following_count'] = user_metrics.get('following_count',0)
            tweets.append(tweet)
    return tweets

View File

@ -3,3 +3,6 @@ Flask==2.0.2
numpy==1.21.4
scikit-learn==1.0.1
sqlitedict==1.7.0
tweepy

104
serve.py
View File

@ -9,8 +9,10 @@ ideas:
import os
import re
from termios import tcsendbreak
import time
from random import shuffle
import math
import numpy as np
from sklearn import svm
@ -20,13 +22,14 @@ from flask import render_template
from flask import g # global session-level object
from flask import session
from aslite.db import get_papers_db, get_metas_db, get_tags_db, get_last_active_db, get_email_db
from aslite.db import get_papers_db, get_metas_db, get_tags_db, get_last_active_db, get_email_db, get_tweets_db
from aslite.db import load_features
# -----------------------------------------------------------------------------
# inits and globals
RET_NUM = 25 # number of papers to return per page
max_tweet_records = 15 # cap on tweets kept per paper for tweet ranking. NOTE(review): no active code in view reads this yet - confirm
app = Flask(__name__)
@ -62,6 +65,11 @@ def get_metas():
g._mdb = get_metas_db()
return g._mdb
def get_tweets():
    """Return the tweets db handle cached on flask's `g`, opening it on first use."""
    if hasattr(g, '_tweets'):
        return g._tweets
    g._tweets = get_tweets_db()
    return g._tweets
@app.before_request
def before_request():
g.user = session.get('user', None)
@ -168,6 +176,82 @@ def svm_rank(tags: str = '', pid: str = '', C: float = 0.01):
})
return pids, scores, words
def tprepro(tweet_text):
    """
    Turn a tweet (or paper title) into the set of lowercase words it contains.

    Tokens starting with '#' (hashtags) are dropped, punctuation is removed
    from the remaining tokens, and tokens left empty are discarded.
    """
    ws = set()
    for token in tweet_text.lower().split():
        # filter hashtags BEFORE removing punctuation: once the '#' has been
        # stripped a hashtag can no longer be told apart from a normal word
        if token.startswith('#'):
            continue
        word = re.sub(r'[^\w\s]', '', token) # remove punctuation
        if word:
            ws.add(word)
    return ws
def tweets_rank(days=7):
    """
    Rank papers by how much recent twitter attention they received.

    days: look-back window in days (anything int() accepts); values that
          cannot be converted fall back to 7.
    Returns (pids, scores, tweets): paper ids sorted by descending total vote,
    the matching vote totals, and for each paper the list of tweet records
    (screen_name, text, weight, ok, id) that mention it.
    """
    try:
        days = int(days)
    except (TypeError, ValueError, OverflowError):
        days = 7 # e.g. the empty string sent when no tweet filter is given

    tweets_db = get_tweets()
    papers = get_papers()
    t0 = time.time() - days*24*60*60
    recent = [t for t in tweets_db.values() if t['created_at_time'] > t0]

    records_dict, pid_to_words_cache = {}, {}
    for tweet in recent:
        # accounts with "arxiv" in their name are treated as bots and ignored
        if "arxiv" in tweet['user_screen_name'].lower():
            continue
        text = tweet['text']
        tweet_words = tprepro(text)
        # some tweets are really boring, like an RT
        isok = not (text.startswith('RT') or
                    tweet['lang'] != 'en' or
                    len(text) < 40)

        # give people with more followers more vote, as it's seen by more people and contributes to more hype
        float_vote = min(math.log10(tweet['user_followers_count'] + 1), 4.0)/2.0
        # uprank tweets that have more likes, retweets, replies, and quotes
        float_vote += math.log10(tweet['like_count'] + tweet['retweet_count'] + 1)
        float_vote += math.log10(tweet['reply_count'] + tweet['quote_count'] + 1)

        for pid in set(tweet['pids']):
            record = records_dict.get(pid)
            if record is None:
                # only ask the papers db the first time a pid is seen
                if pid not in papers:
                    continue
                record = records_dict[pid] = {'pid':pid, 'tweets':[], 'vote': 0.0, 'raw_vote': 0}

            # good tweets make a comment, not just a boring RT, or exactly the post title. Detect these.
            title_words = pid_to_words_cache.get(pid)
            if title_words is None:
                title_words = pid_to_words_cache[pid] = tprepro(papers[pid]['title'])
            comment_words = tweet_words - title_words # what the tweet says beyond the paper title
            isok2 = int(isok and len(comment_words) >= 3)

            tweet_sort_bonus = 10000 if isok2 else 0 # lets bring meaningful comments up front.
            record['tweets'].append({'screen_name':tweet['user_screen_name'], 'text':text, 'weight':float_vote + tweet_sort_bonus, 'ok':isok2, 'id':str(tweet['id']) })
            # running totals across all relevant tweets for this paper
            record['vote'] += float_vote
            record['raw_vote'] += 1

    # NOTE: cropping each paper's list to the max_tweet_records highest-weight
    # tweets is currently disabled, so the lists are returned in full and unsorted.

    pids = sorted(records_dict, key=lambda x: records_dict[x]['vote'], reverse=True)
    scores = [records_dict[pid]['vote'] for pid in pids]
    ranked_tweets = [records_dict[pid]['tweets'] for pid in pids]
    return pids, scores, ranked_tweets
def search_rank(q: str = ''):
if not q:
@ -210,13 +294,14 @@ def main():
default_skip_have = 'no'
# override variables with any provided options via the interface
opt_rank = request.args.get('rank', default_rank) # rank type. search|tags|pid|time|random
opt_rank = request.args.get('rank', default_rank) # rank type. search|tags|pid|time|tweets|random
opt_q = request.args.get('q', '') # search request in the text box
opt_tags = request.args.get('tags', default_tags) # tags to rank by if opt_rank == 'tag'
opt_pid = request.args.get('pid', '') # pid to find nearest neighbors to
opt_time_filter = request.args.get('time_filter', default_time_filter) # number of days to filter by
opt_skip_have = request.args.get('skip_have', default_skip_have) # hide papers we already have?
opt_svm_c = request.args.get('svm_c', '') # svm C parameter
opt_tweet_filter = request.args.get('tweet_filter', '') # days of tweets to filter
opt_page_number = request.args.get('page_number', '1') # page number for pagination
# if a query is given, override rank to be of type "search"
@ -232,6 +317,7 @@ def main():
# rank papers: by tags, by time, by random
words = [] # only populated in the case of svm rank
tweets = [] # only populated in the case of tweet rank
if opt_rank == 'search':
pids, scores = search_rank(q=opt_q)
elif opt_rank == 'tags':
@ -240,6 +326,8 @@ def main():
pids, scores, words = svm_rank(pid=opt_pid, C=C)
elif opt_rank == 'time':
pids, scores = time_rank()
elif opt_rank == 'tweets':
pids, scores, tweets = tweets_rank(days=opt_tweet_filter)
elif opt_rank == 'random':
pids, scores = random_rank()
else:
@ -287,12 +375,15 @@ def main():
context['papers'] = papers
context['tags'] = rtags
context['words'] = words
context['tweets'] = tweets
context['words_desc'] = "Here are the top 40 most positive and bottom 20 most negative weights of the SVM. If they don't look great then try tuning the regularization strength hyperparameter of the SVM, svm_c, above. Lower C is higher regularization."
context['words_desc'] = "Here are the top 15 most influential tweets about this paper."
context['gvars'] = {}
context['gvars']['rank'] = opt_rank
context['gvars']['tags'] = opt_tags
context['gvars']['pid'] = opt_pid
context['gvars']['time_filter'] = opt_time_filter
context['gvars']['tweet_filter'] = opt_tweet_filter
context['gvars']['skip_have'] = opt_skip_have
context['gvars']['search_query'] = opt_q
context['gvars']['svm_c'] = str(C)
@ -324,12 +415,21 @@ def inspect():
})
words.sort(key=lambda w: w['weight'], reverse=True)
# get the tweets for this paper
tdb = get_tweets()
tweets = [t for _, t in tdb.items() if pid in t['pids']]
for i, t in enumerate(tweets):
tweets[i]['id'] = str(t['id'])
# package everything up and render
paper = render_pid(pid)
context = default_context()
context['paper'] = paper
context['words'] = words
context['tweets'] = tweets
context['words_desc'] = "The following are the tokens and their (tfidf) weight in the paper vector. This is the actual summary that feeds into the SVM to power recommendations, so hopefully it is good and representative!"
context['tweets_desc'] = "The following are the most influential tweets and their scores."
return render_template('inspect.html', **context)
@app.route('/profile')

31
static/tweet_list.js Normal file
View File

@ -0,0 +1,31 @@
'use strict';
const Tweet = props => {
const p = props.tweet;
// tweet, score
return (
twttr.widgets.createTweet(
String(p.id),
document.getElementById('tweetwrap')
)
)
}
const TweetList = props => {
const lst = props.tweets;
const tweets_desc = props.tweets_desc;
const tlst = lst.map((jtweet, ix) => <Tweet key={ix} tweet={jtweet} />);
return (
<div>
<div>{tweets_desc}</div>
<div id="tweetList" class="rel_tweets">
{tlst}
</div>
</div>
)
}
// Mount the tweet list, but only on pages whose template provides a #tweetwrap container.
// `tweets` and `tweets_desc` are globals defined by the page template in an inline <script>.
var elt = document.getElementById('tweetwrap');
if(elt) {
ReactDOM.render(<TweetList tweets={tweets} tweets_desc={tweets_desc} />, elt);
}

View File

@ -5,6 +5,7 @@
var papers = {{ papers | tojson }};
var tags = {{ tags | tojson }};
var words = {{ words | tojson }};
var tweets = {{ tweets | tojson }};
var words_desc = {{ words_desc | tojson }};
var gvars = {{ gvars | tojson }};
@ -43,6 +44,7 @@ var move_page = function(int_offset) {
<option value="tags" {{ gvars.rank == 'tags' and 'selected' }}>tags</option>
<option value="pid" {{ gvars.rank == 'pid' and 'selected' }}>pid</option>
<option value="time" {{ gvars.rank == 'time' and 'selected' }}>time</option>
<option value="tweets" {{ gvars.rank == 'tweets' and 'selected' }}>tweets</option>
<option value="random" {{ gvars.rank == 'random' and 'selected' }}>random</option>
</select>
@ -58,6 +60,10 @@ var move_page = function(int_offset) {
<label for="time_filter">time_filter (days): </label>
<input name="time_filter" type="text" id="time_filter_field" value="{{ gvars.time_filter }}">
<!-- current tweet_filter (days of tweets to consider), in a text field.
     The field name and the gvars key must be "tweet_filter": that is the
     request argument the server reads and the key it puts into gvars. -->
<label for="tweet_filter">tweet_filter (days): </label>
<input name="tweet_filter" type="text" id="tweets_filter_field" value="{{ gvars.tweet_filter }}">
<!-- current svm_c, in a text field -->
<label for="svm_c">svm_c: </label>
<input name="svm_c" type="text" id="svm_c_field" value="{{ gvars.svm_c }}">

View File

@ -5,17 +5,38 @@
var paper = {{ paper | tojson }};
var words = {{ words | tojson }};
var words_desc = {{ words_desc | tojson }};
var tweets = {{ tweets | tojson }};
var tweets_desc = {{ tweets_desc | tojson }};
</script>
<script>window.twttr = (function(d, s, id) {
var js, fjs = d.getElementsByTagName(s)[0],
t = window.twttr || {};
if (d.getElementById(id)) return t;
js = d.createElement(s);
js.id = id;
js.src = "https://platform.twitter.com/widgets.js";
fjs.parentNode.insertBefore(js, fjs);
t._e = [];
t.ready = function(f) {
t._e.push(f);
};
return t;
}(document, "script", "twitter-wjs"));</script>
{% endblock %}
{% block content %}
<div id="wrap">
</div>
<div id="wordwrap">
<div id="tweetwrap">
</div>
<div id="wordwrap">
{% endblock %}
{% block elements %}
<script src="{{ url_for('static', filename='paper_detail.js') }}" type="text/babel"></script>
<script src="{{ url_for('static', filename='word_list.js') }}" type="text/babel"></script>
<script src="{{ url_for('static', filename='tweet_list.js') }}" type="text/babel"></script>
{% endblock %}

42
twitter_daemon.py Normal file
View File

@ -0,0 +1,42 @@
"""
This script is intended to wake up every 30 min or so (eg via cron),
it checks Twitter for recent tweets that link to arxiv.org and stashes
the ones not seen before into the "tweets" table of the sqlite database.
"""
import sys
import time
import random
import datetime
import logging
import argparse
from aslite.twitter import get_latest_or_loop, parse_tweets
from aslite.db import get_papers_db, get_tweets_db
if __name__ == '__main__':

    logging.basicConfig(level=logging.INFO, format='%(name)s %(levelname)s %(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

    # NOTE(review): pdb is not read below; it is kept because opening it with
    # flag='c' may have side effects on the database file - confirm before removing
    pdb = get_papers_db(flag='c')
    tdb = get_tweets_db(flag='c')
    prevn = len(tdb)

    def store(t):
        tdb[t['id']] = t

    # creation time of the newest tweet we already hold (0 when the table is empty)
    latest = max((v['created_at_time'] for v in tdb.values()), default=0)

    if prevn > 0:
        # resume from the newest stored tweet, minus a one day overlap that
        # absorbs clock / timezone skew in the stored timestamps; tweets that
        # get fetched again are skipped by the membership check below
        start = datetime.datetime.utcfromtimestamp(latest) - datetime.timedelta(days=1)
        # the search only reaches back about a week: if we are further behind
        # than that, fall back to the default (maximum) window
        oldest_allowed = datetime.datetime.utcnow() - datetime.timedelta(days=6, hours=23)
        if start < oldest_allowed:
            start = None
    else:
        start = None

    # fetch the latest tweets mentioning arxiv.org. The start time has to be
    # passed by keyword: the first positional parameter is the search query.
    results = get_latest_or_loop(None, start_datetime=start)
    tweets = parse_tweets(results)

    for t in tweets:
        if t['id'] not in tdb: store(t)