# Files
#
# 68 lines
# 2.4 KiB
# Python

"""
Extracts tfidf features from the text of all papers (title, abstract and
author names) and saves them to disk.
"""
import argparse
from random import shuffle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from aslite.db import get_papers_db, save_features
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    # Script entry point: fit a tfidf model on the papers database, transform
    # every paper into a tfidf row vector, and hand the result to
    # save_features().

    # Command-line options: vocabulary size, document-frequency cut-offs and
    # an optional cap on how many documents are used to fit the model.
    parser = argparse.ArgumentParser(description='Arxiv Computor')
    parser.add_argument('-n', '--num', type=int, default=20000, help='number of tfidf features')
    parser.add_argument('--min_df', type=int, default=5, help='min df')
    parser.add_argument('--max_df', type=float, default=0.1, help='max df')
    parser.add_argument('--max_docs', type=int, default=-1, help='maximum number of documents to use when training tfidf, or -1 to disable')
    args = parser.parse_args()
    print(args)

    # Word-level unigrams and bigrams; tokens must start with a letter or
    # underscore and be at least two characters long (see token_pattern).
    v = TfidfVectorizer(
        input='content',
        encoding='utf-8', decode_error='replace', strip_accents='unicode',
        lowercase=True, analyzer='word', stop_words='english',
        token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
        ngram_range=(1, 2), max_features=args.num,
        norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
        max_df=args.max_df, min_df=args.min_df,
    )

    pdb = get_papers_db(flag='r')

    # Snapshot the keys exactly once. The same list drives training,
    # inference and the saved 'pids', so row i of the feature matrix always
    # corresponds to pids[i] even if the database changes while we run.
    pids = list(pdb.keys())

    def make_corpus(training: bool):
        """Yield one text document per paper.

        Each document is the paper's title, summary and author names joined
        by single spaces.

        When ``training`` is True and ``--max_docs`` is a positive number
        smaller than the number of papers, only a random subset of
        ``max_docs`` papers is yielded. Otherwise every paper is yielded, in
        the order of the ``pids`` snapshot.
        """
        assert isinstance(training, bool)
        # determine which papers we will use to build tfidf
        if training and 0 < args.max_docs < len(pids):
            # crop to a random subset of papers; shuffle a copy so the
            # shared snapshot keeps its original order
            keys = list(pids)
            shuffle(keys)
            keys = keys[:args.max_docs]
        else:
            keys = pids
        # yield title + summary + author names for each selected paper
        for p in keys:
            d = pdb[p]
            author_str = ' '.join(a['name'] for a in d['authors'])
            yield ' '.join([d['title'], d['summary'], author_str])

    print("training tfidf vectors...")
    v.fit(make_corpus(training=True))

    print("running inference...")
    # training=False -> all papers, in the order of `pids`
    x = v.transform(make_corpus(training=False)).astype(np.float32)
    print(x.shape)

    print("saving to features to disk...")
    features = {
        'pids': pids,  # same snapshot that produced the rows of x
        'x': x,
        'vocab': v.vocabulary_,
        'idf': v.idf_,  # public accessor for the fitted idf vector
    }
    save_features(features)