Allow using fewer documents when training tfidf features, to prevent OOMs

This commit is contained in:
Andrej Karpathy 2021-11-29 15:38:36 -08:00
parent e182dda381
commit d5b91270a9

View File

@ -3,6 +3,7 @@ Extracts tfidf features from all paper abstracts and saves them to disk.
""" """
import argparse import argparse
from random import shuffle
import numpy as np import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
@ -17,6 +18,7 @@ if __name__ == '__main__':
parser.add_argument('-n', '--num', type=int, default=20000, help='number of tfidf features') parser.add_argument('-n', '--num', type=int, default=20000, help='number of tfidf features')
parser.add_argument('--min_df', type=int, default=5, help='min df') parser.add_argument('--min_df', type=int, default=5, help='min df')
parser.add_argument('--max_df', type=float, default=0.1, help='max df') parser.add_argument('--max_df', type=float, default=0.1, help='max df')
parser.add_argument('--max_docs', type=int, default=-1, help='maximum number of documents to use when training tfidf, or -1 to disable')
args = parser.parse_args() args = parser.parse_args()
print(args) print(args)
@ -30,16 +32,29 @@ if __name__ == '__main__':
pdb = get_papers_db(flag='r') pdb = get_papers_db(flag='r')
def make_corpus(training: bool):
    """Yield one text document per paper in the papers database.

    Each document is the paper's title, summary and author names joined
    by single spaces.

    Args:
        training: when True, and ``args.max_docs`` is positive and smaller
            than the number of papers in ``pdb``, only a random subset of
            ``args.max_docs`` papers is yielded (used to limit the number
            of documents seen while fitting tfidf). When False, every
            paper is yielded.

    Yields:
        str: the concatenated title, summary and author names of a paper.
    """
    assert isinstance(training, bool)

    # determine which papers we will use to build tfidf
    if training and 0 < args.max_docs < len(pdb):
        # crop to a random subset of papers
        keys = list(pdb.keys())
        shuffle(keys)
        keys = keys[:args.max_docs]
    else:
        keys = pdb.keys()

    # yield the abstracts of the papers
    for p in keys:
        d = pdb[p]
        author_str = ' '.join([a['name'] for a in d['authors']])
        yield ' '.join([d['title'], d['summary'], author_str])
print("training tfidf vectors...") print("training tfidf vectors...")
v.fit(make_corpus()) v.fit(make_corpus(training=True))
print("running inference...") print("running inference...")
x = v.transform(make_corpus()).astype(np.float32) x = v.transform(make_corpus(training=False)).astype(np.float32)
print(x.shape) print(x.shape)
print("saving to features to disk...") print("saving to features to disk...")