ok now we can sequester all the database files into data/ folder so everything is nice and clean yay

This commit is contained in:
Andrej Karpathy 2021-11-25 13:47:45 -08:00
parent 77279e1777
commit 1ed6e3f1b0
3 changed files with 13 additions and 3 deletions

2
.gitignore vendored
View File

@ -1,3 +1,5 @@
.DS_Store
.ipynb_checkpoints
__pycache__
data
*.ipynb

View File

@ -4,9 +4,12 @@ The idea is that none of the individual scripts deal directly with the file syst
Any of the file system I/O and the associated settings are in this single file.
"""
import os
import sqlite3, zlib, pickle
from sqlitedict import SqliteDict
DATA_DIR = 'data'
# -----------------------------------------------------------------------------
class CompressedSqliteDict(SqliteDict):
@ -29,8 +32,10 @@ flag='c': default mode, open for read/write, and creating the db/table if necess
flag='r': open for read-only
"""
PAPERS_DB_FILE = 'papers.db' # stores info about papers, and also their lighter-weight metadata
DICT_DB_FILE = 'dict.db' # stores account-relevant info, like which tags exist for which papers
# stores info about papers, and also their lighter-weight metadata
PAPERS_DB_FILE = os.path.join(DATA_DIR, 'papers.db')
# stores account-relevant info, like which tags exist for which papers
DICT_DB_FILE = os.path.join(DATA_DIR, 'dict.db')
def get_papers_db(flag='r', autocommit=True):
assert flag in ['r', 'c']
@ -52,7 +57,8 @@ def get_tags_db(flag='r', autocommit=True):
our "feature store" is currently just a pickle file, may want to consider hdf5 in the future
"""
FEATURES_FILE = 'features.p' # stores tfidf features a bunch of other metadata
# stores tfidf features and a bunch of other metadata
FEATURES_FILE = os.path.join(DATA_DIR, 'features.p')
def save_features(features):
""" takes the features dict and save it to disk in a simple pickle file """

2
data/readme.md Normal file
View File

@ -0,0 +1,2 @@
This directory stores the database files, sequestered away from the code and the main project directory.
For example, it includes the arXiv paper metadata, the calculated tfidf features, and the user tags data.