diff --git a/parser/__init__.py b/parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parser/aggregate_fms_fixies.py b/parser/aggregate_fms_fixies.py new file mode 100644 index 0000000..b4930a4 --- /dev/null +++ b/parser/aggregate_fms_fixies.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python +from __future__ import absolute_import, print_function + +import argparse +import json +import logging +from operator import itemgetter +import os +import sqlite3 +import sys + +import arrow +import pandas as pd + +from constants import (DEFAULT_DAILY_CSV_DIR, DEFAULT_LIFETIME_CSV_DIR, + DEFAULT_DATA_DIR, EARLIEST_DATE, + DB_TABLE_NAMES, TABLE_KEYS) +from utils import get_all_dates, get_daily_csvs_by_date + + +LOGGER = logging.getLogger('aggregate_fms_fixies') +LOGGER.setLevel(logging.INFO) +_handler = logging.StreamHandler() +_formatter = logging.Formatter('%(name)s | %(levelname)s | %(message)s') +_handler.setFormatter(_formatter) +LOGGER.addHandler(_handler) + + +def aggregate_table(table_key, lifetimecsvdir, daily_csvs_by_date, force): + """ + Args: + table_key (str): Unique table identifier; see :obj:`TABLE_KEYS`. + lifetimecsvdir (str) + daily_csvs_by_date (List[Tuple[:class:`arrow.Arrow`, str]]) + force (bool) + """ + lifetime_csv_fname = os.path.join(lifetimecsvdir, table_key + '.csv') + # write a new lifetime file? + if not os.path.isfile(lifetime_csv_fname) or force is True: + lifetime_csv_dates = set() + first_mode = 'w' + # append to existing lifetime file? + else: + lifetime_csv_dates = set( + pd.read_csv(lifetime_csv_fname, usecols=['date'])['date'] + .unique()) + first_mode = 'a' + + is_first = True + for daily_csv_date, fnames in daily_csvs_by_date: + if daily_csv_date.format('YYYY-MM-DD') in lifetime_csv_dates: + LOGGER.debug( + '%s %s table already aggregated -- skipping...', + daily_csv_date, table_key) + df = pd.read_csv(fnames[table_key], header=0) + if is_first is True: + df.to_csv(lifetime_csv_fname, mode=first_mode, header=True, index=False) + is_first = False + else: + df.to_csv(lifetime_csv_fname, mode='a', header=False, index=False) + + +def build_db(dbfile, lifetimecsvdir): + """ + Args: + dbfile (str) + lifetimecsvdir (str) + """ + os.remove(dbfile) + + connection = sqlite3.connect(dbfile) + # bad, but pandas doesn't work otherwise (TODO: check this) + # true, but the perfect is the enemy of the good + connection.text_factory = str + + for table_key, table_name in zip(TABLE_KEYS, DB_TABLE_NAMES): + + lifetime_csv_fname = os.path.join(lifetimecsvdir, table_key + '.csv') + if not os.path.isfile(lifetime_csv_fname): + LOGGER.warning( + 'lifetime CSV %s does not exist -- skipping db table build...', + lifetime_csv_fname) + df = pd.read_csv(lifetime_csv_fname, header=0) + + # HACK: filter out Table V after 2012-04-02 + # TODO: check with cezary podkul about this + if table_name == 't5': + LOGGER.debug( + 'filtering out invalid dates for TABLE V (deprecated as of 2012-04-02)') + max_date = arrow.get('2012-04-02').date() + df['date'] = df['date'].map(lambda x: arrow.get(x).date()) + df = df.loc[df['date'] < max_date, :] + + df.to_sql(table_name, connection, index=False) + + connection.commit() + + +def main(): + parser = argparse.ArgumentParser( + description='Script to aggregate parsed "FMS fixie" files.') + parser.add_argument( + '-s', '--startdate', type=str, default=EARLIEST_DATE.format('YYYY-MM-DD'), + help="""Start of date range over which to parse FMS fixies + as an ISO-formatted string, i.e. 
YYYY-MM-DD.""") + parser.add_argument( + '-e', '--enddate', type=str, default=arrow.utcnow().shift(days=-1).format('YYYY-MM-DD'), + help="""End of date range over which to download FMS fixies + as an ISO-formatted string, i.e. YYYY-MM-DD.""") + parser.add_argument( + '--dailycsvdir', type=str, default=DEFAULT_DAILY_CSV_DIR, + help='Directory on disk from which parsed fixies (daily CSVs) are loaded.') + parser.add_argument( + '--lifetimecsvdir', type=str, default=DEFAULT_LIFETIME_CSV_DIR, + help='Directory on disk to which aggregated fixies (lifetime CSVs) are saved.') + parser.add_argument( + '-db', '--dbfile', type=str, + default=os.path.join(DEFAULT_DATA_DIR, 'treasury_data.db'), + help="""Write aggregated fixies (lifetime CSVs) to the SQL database file + given here, provided the ``nodb`` flag is not specified. + NOTE: DB file is overwritten in its entirety every time.""") + parser.add_argument( + '--loglevel', type=int, default=20, choices=[10, 20, 30, 40, 50], + help='Level of message to be logged; 20 => "INFO".') + parser.add_argument( + '--nodb', default=False, action='store_true', + help="""If True, don't bother (over)writing the SQL database file given + in ``dbfile``.""") + parser.add_argument( + '--force', default=False, action='store_true', + help="""If True, aggregate all parsed fixies in [start_date, end_date], + overwriting any existing lifetime CSVs if they already exist on disk + in ``lifetimecsvdir``. Otherwise, only aggregate un-aggregated fixies.""") + args = parser.parse_args() + + LOGGER.setLevel(args.loglevel) + + # auto-make data directories, if not present + for dir_ in (args.dailycsvdir, args.lifetimecsvdir): + try: + os.makedirs(dir_) + except OSError: # already exists + continue + + # get all valid dates within range, and their corresponding fixie files + all_dates = get_all_dates(args.startdate, args.enddate) + if not all_dates: + LOGGER.warning( + 'no valid dates in range [%s, %s]', + args.startdate, args.enddate) + return + + # get all parsed fixies within range, and sort them by date + daily_csvs_by_date = sorted( + get_daily_csvs_by_date( + all_dates[0], all_dates[-1], data_dir=args.dailycsvdir).items(), + key=itemgetter(0)) + + if not daily_csvs_by_date: + LOGGER.warning( + 'no parsed fixies in range [%s, %s]', + args.startdate, args.enddate) + else: + # iterate over lifetime tables, adding corresponding daily rows as needed + for table_key in TABLE_KEYS: + LOGGER.info( + 'aggregating parsed fixies for %s in range [%s, %s]', + table_key, args.startdate, args.enddate) + aggregate_table( + table_key, args.lifetimecsvdir, daily_csvs_by_date, args.force) + + # build a sqlite database of aggregated tables? + if args.nodb is False: + LOGGER.info( + 'building sqlite db file of aggregated tables at %s', + args.dbfile) + build_db(args.dbfile, args.lifetimecsvdir) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/parser/constants.py b/parser/constants.py new file mode 100644 index 0000000..e97324c --- /dev/null +++ b/parser/constants.py @@ -0,0 +1,38 @@ +import os + +import arrow + + +EARLIEST_DATE = arrow.get('2005-06-09') +""" +:class:`arrow.Arrow`: Earliest date of available fixie files. Note that PDFs +*are* available, for the brave soul who wants to parse them. 
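+
+A minimal usage sketch (illustrative only; ``format`` and ``date`` are standard
+:class:`arrow.Arrow` methods, and the scripts pass the formatted value as their
+argparse defaults)::
+
+    >>> EARLIEST_DATE.format('YYYY-MM-DD')
+    '2005-06-09'
+    >>> EARLIEST_DATE.date()
+    datetime.date(2005, 6, 9)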
+""" + +PARSER_DIR = os.path.dirname(os.path.abspath(__file__)) +DEFAULT_DATA_DIR = os.path.abspath(os.path.join(PARSER_DIR, '..', 'data')) +DEFAULT_FIXIE_DIR = os.path.join(DEFAULT_DATA_DIR, 'fixie') +DEFAULT_DAILY_CSV_DIR = os.path.join(DEFAULT_DATA_DIR, 'daily_csv') +DEFAULT_LIFETIME_CSV_DIR = os.path.join(DEFAULT_DATA_DIR, 'lifetime_csv') + +TABLE_KEYS = ( + 'table_i', + 'table_ii', + 'table_iii_a', + 'table_iii_b', + 'table_iii_c', + 'table_iv', + 'table_v', + 'table_vi', + ) + +DB_TABLE_NAMES = ( + 't1', + 't2', + 't3a', + 't3b', + 't3c', + 't4', + 't5', + 't6', + ) diff --git a/parser/download_and_parse_fms_fixies.py b/parser/download_and_parse_fms_fixies.py deleted file mode 100755 index b85c09e..0000000 --- a/parser/download_and_parse_fms_fixies.py +++ /dev/null @@ -1,199 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function - -import datetime -import download_fms_fixies -import os -import pandas as pd -import pandas.io.sql -import parse_fms_fixies -import re -import sqlite3 -import sys - -# script must be run from fms_parser/parser directory -if not os.path.split(os.getcwd())[-1] == 'parser': - if os.path.split(os.getcwd())[-1] == 'federal-treasury-api' or os.path.split(os.getcwd())[-1] =='fms_parser': - os.chdir('parser') - print('\n*INFO: current working directory set to', os.getcwd()) - else: - raise Exception('This script must be run from the /parser directory!') - -# auto-make data directories, if not present -FIXIE_DIR = os.path.join('..', 'data', 'fixie') -DAILY_CSV_DIR = os.path.join('..', 'data', 'daily_csv') -LIFETIME_CSV_DIR = os.path.join('..', 'data', 'lifetime_csv') -os.system('mkdir -pv ' + FIXIE_DIR) -os.system('mkdir -pv ' + DAILY_CSV_DIR) -os.system('mkdir -pv ' + LIFETIME_CSV_DIR) - -## DOWNLOAD! ################################################################## -# test for existence of downloaded fixies -test_fixies = sorted([f for f in os.listdir(FIXIE_DIR) if f.endswith('.txt')]) -# if none, start from THE BEGINNING -if len(test_fixies) == 0: - start_date = datetime.date(2005, 6, 9) -# else start from last available fixie date -else: - start_date = parse_fms_fixies.get_date_from_fname(test_fixies[-1]) -# always end with today -end_date = datetime.date.today() - -# download all teh fixies! -download_fms_fixies.download_fixies(start_date, end_date) - -# check all downloaded fixies against all parsed csvs -downloaded_files = set([fixie.split('.')[0] for fixie in os.listdir(FIXIE_DIR) if fixie.endswith('.txt')]) -def parsed_files(): - return set([csv.split('_')[0] for csv in os.listdir(DAILY_CSV_DIR) if csv.endswith('.csv')]) - - -## PARSE! ##################################################################### -# fixies that have not yet been parsed into csvs -new_files = sorted(list(downloaded_files.difference(parsed_files()))) - -# parse all teh fixies! 
-for f in new_files: - fname = os.path.join(FIXIE_DIR, f+'.txt') - dfs = parse_fms_fixies.parse_file(fname, verbose=False) - - # each table for each date stored in separate csv files - for df in dfs.values(): - try: - t_name = df.ix[0,'table'] - t_name_match = re.search(r'TABLE [\w-]+', t_name) - t_name_short = re.sub(r'-| ', '_', t_name_match.group().lower()) - except Exception as e: - print('***ERROR: tables failed to parse!', e) - # go on - continue - - daily_csv = os.path.join(DAILY_CSV_DIR, f.split('.')[0]+'_'+t_name_short+'.csv') - df.to_csv(daily_csv, index=False, header=True, encoding='utf-8', na_rep='') - -# iterate over all fms tables -for i in ['i', 'ii', 'iii_a', 'iii_b', 'iii_c', 'iv', 'v', 'vi']: - - # create the lifetime csv files it they don't exist - lifetime_csv = os.path.join(LIFETIME_CSV_DIR, 'table_'+str(i)+'.csv') - - # if it doesn't exist - if not os.path.isfile(lifetime_csv): - lifetime = open(lifetime_csv, 'ab') - # add the header - lifetime.write(open(os.path.join(DAILY_CSV_DIR, list(parsed_files())[0]+'_table_'+str(i)+'.csv')).readline()) - lifetime.close() - - # append new csvs to lifetime csvs - for f in new_files: - - # we have no idea why it's giving us a blank file - if len(f) == 0: continue - - daily_csv = os.path.join(DAILY_CSV_DIR, f.split('.')[0]+'_table_'+str(i)+'.csv') - if not os.path.isfile(daily_csv): continue - - lifetime = open(lifetime_csv, 'ab') - daily = open(daily_csv, 'rb') - - daily.readline() # burn header - for line in daily: - lifetime.write(line) - daily.close() - - -## SQL-IZE! ################################################################### -TABLES = [ - { - 'raw-table': 'i', - 'new-table': 't1', - }, - { - 'raw-table': 'ii', - 'new-table': 't2', - }, - { - 'raw-table': 'iii_a', - 'new-table': 't3a', - }, - { - 'raw-table': 'iii_b', - 'new-table': 't3b', - }, - { - 'raw-table': 'iii_c', - 'new-table': 't3c', - }, - { - 'raw-table': 'iv', - 'new-table': 't4', - }, - { - 'raw-table': 'v', - 'new-table': 't5', - }, - { - 'raw-table': 'vi', - 'new-table': 't6', - }, -] - -# delete the db and promptly rewrite it from csvs -print("INFO: building sqlite database") -db = os.path.join('..', 'data', 'treasury_data.db') -os.system("rm " + db) - -connection = sqlite3.connect(db) -connection.text_factory = str # bad, but pandas doesn't work otherwise - -for table in TABLES: - df = pandas.read_csv(os.path.join('..', 'data', 'lifetime_csv', 'table_%s.csv' % table['raw-table'])) - - # WARNING SERIOUS HACKS FOLLOW # - # FILTER OUT TABLE 5 AFTER 2012-04-02 - HACK BUT WORKS FOR NOW # - if table['new-table']=="t5": - print("INFO: filtering out invalid dates for TABLE V (deprecated as of 2012-04-02) ") - table_v_end = datetime.date(2012, 4, 2) - df.date = df.date.apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").date()) - df = df[df.date < table_v_end] - - #pandas.io.sql.write_frame(df, table['new-table'], connection) - df.to_sql(table['new-table'], connection, index=False) - -# Commit -connection.commit() - - -## CELEBRATE! ################################################################# -csv_txt = r""" - ,----.. .--.--. - / / \ / / '. ,---. -| : :| : /`. / /__./| -. | ;. /; | |--` ,---.; ; | -. ; /--` | : ;_ /___/ \ | | -; | ; \ \ `.\ ; \ ' | -| : | `----. \\ \ \: | -. | '___ __ \ \ | ; \ ' . -' ; : .'| / /`--' / \ \ ' -' | '/ :'--'. / \ ` ; -| : / `--'---' : \ | - \ \ .' '---" - `---` -""" -soundsystem_txt = r""" -.-. .-. . . . . .-. .-. . . .-. .-. .-. . . -`-. | | | | |\| | )`-. | `-. 
| |- |\/| -`-' `-' `-' ' ` `-' `-' ` `-' ' `-' ' ` -""" -welcome_msg = r""" -Everything you just downloaded is in the data/ directory. -The raw files are in data/fixie. -They were parsed and converted to CSVs in the data/daily_csv directory. -These are combined by table in the data/lifetime_csv directory. -Those tables were made into a SQLite database at data/treasury_data.db, which you can load using your favorite SQLite viewer. -If you have any questions, check out http://treasury.io for usage and a link to the support Google Group. -""" -print(csv_txt) -print(soundsystem_txt) -print('*http://csvsoundsystem.com/') -print(welcome_msg) diff --git a/parser/download_fms_fixies.py b/parser/download_fms_fixies.py index 707d55f..fbabb72 100644 --- a/parser/download_fms_fixies.py +++ b/parser/download_fms_fixies.py @@ -1,127 +1,156 @@ #!/usr/bin/env python -import codecs +from __future__ import absolute_import, print_function + +import argparse import datetime +import io +import logging import os +import random +import sys +import time + +import arrow import pandas as pd import requests -import sys + +from constants import DEFAULT_FIXIE_DIR, EARLIEST_DATE +from utils import get_all_dates, get_fixies_by_date + + +LOGGER = logging.getLogger('download_fms_fixies') +LOGGER.setLevel(logging.INFO) +_handler = logging.StreamHandler() +_formatter = logging.Formatter('%(name)s | %(levelname)s | %(message)s') +_handler.setFormatter(_formatter) +LOGGER.addHandler(_handler) BASE_URL = 'https://www.fms.treas.gov/fmsweb/viewDTSFiles' -SAVE_DIR = os.path.join('..', 'data', 'fixie') -HOLIDAYS = [datetime.datetime.strptime(d, '%Y%m%d').date() for d in [ - '20050117', '20050221', '20050530', '20050704', '20050905', '20051010', '20051111', '20051124', '20051226', - '20060102', '20060116', '20060220', '20060529', '20060704', '20060904', '20061009', '20061110', '20061123', '20061225', - '20070101', '20070115', '20070219', '20070528', '20070704', '20070903', '20071008', '20071112', '20071122', '20071225', - '20080101', '20080121', '20080218', '20080526', '20080704', '20080901', '20081013', '20081111', '20081127', '20081225', - '20090101', '20090119', '20090216', '20090525', '20090703', '20090907', '20091012', '20091111', '20091126', '20091225', - '20100101', '20100118', '20100215', '20100531', '20100705', '20100906', '20101011', '20101111', '20101125', '20101224', - '20101231', '20110117', '20110221', '20110530', '20110704', '20110905', '20111010', '20111111', '20111124', '20111226', - '20120102', '20120116', '20120220', '20120528', '20120704', '20120903', '20121008', '20121112', '20121122', '20121225', - '20130101', '20130121', '20130218', '20130527', '20130704', '20131014', '20131111', '20131128', '20131225' - ]] - -################################################################################ -def check_dates(start_date, end_date): - # fixie files not available before this date - # PDFs *are* available, for the brave soul who wants to parse them - earliest_date = datetime.date(2005, 6, 9) - if start_date < earliest_date: - print '\n**WARNING:', start_date, 'before earliest available date (', - print str(earliest_date), ')' - print '... 
setting start_date to', str(earliest_date) - start_date = earliest_date - if start_date > end_date: - temp = start_date - start_date = end_date - end_date = temp - - return start_date, end_date - -################################################################################ -def generate_date_range(start_date, end_date): - start_date, end_date = check_dates(start_date, end_date) - dates = [] - td = datetime.timedelta(days=1) - current_date = start_date - while current_date <= end_date: - dates.append(current_date) - current_date += td - return dates - -################################################################################ -def remove_weekends_and_holidays(all_dates): - good_dates = [date for date in all_dates - if datetime.datetime.strftime(date, '%A') not in ['Saturday', 'Sunday'] - and date not in HOLIDAYS] - return good_dates - -################################################################################ -def request_fixie(fname): - response = requests.get(BASE_URL, - params={'dir': 'a', - 'fname': fname} - ) - if response.status_code == 200: - return response.text - # check in working directory instead - else: - response = requests.get(BASE_URL, - params={'dir': 'w', - 'fname': fname} - ) - if response.status_code == 200: - return response.text - else: - return None - -################################################################################ -def request_all_fixies(fnames): - for fname in reversed(fnames): - alt_fnames = [fname] - alt_fnames.extend([fname[:-5] + i +'.txt' for i in ['1', '2', '3']]) - for alt_fname in alt_fnames: - fixie = request_fixie(alt_fname) - if fixie: - print 'INFO: saving', os.path.join(SAVE_DIR, alt_fname) - f = codecs.open(os.path.join(SAVE_DIR, alt_fname), 'wb', 'utf-8') - f.write(fixie) - f.close() - break - - if fixie is None: - print 'WARNING:', fname, '(', - print str(datetime.datetime.strptime(fname[:6], '%y%m%d').date()), - print ')', 'not available' - - return fnames - -################################################################################ -def download_fixies(start_date, end_date=None): - start_date = datetime.datetime.strptime(str(start_date), '%Y-%m-%d').date() - if end_date: - end_date = datetime.datetime.strptime(str(end_date), '%Y-%m-%d').date() - else: - end_date = start_date - - all_dates = generate_date_range(start_date, end_date) - print '\nINFO: Downloading FMS fixies from', all_dates[0], 'to', all_dates[-1], "!\n" - - good_dates = remove_weekends_and_holidays(all_dates) - fnames = [''.join([datetime.datetime.strftime(date, '%y%m%d'), '00.txt']) for date in good_dates] - request_all_fixies(fnames) - -################################################################################ -if __name__ == '__main__': - try: - start_date = datetime.datetime.strptime(str(sys.argv[1]), '%Y-%m-%d').date() - except IndexError: - print 'ERROR: must provide date as argument!' - sys.exit() - try: - end_date = datetime.datetime.strptime(str(sys.argv[2]), '%Y-%m-%d').date() - except IndexError: - end_date = start_date - download_fixies(start_date, end_date) +def generate_fixie_fnames(dates): + """ + Generate likely fixie filenames for each date in ``dates``. Fnames are + constructed by combining the date formatted as 'YYMMDD', a zero-padded digit + string like '00', and the filetype, '.txt'. 
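+
+    For example, a hypothetical date of 2013-07-05 would yield the candidate
+    filenames tried for that day (illustrative values only)::
+
+        >>> list(generate_fixie_fnames([arrow.get('2013-07-05')]))
+        [('13070500.txt', '13070501.txt', '13070502.txt', '13070503.txt')]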
+ + Args: + dates (Iterable[:class:`arrow.Arrow`]) + + Yields: + Tuple[str] + """ + suffixes = ('00.txt', '01.txt', '02.txt', '03.txt') + for dt in dates: + yield tuple(dt.format('YYMMDD') + suffix for suffix in suffixes) + + +def request_all_fixies(all_fnames, data_dir): + """ + Args: + all_fnames (Iterable[Tuple[str]]) + data_dir (str) + """ + for fnames in all_fnames: + time.sleep(0.1 + 0.1 * random.random()) + success = False + for fname in fnames: + fixie = request_fixie(fname) + if fixie: + filepath = os.path.join(data_dir, fname) + with io.open(filepath, mode='wb') as f: + f.write(fixie) + LOGGER.debug('%s fixie saved to %s', fname, filepath) + success = True + break + if success is False: + LOGGER.warning( + '%s fixie (%s) not available', + fnames[0], + str(datetime.datetime.strptime(fname[:6], '%y%m%d').date())) + +def request_fixie(fname): + """ + Request fixie files from FMS server, checking in 2 different directories + where they are likely to be stored. + + Args: + fname (str) + + Returns: + str or None + """ + for dir_ in ('a', 'w'): + response = requests.get(BASE_URL, params={'dir': dir_, 'fname': fname}) + if response.status_code == 200: + return response.text + return None + + +def main(): + parser = argparse.ArgumentParser( + description="""Script to download "FMS fixie" files for all non-weekend, + non-holiday dates between ``startdate`` and ``enddate``.""") + parser.add_argument( + '-s', '--startdate', type=str, default=EARLIEST_DATE.format('YYYY-MM-DD'), + help="""Start of date range over which to download FMS fixies + as an ISO-formatted string, i.e. YYYY-MM-DD.""") + parser.add_argument( + '-e', '--enddate', type=str, default=arrow.utcnow().shift(days=-1).format('YYYY-MM-DD'), + help="""End of date range over which to download FMS fixies + as an ISO-formatted string, i.e. YYYY-MM-DD.""") + parser.add_argument( + '--fixiedir', type=str, default=DEFAULT_FIXIE_DIR, + help='Directory on disk to which fixies (raw text) are saved.') + parser.add_argument( + '--loglevel', type=int, default=20, choices=[10, 20, 30, 40, 50], + help='Level of message to be logged; 20 => "INFO".') + parser.add_argument( + '--force', default=False, action='store_true', + help="""If True, download all fixies in [start_date, end_date], even if + the resulting files already exist on disk in ``datadir``. 
+ Otherwise, only download un-downloaded fixies.""") + args = parser.parse_args() + + LOGGER.setLevel(args.loglevel) + + # auto-make data directory, if not present + try: + os.makedirs(args.fixiedir) + except OSError: # already exists + pass + + # get all valid dates within range + all_dates = get_all_dates(args.startdate, args.enddate) + if not all_dates: + LOGGER.warning( + 'no valid dates in range [%s, %s]', + args.startdate, args.enddate) + return + + # if force is False, only download fixies that haven't yet been downloaded + if args.force is False: + fixie_dates = set( + get_fixies_by_date( + all_dates[0], all_dates[-1], data_dir=args.fixiedir + ).keys()) + if fixie_dates: + fixie_dates = sorted(set(all_dates).difference(fixie_dates)) + else: + fixie_dates = all_dates + + if not fixie_dates: + LOGGER.warning( + 'no un-requested fixies in range [%s, %s]', + args.startdate, args.enddate) + else: + LOGGER.info( + 'requesting %s fixies in range [%s, %s] and saving them to %s', + len(fixie_dates), args.startdate, args.enddate, args.fixiedir) + fixie_fnames = generate_fixie_fnames(fixie_dates) + request_all_fixies(fixie_fnames, args.fixiedir) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/parser/errant_footnote_patterns.txt b/parser/errant_footnote_patterns.txt index 6c62226..5ae7017 100644 --- a/parser/errant_footnote_patterns.txt +++ b/parser/errant_footnote_patterns.txt @@ -13,4 +13,4 @@ .*Treasury reduced the amount of Debt Subject.* .*r/ Revised.* .*Statutory debt limit temporarily suspended through February.* -.*\s*was renamed to.* \ No newline at end of file +.*\s*was renamed to.* diff --git a/parser/parse_fms_fixies.py b/parser/parse_fms_fixies.py index 62c116a..fd8ad96 100644 --- a/parser/parse_fms_fixies.py +++ b/parser/parse_fms_fixies.py @@ -1,609 +1,734 @@ #!/usr/bin/env python -import json +from __future__ import absolute_import, print_function + +import argparse import datetime -import pandas as pd +import io +import json +import logging +from operator import itemgetter +import os import re +import sys + +import arrow +import pandas as pd import requests -# Global Vars -NORMALIZE_FIELD_TABLE = json.load(open("../parser/normalize_field_table.json")) +from .constants import (DEFAULT_FIXIE_DIR, DEFAULT_DAILY_CSV_DIR, + EARLIEST_DATE, PARSER_DIR, TABLE_KEYS) +from .utils import (get_all_dates, get_date_from_fname, + get_daily_csvs_by_date, get_fixies_by_date) -T4_USE_ITEMS = [ - 'Tax and Loan Accounts', - 'Inter agency Transfers', - 'Federal Reserve Account Direct', - 'Federal Reserve Account Total', - 'Federal Reserve Account Depositaries' -] -ERRANT_FOOTNOTE_PATTERNS = [p for p in open("../parser/errant_footnote_patterns.txt").read().split('\n') if p is not ''] +LOGGER = logging.getLogger('parse_fms_fixies') +LOGGER.setLevel(logging.INFO) +_handler = logging.StreamHandler() +_formatter = logging.Formatter('%(name)s | %(levelname)s | %(message)s') +_handler.setFormatter(_formatter) +LOGGER.addHandler(_handler) -NULL_TEST_PARAMS = json.load(open("../tests/null_test_params.json")) +T4_USE_ITEMS = { + 'Tax and Loan Accounts', + 'Inter agency Transfers', + 'Federal Reserve Account Direct', + 'Federal Reserve Account Total', + 'Federal Reserve Account Depositaries', + } + +with io.open(os.path.join(PARSER_DIR, 'normalize_field_table.json'), mode='rt') as f: + NORMALIZE_FIELD_TABLE = json.load(f) + +with io.open(os.path.join(PARSER_DIR, 'errant_footnote_patterns.txt'), mode='rt') as f: + ERRANT_FOOTNOTE_PATTERNS = tuple( + line.strip() for line in f.readlines() if 
line.strip()) + +# TODO +# with io.open('../tests/null_test_params.json', mode='rt') as f: +# NULL_TEST_PARAMS = json.load(f) re_net = re.compile(".*\(.*net.*\).*", flags=re.IGNORECASE) re_net_remove = re.compile('\(.*net.*\)', flags=re.IGNORECASE) -################################################################################ + def is_errant_footnote(line): - return any([re.search(p, line, flags=re.IGNORECASE) for p in ERRANT_FOOTNOTE_PATTERNS]) + return any(re.search(p, line, flags=re.IGNORECASE) + for p in ERRANT_FOOTNOTE_PATTERNS) + -################################################################################ def normalize_fields(text, table, field): - table_lookup = NORMALIZE_FIELD_TABLE[table] - try: - value_lookup = table_lookup[field] - except KeyError: - return text - else: - try: - value = value_lookup[text] - except KeyError: - return text - else: - return value - -################################################################################ -def get_date_from_fname(f_name): - raw_date = re.search(r'(\d+).txt', f_name).group(1) - date = datetime.date(2000+int(raw_date[0:2]), int(raw_date[2:4]), int(raw_date[4:6])) - return date - - -################################################################################ + table_lookup = NORMALIZE_FIELD_TABLE[table] + try: + value_lookup = table_lookup[field] + except KeyError: + return text + else: + try: + value = value_lookup[text] + except KeyError: + return text + else: + return value + + def get_table_name(line): - try: - table_line = re.search(r'\s+TABLE\s+[\w-]+.*', line).group() - table_name = table_line.strip() - except AttributeError: - table_name = None - return table_name - -################################################################################ -def normalize_page_text(page): - # ignore unicode errors - # i.e. remove superscript 3 symbols ('\xc2\xb3') by way of ignoring their errors - # hopefully this doesn't have any undesirable side-effects - page = re.sub("\xc2\xa0|\xc2\xb3", "", page) - # split on line breaks, usually '\r\n' and rarely just '\n' - lines = re.split(r'\r\n|\n', page) - # get rid of pipe delimiters and divider lines - lines = [re.sub(r'^ \|', ' ', line) for line in lines] - lines = [re.sub(r'\|', '', line) for line in lines] - lines = [re.sub(r'\s?_{5,}', '', line) for line in lines] - # get rid of dollar signs and thousand commas - lines = [re.sub(r'\$', '', line) for line in lines] - lines = [re.sub(r'(\d),(\d)', r'\1\2', line) for line in lines] - # normalize non-leading white space - lines = [line[:6] + re.sub(r'\s{2,}', ' ', line[6:]) for line in lines] - lines = [line.rstrip() for line in lines] - # get rid of blank lines - lines = [line for line in lines if line!='' and line!=' '] - return lines - -################################################################################ + try: + table_line = re.search(r'\s+TABLE\s+[\w-]+.*', line).group() + table_name = table_line.strip() + except AttributeError: + table_name = None + return table_name + + def get_footnote(line): - footnote = re.search(r'^\s*(\d)\/([\w\s\./,]+.*)', line) - if footnote: - return [footnote.group(1), footnote.group(2)] - return None + footnote = re.search(r'^\s*(\d)\/([\w\s\./,]+.*)', line) + if footnote: + return [footnote.group(1), footnote.group(2)] + return None + + +def normalize_page_text(page): + # ignore unicode errors + # i.e. 
remove superscript 3 symbols ('\xc2\xb3') by way of ignoring their errors + # hopefully this doesn't have any undesirable side-effects + page = re.sub("\xc2\xa0|\xc2\xb3", "", page) + # split on line breaks, usually '\r\n' and rarely just '\n' + lines = re.split(r'\r\n|\n', page) + # get rid of pipe delimiters and divider lines + lines = [re.sub(r'^ \|', ' ', line) for line in lines] + lines = [re.sub(r'\|', '', line) for line in lines] + lines = [re.sub(r'\s?_{5,}', '', line) for line in lines] + # get rid of dollar signs and thousand commas + lines = [re.sub(r'\$', '', line) for line in lines] + lines = [re.sub(r'(\d),(\d)', r'\1\2', line) for line in lines] + # normalize non-leading white space + lines = [line[:6] + re.sub(r'\s{2,}', ' ', line[6:]) for line in lines] + lines = [line.rstrip() for line in lines] + # get rid of blank lines + lines = [line for line in lines if line != '' and line != ' '] + return lines + -################################################################################ def check_fixie_url(url): - print "INFO: checking %s to make sure it's valid" % url - r = requests.get(url) - if r.status_code==200: - return url - else: - # what directory are we in? - bad_dir = re.search('.*dir=([aw])$', url).group(1) - if bad_dir == 'a': - good_dir = 'w' - elif bad_dir == 'w': - good_dir = 'a' - return re.sub("dir="+bad_dir, "dir="+good_dir, url) - -################################################################################ -def gen_fixie_url(f_name, date): - # simplify file name for url creation - new_f_name = re.sub(r'\.\./data/fixie/', '', f_name) - - # arbitrary cutoff to determine archive and working directories - rolling_cutoff = datetime.datetime.now().date() - datetime.timedelta(days=50) - if date < rolling_cutoff: - f_dir = "a" - else: - f_dir = "w" - - # format the url - url = "https://www.fms.treas.gov/fmsweb/viewDTSFiles?fname=%s&dir=%s" % (new_f_name, f_dir) - - # now lets check urls that fall within 15 days before and after our rolling cutoff - check_cutoff_start = rolling_cutoff - datetime.timedelta(days=15) - check_cutoff_end = rolling_cutoff + datetime.timedelta(days=15) - if date > check_cutoff_start and date < check_cutoff_end: - url = check_fixie_url(url) - - return url - -################################################################################ + LOGGER.debug("checking %s to make sure it's valid", url) + r = requests.head(url) + if r.status_code == 200: + return url + else: + # what directory are we in? 
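+        # fixies live in either the archive ('a') or working ('w') directory on
+        # the FMS server; when the guessed URL doesn't resolve, flip the ``dir``
+        # param to the other value, e.g. (illustrative)
+        # '...viewDTSFiles?fname=13070500.txt&dir=a' -> '...&dir=w'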
+ bad_dir = re.search('.*dir=([aw])$', url).group(1) + if bad_dir == 'a': + good_dir = 'w' + elif bad_dir == 'w': + good_dir = 'a' + return re.sub("dir=" + bad_dir, "dir=" + good_dir, url) + + +def gen_fixie_url(fname, date): + """Super awkward function whose purpose isn't entirely clear.""" + # split the filename from the rest of its path + _, fname = os.path.split(fname) + + # arbitrary cutoff to determine archive and working directories + rolling_cutoff = arrow.utcnow().shift(days=-50) + if date < rolling_cutoff.date(): + f_dir = "a" + else: + f_dir = "w" + + # format the url + url = 'https://www.fms.treas.gov/fmsweb/viewDTSFiles?fname={}&dir={}'.format(fname, f_dir) + + # now lets check urls that fall within 15 days before and after our rolling cutoff + check_cutoff_start = rolling_cutoff.shift(days=-15).date() + check_cutoff_end = rolling_cutoff.shift(days=15).date() + if date > check_cutoff_start and date < check_cutoff_end: + url = check_fixie_url(url) + + return url + + def check_for_nulls(df, table): - print "TO DO" - # test_params = NULL_TEST_PARAMS[table] - # null_rows = [] - # for v in test_params["values"]: - # null_row = df.loc(i, ) for i in df.index if pd.isnull(df[v][i]) - # null_rows.append(null_row) - # null_field_values = [] - # for f in test_params['fields'] - # [r[f] for r in null_rows - - -################################################################################ -def parse_file(f_name, verbose=False): - f = open(f_name, 'rb').read() - - #raw_tables = re.split(r'(\s+TABLE\s+[\w-]+.*)', f) - raw_tables = re.split(r'([\s_]+TABLE[\s_]+[\w_-]+.*)', f) - tables = [] - for raw_table in raw_tables[1:]: - #if re.search(r'\s+TABLE\s+[\w-]+.*', raw_table): - if re.search(r'([\s_]+TABLE[\s_]+[\w_-]+.*)', raw_table): - table_name = raw_table - # fix malformed fixie table names, BLERGH GOV'T! - table_name = re.sub(r'_+', ' ', table_name) - continue - raw_table = table_name + raw_table - table = normalize_page_text(raw_table) - tables.append(table) - - # file metadata - date = get_date_from_fname(f_name) - url = gen_fixie_url(f_name, date) - - print 'INFO: parsing', f_name, '(', date, ')' - dfs = {} - for table in tables: - table_index = tables.index(table) - dfs[table_index] = parse_table(table, date, url, verbose=verbose) - - return dfs - -################################################################################ -def parse_table(table, date, url, verbose=False): - - # table defaults - t4_total_count = 0 - indent = 0 - footnotes = {} - index = surtype_index = type_index = subtype_index = used_index = -1 - type_indent = subtype_indent = -1 - page_number = -1 - type_ = subtype = None - table_name = None - - # total hack for when the treasury decided to switch - # which (upper or lower) line of two-line items gets the 0s - # NOTE: THIS IS ONLY FOR TABLE I, BECAUSE OF COURSE - if date > datetime.date(2013, 1, 3) or date < datetime.date(2012, 6, 1): - two_line_delta = 1 - else: - two_line_delta = -1 - - parsed_table = [] - for i, line in enumerate(table): - # print '|' + line + '|', '<', i, '>' - row = {} - # a variety of date formats -- for your convenience - row['date'] = date - row['year'] = date.year - row['month'] = date.month - row['day'] = date.day - row['year_month'] = datetime.date.strftime(date, '%Y-%m') - row['weekday'] = datetime.datetime.strftime(date, '%A') - row['url'] = url - - # what's our line number? shall we bail out? 
- index += 1 - if index <= used_index: continue - indent = len(re.search(r'^\s*', line).group()) - - # Rows that we definitely want to skip - # empty rows or centered header rows - if re.match(r'^\s{7,}', line): continue - - # page number rows - page_number_match = re.search(r'\d+.*DAILY\s+TREASURY\s+STATEMENT.*PAGE:\s+(\d+)', line) - if page_number_match: - page_number = page_number_match.group(1) - continue - - # HARD CODED HACKS - # catch rare exceptions to the above - if re.search(r'DAILY\s+TREASURY\s+STATEMENT', line): - continue #ok - # comment on statutory debt limit at end of Table III-C, and beyond - elif re.search(r'(As|Act) of ([A-Z]\w+ \d+, \d+|\d+\/\d+\/\d+)', line) and re.search(r'(statutory )*debt( limit)*', line): - break #ok - # comment on whatever this is; above line may make this redundant - elif re.search(r'\s*Unamortized Discount represents|amortization is calculated daily', line, flags=re.IGNORECASE): - break #ok - # more cruft of a similar sort - elif re.search(r'billion after \d+\/\d+\/\d+', line): - continue #ok - elif re.search(r'.*r\-revised.*', line): - continue #ok - elif is_errant_footnote(line): - break #ok - - # skip table header rows - if get_table_name(line): - table_name = get_table_name(line) - continue - - row['table'] = table_name - - # save footnotes for later assignment to their rows - footnote = get_footnote(line) - - if footnote is not None: - # while footnote does not end in valid sentence-ending punctuation... - i = 1 - while True: - # get next line, if it exists - try: - next_line = table[index + i] - except IndexError: - break - # and next line is not itself a new footnote... - else: - if re.search('\d+.*DAILY\s+TREASURY\s+STATEMENT.*PAGE:\s+(\d+)', next_line): - break #ok - if not get_footnote(next_line): - # add next line text to current footnote - footnote[1] = ''.join([footnote[1], next_line]) - used_index = index + i - i += 1 - if footnote[1].endswith("program."): - continue #ok - elif re.search(r'[.!?]$', footnote[1]): - break #ok - - # make our merged footnote hack official! 
- footnotes[footnote[0]] = re.sub("\s{2,}", "", footnote[1]) - - # if next line after footnote is not another footnote - # it is most assuredly extra comments we don't need - try: - last_line = table[index + i] - - except IndexError: - break #ok - - else: - if re.search('\d+.*DAILY\s+TREASURY\s+STATEMENT.*PAGE:\s+(\d+)', last_line): - continue #ok - elif re.search(r'\.aspx\.', last_line): - continue #ok - elif not get_footnote(last_line): - break #ok - - # *****THIS LINE MUST BE HERE TO ENSURE THAT FOOTNOTES AREN'T INCLUDED AS ITEMS ******# - continue - - # note rows with footnote markers for later assignment - if re.search(r'\d+\/', line): - row['footnote'] = re.search(r'(\d+)\/', line).group(1) - - # separate digits and words - digits = re.findall(r'(-{,1}\d+)', line) - words = re.findall(r'\(\-\)|[()]|[^\W\d]+:?', line) - - # check for (-) in words => multiply all digits by -1 - if '(-)' in words: - digits = [str((-1)*int(digit)) for digit in digits] - - # bug fix, to remove the govt's usage of 'r/' in front of numbers - # to denote revised values, and the abhorrent usage of '(-)'' - text = ' '.join(word for word in words if word not in ['r', '(-)']) - - # get type row - if len(digits) == 0 and text.endswith(':') and indent == 1: - type_ = text[:-1] - type_indent = indent - type_index = index - continue - - elif indent <= type_indent: - type_ = None - - row['type'] = type_ - - # special handling for table 3c - if re.search(r'TABLE III-C', row.get('table', '')): - if re.search(r'Less: Debt Not', text): - subtype = 'Debt Not Subject to Limit' - subtype_indent = indent - subtype_index = index - continue - elif re.search(r'Plus: Other Debt', text): - subtype = 'Other Debt Subject to Limit' - subtype_indent = indent - subtype_index = index - continue - # get subtype row - elif len(digits) == 0 and text.endswith(':'): - subtype = text[:-1] - subtype_indent = indent - subtype_index = index - continue - - if index == subtype_index + 1: - pass # possibly unnecessary - elif indent <= subtype_indent: - subtype = None - - row['subtype'] = subtype - - # get and merge two-line rows - if len(digits) == 0 and not text.endswith(':'): - - if two_line_delta == 1 or not re.search(r'TABLE I\s', row.get('table', '')): - - try: - next_line = table[index + 1] - - # check for footnotes, then note and erase them if present! - if re.search(r'\d+\/', next_line): - row['footnote'] = re.search(r'(\d+)\/', next_line).group(1) - next_line = re.sub(r'\d+\/', '', next_line) - - next_digits = re.findall(r'(\d+)', next_line) - next_words = re.findall(r'[^\W\d]+:?', next_line) - - if len(next_digits) != 0: - text = text + ' ' + ' '.join(next_words) - digits = next_digits - used_index = index + 1 - - except IndexError: - pass - - elif two_line_delta == -1 and re.search(r'TABLE I\s', row.get('table', '')): - - try: - prev_line = table[index - 1] - prev_digits = re.findall(r'(\d+)', prev_line) - prev_words = re.findall(r'[^\W\d]+:?', prev_line) - - if len(prev_digits) != 0: - text = ' '.join(prev_words) + ' ' + text - digits = prev_digits - get_rid_of_prev_line = parsed_table.pop() - - except IndexError: - pass - - # skip table annotations that aren't footnotes - # this is a band-aid at best, sorry folks - if len(digits) == 0: - continue - if len(text) > 80: - continue - - row['is_total'] = int('total' in text.lower()) - - # parse one table at a time... 
- if re.search(r'TABLE I\s', row.get('table', '')): - try: - row['account_raw'] = text - row['account'] = normalize_fields(text, 't1', 'account') - row['close_today'] = digits[-4] - row['open_today'] = digits[-3] - row['open_mo'] = digits[-2] - row['open_fy'] = digits[-1] - except: - if verbose is True: - print 'WARNING:', line - - elif re.search(r'TABLE II\s', row.get('table', '')): - try: - row['item_raw'] = text - - # determine whether item is calculated as a net - if re_net.search(text): - row['is_net'] = 1 - else: - row['is_net'] = 0 - - # remove net from items - text = re_net_remove.sub("", text).strip() - - # proceed - row['item'] = normalize_fields(text, 't2', 'item') - row['today'] = digits[-3] - row['mtd'] = digits[-2] - row['fytd'] = digits[-1] - # tweak column names - row['account'] = row.get('type') - # this is a hack, deal with it :-/ - row['transaction_type'] = 'deposit' - if int(page_number) == 3: - row['transaction_type'] = 'withdrawal' - # now handle items with sub-classification - if row.get('subtype') is not None: - row_subtype = row['subtype'] - row_item = row['item'] - row['parent_item'] = row_subtype - row['item'] = row_item - row['item_raw'] = row_item_raw - row.pop('subtype') - except: - if verbose is True: - print 'WARNING:', line - - elif re.search(r'TABLE III-A', row.get('table', '')): - try: - row['item_raw'] = text - row['item'] = normalize_fields(text, "t3a", 'item') - row['today'] = digits[-3] - row['mtd'] = digits[-2] - row['fytd'] = digits[-1] - # tweak column names - row['debt_type'] = row.get('type') - # now handle items with sub-classification - if row.get('subtype') is not None: - row_subtype = row['subtype'] - row_item = row['item'] - row['parent_item'] = row_subtype - row['item'] = row_item - row['item_raw'] = row_item_raw - row.pop('subtype') - except: - if verbose is True: - print 'WARNING:', line - - elif re.search(r'TABLE III-B', row.get('table', '')): - try: - row['item_raw'] = text - row['item'] = normalize_fields(text, "t3b", 'item') - row['today'] = digits[-3] - row['mtd'] = digits[-2] - row['fytd'] = digits[-1] - # tweak column names - row['transaction_type'] = row.get('type') - # now handle items with sub-classification - if row.get('subtype') is not None: - row_subtype = row['subtype'] - row_item = row['item'] - row['parent_item'] = row_subtype - row['item'] = row_item - row['item_raw'] = row_item_raw - row.pop('subtype') - except: - if verbose is True: - print 'WARNING:', line - - elif re.search(r'TABLE III-C', row.get('table', '')): - try: - row['item_raw'] = text - row['item'] = normalize_fields(text, 't3c', 'item') - row['close_today'] = digits[-4] - row['open_today'] = digits[-3] - row['open_mo'] = digits[-2] - row['open_fy'] = digits[-1] - # now handle items with sub-classification - if row.get('subtype') is not None: - row['parent_item'] = row['subtype'] - row.pop('subtype') - except: - if verbose is True: - print 'WARNING:', line - - elif re.search(r'TABLE IV', row.get('table', '')): - try: - row['type'] = '' - row['classification_raw'] = text - this_class = normalize_fields(text, 't4', 'classification') - row['classification'] = this_class - row['today'] = digits[-3] - row['mtd'] = digits[-2] - row['fytd'] = digits[-1] - # increment Total counts - if this_class == "Total": t4_total_count += 1 - # assign source and use types - if t4_total_count == 1 and this_class == "Total": - row['type'] = "source" - elif t4_total_count == 2 and this_class == "Total": - row['type'] = "use" - elif this_class not in T4_USE_ITEMS: - row['type'] = 
"source" - else: - row['type'] = "use" - except: - if verbose is True: - print 'WARNING:', line - - elif re.search(r'TABLE V\s', row.get('table', '')): - try: - row['balance_transactions'] = text - row['depositary_type_a'] = digits[-4] - row['depositary_type_b'] = digits[-3] - row['depositary_type_c'] = digits[-2] - row['total'] = digits[-1] - # tweak column names - row['transaction_type'] = row.get('type') - except: - if verbose is True: - print 'WARNING:', line - - elif re.search(r'TABLE VI', row.get('table', '')): - try: - row['refund_type_raw'] = text - row['refund_type'] = normalize_fields(text, 't6', 'classification') - row['today'] = digits[-3] - row['mtd'] = digits[-2] - row['fytd'] = digits[-1] - if '( eft )' in row.get('refund_type_raw', '').lower(): - row['refund_method'] = 'EFT' - - elif '( checks )' in row.get('refund_type_raw', '').lower(): - row['refund_method'] = 'CHECKS' - except: - if verbose is True: - print 'WARNING:', line - - parsed_table.append(row) - - # assign footnotes to rows - # and split table III-a by surtype - for row in parsed_table: - if row.get('footnote'): - row['footnote'] = footnotes.get(row['footnote']) - if row.get('item'): - if row['item'].lower().strip() == 'total issues': - surtype_index = parsed_table.index(row) - row['transaction_type'] = 'issue' - - # after-the-fact surtype assignment - if surtype_index != -1: - for row in parsed_table[:surtype_index]: - row['transaction_type'] = 'issue' - for row in parsed_table[surtype_index + 1:]: - row['transaction_type'] = 'redemption' - - # create data frame from table list of row dicts - df = pd.DataFrame(parsed_table) - - # and pretty them up - if re.search(r'TABLE I\s', row.get('table', '')): - df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'is_total', 'account', 'account_raw', 'close_today', 'open_today', 'open_mo', 'open_fy', 'footnote']) - # check_for_nulls(df, "t1") - elif re.search(r'TABLE II\s', row.get('table', '')): - df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'account', 'transaction_type', 'parent_item','is_total', 'is_net', 'item', 'item_raw', 'today', 'mtd', 'fytd', 'footnote']) - if 'withdrawal' not in set(list(df['transaction_type'])): - print "ERROR: No withdrawal items in t2 for %s" % df['date'][0] - # check_for_nulls(df, "t2") - elif re.search(r'TABLE III-A', row.get('table', '')): - df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'transaction_type', 'debt_type', 'parent_item', 'is_total', 'item', 'item_raw', 'today', 'mtd', 'fytd', 'footnote']) - # check_for_nulls(df, "t3a") - elif re.search(r'TABLE III-B', row.get('table', '')): - df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'transaction_type', 'parent_item', 'is_total', 'item', 'item_raw', 'today', 'mtd', 'fytd', 'footnote']) - # check_for_nulls(df, "t3b") - elif re.search(r'TABLE III-C', row.get('table', '')): - df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'is_total', 'parent_item', 'item', 'item_raw', 'close_today', 'open_today', 'open_mo', 'open_fy', 'footnote']) - # check_for_nulls(df, "t3c") - elif re.search(r'TABLE IV', row.get('table', '')): - df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'type', 'is_total', 'classification', 'classification_raw', 'today', 'mtd', 'fytd', 'footnote']) - # check_for_nulls(df, "t4") - elif 
re.search(r'TABLE V\s', row.get('table', '')): - df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'transaction_type', 'is_total', 'balance_transactions', 'depositary_type_a', 'depositary_type_b', 'depositary_type_c', 'total', 'footnote']) - # check_for_nulls(df, "t5") - elif re.search(r'TABLE VI', row.get('table', '')): - df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'refund_method', 'refund_type', 'refund_type_raw', 'today', 'mtd', 'fytd', 'footnote']) - # check_for_nulls(df, "t6") - - return df - -# BJD: Does this function serve a purpose...? -def strip_table_name(table_name): - return re.sub('[^a-zA-Z]*$', '', table_name) + print("TO DO") + # test_params = NULL_TEST_PARAMS[table] + # null_rows = [] + # for v in test_params["values"]: + # null_row = df.loc(i, ) for i in df.index if pd.isnull(df[v][i]) + # null_rows.append(null_row) + # null_field_values = [] + # for f in test_params['fields'] + # [r[f] for r in null_rows + + +def parse_all_fixies(filepaths, data_dir): + """ + Args: + filepaths (Iterable[str]) + data_dir (str) + """ + for filepath in filepaths: + _, fname = os.path.split(filepath) + froot, _ = os.path.splitext(fname) + + dfs = parse_fixie(filepath) + if dfs: + for table_key, df in sorted(dfs.items(), key=itemgetter(0)): + daily_csv_fname = os.path.join( + data_dir, froot + '_' + table_key + '.csv') + df.to_csv( + daily_csv_fname, + index=False, header=True, encoding='utf-8', na_rep='') + LOGGER.debug( + 'parsed %s and saved it to %s', + table_key, daily_csv_fname) + else: + LOGGER.warning('error parsing fixie %s', fname) + + +def parse_fixie(fname): + """ + Args: + fname (str) + + Returns: + Dict[str, :class:`pd.DataFrame`] + """ + with io.open(fname, mode='rb') as f: + data = f.read() + + # file metadata + date = get_date_from_fname(fname) + url = gen_fixie_url(fname, date) + + LOGGER.debug('parsing %s (%s)', fname, date) + + # split the file on table names + # use regex parens to *keep* the delimiters, then clean them up + # skip the first one, which is just the fixie header info + re_raw_table_name = re.compile(r'([\s_]+TABLE[\s_]+[\w_-]+.*)') + raw_tables = re_raw_table_name.split(data) + dfs = {} + for raw_table in raw_tables[1:]: + try: + if re_raw_table_name.search(raw_table): + # fix malformed fixie table names, BLERGH GOV'T! 
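+                # (some fixies render the header's spaces as underscores; collapsing
+                # runs of '_' back to a single space yields e.g. 'TABLE III-A ...',
+                # from which the standardized key 'table_iii_a' is derived just below
+                # -- example values are illustrative)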
+ table_name = re.sub(r'_+', ' ', raw_table) + # then convert it into a standardized "key" + table_key = re.sub( + r'-| ', '_', + re.search(r'TABLE [\w-]+', table_name).group().lower() + ) + if table_key not in TABLE_KEYS: + raise ValueError() + continue + raw_table = table_name + raw_table + table = normalize_page_text(raw_table) + dfs[table_key] = parse_table(table, date, url) + except Exception as e: + LOGGER.exception('error parsing fixie table %s', table_key) + + return dfs + + +def parse_table(table, date, url): + + # table defaults + t4_total_count = 0 + indent = 0 + footnotes = {} + index = surtype_index = type_index = subtype_index = used_index = -1 + type_indent = subtype_indent = -1 + page_number = -1 + type_ = subtype = None + table_name = None + + # total hack for when the treasury decided to switch + # which (upper or lower) line of two-line items gets the 0s + # NOTE: THIS IS ONLY FOR TABLE I, BECAUSE OF COURSE + if datetime.date(2012, 6, 1) <= date <= datetime.date(2013, 1, 3): + two_line_delta = -1 + else: + two_line_delta = 1 + + parsed_table = [] + for i, line in enumerate(table): + # print('|' + line + '|', '<', i, '>') + row = {} + + # a variety of date formats -- for your convenience + row['date'] = date + row['year'] = date.year + row['month'] = date.month + row['day'] = date.day + row['year_month'] = datetime.datetime.strftime(date, '%Y-%m') + row['weekday'] = datetime.datetime.strftime(date, '%A') + row['url'] = url + + # what's our line number? shall we bail out? + index += 1 + if index <= used_index: + continue + indent = len(re.search(r'^\s*', line).group()) + + # rows that we definitely want to skip + # empty rows or centered header rows + if re.match(r'^\s{7,}', line): + continue + + # page number rows + page_number_match = re.search(r'\d+.*DAILY\s+TREASURY\s+STATEMENT.*PAGE:\s+(\d+)', line) + if page_number_match: + page_number = page_number_match.group(1) + continue + + # HARD CODED HACKS + # catch rare exceptions to the above + if re.search(r'DAILY\s+TREASURY\s+STATEMENT', line): + continue # ok + # comment on statutory debt limit at end of Table III-C, and beyond + elif (re.search(r'(As|Act) of ([A-Z]\w+ \d+, \d+|\d+\/\d+\/\d+)', line) and + re.search(r'(statutory )*debt( limit)*', line)): + break # ok + # comment on whatever this is; above line may make this redundant + elif re.search(r'\s*Unamortized Discount represents|amortization is calculated daily', + line, flags=re.IGNORECASE): + break # ok + # more cruft of a similar sort + elif re.search(r'billion after \d+\/\d+\/\d+', line): + continue # ok + elif re.search(r'.*r\-revised.*', line): + continue # ok + elif is_errant_footnote(line): + break # ok + + # skip table header rows + if get_table_name(line): + table_name = get_table_name(line) + continue + + row['table'] = table_name + + # save footnotes for later assignment to their rows + footnote = get_footnote(line) + + if footnote is not None: + # while footnote does not end in valid sentence-ending punctuation... + i = 1 + while True: + # get next line, if it exists + try: + next_line = table[index + i] + except IndexError: + break + # and next line is not itself a new footnote... 
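+                # (footnote text often wraps across several physical lines; keep
+                # merging until a page header or another 'N/' marker appears, or the
+                # merged note ends in sentence-ending punctuation)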
+ else: + if re.search('\d+.*DAILY\s+TREASURY\s+STATEMENT.*PAGE:\s+(\d+)', next_line): + break # ok + if not get_footnote(next_line): + # add next line text to current footnote + footnote[1] = ''.join([footnote[1], next_line]) + used_index = index + i + i += 1 + if footnote[1].endswith("program."): + continue # ok + elif re.search(r'[.!?]$', footnote[1]): + break # ok + + # make our merged footnote hack official! + footnotes[footnote[0]] = re.sub("\s{2,}", "", footnote[1]) + + # if next line after footnote is not another footnote + # it is most assuredly extra comments we don't need + try: + last_line = table[index + i] + except IndexError: + break # ok + + else: + if re.search('\d+.*DAILY\s+TREASURY\s+STATEMENT.*PAGE:\s+(\d+)', last_line): + continue # ok + elif re.search(r'\.aspx\.', last_line): + continue # ok + elif not get_footnote(last_line): + break # ok + + # *****THIS LINE MUST BE HERE TO ENSURE THAT FOOTNOTES AREN'T INCLUDED AS ITEMS ******# + continue + + # note rows with footnote markers for later assignment + if re.search(r'\d+\/', line): + row['footnote'] = re.search(r'(\d+)\/', line).group(1) + + # separate digits and words + digits = re.findall(r'(-{,1}\d+)', line) + words = re.findall(r'\(\-\)|[()]|[^\W\d]+:?', line) + + # check for (-) in words => multiply all digits by -1 + if '(-)' in words: + digits = [str((-1)*int(digit)) for digit in digits] + + # bug fix, to remove the govt's usage of 'r/' in front of numbers + # to denote revised values, and the abhorrent usage of '(-)'' + text = ' '.join(word for word in words if word not in ['r', '(-)']) + + # get type row + if len(digits) == 0 and text.endswith(':') and indent == 1: + type_ = text[:-1] + type_indent = indent + type_index = index + continue + elif indent <= type_indent: + type_ = None + + row['type'] = type_ + + # special handling for table 3c + if re.search(r'TABLE III-C', row.get('table', '')): + if re.search(r'Less: Debt Not', text): + subtype = 'Debt Not Subject to Limit' + subtype_indent = indent + subtype_index = index + continue + elif re.search(r'Plus: Other Debt', text): + subtype = 'Other Debt Subject to Limit' + subtype_indent = indent + subtype_index = index + continue + # get subtype row + elif len(digits) == 0 and text.endswith(':'): + subtype = text[:-1] + subtype_indent = indent + subtype_index = index + continue + + if index == subtype_index + 1: + pass # possibly unnecessary + elif indent <= subtype_indent: + subtype = None + row['subtype'] = subtype + + # get and merge two-line rows + if len(digits) == 0 and not text.endswith(':'): + + if two_line_delta == 1 or not re.search(r'TABLE I\s', row.get('table', '')): + + try: + next_line = table[index + 1] + + # check for footnotes, then note and erase them if present! 
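+                    # (a trailing footnote marker such as '3/' on the continuation
+                    # line would otherwise be captured as a digit, so record it in
+                    # row['footnote'] and strip it before extracting amounts --
+                    # '3/' is an illustrative marker)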
+ if re.search(r'\d+\/', next_line): + row['footnote'] = re.search(r'(\d+)\/', next_line).group(1) + next_line = re.sub(r'\d+\/', '', next_line) + + next_digits = re.findall(r'(\d+)', next_line) + next_words = re.findall(r'[^\W\d]+:?', next_line) + + if len(next_digits) != 0: + text = text + ' ' + ' '.join(next_words) + digits = next_digits + used_index = index + 1 + + except IndexError: + pass + + elif two_line_delta == -1 and re.search(r'TABLE I\s', row.get('table', '')): + + try: + prev_line = table[index - 1] + prev_digits = re.findall(r'(\d+)', prev_line) + prev_words = re.findall(r'[^\W\d]+:?', prev_line) + + if len(prev_digits) != 0: + text = ' '.join(prev_words) + ' ' + text + digits = prev_digits + get_rid_of_prev_line = parsed_table.pop() + + except IndexError: + pass + + # skip table annotations that aren't footnotes + # this is a band-aid at best, sorry folks + if len(digits) == 0: + continue + if len(text) > 80: + continue + + row['is_total'] = int('total' in text.lower()) + + # parse one table at a time... + if re.search(r'TABLE I\s', row.get('table', '')): + try: + row['account_raw'] = text + row['account'] = normalize_fields(text, 't1', 'account') + row['close_today'] = digits[-4] + row['open_today'] = digits[-3] + row['open_mo'] = digits[-2] + row['open_fy'] = digits[-1] + except Exception: + LOGGER.debug('table-i line exception: %s', line) + + elif re.search(r'TABLE II\s', row.get('table', '')): + try: + row['item_raw'] = text + + # determine whether item is calculated as a net + if re_net.search(text): + row['is_net'] = 1 + else: + row['is_net'] = 0 + + # remove net from items + text = re_net_remove.sub("", text).strip() + + # proceed + row['item'] = normalize_fields(text, 't2', 'item') + row['today'] = digits[-3] + row['mtd'] = digits[-2] + row['fytd'] = digits[-1] + # tweak column names + row['account'] = row.get('type') + # this is a hack, deal with it :-/ + row['transaction_type'] = 'deposit' + if int(page_number) == 3: + row['transaction_type'] = 'withdrawal' + # now handle items with sub-classification + if row.get('subtype') is not None: + row_subtype = row['subtype'] + row_item = row['item'] + row['parent_item'] = row_subtype + row['item'] = row_item + row['item_raw'] = row_item_raw + row.pop('subtype') + except Exception: + LOGGER.debug('table-ii line exception: %s', line) + + elif re.search(r'TABLE III-A', row.get('table', '')): + try: + row['item_raw'] = text + row['item'] = normalize_fields(text, "t3a", 'item') + row['today'] = digits[-3] + row['mtd'] = digits[-2] + row['fytd'] = digits[-1] + # tweak column names + row['debt_type'] = row.get('type') + # now handle items with sub-classification + if row.get('subtype') is not None: + row_subtype = row['subtype'] + row_item = row['item'] + row['parent_item'] = row_subtype + row['item'] = row_item + row['item_raw'] = row_item_raw + row.pop('subtype') + except Exception: + LOGGER.debug('table-iiia line exception: %s', line) + + elif re.search(r'TABLE III-B', row.get('table', '')): + try: + row['item_raw'] = text + row['item'] = normalize_fields(text, "t3b", 'item') + row['today'] = digits[-3] + row['mtd'] = digits[-2] + row['fytd'] = digits[-1] + # tweak column names + row['transaction_type'] = row.get('type') + # now handle items with sub-classification + if row.get('subtype') is not None: + row_subtype = row['subtype'] + row_item = row['item'] + row['parent_item'] = row_subtype + row['item'] = row_item + row['item_raw'] = row_item_raw + row.pop('subtype') + except Exception: + LOGGER.debug('table-iiib line 
exception: %s', line) + + elif re.search(r'TABLE III-C', row.get('table', '')): + try: + row['item_raw'] = text + row['item'] = normalize_fields(text, 't3c', 'item') + row['close_today'] = digits[-4] + row['open_today'] = digits[-3] + row['open_mo'] = digits[-2] + row['open_fy'] = digits[-1] + # now handle items with sub-classification + if row.get('subtype') is not None: + row['parent_item'] = row['subtype'] + row.pop('subtype') + except Exception: + LOGGER.debug('table-iiic line exception: %s', line) + + elif re.search(r'TABLE IV', row.get('table', '')): + try: + row['type'] = '' + row['classification_raw'] = text + this_class = normalize_fields(text, 't4', 'classification') + row['classification'] = this_class + row['today'] = digits[-3] + row['mtd'] = digits[-2] + row['fytd'] = digits[-1] + # increment Total counts + if this_class == "Total": + t4_total_count += 1 + # assign source and use types + if t4_total_count == 1 and this_class == "Total": + row['type'] = "source" + elif t4_total_count == 2 and this_class == "Total": + row['type'] = "use" + elif this_class not in T4_USE_ITEMS: + row['type'] = "source" + else: + row['type'] = "use" + except Exception: + LOGGER.debug('table-iv line exception: %s', line) + + elif re.search(r'TABLE V\s', row.get('table', '')): + try: + row['balance_transactions'] = text + row['depositary_type_a'] = digits[-4] + row['depositary_type_b'] = digits[-3] + row['depositary_type_c'] = digits[-2] + row['total'] = digits[-1] + # tweak column names + row['transaction_type'] = row.get('type') + except Exception: + LOGGER.debug('table-v line exception: %s', line) + + elif re.search(r'TABLE VI', row.get('table', '')): + try: + row['refund_type_raw'] = text + row['refund_type'] = normalize_fields(text, 't6', 'classification') + row['today'] = digits[-3] + row['mtd'] = digits[-2] + row['fytd'] = digits[-1] + if '( eft )' in row.get('refund_type_raw', '').lower(): + row['refund_method'] = 'EFT' + elif '( checks )' in row.get('refund_type_raw', '').lower(): + row['refund_method'] = 'CHECKS' + except Exception: + LOGGER.debug('table-vi line exception: %s', line) + + parsed_table.append(row) + + # assign footnotes to rows + # and split table III-a by surtype + for row in parsed_table: + if row.get('footnote'): + row['footnote'] = footnotes.get(row['footnote']) + if row.get('item'): + if row['item'].lower().strip() == 'total issues': + surtype_index = parsed_table.index(row) + row['transaction_type'] = 'issue' + + # after-the-fact surtype assignment + if surtype_index != -1: + for row in parsed_table[:surtype_index]: + row['transaction_type'] = 'issue' + for row in parsed_table[surtype_index + 1:]: + row['transaction_type'] = 'redemption' + + # create data frame from table list of row dicts + df = pd.DataFrame(parsed_table) + + # and pretty them up + if re.search(r'TABLE I\s', row.get('table', '')): + df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'is_total', 'account', 'account_raw', 'close_today', 'open_today', 'open_mo', 'open_fy', 'footnote']) + # check_for_nulls(df, "t1") + elif re.search(r'TABLE II\s', row.get('table', '')): + df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'account', 'transaction_type', 'parent_item', 'is_total', 'is_net', 'item', 'item_raw', 'today', 'mtd', 'fytd', 'footnote']) + if 'withdrawal' not in set(list(df['transaction_type'])): + LOGGER.error('No withdrawal items in t2 for %s', df['date'][0]) + # check_for_nulls(df, "t2") + elif 
re.search(r'TABLE III-A', row.get('table', '')): + df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'transaction_type', 'debt_type', 'parent_item', 'is_total', 'item', 'item_raw', 'today', 'mtd', 'fytd', 'footnote']) + # check_for_nulls(df, "t3a") + elif re.search(r'TABLE III-B', row.get('table', '')): + df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'transaction_type', 'parent_item', 'is_total', 'item', 'item_raw', 'today', 'mtd', 'fytd', 'footnote']) + # check_for_nulls(df, "t3b") + elif re.search(r'TABLE III-C', row.get('table', '')): + df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'is_total', 'parent_item', 'item', 'item_raw', 'close_today', 'open_today', 'open_mo', 'open_fy', 'footnote']) + # check_for_nulls(df, "t3c") + elif re.search(r'TABLE IV', row.get('table', '')): + df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'type', 'is_total', 'classification', 'classification_raw', 'today', 'mtd', 'fytd', 'footnote']) + # check_for_nulls(df, "t4") + elif re.search(r'TABLE V\s', row.get('table', '')): + df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'transaction_type', 'is_total', 'balance_transactions', 'depositary_type_a', 'depositary_type_b', 'depositary_type_c', 'total', 'footnote']) + # check_for_nulls(df, "t5") + elif re.search(r'TABLE VI', row.get('table', '')): + df = df.reindex(columns=['table', 'url', 'date', 'year_month', 'year', 'month', 'day', 'weekday', 'refund_method', 'refund_type', 'refund_type_raw', 'today', 'mtd', 'fytd', 'footnote']) + # check_for_nulls(df, "t6") + + return df + + +def main(): + parser = argparse.ArgumentParser( + description='Script to parse "FMS fixie" files.') + parser.add_argument( + '-s', '--startdate', type=str, default=EARLIEST_DATE.format('YYYY-MM-DD'), + help="""Start of date range over which to parse FMS fixies + as an ISO-formatted string, i.e. YYYY-MM-DD.""") + parser.add_argument( + '-e', '--enddate', type=str, default=arrow.utcnow().shift(days=-1).format('YYYY-MM-DD'), + help="""End of date range over which to download FMS fixies + as an ISO-formatted string, i.e. YYYY-MM-DD.""") + parser.add_argument( + '--fixiedir', type=str, default=DEFAULT_FIXIE_DIR, + help='Directory on disk from which fixies (raw text) are loaded.') + parser.add_argument( + '--dailycsvdir', type=str, default=DEFAULT_DAILY_CSV_DIR, + help='Directory on disk to which parsed fixies (daily CSVs) are saved.') + parser.add_argument( + '--loglevel', type=int, default=20, choices=[10, 20, 30, 40, 50], + help='Level of message to be logged; 20 => "INFO".') + parser.add_argument( + '--force', default=False, action='store_true', + help="""If True, parse all fixies in [start_date, end_date], even if + the resulting csvs already exist on disk in ``dailycsvdir``. 
+ Otherwise, only parse un-parsed fixies.""") + args = parser.parse_args() + + LOGGER.setLevel(args.loglevel) + + # auto-make data directories, if not present + for dir_ in (args.fixiedir, args.dailycsvdir): + try: + os.makedirs(dir_) + except OSError: # already exists + continue + + # get all valid dates within range, and their corresponding fixie files + all_dates = get_all_dates(args.startdate, args.enddate) + if not all_dates: + LOGGER.warning( + 'no valid dates in range [%s, %s]', + args.startdate, args.enddate) + return + + fixies_by_date = get_fixies_by_date( + all_dates[0], all_dates[-1], data_dir=args.fixiedir) + # if force is False, only parse fixies that haven't yet been parsed + if args.force is False: + daily_csv_dates = set( + get_daily_csvs_by_date( + all_dates[0], all_dates[-1], data_dir=args.dailycsvdir + ).keys()) + if daily_csv_dates: + fixies_by_date = { + dt: fname for dt, fname in fixies_by_date.items() + if dt not in daily_csv_dates} + + if not fixies_by_date: + LOGGER.warning( + 'no un-parsed fixies in range [%s, %s]', + args.startdate, args.enddate) + else: + LOGGER.info( + 'parsing %s fixies in range [%s, %s] and saving them to %s', + len(fixies_by_date), args.startdate, args.enddate, args.dailycsvdir) + fixie_fnames = ( + fname for _, fname in sorted(fixies_by_date.items(), key=itemgetter(0))) + parse_all_fixies(fixie_fnames, args.dailycsvdir) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/parser/utils.py b/parser/utils.py new file mode 100644 index 0000000..515584b --- /dev/null +++ b/parser/utils.py @@ -0,0 +1,126 @@ +from __future__ import absolute_import + +import collections +import datetime +import logging +import os +import re + +import arrow +import holidays + +from constants import DEFAULT_FIXIE_DIR, DEFAULT_DAILY_CSV_DIR, EARLIEST_DATE + + +LOGGER = logging.getLogger('utils') +LOGGER.setLevel(logging.INFO) +_handler = logging.StreamHandler() +_formatter = logging.Formatter('%(name)s | %(levelname)s | %(message)s') +_handler.setFormatter(_formatter) +LOGGER.addHandler(_handler) + + +re_fname_date_str = re.compile(r'^(\d{6})(\d{2})_?([\w_]*?)(?:\.txt|\.csv)$') + + +def get_all_dates(start_date, end_date): + """ + Get all dates between ``start_date`` and ``end_date`` that are neither + holidays nor weekends. 
+ + Args: + start_date (str or datetime or :class:`arrow.Arrow`) + end_date (str or datetime or :class:`arrow.Arrow`) + + Returns: + Tuple[:class:`arrow.Arrow`] + """ + start_date = arrow.get(start_date) + end_date = arrow.get(end_date) if end_date else start_date + # sanity check the date range + if start_date < EARLIEST_DATE: + start_date = EARLIEST_DATE + LOGGER.warning( + 'start date "%s" before earliest available date; setting equal to "%s"', + start_date, EARLIEST_DATE) + if start_date > end_date: + start_date, end_date = end_date, start_date + LOGGER.warning( + 'start date "%s" before end date "%s"; swapping the dates', + start_date, end_date) + # no fixies on holidays or weekends; remove them + us_holidays = holidays.UnitedStates( + years=range(start_date.year, end_date.year + 1), + observed=True) + return tuple(dt for dt in arrow.Arrow.range('day', start_date, end_date) + if dt.isoweekday() < 6 and dt.date() not in us_holidays) + + +def get_fixies_by_date(start_date, end_date, data_dir=DEFAULT_FIXIE_DIR): + """ + Args: + start_date (:class:`arrow.Arrow`) + end_date (:class:`arrow.Arrow`) + data_dir (str) + + Returns: + Dict[:class:`arrow.Arrow`, str] + """ + start_date_str = start_date.format('YYMMDD') + end_date_str = end_date.format('YYMMDD') + fnames_by_date = {} + for fname in os.listdir(data_dir): + match = re_fname_date_str.search(fname) + # filter out the cruft + if not match: + continue + fname_date_str = match.group(1) + if not start_date_str <= fname_date_str <= end_date_str: + continue + fname_date = arrow.get(datetime.datetime.strptime(fname_date_str, '%y%m%d')) + fnames_by_date[fname_date] = os.path.join(data_dir, fname) + + return fnames_by_date + + +def get_daily_csvs_by_date(start_date, end_date, data_dir=DEFAULT_DAILY_CSV_DIR): + """ + Args: + start_date (:class:`arrow.Arrow`) + end_date (:class:`arrow.Arrow`) + data_dir (str) + + Returns: + Dict[str, List[str]] + """ + start_date_str = start_date.format('YYMMDD') + end_date_str = end_date.format('YYMMDD') + fnames_by_date = collections.defaultdict(dict) + for fname in os.listdir(data_dir): + match = re_fname_date_str.search(fname) + # filter out the cruft + if not match: + continue + fname_date_str = match.group(1) + table_key = match.group(3) + if not start_date_str <= fname_date_str <= end_date_str: + continue + fname_date = arrow.get(datetime.datetime.strptime(fname_date_str, '%y%m%d')) + fnames_by_date[fname_date][table_key] = os.path.join(data_dir, fname) + + return dict(fnames_by_date) + + +def get_date_from_fname(fname): + """ + Args: + fname (str) + + Returns: + :class:`datetime.date` + """ + match = re_fname_date_str.search(fname) + if not match: + raise ValueError('invalid fname: "{}"'.format(fname)) + date_str = match.group(1) + return datetime.datetime.strptime(date_str, '%y%m%d').date() diff --git a/readme.md b/readme.md index 883c813..d8cacee 100644 --- a/readme.md +++ b/readme.md @@ -1,4 +1,5 @@ # Federal Treasury API + ``` ‹ € | | €€€€≤±‹€€€€≤±‹ @@ -22,26 +23,30 @@ ``` ## About this branch + The `master` branch of `federal-treasury-api` contains code specific to running our project on [ScraperWiki](http://www.scraperwiki.com). the [`just-the-api`](https://github.com/csvsoundsystem/federal-treasury-api/tree/just-the-api) branch contains only the code needed to download the data locally and launch a queryable api. In other words, if you're just looking to get and host the database, go [here](https://github.com/csvsoundsystem/federal-treasury-api/tree/just-the-api). 
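A usage sketch for the date helpers in `parser/utils.py` above, run with the `parser/` directory on `sys.path` (as its own intra-package imports assume); the dates here are chosen arbitrarily:

```
from utils import get_all_dates

# weekends and US federal holidays are dropped, so a range spanning
# Independence Day 2016 (a Monday) yields only three business days
dates = get_all_dates('2016-07-01', '2016-07-06')
print([dt.format('YYYY-MM-DD') for dt in dates])
# -> ['2016-07-01', '2016-07-05', '2016-07-06']
```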
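And a sketch of how `re_fname_date_str` carves up a file name; the name used here is hypothetical and only illustrates the expected pattern of a six-digit date, two more digits, and a table key:

```
import re

re_fname_date_str = re.compile(r'^(\d{6})(\d{2})_?([\w_]*?)(?:\.txt|\.csv)$')

match = re_fname_date_str.search('17060100_t2.csv')  # hypothetical file name
print(match.group(1), match.group(2), match.group(3))
# -> 170601 00 t2  (YYMMDD date string, remaining two digits, table key)
```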
## About the API + `federal-treasury-api` is the first-ever electronically-searchable database of the Federal government's daily cash spending and borrowing. It updates daily and the data can be exported in various formats and loaded into various systems. ## About the data + There are eight tables. * I. Operating Cash Balance (`t1`) * II. Deposits and Withdrawals (`t2`) * IIIa. Public Debt Transactions (`t3a`) -* IIIb. Adjustment of Public Dept Transactions to Cash Basis (`t3b`) +* IIIb. Adjustment of Public Debt Transactions to Cash Basis (`t3b`) * IIIc. Debt Subject to Limit (`t3c`) * IV. Federal Tax Deposits (`t4`) * V. Short-term Cash Investments (`t5`) * VI. Income Tax Refunds Issued (`t6`) -Check out thre comprehensive [data dictionary](https://github.com/csvsoundsystem/federal-treasury-api/wiki/Treasury.io-Data-Dictionary) and [treasury.io](http://treasury.io) for more information. +Check out the comprehensive [data dictionary](https://github.com/csvsoundsystem/federal-treasury-api/wiki/Treasury.io-Data-Dictionary) and [treasury.io](http://treasury.io) for more information. ## Obtaining the data + Optionally set up a virtualenv. (You need this on ScraperWiki.) Run this from the root of the current repository. @@ -57,22 +62,29 @@ Enable the git post-merge hook. ln -s ../../utils/post-merge . ### POSIX + This one command downloads the (new) fixies and converts them to an SQLite3 database. ./run.sh ### Windows -Run everything - cd parser - python download_and_parse_fms_fixies.py +Run everything: + + python -m parser.download_fms_fixies + python -m parser.parse_fms_fixies + python -m parser.aggregate_fms_fixies + +For information about options, pass the `--help` flag to any of these. ## Testing the data + Various tests are contained in `tests`. Tests are run every day with `./run.sh` and the results are emailed to `csvsoundsystem@gmail.com`. ## Cron + Run everything each day around 4:30 - right after the data has been released. ``` @@ -80,15 +92,18 @@ ``` #### Optional: set up logging + ``` 30 16 * * * cd path/to/federal-treasury-api && ./run.sh >> run.log 2>> err.log ``` ## Deploying to ScraperWiki + You can run this on any number of servers, but we happen to be using ScraperWiki. You can check out their documentation [here](https://beta.scraperwiki.com/help/developer/). ### SSH + To use ScraperWiki, log in [here](https://beta.scraperwiki.com/login), make a project, click the "SSH in" link, add your SSH key and SSH in. Then you can SSH to the box like so. @@ -106,6 +121,7 @@ and just run ssh fms ### What this ScraperWiki account is + Some notes about how ScraperWiki works: * We have a user account in a chroot jail. @@ -113,7 +129,28 @@ Some notes about how ScraperWiki works: * Files in `/home/http` get served on the web. * The database `/home/scraperwiki.sqlite` gets served from the SQLite web API. - NOTE: the `home/scraperwiki.sqlite` is simply a symbolic link to `data/treasury_data.db` generated by this command: - ```ln -s data/treasury_data.db scraperwiki.sqlite``` + `ln -s data/treasury_data.db scraperwiki.sqlite` The directions below still apply for any other service, of course. +#### hacks and data brought to you by + +``` + ,----.. .--.--. + / / \ / / '. ,---. +| : :| : /`. / /__./| +. | ;. /; | |--` ,---.; ; | +. ; /--` | : ;_ /___/ \ | | +; | ; \ \ `.\ ; \ ' | +| : | `----. \\ \ \: | +. | '___ __ \ \ | ; \ ' . +' ; : .'| / /`--' / \ \ ' +' | '/ :'--'. / \ ` ; +| : / `--'---' : \ | + \ \ .'
'---" + `---` +.-. .-. . . . . .-. .-. . . .-. .-. .-. . . +`-. | | | | |\| | )`-. | `-. | |- |\/| +`-' `-' `-' ' ` `-' `-' ` `-' ' `-' ' ` +``` +\*http://csvsoundsystem.com diff --git a/requirements.pip b/requirements.pip index 52b10d9..706d418 100644 --- a/requirements.pip +++ b/requirements.pip @@ -1,4 +1,6 @@ -pandas>=0.11.0 +arrow==0.10.0 +holidays==0.5 +pandas>=0.20.0 requests>=1.2.0 pysqlite bottle>=0.11.4 diff --git a/run.sh b/run.sh index b5bcece..8aee39c 100755 --- a/run.sh +++ b/run.sh @@ -3,7 +3,7 @@ set -e if [ -d env ]; then . ./env/bin/activate - echo Activated virtualenv + echo "Activated virtualenv" pip install --upgrade -r requirements.pip fi @@ -12,9 +12,9 @@ git pull origin master npm install ( - cd ./parser - python download_and_parse_fms_fixies.py - cd .. + python -m parser.download_fms_fixies + python -m parser.parse_fms_fixies + python -m parser.aggregate_fms_fixies ) echo "INFO: Waiting for Database to update before proceeding\r\n" sleep 20 diff --git a/utils/reset_data.sh b/utils/reset_data.sh index 07260e1..81817a4 100644 --- a/utils/reset_data.sh +++ b/utils/reset_data.sh @@ -18,5 +18,6 @@ if [ -d data/lifetime_csv ]; then rm -r data/lifetime_csv/ fi -cd parser -python download_and_parse_fms_fixies.py +python -m parser.download_fms_fixies +python -m parser.parse_fms_fixies +python -m parser.aggregate_fms_fixies