Skip to content

Commit

Permalink
Create a related bills json with same and similar title bills #9 and #10
Browse files Browse the repository at this point in the history
 (#11)

* Make related bills into a dict. Fix for #10 and #9
  • Loading branch information
aih authored Sep 6, 2020
1 parent eeb71f9 commit f8f6b8c
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 48 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ env.bak/
venv.bak/

# Sandbox Data
relatedBills.json
relatedBills.json.gz
billsMeta.json
billsMeta.json.gz
titlesIndex.json
Expand Down
63 changes: 40 additions & 23 deletions scripts/billdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,14 @@
# Command line template from https://gist.githubusercontent.com/opie4624/3896526/raw/3aff2ad7030a74ce26f9fcf80791ae0396d84f18/commandline.py

import sys, os, argparse, logging, re, json, gzip
import datetime
from typing import Dict
from functools import reduce
import re

PATH_TO_BILLS_META = '../billsMeta.json'
SAVE_ON_COUNT = 1000

BILL_TYPES = {
'ih': 'introduced',
'rh': 'reported to house'
}

CURRENT_CONGRESSIONAL_YEAR = datetime.date.today().year if datetime.date.today() > datetime.date(datetime.date.today().year, 1, 3) else (datetime.date.today().year - 1)
CURRENT_CONGRESS, cs_temp = divmod(round(((datetime.date(CURRENT_CONGRESSIONAL_YEAR, 1, 3) - datetime.date(1788, 1, 3)).days) / 365) + 1, 2)
CURRENT_SESSION = cs_temp + 1
try:
from . import constants
except:
import constants

logging.basicConfig(filename='billdata.log', filemode='w', level='INFO')
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -57,6 +50,21 @@ def walkBillDirs(rootDir = '../congress/data', processFile = logName, dirMatch =
for fname in filteredFileList:
processFile(dirName=dirName, fileName=fname)

# Utilities. These should go in a utils.py module
def billIdToBillNumber(bill_id: str) -> str:
    """
    Converts a bill_id of the form `hr299-116` into `116hr299`

    Args:
        bill_id (str): hyphenated bill_id from bill status JSON

    Raises:
        ValueError: if bill_id does not match constants.BILL_ID_REGEX
            (ValueError is an Exception, so `except Exception` callers still catch it)

    Returns:
        str: billCongressTypeNumber (e.g. 116hr299)
    """
    if not re.match(constants.BILL_ID_REGEX, bill_id):
        # Include the offending value so a bad record can be found from the log
        raise ValueError('bill_id does not have the expected form (e.g. "hjres200-116"): ' + repr(bill_id))
    # 'hr299-116' -> ['hr299', '116'] -> '116' + 'hr299'
    return ''.join(reversed(bill_id.split('-')))

def deep_get(dictionary: Dict, *keys):
"""
A Dict utility to get a field; returns None if the field does not exist
Expand All @@ -80,11 +88,9 @@ def loadJSON(filePath: str):
def getBillCongressTypeNumber(fileDict: Dict):
    """
    Gets the billCongressTypeNumber (e.g. `116hr299`) for a bill data dict

    Args:
        fileDict (Dict): bill data; its `bill_id` field is converted with billIdToBillNumber

    Raises:
        Exception: if fileDict has no (or an empty) `bill_id`; any exception raised by
            billIdToBillNumber for a malformed bill_id propagates unchanged

    Returns:
        str: billCongressTypeNumber for the bill
    """
    bill_id = fileDict.get('bill_id')
    if not bill_id:
        raise Exception('No bill_id: ' + str(fileDict.get('bill_type')))
    # The converted value must be returned: callers use it as a dictionary key
    return billIdToBillNumber(bill_id)

def getCosponsors(fileDict: Dict, includeFields = []) -> list:
"""
Expand Down Expand Up @@ -120,8 +126,8 @@ def getBillTitles(fileDict: Dict, include_partial = True, billType = 'all') -> l
if not include_partial:
titles = [title for title in titles if not title.get('is_for_portion')]

if (billType != 'all') and BILL_TYPES.get(billType):
titles = [title for title in titles if BILL_TYPES.get(billType) == title.get('as')]
if (billType != 'all') and constants.BILL_TYPES.get(billType):
titles = [title for title in titles if constants.BILL_TYPES.get(billType) == title.get('as')]
return titles

def testWalkDirs():
Expand All @@ -132,7 +138,7 @@ def addToFilePathList(dirName: str, fileName: str):
walkBillDirs(processFile=addToFilePathList)
return filePathList

def loadBillsMeta(billMetaPath = PATH_TO_BILLS_META, zip = True):
def loadBillsMeta(billMetaPath = constants.PATH_TO_BILLS_META, zip = True):
billsMeta = {}
if zip:
try:
Expand All @@ -149,7 +155,7 @@ def loadBillsMeta(billMetaPath = PATH_TO_BILLS_META, zip = True):

return billsMeta

def saveBillsMeta(billsMeta: Dict, metaPath = PATH_TO_BILLS_META, zip = True):
def saveBillsMeta(billsMeta: Dict, metaPath = constants.PATH_TO_BILLS_META, zip = True):
with open(metaPath, 'w') as f:
json.dump(billsMeta, f)
if zip:
Expand All @@ -159,17 +165,28 @@ def saveBillsMeta(billsMeta: Dict, metaPath = PATH_TO_BILLS_META, zip = True):
def updateBillsMeta(billsMeta= {}, congress= ''):
def addToBillsMeta(dirName: str, fileName: str):
billDict = loadJSON(os.path.join(dirName, fileName))
billCongressTypeNumber = getBillCongressTypeNumber(billDict)
if not billCongressTypeNumber:
try:
billCongressTypeNumber = getBillCongressTypeNumber(billDict)
except Exception as err:
logging.error(err)
return
if not billsMeta.get(billCongressTypeNumber):
billsMeta[billCongressTypeNumber] = {}
titles = getBillTitles(billDict)
billsMeta[billCongressTypeNumber]['titles'] = [title.get('title') for title in titles]
billsMeta[billCongressTypeNumber]['titles_whole_bill'] = [title.get('title') for title in titles if not title.get('is_for_portion')]
billsMeta[billCongressTypeNumber]['cosponsors'] = getCosponsors(fileDict=billDict, includeFields=['name', 'bioguide_id'])

# TODO convert bill_id to billnumber
billsMeta[billCongressTypeNumber]['related_bills'] = billDict.get('related_bills')
for item in billsMeta[billCongressTypeNumber]['related_bills']:
bill_id = item.get('bill_id')
if bill_id:
item['billCongressTypeNumber'] = billIdToBillNumber(bill_id)
else:
item['billCongressTypeNumber'] = None
billCount = len(billsMeta.keys())
if billCount % SAVE_ON_COUNT == 0:
if billCount % constants.SAVE_ON_COUNT == 0:
saveBillsMeta(billsMeta)

walkBillDirs(processFile=addToBillsMeta)
Expand Down
13 changes: 13 additions & 0 deletions scripts/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import datetime

# Default (relative) path of the bills metadata JSON file
PATH_TO_BILLS_META = '../billsMeta.json'
# billdata.updateBillsMeta saves the metadata whenever the bill count is a multiple of this
SAVE_ON_COUNT = 1000

# Bill version code -> value of the `as` field used to filter titles in billdata.getBillTitles
BILL_TYPES = {
    'ih': 'introduced',
    'rh': 'reported to house',
}

# Form of a hyphenated bill_id: type, number, '-', congress (e.g. hr299-116, hjres200-116).
# Used by billdata.billIdToBillNumber to validate its input.
BILL_ID_REGEX = r'^[a-z]+[0-9]+-[0-9]+$'

# Read the clock once so that all derived values refer to the same day
_TODAY = datetime.date.today()

# The congressional year rolls over after January 3rd
CURRENT_CONGRESSIONAL_YEAR = _TODAY.year if _TODAY > datetime.date(_TODAY.year, 1, 3) else (_TODAY.year - 1)
# Whole years since 1788, plus one, split into (congress, zero-based session);
# e.g. 2019 -> (116, 0) and 2020 -> (116, 1). Integer arithmetic avoids the
# drift of dividing a day count by 365.
CURRENT_CONGRESS, cs_temp = divmod(CURRENT_CONGRESSIONAL_YEAR - 1788 + 1, 2)
CURRENT_SESSION = cs_temp + 1
77 changes: 52 additions & 25 deletions scripts/relatedBills.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@
import datetime
from typing import Dict
from functools import reduce
from billdata import saveBillsMeta, loadJSON, loadBillsMeta
from billdata import deep_get, saveBillsMeta, loadBillsMeta

PATH_TO_TITLES_INDEX = '../titlesIndex.json'
PATH_TO_RELATEDBILLS = '../relatedBills.json'

OF_YEAR_REGEX = re.compile(r'\sof\s[0-9]+$')

logging.basicConfig(filename='billdata.log', filemode='w', level='INFO')
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler(sys.stdout))
Expand All @@ -36,38 +38,63 @@ def loadTitlesIndex(titleIndexPath=PATH_TO_TITLES_INDEX, zip=True):

return titlesIndex

def getSimilarTitles(titlesIndex: dict, same=True):
billsRelatedByTitle = {}
# NOTE: This is very slow. Takes ~20 minutes
def addSimilarTitles(titlesIndex: dict, billsRelated = None):
    """
    Adds, for every bill in the bills metadata, the bills that have a similar title.

    A title is similar to a bill's title when it is a different string that starts
    with the bill's title after the trailing ` of <year>` has been removed
    (OF_YEAR_REGEX). Matches are recorded as
    billsRelated[bill]['related'][otherBill]['titles_year'] = [similar titles].

    Args:
        titlesIndex (dict): maps a title to the list of bills that have that title
        billsRelated (dict, optional): existing related-bills dict to update in place.
            Defaults to a new empty dict (created per call, never shared).

    Returns:
        dict: billsRelated, with the `titles_year` entries added
    """
    if billsRelated is None:
        billsRelated = {}

    allTitles = list(titlesIndex.keys())
    billsMeta = loadBillsMeta()
    for bill_outer, billMeta in billsMeta.items():
        # Initialize the key-value for the bill
        entry = billsRelated.get(bill_outer)
        if not entry:
            entry = billsRelated[bill_outer] = {'related': {}}
        if not entry.get('related'):
            entry['related'] = {}
        related = entry['related']

        # A bill without a `titles` field simply has no similar titles
        for title in billMeta.get('titles') or []:
            noYearTitle = OF_YEAR_REGEX.sub('', title)
            for similarTitle in allTitles:
                if similarTitle == title or not similarTitle.startswith(noYearTitle):
                    continue
                for bill_inner in titlesIndex.get(similarTitle) or []:
                    relatedBill = related.get(bill_inner)
                    if not relatedBill:
                        related[bill_inner] = {'titles_year': [similarTitle]}
                    elif not relatedBill.get('titles_year'):
                        # Entry exists (e.g. with same-title data) but has no similar titles yet
                        relatedBill['titles_year'] = [similarTitle]
                    elif similarTitle not in relatedBill['titles_year']:
                        relatedBill['titles_year'].append(similarTitle)
    return billsRelated


def addSameTitles(titlesIndex: dict, billsRelated = None):
    """
    Adds, for every bill in the titles index, the bills that share a title with it.

    Matches are recorded as
    billsRelated[bill]['related'][otherBill]['titles'] = [shared titles].
    Every bill in a title's list is paired with every bill in that list,
    including itself.

    Args:
        titlesIndex (dict): maps a title to the list of bills that have that title
        billsRelated (dict, optional): existing related-bills dict to update in place.
            Defaults to a new empty dict (created per call, never shared).

    Returns:
        dict: billsRelated, with the `titles` entries added
    """
    if billsRelated is None:
        billsRelated = {}

    for title, bills in titlesIndex.items():
        for bill_outer in bills:
            # Initialize the key-value for the bill
            entry = billsRelated.get(bill_outer)
            if not entry:
                entry = billsRelated[bill_outer] = {'related': {}}
            if not entry.get('related'):
                entry['related'] = {}
            related = entry['related']

            for bill_inner in bills:
                relatedBill = related.get(bill_inner)
                if not relatedBill:
                    related[bill_inner] = {'titles': [title]}
                elif not relatedBill.get('titles'):
                    # Entry exists (e.g. with only `titles_year`) but has no same-title list yet
                    relatedBill['titles'] = [title]
                elif title not in relatedBill['titles']:
                    relatedBill['titles'].append(title)

    return billsRelated

def makeAndSaveRelatedBills(titlesIndex = None):
    """
    Builds the related-bills dict (same titles, then similar titles) and saves it
    to PATH_TO_RELATEDBILLS.

    Args:
        titlesIndex (dict, optional): maps a title to the list of bills that have
            that title. When omitted, it is loaded with loadTitlesIndex() at call
            time rather than once at import time.
    """
    if titlesIndex is None:
        titlesIndex = loadTitlesIndex()
    # Start from an explicit empty dict so repeated calls never share state
    sameTitleBills = addSameTitles(titlesIndex, billsRelated={})
    relatedBills = addSimilarTitles(titlesIndex=titlesIndex, billsRelated=sameTitleBills)
    saveBillsMeta(billsMeta=relatedBills,
                  metaPath=PATH_TO_RELATEDBILLS)

Expand Down

0 comments on commit f8f6b8c

Please sign in to comment.