Commit 43aed9a
Merge pull request #296 from NYPL/SFR-1900_ParseDownloadRequests
SFR-1900_ParseDownloadRequests
mitri-slory authored Mar 26, 2024
2 parents 4526b29 + 4b5eb3e commit 43aed9a
Showing 3 changed files with 96 additions and 2 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,11 @@
 # CHANGELOG
 
-## unreleased version -- v0.12.4
+## unreleased version -- v0.13.1
+## Added
+- New script to parse download requests from S3 log files for UMP books
+## Fixed
+
+## 2024-03-21 -- v0.13.0
 ## Added
 - New script to add nypl_login flag to Links objects
 - Added nypl_login flags to NYPL and University of Michigan mapping and process
3 changes: 2 additions & 1 deletion scripts/__init__.py
@@ -13,4 +13,5 @@
 from .updatePubLocationAndLinks import main as updateLocationAndLinks
 from .countCABooks import main as countCA
 from .nyplLoginFlags import main as nyplFlags
-from .deleteUMPManifestLinks import main as deleteUMPManifests
+from .deleteUMPManifestLinks import main as deleteUMPManifests
+from .parseDownloadRequests import main as parseDownloads
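With this alias exported from scripts/__init__.py, the new script can be invoked programmatically. A minimal sketch of a driver, assuming the repository root is on PYTHONPATH and that the POSTGRES_* variables and AWS credentials used by the script are already set in the environment:

# Hypothetical driver: calls the parseDownloads alias exported above.
# Assumes AWS credentials and the POSTGRES_* environment variables are set.
from scripts import parseDownloads

parseDownloads()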
88 changes: 88 additions & 0 deletions scripts/parseDownloadRequests.py
@@ -0,0 +1,88 @@
import csv
import os
import re

import boto3

from model import Edition, Item, Link
from model.postgres.item import ITEM_LINKS
from managers import DBManager

s3_client = boto3.client('s3')

bucketName = 'ump-pdf-repository-logs'
logPrefix = 'logs/946183545209/us-east-1/ump-pdf-repository/2024/03/13/'
requestRegex = r'REST.GET.OBJECT '
fileIDRegex = r'REST.GET.OBJECT (.+pdf\s)'  # the captured file ID includes the file name of the PDF object
timeStampRegex = r'\[.+\]'
referrerRegex = r'https://drb-qa.nypl.org/'
umpDownloadArray = [['title', 'timeStamp', 'identifier']]  # header row; one row per parsed request is appended

def main():
    '''
    Parse the edition title, identifier, and timestamp out of the S3
    server access log files for UMP download requests and write them
    to a CSV file.
    '''
    for batch in load_batch():
        # Pages with no matching objects omit the 'Contents' key entirely
        for content in batch.get('Contents', []):
            currKey = str(content['Key'])
            # get_object returns a dict; its 'Body' is a streaming response
            logObject = s3_client.get_object(Bucket=bucketName, Key=currKey)
            for line in logObject['Body'].iter_lines():
                parsedEntry = parseInfo(line.decode('utf8'))
                if parsedEntry:
                    umpDownloadArray.append(parsedEntry)

    # The csv module handles quoting of titles that contain commas
    with open('data3.csv', 'w', newline='') as csvFile:
        csv.writer(csvFile).writerows(umpDownloadArray)

def load_batch():
    # Paginate so that prefixes holding more than 1,000 log objects are fully listed
    paginator = s3_client.get_paginator('list_objects_v2')
    return paginator.paginate(Bucket=bucketName, Prefix=logPrefix)

def parseInfo(logLine):
    matchRequest = re.search(requestRegex, logLine)
    matchReferrer = re.search(referrerRegex, logLine)

    # Only parse successful GET requests that came from the DRB QA frontend
    if not matchRequest or not matchReferrer or '403 AccessDenied' in logLine:
        return None

    matchTime = re.search(timeStampRegex, logLine)
    matchFileID = re.search(fileIDRegex, logLine)
    linkGroup = matchFileID.group(1)
    titleParse = ''
    idParse = None

    dbManager = DBManager(
        user=os.environ.get('POSTGRES_USER', None),
        pswd=os.environ.get('POSTGRES_PSWD', None),
        host=os.environ.get('POSTGRES_HOST', None),
        port=os.environ.get('POSTGRES_PORT', None),
        db=os.environ.get('POSTGRES_NAME', None)
    )
    dbManager.generateEngine()
    dbManager.createSession()

    # Match the requested file key against the PDF links of UofM items,
    # then pull the title and identifier from the owning edition
    for item in dbManager.session.query(Item) \
            .filter(Item.source == 'UofM'):
        for link in dbManager.session.query(Link) \
                .join(ITEM_LINKS) \
                .filter(ITEM_LINKS.c.item_id == item.id) \
                .filter(Link.media_type == 'application/pdf') \
                .filter(Link.url.contains(linkGroup.strip())).all():
            for edit in dbManager.session.query(Edition) \
                    .filter(Edition.id == item.edition_id):
                titleParse = edit.title
                idParse = edit.id

    dbManager.closeConnection()

    return [titleParse, matchTime.group(0), idParse]


if __name__ == '__main__':
    main()
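For reference, a minimal, self-contained sketch of how the module's regexes pull the timestamp and file key out of a single log line. The sample line below is fabricated and abridged (real S3 server access log entries carry many more fields); only the regex behavior is illustrated:

import re

# Fabricated, abridged S3 server access log line; every field value here
# is an illustrative placeholder, not real log data.
sampleLine = (
    'ownerid ump-pdf-repository-logs [13/Mar/2024:22:01:10 +0000] '
    '10.0.0.1 requester reqid REST.GET.OBJECT 12345/example-title.pdf '
    '200 - 1024 "https://drb-qa.nypl.org/" "Mozilla/5.0"'
)

# The request and referrer patterns gate which lines get parsed at all
if re.search(r'REST.GET.OBJECT ', sampleLine) and re.search(r'https://drb-qa.nypl.org/', sampleLine):
    print(re.search(r'\[.+\]', sampleLine).group(0))
    # -> [13/Mar/2024:22:01:10 +0000]
    print(re.search(r'REST.GET.OBJECT (.+pdf\s)', sampleLine).group(1).strip())
    # -> 12345/example-title.pdf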
