From 32d656711d7e1041cd738cf3d912271e207fa970 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 18 Mar 2024 16:43:35 -0400 Subject: [PATCH 1/3] SFR-1900_ParseDownloadRequests --- CHANGELOG.md | 1 + scripts/__init__.py | 3 +- scripts/parseDataRequests.json | 12 +++++ scripts/parseDownloadRequests.py | 88 ++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 scripts/parseDataRequests.json create mode 100644 scripts/parseDownloadRequests.py diff --git a/CHANGELOG.md b/CHANGELOG.md index cf55d7012fa..3507146eec8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - Added new University of Michigan process and mapping for ingestion - New directory for test JSON files that will be ingested - New script to delete UMP Manifest links from links table +- New script to parse download requests from S3 log files for UMP books ## Fixed - NYPL records not being added due to SQLAlchemy error - Bardo CCE API and Hathi DataFiles URL updated diff --git a/scripts/__init__.py b/scripts/__init__.py index 95b23e0f7c4..ee883ebe356 100644 --- a/scripts/__init__.py +++ b/scripts/__init__.py @@ -13,4 +13,5 @@ from .updatePubLocationAndLinks import main as updateLocationAndLinks from .countCABooks import main as countCA from .nyplLoginFlags import main as nyplFlags -from .deleteUMPManifestLinks import main as deleteUMPManifests \ No newline at end of file +from .deleteUMPManifestLinks import main as deleteUMPManifests +from .parseDownloadRequests import main as parseDownloads \ No newline at end of file diff --git a/scripts/parseDataRequests.json b/scripts/parseDataRequests.json new file mode 100644 index 00000000000..06dd266ec04 --- /dev/null +++ b/scripts/parseDataRequests.json @@ -0,0 +1,12 @@ +[ + { + "title": "Educating economists", + "timeStamp": "[11/Mar/2024:18:28:52 +0000]", + "identifier": 6969309 + }, + { + "title": "Educating economists", + "timeStamp": "[11/Mar/2024:18:28:51 +0000]", + "identifier": 6969309 + } +] \ No newline at end of file diff --git a/scripts/parseDownloadRequests.py b/scripts/parseDownloadRequests.py new file mode 100644 index 00000000000..35cee54e899 --- /dev/null +++ b/scripts/parseDownloadRequests.py @@ -0,0 +1,88 @@ +import os +import boto3 +import json +import re + +from model import Edition, Item, Link +from model.postgres.item import ITEM_LINKS +from managers import DBManager + +s3_client = boto3.client("s3") + +bucketName = 'ump-pdf-repository-logs' + +def main(): + + + ''' + The edition title, identifier, and timestamp are parsed out of the + S3 server access log files for UMP download requests + ''' + + requestRegex = r'REST.GET.OBJECT ' + fileIDRegex = r'REST.GET.OBJECT (.+pdf\s)' #File ID includes the file name for the pdf object + timeStampRegex = r'\[.+\]' + referrerRegex = r'https://drb-qa.nypl.org/' + umpDownloadJSON = [] + + batches = load_batch() + for batch in batches: + for c in batch['Contents']: + currKey = str(c['Key']) + #logObject is a dict type + logObject = s3_client.get_object(Bucket= bucketName, Key= f'{currKey}') + for i in logObject['Body'].iter_lines(): + logObject = i.decode('utf8') + parseTuple = parseInfo(logObject, requestRegex, referrerRegex, timeStampRegex, fileIDRegex) + if parseTuple: + umpDownloadJSON.append(parseTuple) + with open("parseDataRequests.json", "w", encoding='utf-8') as write_file: + json.dump(umpDownloadJSON, write_file, ensure_ascii = False, indent = 6) + +def load_batch(): + paginator = s3_client.get_paginator('list_objects_v2') + page_iterator = paginator.paginate(Bucket= bucketName, Prefix= 'logs/946183545209/us-east-1/ump-pdf-repository/2024/03/13/') + return page_iterator + +def parseInfo(logObject, requestRegex, referrerRegex, timeStampRegex, fileIDRegex): + matchRequest = re.search(requestRegex, logObject) + matchReferrer = re.search(referrerRegex, logObject) + + if matchRequest and matchReferrer and '403 AccessDenied' not in logObject: + matchTime = re.search(timeStampRegex, logObject) + matchFileID = re.search(fileIDRegex, logObject) + linkGroup = matchFileID.group(1) + titleParse = '' + idParse = None + + dbManager = DBManager( + user= os.environ.get('POSTGRES_USER', None), + pswd= os.environ.get('POSTGRES_PSWD', None), + host= os.environ.get('POSTGRES_HOST', None), + port= os.environ.get('POSTGRES_PORT', None), + db= os.environ.get('POSTGRES_NAME', None) + ) + dbManager.generateEngine() + + dbManager.createSession() + + for item in dbManager.session.query(Item) \ + .filter(Item.source == 'UofM'): + for link in dbManager.session.query(Link) \ + .join(ITEM_LINKS) \ + .filter(ITEM_LINKS.c.item_id == item.id) \ + .filter(Link.media_type == 'application/pdf') \ + .filter(Link.url.contains(linkGroup.strip())).all(): + itemEditID = item.edition_id + for edit in dbManager.session.query(Edition) \ + .filter(Edition.id == itemEditID): + titleParse = edit.title + idParse = edit.id + + dbManager.closeConnection() + + return {'title': titleParse, 'timeStamp': matchTime.group(0), 'identifier': idParse} + + +if __name__ == '__main__': + main() \ No newline at end of file From 808110edfbdfea1bed85858c44ab733df6f177e7 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 22 Mar 2024 14:43:06 -0400 Subject: [PATCH 2/3] Changed json to csv file --- CHANGELOG.md | 2 +- data3.csv | 5 +++++ scripts/parseDownloadRequests.py | 13 +++++++------ 3 files changed, 13 insertions(+), 7 deletions(-) create mode 100644 data3.csv diff --git a/CHANGELOG.md b/CHANGELOG.md index 3507146eec8..06478ccbf13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # CHANGELOG -## unreleased version -- v0.12.4 +## 2024-03-21 -- v0.13.0 ## Added - New script to add nypl_login flag to Links objects - Added nypl_login flags to NYPL and University of Michigan mapping and process diff --git a/data3.csv b/data3.csv new file mode 100644 index 00000000000..56b8cb72598 --- /dev/null +++ b/data3.csv @@ -0,0 +1,5 @@ +[['title' 'timeStamp' 'identifier'] + ['China enters the twentieth century' '[13/Mar/2024:21:07:46 +0000]' + '6969533'] + ['China enters the twentieth century' '[13/Mar/2024:21:07:46 +0000]' + '6969533']] \ No newline at end of file diff --git a/scripts/parseDownloadRequests.py b/scripts/parseDownloadRequests.py index 35cee54e899..214279c98ec 100644 --- a/scripts/parseDownloadRequests.py +++ b/scripts/parseDownloadRequests.py @@ -1,7 +1,7 @@ import os import boto3 -import json import re +import numpy from model import Edition, Item, Link from model.postgres.item import ITEM_LINKS @@ -23,7 +23,7 @@ def main(): fileIDRegex = r'REST.GET.OBJECT (.+pdf\s)' #File ID includes the file name for the pdf object timeStampRegex = r'\[.+\]' referrerRegex = r'https://drb-qa.nypl.org/' - umpDownloadJSON = [] + umpDownloadArray = [['title', 'timeStamp', 'identifier']] batches = load_batch() for batch in batches: @@ -35,9 +35,10 @@ def main(): logObject = i.decode('utf8') parseTuple = parseInfo(logObject, requestRegex, referrerRegex, timeStampRegex, fileIDRegex) if parseTuple: - umpDownloadJSON.append(parseTuple) - with open("parseDataRequests.json", "w", encoding='utf-8') as write_file: - json.dump(umpDownloadJSON, write_file, ensure_ascii = False, indent = 6) + umpDownloadArray.append(parseTuple) + umpDownloadCSV = numpy.array(umpDownloadArray) + with open('data3.csv', 'w') as f: + f.write(str(umpDownloadCSV)) def load_batch(): paginator = s3_client.get_paginator('list_objects_v2') @@ -81,7 +82,7 @@ def parseInfo(logObject, requestRegex, referrerRegex, timeStampRegex, fileIDRege dbManager.closeConnection() - return {'title': titleParse, 'timeStamp': matchTime.group(0), 'identifier': idParse} + return [titleParse, matchTime.group(0), idParse] if __name__ == '__main__': From 4b5eb3e7670a25b16df11400e6c4f5f9c23b41d1 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 25 Mar 2024 14:29:10 -0400 Subject: [PATCH 3/3] Deleted test outputs, update changelog, modify script --- CHANGELOG.md | 6 +++++- data3.csv | 5 ----- scripts/parseDataRequests.json | 12 ------------ scripts/parseDownloadRequests.py | 19 +++++++++---------- 4 files changed, 14 insertions(+), 28 deletions(-) delete mode 100644 data3.csv delete mode 100644 scripts/parseDataRequests.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 06478ccbf13..f9c71b47926 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # CHANGELOG +## unreleased version -- v0.13.1 +## Added +- New script to parse download requests from S3 log files for UMP books +## Fixed + ## 2024-03-21 -- v0.13.0 ## Added - New script to add nypl_login flag to Links objects @@ -11,7 +16,6 @@ - Added new University of Michigan process and mapping for ingestion - New directory for test JSON files that will be ingested - New script to delete UMP Manifest links from links table -- New script to parse download requests from S3 log files for UMP books ## Fixed - NYPL records not being added due to SQLAlchemy error - Bardo CCE API and Hathi DataFiles URL updated diff --git a/data3.csv b/data3.csv deleted file mode 100644 index 56b8cb72598..00000000000 --- a/data3.csv +++ /dev/null @@ -1,5 +0,0 @@ -[['title' 'timeStamp' 'identifier'] - ['China enters the twentieth century' '[13/Mar/2024:21:07:46 +0000]' - '6969533'] - ['China enters the twentieth century' '[13/Mar/2024:21:07:46 +0000]' - '6969533']] \ No newline at end of file diff --git a/scripts/parseDataRequests.json b/scripts/parseDataRequests.json deleted file mode 100644 index 06dd266ec04..00000000000 --- a/scripts/parseDataRequests.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "title": "Educating economists", - "timeStamp": "[11/Mar/2024:18:28:52 +0000]", - "identifier": 6969309 - }, - { - "title": "Educating economists", - "timeStamp": "[11/Mar/2024:18:28:51 +0000]", - "identifier": 6969309 - } -] \ No newline at end of file diff --git a/scripts/parseDownloadRequests.py b/scripts/parseDownloadRequests.py index 214279c98ec..84397adb7f3 100644 --- a/scripts/parseDownloadRequests.py +++ b/scripts/parseDownloadRequests.py @@ -10,21 +10,20 @@ s3_client = boto3.client("s3") bucketName = 'ump-pdf-repository-logs' +logPrefix = 'logs/946183545209/us-east-1/ump-pdf-repository/2024/03/13/' +requestRegex = r'REST.GET.OBJECT ' +fileIDRegex = r'REST.GET.OBJECT (.+pdf\s)' #File ID includes the file name for the pdf object +timeStampRegex = r'\[.+\]' +referrerRegex = r'https://drb-qa.nypl.org/' +umpDownloadArray = [['title', 'timeStamp', 'identifier']] def main(): - ''' The edition title, identifier, and timestamp are parsed out of the S3 server access log files for UMP download requests ''' - requestRegex = r'REST.GET.OBJECT ' - fileIDRegex = r'REST.GET.OBJECT (.+pdf\s)' #File ID includes the file name for the pdf object - timeStampRegex = r'\[.+\]' - referrerRegex = r'https://drb-qa.nypl.org/' - umpDownloadArray = [['title', 'timeStamp', 'identifier']] - batches = load_batch() for batch in batches: for c in batch['Contents']: @@ -33,7 +32,7 @@ def main(): logObject = s3_client.get_object(Bucket= bucketName, Key= f'{currKey}') for i in logObject['Body'].iter_lines(): logObject = i.decode('utf8') - parseTuple = parseInfo(logObject, requestRegex, referrerRegex, timeStampRegex, fileIDRegex) + parseTuple = parseInfo(logObject) if parseTuple: umpDownloadArray.append(parseTuple) umpDownloadCSV = numpy.array(umpDownloadArray) @@ -42,10 +41,10 @@ def main(): def load_batch(): paginator = s3_client.get_paginator('list_objects_v2') - page_iterator = paginator.paginate(Bucket= bucketName, Prefix= 'logs/946183545209/us-east-1/ump-pdf-repository/2024/03/13/') + page_iterator = paginator.paginate(Bucket= bucketName, Prefix=logPrefix) return page_iterator -def parseInfo(logObject, requestRegex, referrerRegex, timeStampRegex, fileIDRegex): +def parseInfo(logObject): matchRequest = re.search(requestRegex, logObject) matchReferrer = re.search(referrerRegex, logObject)