From 32d656711d7e1041cd738cf3d912271e207fa970 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Mon, 18 Mar 2024 16:43:35 -0400
Subject: [PATCH 1/3] SFR-1900: Add script to parse UMP download requests from S3 log files

---
 CHANGELOG.md                     |  1 +
 scripts/__init__.py              |  3 +-
 scripts/parseDataRequests.json   | 12 +++++
 scripts/parseDownloadRequests.py | 88 ++++++++++++++++++++++++++++++++
 4 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 scripts/parseDataRequests.json
 create mode 100644 scripts/parseDownloadRequests.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cf55d7012fa..3507146eec8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@
 - Added new University of Michigan process and mapping for ingestion
 - New directory for test JSON files that will be ingested
 - New script to delete UMP Manifest links from links table
+- New script to parse download requests from S3 log files for UMP books 
 ## Fixed
 - NYPL records not being added due to SQLAlchemy error
 - Bardo CCE API and Hathi DataFiles URL updated
diff --git a/scripts/__init__.py b/scripts/__init__.py
index 95b23e0f7c4..ee883ebe356 100644
--- a/scripts/__init__.py
+++ b/scripts/__init__.py
@@ -13,4 +13,5 @@
 from .updatePubLocationAndLinks import main as updateLocationAndLinks
 from .countCABooks import main as countCA
 from .nyplLoginFlags import main as nyplFlags
-from .deleteUMPManifestLinks import main as deleteUMPManifests
\ No newline at end of file
+from .deleteUMPManifestLinks import main as deleteUMPManifests
+from .parseDownloadRequests import main as parseDownloads
\ No newline at end of file
diff --git a/scripts/parseDataRequests.json b/scripts/parseDataRequests.json
new file mode 100644
index 00000000000..06dd266ec04
--- /dev/null
+++ b/scripts/parseDataRequests.json
@@ -0,0 +1,12 @@
+[
+      {
+            "title": "Educating economists",
+            "timeStamp": "[11/Mar/2024:18:28:52 +0000]",
+            "identifier": 6969309
+      },
+      {
+            "title": "Educating economists",
+            "timeStamp": "[11/Mar/2024:18:28:51 +0000]",
+            "identifier": 6969309
+      }
+]
\ No newline at end of file
diff --git a/scripts/parseDownloadRequests.py b/scripts/parseDownloadRequests.py
new file mode 100644
index 00000000000..35cee54e899
--- /dev/null
+++ b/scripts/parseDownloadRequests.py
@@ -0,0 +1,88 @@
+import os
+import boto3
+import json
+import re
+
+from model import Edition, Item, Link
+from model.postgres.item import ITEM_LINKS
+from managers import DBManager
+
+s3_client = boto3.client("s3")
+
+bucketName = 'ump-pdf-repository-logs'
+
+def main():
+    
+    
+    '''
+    The edition title, identifier, and timestamp are parsed out of the 
+    S3 server access log files for UMP download requests
+    '''
+
+    requestRegex = r'REST.GET.OBJECT '
+    fileIDRegex = r'REST.GET.OBJECT (.+pdf\s)' #File ID includes the file name for the pdf object
+    timeStampRegex = r'\[.+\]'
+    referrerRegex = r'https://drb-qa.nypl.org/'
+    umpDownloadJSON = []
+
+    batches = load_batch()
+    for batch in batches:
+        for c in batch['Contents']:
+            currKey = str(c['Key'])
+            # logObject is the get_object response (a dict); its 'Body' entry is read line by line below
+            logObject = s3_client.get_object(Bucket= bucketName, Key= f'{currKey}')
+            for i in logObject['Body'].iter_lines():
+                logObject = i.decode('utf8')
+                parseTuple = parseInfo(logObject, requestRegex, referrerRegex, timeStampRegex, fileIDRegex)
+                if parseTuple:
+                    umpDownloadJSON.append(parseTuple)
+    with open("parseDataRequests.json", "w", encoding='utf-8') as write_file:
+        json.dump(umpDownloadJSON, write_file, ensure_ascii = False, indent = 6) 
+
+def load_batch():
+    paginator = s3_client.get_paginator('list_objects_v2')
+    page_iterator = paginator.paginate(Bucket= bucketName, Prefix= 'logs/946183545209/us-east-1/ump-pdf-repository/2024/03/13/')
+    return page_iterator
+
+def parseInfo(logObject, requestRegex, referrerRegex, timeStampRegex, fileIDRegex):
+    matchRequest = re.search(requestRegex, logObject)
+    matchReferrer = re.search(referrerRegex, logObject) 
+    
+    if matchRequest and matchReferrer and '403 AccessDenied' not in logObject:
+        matchTime = re.search(timeStampRegex, logObject)
+        matchFileID = re.search(fileIDRegex, logObject)
+        linkGroup = matchFileID.group(1)
+        titleParse = ''
+        idParse = None
+
+        dbManager = DBManager(
+        user= os.environ.get('POSTGRES_USER', None),
+        pswd= os.environ.get('POSTGRES_PSWD', None),
+        host= os.environ.get('POSTGRES_HOST', None),
+        port= os.environ.get('POSTGRES_PORT', None),
+        db= os.environ.get('POSTGRES_NAME', None)
+    )
+        dbManager.generateEngine()
+
+        dbManager.createSession()
+
+        for item in dbManager.session.query(Item) \
+            .filter(Item.source == 'UofM'):
+            for link in dbManager.session.query(Link) \
+                .join(ITEM_LINKS) \
+                .filter(ITEM_LINKS.c.item_id == item.id) \
+                .filter(Link.media_type == 'application/pdf') \
+                .filter(Link.url.contains(linkGroup.strip())).all(): 
+                    itemEditID = item.edition_id
+                    for edit in dbManager.session.query(Edition) \
+                        .filter(Edition.id == itemEditID):
+                            titleParse = edit.title
+                            idParse = edit.id
+        
+        dbManager.closeConnection()
+
+        return {'title': titleParse, 'timeStamp': matchTime.group(0), 'identifier': idParse}
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file

From 808110edfbdfea1bed85858c44ab733df6f177e7 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Fri, 22 Mar 2024 14:43:06 -0400
Subject: [PATCH 2/3] Change output file from JSON to CSV

---
 CHANGELOG.md                     |  2 +-
 data3.csv                        |  5 +++++
 scripts/parseDownloadRequests.py | 13 +++++++------
 3 files changed, 13 insertions(+), 7 deletions(-)
 create mode 100644 data3.csv

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3507146eec8..06478ccbf13 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # CHANGELOG
 
-## unreleased version -- v0.12.4
+## 2024-03-21 -- v0.13.0
 ## Added
 - New script to add nypl_login flag to Links objects
 - Added nypl_login flags to NYPL and University of Michigan mapping and process
diff --git a/data3.csv b/data3.csv
new file mode 100644
index 00000000000..56b8cb72598
--- /dev/null
+++ b/data3.csv
@@ -0,0 +1,5 @@
+[['title' 'timeStamp' 'identifier']
+ ['China enters the twentieth century' '[13/Mar/2024:21:07:46 +0000]'
+  '6969533']
+ ['China enters the twentieth century' '[13/Mar/2024:21:07:46 +0000]'
+  '6969533']]
\ No newline at end of file
diff --git a/scripts/parseDownloadRequests.py b/scripts/parseDownloadRequests.py
index 35cee54e899..214279c98ec 100644
--- a/scripts/parseDownloadRequests.py
+++ b/scripts/parseDownloadRequests.py
@@ -1,7 +1,7 @@
 import os
 import boto3
-import json
 import re
+import numpy 
 
 from model import Edition, Item, Link
 from model.postgres.item import ITEM_LINKS
@@ -23,7 +23,7 @@ def main():
     fileIDRegex = r'REST.GET.OBJECT (.+pdf\s)' #File ID includes the file name for the pdf object
     timeStampRegex = r'\[.+\]'
     referrerRegex = r'https://drb-qa.nypl.org/'
-    umpDownloadJSON = []
+    umpDownloadArray = [['title', 'timeStamp', 'identifier']]
 
     batches = load_batch()
     for batch in batches:
@@ -35,9 +35,10 @@ def main():
                 logObject = i.decode('utf8')
                 parseTuple = parseInfo(logObject, requestRegex, referrerRegex, timeStampRegex, fileIDRegex)
                 if parseTuple:
-                    umpDownloadJSON.append(parseTuple)
-    with open("parseDataRequests.json", "w", encoding='utf-8') as write_file:
-        json.dump(umpDownloadJSON, write_file, ensure_ascii = False, indent = 6) 
+                    umpDownloadArray.append(parseTuple)
+    umpDownloadCSV = numpy.array(umpDownloadArray)
+    with open('data3.csv', 'w') as f: 
+        f.write(str(umpDownloadCSV))
 
 def load_batch():
     paginator = s3_client.get_paginator('list_objects_v2')
@@ -81,7 +82,7 @@ def parseInfo(logObject, requestRegex, referrerRegex, timeStampRegex, fileIDRege
         
         dbManager.closeConnection()
 
-        return {'title': titleParse, 'timeStamp': matchTime.group(0), 'identifier': idParse}
+        return [titleParse, matchTime.group(0), idParse]
 
 
 if __name__ == '__main__':

From 4b5eb3e7670a25b16df11400e6c4f5f9c23b41d1 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Mon, 25 Mar 2024 14:29:10 -0400
Subject: [PATCH 3/3] Delete test outputs, update changelog, modify script

---
 CHANGELOG.md                     |  6 +++++-
 data3.csv                        |  5 -----
 scripts/parseDataRequests.json   | 12 ------------
 scripts/parseDownloadRequests.py | 19 +++++++++----------
 4 files changed, 14 insertions(+), 28 deletions(-)
 delete mode 100644 data3.csv
 delete mode 100644 scripts/parseDataRequests.json

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 06478ccbf13..f9c71b47926 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # CHANGELOG
 
+## unreleased version -- v0.13.1
+## Added
+- New script to parse download requests from S3 log files for UMP books 
+## Fixed
+
 ## 2024-03-21 -- v0.13.0
 ## Added
 - New script to add nypl_login flag to Links objects
@@ -11,7 +16,6 @@
 - Added new University of Michigan process and mapping for ingestion
 - New directory for test JSON files that will be ingested
 - New script to delete UMP Manifest links from links table
-- New script to parse download requests from S3 log files for UMP books 
 ## Fixed
 - NYPL records not being added due to SQLAlchemy error
 - Bardo CCE API and Hathi DataFiles URL updated
diff --git a/data3.csv b/data3.csv
deleted file mode 100644
index 56b8cb72598..00000000000
--- a/data3.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-[['title' 'timeStamp' 'identifier']
- ['China enters the twentieth century' '[13/Mar/2024:21:07:46 +0000]'
-  '6969533']
- ['China enters the twentieth century' '[13/Mar/2024:21:07:46 +0000]'
-  '6969533']]
\ No newline at end of file
diff --git a/scripts/parseDataRequests.json b/scripts/parseDataRequests.json
deleted file mode 100644
index 06dd266ec04..00000000000
--- a/scripts/parseDataRequests.json
+++ /dev/null
@@ -1,12 +0,0 @@
-[
-      {
-            "title": "Educating economists",
-            "timeStamp": "[11/Mar/2024:18:28:52 +0000]",
-            "identifier": 6969309
-      },
-      {
-            "title": "Educating economists",
-            "timeStamp": "[11/Mar/2024:18:28:51 +0000]",
-            "identifier": 6969309
-      }
-]
\ No newline at end of file
diff --git a/scripts/parseDownloadRequests.py b/scripts/parseDownloadRequests.py
index 214279c98ec..84397adb7f3 100644
--- a/scripts/parseDownloadRequests.py
+++ b/scripts/parseDownloadRequests.py
@@ -10,21 +10,20 @@
 s3_client = boto3.client("s3")
 
 bucketName = 'ump-pdf-repository-logs'
+logPrefix = 'logs/946183545209/us-east-1/ump-pdf-repository/2024/03/13/'
+requestRegex = r'REST.GET.OBJECT '
+fileIDRegex = r'REST.GET.OBJECT (.+pdf\s)' #File ID includes the file name for the pdf object
+timeStampRegex = r'\[.+\]'
+referrerRegex = r'https://drb-qa.nypl.org/'
+umpDownloadArray = [['title', 'timeStamp', 'identifier']]
 
 def main():
     
-    
     '''
     The edition title, identifier, and timestamp are parsed out of the 
     S3 server access log files for UMP download requests
     '''
 
-    requestRegex = r'REST.GET.OBJECT '
-    fileIDRegex = r'REST.GET.OBJECT (.+pdf\s)' #File ID includes the file name for the pdf object
-    timeStampRegex = r'\[.+\]'
-    referrerRegex = r'https://drb-qa.nypl.org/'
-    umpDownloadArray = [['title', 'timeStamp', 'identifier']]
-
     batches = load_batch()
     for batch in batches:
         for c in batch['Contents']:
@@ -33,7 +32,7 @@ def main():
             logObject = s3_client.get_object(Bucket= bucketName, Key= f'{currKey}')
             for i in logObject['Body'].iter_lines():
                 logObject = i.decode('utf8')
-                parseTuple = parseInfo(logObject, requestRegex, referrerRegex, timeStampRegex, fileIDRegex)
+                parseTuple = parseInfo(logObject)
                 if parseTuple:
                     umpDownloadArray.append(parseTuple)
     umpDownloadCSV = numpy.array(umpDownloadArray)
@@ -42,10 +41,10 @@ def main():
 
 def load_batch():
     paginator = s3_client.get_paginator('list_objects_v2')
-    page_iterator = paginator.paginate(Bucket= bucketName, Prefix= 'logs/946183545209/us-east-1/ump-pdf-repository/2024/03/13/')
+    page_iterator = paginator.paginate(Bucket= bucketName, Prefix=logPrefix)
     return page_iterator
 
-def parseInfo(logObject, requestRegex, referrerRegex, timeStampRegex, fileIDRegex):
+def parseInfo(logObject):
     matchRequest = re.search(requestRegex, logObject)
     matchReferrer = re.search(referrerRegex, logObject)