Skip to content

Commit

Permalink
Merge branch 'main' into SFR-1763_UseNon5000PortInDev
Browse files Browse the repository at this point in the history
  • Loading branch information
Apophenia authored Feb 1, 2024
2 parents 77a781f + 233473f commit ecf5a47
Show file tree
Hide file tree
Showing 11 changed files with 6,550 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ config/local*
tags
# Vim
*.swp
/*venv*
/*venv*
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,16 @@ New /fulfill endpoint with ability to check for NYPL login in Bearer authorizati
Fulfill endpoint returns pre-signed URLs for objects in private buckets when user is logged in
Change default development port to 5050 due to macOS Monterey and higher occupying port 5000 by default


## unreleased version -- v0.12.4
## Added
- New script to add nypl_login flag to Links objects
- Added nypl_login flag to nypl mapping
- New APIUtils method to generate a presigned url for S3 actions
- New /fulfill endpoint with ability to check for NYPL login in Bearer authorization header
- Fulfill endpoint returns pre-signed URLs for objects in private buckets when user is logged in
- Added new University of Michigan process and mapping for ingestion
- New directory for test JSON files that will be ingested
## Fixed
- NYPL records not being added due to SQLAlchemy error
- Bardo CCE API and Hathi DataFiles URL updated
Expand Down
9 changes: 5 additions & 4 deletions api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,11 @@ class APIUtils():
'muse': 4,
'met': 5,
'isac': 6,
'UofSC': 7,
'hathitrust': 8,
'oclc': 9,
'nypl': 10
'UofM': 7,
'UofSC': 8,
'hathitrust': 9,
'oclc': 10,
'nypl': 11
}

@staticmethod
Expand Down
6,142 changes: 6,142 additions & 0 deletions ingestJSONFiles/UofM_CSV.json

Large diffs are not rendered by default.

File renamed without changes.
81 changes: 81 additions & 0 deletions mappings/UofM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from .json import JSONMapping
import logging

class UofMMapping(JSONMapping):
    """Mapping for University of Michigan (UofM) JSON source rows.

    The field mapping is defined in ``createMapping``; ``applyFormatting``
    then normalises the mapped record (authors, subjects, identifiers and the
    constant source/spatial values). How the ``(field, format)`` tuples are
    applied is defined by ``JSONMapping``, which is not visible here.
    """

    def __init__(self, source):
        """Initialise the base mapping with ``source`` and install the mapping.

        Args:
            source: the raw source row handed to ``JSONMapping``.
        """
        super().__init__(source, {})
        self.mapping = self.createMapping()

    def createMapping(self):
        """Return the mapping of record attributes to source fields.

        Each value is a ``(source field, format string)`` tuple or a list of
        such tuples.
        """
        return {
            'title': ('Title', '{0}'),
            'authors': ('Author(s)', '{0}'),
            'dates': [('Pub Date', '{0}|publication_date')],
            'publisher': [('Publisher (from Projects)', '{0}||')],
            'identifiers': [
                ('ISBN', '{0}|isbn'),
                ('OCLC', '{0}|oclc')
            ],
            'contributors': [('Contributors', '{0}|||contributor')],
            'subjects': ('Subject 1', '{0}'),
        }

    def applyFormatting(self):
        """Normalise ``self.record`` in place after the mapping was applied.

        Resets ``has_part``, sets the constant ``spatial``/``source`` values,
        reformats authors and subjects when present, and derives
        ``source_id`` from the identifiers before reformatting them.
        """
        self.record.has_part = []
        self.record.spatial = 'Michigan'
        self.record.source = 'UofM'

        if self.record.authors:
            self.record.authors = self.formatAuthors()

        if self.record.subjects:
            self.record.subjects = self.formatSubjects()

        if self.record.identifiers:
            # The second identifier is used for the source id whenever the
            # list does not hold exactly one entry; a lone identifier is used
            # as-is.
            idIndex = 0 if len(self.record.identifiers) == 1 else 1
            source_id = self.record.identifiers[idIndex].split('|')[0]

            self.record.source_id = f'UofM_{source_id}'
            self.record.identifiers = self.formatIdentifiers()

    def formatAuthors(self):
        """Return the authors as a list of ``'<name>|||true'`` strings.

        A ``;``-separated author string yields one entry per author; anything
        else yields a single entry.
        """
        authors = self.record.authors

        if ';' in authors:
            # Split on the bare separator and strip, so "A;B" and "A; B" are
            # both handled; empty fragments (e.g. a trailing ';') are dropped.
            names = [name.strip() for name in authors.split(';') if name.strip()]
        else:
            names = [authors]

        # The original single-author branch emitted a stray ')' after 'true'.
        return [f'{name}|||true' for name in names]

    def formatSubjects(self):
        """Return the subjects as a list of ``'<subject>||'`` strings.

        A ``|``-separated subject string yields one entry per subject;
        anything else yields a single entry.
        """
        subjects = self.record.subjects

        if '|' in subjects:
            subjectList = subjects.split('|')
        else:
            subjectList = [subjects]

        return [f'{subject}||' for subject in subjectList]

    def formatIdentifiers(self):
        """Return the identifiers with a multi-valued ISBN entry expanded.

        When the first identifier is an ISBN entry whose value contains
        several ``;``-separated ISBNs, each ISBN gets its own ``'<isbn>|isbn'``
        entry, followed by the second identifier if it is an OCLC entry.
        In every other case the identifier list is returned unchanged.
        """
        identifiers = self.record.identifiers

        if 'isbn' in identifiers[0]:
            isbnString = identifiers[0].split('|')[0]

            if ';' in isbnString:
                newISBNList = [
                    f'{isbn.strip()}|isbn'
                    for isbn in isbnString.split(';')
                    if isbn.strip()
                ]

                if len(identifiers) > 1 and 'oclc' in identifiers[1]:
                    newISBNList.append(f'{identifiers[1]}')

                return newISBNList

        return identifiers




159 changes: 159 additions & 0 deletions processes/UofM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import json
import os
from requests.exceptions import HTTPError, ConnectionError
from botocore.exceptions import ClientError

from .core import CoreProcess
from urllib.error import HTTPError
from mappings.core import MappingError
from mappings.UofM import UofMMapping
from managers import WebpubManifest
from logger import createLog

logger = createLog(__name__)

class UofMProcess(CoreProcess):
    """Ingest University of Michigan records from a local JSON file.

    Each row is mapped with ``UofMMapping``, linked to its PDF in S3 when one
    exists, given a webpub manifest for that PDF, and queued for saving.
    Database, S3 and record-queue helpers used below are not defined in this
    class; presumably they are inherited from ``CoreProcess``.
    """

    # Bucket holding the source PDFs, and the key suffixes tried in order.
    PDF_BUCKET = 'ump-pdf-repository'
    PDF_SUFFIXES = ('_060pct.pdf', '_100pct.pdf')

    # Flags attached to the links this process creates.
    PDF_LINK_FLAGS = '{"catalog": false, "download": true, "reader": false, "embed": false}'
    MANIFEST_LINK_FLAGS = '{"catalog": false, "download": false, "reader": true, "embed": false}'

    def __init__(self, *args):
        """Set up ingest bounds, the database session and the S3 client.

        Args:
            *args: positional process arguments. The first four go to the
                base class; ``args[4]`` is the optional ingest limit and
                ``args[5]`` the optional ingest offset.
        """
        super(UofMProcess, self).__init__(*args[:4])

        self.ingestOffset = int(args[5] or 0)
        self.ingestLimit = (int(args[4]) + self.ingestOffset) if args[4] else 5000
        self.fullImport = self.process == 'complete'

        # Connect to database
        self.generateEngine()
        self.createSession()

        # S3 Configuration
        self.s3Bucket = os.environ['FILE_BUCKET']
        self.createS3Client()

    def runProcess(self):
        """Process every row under the ``data`` key, then save and commit."""
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open('ingestJSONFiles/UofM_CSV.json', encoding='utf-8') as f:
            UofMData = json.load(f)

        for metaDict in UofMData['data']:
            self.processUofMRecord(metaDict)

        self.saveRecords()
        self.commitChanges()

    def processUofMRecord(self, record):
        """Map one source row, attach its links and queue it for saving.

        Failures of the listed exception types are logged and the row is
        skipped, so one bad row does not abort the whole run.

        Args:
            record: one source row (a dict from the ``data`` list).
        """
        try:
            UofMRec = UofMMapping(record)
            UofMRec.applyMapping()
            self.addHasPartMapping(record, UofMRec.record)
            self.storePDFManifest(UofMRec.record)
            self.addDCDWToUpdateList(UofMRec)

        # NOTE(review): this module imports HTTPError from both requests and
        # urllib.error; the later import wins, so only the urllib one is
        # caught here -- confirm that is intended.
        except (MappingError, HTTPError, ConnectionError, IndexError, TypeError) as e:
            logger.exception(e)
            logger.warning(UofMError('Unable to process UofM record'))

    def addHasPartMapping(self, resultsRecord, record):
        """Append a PDF download link to ``record.has_part`` if the PDF exists.

        The 60% PDF is tried first; the 100% PDF is tried only when the
        record still has no link afterwards.

        Args:
            resultsRecord: source row; its ``"File ID 1"`` value is the key
                prefix of the PDF.
            record: mapped record whose ``has_part`` list is extended.

        Raises:
            KeyError: if ``resultsRecord`` has no ``"File ID 1"`` entry.
        """
        fileID = resultsRecord["File ID 1"]

        for suffix in self.PDF_SUFFIXES:
            linkString = self._buildPDFLink(fileID, suffix)

            if linkString is not None:
                record.has_part.append(linkString)

            if record.has_part:
                break

    def _buildPDFLink(self, fileID, suffix):
        """Return the link string for one candidate PDF, or None if S3 rejects it."""
        key = f'{fileID}{suffix}'

        try:
            # get_object is called only to make sure the object with this
            # bucket and key exists in S3; its result is not used.
            self.s3Client.get_object(Bucket=self.PDF_BUCKET, Key=key)
        except ClientError as err:
            # `except ClientError or Exception or HTTPError` evaluated to
            # ClientError alone, which is also the only type carrying
            # `.response`, so it is now caught explicitly.
            errorCode = err.response.get('Error', {}).get('Code')

            if errorCode == 'NoSuchKey':
                logger.info(UofMError("Key doesn't exist"))
            else:
                logger.info(UofMError("Object doesn't exist"))

            return None

        urlPDFObject = f'https://{self.PDF_BUCKET}.s3.amazonaws.com/{key}'

        return '|'.join([
            '1',
            urlPDFObject,
            'UofM',
            'application/pdf',
            self.PDF_LINK_FLAGS
        ])

    def storePDFManifest(self, record):
        """Create a webpub manifest for the record's first PDF link.

        The manifest is uploaded to ``self.s3Bucket`` and a link to it is
        inserted at the front of ``record.has_part``. Records without a PDF
        link are left untouched.

        Args:
            record: mapped record with ``has_part`` and ``identifiers``.
        """
        for link in record.has_part:
            itemNo, uri, source, mediaType, flags = link.split('|')

            if mediaType == 'application/pdf':
                logger.warning(f'Identifiers: {record.identifiers}')
                recordID = record.identifiers[0].split('|')[0]

                manifestPath = 'manifests/{}/{}.json'.format(source, recordID)
                manifestURI = 'https://{}.s3.amazonaws.com/{}'.format(
                    self.s3Bucket, manifestPath
                )

                manifestJSON = self.generateManifest(record, uri, manifestURI)

                self.createManifestInS3(manifestPath, manifestJSON)

                linkString = '|'.join([
                    itemNo,
                    manifestURI,
                    source,
                    'application/webpub+json',
                    self.MANIFEST_LINK_FLAGS
                ])

                # Mutating the list mid-iteration is safe only because the
                # loop exits immediately afterwards.
                record.has_part.insert(0, linkString)
                break

    def createManifestInS3(self, manifestPath, manifestJSON):
        """Upload the UTF-8 encoded manifest JSON to ``self.s3Bucket``.

        Args:
            manifestPath: object key for the manifest.
            manifestJSON: manifest serialised as a JSON string.
        """
        self.putObjectInBucket(
            manifestJSON.encode('utf-8'), manifestPath, self.s3Bucket
        )

    @staticmethod
    def generateManifest(record, sourceURI, manifestURI):
        """Build the webpub manifest JSON for a PDF.

        Args:
            record: mapped record supplying the metadata and title.
            sourceURI: URL of the PDF the manifest describes.
            manifestURI: URL at which the manifest itself will live.

        Returns:
            The manifest as returned by ``WebpubManifest.toJson()``.
        """
        manifest = WebpubManifest(sourceURI, 'application/pdf')

        manifest.addMetadata(
            record,
            conformsTo=os.environ['WEBPUB_PDF_PROFILE']
        )

        manifest.addChapter(sourceURI, record.title)

        manifest.links.append({
            'rel': 'self',
            'href': manifestURI,
            'type': 'application/webpub+json'
        })

        return manifest.toJson()


class UofMError(Exception):
    """Error type used when reporting problems with a UofM record."""
1 change: 1 addition & 0 deletions processes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@
from .chicagoISAC import ChicagoISACProcess
from .UofSC import UofSCProcess
from .loc import LOCProcess
from .UofM import UofMProcess
2 changes: 1 addition & 1 deletion processes/chicagoISAC.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(self, *args):
self.createS3Client()

def runProcess(self):
with open('chicagoISAC_metadata.json') as f:
with open('ingestJSONFiles/chicagoISAC_metadata.json') as f:
chicagoISACData = json.load(f)

for metaDict in chicagoISACData:
Expand Down
49 changes: 49 additions & 0 deletions tests/unit/test_UofM_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pytest

from mappings.UofM import UofMMapping


class TestUofMMapping:
    """Unit tests for the field mapping and formatting of ``UofMMapping``."""

    @pytest.fixture
    def testMapping(self):
        """Return a mapping instance built without the real constructor."""
        class _BareUofMMapping(UofMMapping):
            # Skip the parent __init__ so no source row is required.
            def __init__(self):
                self.mapping = None

        return _BareUofMMapping()

    @pytest.fixture
    def testRecordStandard(self, mocker):
        """Return a mock record carrying one value per attribute."""
        recordFields = {
            'title': 'testTitle',
            'authors': ['testAuthor|||true'],
            'dates': ['testDate|publication_date'],
            'publisher': ['testPublisher||'],
            'identifiers': ['testISBN|isbn', 'testOCLC|oclc'],
            'contributor': ['testContributor|||contributor'],
            'subjects': 'testSubject',
        }

        return mocker.MagicMock(**recordFields)

    def test_createMapping(self, testMapping):
        """The mapping exposes the expected keys, in order, and the title rule."""
        expectedKeys = [
            'title', 'authors', 'dates', 'publisher',
            'identifiers', 'contributors', 'subjects'
        ]

        recordMapping = testMapping.createMapping()

        assert list(recordMapping) == expectedKeys
        assert recordMapping['title'] == ('Title', '{0}')

    def test_applyFormatting_standard(self, testMapping, testRecordStandard):
        """Formatting sets the constants and derives source_id from the OCLC id."""
        testMapping.record = testRecordStandard

        testMapping.applyFormatting()

        formatted = testMapping.record
        assert formatted.has_part == []
        assert formatted.source == 'UofM'
        assert formatted.identifiers == ['testISBN|isbn', 'testOCLC|oclc']
        assert formatted.source_id == 'UofM_testOCLC'
        assert formatted.publisher == ['testPublisher||']
        assert formatted.spatial == 'Michigan'
        assert formatted.subjects == ['testSubject||']


Loading

0 comments on commit ecf5a47

Please sign in to comment.