From 822230596a760717b840b3c74f7084f4afb4b6ff Mon Sep 17 00:00:00 2001 From: Kyle Villegas <86266231+kylevillegas93@users.noreply.github.com> Date: Tue, 28 Jan 2025 12:25:13 -0500 Subject: [PATCH] NO-REF: Asserting manifests are uploaded during ingest (#544) --- ingestJSONFiles/chicagoISAC_metadata.json | 2 +- model/postgres/record.py | 10 ++++++++++ .../processes/ingest/assert_ingested_records.py | 2 +- .../ingest/assert_uploaded_manifests.py | 17 +++++++++++++++++ .../ingest/test_chicago_isac_process.py | 4 +++- .../processes/ingest/test_doab_process.py | 4 +++- .../processes/ingest/test_gutenberg_process.py | 5 ++++- .../processes/ingest/test_loc_process.py | 4 +++- .../processes/ingest/test_met_process.py | 16 +++------------- .../processes/ingest/test_muse_process.py | 6 ++++-- 10 files changed, 49 insertions(+), 21 deletions(-) create mode 100644 tests/functional/processes/ingest/assert_uploaded_manifests.py diff --git a/ingestJSONFiles/chicagoISAC_metadata.json b/ingestJSONFiles/chicagoISAC_metadata.json index ba6a67baf40..972c66adab7 100644 --- a/ingestJSONFiles/chicagoISAC_metadata.json +++ b/ingestJSONFiles/chicagoISAC_metadata.json @@ -6800,7 +6800,7 @@ "Ghent and A Joint Publication of the University of Ghent and the Oriental Institute of the University of Chicago" ], "publicationDate": "2014", - "isbn": "Pp. xix + 102; 48 figures, 136 plates, 9 tables", + "isbn": "9781614910183", "extent": "Hardback, 24 x 34.5 cm", "url": [ "https://isac.uchicago.edu//sites/default/files/uploads/shared/docs/Mesopotamian_Pottery.pdf" diff --git a/model/postgres/record.py b/model/postgres/record.py index 8e287902ba8..28363269bed 100644 --- a/model/postgres/record.py +++ b/model/postgres/record.py @@ -20,6 +20,11 @@ class Part: def get_file_bucket(self) -> Optional[str]: parsed_url = urlparse(self.url) + if 'localhost' in parsed_url.hostname: + path_parts = parsed_url.path.split('/') + + return path_parts[1] + if 's3' not in parsed_url.hostname: return None @@ -28,6 +33,11 @@ def get_file_bucket(self) -> Optional[str]: def get_file_key(self) -> Optional[str]: parsed_url = urlparse(self.url) + if 'localhost' in parsed_url.hostname: + path_parts = parsed_url.path.split('/') + + return '/'.join(path_parts[2:]) + if 's3' not in parsed_url.hostname: return None diff --git a/tests/functional/processes/ingest/assert_ingested_records.py b/tests/functional/processes/ingest/assert_ingested_records.py index ae47f83fdff..3185b290656 100644 --- a/tests/functional/processes/ingest/assert_ingested_records.py +++ b/tests/functional/processes/ingest/assert_ingested_records.py @@ -15,6 +15,6 @@ def assert_ingested_records(source_name: str) -> list[Record]: .all() ) - assert len(records) > 1 + assert len(records) >= 1 return records diff --git a/tests/functional/processes/ingest/assert_uploaded_manifests.py b/tests/functional/processes/ingest/assert_uploaded_manifests.py new file mode 100644 index 00000000000..d06e0dc25df --- /dev/null +++ b/tests/functional/processes/ingest/assert_uploaded_manifests.py @@ -0,0 +1,17 @@ +from model import Record +from managers.s3 import S3Manager + + +def assert_uploaded_manifests(records: list[Record]): + s3_manager = S3Manager() + s3_manager.createS3Client() + + for record in records: + parts = record.get_parts() + + manifest_part = next((part for part in parts if part.file_type == 'application/webpub+json'), None) + + if manifest_part and 'epubs' not in manifest_part.url: + manifest_head_response = s3_manager.s3Client.head_object(Key=manifest_part.get_file_key(), Bucket=manifest_part.get_file_bucket()) + + assert manifest_head_response is not None diff --git a/tests/functional/processes/ingest/test_chicago_isac_process.py b/tests/functional/processes/ingest/test_chicago_isac_process.py index 264a0715e52..b917ea209b3 100644 --- a/tests/functional/processes/ingest/test_chicago_isac_process.py +++ b/tests/functional/processes/ingest/test_chicago_isac_process.py @@ -1,5 +1,6 @@ from processes import ChicagoISACProcess from .assert_ingested_records import assert_ingested_records +from .assert_uploaded_manifests import assert_uploaded_manifests def test_chigaco_isac_process(): @@ -7,4 +8,5 @@ def test_chigaco_isac_process(): isac_process.runProcess() - assert_ingested_records(source_name='isac') + records = assert_ingested_records(source_name='isac') + assert_uploaded_manifests(records) diff --git a/tests/functional/processes/ingest/test_doab_process.py b/tests/functional/processes/ingest/test_doab_process.py index fb6d2f0e009..8876b4d38e3 100644 --- a/tests/functional/processes/ingest/test_doab_process.py +++ b/tests/functional/processes/ingest/test_doab_process.py @@ -1,5 +1,6 @@ from processes import DOABProcess from .assert_ingested_records import assert_ingested_records +from .assert_uploaded_manifests import assert_uploaded_manifests def test_doab_process(): @@ -7,4 +8,5 @@ def test_doab_process(): doab_process.runProcess() - assert_ingested_records(source_name='doab') + records = assert_ingested_records(source_name='doab') + assert_uploaded_manifests(records) diff --git a/tests/functional/processes/ingest/test_gutenberg_process.py b/tests/functional/processes/ingest/test_gutenberg_process.py index c5660bcb890..dd6ce278bd6 100644 --- a/tests/functional/processes/ingest/test_gutenberg_process.py +++ b/tests/functional/processes/ingest/test_gutenberg_process.py @@ -1,5 +1,6 @@ from processes import GutenbergProcess from .assert_ingested_records import assert_ingested_records +from .assert_uploaded_manifests import assert_uploaded_manifests def test_gutenberg_process(): @@ -7,4 +8,6 @@ def test_gutenberg_process(): gutenberg_process.runProcess() - assert_ingested_records(source_name='gutenberg') + records = assert_ingested_records(source_name='gutenberg') + + assert_uploaded_manifests(records) diff --git a/tests/functional/processes/ingest/test_loc_process.py b/tests/functional/processes/ingest/test_loc_process.py index e836da66c47..5504179e5a7 100644 --- a/tests/functional/processes/ingest/test_loc_process.py +++ b/tests/functional/processes/ingest/test_loc_process.py @@ -1,5 +1,6 @@ from processes import LOCProcess from .assert_ingested_records import assert_ingested_records +from .assert_uploaded_manifests import assert_uploaded_manifests def test_loc_process(): @@ -7,4 +8,5 @@ def test_loc_process(): loc_process.runProcess() - assert_ingested_records(source_name='loc') + records = assert_ingested_records(source_name='loc') + assert_uploaded_manifests(records) diff --git a/tests/functional/processes/ingest/test_met_process.py b/tests/functional/processes/ingest/test_met_process.py index e34cc3f917b..2cc76733a92 100644 --- a/tests/functional/processes/ingest/test_met_process.py +++ b/tests/functional/processes/ingest/test_met_process.py @@ -1,22 +1,12 @@ from processes import METProcess from .assert_ingested_records import assert_ingested_records -from managers.s3 import S3Manager +from .assert_uploaded_manifests import assert_uploaded_manifests def test_met_process(): met_process = METProcess('complete', None, None, None, 5, None) met_process.runProcess() - records = assert_ingested_records(source_name='met') - s3_manager = S3Manager() - s3_manager.createS3Client() - - for record in records: - parts= record.get_parts() - - manifest_part = next(part for part in parts if part.file_type == 'application/webpub+json') - - manifest_head_response = s3_manager.s3Client.head_object(Key=manifest_part.get_file_key(), Bucket=manifest_part.get_file_bucket()) - - assert manifest_head_response is not None + records = assert_ingested_records(source_name='met') + assert_uploaded_manifests(records) diff --git a/tests/functional/processes/ingest/test_muse_process.py b/tests/functional/processes/ingest/test_muse_process.py index 3530056ed24..f21af817609 100644 --- a/tests/functional/processes/ingest/test_muse_process.py +++ b/tests/functional/processes/ingest/test_muse_process.py @@ -1,6 +1,6 @@ from processes import MUSEProcess from .assert_ingested_records import assert_ingested_records - +from .assert_uploaded_manifests import assert_uploaded_manifests def test_muse_process(): @@ -8,4 +8,6 @@ def test_muse_process(): muse_process.runProcess() - assert_ingested_records(source_name='muse') + records = assert_ingested_records(source_name='muse') + + assert_uploaded_manifests(records)