@dataclass
class Part:
    """A single file entry belonging to a record.

    Instances carry the five values that make up one part: its position,
    the URL of the file, the source it came from, its media type and any
    flags attached to it.
    """

    index: int
    url: str
    source: str
    file_type: str
    flags: dict

    def _parse_s3_url(self):
        """Return the parsed ``url`` when its hostname contains 's3', else None."""
        parsed_url = urlparse(self.url)
        hostname = parsed_url.hostname

        # urlparse reports hostname as None when the URL has no network
        # location (relative paths, empty strings); a membership test on None
        # would raise TypeError, so treat that case as "not an S3 URL".
        if hostname is None or 's3' not in hostname:
            return None

        return parsed_url

    def get_file_bucket(self) -> Optional[str]:
        """Return the bucket name taken from the URL's hostname.

        The bucket is the first dot-separated label of the hostname, which
        presumably means virtual-hosted-style S3 URLs are expected.

        Returns:
            The bucket name, or None when the URL has no hostname or the
            hostname does not contain 's3'.
        """
        parsed_url = self._parse_s3_url()

        if parsed_url is None:
            return None

        return parsed_url.hostname.split('.')[0]

    def get_file_key(self) -> Optional[str]:
        """Return the object key taken from the URL's path.

        Returns:
            The URL path without its leading slash, or None when the URL has
            no hostname or the hostname does not contain 's3'.
        """
        parsed_url = self._parse_s3_url()

        if parsed_url is None:
            return None

        # Drop the leading '/' so the path can be used directly as a key.
        return parsed_url.path[1:]
def test_met_process():
    """Run the MET ingest process and check each record's manifest in S3.

    After the process runs, every record returned by
    ``assert_ingested_records`` must have a part whose file type is
    'application/webpub+json', and a ``head_object`` call for that part's
    bucket and key must return a response.
    """
    met_process = METProcess('complete', None, None, None, 5, None)

    met_process.runProcess()

    records = assert_ingested_records(source_name='met')

    s3_manager = S3Manager()
    s3_manager.createS3Client()

    for record in records:
        # A default of None turns a missing manifest into a readable
        # assertion failure rather than a bare StopIteration.
        manifest_part = next(
            (
                part for part in record.get_parts()
                if part.file_type == 'application/webpub+json'
            ),
            None,
        )

        assert manifest_part is not None, (
            f'no application/webpub+json part found for record {record}'
        )

        manifest_head_response = s3_manager.s3Client.head_object(
            Key=manifest_part.get_file_key(),
            Bucket=manifest_part.get_file_bucket(),
        )

        assert manifest_head_response is not None