Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NO-REF: Test that MET process saves manifest for each record #542

Merged
merged 7 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion model/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .postgres.base import Base, Core
from .postgres.record import Record
from .postgres.record import Record, Part
from .postgres.work import Work
from .postgres.edition import Edition
from .postgres.item import Item
Expand Down
39 changes: 39 additions & 0 deletions model/postgres/record.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,39 @@
from dataclasses import dataclass
from sqlalchemy import Column, DateTime, Integer, Unicode, Boolean, Index
from sqlalchemy.dialects.postgresql import ARRAY, UUID, ENUM
from sqlalchemy.ext.hybrid import hybrid_property
from model.utilities.extractDailyEdition import extract
from typing import Optional
from urllib.parse import urlparse

from .base import Base, Core


@dataclass
class Part:
    """A single file reference belonging to a record.

    The S3 helpers derive the bucket and object key from ``url`` and
    return ``None`` whenever the URL does not point at an S3 host.
    """

    # Position of this part within the record's list of parts.
    index: int
    # Location of the file.
    url: str
    # Name of the source that supplied the file.
    source: str
    # Media type of the file, e.g. 'application/webpub+json'.
    file_type: str
    # NOTE(review): annotated as dict, but a caller that builds a Part
    # from a '|'-delimited string will pass the raw string - confirm the
    # intended type.
    flags: dict

    def _get_s3_parsed_url(self):
        """Return the parsed URL when its host looks like S3, else None."""
        parsed_url = urlparse(self.url)
        hostname = parsed_url.hostname

        # urlparse() yields hostname=None for URLs without a network
        # location (relative paths, empty strings); treat those as
        # "not S3" instead of failing on the membership test.
        if hostname is None or 's3' not in hostname:
            return None

        return parsed_url

    def get_file_bucket(self) -> Optional[str]:
        """Return the S3 bucket name, i.e. the first label of the hostname.

        Returns:
            The bucket name, or None if the URL is not an S3 URL.
        """
        parsed_url = self._get_s3_parsed_url()

        if parsed_url is None:
            return None

        return parsed_url.hostname.split('.')[0]

    def get_file_key(self) -> Optional[str]:
        """Return the S3 object key, i.e. the URL path without its leading '/'.

        Returns:
            The object key, or None if the URL is not an S3 URL.
        """
        parsed_url = self._get_s3_parsed_url()

        if parsed_url is None:
            return None

        return parsed_url.path[1:]


class Record(Base, Core):
__tablename__ = 'records'
id = Column(Integer, primary_key=True)
Expand Down Expand Up @@ -61,6 +89,17 @@ def __iter__(self):
for attr in dir(self):
yield attr, getattr(self, attr)


def get_parts(self) -> list[Part]:
    """Parse this record's ``has_part`` entries into ``Part`` objects.

    Each entry is a '|'-delimited string with exactly five fields:
    index, URL, source, file type and flags.

    Returns:
        One ``Part`` per entry, in the order they appear in
        ``has_part``; an empty list when ``has_part`` is None or empty.

    Raises:
        ValueError: if an entry does not split into exactly five fields.
    """
    parts = []

    # Treat a missing (None) has_part as "no parts" rather than raising
    # TypeError when iterating.
    for part in self.has_part or []:
        index, file_url, source, file_type, flags = part.split('|')

        # Part.index is declared as int, but split() only yields strings.
        # A blank or non-numeric index becomes None instead of failing.
        try:
            part_index = int(index)
        except ValueError:
            part_index = None

        # NOTE(review): flags is forwarded as the raw string even though
        # Part.flags is annotated as dict - confirm the stored encoding
        # before parsing it here.
        parts.append(Part(
            index=part_index,
            url=file_url,
            source=source,
            file_type=file_type,
            flags=flags,
        ))

    return parts

@hybrid_property
def has_version(self):
    """Expose the value stored in the ``_has_version`` attribute."""
    stored_version = self._has_version
    return stored_version
Expand Down
16 changes: 11 additions & 5 deletions tests/functional/processes/ingest/test_met_process.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
from model import Part
from processes import METProcess
from .assert_ingested_records import assert_ingested_records
from managers.s3 import S3Manager


def test_met_process():
    """Run the MET process and check each ingested record's manifest in S3.

    For every record returned by ``assert_ingested_records``, the test
    finds the part whose file type is 'application/webpub+json' and
    issues a HEAD request for that object in S3.
    """
    met_process = METProcess('complete', None, None, None, 5, None)

    met_process.runProcess()

    records = assert_ingested_records(source_name='met')

    s3_manager = S3Manager()
    s3_manager.createS3Client()

    for record in records:
        parts = record.get_parts()

        # Use a default so a record without a manifest fails with a clear
        # assertion message instead of a bare StopIteration.
        manifest_part = next(
            (part for part in parts if part.file_type == 'application/webpub+json'),
            None,
        )

        assert manifest_part is not None, (
            f'No application/webpub+json part found for record {record!r}'
        )

        # Assert that for each record, the manifest has been saved in S3.
        manifest_head_response = s3_manager.s3Client.head_object(
            Key=manifest_part.get_file_key(),
            Bucket=manifest_part.get_file_bucket(),
        )

        assert manifest_head_response is not None
Loading