Add mapping formatter and refactor NYPL bib mapping

NYPL · Feb 5, 2025 · c1987eb · c1987eb
1 parent cb90324
commit c1987eb
Show file tree

Hide file tree

Showing 7 changed files with 208 additions and 638 deletions.
diff --git a/mappings/formatter.py b/mappings/formatter.py
@@ -0,0 +1,44 @@
+from collections import defaultdict
+from collections.abc import Mapping, Sequence
+
+
+def format_value(format_str: str, value):
+    """Apply ``format_str`` to a source value and return the formatted result.
+
+    Args:
+        format_str: A ``str.format`` template. Mapping values fill named
+            fields (``{a}``); any other value fills positional field ``{0}``.
+        value: A mapping, a non-string sequence of mappings/scalars, or a
+            scalar.
+
+    Returns:
+        A list with one formatted string per element when ``value`` is a
+        non-string sequence, otherwise a single formatted string.
+    """
+    def format_one(item) -> str:
+        if isinstance(item, Mapping):
+            # Missing keys render as '' instead of raising KeyError, so a
+            # lone mapping behaves exactly like a mapping inside a sequence.
+            return format_str.format_map(defaultdict(str, item))
+
+        return format_str.format(item)
+
+    if isinstance(value, Sequence) and not isinstance(value, str):
+        return [format_one(item) for item in value]
+
+    return format_one(value)
+
+def map_source_record(source_record: dict, mapping: dict) -> dict:
+    """Build a formatted record from ``source_record`` as directed by ``mapping``.
+
+    Args:
+        source_record: The raw source data, looked up by key.
+        mapping: Maps each output key to either a single
+            ``(source_key, format_str)`` tuple or a list of such tuples.
+            Entries of any other type are ignored.
+
+    Returns:
+        A dict holding only the output keys that produced a value. A tuple
+        entry stores whatever ``format_value`` returns; a list entry stores a
+        flat list of every formatted value and is omitted when that list is
+        empty. Source values that are ``None`` (or absent) are skipped.
+    """
+    result = {}
+
+    for target_key, spec in mapping.items():
+        if isinstance(spec, tuple):
+            source_key, template = spec
+            raw = source_record.get(source_key)
+
+            if raw is None:
+                continue
+
+            result[target_key] = format_value(template, raw)
+            continue
+
+        if not isinstance(spec, list):
+            continue
+
+        collected = []
+
+        for source_key, template in spec:
+            raw = source_record.get(source_key)
+
+            if raw is None:
+                continue
+
+            formatted = format_value(template, raw)
+            # Flatten list results so the output stays a flat list of strings.
+            collected += formatted if isinstance(formatted, list) else [formatted]
+
+        if collected:
+            result[target_key] = collected
+
+    return result
diff --git a/mappings/nypl_bib.py b/mappings/nypl_bib.py
@@ -1,32 +1,96 @@
-from collections import defaultdict
-from typing import Optional
-
-from model import Record
-
-
-def map_nypl_bib_to_record(bib: dict) -> Record:
-    language: dict = bib.get('lang', {})
-    marc_fields: dict = get_marc_fields(bib)
-    publication_details = marc_fields.get('260', [])
-
-    record = Record(
-        title=bib.get('title'),
-        authors=[f"{bib.get('author')}|||true"],
-        languages=[f"{language.get('name')}||{language.get('code')}"],
-        dates=[
-            f"{bib.get('publish_year')}|publication_date",
-            f"{bib.get('catalog_date')}|catalog_date"
-        ],
-        spatial=bib.get('country', {}).get('name'),
-        identifiers=get_identifiers(bib, marc_fields),
-        publisher=[f"{publication_detail.get('b')}||" for publication_detail in publication_details if publication_detail.get('b')],
-        contributors=get_contributers(marc_fields)
+import json
+
+from constants.get_constants import get_constants
+from model import FileFlags, Record, Source, Part
+from mappings.formatter import map_source_record
+from mappings.record import map_to_record
+
+
+# Loaded once at import time; clean_contributors reads constants['lc']['relators'].
+constants = get_constants()
+# Declarative mapping consumed by map_source_record.
+#
+# Each key is an output record field. Each value is either a single
+# (source_key, format_str) tuple or a list of such tuples whose formatted
+# results are collected into one flat list.
+#
+# In the format strings, '{0}' is filled by a scalar source value and named
+# fields such as '{a}' or '{name}' are filled from a mapping source value.
+#
+# NOTE(review): the three-digit source keys are looked up in the bib after the
+# output of get_marc_fields is merged into it; presumably they are MARC tags
+# and the named fields are MARC subfield codes -- confirm against
+# get_marc_fields.
+NYPL_BIB_MAPPING = {
+    'title': ('title', '{0}'),
+    'alternative': [
+        ('246', '{a} {b} {p}'),
+        ('247', '{a} {f}')
+    ],
+    'authors': [('author', '{0}|||true')],
+    'languages': [('lang', '{name}||{code}')],
+    'dates': [
+        ('publish_year', '{0}|publication_date'),
+        ('catalog_date', '{0}|catalog_date')
+    ],
+    'spatial': ('country', '{name}'),
+    # Note the value carries a '|nypl' suffix, not just the bare bib id.
+    'source_id': ('id', '{0}|nypl'),
+    'identifiers': [
+        ('id', '{0}|nypl'),
+        ('issn', '{0}|issn'),
+        ('lccn', '{0}|lccn'),
+        ('oclc', '{0}|oclc'),
+        ('isbn', '{0}|isbn'),
+        ('010', '{a}|lccn'),
+        ('020', '{a}|isbn'),
+        ('022', '{a}|issn'),
+        ('028', '{a}|{b}'),
+        ('035', '{a}|scn'),
+        ('050', '{a} {b}|lcc'),
+        ('060', '{a}|nlmcn'),
+        ('074', '{a}|gpoin'),
+        ('086', '{a}|gdcn')
+    ],
+    'publisher': [('260', '{b}||')],
+    # The last field of each contributor string is replaced by
+    # clean_contributors with a label from constants['lc']['relators'].
+    'contributors': [
+        ('260', '{f}|||manufacturer'),
+        ('700', '{a}|||{e}'),
+        ('710', '{a} {b}|||{e}'),
+        ('711', '{a} {b}|||{e}')
+    ],
+    'has_version': ('250', '{a}|'),
+    'extent': ('300', '{a}{b}{c}'),
+    'is_part_of': [
+        ('440', '{a}|{v}|volume'),
+        ('490', '{a}|{v}|volume')
+    ],
+    'abstract': [
+        ('500', '{a}'),
+        ('520', '{a} {b}')
+    ],
+    'table_of_contents': ('505', '{a}'),
+    # Subject strings are post-processed by clean_subjects, which tidies the
+    # ' -- ' separated heading.
+    'subjects': [
+        ('600', '{a} {d} -- {t} -- {v}|lcsh|'),
+        ('610', '{a} -- {b} -- {v} -- {x} -- {z}|lcsh|'),
+        ('611', '{a} -- {v}|lcsh|'),
+        ('630', '{a} -- {p} -- {v}|lcsh|'),
+        ('650', '{a} -- {b} -- {v} -- {x} -- {z}|lcsh|'),
+        ('651', '{a} -- {b} -- {v} -- {x} -- {z}|lcsh|'),
+        ('655', '{a}||'),
+        ('656', '{a}||'),
+        ('690', '{a} -- {b} -- {v} -- {x} -- {z}|lcsh|'),
+    ],
+    # The doubled braces are str.format escapes: they emit the literal braces
+    # of the JSON flags object.
+    'has_part': [('856', '1|{u}|nypl|text/html|{{"catalog": false, "download": false, "reader": false, "embed": true}}')],
+}
+
+
+def map_nypl_bib_to_record(nypl_bib: dict, nypl_bib_items: list[dict], location_codes: dict) -> Record:
+    """Map an NYPL bib and its items to a ``Record``.
+
+    Args:
+        nypl_bib: The source bib; it is read but not modified.
+        nypl_bib_items: The bib's items, used to add requestable parts.
+        location_codes: Location metadata keyed by location code, used to
+            decide which items are requestable.
+
+    Returns:
+        The ``Record`` built by ``map_to_record`` for ``Source.NYPL``.
+    """
+    # Merge into a new dict so the caller's bib is left untouched.
+    source_record = {**nypl_bib, **get_marc_fields(nypl_bib)}
+    mapped_source_record = map_source_record(source_record, mapping=NYPL_BIB_MAPPING)
+
+    # The mapped source_id is '<id>|nypl'; URLs need only the bare id.
+    bib_id = (mapped_source_record.get('source_id') or '').split('|')[0]
+
+    has_part = clean_has_part(mapped_source_record.get('has_part', []), bib_id)
+    # location_codes must be passed explicitly: add_requestable_parts takes
+    # (bib_items, has_part, location_codes, source_id).
+    coverage, has_part = add_requestable_parts(nypl_bib_items, has_part, location_codes, bib_id)
+
+    return map_to_record(
+        {
+            **mapped_source_record,
+            'identifiers': clean_identifiers(mapped_source_record.get('identifiers', [])),
+            'subjects': clean_subjects(mapped_source_record.get('subjects', [])),
+            'contributors': clean_contributors(mapped_source_record.get('contributors', [])),
+            'coverage': coverage,
+            'has_part': has_part
+        },
+        Source.NYPL
     )
 
-    print(record.identifiers)
-
-    return record
-
 
 def get_marc_fields(bib: dict) -> dict:
     marc_fields = {}
@@ -40,52 +104,63 @@ def get_marc_fields(bib: dict) -> dict:
     return marc_fields
 
 
-def get_identifiers(bib: dict, marc_fields: dict) -> list[str]:
-    identifiers_mapping = {
-        '010': ('{a}|lccn'),
-        '020': ('{a}|isbn'),
-        '022': ('{a}|issn'),
-        '028': ('{a}|{b}'),
-        '035': ('{a}|scn'),
-        '050': ('{a} {b}|lcc'),
-        '060': ('{a}|nlcmn'),
-        '074': ('{a}|gpoin'),
-        '086': ('{a}|gdcn')
-    }
-
-    identifiers = set(
-        [
-            f"{bib.get('id', '')}|nypl",
-            f"{bib.get('issn', '')}|issn",
-            f"{bib.get('lccn', '')}|lccn",
-            f"{bib.get('oclc', '')}|oclc",
-            f"{bib.get('isbn', '')}|isbn"
-        ] +
-        [
-            f"{mapping.format_map(defaultdict(str, marc_field_entry))}"
-            for tag, mapping in identifiers_mapping.items()
-            for marc_field_entry in marc_fields.get(tag, [])
-            if mapping
-        ]
-    )
+def clean_identifiers(identifiers: list[str]) -> list[str]:
+    """Normalize and de-duplicate ``value|type`` identifier strings.
+
+    - Entries whose value part is empty or whitespace are dropped, as are
+      non-string entries.
+    - An ``(OCoLC)`` marker is removed from the value; when that identifier's
+      type is ``scn`` the type becomes ``oclc``. Only the type field is
+      rewritten, never an ``scn`` substring inside the value.
+    - Duplicates are removed while keeping first-seen order.
+
+    Args:
+        identifiers: Identifier strings in ``value|type`` form.
+
+    Returns:
+        The cleaned identifiers, in order of first appearance.
+    """
+    cleaned = {}
+
+    for identifier in identifiers:
+        if not isinstance(identifier, str):
+            continue
+
+        value, separator, id_type = identifier.partition('|')
+
+        if '(OCoLC)' in value:
+            value = value.replace('(OCoLC)', '')
+
+            if id_type == 'scn':
+                id_type = 'oclc'
+
+        value = value.strip()
+
+        if value:
+            # dict keys de-duplicate while preserving insertion order.
+            cleaned[f'{value}{separator}{id_type}'] = None
+
+    return list(cleaned)
 
-    return [id for id in identifiers if id.split('|')[0]]
 
+def clean_subjects(subjects: list[str]) -> list[str]:
+    """Tidy the heading of each ``heading|...`` subject string.
+
+    The heading (everything before the first ``|``) is split on ``--``; each
+    piece is stripped, empty pieces are discarded, and the rest are re-joined
+    with ``--``. Everything after the first ``|`` is kept verbatim, so all
+    trailing fields of the input survive. Subjects whose heading has no
+    non-empty piece are dropped.
+
+    Args:
+        subjects: Subject strings whose first ``|``-separated field is a
+            ``--`` separated heading.
+
+    Returns:
+        The cleaned subject strings, in input order.
+    """
+    cleaned_subjects = []
+
+    for subject in subjects:
+        heading, separator, trailing_fields = subject.partition('|')
+        heading_parts = [part.strip() for part in heading.split('--') if part.strip()]
+
+        # A heading made only of separators/whitespace carries no subject.
+        if heading_parts:
+            cleaned_subjects.append(f"{'--'.join(heading_parts)}{separator}{trailing_fields}")
+
+    return cleaned_subjects
 
-def get_contributers(marc_fields: dict) -> list[str]:
-    contributor_mapping = {
-        '260': ('{f}|||manufacterer'),
-        '700': ('{a}|||{e}'),
-        '710': ('{a} {b}|||{e}'),
-        '711': ('{a}|||{e}')
-    }
-
-    contributors = [
-        f"{mapping.format_map(defaultdict(str, marc_field_entry))}"
-        for tag, mapping in contributor_mapping.items()
-        for marc_field_entry in marc_fields.get(tag, [])
-        if mapping
+def clean_contributors(contributors: list[str]) -> list[str]:
+    """Normalize ``name|...|role`` contributor strings.
+
+    The name (first field) is stripped, and contributors whose name is empty
+    or whitespace are dropped. The last field is looked up in
+    ``constants['lc']['relators']`` and replaced with the result, falling
+    back to ``'Contributor'`` when there is no match. Fields in between are
+    passed through unchanged.
+
+    Args:
+        contributors: ``|``-separated contributor strings with at least a
+            name field and a role field.
+
+    Returns:
+        The cleaned contributor strings, in input order.
+    """
+    relators = constants['lc']['relators']
+
+    return [
+        '|'.join([name.strip(), *middle_fields, relators.get(role, 'Contributor')])
+        for contributor in contributors
+        # Single-element list: binds the split fields to names in-line.
+        for name, *middle_fields, role in [contributor.split('|')]
+        if name.strip()
     ]
 
-    return [contributor for contributor in contributors if contributor.split('|')[0]]
+def clean_has_part(has_part: list[str], source_id: str) -> list[str]:
+    """Ensure ``has_part`` contains at least one part.
+
+    When ``has_part`` is empty, a catalog link part for the bib is appended
+    to it. The list is modified in place and also returned.
+
+    Args:
+        has_part: Existing part strings; appended to when empty.
+        source_id: The bib id, either bare or in ``<id>|<source>`` form. Only
+            the portion before the first ``|`` is used in the URL.
+
+    Returns:
+        The same ``has_part`` list.
+    """
+    from dataclasses import asdict
+
+    if not has_part:
+        # Drop any '|nypl' style suffix so it does not leak into the URL.
+        bib_id = (source_id or '').split('|')[0]
+        catalog_part = Part(
+            index=1,
+            url=f'https://www.nypl.org/research/collections/shared-collection-catalog/bib/b{bib_id}',
+            source=Source.NYPL.value,
+            file_type='application/html+catalog',
+            # A dataclass instance is not JSON serializable; convert it to a
+            # dict first.
+            flags=json.dumps(asdict(FileFlags(catalog=True)))
+        )
+        has_part.append(catalog_part.to_string())
+
+    return has_part
+
+
+def add_requestable_parts(bib_items: list[dict], has_part: list[str], location_codes: dict, source_id: str) -> tuple:
+    """Append a hold-request part for every requestable item of a bib.
+
+    An item is requestable when ``location_codes`` has an entry for the
+    item's location code and that entry's ``'requestable'`` value is truthy.
+    Items without a location or location code are skipped.
+
+    Args:
+        bib_items: The bib's items.
+        has_part: Existing part strings; new parts are appended in place.
+        location_codes: Location metadata keyed by location code.
+        source_id: The bib id, either bare or in ``<id>|<source>`` form. Only
+            the portion before the first ``|`` is used in the URL.
+
+    Returns:
+        ``(coverage, has_part)``, where ``coverage`` holds one
+        ``code|name|index`` string per appended part and ``has_part`` is the
+        same list that was passed in.
+    """
+    from dataclasses import asdict
+
+    # Drop any '|nypl' style suffix so it does not leak into the URL.
+    bib_id = (source_id or '').split('|')[0]
+    coverage = []
+
+    for item in bib_items:
+        location_metadata = item.get('location') or {}
+        location_code = location_metadata.get('code')
+
+        # Look the code up with a hashable default; a dict default would make
+        # the lookup itself raise TypeError.
+        if not (location_codes.get(location_code) or {}).get('requestable'):
+            continue
+
+        index = len(has_part) + 1
+        requestable_item_part = Part(
+            index=index,
+            url=f"http://www.nypl.org/research/collections/shared-collection-catalog/hold/request/b{bib_id}-i{item.get('id')}",
+            # Use the enum value, matching the catalog part and the 856
+            # has_part template.
+            source=Source.NYPL.value,
+            file_type='application/x.html+edd',
+            # Flags are serialized to JSON, as in clean_has_part.
+            flags=json.dumps(asdict(FileFlags(edd=True, nypl_login=True)))
+        )
+
+        coverage.append(f"{location_code}|{location_metadata.get('name')}|{index}")
+        has_part.append(requestable_item_part.to_string())
+
+    return (coverage, has_part)
diff --git a/mappings/record.py b/mappings/record.py
@@ -0,0 +1,16 @@
+from datetime import datetime, timezone
+from uuid import uuid4
+
+from model import Record, FRBRStatus, Source
+
+
+def map_to_record(source_record: dict, source: Source) -> Record:
+    """Create a new ``Record`` from mapped source data.
+
+    Args:
+        source_record: Field values passed to ``Record`` as keyword
+            arguments.
+        source: The source the record came from; its ``value`` is stored.
+
+    Returns:
+        A ``Record`` with a fresh UUID, identical creation and modification
+        timestamps, ``frbr_status`` set to ``FRBRStatus.TODO.value`` and
+        ``cluster_status`` set to ``False``.
+    """
+    # Naive UTC timestamp, taken once so date_created and date_modified are
+    # exactly equal on a new record.
+    now = datetime.now(timezone.utc).replace(tzinfo=None)
+
+    return Record(
+        uuid=uuid4(),
+        date_created=now,
+        date_modified=now,
+        frbr_status=FRBRStatus.TODO.value,
+        cluster_status=False,
+        source=source.value,
+        **source_record
+    )
diff --git a/model/postgres/record.py b/model/postgres/record.py
@@ -61,10 +61,12 @@ class FRBRStatus(Enum):
 
 @dataclass 
 class FileFlags:
+    """Boolean flags attached to a part of a record; every flag defaults to False."""
+    # Set together with nypl_login on the hold-request parts added for
+    # requestable items. NOTE(review): presumably "electronic document
+    # delivery" -- confirm.
+    edd: bool = False
+    # Set on the catalog link part added when a bib has no other parts.
     catalog: bool = False
     reader: bool = False
     embed: bool = False
     download: bool = False
+    # Set together with edd on hold-request parts.
+    nypl_login: bool = False
 
 
 class Record(Base, Core):

diff --git a/model/source.py b/model/source.py
@@ -2,3 +2,4 @@
 
 class Source(Enum):
+    """Sources a record can come from; the member value is what gets stored on the record."""
+    # NOTE(review): 'CHICACO' looks like a misspelling of 'CHICAGO'. Renaming
+    # it would break existing references, so it is only flagged here.
     CHICACO_ISAC = 'isac'
+    NYPL = 'nypl'
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,3 +2,4 @@

		class Source(Enum):
		CHICACO_ISAC = 'isac'
		NYPL = 'nypl'