Skip to content

Commit

Permalink
adding formatter and things
Browse files Browse the repository at this point in the history
  • Loading branch information
kylevillegas93 committed Feb 5, 2025
1 parent cb90324 commit c1987eb
Show file tree
Hide file tree
Showing 7 changed files with 208 additions and 638 deletions.
44 changes: 44 additions & 0 deletions mappings/formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from collections import defaultdict
from collections.abc import Mapping, Sequence


def format_value(format_str: str, value):
    """Render ``value`` through the ``str.format`` template ``format_str``.

    Args:
        format_str: A ``str.format`` style template. Named fields (``{a}``)
            are filled from mappings; ``{0}`` is filled from scalars.
        value: The source value to render.

    Returns:
        * For a mapping: one string, with the mapping's keys filling the
          named fields. Keys the mapping lacks render as empty strings.
        * For a non-string sequence: a list with one rendered string per
          element (mappings by name, anything else as positional field 0).
        * For anything else: one string with ``value`` as positional field 0.
    """
    def render_mapping(fields: Mapping) -> str:
        # defaultdict(str) makes absent keys render as '' instead of raising
        # KeyError, so a single mapping and a mapping inside a list are
        # treated the same way.
        return format_str.format_map(defaultdict(str, fields))

    if isinstance(value, Mapping):
        return render_mapping(value)

    # str is itself a Sequence, but must be formatted as a single scalar.
    if isinstance(value, Sequence) and not isinstance(value, str):
        return [
            render_mapping(item) if isinstance(item, Mapping) else format_str.format(item)
            for item in value
        ]

    return format_str.format(value)

def map_source_record(source_record: dict, mapping: dict) -> dict:
    """Build a formatted record from ``source_record`` as described by ``mapping``.

    Each ``mapping`` value is either:

    * a ``(source_key, template)`` tuple -- the source value, when present,
      is rendered with ``format_value`` and stored as-is; or
    * a list of such tuples -- every present source value is rendered and
      the results are collected into one flat list.

    Source values that are ``None`` (or missing) are skipped, target fields
    with nothing to store are omitted, and mapping values of any other type
    are ignored.
    """
    result = {}

    for target_field, spec in mapping.items():
        if isinstance(spec, tuple):
            source_key, template = spec
            raw = source_record.get(source_key)

            if raw is not None:
                result[target_field] = format_value(template, raw)

            continue

        if not isinstance(spec, list):
            continue

        collected = []

        for source_key, template in spec:
            raw = source_record.get(source_key)

            if raw is None:
                continue

            rendered = format_value(template, raw)

            # A rendered list is flattened into the target list rather than nested.
            if isinstance(rendered, list):
                collected.extend(rendered)
            else:
                collected.append(rendered)

        if collected:
            result[target_field] = collected

    return result
215 changes: 145 additions & 70 deletions mappings/nypl_bib.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,96 @@
from collections import defaultdict
from typing import Optional

from model import Record


def map_nypl_bib_to_record(bib: dict) -> Record:
language: dict = bib.get('lang', {})
marc_fields: dict = get_marc_fields(bib)
publication_details = marc_fields.get('260', [])

record = Record(
title=bib.get('title'),
authors=[f"{bib.get('author')}|||true"],
languages=[f"{language.get('name')}||{language.get('code')}"],
dates=[
f"{bib.get('publish_year')}|publication_date",
f"{bib.get('catalog_date')}|catalog_date"
],
spatial=bib.get('country', {}).get('name'),
identifiers=get_identifiers(bib, marc_fields),
publisher=[f"{publication_detail.get('b')}||" for publication_detail in publication_details if publication_detail.get('b')],
contributors=get_contributers(marc_fields)
import json

from constants.get_constants import get_constants
from model import FileFlags, Record, Source, Part
from mappings.formatter import map_source_record
from mappings.record import map_to_record


# Project constants, loaded once when this module is imported;
# clean_contributors reads constants['lc']['relators'] from it.
constants = get_constants()
# Template shared by the subject tags that carry the full set of subdivisions.
_LCSH_SUBDIVIDED = '{a} -- {b} -- {v} -- {x} -- {z}|lcsh|'

# Declarative mapping consumed by map_source_record.
#   target field -> (source key, template)      one value
#   target field -> [(source key, template)]    flat list of values
# Source keys are bib attributes ('title', 'lang', ...) or three-digit tags
# ('010', '260', ...); templates use str.format fields.
NYPL_BIB_MAPPING = {
    # --- descriptive fields ---
    'title': ('title', '{0}'),
    'alternative': [
        ('246', '{a} {b} {p}'),
        ('247', '{a} {f}'),
    ],
    'authors': [('author', '{0}|||true')],
    'languages': [('lang', '{name}||{code}')],
    'dates': [
        ('publish_year', '{0}|publication_date'),
        ('catalog_date', '{0}|catalog_date'),
    ],
    'spatial': ('country', '{name}'),

    # --- identifiers ---
    'source_id': ('id', '{0}|nypl'),
    'identifiers': [
        ('id', '{0}|nypl'),
        ('issn', '{0}|issn'),
        ('lccn', '{0}|lccn'),
        ('oclc', '{0}|oclc'),
        ('isbn', '{0}|isbn'),
        ('010', '{a}|lccn'),
        ('020', '{a}|isbn'),
        ('022', '{a}|issn'),
        ('028', '{a}|{b}'),
        ('035', '{a}|scn'),
        ('050', '{a} {b}|lcc'),
        ('060', '{a}|nlmcn'),
        ('074', '{a}|gpoin'),
        ('086', '{a}|gdcn'),
    ],

    # --- publication and contributors ---
    'publisher': [('260', '{b}||')],
    'contributors': [
        ('260', '{f}|||manufacturer'),
        ('700', '{a}|||{e}'),
        ('710', '{a} {b}|||{e}'),
        ('711', '{a} {b}|||{e}'),
    ],
    'has_version': ('250', '{a}|'),
    'extent': ('300', '{a}{b}{c}'),
    'is_part_of': [
        ('440', '{a}|{v}|volume'),
        ('490', '{a}|{v}|volume'),
    ],

    # --- notes and subjects ---
    'abstract': [
        ('500', '{a}'),
        ('520', '{a} {b}'),
    ],
    'table_of_contents': ('505', '{a}'),
    'subjects': [
        ('600', '{a} {d} -- {t} -- {v}|lcsh|'),
        ('610', _LCSH_SUBDIVIDED),
        ('611', '{a} -- {v}|lcsh|'),
        ('630', '{a} -- {p} -- {v}|lcsh|'),
        ('650', _LCSH_SUBDIVIDED),
        ('651', _LCSH_SUBDIVIDED),
        ('655', '{a}||'),
        ('656', '{a}||'),
        ('690', _LCSH_SUBDIVIDED),
    ],

    # --- links ---
    # Doubled braces are str.format escapes: the rendered value ends in a
    # literal JSON object of flags.
    'has_part': [
        ('856', '1|{u}|nypl|text/html|{{"catalog": false, "download": false, "reader": false, "embed": true}}'),
    ],
}


def map_nypl_bib_to_record(nypl_bib: dict, nypl_bib_items: list[dict], location_codes: dict) -> Record:
    """Map an NYPL bib and its items onto a ``Record``.

    Args:
        nypl_bib: The bib dict; it is read but not modified.
        nypl_bib_items: The bib's items, used to add requestable parts.
        location_codes: Location code -> metadata dict; an item is treated as
            requestable when its location's entry has a truthy
            ``'requestable'`` value.

    Returns:
        The ``Record`` produced by ``map_to_record`` for ``Source.NYPL``.
    """
    # Merge the extracted fields into a copy so the caller's dict is left
    # untouched; as with dict.update, extracted fields win on a key clash.
    source_record = {**nypl_bib, **get_marc_fields(nypl_bib)}

    mapped_source_record = map_source_record(source_record, mapping=NYPL_BIB_MAPPING)

    # NOTE(review): 'source_id' is mapped with the template '{0}|nypl', so the
    # URLs built from it below will contain the '|nypl' suffix -- confirm
    # whether the raw bib id was intended instead.
    source_id = mapped_source_record.get('source_id')

    has_part = clean_has_part(mapped_source_record.get('has_part', []), source_id)
    # location_codes must be passed through: add_requestable_parts takes
    # (bib_items, has_part, location_codes, source_id).
    coverage, has_part = add_requestable_parts(nypl_bib_items, has_part, location_codes, source_id)

    return map_to_record(
        {
            **mapped_source_record,
            'identifiers': clean_identifiers(mapped_source_record.get('identifiers', [])),
            'subjects': clean_subjects(mapped_source_record.get('subjects', [])),
            'contributors': clean_contributors(mapped_source_record.get('contributors', [])),
            'coverage': coverage,
            'has_part': has_part,
        },
        Source.NYPL,
    )

print(record.identifiers)

return record


def get_marc_fields(bib: dict) -> dict:
marc_fields = {}
Expand All @@ -40,52 +104,63 @@ def get_marc_fields(bib: dict) -> dict:
return marc_fields


def get_identifiers(bib: dict, marc_fields: dict) -> list[str]:
identifiers_mapping = {
'010': ('{a}|lccn'),
'020': ('{a}|isbn'),
'022': ('{a}|issn'),
'028': ('{a}|{b}'),
'035': ('{a}|scn'),
'050': ('{a} {b}|lcc'),
'060': ('{a}|nlcmn'),
'074': ('{a}|gpoin'),
'086': ('{a}|gdcn')
}

identifiers = set(
[
f"{bib.get('id', '')}|nypl",
f"{bib.get('issn', '')}|issn",
f"{bib.get('lccn', '')}|lccn",
f"{bib.get('oclc', '')}|oclc",
f"{bib.get('isbn', '')}|isbn"
] +
[
f"{mapping.format_map(defaultdict(str, marc_field_entry))}"
for tag, mapping in identifiers_mapping.items()
for marc_field_entry in marc_fields.get(tag, [])
if mapping
]
)
def clean_identifiers(identifiers: list[str]) -> list[str]:
    """Normalise and de-duplicate ``value|type`` identifier strings.

    * Identifiers whose value (the text before the first ``|``) is empty are
      dropped, including ones that become empty once ``(OCoLC)`` is removed.
    * Identifiers containing the ``(OCoLC)`` marker have it removed, and a
      type of ``scn`` is relabelled ``oclc``. Only the type is relabelled, so
      the letters ``scn`` inside an identifier value are left alone.
    * Duplicates are removed; the first occurrence decides the position, so
      the output order is deterministic.

    Args:
        identifiers: Identifier strings in ``value|type`` form.

    Returns:
        The cleaned identifiers in first-seen order.
    """
    # A dict is used as an insertion-ordered set.
    cleaned: dict[str, None] = {}

    for identifier in identifiers:
        value, separator, id_type = identifier.partition('|')

        if '(OCoLC)' in identifier:
            value = value.replace('(OCoLC)', '')
            id_type = id_type.replace('(OCoLC)', '')

            if id_type == 'scn':
                id_type = 'oclc'

        if not value:
            continue

        cleaned[f'{value}{separator}{id_type}'] = None

    return list(cleaned)

return [id for id in identifiers if id.split('|')[0]]

def clean_subjects(subjects: list[str]) -> list[str]:
    """Tidy ``heading|authority|...`` subject strings.

    The heading (text before the first ``|``) is split on ``--``; each part is
    stripped, empty parts are discarded, and the rest are re-joined with
    ``--``. Subjects whose heading has no non-empty part are dropped.

    Args:
        subjects: Subject strings whose heading may contain ``--`` separated
            parts, some of them blank.

    Returns:
        One ``'<heading>|<authority>'`` string per kept subject, where
        ``authority`` is the field right after the heading (empty when the
        subject has no ``|``).
    """
    cleaned = []

    for subject in subjects:
        heading, *rest = subject.split('|')
        parts = [part.strip() for part in heading.split('--')]
        parts = [part for part in parts if part]

        # Check the cleaned heading, not the raw one: ' --  -- ' is non-empty
        # as text but has no usable parts.
        if not parts:
            continue

        # NOTE(review): only the first field after the heading is emitted, so
        # 'x|lcsh|' becomes 'x|lcsh'. This matches the previous output; confirm
        # whether the trailing field should be kept.
        authority = rest[0] if rest else ''
        cleaned.append('{}|{}'.format('--'.join(parts), authority))

    return cleaned

def get_contributers(marc_fields: dict) -> list[str]:
contributor_mapping = {
'260': ('{f}|||manufacterer'),
'700': ('{a}|||{e}'),
'710': ('{a} {b}|||{e}'),
'711': ('{a}|||{e}')
}

contributors = [
f"{mapping.format_map(defaultdict(str, marc_field_entry))}"
for tag, mapping in contributor_mapping.items()
for marc_field_entry in marc_fields.get(tag, [])
if mapping
def clean_contributors(contributors: list[str]) -> list[str]:
    """Replace each contributor's last ``|`` field with a relator label.

    Each contributor string is split on ``|``. Entries whose first field is
    empty are skipped. For the rest, the last field is looked up in
    ``constants['lc']['relators']`` (falling back to ``'Contributor'``) and
    the result is emitted as four ``|``-separated fields: the leading fields
    followed by the looked-up label.
    """
    cleaned = []

    for contributor in contributors:
        fields = contributor.split('|')

        if not fields[0]:
            continue

        leading_fields = fields[:-1]
        role = constants['lc']['relators'].get(fields[-1], 'Contributor')
        cleaned.append('{}|{}|{}|{}'.format(*leading_fields, role))

    return cleaned

return [contributor for contributor in contributors if contributor.split('|')[0]]
def clean_has_part(has_part: list[str], source_id: str) -> list[str]:
    """Ensure ``has_part`` holds at least one part.

    When ``has_part`` is empty, a part linking to the bib's catalog page is
    appended to it. A non-empty list is returned unchanged.

    Args:
        has_part: Part strings gathered so far; modified in place when empty.
        source_id: Identifier interpolated into the catalog URL.

    Returns:
        The same ``has_part`` list that was passed in.
    """
    # Local import keeps this block self-contained.
    from dataclasses import asdict

    if not has_part:
        catalog_part = Part(
            index=1,
            url=f'https://www.nypl.org/research/collections/shared-collection-catalog/bib/b{source_id}',
            source=Source.NYPL.value,
            file_type='application/html+catalog',
            # json.dumps cannot serialise a dataclass instance directly (it
            # raises TypeError), so convert the flags to a plain dict first.
            flags=json.dumps(asdict(FileFlags(catalog=True))),
        )
        has_part.append(catalog_part.to_string())

    return has_part


def add_requestable_parts(bib_items: list[dict], has_part: list[str], location_codes: dict, source_id: str) -> tuple:
    """Append a hold-request part for every requestable item.

    An item is requestable when it has a location code and
    ``location_codes[code]['requestable']`` is truthy. Items with no
    location, no location code, or an unknown code are skipped.

    Args:
        bib_items: Item dicts; ``item['location']`` is expected to be a dict
            with ``'code'`` and ``'name'``.
        has_part: Part strings gathered so far; modified in place.
        location_codes: Location code -> metadata dict.
        source_id: Identifier interpolated into each request URL.

    Returns:
        ``(coverage, has_part)`` where ``coverage`` holds one
        ``'<code>|<name>|<index>'`` string per part added and ``has_part`` is
        the list that was passed in.
    """
    # Local import keeps this block self-contained.
    from dataclasses import asdict

    coverage = []

    for item in bib_items:
        # `or {}` also covers an explicit None location; the code default is
        # left as None because a dict default would be unhashable as a key.
        location = item.get('location') or {}
        code = location.get('code')

        if not (location_codes.get(code) or {}).get('requestable'):
            continue

        # Parts are numbered from 1, continuing after the existing ones.
        index = len(has_part) + 1
        requestable_item_part = Part(
            index=index,
            url=f"http://www.nypl.org/research/collections/shared-collection-catalog/hold/request/b{source_id}-i{item.get('id')}",
            # .value ('nypl') and JSON flags match what clean_has_part and the
            # 856 template in NYPL_BIB_MAPPING produce.
            source=Source.NYPL.value,
            file_type='application/x.html+edd',
            flags=json.dumps(asdict(FileFlags(edd=True, nypl_login=True))),
        )

        coverage.append(f"{code}|{location.get('name')}|{index}")
        has_part.append(requestable_item_part.to_string())

    return (coverage, has_part)
16 changes: 16 additions & 0 deletions mappings/record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from datetime import datetime, timezone
from uuid import uuid4

from model import Record, FRBRStatus, Source


def map_to_record(source_record: dict, source: Source) -> Record:
    """Create a new ``Record`` from already-mapped fields.

    Args:
        source_record: Field name -> value pairs passed to ``Record`` as
            keyword arguments. It must not contain the keys set here
            (``uuid``, ``date_created``, ``date_modified``, ``frbr_status``,
            ``cluster_status``, ``source``), or the call raises ``TypeError``.
        source: The source the record came from; its ``value`` is stored.

    Returns:
        A ``Record`` with a fresh UUID, ``frbr_status`` set to
        ``FRBRStatus.TODO.value`` and ``cluster_status`` set to ``False``.
    """
    # Take one timestamp so date_created and date_modified are identical on a
    # new record. It is UTC with the tzinfo stripped, i.e. a naive datetime.
    now = datetime.now(timezone.utc).replace(tzinfo=None)

    return Record(
        uuid=uuid4(),
        date_created=now,
        date_modified=now,
        frbr_status=FRBRStatus.TODO.value,
        cluster_status=False,
        source=source.value,
        **source_record,
    )
2 changes: 2 additions & 0 deletions model/postgres/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,12 @@ class FRBRStatus(Enum):

@dataclass
class FileFlags:
    """Boolean flags attached to a linked part; every flag defaults to False."""

    # Set together with nypl_login on the hold-request parts built in
    # mappings/nypl_bib.py (add_requestable_parts).
    edd: bool = False
    # Set on the fallback catalog-page part built in mappings/nypl_bib.py
    # (clean_has_part).
    catalog: bool = False
    # NOTE(review): reader / embed / download presumably describe how the
    # linked file may be presented -- confirm against the consumers.
    reader: bool = False
    embed: bool = False
    download: bool = False
    # Set together with edd on hold-request parts (see edd above).
    nypl_login: bool = False


class Record(Base, Core):
Expand Down
1 change: 1 addition & 0 deletions model/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@

class Source(Enum):
    """Record sources; each member's value is its short string code."""

    # NOTE(review): 'CHICACO' looks like a misspelling of 'CHICAGO'; renaming
    # the member would require updating every reference to it.
    CHICACO_ISAC = 'isac'
    NYPL = 'nypl'
Loading

0 comments on commit c1987eb

Please sign in to comment.