From d4677c9b1f9f89e1e9e83918534c4e253c5cf3d3 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Mon, 28 Aug 2023 08:46:52 -0600 Subject: [PATCH] [#188] - reformat scraped ecl jsonld to match funding identifiers to clusters --- dspback/utils/jsonld/clusters.py | 2 + dspback/utils/jsonld/formatter.py | 4 + tests/__init__.py | 6 ++ tests/data/earthchem_jsonld.json | 143 ++++++++++++++++++++++++++++++ tests/test_jsonld.py | 131 ++------------------------- 5 files changed, 161 insertions(+), 125 deletions(-) create mode 100644 tests/data/earthchem_jsonld.json diff --git a/dspback/utils/jsonld/clusters.py b/dspback/utils/jsonld/clusters.py index a3517e8..b95c427 100644 --- a/dspback/utils/jsonld/clusters.py +++ b/dspback/utils/jsonld/clusters.py @@ -52,6 +52,8 @@ def clusters(json_ld): for funding in json_ld["funding"]: if "identifier" in funding: resource_funding_ids.append(funding["identifier"]) + elif "url" in funding: + resource_funding_ids.append(funding["url"]) clusters = [] for cluster_funding_id, cluster in cluster_by_id.items(): diff --git a/dspback/utils/jsonld/formatter.py b/dspback/utils/jsonld/formatter.py index c568e95..fead8c2 100644 --- a/dspback/utils/jsonld/formatter.py +++ b/dspback/utils/jsonld/formatter.py @@ -69,4 +69,8 @@ def format_fields(json_ld): if not isinstance(json_ld["@context"], str): json_ld["@context"] = json_ld["@context"]["@vocab"] + if "funder" in json_ld and "funder" in json_ld["funder"]: + json_ld["funding"] = json_ld["funder"]["funder"] + del json_ld["funder"] + return json_ld diff --git a/tests/__init__.py b/tests/__init__.py index 00f7b17..4600821 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -159,3 +159,9 @@ async def external(change_test_dir): async def earthchem(change_test_dir): with open("data/earthchem.json", "r") as f: return json.loads(f.read()) + + +@pytest.fixture +async def earthchem_jsonld(change_test_dir): + with open("data/earthchem_jsonld.json", "r") as f: + return json.loads(f.read()) diff --git a/tests/data/earthchem_jsonld.json b/tests/data/earthchem_jsonld.json new file mode 100644 index 0000000..ae5d9c4 --- /dev/null +++ b/tests/data/earthchem_jsonld.json @@ -0,0 +1,143 @@ +{ + "@context": { + "@vocab": "https://schema.org/", + "datacite": "http://purl.org/spar/datacite/" + }, + "@id": "https://doi.org/10.1594/IEDA/100243", + "@type": "Dataset", + "name": "Susquehanna Shale Hills Critical Zone Observatory Stream Water Chemistry (2010)", + "sameAs": "https://ecl.earthchem.org/view.php?id=523", + "isAccessibleForFree": true, + "citation": ["https://doi.org/10.2136/vzj2010.0133"], + "author": { + "@list": [{ + "@type": "Role", + "author": [{ + "@type": "Person", + "name": "Susan L. Brantley", + "givenName": "Susan", + "familyName": "Brantley" + }], + "roleName": "Lead Author" + }, { + "@type": "Role", + "author": [{ + "@type": "Person", + "name": "Pamela L. Sullivan", + "givenName": "Pamela", + "familyName": "Sullivan" + }, { + "@type": "Person", + "name": "Danielle Andrews", + "givenName": "Danielle", + "familyName": "Andrews" + }, { + "@type": "Person", + "name": "George Holmes", + "givenName": "George", + "familyName": "Holmes" + }, { + "@type": "Person", + "name": "Molly Holleran", + "givenName": "Molly", + "familyName": "Holleran" + }, { + "@type": "Person", + "name": "Jennifer Z. Williams", + "givenName": "Jennifer", + "familyName": "Williams" + }, { + "@type": "Person", + "name": "Elizabeth Herndon", + "givenName": "Elizabeth", + "familyName": "Herndon" + }, { + "@type": "Person", + "name": "Maya Bhatt", + "givenName": "Maya", + "familyName": "Bhatt" + }, { + "@type": "Person", + "name": "Ekaterina Bazilevskaya", + "givenName": "Ekaterina", + "familyName": "Bazilevskaya" + }, { + "@type": "Person", + "name": "Tiffany Yesavage", + "givenName": "Tiffany", + "familyName": "Yesavage" + }, { + "@type": "Person", + "name": "Evan Thomas", + "givenName": "Evan", + "familyName": "Thomas" + }, { + "@type": "Person", + "name": "Chris J. Duffy", + "givenName": "Chris", + "familyName": "Duffy" + }], + "roleName": "Coauthor" + }] + }, + "description": "Stream water chemistry at Susquehanna Shale Hills Critical Zone Observatory in 2010. Weekly to monthly grab samples were collected at three locations along the first order Stream: at the Headwater (SH), Middle (SM) and adjacent to the Weir (SW). Daily stream water sample were also collected adjacent to the weir from using automatic samplers (2700 series, Teledyne Isco, Lincoln, NE) and were referenced as SW-ISCO. ", + "distribution": { + "datePublished": "2013-02-05 00:00:00", + "contentUrl": "https://ecl.earthchem.org/view.php?id=523", + "@type": "DataDownload", + "encodingFormat": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + }, + "license": "https://spdx.org/licenses/CC-BY-SA-4.0", + "dateCreated": "2013-02-04", + "inLanguage": "English", + "keywords": ["Susquehanna Shale Hills", "Pennsylvania", "Regional (Continents, Oceans)", "Stream water", "geochemistry", "DOC", "trace elements", "major ions"], + "publisher": { + "contactPoint": { + "@type": "ContactPoint", + "name": "Information Desk", + "contactType": "Customer Service", + "email": "info@earthchem.org", + "url": "https://www.earthchem.org/contact/" + }, + "@type": "Organization", + "name": "EarthChem Library", + "@id": "https://www.earthchem.org", + "url": "https://www.earthchem.org/library" + }, + "provider": { + "@type": "Organization", + "name": "EarthChem Library" + }, + "spatialCoverage": { + "@type": "Place", + "geo": [{ + "@type": "GeoCoordinates", + "latitude": "40.6644474", + "longitude": "-77.9056298" + }, { + "@type": "GeoCoordinates", + "latitude": "40.6647643", + "longitude": "-77.9040381" + }, { + "@type": "GeoCoordinates", + "latitude": "40.664841", + "longitude": "-77.9072532" + }, { + "@type": "GeoCoordinates", + "latitude": "40.6648488", + "longitude": "-77.9072458" + }] + }, + "url": "https://doi.org/10.1594/IEDA/100243", + "funder": { + "@type": "MonetaryGrant", + "fundedItem": { + "@id": "https://doi.org/10.1594/IEDA/100243" + }, + "funder": [{ + "@type": "Organization", + "name": "National Science Foundation", + "url": "http://www.nsf.gov/awardsearch/showAward.do?AwardNumber=2012123" + }] + } +} \ No newline at end of file diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py index 06f3d40..1dbbdc4 100644 --- a/tests/test_jsonld.py +++ b/tests/test_jsonld.py @@ -8,6 +8,7 @@ from dspback.schemas.discovery import JSONLD from dspback.utils.jsonld.clusters import clusters from dspback.utils.jsonld.scraper import format_fields +from tests import change_test_dir, earthchem_jsonld ids_and_cluster = [ ("2012073", "Bedrock Cluster"), @@ -140,131 +141,11 @@ class MockSubmission(BaseModel): @pytest.mark.asyncio -async def test_earthchem_jsonld(): - metadata_json = { - "@context": {"@vocab": "https://schema.org/", "datacite": "http://purl.org/spar/datacite/"}, - "@id": "https://doi.org/10.1594/IEDA/100243", - "@type": "Dataset", - "name": "Susquehanna Shale Hills Critical Zone Observatory Stream Water Chemistry (2010)", - "sameAs": "https://ecl.earthchem.org/view.php?id=523", - "isAccessibleForFree": True, - "citation": ["https://doi.org/10.2136/vzj2010.0133"], - "author": { - "@list": [ - { - "@type": "Role", - "author": [ - {"@type": "Person", "name": "Susan L. Brantley", "givenName": "Susan", "familyName": "Brantley"} - ], - "roleName": "Lead Author", - }, - { - "@type": "Role", - "author": [ - { - "@type": "Person", - "name": "Pamela L. Sullivan", - "givenName": "Pamela", - "familyName": "Sullivan", - }, - { - "@type": "Person", - "name": "Danielle Andrews", - "givenName": "Danielle", - "familyName": "Andrews", - }, - {"@type": "Person", "name": "George Holmes", "givenName": "George", "familyName": "Holmes"}, - {"@type": "Person", "name": "Molly Holleran", "givenName": "Molly", "familyName": "Holleran"}, - { - "@type": "Person", - "name": "Jennifer Z. Williams", - "givenName": "Jennifer", - "familyName": "Williams", - }, - { - "@type": "Person", - "name": "Elizabeth Herndon", - "givenName": "Elizabeth", - "familyName": "Herndon", - }, - {"@type": "Person", "name": "Maya Bhatt", "givenName": "Maya", "familyName": "Bhatt"}, - { - "@type": "Person", - "name": "Ekaterina Bazilevskaya", - "givenName": "Ekaterina", - "familyName": "Bazilevskaya", - }, - { - "@type": "Person", - "name": "Tiffany Yesavage", - "givenName": "Tiffany", - "familyName": "Yesavage", - }, - {"@type": "Person", "name": "Evan Thomas", "givenName": "Evan", "familyName": "Thomas"}, - {"@type": "Person", "name": "Chris J. Duffy", "givenName": "Chris", "familyName": "Duffy"}, - ], - "roleName": "Coauthor", - }, - ] - }, - "description": "Stream water chemistry at Susquehanna Shale Hills Critical Zone Observatory in 2010. Weekly to monthly grab samples were collected at three locations along the first order Stream: at the Headwater (SH), Middle (SM) and adjacent to the Weir (SW). Daily stream water sample were also collected adjacent to the weir from using automatic samplers (2700 series, Teledyne Isco, Lincoln, NE) and were referenced as SW-ISCO. ", - "distribution": { - "datePublished": "2013-02-05 00:00:00", - "contentUrl": "https://ecl.earthchem.org/view.php?id=523", - "@type": "DataDownload", - "encodingFormat": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - }, - "license": "https://spdx.org/licenses/CC-BY-SA-4.0", - "dateCreated": "2013-02-04", - "inLanguage": "English", - "keywords": [ - "Susquehanna Shale Hills", - "Pennsylvania", - "Regional (Continents, Oceans)", - "Stream water", - "geochemistry", - "DOC", - "trace elements", - "major ions", - ], - "publisher": { - "contactPoint": { - "@type": "ContactPoint", - "name": "Information Desk", - "contactType": "Customer Service", - "email": "info@earthchem.org", - "url": "https://www.earthchem.org/contact/", - }, - "@type": "Organization", - "name": "EarthChem Library", - "@id": "https://www.earthchem.org", - "url": "https://www.earthchem.org/library", - }, - "provider": {"@type": "Organization", "name": "EarthChem Library"}, - "spatialCoverage": { - "@type": "Place", - "geo": [ - {"@type": "GeoCoordinates", "latitude": "40.6644474", "longitude": "-77.9056298"}, - {"@type": "GeoCoordinates", "latitude": "40.6647643", "longitude": "-77.9040381"}, - {"@type": "GeoCoordinates", "latitude": "40.664841", "longitude": "-77.9072532"}, - {"@type": "GeoCoordinates", "latitude": "40.6648488", "longitude": "-77.9072458"}, - ], - }, - "url": "https://doi.org/10.1594/IEDA/100243", - "funder": { - "@type": "MonetaryGrant", - "fundedItem": {"@id": "https://doi.org/10.1594/IEDA/100243"}, - "funder": [ - { - "@type": "Organization", - "name": "National Science Foundation", - "url": "http://www.nsf.gov/awardsearch/showAward.do?AwardNumber=0725019", - } - ], - }, - } - - scraped_jsonld = format_fields(metadata_json) +async def test_earthchem_jsonld(earthchem_jsonld): + scraped_jsonld = format_fields(earthchem_jsonld) + scraped_jsonld["clusters"] = clusters(scraped_jsonld) jsonld = JSONLD(**scraped_jsonld) assert jsonld.provider.name == "EarthChem Library" assert jsonld.context == "https://schema.org/" + assert len(jsonld.funding) == 1 + assert len(jsonld.clusters) == 1