diff --git a/dspback/config/__init__.py b/dspback/config/__init__.py index 0712099..7377578 100644 --- a/dspback/config/__init__.py +++ b/dspback/config/__init__.py @@ -59,6 +59,7 @@ class Settings(BaseSettings): earthchem_file_delete_url: HttpUrl earthchem_file_read_url: HttpUrl earthchem_view_url: HttpUrl + earthchem_public_view_url: HttpUrl earthchem_health_url: HttpUrl mongo_username: str diff --git a/dspback/pydantic_schemas.py b/dspback/pydantic_schemas.py index 7dcaf7d..c1cbd3f 100644 --- a/dspback/pydantic_schemas.py +++ b/dspback/pydantic_schemas.py @@ -67,7 +67,7 @@ class Submission(Document): identifier: str = None submitted: datetime = datetime.utcnow() url: HttpUrl = None - metadata_json: str = {} + metadata_json: str = "{}" @validator('authors', pre=True, allow_reuse=True) def extract_author_names(cls, values): @@ -264,8 +264,7 @@ class License(BaseModel): def to_submission(self, identifier) -> Submission: settings = get_settings() - view_url = settings.earthchem_view_url - view_url = view_url % identifier + view_url = settings.earthchem_public_view_url % identifier authors = [contributor.name for contributor in self.contributors] authors.insert(0, self.leadAuthor.name) return Submission( diff --git a/dspback/routers/earthchem.py b/dspback/routers/earthchem.py index 664ffa5..c4fc824 100644 --- a/dspback/routers/earthchem.py +++ b/dspback/routers/earthchem.py @@ -116,7 +116,7 @@ async def _retrieve_metadata_from_repository(self, request: Request, identifier) json_metadata["leadAuthor"] = lead_author json_metadata["contributors"] = all_contributors - return self.wrap_metadata(json_metadata, "status" in json_metadata and json_metadata["status"] != "incomplete") + return self.wrap_metadata(json_metadata, "status" in json_metadata and json_metadata["status"] == "published") @router.get( '/metadata/earthchem/{identifier}', diff --git a/dspback/schemas/earthchem/schema.json b/dspback/schemas/earthchem/schema.json index 792952b..7b96279 100644 --- a/dspback/schemas/earthchem/schema.json +++ b/dspback/schemas/earthchem/schema.json @@ -98,7 +98,8 @@ "options": { "hidden": true }, "enum": [ "incomplete", - "submitted" + "submitted", + "published" ] }, "additionalTypes": { @@ -198,7 +199,6 @@ "url": { "type": "string", "default": "https://ror.org/021nxhr62", - "const": "https://ror.org/021nxhr62", "options": { "hidden": true } } } @@ -225,7 +225,6 @@ "url": { "type": "string", "default": "https://ror.org/052csg198", - "const": "https://ror.org/052csg198", "options": { "hidden": true } } } @@ -252,7 +251,6 @@ "url": { "type": "string", "default": "https://ror.org/01bj3aw27", - "const": "https://ror.org/01bj3aw27", "options": { "hidden": true } } } @@ -279,7 +277,6 @@ "url": { "type": "string", "default": "https://ror.org/027ka1x80", - "const": "https://ror.org/027ka1x80", "options": { "hidden": true } } } @@ -306,7 +303,6 @@ "url": { "type": "string", "default": "https://ror.org/0472cxd90", - "const": "https://ror.org/0472cxd90", "options": { "hidden": true } } } @@ -333,7 +329,6 @@ "url": { "type": "string", "default": "https://ror.org/018mejw64", - "const": "https://ror.org/018mejw64", "options": { "hidden": true } } } @@ -360,7 +355,6 @@ "url": { "type": "string", "default": "https://ror.org/01h0zpd94", - "const": "https://ror.org/01h0zpd94", "options": { "hidden": true } } } @@ -387,7 +381,6 @@ "url": { "type": "string", "default": "https://ror.org/05mmh0f86", - "const": "https://ror.org/05mmh0f86", "options": { "hidden": true } } } @@ -414,7 +407,6 @@ "url": { "type": "string", "default": "https://ror.org/03y2gwe85", - "const": "https://ror.org/03y2gwe85", "options": { "hidden": true } } } @@ -441,7 +433,6 @@ "url": { "type": "string", "default": "https://ror.org/02b5d8509", - "const": "https://ror.org/02b5d8509", "options": { "hidden": true } } } diff --git a/dspback/utils/jsonld/formatter.py b/dspback/utils/jsonld/formatter.py index 5578825..c568e95 100644 --- a/dspback/utils/jsonld/formatter.py +++ b/dspback/utils/jsonld/formatter.py @@ -57,7 +57,16 @@ def format_fields(json_ld): json_ld["license"] = {"text": json_ld["license"]} if "author" in json_ld: - for author_role in [author_list['author'] for author_list in json_ld['author']['@list']]: - json_ld["creator"] = {'@list': author_role} + author_roles = [author_list for author_list in json_ld['author']['@list']] + author_list = [] + for author_role in author_roles: + if author_role: + author_list = author_list + author_role["author"] + + json_ld["creator"] = {'@list': author_list} + + if "@context" in json_ld: + if not isinstance(json_ld["@context"], str): + json_ld["@context"] = json_ld["@context"]["@vocab"] return json_ld diff --git a/management/refresh_submission_url_earthchem.py b/management/refresh_submission_url_earthchem.py new file mode 100644 index 0000000..22e91c7 --- /dev/null +++ b/management/refresh_submission_url_earthchem.py @@ -0,0 +1,38 @@ +import asyncio +from dspback.pydantic_schemas import RepositoryType + +import motor +from beanie import init_beanie + +from dspback.config import get_settings +from dspback.pydantic_schemas import Submission + +''' +This python script updates the ECL submission urls. + +Example call: + +docker exec dspback python management/refresh_submission_url_earthchem.py +''' + +async def initiaize_beanie(): + db = motor.motor_asyncio.AsyncIOMotorClient(get_settings().mongo_url) + await init_beanie( + database=db[get_settings().mongo_database], document_models=[Submission] + ) + +async def main(): + await initiaize_beanie() + + count = 0 + for submission in await Submission.find(Submission.repo_type == RepositoryType.EARTHCHEM).to_list(): + print(f"updating {submission.url}") + submission.url = get_settings().earthchem_public_view_url % submission.identifier + await submission.save() + print(f"to {submission.url}") + count = count + 1 + print(f"total submission updated {count}") + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index da4b3a1..6655342 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ cryptography==36.0.2 dnspython==2.2.1 ecdsa==0.17.0 email-validator==1.1.3 -fastapi +fastapi==0.89.0 fastapi-restful==0.4.3 greenlet==1.1.2 h11==0.12.0 @@ -51,7 +51,7 @@ rsa==4.8 six==1.16.0 sniffio==1.2.0 soupsieve==2.3.1 -starlette +starlette==0.22.0 tomli==2.0.1 typing_extensions==4.3.0 urllib3==1.26.9 diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py index 6252468..06f3d40 100644 --- a/tests/test_jsonld.py +++ b/tests/test_jsonld.py @@ -5,7 +5,9 @@ from dspback.pydantic_schemas import RepositoryType, Submission from dspback.scheduler import retrieve_submission_json_ld +from dspback.schemas.discovery import JSONLD from dspback.utils.jsonld.clusters import clusters +from dspback.utils.jsonld.scraper import format_fields ids_and_cluster = [ ("2012073", "Bedrock Cluster"), @@ -135,3 +137,134 @@ class MockSubmission(BaseModel): public_jsonld = await retrieve_submission_json_ld(submission.dict()) assert len(public_jsonld["clusters"]) == 1 assert public_jsonld["clusters"][0] == "Drylands Cluster" + + +@pytest.mark.asyncio +async def test_earthchem_jsonld(): + metadata_json = { + "@context": {"@vocab": "https://schema.org/", "datacite": "http://purl.org/spar/datacite/"}, + "@id": "https://doi.org/10.1594/IEDA/100243", + "@type": "Dataset", + "name": "Susquehanna Shale Hills Critical Zone Observatory Stream Water Chemistry (2010)", + "sameAs": "https://ecl.earthchem.org/view.php?id=523", + "isAccessibleForFree": True, + "citation": ["https://doi.org/10.2136/vzj2010.0133"], + "author": { + "@list": [ + { + "@type": "Role", + "author": [ + {"@type": "Person", "name": "Susan L. Brantley", "givenName": "Susan", "familyName": "Brantley"} + ], + "roleName": "Lead Author", + }, + { + "@type": "Role", + "author": [ + { + "@type": "Person", + "name": "Pamela L. Sullivan", + "givenName": "Pamela", + "familyName": "Sullivan", + }, + { + "@type": "Person", + "name": "Danielle Andrews", + "givenName": "Danielle", + "familyName": "Andrews", + }, + {"@type": "Person", "name": "George Holmes", "givenName": "George", "familyName": "Holmes"}, + {"@type": "Person", "name": "Molly Holleran", "givenName": "Molly", "familyName": "Holleran"}, + { + "@type": "Person", + "name": "Jennifer Z. Williams", + "givenName": "Jennifer", + "familyName": "Williams", + }, + { + "@type": "Person", + "name": "Elizabeth Herndon", + "givenName": "Elizabeth", + "familyName": "Herndon", + }, + {"@type": "Person", "name": "Maya Bhatt", "givenName": "Maya", "familyName": "Bhatt"}, + { + "@type": "Person", + "name": "Ekaterina Bazilevskaya", + "givenName": "Ekaterina", + "familyName": "Bazilevskaya", + }, + { + "@type": "Person", + "name": "Tiffany Yesavage", + "givenName": "Tiffany", + "familyName": "Yesavage", + }, + {"@type": "Person", "name": "Evan Thomas", "givenName": "Evan", "familyName": "Thomas"}, + {"@type": "Person", "name": "Chris J. Duffy", "givenName": "Chris", "familyName": "Duffy"}, + ], + "roleName": "Coauthor", + }, + ] + }, + "description": "Stream water chemistry at Susquehanna Shale Hills Critical Zone Observatory in 2010. Weekly to monthly grab samples were collected at three locations along the first order Stream: at the Headwater (SH), Middle (SM) and adjacent to the Weir (SW). Daily stream water sample were also collected adjacent to the weir from using automatic samplers (2700 series, Teledyne Isco, Lincoln, NE) and were referenced as SW-ISCO. ", + "distribution": { + "datePublished": "2013-02-05 00:00:00", + "contentUrl": "https://ecl.earthchem.org/view.php?id=523", + "@type": "DataDownload", + "encodingFormat": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + }, + "license": "https://spdx.org/licenses/CC-BY-SA-4.0", + "dateCreated": "2013-02-04", + "inLanguage": "English", + "keywords": [ + "Susquehanna Shale Hills", + "Pennsylvania", + "Regional (Continents, Oceans)", + "Stream water", + "geochemistry", + "DOC", + "trace elements", + "major ions", + ], + "publisher": { + "contactPoint": { + "@type": "ContactPoint", + "name": "Information Desk", + "contactType": "Customer Service", + "email": "info@earthchem.org", + "url": "https://www.earthchem.org/contact/", + }, + "@type": "Organization", + "name": "EarthChem Library", + "@id": "https://www.earthchem.org", + "url": "https://www.earthchem.org/library", + }, + "provider": {"@type": "Organization", "name": "EarthChem Library"}, + "spatialCoverage": { + "@type": "Place", + "geo": [ + {"@type": "GeoCoordinates", "latitude": "40.6644474", "longitude": "-77.9056298"}, + {"@type": "GeoCoordinates", "latitude": "40.6647643", "longitude": "-77.9040381"}, + {"@type": "GeoCoordinates", "latitude": "40.664841", "longitude": "-77.9072532"}, + {"@type": "GeoCoordinates", "latitude": "40.6648488", "longitude": "-77.9072458"}, + ], + }, + "url": "https://doi.org/10.1594/IEDA/100243", + "funder": { + "@type": "MonetaryGrant", + "fundedItem": {"@id": "https://doi.org/10.1594/IEDA/100243"}, + "funder": [ + { + "@type": "Organization", + "name": "National Science Foundation", + "url": "http://www.nsf.gov/awardsearch/showAward.do?AwardNumber=0725019", + } + ], + }, + } + + scraped_jsonld = format_fields(metadata_json) + jsonld = JSONLD(**scraped_jsonld) + assert jsonld.provider.name == "EarthChem Library" + assert jsonld.context == "https://schema.org/" diff --git a/tests/test_records.py b/tests/test_records.py index 07850a3..0efe46c 100644 --- a/tests/test_records.py +++ b/tests/test_records.py @@ -81,4 +81,4 @@ async def test_earthchem_to_submission(earthchem): assert earthchem_submission.repo_type == RepositoryType.EARTHCHEM assert earthchem_submission.submitted <= datetime.utcnow() assert earthchem_submission.identifier == "947940" - assert earthchem_submission.url == get_settings().earthchem_view_url % "947940" + assert earthchem_submission.url == get_settings().earthchem_public_view_url % "947940"