From a0ec3909677467d10ce9518204061e5465599963 Mon Sep 17 00:00:00 2001
From: Sebastin Santy
Date: Wed, 4 Jul 2018 17:43:29 +0530
Subject: [PATCH] Using JSONLD to retrieve information

---
 Review notes (this section is ignored when the patch is applied):
 - handle_keys no longer uses try/catch for key lookup; `catch KeyError`
   bound a variable of that name and swallowed every exception.
 - mainpage_url(::JSONLD_DOI) reports a missing DOI instead of failing with
   UndefVarError on `json`, and builds the URL without `joinpath`.
 - description falls back to "Unknown Date" when no date key is present.
 - JSONLD_Web reuses the generic mainpage_url(::JSONLD) instead of carrying
   an identical copy of it.
 - The blob ids on the `index` lines of the src/JSONLD files predate these
   changes.

 src/DataDepsGenerators.jl               |  4 +-
 src/JSONLD/JSONLD.jl                    | 95 +++++++++++++++++++++++++
 src/JSONLD/JSONLD_DOI.jl                | 15 ++++
 src/JSONLD/JSONLD_Web.jl                |  2 +
 test/JSONLD/JSONLD.jl                   |  9 +++
 test/references/JSONLD_DOI Figshare.txt | 14 ++++
 test/references/JSONLD_Web Kaggle.txt   | 20 ++++++
 7 files changed, 157 insertions(+), 2 deletions(-)
 create mode 100644 src/JSONLD/JSONLD.jl
 create mode 100644 src/JSONLD/JSONLD_DOI.jl
 create mode 100644 src/JSONLD/JSONLD_Web.jl
 create mode 100644 test/JSONLD/JSONLD.jl
 create mode 100644 test/references/JSONLD_DOI Figshare.txt
 create mode 100644 test/references/JSONLD_Web Kaggle.txt

diff --git a/src/DataDepsGenerators.jl b/src/DataDepsGenerators.jl
index de8b5a3..5bcde05 100644
--- a/src/DataDepsGenerators.jl
+++ b/src/DataDepsGenerators.jl
@@ -5,7 +5,7 @@ using JSON
 using HTTP
 
 export generate, citation_text
-export UCI, GitHub, DataDryad, DataOneV1, DataOneV2, CKAN, DataCite, Figshare
+export UCI, GitHub, DataDryad, DataOneV1, DataOneV2, CKAN, DataCite, Figshare, JSONLD
 
 abstract type DataRepo end
 
@@ -41,7 +41,7 @@ include("DataOneV2/DataOneV2.jl")
 include("CKAN.jl")
 include("DataCite.jl")
 include("Figshare.jl")
-
+include("JSONLD/JSONLD.jl")
 
 function message(meta)
     escape_multiline_string("""
diff --git a/src/JSONLD/JSONLD.jl b/src/JSONLD/JSONLD.jl
new file mode 100644
index 0000000..4c4fd34
--- /dev/null
+++ b/src/JSONLD/JSONLD.jl
@@ -0,0 +1,95 @@
+# Generators for datasets that are described by JSON-LD metadata.
+abstract type JSONLD <: DataRepo
+end
+
+export JSONLD_Web, JSONLD_DOI
+
+include("JSONLD_Web.jl")
+include("JSONLD_DOI.jl")
+
+# Build the Author/License/Date header followed by the dataset description.
+function description(repo::JSONLD, mainpage)
+    desc = handle_keys("description", "", mainpage)
+    authors = handle_keys("author", "creator", mainpage)
+    if authors !== nothing
+        stripauthors = [handle_keys("name", "", ii) for ii in authors if handle_keys("name", "", ii) !== nothing]
+        author = format_authors(stripauthors)
+    else
+        author = "Unknown Author"
+    end
+    license = get_license(mainpage)
+    rawdate = handle_keys("datePublished", "dateModified", mainpage)
+    if rawdate !== nothing
+        date = Dates.format(Dates.DateTime(rawdate), "U d, yyyy")
+    else
+        # Neither date key exists; Dates.DateTime(nothing) would throw.
+        date = "Unknown Date"
+    end
+
+    """
+    Author: $(author)
+    License: $(license)
+    Date: $(date)
+
+    $(desc)
+    """
+end
+
+# Return the `license` entry as a string, or `nothing` when it is absent.
+# A Dict-valued license is reduced to its "url" (or else "text") field.
+function get_license(mainpage)
+    license = handle_keys("license", "", mainpage)
+    if isa(license, String)
+        return license
+    elseif isa(license, Dict)
+        return handle_keys("url", "text", license)
+    end
+    return nothing
+end
+
+# Look up `key1` in `json`, falling back to `key2`; `nothing` when neither
+# key exists or `json` is not a Dict.
+function handle_keys(key1::String, key2::String, json)
+    isa(json, Dict) || return nothing
+    haskey(json, key1) && return json[key1]
+    return get(json, key2, nothing)
+end
+
+# Collect the "contentUrl" of every entry listed under "distribution".
+function get_urls(repo::JSONLD, page)
+    url_list = handle_keys("distribution", "", page)
+    if url_list === nothing
+        # Left untyped: the DOI reference output prints this as `Any[]`.
+        return []
+    end
+    return [handle_keys("contentUrl", "", ii) for ii in url_list if handle_keys("contentUrl", "", ii) !== nothing]
+end
+
+# No checksums are extracted from the metadata, so the list is always empty.
+function get_checksums(repo::JSONLD, page)
+    return []
+end
+
+function data_fullname(::JSONLD, mainpage)
+    mainpage["name"]
+end
+
+function website(::JSONLD, mainpage_url, mainpage)
+    mainpage_url
+end
+
+# Default lookup: fetch the page at `dataname` and parse its single
+# application/ld+json <script> block. Returns (parsed JSON, dataname).
+function mainpage_url(repo::JSONLD, dataname)
+    page = getpage(dataname)
+    pattern = sel"script[type=\"application/ld+json\"]"
+    jsonld_blocks = matchall(pattern, page.root)
+    if length(jsonld_blocks) == 0
+        error("No JSON-LD Linked Data Found")
+    elseif length(jsonld_blocks) > 1
+        error("Expected exactly one JSON-LD block, found $(length(jsonld_blocks))")
+    end
+    script_block = text_only(first(jsonld_blocks))
+    json = JSON.parse(script_block)
+    json, dataname
+end
diff --git a/src/JSONLD/JSONLD_DOI.jl b/src/JSONLD/JSONLD_DOI.jl
new file mode 100644
index 0000000..624460d
--- /dev/null
+++ b/src/JSONLD/JSONLD_DOI.jl
@@ -0,0 +1,15 @@
+# JSON-LD metadata looked up by DOI through data.datacite.org.
+struct JSONLD_DOI <: JSONLD end
+
+# Request the schema.org JSON-LD record for the DOI contained in `dataname`.
+function mainpage_url(repo::JSONLD_DOI, dataname)
+    doi = match_doi(dataname)
+    if doi === nothing
+        error("No DOI found in \"$(dataname)\"")
+    end
+    # Plain concatenation: `joinpath` is meant for filesystem paths, not URLs.
+    url = string("https://data.datacite.org/", doi)
+    resp = HTTP.get(url, ["Accept"=>"application/vnd.schemaorg.ld+json"]; forwardheaders=true)
+    json = JSON.parse(resp.body |> String |> strip)
+    json, dataname
+end
diff --git a/src/JSONLD/JSONLD_Web.jl b/src/JSONLD/JSONLD_Web.jl
new file mode 100644
index 0000000..d4305f1
--- /dev/null
+++ b/src/JSONLD/JSONLD_Web.jl
@@ -0,0 +1,2 @@
+# JSON-LD embedded in a web page; relies on the generic `mainpage_url(::JSONLD, ...)`.
+struct JSONLD_Web <: JSONLD end
diff --git a/test/JSONLD/JSONLD.jl b/test/JSONLD/JSONLD.jl
new file mode 100644
index 0000000..3734093
--- /dev/null
+++ b/test/JSONLD/JSONLD.jl
@@ -0,0 +1,9 @@
+using DataDepsGenerators
+using Base.Test
+
+using ReferenceTests
+
+@testset "JSONLD test" begin
+    @test_reference "../references/JSONLD_Web Kaggle.txt" generate(JSONLD_Web(), "https://zenodo.org/record/1287281")
+    @test_reference "../references/JSONLD_DOI Figshare.txt" generate(JSONLD_DOI(), "10.1371/journal.pbio.2001414")
+end
diff --git a/test/references/JSONLD_DOI Figshare.txt b/test/references/JSONLD_DOI Figshare.txt
new file mode 100644
index 0000000..cfee233
--- /dev/null
+++ b/test/references/JSONLD_DOI Figshare.txt
@@ -0,0 +1,14 @@
+register(DataDep(
+    "Identifiers for the 21st century How to design, provision, and reuse persistent identifiers to maximize utility and impact of life science data",
+    """
+    Dataset: Identifiers for the 21st century: How to design, provision, and reuse persistent identifiers to maximize utility and impact of life science data
+    Website: 10.1371/journal.pbio.2001414
+    Author: Julie A. McMurry et al.
+    License: http://creativecommons.org/licenses/by/4.0
+    Date: June 29, 2017
+
+    nothing
+    """,
+    Any[],
+    []
+))
diff --git a/test/references/JSONLD_Web Kaggle.txt b/test/references/JSONLD_Web Kaggle.txt
new file mode 100644
index 0000000..306e8fa
--- /dev/null
+++ b/test/references/JSONLD_Web Kaggle.txt
@@ -0,0 +1,20 @@
+register(DataDep(
+    "Map of Co-Seismic Landslides for the M 7.8 Kaikoura, New Zealand Earthquake",
+    """
+    Dataset: Map of Co-Seismic Landslides for the M 7.8 Kaikoura, New Zealand Earthquake
+    Website: https://zenodo.org/record/1287281
+    Author: Valkaniotis Sotiris et al.
+    License: https://creativecommons.org/licenses/by/4.0/
+    Date: December 20, 2016
+
+    Prepared by the Research Group on Earthquake Geology in Greece (http://eqgeogr.weebly.com/)
+
+    Version 2 (updated)
+
+    With the release of new Sentinel-2 images, and other available resources for the M7.8 Kaikoura earthquake, we present an update of the Map of Co-Seismic Landslides and Surfaces Ruptures (As of 27/11/2016). Landslides were mapped using Sentinel-2 satellite images from Copernicus, European Space Agency, dated November and December 2016. Images were visually compared with previous last available S2A images without cloud cover (13 September and 26 October) and landslides and large slope failures were manually mapped. Areas covered by cloud are omitted and shown on map. 5875 landslide sites are shown in the map. A small number of landslides could have been mis-identified due to insufficient resolution of the images, small gaps of cloud cover or for other reasons. Also, re-activated landslides on the central mountainous area were unabled to identify due to imagery restrictions (medium resolution, relief shadows etc). Some local gaps in Sentinel imagery still exist due to cloud cover, but we believe the current map is very close to the major distribution of mass movement effects. Surface ruptures were mapped using Sentinel-2 imagery and approximate position from photos of the post-earthquake aerial surveys of Environment Canterbury Regional Council (http://ecan.govt.nz)
+
+    KML file contains7355 landslide spots.
+    """,
+    String["https://zenodo.org/api/files/5a311c7a-bd5e-4df7-be61-341d03ec9a9b/Landslide_Map_V2_A2.pdf", "https://zenodo.org/api/files/5a311c7a-bd5e-4df7-be61-341d03ec9a9b/Landslides_Kaikoura_2016.kmz", "https://zenodo.org/api/files/5a311c7a-bd5e-4df7-be61-341d03ec9a9b/Prelim_Landslide_Map_A2.jpg"],
+    []
+))