Skip to content

Commit

Permalink
Using JSONLD to retrieve information (#43)
Browse files Browse the repository at this point in the history
* Using JSONLD to retrieve information

* Add to JSONLD runtests

* Use get() instead of try for dict keys

* Make necessary changes

* Necessary changes II

* Resolve mistake

* Make necessary changes III

* Strip HTML from license
  • Loading branch information
SebastinSanty authored and oxinabox committed Jul 6, 2018
1 parent c0b6dfc commit 69da784
Show file tree
Hide file tree
Showing 13 changed files with 240 additions and 3 deletions.
7 changes: 4 additions & 3 deletions src/DataDepsGenerators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@ using Gumbo, Cascadia, AbstractTrees
using Suppressor
using JSON
using HTTP
using Missings

export generate, citation_text
export UCI, GitHub, DataDryad, DataOneV1, DataOneV2, CKAN, DataCite, Figshare
export UCI, GitHub, DataDryad, DataOneV1, DataOneV2, CKAN, DataCite, Figshare, JSONLD

abstract type DataRepo end

Expand Down Expand Up @@ -41,7 +42,7 @@ include("DataOneV2/DataOneV2.jl")
include("CKAN.jl")
include("DataCite.jl")
include("Figshare.jl")

include("JSONLD/JSONLD.jl")

function message(meta)
escape_multiline_string("""
Expand Down Expand Up @@ -78,7 +79,7 @@ function generate(repo::DataRepo,
"""
end

get_checksums(repo::DataRepo, page) = ""
get_checksums(repo::DataRepo, page) = nothing

function format_checksums(csums::Vector)
csumvec = join(format_checksums.(csums), ", ")
Expand Down
82 changes: 82 additions & 0 deletions src/JSONLD/JSONLD.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""
    JSONLD <: DataRepo

Abstract supertype of the JSON-LD backed repositories. The concrete subtypes differ
only in how they obtain the parsed JSON-LD document; the metadata extraction methods
are written once against this type.
"""
abstract type JSONLD <: DataRepo end

export JSONLD_Web, JSONLD_DOI

include("JSONLD_Web.jl")
include("JSONLD_DOI.jl")

"""
    description(repo::JSONLD, mainpage)

Build the description text for the parsed JSON-LD document `mainpage`: an author
line, a date line, a license line, and finally the dataset's own description.
The description and license are passed through `filter_html` first.
"""
function description(repo::JSONLD, mainpage)
    body = filter_html(handle_keys(mainpage, "description"))
    author_line = format_authors(get_authors(repo, mainpage))
    license_line = filter_html(get_license(mainpage))
    date_line = get_dates(repo, mainpage)

    return """
    Author: $(author_line)
    Date: $(date_line)
    License: $(license_line)
    $(body)
    """
end

# Name of a single JSON-LD author entry, or `nothing` when the entry carries no name.
_author_name(entry::Dict) = handle_keys(entry, "name")
_author_name(entry::AbstractString) = entry
_author_name(entry) = nothing

"""
    get_authors(repo::JSONLD, mainpage)

Return the author names found under the "author" key of `mainpage`, falling back to
"creator".

The value may be a list of entries, a single object, or a plain string. Entries
without a usable name are dropped. An empty `Any[]` is returned when neither key is
present. Any other kind of value raises an error.
"""
function get_authors(repo::JSONLD, mainpage)
    authors = handle_keys(mainpage, "author", "creator")
    if authors === nothing
        return []
    elseif authors isa Vector
        entries = authors
    elseif authors isa Dict || authors isa AbstractString
        entries = [authors]
    else
        # External data: fail with an explicit message instead of relying on `@assert`.
        error("Unsupported JSON-LD author value of type $(typeof(authors))")
    end

    names = _author_name.(entries)
    # `handle_keys` signals an absent key with `nothing`, which `skipmissing` alone
    # would let through; drop those before collecting.
    return collect(skipmissing(filter(name -> name !== nothing, names)))
end

"""
    get_dates(repo::JSONLD, mainpage)

Return the dataset date formatted as "U d, yyyy", taken from the first of
"datePublished", "dateCreated" or "dateModified" present in `mainpage`.

Returns `nothing` when none of the keys is present. When the value cannot be turned
into a `DateTime`, the raw value is returned unchanged so the information is not lost.
"""
function get_dates(repo::JSONLD, mainpage)
    rawdate = handle_keys(mainpage, "datePublished", "dateCreated", "dateModified")
    rawdate === nothing && return nothing

    # Dates can be like '2007' or '2016-12-20'. Need to account for all.
    try
        return Dates.format(Dates.DateTime(rawdate), "U d, yyyy")
    catch err
        # MethodError: the value has a type DateTime cannot be built from.
        # ArgumentError: a string in a format DateTime does not understand.
        if err isa MethodError || err isa ArgumentError
            return rawdate
        end
        rethrow()
    end
end

"""
    get_license(mainpage)

Return the license of the parsed JSON-LD document `mainpage`.

A string license is returned as is. For an object, its "url" is returned, falling
back to its "text". Anything else, including a missing license, yields `nothing`.
"""
function get_license(mainpage)
    entry = handle_keys(mainpage, "license")
    entry isa String && return entry
    entry isa Dict && return handle_keys(entry, "url", "text")
    return nothing
end

"""
    handle_keys(json, key, otherkeys...)

Look up `key` and then each of `otherkeys` in `json`, in order, and return the value
stored under the first key that is present. Returns `nothing` when none is present.
"""
function handle_keys(json, key, otherkeys...)
    for candidate in (key, otherkeys...)
        if haskey(json, candidate)
            return json[candidate]
        end
    end
    return nothing
end

# With no keys left to try there is nothing to return.
handle_keys(json) = nothing

"""
    get_urls(repo::JSONLD, page)

Return the "contentUrl" of every entry under the "distribution" key of `page`.

"distribution" may hold a list of entries or a single object. Entries without a
"contentUrl" are dropped. An empty `Any[]` is returned when `page` has no
"distribution" key.
"""
function get_urls(repo::JSONLD, page)
    distribution = handle_keys(page, "distribution")
    # Kept as `Any[]` so the empty result is the same value callers received before.
    distribution === nothing && return []

    # A lone object is treated as a one-element list.
    entries = distribution isa Dict ? [distribution] : distribution
    urls = handle_keys.(entries, "contentUrl")
    # `handle_keys` signals an absent key with `nothing`, which `skipmissing` alone
    # would let through; drop those before collecting.
    return collect(skipmissing(filter(url -> url !== nothing, urls)))
end

"""
    data_fullname(::JSONLD, mainpage)

Return the value stored under the "name" key of `mainpage`.
"""
data_fullname(::JSONLD, mainpage) = mainpage["name"]

"""
    website(::JSONLD, mainpage_url, mainpage)

Return `mainpage_url` unchanged; `mainpage` is not consulted.
"""
website(::JSONLD, mainpage_url, mainpage) = mainpage_url
10 changes: 10 additions & 0 deletions src/JSONLD/JSONLD_DOI.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""
    JSONLD_DOI <: JSONLD

JSON-LD repository whose document is requested from data.datacite.org by DOI.
"""
struct JSONLD_DOI <: JSONLD end

"""
    mainpage_url(repo::JSONLD_DOI, dataname)

Request the schema.org JSON-LD document for the DOI found in `dataname` from
data.datacite.org and return `(json, dataname)`, where `json` is the parsed document.

Raises an error when `match_doi` finds no DOI in `dataname`.
"""
function mainpage_url(repo::JSONLD_DOI, dataname)
    # NOTE(review): assumes `match_doi` returns the DOI as a string without a leading
    # slash, or `nothing` when there is no match — confirm against its definition.
    doi = match_doi(dataname)
    if doi === nothing
        error("No DOI found in \"$(dataname)\"")
    end

    # Plain concatenation rather than `joinpath`: this is a URL, not a filesystem path.
    url = string("https://data.datacite.org/", doi)
    resp = HTTP.get(
        url,
        ["Accept"=>"application/vnd.schemaorg.ld+json"];
        forwardheaders=true,
    )
    json = resp.body |> String |> JSON.parse
    return json, dataname
end
14 changes: 14 additions & 0 deletions src/JSONLD/JSONLD_Web.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
    JSONLD_Web <: JSONLD

JSON-LD repository whose document is embedded in a web page as a
`<script type="application/ld+json">` element.
"""
struct JSONLD_Web <: JSONLD end

"""
    mainpage_url(repo::JSONLD_Web, dataname)

Fetch the page at `dataname`, parse its single embedded JSON-LD script block and
return `(json, dataname)`.

Raises an error when the page has no JSON-LD block, or more than one.
"""
function mainpage_url(repo::JSONLD_Web, dataname)
    page = getpage(dataname)
    pattern = sel"script[type=\"application/ld+json\"]"
    jsonld_blocks = matchall(pattern, page.root)

    # The page content is external input, so validate it with real errors rather
    # than `@assert`, which is meant for internal invariants only.
    if isempty(jsonld_blocks)
        error("No JSON-LD Linked Data Found")
    elseif length(jsonld_blocks) != 1
        error("Expected exactly one JSON-LD block but found $(length(jsonld_blocks))")
    end

    script_block = text_only(first(jsonld_blocks))
    json = JSON.parse(script_block)
    return json, dataname
end
9 changes: 9 additions & 0 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,15 @@ text_only(doc::HTMLDocument) = text_only(doc.root)
# Collect the text of every HTMLText leaf under `frag`, remove "\r" characters from
# each piece, and join the pieces with a single space.
text_only(frag) = join([replace(text(leaf), "\r","") for leaf in Leaves(frag) if leaf isa HTMLText], " ")
# Apply `text_only` to each fragment and join the results with a single space.
text_only(frags::Vector) = join(text_only.(frags), " ")

# Nothing to filter when there is no content.
filter_html(::Void) = nothing

"""
    filter_html(content)

Return the plain text of `content` when it contains something that looks like an
HTML tag; otherwise return `content` unchanged.
"""
function filter_html(content)
    # A "<...>" span, allowing quoted attribute values inside, marks the content as HTML.
    looks_like_html = ismatch(r"<(\"[^\"]*\"|'[^']*'|[^'\">])*>", content)
    return looks_like_html ? text_only(parsehtml(content)) : content
end

"
indent(str)
Expand Down
13 changes: 13 additions & 0 deletions test/JSONLD/JSONLD.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
using DataDepsGenerators
using Base.Test

using ReferenceTests

# Reference tests for the JSON-LD generators: the output of each `generate` call is
# compared against the stored reference file named on the same line.
# NOTE(review): the arguments are live URLs/DOIs, so these presumably need network
# access and depend on the remote pages staying unchanged — confirm.
@testset "JSONLD test" begin
# JSONLD_Web: the JSON-LD is taken from the web page at the given URL.
@test_reference "../references/JSONLD_Web Kaggle.txt" generate(JSONLD_Web(), "https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey")
@test_reference "../references/JSONLD_Web Zenodo.txt" generate(JSONLD_Web(), "https://zenodo.org/record/1287281")
@test_reference "../references/JSONLD_Web ICRISAT.txt" generate(JSONLD_Web(), "http://dataverse.icrisat.org/dataset.xhtml?persistentId=doi:10.21421/D2/ZS6XX1")
@test_reference "../references/JSONLD_Web Figshare.txt" generate(JSONLD_Web(), "https://figshare.com/articles/_shows_examples_of_coordinated_and_uncoordinated_motion_for_dangerous_and_non_dangerous_crowd_behavior_/186003")
# JSONLD_DOI: once with a bare DOI and once with the DOI embedded in a URL.
# NOTE(review): the first reference file is named "Figshare" but uses the same DOI
# (10.1371/journal.pbio.2001414) as the PBIO case — confirm the intended dataset.
@test_reference "../references/JSONLD_DOI Figshare.txt" generate(JSONLD_DOI(), "10.1371/journal.pbio.2001414")
@test_reference "../references/JSONLD_DOI PBIO.txt" generate(JSONLD_DOI(), "https://data.datacite.org/10.1371/journal.pbio.2001414")
end
14 changes: 14 additions & 0 deletions test/references/JSONLD_DOI Figshare.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
register(DataDep(
"Identifiers for the 21st century How to design, provision, and reuse persistent identifiers to maximize utility and impact of life science data",
"""
Dataset: Identifiers for the 21st century: How to design, provision, and reuse persistent identifiers to maximize utility and impact of life science data
Website: 10.1371/journal.pbio.2001414
Author: Julie A. McMurry et al.
Date: June 29, 2017
License: http://creativecommons.org/licenses/by/4.0

nothing
""",
Any[],

))
14 changes: 14 additions & 0 deletions test/references/JSONLD_DOI PBIO.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
register(DataDep(
"Identifiers for the 21st century How to design, provision, and reuse persistent identifiers to maximize utility and impact of life science data",
"""
Dataset: Identifiers for the 21st century: How to design, provision, and reuse persistent identifiers to maximize utility and impact of life science data
Website: https://data.datacite.org/10.1371/journal.pbio.2001414
Author: Julie A. McMurry et al.
Date: June 29, 2017
License: http://creativecommons.org/licenses/by/4.0

nothing
""",
Any[],

))
14 changes: 14 additions & 0 deletions test/references/JSONLD_Web Figshare.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
register(DataDep(
"shows examples of coordinated and uncoordinated motion for dangerous and non-dangerous crowd behavior.",
"""
Dataset: shows examples of coordinated and uncoordinated motion for dangerous and non-dangerous crowd behavior.
Website: https://figshare.com/articles/_shows_examples_of_coordinated_and_uncoordinated_motion_for_dangerous_and_non_dangerous_crowd_behavior_/186003
Author: Florian Raudies, Heiko Neumann
Date: nothing
License: nothing

shows examples of coordinated and uncoordinated motion for dangerous and non-dangerous crowd behavior.
""",
String["https://ndownloader.figshare.com/files/515509"],

))
15 changes: 15 additions & 0 deletions test/references/JSONLD_Web ICRISAT.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
register(DataDep(
"Phenotypic evaluation data of medium duration Pigeonpea advanced varieties trial",
"""
Dataset: Phenotypic evaluation data of medium duration Pigeonpea advanced varieties trial
Website: http://dataverse.icrisat.org/dataset.xhtml?persistentId=doi:10.21421/D2/ZS6XX1
Author: Sameer Kumar, CV, Anupama Hingane
Date: December 30, 2017
License:
These data and documents are licensed under a Creative Commons Attribution 4.0 International license. You may copy, distribute and transmit the data as long as you acknowledge the source through proper data citation. Disclaimer Whilst utmost care has been taken by ICRISAT and data authors while collecting and compiling the data, the data is however offered "as is" with no express or implied warranty. In no event shall the data authors, ICRISAT, or relevant funding agencies be liable for any actual, incidental or consequential damages arising from use of the data. By using the ICRISAT Dataverse, the user expressly acknowledges that the Data may contain some nonconformities, defects, or errors. No warranty is given that the data will meet the user's needs or expectations or that all nonconformities, defects, or errors can or will be corrected. The user should always verify actual data; therefore the user bears all responsibility in determining whether the data is fit for the user’s intended use. The user of the data should use the related publications as a baseline for their analysis whenever possible. Doing so will be an added safeguard against misinterpretation of the data. Related publications are listed in the metadata section of the Dataverse study.

This database includes the research work carried out on development of medium duration pigeonpea cultivars including advanced varieties at ICRISAT Center, Patancheru (17°30'N 78°16'46E). Pigeon pea is a very important grain legume crop for food other uses in Asia and Africa. It is often cross-pollinated species with a diploid number of 2n= 2x22 and genome size of 858Mbp. Every year 50 to 100 and above new crosses (and also CMS hybrids) will be made evaluated in nurseries to develop new high yielding cultivars with adaptability to different climatic/agronomic zones. Based on their agronomic performance in nurseries for maturity time, branching pattern and number of branches, pod color, pod yield and other pest and diseases tolerance characters etc, the superior progenies will be selected and advanced to further generations (to F5s). The F5 progenies selected based on preliminary/nursery data will be evaluated along with controls in replicated (twice or thrice) trials every year for further agronomic evaluation and selection. The agronomic data (days to 50% flowering and/or maturity, plant height, grain yield, grain size and color etc) of the progenies evaluated in years 2015 were presented herewith. The trial details and plot sizes were given. This data helps us to select and advance further. Finally the few best progenies among them will be evaluated in on-farm trials (OFTs) and in multi-location trials. The best performed progenies will be considered to promote/release in respective agronomic zones. Experiment location on Google Map
""",
Any[],

))
36 changes: 36 additions & 0 deletions test/references/JSONLD_Web Kaggle.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
register(DataDep(
"Stack Overflow 2018 Developer Survey",
"""
Dataset: Stack Overflow 2018 Developer Survey
Website: https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey
Author: Stack Overflow
Date: May 15, 2018
License: http://opendatacommons.org/licenses/dbcl/1.0/

### Context

Each year, we at [Stack Overflow](https://stackoverflow.com/) ask the developer community about everything from their favorite technologies to their job preferences. This year marks the eighth year we’ve published our Annual Developer Survey results—with the largest number of respondents yet. Over 100,000 developers took the 30-minute survey in January 2018.

This year, we covered a few new topics ranging from artificial intelligence to ethics in coding. We also found that underrepresented groups in tech responded to our survey at even lower rates than we would expect from their participation in the workforce. Want to dive into the results yourself and see what you can learn about salaries or machine learning or diversity in tech? We look forward to seeing what you find!

### Content

This 2018 Developer Survey results are organized on Kaggle in two tables:

**survey_results_public** contains the main survey results, one respondent per row and one column per question

**survey_results_schema** contains each column name from the main results along with the question text corresponding to that column

There are 98,855 responses in this public data release. These responses are what we consider “qualified” for analytical purposes based on completion and time spent on the survey and included at least one non-PII question. Approximately 20,000 responses were started but not included here because respondents did not answer enough questions, or only answered questions with personally identifying information. Of the qualified responses, 67,441 completed the entire survey.

### Acknowledgements

Massive, heartfelt thanks to all Stack Overflow contributors and lurking developers of the world who took part in the survey this year. We value your generous participation more than you know.

### Inspiration

At Stack Overflow, we put developers first and want [all developers to feel welcome and included on our site](https://stackoverflow.blog/2018/04/26/stack-overflow-isnt-very-welcoming-its-time-for-that-to-change/). Can we use our annual survey to understand what kinds of users are less likely to identify as part of our community, participate, or feel kinship with fellow developers? Check out [our blog post](https://stackoverflow.blog/2018/05/30/public-data-release-of-stack-overflows-2018-developer-survey) for more details.
""",
String["https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey/downloads/stack-overflow-2018-developer-survey.zip/2", "https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey/downloads/survey_results_public.csv/2", "https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey/downloads/survey_results_schema.csv/2"],

))
14 changes: 14 additions & 0 deletions test/references/JSONLD_Web Zenodo.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
register(DataDep(
"Map of Co-Seismic Landslides for the M 7.8 Kaikoura, New Zealand Earthquake",
"""
Dataset: Map of Co-Seismic Landslides for the M 7.8 Kaikoura, New Zealand Earthquake
Website: https://zenodo.org/record/1287281
Author: Valkaniotis Sotiris et al.
Date: December 20, 2016
License: https://creativecommons.org/licenses/by/4.0/

Prepared by the Research Group on Earthquake Geology in Greece (http://eqgeogr.weebly.com/) Version 2 (updated) With the release of new Sentinel-2 images, and other available resources for the M7.8 Kaikoura earthquake, we present an update of the Map of Co-Seismic Landslides and Surfaces Ruptures (As of 27/11/2016). Landslides were mapped using Sentinel-2 satellite images from Copernicus, European Space Agency, dated November and December 2016. Images were visually compared with previous last available S2A images without cloud cover (13 September and 26 October) and landslides and large slope failures were manually mapped. Areas covered by cloud are omitted and shown on map. 5875 landslide sites are shown in the map. A small number of landslides could have been mis-identified due to insufficient resolution of the images, small gaps of cloud cover or for other reasons. Also, re-activated landslides on the central mountainous area were unabled to identify due to imagery restrictions (medium resolution, relief shadows etc). Some local gaps in Sentinel imagery still exist due to cloud cover, but we believe the current map is very close to the major distribution of mass movement effects. Surface ruptures were mapped using Sentinel-2 imagery and approximate position from photos of the post-earthquake aerial surveys of Environment Canterbury Regional Council (http://ecan.govt.nz) KML file contains7355 landslide spots.
""",
String["https://zenodo.org/api/files/5a311c7a-bd5e-4df7-be61-341d03ec9a9b/Landslide_Map_V2_A2.pdf", "https://zenodo.org/api/files/5a311c7a-bd5e-4df7-be61-341d03ec9a9b/Landslides_Kaikoura_2016.kmz", "https://zenodo.org/api/files/5a311c7a-bd5e-4df7-be61-341d03ec9a9b/Prelim_Landslide_Map_A2.jpg"],

))
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ tests = [
"CKAN",
"DataCite",
"Figshare",
"JSONLD/JSONLD",
]

@testset "DataDepGenerators" begin
Expand Down

0 comments on commit 69da784

Please sign in to comment.