-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Using JSONLD to retrieve information #43
Changes from all commits
a0ec390
63640a2
04a9709
921f137
60483e3
6f0b52e
7fd9f56
a4929ff
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
"""
    JSONLD <: DataRepo

Abstract supertype for repositories whose dataset metadata is read from a
parsed JSON-LD document. The concrete subtypes in this package are
`JSONLD_Web` and `JSONLD_DOI`.
"""
abstract type JSONLD <: DataRepo end
|
||
export JSONLD_Web, JSONLD_DOI | ||
|
||
include("JSONLD_Web.jl") | ||
include("JSONLD_DOI.jl") | ||
|
||
"""
    description(repo::JSONLD, mainpage)

Build the description text for the dataset described by the parsed JSON-LD
document `mainpage`: an author line, a date line, a license line, a blank
line, and then the dataset's own `"description"` field.
"""
function description(repo::JSONLD, mainpage)
    summary_text = filter_html(handle_keys(mainpage, "description"))
    byline = format_authors(get_authors(repo, mainpage))
    terms = filter_html(get_license(mainpage))
    published = get_dates(repo, mainpage)

    return """
    Author: $(byline)
    Date: $(published)
    License: $(terms)

    $(summary_text)
    """
end
|
||
"""
    get_authors(repo::JSONLD, mainpage)

Return the author names found in the parsed JSON-LD document `mainpage`.

The `"author"` key is consulted first and `"creator"` second. The value may be
a vector of entries, a single `Dict` entry, or absent. Entries without a
`"name"` key are dropped, so the result only contains actual names. An empty
vector is returned when neither key is present.
"""
function get_authors(repo::JSONLD, mainpage)
    authors = handle_keys(mainpage, "author", "creator")
    if authors isa Vector
        names = handle_keys.(authors, "name")
    elseif authors isa Dict
        names = [handle_keys(authors, "name")]
    else
        # Internal invariant: the only other value handled here is "absent".
        @assert authors === nothing
        return []
    end
    # `handle_keys` reports an absent key as `nothing`, which `skipmissing`
    # does not remove, so nameless entries are filtered out explicitly.
    return [name for name in skipmissing(names) if name !== nothing]
end
|
||
"""
    get_dates(repo::JSONLD, mainpage)

Return the dataset's date formatted as `"U d, yyyy"` (for example
`June 29, 2017`).

The first key present among `"datePublished"`, `"dateCreated"` and
`"dateModified"` is used. Returns `nothing` when none of them is present. When
the raw value cannot be converted to a `DateTime`, the raw value is returned
unchanged instead of being discarded.
"""
function get_dates(repo::JSONLD, mainpage)
    rawdate = handle_keys(mainpage, "datePublished", "dateCreated", "dateModified")
    # Nothing to format; avoid relying on an exception for the common "no date" case.
    rawdate === nothing && return nothing
    # Dates can be like '2007' or '2016-12-20'. Need to account for all.
    try
        return Dates.format(Dates.DateTime(rawdate), "U d, yyyy")
    catch
        # Best effort: any value that cannot be parsed (unsupported type or
        # unrecognised format) is shown as-is rather than silently dropped.
        return rawdate
    end
end
|
||
"""
    get_license(mainpage)

Return the license recorded under the `"license"` key of `mainpage`.

A `String` value is returned as-is. For a `Dict` value, its `"url"` entry is
returned, falling back to `"text"`. Any other value (including an absent key)
yields `nothing`.
"""
function get_license(mainpage)
    entry = handle_keys(mainpage, "license")
    entry isa String && return entry
    entry isa Dict && return handle_keys(entry, "url", "text")
    return nothing
end
|
||
"""
    handle_keys(json, key, otherkeys...)
    handle_keys(json)

Look up `key` in `json`; if it is absent, try each of `otherkeys` in order.
Returns the value of the first key that is present, or `nothing` once no
candidate keys remain.
"""
function handle_keys(json, key, otherkeys...)
    # The fallback is only evaluated when `key` is not present.
    fallback = () -> handle_keys(json, otherkeys...)
    return get(fallback, json, key)
end

# Base case: every candidate key has been tried without success.
function handle_keys(json)
    return nothing
end
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. make this A bunch of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How will I handle There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll make a PR on this and show you. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, it will be helpful if you can. |
||
|
||
"""
    get_urls(repo::JSONLD, page)

Return the download URLs listed under the `"distribution"` key of the parsed
JSON-LD document `page`.

Each distribution entry contributes its `"contentUrl"`; entries without one
are skipped. A single distribution object (a `Dict`) is treated as a
one-element list. Returns an empty vector when `"distribution"` is absent.
"""
function get_urls(repo::JSONLD, page)
    url_list = handle_keys(page, "distribution")
    url_list === nothing && return []
    # A lone distribution object is wrapped so it is handled like a list.
    entries = url_list isa Dict ? [url_list] : url_list
    links = handle_keys.(entries, "contentUrl")
    # `handle_keys` reports an absent key as `nothing`, which `skipmissing`
    # does not remove, so entries without a URL are filtered out explicitly.
    return [link for link in skipmissing(links) if link !== nothing]
end
|
||
# The dataset's full name is the `"name"` field of the JSON-LD document.
data_fullname(::JSONLD, mainpage) = mainpage["name"]
|
||
# The website reported for a JSON-LD dataset is the `mainpage_url` argument, unchanged.
website(::JSONLD, mainpage_url, mainpage) = mainpage_url
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
"""
    JSONLD_DOI <: JSONLD

Repository type for datasets identified by a DOI. Its `mainpage_url` method
requests schema.org JSON-LD for the DOI from `https://data.datacite.org/`.
"""
struct JSONLD_DOI <: JSONLD
end
|
||
"""
    mainpage_url(repo::JSONLD_DOI, dataname)

Fetch the schema.org JSON-LD metadata for the DOI contained in `dataname` from
`https://data.datacite.org/` and return `(json, dataname)`, where `json` is the
parsed response body.

Raises an error when `match_doi` finds no DOI in `dataname`.
"""
function mainpage_url(repo::JSONLD_DOI, dataname)
    doi = match_doi(dataname)
    if doi === nothing
        # Fail with a clear message instead of reaching an unassigned `json` below.
        error("No DOI found in \"$(dataname)\"")
    end
    # Plain concatenation: URLs always use '/', whereas `joinpath` uses the
    # platform's path separator.
    url = string("https://data.datacite.org/", doi)
    # The Accept header asks the service for schema.org JSON-LD.
    resp = HTTP.get(url, ["Accept"=>"application/vnd.schemaorg.ld+json"]; forwardheaders=true)
    json = resp.body |> String |> JSON.parse
    return json, dataname
end
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
"""
    JSONLD_Web <: JSONLD

Repository type for datasets whose web page embeds its metadata in a
`<script type="application/ld+json">` element.
"""
struct JSONLD_Web <: JSONLD
end
|
||
"""
    mainpage_url(repo::JSONLD_Web, dataname)

Download the page at `dataname`, extract its single
`<script type="application/ld+json">` element and return `(json, dataname)`,
where `json` is the parsed content of that element.

Raises an error when the page contains no such element; exactly one element is
asserted otherwise.
"""
function mainpage_url(repo::JSONLD_Web, dataname)
    document = getpage(dataname)
    jsonld_blocks = matchall(sel"script[type=\"application/ld+json\"]", document.root)
    if length(jsonld_blocks) == 0
        error("No JSON-LD Linked Data Found")
    end
    @assert length(jsonld_blocks)==1
    json = JSON.parse(text_only(first(jsonld_blocks)))
    return json, dataname
end
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
using DataDepsGenerators | ||
using Base.Test | ||
|
||
using ReferenceTests | ||
|
||
@testset "JSONLD test" begin
    # Both repository types are field-less structs, so one instance of each is shared.
    web = JSONLD_Web()
    doi = JSONLD_DOI()

    # Pages that embed JSON-LD in a script element.
    @test_reference "../references/JSONLD_Web Kaggle.txt" generate(web, "https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey")
    @test_reference "../references/JSONLD_Web Zenodo.txt" generate(web, "https://zenodo.org/record/1287281")
    @test_reference "../references/JSONLD_Web ICRISAT.txt" generate(web, "http://dataverse.icrisat.org/dataset.xhtml?persistentId=doi:10.21421/D2/ZS6XX1")
    @test_reference "../references/JSONLD_Web Figshare.txt" generate(web, "https://figshare.com/articles/_shows_examples_of_coordinated_and_uncoordinated_motion_for_dangerous_and_non_dangerous_crowd_behavior_/186003")

    # The same DOI given bare and as a data.datacite.org URL.
    # NOTE(review): the first reference file is named "Figshare" although the DOI
    # is a journal.pbio one; confirm the file name is intended.
    @test_reference "../references/JSONLD_DOI Figshare.txt" generate(doi, "10.1371/journal.pbio.2001414")
    @test_reference "../references/JSONLD_DOI PBIO.txt" generate(doi, "https://data.datacite.org/10.1371/journal.pbio.2001414")
end
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
register(DataDep( | ||
"Identifiers for the 21st century How to design, provision, and reuse persistent identifiers to maximize utility and impact of life science data", | ||
""" | ||
Dataset: Identifiers for the 21st century: How to design, provision, and reuse persistent identifiers to maximize utility and impact of life science data | ||
Website: 10.1371/journal.pbio.2001414 | ||
Author: Julie A. McMurry et al. | ||
Date: June 29, 2017 | ||
License: http://creativecommons.org/licenses/by/4.0 | ||
|
||
nothing | ||
""", | ||
Any[], | ||
|
||
)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
register(DataDep( | ||
"Identifiers for the 21st century How to design, provision, and reuse persistent identifiers to maximize utility and impact of life science data", | ||
""" | ||
Dataset: Identifiers for the 21st century: How to design, provision, and reuse persistent identifiers to maximize utility and impact of life science data | ||
Website: https://data.datacite.org/10.1371/journal.pbio.2001414 | ||
Author: Julie A. McMurry et al. | ||
Date: June 29, 2017 | ||
License: http://creativecommons.org/licenses/by/4.0 | ||
|
||
nothing | ||
""", | ||
Any[], | ||
|
||
)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
register(DataDep( | ||
"shows examples of coordinated and uncoordinated motion for dangerous and non-dangerous crowd behavior.", | ||
""" | ||
Dataset: shows examples of coordinated and uncoordinated motion for dangerous and non-dangerous crowd behavior. | ||
Website: https://figshare.com/articles/_shows_examples_of_coordinated_and_uncoordinated_motion_for_dangerous_and_non_dangerous_crowd_behavior_/186003 | ||
Author: Florian Raudies, Heiko Neumann | ||
Date: nothing | ||
License: nothing | ||
|
||
shows examples of coordinated and uncoordinated motion for dangerous and non-dangerous crowd behavior. | ||
""", | ||
String["https://ndownloader.figshare.com/files/515509"], | ||
|
||
)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
register(DataDep( | ||
"Phenotypic evaluation data of medium duration Pigeonpea advanced varieties trial", | ||
""" | ||
Dataset: Phenotypic evaluation data of medium duration Pigeonpea advanced varieties trial | ||
Website: http://dataverse.icrisat.org/dataset.xhtml?persistentId=doi:10.21421/D2/ZS6XX1 | ||
Author: Sameer Kumar, CV, Anupama Hingane | ||
Date: December 30, 2017 | ||
License: | ||
These data and documents are licensed under a Creative Commons Attribution 4.0 International license. You may copy, distribute and transmit the data as long as you acknowledge the source through proper data citation. Disclaimer Whilst utmost care has been taken by ICRISAT and data authors while collecting and compiling the data, the data is however offered "as is" with no express or implied warranty. In no event shall the data authors, ICRISAT, or relevant funding agencies be liable for any actual, incidental or consequential damages arising from use of the data. By using the ICRISAT Dataverse, the user expressly acknowledges that the Data may contain some nonconformities, defects, or errors. No warranty is given that the data will meet the user's needs or expectations or that all nonconformities, defects, or errors can or will be corrected. The user should always verify actual data; therefore the user bears all responsibility in determining whether the data is fit for the user’s intended use. The user of the data should use the related publications as a baseline for their analysis whenever possible. Doing so will be an added safeguard against misinterpretation of the data. Related publications are listed in the metadata section of the Dataverse study. | ||
|
||
This database includes the research work carried out on development of medium duration pigeonpea cultivars including advanced varieties at ICRISAT Center, Patancheru (17°30'N 78°16'46E). Pigeon pea is a very important grain legume crop for food other uses in Asia and Africa. It is often cross-pollinated species with a diploid number of 2n= 2x22 and genome size of 858Mbp. Every year 50 to 100 and above new crosses (and also CMS hybrids) will be made evaluated in nurseries to develop new high yielding cultivars with adaptability to different climatic/agronomic zones. Based on their agronomic performance in nurseries for maturity time, branching pattern and number of branches, pod color, pod yield and other pest and diseases tolerance characters etc, the superior progenies will be selected and advanced to further generations (to F5s). The F5 progenies selected based on preliminary/nursery data will be evaluated along with controls in replicated (twice or thrice) trials every year for further agronomic evaluation and selection. The agronomic data (days to 50% flowering and/or maturity, plant height, grain yield, grain size and color etc) of the progenies evaluated in years 2015 were presented herewith. The trial details and plot sizes were given. This data helps us to select and advance further. Finally the few best progenies among them will be evaluated in on-farm trials (OFTs) and in multi-location trials. The best performed progenies will be considered to promote/release in respective agronomic zones. Experiment location on Google Map | ||
""", | ||
Any[], | ||
|
||
)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
register(DataDep( | ||
"Stack Overflow 2018 Developer Survey", | ||
""" | ||
Dataset: Stack Overflow 2018 Developer Survey | ||
Website: https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey | ||
Author: Stack Overflow | ||
Date: May 15, 2018 | ||
License: http://opendatacommons.org/licenses/dbcl/1.0/ | ||
|
||
### Context | ||
|
||
Each year, we at [Stack Overflow](https://stackoverflow.com/) ask the developer community about everything from their favorite technologies to their job preferences. This year marks the eighth year we’ve published our Annual Developer Survey results—with the largest number of respondents yet. Over 100,000 developers took the 30-minute survey in January 2018. | ||
|
||
This year, we covered a few new topics ranging from artificial intelligence to ethics in coding. We also found that underrepresented groups in tech responded to our survey at even lower rates than we would expect from their participation in the workforce. Want to dive into the results yourself and see what you can learn about salaries or machine learning or diversity in tech? We look forward to seeing what you find! | ||
|
||
### Content | ||
|
||
This 2018 Developer Survey results are organized on Kaggle in two tables: | ||
|
||
**survey_results_public** contains the main survey results, one respondent per row and one column per question | ||
|
||
**survey_results_schema** contains each column name from the main results along with the question text corresponding to that column | ||
|
||
There are 98,855 responses in this public data release. These responses are what we consider “qualified” for analytical purposes based on completion and time spent on the survey and included at least one non-PII question. Approximately 20,000 responses were started but not included here because respondents did not answer enough questions, or only answered questions with personally identifying information. Of the qualified responses, 67,441 completed the entire survey. | ||
|
||
### Acknowledgements | ||
|
||
Massive, heartfelt thanks to all Stack Overflow contributors and lurking developers of the world who took part in the survey this year. We value your generous participation more than you know. | ||
|
||
### Inspiration | ||
|
||
At Stack Overflow, we put developers first and want [all developers to feel welcome and included on our site](https://stackoverflow.blog/2018/04/26/stack-overflow-isnt-very-welcoming-its-time-for-that-to-change/). Can we use our annual survey to understand what kinds of users are less likely to identify as part of our community, participate, or feel kinship with fellow developers? Check out [our blog post](https://stackoverflow.blog/2018/05/30/public-data-release-of-stack-overflows-2018-developer-survey) for more details. | ||
""", | ||
String["https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey/downloads/stack-overflow-2018-developer-survey.zip/2", "https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey/downloads/survey_results_public.csv/2", "https://www.kaggle.com/stackoverflow/stack-overflow-2018-developer-survey/downloads/survey_results_schema.csv/2"], | ||
|
||
)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
register(DataDep( | ||
"Map of Co-Seismic Landslides for the M 7.8 Kaikoura, New Zealand Earthquake", | ||
""" | ||
Dataset: Map of Co-Seismic Landslides for the M 7.8 Kaikoura, New Zealand Earthquake | ||
Website: https://zenodo.org/record/1287281 | ||
Author: Valkaniotis Sotiris et al. | ||
Date: December 20, 2016 | ||
License: https://creativecommons.org/licenses/by/4.0/ | ||
|
||
Prepared by the Research Group on Earthquake Geology in Greece (http://eqgeogr.weebly.com/) Version 2 (updated) With the release of new Sentinel-2 images, and other available resources for the M7.8 Kaikoura earthquake, we present an update of the Map of Co-Seismic Landslides and Surfaces Ruptures (As of 27/11/2016). Landslides were mapped using Sentinel-2 satellite images from Copernicus, European Space Agency, dated November and December 2016. Images were visually compared with previous last available S2A images without cloud cover (13 September and 26 October) and landslides and large slope failures were manually mapped. Areas covered by cloud are omitted and shown on map. 5875 landslide sites are shown in the map. A small number of landslides could have been mis-identified due to insufficient resolution of the images, small gaps of cloud cover or for other reasons. Also, re-activated landslides on the central mountainous area were unabled to identify due to imagery restrictions (medium resolution, relief shadows etc). Some local gaps in Sentinel imagery still exist due to cloud cover, but we believe the current map is very close to the major distribution of mass movement effects. Surface ruptures were mapped using Sentinel-2 imagery and approximate position from photos of the post-earthquake aerial surveys of Environment Canterbury Regional Council (http://ecan.govt.nz) KML file contains7355 landslide spots. | ||
""", | ||
String["https://zenodo.org/api/files/5a311c7a-bd5e-4df7-be61-341d03ec9a9b/Landslide_Map_V2_A2.pdf", "https://zenodo.org/api/files/5a311c7a-bd5e-4df7-be61-341d03ec9a9b/Landslides_Kaikoura_2016.kmz", "https://zenodo.org/api/files/5a311c7a-bd5e-4df7-be61-341d03ec9a9b/Prelim_Landslide_Map_A2.jpg"], | ||
|
||
)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ tests = [ | |
"CKAN", | ||
"DataCite", | ||
"Figshare", | ||
"JSONLD/JSONLD", | ||
] | ||
|
||
@testset "DataDepGenerators" begin | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All exports go in src/DataDepsGenerators.jl
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is how I have been doing it for abstract types like
DataOnev2.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, it isn't a big deal, and it is trivial to change later, so I guess either is good.