-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjournal_article_details.Rmd
197 lines (148 loc) · 5.21 KB
/
journal_article_details.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
---
title: "R Notebook"
---
```{r}
library(furrr) # for parallel processing
library(lubridate)
library(tidyverse)
library(rdimensions) # in development: https://github.com/nicholasmfraser/rdimensions
library(rcrossref)
library(roadoi)
# Retrieve auth token for dimensions
# NOTE(review): presumably this has to run before any dimensions_query() call
# further down, and needs Dimensions API credentials to be configured for
# rdimensions - confirm against the rdimensions documentation.
dimensions_login()
```
# Retrieve journal article data from Dimensions
```{r}
# Generate the dimensions query string
# This is the search expression that dim_data() splices into its query after
# "for". The runtime value is itself wrapped in double quotes, and every
# keyword inside it is wrapped in backslash-escaped double quotes (written as
# \\" in this single-quoted R literal). The keywords are combined with OR.
dim_q <- '"(\\"coronavirus\\" OR \\"covid-19\\" OR \\"sars-cov\\" OR \\"ncov-2019\\" OR \\"2019-ncov\\" OR \\"hcov-19\\" OR \\"sars-2\\")"'
# Coerce an API field to character.
# A field that is absent (NULL) or has length zero becomes a single
# NA_character_; anything else is passed through as.character().
parse_character <- function(element) {
  if (length(element) == 0) {
    return(NA_character_)
  }
  as.character(element)
}
# Coerce an API field to numeric.
# A field that is absent (NULL) or has length zero becomes a single NA_real_,
# so the result is always of type double regardless of whether the value was
# present (mirrors parse_character(), which returns NA_character_).
parse_numeric <- function(element) {
  if (length(element) == 0) {
    return(NA_real_)
  }
  as.numeric(element)
}
# Function to parse one Dimensions publication record to a data frame.
#
# `item` is a single element of the `publications` list returned by
# dimensions_query(). Each field is run through parse_character() /
# parse_numeric(), so a field that is missing from the record becomes NA and
# every record produces the same nine columns.
dim_parse <- function(item) {
  tibble(
    doi = parse_character(item$doi),
    type = parse_character(item$type),
    year = parse_character(item$year),
    published_date = parse_character(item$date),
    title = parse_character(item$title),
    abstract = parse_character(item$abstract),
    # journal is a nested record; only its title is kept
    journal = parse_character(item$journal$title),
    publisher = parse_character(item$publisher),
    times_cited = parse_numeric(item$times_cited)
  )
}
# Function for querying dimensions API between two dates.
#
# Arguments:
#   start_date, end_date  Bounds of the date filter (both inclusive: the query
#                         uses >= and <=). They are pasted into the query text
#                         as-is.
#
# Returns the rows produced by dim_parse() for every publication of type
# "article" matching the global search expression `dim_q` in the date range,
# bound into one data frame. When the query has zero results no page request
# is made and the result is empty.
dim_data <- function(start_date, end_date) {
  # Results per page. Kept as an integer so that the skip offset below is
  # never rendered in scientific notation (a double 100000 pastes as "1e+05").
  page_size <- 1000L

  # Base query; run once without a field list to determine how many total
  # results we can expect
  query <- paste0('search publications in title_abstract_only for ',
                  dim_q,
                  ' where (date>="',
                  start_date,
                  '" and date<="',
                  end_date,
                  '" and type="article"',
                  ') return publications')
  results <- dimensions_query(query)$`_stats`$total_count

  # Calculate number of query iterations required
  iterations <- ceiling(results / page_size)

  # Select fields we want to return
  fields <- c("doi",
              "type",
              "year",
              "title",
              "times_cited",
              "abstract",
              "journal",
              "date",
              "publisher")

  # Fetch page `i` (1-based) and convert its records to a data frame
  get_data <- function(i) {
    offset <- (i - 1L) * page_size
    d <- dimensions_query(str_c(query,
                                '[', str_c(fields, collapse = "+"), ']',
                                ' limit ', page_size,
                                ' skip ', offset))$publications
    # Short pause between requests (presumably to respect API rate limits)
    Sys.sleep(0.25)
    map_dfr(d, dim_parse)
  }

  # Iterate over results set and convert to data frame. seq_len() gives an
  # empty sequence for zero iterations, whereas 1:0 would count down (1, 0)
  # and issue two bogus requests.
  map_dfr(seq_len(iterations), get_data)
}
# Create two vectors of dates - the start and end day for each month for which
# data should be extracted
# First day of each month, January to October 2020
start_dates <- seq(from = ymd("2020-01-01"), to = ymd("2020-10-01"), by = "1 month")
# Last day of each of those months (first day of the following month minus one)
end_dates <- seq(from = ymd("2020-02-01"), to = ymd("2020-11-01"), by = "1 month") - 1

# Retrieve publication data month by month and stack the results
dim_pubs <- map2_dfr(start_dates, end_dates, dim_data) %>%
  # normalise the doi (lower case, surrounding whitespace removed) for matching
  mutate(doi = doi %>% str_to_lower() %>% str_trim()) %>%
  # drop rows that are exact duplicates
  distinct()
```
# Add additional information from Crossref
```{r}
# Dimensions provides a field "date" which can be used for publication dates,
# but there appear to be a number of inaccuracies, e.g. >1000 papers with publication
# dates on 2020-01-01. Instead we can use Crossref to return more accurate dates
# using the "created" parameter.
# Some journal names are also missing in Dimensions, in particular for articles
# published by Elsevier. Unclear why this occurs.
# Caution: A single API call is made for every doi, so for large datasets this
# may take several hours to run. We can speed this up with parallelisation using
# the furrr package and future_map functions
# DOIs to look up in Crossref. Records without a DOI cannot be resolved and
# repeated DOIs would only cause redundant API calls (one call is made per
# DOI), so both are removed first. The later merge ends in distinct(), so
# this does not change the merged result.
dois <- dim_pubs %>%
  filter(!is.na(doi)) %>%
  distinct(doi) %>%
  pull(doi)

# Set number of cores (use availableCores() to check max number)
plan(multisession, workers = 3)

# One cr_works() call per DOI, run in parallel; the `data` element of each
# response is row-bound into a single data frame
cr_data <- future_map_dfr(dois, ~ cr_works(.x)$data)
```
```{r}
# Merge dimensions and crossref info.
# Only DOIs present in both sources are kept (inner join). Date columns are
# prefixed with their source: dim_ for Dimensions, cr_ for Crossref.
dim_cr_pubs <- dim_pubs %>%
  inner_join(
    cr_data %>%
      # clean doi for matching
      mutate(doi = str_trim(str_to_lower(doi))) %>%
      select(doi, created, issued, published.online, published.print),
    by = "doi"
  ) %>%
  # select relevant fields, renaming the date columns in the same step
  select(
    doi,
    type,
    dim_published_date = published_date,
    cr_created = created,
    cr_issued = issued,
    cr_published_online = published.online,
    cr_published_print = published.print,
    title,
    journal,
    publisher,
    times_cited
  ) %>%
  distinct()
```
# Detailed OA information from Unpaywall
```{r}
# Wrap oadoi_fetch() so that a failing lookup yields list(result, error)
# instead of aborting the whole run
oadoi_fetch_safely <- safely(oadoi_fetch)

# One Unpaywall lookup per distinct DOI (repeated DOIs would only repeat the
# same request; the final dataset is de-duplicated after joining).
# NOTE(review): the email literal below looks like a redacted placeholder, not
# a real address - replace it with a valid contact email before running.
safe_dim_pubs_oa <- map(
  unique(dim_cr_pubs$doi),
  ~ oadoi_fetch_safely(dois = .x, email = "[email protected]")
)

# Surface failed lookups instead of dropping them silently
n_failed <- sum(map_lgl(safe_dim_pubs_oa, ~ !is.null(.x$error)))
if (n_failed > 0) {
  warning(
    n_failed, " of ", length(safe_dim_pubs_oa),
    " Unpaywall lookups failed and are missing from dim_pubs_oa",
    call. = FALSE
  )
}

# Keep the OA flags of each successful lookup; a failed lookup has a NULL
# result and therefore contributes zero rows
dim_pubs_oa <- safe_dim_pubs_oa %>%
  map_dfr(function(x) tibble(
    doi = str_trim(str_to_lower(x$result$doi)),
    article_is_oa = x$result$is_oa,
    journal_is_oa = x$result$journal_is_oa
  )) %>%
  select(doi, article_is_oa, journal_is_oa)
```
# Create final dataset
```{r}
# Restrict to articles whose Crossref "created" date falls in the study window
# (2020-01-01 to 2020-10-31, inclusive), attach the Unpaywall OA flags, drop
# exact duplicate rows and write the result to disk
dim_cr_pubs %>%
  filter(cr_created >= "2020-01-01" & cr_created <= "2020-10-31") %>%
  inner_join(dim_pubs_oa, by = "doi") %>%
  distinct() %>%
  write_csv("data/journal_articles_20200101_20201031.csv")
```