journal_article_details.Rmd

---
title: "R Notebook"
---

```{r}

library(furrr) # for parallel processing
library(lubridate)
library(tidyverse)
library(rdimensions) # in development: https://github.com/nicholasmfraser/rdimensions
library(rcrossref)
library(roadoi)

# Retrieve auth token for dimensions
dimensions_login()

```

# Retrieve journal article data from Dimensions

```{r}

# Generate the dimensions query string
dim_q <- '"(\\"coronavirus\\" OR \\"covid-19\\" OR \\"sars-cov\\" OR \\"ncov-2019\\" OR \\"2019-ncov\\" OR \\"hcov-19\\" OR \\"sars-2\\")"'

parse_character <- function(element) {
  if(length(element)) {
    return(as.character(element))
  } else {
    return(NA_character_)
  }
}
 
parse_numeric <- function(element) {
  if(length(element)) {
    return(as.numeric(element))
  } else {
    return(NA)
  }
}

# Function to parse Dimensions data to a data frame
dim_parse <- function(item) {
  tibble(
    doi = parse_character(item$doi),
    type = parse_character(item$type),
    year = parse_character(item$year),
    published_date = parse_character(item$date),
    title = parse_character(item$title),
    abstact = parse_character(item$abstract),
    journal = parse_character(item$journal$title),
    publisher = parse_character(item$publisher),
    times_cited = parse_numeric(item$times_cited)
  )
}

# Function for querying dimensions API between two dates
dim_data <- function(start_date, end_date) {
  
  # Determine how many total results we can expect
  query <- paste0('search publications in title_abstract_only for ', 
                  dim_q, 
                  ' where (date>="',
                  start_date,
                  '" and date<="',
                  end_date, 
                  '" and type="article"',
                  ') return publications')

  results <- dimensions_query(query)$`_stats`$total_count
  
  # Calculate number of query iterations required (results per page = 1000)
  iterations <- ceiling(results/1000)
  
  # Select fields we want to return
  fields <- c("doi",
              "type",
              "year",
              "title",
              "times_cited",
              "abstract",
              "journal",
              "date",
              "publisher")
  
  get_data <- function(i) {
    
    d <- dimensions_query(str_c(query,
                                '[', str_c(fields, collapse = "+"),
                                '] limit 1000 skip ', 
                                (i-1)*1000))$publications
    
    Sys.sleep(0.25)
  
    return(map_dfr(d, dim_parse))

  }
  
  # Iterate over results set and convert to data frame
  data <- map_dfr(1:iterations, get_data) 
  
  return(data)
  
}

# Create two vectors of dates - the start and end day for each month for which 
# data should be extract#ed
start_dates <- seq(ymd('2020-01-01'), ymd("2020-10-01"), by = '1 month')
end_dates <- seq(ymd('2020-02-01'), ymd("2020-11-01"), by = '1 month') - 1

# Retrieve publication data
dim_pubs <- map2_dfr(start_dates, end_dates, dim_data) %>%
  # clean doi for matching
  mutate(doi = str_trim(str_to_lower(doi))) %>%
  distinct()

```

# Add additional information from Crossref

```{r}
# Dimensions provides a field "date" which can be used for publication dates,
# but there appear to be a number of inaccuracies, e.g. >1000 papers with publication
# dates on 2020-01-01. Instead we can use Crossref to return more accurate dates
# using the "created" parameter.

# Some journal names are also missing in Dimensions, in particular for articles
# published by Elsevier. Unclear why this occurs.

# Caution: A single API call is made for every doi, so for large datasets this
# may take several hours to run. We can speed this up with parallelisation using
# the furrr package and future_map functions
dois <- dim_pubs %>% pull(doi)
# Set number of cores (use availableCores() to check max number)
plan(multisession, workers = 3)

cr_data <- future_map_dfr(dois, ~ cr_works(.x)$data)

```

```{r}

# Merge dimensions and crossref info
dim_cr_pubs <- dim_pubs %>%
  inner_join(cr_data %>% 
               select(doi, created, issued, published.online, published.print) %>%
               # clean doi for matching
               mutate(doi = str_trim(str_to_lower(doi))),
             by = "doi") %>%
  # select relevant fields
  select(doi, type, published_date, created, issued, published.online, published.print,
         title, journal, publisher, times_cited) %>%
  rename(
    dim_published_date = published_date,
    cr_created = created,
    cr_issued = issued,
    cr_published_online = published.online,
    cr_published_print = published.print
  ) %>%
  distinct()

```

# Detailed OA information from Unpaywall

```{r}

oadoi_fetch_safely <- safely(oadoi_fetch)

safe_dim_pubs_oa <- map(dim_cr_pubs$doi, ~ oadoi_fetch_safely(dois = .x, email = "n.fraser@zbw.eu"))

dim_pubs_oa <- safe_dim_pubs_oa %>%
  map_dfr(., function(x) tibble(
    doi = str_trim(str_to_lower(x$result$doi)),
    article_is_oa = x$result$is_oa,
    journal_is_oa = x$result$journal_is_oa
  )) %>%
  select(doi, article_is_oa, journal_is_oa)

```

# Create final dataset

```{r}

dim_cr_pubs %>%
  filter(cr_created >= "2020-01-01",
         cr_created <= "2020-10-31") %>%
  inner_join(dim_pubs_oa, by = "doi") %>%
  distinct() %>%
  write_csv("data/journal_articles_20200101_20201031.csv")

```