-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathset_TextType.R
124 lines (114 loc) · 3.82 KB
/
set_TextType.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
## Developer note: Some functions not fully compatible with pandoc < 2.0
#' set_TextType
#'
#' For any EML element of class TextType, this function can be used to generate
#' the appropriate EML either from a plain character string or from a
#' formatted (e.g. markdown) file.
#' @param file path to a file providing formatted input text, see details.
#' @param text a plain text character string which will be used directly as the content
#'  of the node. When both `text` and `file` are supplied, `text` takes
#'  precedence and `file` is ignored.
#' @return a TextType object that can be coerced into any element inheriting from TextType, see examples.
#'  When `text` is given it is returned unchanged; otherwise the result is a
#'  list with elements `section` and `para` extracted from the docbook
#'  representation of `file`.
#' @importFrom tools file_ext
#' @importFrom methods as
#' @details If the `rmarkdown` package is installed, then the input file can
#'  be a Microsoft Word (.docx) file, a markdown file, or other file
#'  recognized by Pandoc (see https://pandoc.org), which will automate the conversion
#'  to docbook. Otherwise, the input file should already be in docbook format (with
#'  .xml or .dbk extension). Note that pandoc comes pre-installed in RStudio and is
#'  required for the rmarkdown package.
#'
#'  An error is raised when neither `file` nor `text` is supplied.
#' @export
#' @examples
#' \donttest{
#' ## using a simple character string
#' a <- set_TextType(text = "This is the abstract")
#'
#' ## Using an external markdown file
#' f <- system.file("examples/hf205-abstract.md", package = "EML")
#' a <- set_TextType(f)
#'
#' ## Can also import from methods written in a .docx MS Word file.
#' f <- system.file("examples/hf205-abstract.docx", package = "EML")
#' a <- set_TextType(f)
#'
#' ## Documents with title headings use `section` instead of `para` notation
#' f <- system.file("examples/hf205-methods.docx", package = "EML")
#' d <- set_TextType(f)
#' }
#'
set_TextType <- function(file = NULL, text = NULL) {
  ## Plain text wins over a file and is passed through untouched.
  if (!is.null(text)) {
    return(text)
  }
  ## Without this guard the function would fall through with nothing to
  ## return and fail with an unhelpful "object not found" error.
  if (is.null(file)) {
    stop("Either `file` or `text` must be provided", call. = FALSE)
  }
  docbook <- to_docbook(file)
  list(
    section = set_section(docbook),
    para = set_para(docbook)
  )
}
## Collect the top-level sections of a docbook document.
##
## `docbook` is the parsed document handed over by the caller (queried here
## with xml2). Returns a list with one entry per section found directly under
## `/article`; each entry is a single string made of the section's child
## nodes, serialised and joined with newlines.
#' @importFrom xml2 xml_find_all xml_children xml_contents read_xml
#' @importFrom utils compareVersion
#' @importFrom rmarkdown pandoc_version
set_section <- function(docbook) {
  ## The section tag name differs between pandoc versions (`sect1` vs
  ## `section`), and a docbook file supplied directly never went through the
  ## locally installed pandoc at all. Match both tag names instead of guessing
  ## from the installed pandoc version.
  sections <- xml2::xml_find_all(docbook, "/article/section | /article/sect1")
  lapply(
    sections,
    function(x) {
      paste(lapply(xml2::xml_children(x), as.character), collapse = "\n")
    }
  )
}
## Collect the paragraphs that sit directly under `/article` in a docbook
## document. Returns a list with one entry per `para` node, each entry being
## the character form of that node's contents.
set_para <- function(docbook) {
  paragraphs <- xml2::xml_find_all(docbook, "/article/para")
  para_to_text <- function(node) {
    as.character(xml2::xml_contents(node))
  }
  lapply(paragraphs, para_to_text)
}
## Read `file` as a docbook XML document.
##
## Files with an `xml`, `dbk` or `db` extension are parsed directly. Any other
## file is first converted to docbook with pandoc (through rmarkdown) inside
## the session temporary directory, and the converted result is parsed.
##
## Stops with an error when `file` is not a single path, when a file that
## needs conversion does not exist, or when rmarkdown / pandoc is unavailable.
## Returns the parsed document after namespace stripping.
#' @importFrom xml2 xml_ns_strip
to_docbook <- function(file = NULL) {
  if (!is.character(file) || length(file) != 1L || is.na(file)) {
    stop("`file` must be a single path to an input file", call. = FALSE)
  }
  if (tools::file_ext(file) %in% c("xml", "dbk", "db")) {
    ## File is already xml/docbook, so no need for pandoc
    docbook <- xml2::read_xml(file)
  } else {
    ## Not xml yet, so use pandoc to generate docbook
    if (!file.exists(file)) {
      stop("Input file does not exist: ", file, call. = FALSE)
    }
    if (!requireNamespace("rmarkdown", quietly = TRUE)) {
      stop("rmarkdown package required to convert to Docbook format",
        call. = FALSE
      )
    }
    if (!rmarkdown::pandoc_available()) {
      stop(paste("Pandoc is required to convert to Docbook format.",
                 "Please supply input text directly"),
        call. = FALSE
      )
    }
    pandoc_convert <-
      getExportedValue("rmarkdown", "pandoc_convert")
    dir <- tempdir()
    ## Copy before changing directory so a relative `file` still resolves.
    file.copy(file, file.path(dir, basename(file)), overwrite = TRUE)
    ## Register the restore immediately: if the conversion below fails the
    ## caller's working directory must still be put back.
    wd <- setwd(dir)
    on.exit(setwd(wd), add = TRUE)
    docbook_file <- tempfile(tmpdir = ".", fileext = ".xml")
    ## Remove the intermediate docbook file on exit. The absolute path is used
    ## because the working directory has been restored by then.
    on.exit(unlink(file.path(dir, basename(docbook_file))), add = TRUE)
    pandoc_convert(
      basename(file),
      to = "docbook",
      output = normalizePath(docbook_file, winslash = "/", mustWork = FALSE),
      options = "-s"
    )
    docbook <- xml2::read_xml(docbook_file)
  }
  ## Unlike EML, treat this as literal!
  ## The result of xml_ns_strip() is deliberately not assigned; the call is
  ## relied on to update `docbook` itself.
  xml2::xml_ns_strip(docbook)
  docbook
}