From b9cde49ddefacae5be421831836eec4de99bea1c Mon Sep 17 00:00:00 2001 From: hadley Date: Thu, 17 Aug 2017 11:10:45 -0500 Subject: [PATCH] Always read and write in UTF-8 * Helpers read_lines and write_lines do the right thing * readLines() and writeLines() through errors to prevent accidental re-use in the future * Warn if package encoding is not utf-8 Fixes #564. Fixes #592 --- DESCRIPTION | 1 + NEWS.md | 4 +++ R/enc.R | 8 ------ R/parse.R | 8 +++--- R/rd.R | 2 +- R/roxygenize.R | 5 ++++ R/safety.R | 6 ++-- R/utils-io.R | 16 +++++++++++ R/utils.R | 4 +-- tests/testthat/test-Rbuildignore.R | 5 ++-- tests/testthat/test-nonASCII.R | 19 ------------ tests/testthat/test-utf8.R | 35 +++++++++++++++++++++++ tests/testthat/testEagerData/DESCRIPTION | 1 + tests/testthat/testLazyData/DESCRIPTION | 1 + tests/testthat/testNonASCII/DESCRIPTION | 2 +- tests/testthat/testNonASCII/R/a.r | 9 ++---- tests/testthat/testUtf8Escape/DESCRIPTION | 8 ++++++ tests/testthat/testUtf8Escape/R/a.r | 4 +++ 18 files changed, 91 insertions(+), 47 deletions(-) delete mode 100644 R/enc.R create mode 100644 R/utils-io.R delete mode 100644 tests/testthat/test-nonASCII.R create mode 100644 tests/testthat/test-utf8.R create mode 100644 tests/testthat/testUtf8Escape/DESCRIPTION create mode 100644 tests/testthat/testUtf8Escape/R/a.r diff --git a/DESCRIPTION b/DESCRIPTION index 6edaae114..546ad2d1c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,6 +38,7 @@ Suggests: LinkingTo: Rcpp VignetteBuilder: knitr +Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 6.0.1.9000 Remotes: diff --git a/NEWS.md b/NEWS.md index 3cb7493f6..ed8b93514 100644 --- a/NEWS.md +++ b/NEWS.md @@ -25,6 +25,10 @@ * If a package logo exists (`man/figures/logo.png`) it will be automatically included in generated package docs (#609). +* roxygen2 now always reads and writes using UTF-8 encoding. If used with a + package that does not have `Encoding: UTF-8` in the DESCRIPTION, you'll + now get a warning (#564, #592). + * Usage for data objects now correctly generated, avoiding double escaping other components of usage (#562). diff --git a/R/enc.R b/R/enc.R deleted file mode 100644 index b7811ec79..000000000 --- a/R/enc.R +++ /dev/null @@ -1,8 +0,0 @@ -read_lines_enc <- function(path, file_encoding = "UTF-8", n = -1L, ok = TRUE, skipNul = FALSE) { - con <- file(path, encoding = file_encoding) - on.exit(close(con), add = TRUE) - - lines <- readLines(con, warn = FALSE, n = n, ok = ok, skipNul = skipNul) - Encoding(lines) <- "UTF-8" - lines -} diff --git a/R/parse.R b/R/parse.R index 73eb82dba..02533c48b 100644 --- a/R/parse.R +++ b/R/parse.R @@ -4,7 +4,7 @@ parse_package <- function(base_path, load_code, registry, global_options = list( files <- package_files(base_path) parsed <- lapply(files, parse_blocks, env = env, registry = registry, - global_options = global_options, fileEncoding = desc$Encoding %||% "UTF-8") + global_options = global_options) blocks <- unlist(parsed, recursive = FALSE) list(env = env, blocks = blocks) @@ -54,7 +54,7 @@ parse_code <- function(file, parse_text <- function(text, registry = default_tags(), global_options = list()) { file <- tempfile() - writeLines(text, file) + write_lines(text, file) on.exit(unlink(file)) env <- new.env(parent = parent.env(globalenv())) @@ -67,9 +67,9 @@ parse_text <- function(text, registry = default_tags(), global_options = list()) list(env = env, blocks = blocks) } -parse_blocks <- function(file, env, registry, global_options = list(), fileEncoding = "UTF-8") { +parse_blocks <- function(file, env, registry, global_options = list()) { - lines <- read_lines_enc(file, file_encoding = fileEncoding) + lines <- read_lines(file) parsed <- parse(text = lines, keep.source = TRUE, srcfile = srcfilecopy(file, lines, isFile = TRUE)) if (length(parsed) == 0) return() diff --git a/R/rd.R b/R/rd.R index a5acc59b9..0b57f200e 100644 --- a/R/rd.R +++ b/R/rd.R @@ -356,7 +356,7 @@ topic_add_examples <- function(topic, block, base_path) { next } - code <- readLines(path) + code <- read_lines(path) examples <- escape_examples(code) topic$add_simple_field("examples", examples) diff --git a/R/roxygenize.R b/R/roxygenize.R index 51df40a39..da84671c9 100644 --- a/R/roxygenize.R +++ b/R/roxygenize.R @@ -43,6 +43,11 @@ roxygenize <- function(package.dir = ".", dir.create(man_path, recursive = TRUE, showWarnings = FALSE) update_roxygen_version(base_path) + encoding <- desc::desc_get("Encoding", file = base_path)[[1]] + if (!identical(encoding, "UTF-8")) { + warning("roxygen2 requires Encoding: UTF-8", call. = FALSE) + } + options <- load_options(base_path) roclets <- roclets %||% options$roclets diff --git a/R/safety.R b/R/safety.R index bf6cdf67f..7d11d7e27 100644 --- a/R/safety.R +++ b/R/safety.R @@ -14,17 +14,17 @@ first_time <- function(path) { made_by_roxygen <- function(path) { if (!file.exists(path)) return(TRUE) - first <- readLines(path, n = 1) + first <- read_lines(path, n = 1) check_made_by(first) } add_made_by_roxygen <- function(path, comment) { if (!file.exists(path)) stop("Can't find ", path, call. = FALSE) - lines <- readLines(path, warn = FALSE) + lines <- read_lines(path) if (check_made_by(lines[1])) return() - writeLines(c(made_by(comment), lines), path) + write_lines(c(made_by(comment), lines), path) } check_made_by <- function(first) { diff --git a/R/utils-io.R b/R/utils-io.R new file mode 100644 index 000000000..fb305ded2 --- /dev/null +++ b/R/utils-io.R @@ -0,0 +1,16 @@ +readLines <- function(...) stop("Use read_lines!") +writeLines <- function(...) stop("Use write_lines!") + +read_lines <- function(path, n = -1L) { + con <- file(path, open = "r", encoding = "utf-8") + on.exit(close(con)) + + base::readLines(con, n = n, warn = FALSE) +} + +write_lines <- function(text, path) { + con <- file(path, open = "w", encoding = "utf-8") + on.exit(close(con)) + + base::writeLines(text, con) +} diff --git a/R/utils.R b/R/utils.R index 86955dec9..1c186a531 100644 --- a/R/utils.R +++ b/R/utils.R @@ -80,7 +80,7 @@ write_if_different <- function(path, contents, check = TRUE) { FALSE } else { cat(sprintf('Writing %s\n', name)) - writeLines(contents, path, useBytes = TRUE) + write_lines(contents, path) TRUE } } @@ -113,7 +113,7 @@ ignore_files <- function(rfiles, path) { rfiles_relative <- sub("^[/]*", "", rfiles_relative) # Remove any files that match any perl-compatible regexp - patterns <- readLines(rbuildignore, warn = FALSE) + patterns <- read_lines(rbuildignore) patterns <- patterns[patterns != ""] matches <- lapply(patterns, grepl, rfiles_relative, perl = TRUE) matches <- Reduce("|", matches) diff --git a/tests/testthat/test-Rbuildignore.R b/tests/testthat/test-Rbuildignore.R index 68b310d0b..071b0218f 100644 --- a/tests/testthat/test-Rbuildignore.R +++ b/tests/testthat/test-Rbuildignore.R @@ -6,8 +6,7 @@ test_that("roxygen ignores files with matching pattern in .Rbuildignore", { expect_equal(basename(package_files(test_pkg)), c("a.R", "ignore_me.R")) - #writeLines("^R/ignore_me.R$", file.path(test_pkg, ".Rbuildignore")) - writeChar("^R/ignore_me.R$\n", file.path(test_pkg, ".Rbuildignore"), eos = NULL) + write_lines("^R/ignore_me.R$\n", file.path(test_pkg, ".Rbuildignore")) expect_equal(basename(package_files(test_pkg)), "a.R") }) @@ -15,6 +14,6 @@ test_that("roxygen works with empty lines in .Rbuildignore", { test_pkg <- temp_copy_pkg(test_path("testRbuildignore")) on.exit(unlink(test_pkg, recursive = TRUE)) - writeChar("^R/ignore_me.R$\n\n.nonexistentfile", file.path(test_pkg, ".Rbuildignore"), eos = NULL) + write_lines("^R/ignore_me.R$\n\n.nonexistentfile", file.path(test_pkg, ".Rbuildignore")) expect_equal(basename(package_files(test_pkg)), "a.R") }) diff --git a/tests/testthat/test-nonASCII.R b/tests/testthat/test-nonASCII.R deleted file mode 100644 index 26202ceae..000000000 --- a/tests/testthat/test-nonASCII.R +++ /dev/null @@ -1,19 +0,0 @@ -context("nonASCII") - -test_that("can generate nonASCII document", { - test_pkg <- temp_copy_pkg('testNonASCII') - on.exit(unlink(test_pkg, recursive = TRUE), add = TRUE) - - expect_output(roxygenise(test_pkg, roclets = "rd"), "printChineseMsg[.]Rd") - expect_true(file.exists(file.path(test_pkg, "man", "printChineseMsg.Rd"))) - - cnChar <- read_lines_enc(file.path(test_pkg, "man", "printChineseMsg.Rd")) - - # Because the parse in testthat::test don't specify encoding to UTF-8 as well, - # so we have to use unicode escapes. - expect_true(any(grepl("\u6211\u7231\u4e2d\u6587", cnChar))) - expect_true(any(grepl("\u4e2d\u6587\u6ce8\u91ca", cnChar))) - - # No output on second run - expect_output(roxygenise(test_pkg, roclets = "rd"), NA) -}) diff --git a/tests/testthat/test-utf8.R b/tests/testthat/test-utf8.R new file mode 100644 index 000000000..53b72d683 --- /dev/null +++ b/tests/testthat/test-utf8.R @@ -0,0 +1,35 @@ +context("nonASCII") + +test_that("can generate nonASCII document", { + test_pkg <- temp_copy_pkg(test_path('testNonASCII')) + on.exit(unlink(test_pkg, recursive = TRUE), add = TRUE) + + expect_output(roxygenise(test_pkg, roclets = "rd"), "printChineseMsg[.]Rd") + + rd_path <- file.path(test_pkg, "man", "printChineseMsg.Rd") + expect_true(file.exists(rd_path)) + rd <- read_lines(rd_path) + + expect_true(any(grepl("\u6211\u7231\u4e2d\u6587", rd))) + expect_true(any(grepl("\u4e2d\u6587\u6ce8\u91ca", rd))) + + # Shouldn't change again + expect_output(roxygenise(test_pkg, roclets = "rd"), NA) +}) + + +test_that("unicode escapes are ok", { + test_pkg <- temp_copy_pkg(test_path('testUtf8Escape')) + on.exit(unlink(test_pkg, recursive = TRUE), add = TRUE) + + expect_output(roxygenise(test_pkg, roclets = "rd"), "a[.]Rd") + + rd_path <- file.path(test_pkg, "man", "a.Rd") + expect_true(file.exists(rd_path)) + rd <- read_lines(rd_path) + + expect_true(any(grepl("7\u00b0C", rd))) + + # Shouldn't change again + expect_output(roxygenise(test_pkg, roclets = "rd"), NA) +}) diff --git a/tests/testthat/testEagerData/DESCRIPTION b/tests/testthat/testEagerData/DESCRIPTION index 90100c7e6..6aab7f12c 100644 --- a/tests/testthat/testEagerData/DESCRIPTION +++ b/tests/testthat/testEagerData/DESCRIPTION @@ -5,3 +5,4 @@ Description: Author: Hadley Maintainer: Hadley Version: 0.1 +Encoding: UTF-8 diff --git a/tests/testthat/testLazyData/DESCRIPTION b/tests/testthat/testLazyData/DESCRIPTION index 25165991d..383391bee 100644 --- a/tests/testthat/testLazyData/DESCRIPTION +++ b/tests/testthat/testLazyData/DESCRIPTION @@ -6,3 +6,4 @@ Author: Hadley Maintainer: Hadley Version: 0.1 LazyData: TRUE +Encoding: UTF-8 diff --git a/tests/testthat/testNonASCII/DESCRIPTION b/tests/testthat/testNonASCII/DESCRIPTION index 9e5ca820e..ae54a67cb 100644 --- a/tests/testthat/testNonASCII/DESCRIPTION +++ b/tests/testthat/testNonASCII/DESCRIPTION @@ -4,5 +4,5 @@ License: GPL-2 Description: Author: Shrektan Maintainer: Shrektan -Encoding: GB2312 +Encoding: UTF-8 Version: 0.1 diff --git a/tests/testthat/testNonASCII/R/a.r b/tests/testthat/testNonASCII/R/a.r index 770d735a9..c76b2d89d 100644 --- a/tests/testthat/testNonASCII/R/a.r +++ b/tests/testthat/testNonASCII/R/a.r @@ -1,9 +1,6 @@ -# This script is intended to be saved in GB2312 to test if non UTF-8 encoding is -# supported. - -#' ÖÐÎÄ×¢ÊÍ +#' 中文注释 #' -#' @note ÎÒ°®ÖÐÎÄ¡£ +#' @note 我爱中文。 printChineseMsg <- function() { - message("ÎÒÊÇGB2312µÄÖÐÎÄ×Ö·û¡£") + message("我是UTF8的中文字符。") } diff --git a/tests/testthat/testUtf8Escape/DESCRIPTION b/tests/testthat/testUtf8Escape/DESCRIPTION new file mode 100644 index 000000000..c0c5348b7 --- /dev/null +++ b/tests/testthat/testUtf8Escape/DESCRIPTION @@ -0,0 +1,8 @@ +Package: testUtf8Escape +Title: Check that utf8 escapes are round tripped ok +License: GPL-2 +Description: +Author: Hadley +Maintainer: Hadley +Encoding: UTF-8 +Version: 0.1 diff --git a/tests/testthat/testUtf8Escape/R/a.r b/tests/testthat/testUtf8Escape/R/a.r new file mode 100644 index 000000000..4d88cd70c --- /dev/null +++ b/tests/testthat/testUtf8Escape/R/a.r @@ -0,0 +1,4 @@ +#' Title +#' +#' @param b Some label +a <- function(b = '7°C') 1