From a4cfc619eac9726c15934e9f180bd7db3dcb46bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Thu, 2 Nov 2017 11:39:52 +0000 Subject: [PATCH] Parse in UTF-8, always, really fixes #605 (#671) --- R/source.R | 7 +++++- R/test-example.R | 4 ++-- tests/testthat/test-source.R | 43 ++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 tests/testthat/test-source.R diff --git a/R/source.R b/R/source.R index 7e4f430ee..a15a75135 100644 --- a/R/source.R +++ b/R/source.R @@ -24,7 +24,12 @@ source_file <- function(path, env = test_env(), chdir = TRUE, lines <- read_lines(path) srcfile <- srcfilecopy(path, lines, file.info(path)[1, "mtime"], isFile = TRUE) - exprs <- parse(text = lines, n = -1, srcfile = srcfile) + + ## We need to parse from a connection, because parse() has a bug, + ## and converts the input to the native encoding, if the text arg is used + exprs <- parse( + textConnection(lines, encoding = "UTF-8"), + n = -1, srcfile = srcfile, encoding = "UTF-8") n <- length(exprs) if (n == 0L) return(invisible()) diff --git a/R/test-example.R b/R/test-example.R index fdef41b71..0306374f8 100644 --- a/R/test-example.R +++ b/R/test-example.R @@ -49,7 +49,7 @@ test_example <- function(path) { env <- new.env(parent = globalenv()) - ok <- test_code(path, parse(ex_path), env = env) + ok <- test_code(path, parse(ex_path, encoding = "UTF-8"), env = env) if (ok) succeed(path) invisible() @@ -65,7 +65,7 @@ test_rd <- function(rd) { env <- new.env(parent = globalenv()) - ok <- test_code(path, parse(ex_path), env = env) + ok <- test_code(path, parse(ex_path, encoding = "UTF-8"), env = env) if (ok) succeed(path) invisible() diff --git a/tests/testthat/test-source.R b/tests/testthat/test-source.R new file mode 100644 index 000000000..227dbc763 --- /dev/null +++ b/tests/testthat/test-source.R @@ -0,0 +1,43 @@ +context("source") + +test_that("source_file always uses UTF-8 encoding", { + has_locale <- function(l) 
{ + has <- TRUE + tryCatch( + withr::with_locale(c(LC_CTYPE = l), "foobar"), + warning = function(w) has <<- FALSE, + error = function(e) has <<- FALSE + ) + has + } + + ## Some text in UTF-8 + tmp <- tempfile() + on.exit(unlink(tmp), add = TRUE) + utf8 <- as.raw(c( + 0xc3, 0xa1, 0x72, 0x76, 0xc3, 0xad, 0x7a, 0x74, 0xc5, 0xb1, 0x72, 0xc5, + 0x91, 0x20, 0x74, 0xc3, 0xbc, 0x6b, 0xc3, 0xb6, 0x72, 0x66, 0xc3, 0xba, + 0x72, 0xc3, 0xb3, 0x67, 0xc3, 0xa9, 0x70 + )) + writeBin(c(charToRaw("x <- \""), utf8, charToRaw("\"\n")), tmp) + + run_test <- function(locale) { + if (has_locale(locale)) { + env <- new.env() + withr::with_locale( + c(LC_CTYPE = locale), + source_file(tmp, env = env, wrap = FALSE) + ) + expect_equal(Encoding(env$x), "UTF-8") + expect_equal(charToRaw(env$x), utf8) + } + } + + ## Try to read it in latin1 and UTF-8 locales + ## They have different names on Unix and Windows + run_test("en_US.ISO8859-1") + run_test("en_US.UTF-8") + run_test("English_United States.1252") + run_test("German_Germany.1252") + run_test(Sys.getlocale("LC_CTYPE")) +})