Skip to content

Commit

Permalink
Parse in UTF-8, always, really fixes #605
Browse files Browse the repository at this point in the history
  • Loading branch information
gaborcsardi committed Nov 2, 2017
1 parent 6d2d998 commit 3450adb
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 3 deletions.
7 changes: 6 additions & 1 deletion R/source.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,12 @@ source_file <- function(path, env = test_env(), chdir = TRUE,

lines <- read_lines(path)
srcfile <- srcfilecopy(path, lines, file.info(path)[1, "mtime"], isFile = TRUE)
exprs <- parse(text = lines, n = -1, srcfile = srcfile)

## We need to parse from a connection because parse() has a bug: when the
## `text` argument is used, it converts the input to the native encoding.
exprs <- parse(
textConnection(lines, encoding = "UTF-8"),
n = -1, srcfile = srcfile, encoding = "UTF-8")

n <- length(exprs)
if (n == 0L) return(invisible())
Expand Down
4 changes: 2 additions & 2 deletions R/test-example.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ test_example <- function(path) {

env <- new.env(parent = globalenv())

ok <- test_code(path, parse(ex_path), env = env)
ok <- test_code(path, parse(ex_path, encoding = "UTF-8"), env = env)
if (ok) succeed(path)

invisible()
Expand All @@ -65,7 +65,7 @@ test_rd <- function(rd) {

env <- new.env(parent = globalenv())

ok <- test_code(path, parse(ex_path), env = env)
ok <- test_code(path, parse(ex_path, encoding = "UTF-8"), env = env)
if (ok) succeed(path)

invisible()
Expand Down
43 changes: 43 additions & 0 deletions tests/testthat/test-source.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
context("source")

test_that("source_file always uses UTF-8 encoding", {
  ## TRUE if LC_CTYPE can be switched to `name` without raising a warning
  ## or an error, FALSE otherwise.
  locale_available <- function(name) {
    tryCatch(
      {
        withr::with_locale(c(LC_CTYPE = name), "foobar")
        TRUE
      },
      warning = function(w) FALSE,
      error = function(e) FALSE
    )
  }

  ## Raw UTF-8 bytes of a phrase containing accented (multi-byte) letters
  utf8 <- as.raw(c(
    0xc3, 0xa1, 0x72, 0x76, 0xc3, 0xad, 0x7a, 0x74, 0xc5, 0xb1, 0x72, 0xc5,
    0x91, 0x20, 0x74, 0xc3, 0xbc, 0x6b, 0xc3, 0xb6, 0x72, 0x66, 0xc3, 0xba,
    0x72, 0xc3, 0xb3, 0x67, 0xc3, 0xa9, 0x70
  ))

  ## A temporary script that assigns that phrase, as a string literal, to `x`
  script <- tempfile()
  on.exit(unlink(script), add = TRUE)
  writeBin(c(charToRaw("x <- \""), utf8, charToRaw("\"\n")), script)

  ## Source the script under `locale` and check that `x` is still UTF-8,
  ## byte for byte. Locales missing on this platform are silently skipped.
  check_locale <- function(locale) {
    if (!has_locale_guard(locale)) {
      return(invisible())
    }

    target <- new.env()
    withr::with_locale(
      c(LC_CTYPE = locale),
      source_file(script, env = target, wrap = FALSE)
    )
    expect_equal(Encoding(target$x), "UTF-8")
    expect_equal(charToRaw(target$x), utf8)
  }
  has_locale_guard <- locale_available

  ## Try to read it in latin1 and UTF-8 locales.
  ## They have different names on Unix and Windows.
  fixed_locales <- c(
    "en_US.ISO8859-1",
    "en_US.UTF-8",
    "English_United States.1252",
    "German_Germany.1252"
  )
  for (locale_name in fixed_locales) {
    check_locale(locale_name)
  }

  ## Finally, whatever locale the session is currently using
  check_locale(Sys.getlocale("LC_CTYPE"))
})

0 comments on commit 3450adb

Please sign in to comment.