-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathASCIIfy.R
83 lines (81 loc) · 2.79 KB
/
ASCIIfy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#' Convert Characters to ASCII
#'
#' Convert character vector to ASCII, replacing non-ASCII characters with
#' single-byte (\samp{\x00}) or two-byte (\samp{\u0000}) codes.
#'
#' @param x a character vector, possibly containing non-ASCII characters.
#' @param bytes either \code{1} or \code{2}, for single-byte (\samp{\x00}) or
#' two-byte (\samp{\u0000}) codes.
#' @param fallback an output character to use, when input characters cannot be
#' converted. Only the first element is used.
#'
#' @return
#' A character vector of the same length as \code{x}, without names, where
#' non-ASCII characters have been replaced with \samp{\x00} or \samp{\u0000}
#' codes. Missing values stay \code{NA}, empty strings stay empty, and a
#' zero-length \code{x} gives a zero-length result.
#'
#' A warning is issued for every character that cannot be represented with the
#' requested number of bytes; such characters are replaced with
#' \code{fallback}.
#'
#' @note
#' To render single backslashes, use these or similar techniques: \preformatted{
#' write(ASCIIfy(x), "file.txt")
#' cat(paste(ASCIIfy(x), collapse="\n"), "\n", sep="")}
#'
#' The resulting strings are plain ASCII and can be used in R functions and
#' datasets to improve package portability.
#'
#' @author Arni Magnusson.
#'
#' @seealso
#' \code{\link[tools]{showNonASCII}} identifies non-ASCII characters in a
#' character vector.
#'
#' @keywords utilities character
#'
#' @examples
#' cities <- c("S\u00e3o Paulo", "Reykjav\u00edk")
#' print(cities)
#' ASCIIfy(cities, 1)
#' ASCIIfy(cities, 2)
#'
#' athens <- "\u0391\u03b8\u03ae\u03bd\u03b1"
#' print(athens)
#' ASCIIfy(athens)
#'
#' @export
ASCIIfy <- function(x, bytes = 2, fallback = "?") {
  # Accept 1, 2, "1" or "2"; keep the result as an integer for comparisons
  bytes <- as.integer(match.arg(as.character(bytes), c("1", "2")))

  # The fallback is pasted into the output, so it must be a single string
  stopifnot(length(fallback) >= 1L)
  fallback <- as.character(fallback)[1L]

  # Largest code point and escape format for the requested width
  if (bytes == 1L) {
    max_code <- 255L
    escape_format <- "\\x%02x"
    width_label <- "1 byte"
  } else {
    max_code <- 65535L
    escape_format <- "\\u%04x"
    width_label <- "2 bytes"
  }

  # Convert one character to ASCII, e.g. "z", "\xfe", or "\u00fe"
  convert <- function(char) {
    raw <- charToRaw(char)
    if (length(raw) == 1L) {
      # Single byte: either 7-bit ASCII or an 8-bit native-encoding character
      code <- as.integer(raw)
      if (code <= 127L) {
        return(char)
      }
    } else {
      # Multi-byte: utf8ToInt() gives the code point, or NA if it is invalid
      code <- utf8ToInt(char)
    }
    if (length(code) == 1L && !is.na(code) && code <= max_code) {
      return(sprintf(escape_format, code))
    }
    warning(char, " could not be converted to ", width_label, call. = FALSE)
    fallback
  }

  # Convert one string, character by character: "caf<e9>" -> "caf\\u00e9"
  convert_string <- function(string) {
    if (is.na(string)) {
      return(NA_character_)
    }
    chars <- strsplit(string, "")[[1L]]
    # Drop empty pieces so that "" never reaches convert()
    chars <- chars[nzchar(chars)]
    pieces <- vapply(chars, convert, character(1L), USE.NAMES = FALSE)
    paste(pieces, collapse = "")
  }

  # vapply() guarantees a character vector, also for zero-length input
  vapply(x, convert_string, character(1L), USE.NAMES = FALSE)
}