Skip to content

Commit

Permalink
Closes #568. fread gains encoding arg. Fixes encoding issue on windows.
Browse files Browse the repository at this point in the history
  • Loading branch information
arunsrinivasan committed Aug 25, 2015
1 parent 4ef5fba commit 43f2dcd
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 6 deletions.
10 changes: 8 additions & 2 deletions R/fread.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@

fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.strings="NA",stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),autostart=1L,skip=0L,select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"),dec=if (sep!=".") "." else ",", check.names=FALSE, showProgress=getOption("datatable.showProgress"),data.table=getOption("datatable.fread.datatable")) {
fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.strings="NA",stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),autostart=1L,skip=0L,select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"),dec=if (sep!=".") "." else ",", check.names=FALSE, encoding="unknown", showProgress=getOption("datatable.showProgress"),data.table=getOption("datatable.fread.datatable")) {
if (!is.character(dec) || length(dec)!=1L || nchar(dec)!=1) stop("dec must be a single character e.g. '.' or ','")
# handle encoding, #568
if (missing(encoding)) {
encoding = NULL
} else if (!encoding %in% c("unknown", "UTF-8", "Latin-1")) {
stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.")
}
if (getOption("datatable.fread.dec.experiment") && Sys.localeconv()["decimal_point"] != dec) {
oldlocale = Sys.getlocale("LC_NUMERIC")
if (verbose) cat("dec='",dec,"' but current locale ('",oldlocale,"') has dec='",Sys.localeconv()["decimal_point"],"'. Attempting to change locale to one that has the desired decimal point.\n",sep="")
Expand Down Expand Up @@ -75,7 +81,7 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str
if (identical(header,"auto")) header=NA
if (identical(sep,"auto")) sep=NULL
if (is.atomic(colClasses) && !is.null(names(colClasses))) colClasses = tapply(names(colClasses),colClasses,c,simplify=FALSE)
ans = .Call(Creadfile,input,sep,as.integer(nrows),header,na.strings,verbose,as.integer(autostart),skip,select,drop,colClasses,integer64,dec,as.integer(showProgress))
ans = .Call(Creadfile,input,sep,as.integer(nrows),header,na.strings,verbose,as.integer(autostart),skip,select,drop,colClasses,integer64,dec,encoding,as.integer(showProgress))
nr = length(ans[[1]])
if ( integer64=="integer64" && !exists("print.integer64") && any(sapply(ans,inherits,"integer64")) )
warning("Some columns have been read as type 'integer64' but package bit64 isn't loaded. Those columns will display as strange looking floating point data. There is no need to reload the data. Just require(bit64) to obtain the integer64 print method and print the data again.")
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@

26. `merge.data.table` gains arguments `by.x` and `by.y`. Closes [#637](https://github.com/Rdatatable/data.table/issues/637) and [#1130](https://github.com/Rdatatable/data.table/issues/1130). No copies are made even when the specified columns aren't key columns in data.tables, and it is therefore much faster and more memory efficient. Thanks to @blasern for the initial PRs.

27. `fread()` gains `encoding` argument. Acceptable values are "unknown", "UTF-8" and "Latin-1" with default value of "unknown". Closes [#568](https://github.com/Rdatatable/data.table/issues/568). Thanks to @BenMarwick for the original report, and to the many others who requested this, including the question on SO.

#### BUG FIXES

1. `if (TRUE) DT[,LHS:=RHS]` no longer prints, [#869](https://github.com/Rdatatable/data.table/issues/869) and [#1122](https://github.com/Rdatatable/data.table/issues/1122). Tests added. To get this to work we've had to live with one downside: if a `:=` is used inside a function with no `DT[]` before the end of the function, then the next time `DT` or `print(DT)` is typed at the prompt, nothing will be printed. A repeated `DT` or `print(DT)` will print. To avoid this: include a `DT[]` after the last `:=` in your function. If that is not possible (e.g., it's not a function you can change) then `DT[]` at the prompt is guaranteed to print. As before, adding an extra `[]` on the end of a `:=` query is a recommended idiom to update and then print; e.g. `> DT[,foo:=3L][]`. Thanks to Jureiss and Jan Gorecki for reporting.
Expand Down
6 changes: 6 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -6734,6 +6734,12 @@ test(1546, set(df1, grep("^[ ]*$", df1$cats), 1L, NA_integer_), df2)
# foo() takes two named parameters plus `...` and simply calls getdots().
# The test expects the two extra arguments (a=2L and "c") back as the
# character vector c("2", "c").
# NOTE(review): getdots() is defined elsewhere; presumably it returns the
# caller's `...` arguments as character -- confirm against its definition.
foo <- function(x, y, ...) { getdots() }
test(1547, foo(1L, 5L, a=2L, "c"), c("2", "c"))

# Fix for encoding issues on Windows, #568
# TODO: these tests only check the Encoding() mark of each column read; consider
# also comparing the values read against the exact expected strings.
# Input is a two-column CSV (header A,B) whose data fields are non-ASCII characters.
text="A,B\ną,ž\nū,į\nų,ė\nš,ę\n"
# 'encoding' not supplied: every column is expected to be marked "unknown".
test(1548.1, unique(unlist(lapply(fread(text, sep=",", header=TRUE), Encoding))), "unknown")
# encoding="UTF-8": every column is expected to be marked "UTF-8".
test(1548.2, unique(unlist(lapply(fread(text, sep=",", header=TRUE, encoding="UTF-8"), Encoding))), "UTF-8")

##########################


Expand Down
3 changes: 2 additions & 1 deletion man/fread.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ stringsAsFactors=FALSE, verbose=getOption("datatable.verbose"), autostart=1L,
skip=0L, select=NULL, drop=NULL, colClasses=NULL,
integer64=getOption("datatable.integer64"), # default: "integer64"
dec=if (sep!=".") "." else ",",
check.names=FALSE,
check.names=FALSE, encoding="unknown",
showProgress=getOption("datatable.showProgress"), # default: TRUE
data.table=getOption("datatable.fread.datatable") # default: TRUE
)
Expand All @@ -36,6 +36,7 @@ data.table=getOption("datatable.fread.datatable") # default: TRUE
\item{integer64}{ "integer64" (default) reads columns detected as containing integers larger than 2^31 as type \code{bit64::integer64}. Alternatively, \code{"double"|"numeric"} reads as \code{base::read.csv} does; i.e., possibly with loss of precision and if so silently. Or, "character". }
\item{dec}{ The decimal separator as in \code{base::read.csv}. If not "." (default) then usually ",". See details. }
\item{check.names}{ default is \code{FALSE}. If \code{TRUE}, it uses the base function \code{\link{make.unique}} to ensure that column names are all unique.}
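\item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. When \code{"UTF-8"} or \code{"Latin-1"} is given, the character strings read are marked with that encoding (see \code{\link{Encoding}}). }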
\item{showProgress}{ TRUE displays progress on the console using \code{\\r}. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. }
\item{data.table}{ TRUE returns a \code{data.table}. FALSE returns a \code{data.frame}. }
}
Expand Down
21 changes: 18 additions & 3 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ static SEXP coerceVectorSoFar(SEXP v, int oldtype, int newtype, R_len_t sofar, R
return(newv);
}

SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastrings, SEXP verbosearg, SEXP autostart, SEXP skip, SEXP select, SEXP drop, SEXP colClasses, SEXP integer64, SEXP dec, SEXP showProgressArg)
SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastrings, SEXP verbosearg, SEXP autostart, SEXP skip, SEXP select, SEXP drop, SEXP colClasses, SEXP integer64, SEXP dec, SEXP encoding, SEXP showProgressArg)
// can't be named fread here because that's already a C function (from which the R level fread function took its name)
{
SEXP thiscol, ans, thisstr;
Expand All @@ -424,7 +424,19 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
verbose=LOGICAL(verbosearg)[0];
clock_t t0 = clock();
ERANGEwarning = FALSE; // just while detecting types, then TRUE before the read data loop


// Encoding, #568: Borrowed from do_setencoding from base R
// https://github.com/wch/r-source/blob/ca5348f0b5e3f3c2b24851d7aff02de5217465eb/src/main/util.c#L1115
// Search for the mkCharLenCE function to locate where this is implemented.
cetype_t ienc;
Rboolean is_no_encoding = TRUE;
if (!isNull(encoding)) {
is_no_encoding = FALSE;
if (!strcmp(CHAR(STRING_ELT(encoding, 0)), "Latin-1")) ienc = CE_LATIN1;
else if (!strcmp(CHAR(STRING_ELT(encoding, 0)), "UTF-8")) ienc = CE_UTF8;
else ienc = CE_NATIVE;
}

// Extra tracing for apparent 32bit Windows problem: https://github.com/Rdatatable/data.table/issues/1111
if (!isInteger(showProgressArg)) error("showProgress is not type integer but type '%s'. Please report.", type2char(TYPEOF(showProgressArg)));
if (LENGTH(showProgressArg)!=1) error("showProgress is not length 1 but length %d. Please report.", LENGTH(showProgressArg));
Expand Down Expand Up @@ -1083,7 +1095,10 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
SET_VECTOR_ELT(ans, resj, thiscol = coerceVectorSoFar(thiscol, type[j]++, SXP_STR, i, j));
case SXP_STR: case SXP_NULL: case_SXP_STR:
Field(1);
if (type[j]==SXP_STR) SET_STRING_ELT(thiscol, i, mkCharLen(fieldStart, fieldLen));
if (type[j]==SXP_STR) {
SET_STRING_ELT(thiscol, i, (is_no_encoding) ?
mkCharLen(fieldStart, fieldLen) : mkCharLenCE(fieldStart, fieldLen, ienc));
}
}
if (ch<eof && *ch==sep && j<ncol-1) {ch++; continue;} // done, next field
if (j<ncol-1) {
Expand Down

0 comments on commit 43f2dcd

Please sign in to comment.