Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PUBDEV-4639: Added requirement for data.table. #4265

Merged
merged 14 commits into from
Feb 27, 2020
13 changes: 6 additions & 7 deletions h2o-core/src/main/java/water/api/DatasetServlet.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,17 @@ public class DatasetServlet extends HttpServlet {
protected void doGet(HttpServletRequest request, HttpServletResponse response) {
String uri = ServletUtils.getDecodedUri(request);
try {
boolean use_hex = false;
String f_name = request.getParameter("frame_id");
String hex_string = request.getParameter("hex_string");
String escape_quotes_string = request.getParameter("escape_quotes");
if (f_name == null) {
throw new RuntimeException("Cannot find value for parameter \'frame_id\'");
throw new RuntimeException("Cannot find value for parameter 'frame_id'");
}
if (hex_string != null && hex_string.toLowerCase().equals("true")) {
use_hex = true;
}

Frame dataset = DKV.getGet(f_name);
InputStream is = dataset.toCSV(new Frame.CSVStreamParams().setHexString(use_hex));
Frame.CSVStreamParams parms = new Frame.CSVStreamParams()
.setHexString(Boolean.parseBoolean(hex_string))
.setEscapeQuotes(Boolean.parseBoolean(escape_quotes_string));
InputStream is = dataset.toCSV(parms);
response.setContentType("application/octet-stream");
// Clean up the file name
int x = f_name.length() - 1;
Expand Down
11 changes: 9 additions & 2 deletions h2o-core/src/main/java/water/fvec/Frame.java
Original file line number Diff line number Diff line change
Expand Up @@ -1571,6 +1571,7 @@ public static class CSVStreamParams extends Iced<CSVStreamParams> {

boolean _headers = true;
boolean _hex_string = false;
boolean _escape_quotes = false;
char _separator = DEFAULT_SEPARATOR;

public CSVStreamParams setHeaders(boolean headers) {
Expand All @@ -1587,6 +1588,11 @@ public CSVStreamParams setSeparator(byte separator) {
_separator = (char) separator;
return this;
}

public CSVStreamParams setEscapeQuotes(boolean backslash_escape) {
_escape_quotes = backslash_escape;
return this;
}
}

public static class CSVStream extends InputStream {
Expand Down Expand Up @@ -1719,13 +1725,14 @@ else if (v.isString()) {
* @param unescapedString An unescaped {@link String} to escape
* @return String with escaped double-quotes, if found.
*/
private static String escapeQuotesForCsv(final String unescapedString) {
private String escapeQuotesForCsv(final String unescapedString) {
if (!_parms._escape_quotes) return unescapedString;
final Matcher matcher = DOUBLE_QUOTE_PATTERN.matcher(unescapedString);
return matcher.replaceAll("\"\"");
}

@Override
public int available() throws IOException {
public int available() {
// Case 1: There is more data left to read from the current line.
if (_position != _line.length) {
return _line.length - _position;
Expand Down
95 changes: 47 additions & 48 deletions h2o-r/h2o-package/R/frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@
#` E$nrow <- the row count (total size, generally much larger than the local cached rows)
#` E$types <- the H2O column types

# since we only import data.table via requireNamespace this is required for data.table calls to
# stop pretending to being data.frame and start behaving as data.table
.datatable.aware = TRUE

#-----------------------------------------------------------------------------------------------------------------------
# Private/Internal Functions
#-----------------------------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -1559,7 +1563,7 @@ NULL
}

if( is1by1 ) .fetch.data(data,1L)[[1]]
else data
else data
}

#' @rdname H2OFrame-Extract
Expand Down Expand Up @@ -3273,6 +3277,7 @@ destination_frame.guess <- function(x) {
#' It is possible to control just \code{\link[data.table]{fread}} or \code{\link[data.table]{fwrite}} with \code{options("h2o.fread"=FALSE, "h2o.fwrite"=FALSE)}.
#' \code{h2o.fread} and \code{h2o.fwrite} options are not handled in this function but next to \emph{fread} and \emph{fwrite} calls.
#' @export
#' @importFrom utils installed.packages
#' @seealso \code{\link{as.h2o.data.frame}}, \code{\link{as.data.frame.H2OFrame}}
#' @examples
#' op <- options("h2o.use.data.table" = TRUE)
Expand All @@ -3284,21 +3289,21 @@ destination_frame.guess <- function(x) {
#' options(op)
use.package <- function(package,
version="1.9.8"[package=="data.table"],
use=getOption("h2o.use.data.table", FALSE)[package=="data.table"]) {
use=getOption("h2o.use.data.table", TRUE)[package=="data.table"]) {
## methods that depends on use.package default arguments (to have control in single place):
# as.h2o.data.frame
# as.data.frame.H2OFrame
stopifnot(is.character(package), length(package)==1L,
is.character(version), length(version)==1L,
is.logical(use), length(use)==1L)

# if (package=="data.table" && use) { # not sure if this is needed. Keeping it for now.
# if (!("bit64" %in% rownames(installed.packages())) || (packageVersion("bit64") < as.package_version("0.9.7"))) {
# # print out warning to install bit64 in order to use data.table
# warning("data.table cannot be used without R package bit64 version 0.9.7 or higher. Please upgrade to take advangage of data.table speedups.")
# return(FALSE)
# }
# }
if (package=="data.table" && use) { # not sure if this is needed. Keeping it for now.
if (!("bit64" %in% rownames(installed.packages())) || (packageVersion("bit64") < as.package_version("0.9.7"))) {
# print out warning to install bit64 in order to use data.table
warning("data.table cannot be used without R package bit64 version 0.9.7 or higher. Please upgrade to take advangage of data.table speedups.")
return(FALSE)
}
}
use && requireNamespace(package, quietly=TRUE) && (packageVersion(package) >= as.package_version(version))
}

Expand Down Expand Up @@ -3419,21 +3424,17 @@ as.h2o.Matrix <- function(x, destination_frame="", ...) {
}
.key.validate(destination_frame)

tmpf <- tempfile(fileext = ".svm")
if (use.package("data.table") && use.package("slam", version="0.1.40", TRUE)) {
drs <- slam::as.simple_triplet_matrix(x)# need to convert sparse matrix x to a simple triplet matrix format
thefile <- tempfile()
.h2o.write_stm_svm(drs, file = thefile)
h2f <<- h2o.uploadFile(thefile, parse_type = "SVMLight", destination_frame=destination_frame)
unlink(thefile)
h2f[, -1] # remove the first column
drs <- slam::as.simple_triplet_matrix(x)
.h2o.write_stm_svm(drs, file = tmpf)
} else {
warning("as.h2o can be slow for large sparse matrices. Install packages data.table and slam to speed up as.h2o.")
tmpf <- tempfile(fileext = ".svm")
warning("as.h2o can be slow for large sparse matrices. Install packages data.table and slam to speed up as.h2o.")
.h2o.write.matrix.svmlight(x, file = tmpf)
h2f <- .h2o.readSVMLight(tmpf, destination_frame = destination_frame)
file.remove(tmpf)
h2f
}
h2f <- .h2o.readSVMLight(tmpf, destination_frame = destination_frame)
file.remove(tmpf)
h2f # remove the first column
}

.h2o.write.matrix.svmlight <- function(matrix, file) {
Expand All @@ -3450,41 +3451,36 @@ as.h2o.Matrix <- function(x, destination_frame="", ...) {
})
}

.h2o.calc_stm_svm <- function(stm, y){
.h2o.calc_stm_svm <- function(stm) {
# Convert a simple triplet matrix to svm format
# author Peter Ellis
# return a character vector of length n
# fixed bug to return rows of zeros instead of repeating other rows
# returns a character vector of length y ready for writing in svm format
# returns a character vector of length n ready for writing in svm format
if(!"simple_triplet_matrix" %in% class(stm)){
stop("stm must be a simple triple matrix")
}
if(!is.vector(y) | nrow(stm) != length(y)){
stop("y should be a vector of length equal to number of rows of stm")
}
n <- length(y)

# data table solution thanks to roland
n <- nrow(stm)
rowLeft <- setdiff(c(1:n), unique(stm$i))
nrowLeft <- length(rowLeft)
i=NULL # serves no purpose except to pass the R cmd cran check
j=NULL
v=NULL
jv=NULL
stm2 <- data.table::data.table(i = c(stm$i,rowLeft), j = c(stm$j,rep(1,nrowLeft)), v = c(stm$v,rep(0,nrowLeft)))
res <- stm2[, list(i, jv = paste(j, v, sep = ":"))][order(i), list(res = paste(jv, collapse = " ")), by = i][["res"]]

out <- paste(y, res)

return(out)
}

.h2o.write_stm_svm <- function(stm, y = rep(1, nrow(stm)), file){
all.rows <- 1:max(stm2$i)
rows.having.first.col <- stm2$i[which(stm2$j == 1)]
rows.missing.first.col <- setdiff(all.rows, rows.having.first.col)
stm2.fill <- data.table::data.table(i = rows.missing.first.col, j = 1, v = 0)
stm2 <- rbind(stm2.fill, stm2)
res <- stm2[, list(i, jv = ifelse(j==1,v,paste(j-1, v, sep = ":")))
][order(i), list(res = paste(jv, collapse = " ")), by = i
][["res"]]
return(res)
}

.h2o.write_stm_svm <- function(stm, file) {
# param stm a simple triplet matrix (class exported slam) of features (ie explanatory variables)
# param y a vector of labels. If not provided, a dummy of 1s is provided
# param file file to write to.
# author Peter Ellis
out <- .h2o.calc_stm_svm(stm, y)
out <- .h2o.calc_stm_svm(stm)
writeLines(out, con = file)
}

Expand Down Expand Up @@ -3534,10 +3530,13 @@ as.data.frame.H2OFrame <- function(x, ...) {
# Versions of R prior to 3.1 should not use hex string.
# Versions of R including 3.1 and later should use hex string.
useHexString <- getRversion() >= "3.1"

# We cannot use data.table by default since its handling of escaping inside quoted csv values is not very good
# in some edge cases its simply impossible to load data in correct format without additional post processing
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What specifically do you mean by "not very good"? This is unhelpful to write in this way. Tell me what's wrong and I'll fix it. Probably already known somewhere and I'll raise the priority. We're on the same team here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for being so ambiguous. On second thought I should have been more to the point and link to existing issue.
This is the issue I was reffering to:
This is the issue: Rdatatable/data.table#1109
Here is the test that I was not able to make pass with data.table even with changes on the back-end: https://github.com/h2oai/h2o-3/pull/4265/files#diff-c87a495277fa01b186f5300c48973bf6R33

useDataTable <- getOption("h2o.fread", FALSE) && use.package("data.table")
urlSuffix <- paste0('DownloadDataset',
'?frame_id=', URLencode( h2o.getId(x)),
'&hex_string=', as.numeric(useHexString))
'?frame_id=', URLencode(h2o.getId(x)),
'&hex_string=', ifelse(useHexString, "true", "false"),
'&escape_quotes=', ifelse(useDataTable, "false", "true"))

verbose <- getOption("h2o.verbose", FALSE)

Expand Down Expand Up @@ -3569,16 +3568,16 @@ as.data.frame.H2OFrame <- function(x, ...) {
ttt <- .writeBinToTmpFile(payload)
}
if (verbose) cat(sprintf("fetching from h2o frame to R using '.h2o.doSafeGET' took %.2fs\n", proc.time()[[3]]-pt))

if (verbose) pt <- proc.time()[[3]]
if (getOption("h2o.fread", TRUE) && use.package("data.table")) {
df <- data.table::fread(ttt, blank.lines.skip = FALSE, na.strings = "", colClasses = colClasses, showProgress=FALSE, data.table=FALSE, ...)
if (useDataTable) {
if (identical(colClasses, NA_character_) || identical(colClasses, "")) colClasses <- NULL # workaround for data.table length-1 bug #4237 fixed in v1.12.9
df <- data.table::fread(ttt, sep = ",", blank.lines.skip = FALSE, na.strings = "", colClasses = colClasses, showProgress=FALSE, data.table=FALSE, ...)
if (sum(dates))
for (i in which(dates)) data.table::setattr(df[[i]], "class", "POSIXct")
fun <- "fread"
} else {
# Substitute NAs for blank cells rather than skipping
if(useCon){
if (useCon) {
df <- read.csv((tcon <- textConnection(ttt)), blank.lines.skip = FALSE, na.strings = "", colClasses = colClasses, ...)
close(tcon)
} else {
Expand Down
42 changes: 17 additions & 25 deletions h2o-r/tests/testdir_misc/runit_as.frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,32 +23,24 @@ test <- function() {
print(sprintf("nrow(Nhex): %d", nrow(Nhex)))
print(sprintf("nrow(x): %d", nrow(x)))
expect_that(nrow(Nhex), equals(nrow(x)))

# Quote writing
original <- data.frame(
ngram = c(
"SIRET:417 653 698",
"SIRET:417 653 698 00031",
"Sans",
"Sans esc.",
"Sans esc. jusqu\"\"au", # Two quotes in line
"Sans esc. jusqu\"au 15.11.2018"
)
)
print("Original data")
print(original)

h2o_fr <- as.h2o(original)
print("H2O Frame")
print(h2o_fr)

as_df <- as.data.frame(h2o_fr)
print("As data frame:")
print(as_df)

expect_true(all(as_df == original))



df <- data.frame(
c1 = c(1.1, 2.22, 3.345, 4.678, 5.098765),
c2 = c("one", "with, sep", "with\"\"quotes\"", "\"", "quoted\",\"sep")
)

# options(h2o.fread=TRUE) # uncomment to test with data-table but it will fail
frames <- list(df, data.frame(c=df[, 2]))
for (original in frames) {
print("Original:")
print(original)
h2o_fr <- as.h2o(original)
as_df <- as.data.frame(h2o_fr)
print("Converted:")
print(as_df)
expect_true(all(as_df == original))
}
}

doTest("Test data frame", test)
Expand Down