From c37d392fd9fed426a09c5be39570315ee0967bf3 Mon Sep 17 00:00:00 2001 From: wendycwong Date: Sat, 2 Jun 2018 16:36:12 -0700 Subject: [PATCH 01/14] PUBDEV-4639: use data.table if possible. --- h2o-r/h2o-package/R/frame.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index c2da57a3a871..95154539281e 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -3292,13 +3292,13 @@ use.package <- function(package, is.character(version), length(version)==1L, is.logical(use), length(use)==1L) - # if (package=="data.table" && use) { # not sure if this is needed. Keeping it for now. - # if (!("bit64" %in% rownames(installed.packages())) || (packageVersion("bit64") < as.package_version("0.9.7"))) { - # # print out warning to install bit64 in order to use data.table - # warning("data.table cannot be used without R package bit64 version 0.9.7 or higher. Please upgrade to take advangage of data.table speedups.") - # return(FALSE) - # } - # } + if (package=="data.table" && use) { # not sure if this is needed. Keeping it for now. + if (!("bit64" %in% rownames(installed.packages())) || (packageVersion("bit64") < as.package_version("0.9.7"))) { + # print out warning to install bit64 in order to use data.table + warning("data.table cannot be used without R package bit64 version 0.9.7 or higher. 
Please upgrade to take advangage of data.table speedups.") + return(FALSE) + } + } use && requireNamespace(package, quietly=TRUE) && (packageVersion(package) >= as.package_version(version)) } From fdb4434d5661a59bf582331545530e4f07085373 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 10 Feb 2020 23:36:28 -0700 Subject: [PATCH 02/14] workaround for fread length-1 colClasses=NA_character_ bug #4237 --- h2o-r/h2o-package/R/frame.R | 1 + 1 file changed, 1 insertion(+) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index 95154539281e..7b373f49470c 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -3572,6 +3572,7 @@ as.data.frame.H2OFrame <- function(x, ...) { if (verbose) pt <- proc.time()[[3]] if (getOption("h2o.fread", TRUE) && use.package("data.table")) { + if (identical(colClasses, NA_character_) || identical(colClasses, "")) colClasses <- NULL # workaround for data.table length-1 bug #4237 fixed in v1.12.9 df <- data.table::fread(ttt, blank.lines.skip = FALSE, na.strings = "", colClasses = colClasses, showProgress=FALSE, data.table=FALSE, ...) 
if (sum(dates)) for (i in which(dates)) data.table::setattr(df[[i]], "class", "POSIXct") From 48c6dc50029c114b6a9e9f648e7be2d8a61de81c Mon Sep 17 00:00:00 2001 From: Jan Sterba Date: Fri, 14 Feb 2020 22:28:01 +0100 Subject: [PATCH 03/14] actually change the default value --- h2o-r/h2o-package/R/frame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index 7b373f49470c..9c4ee5422dd3 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -3284,7 +3284,7 @@ destination_frame.guess <- function(x) { #' options(op) use.package <- function(package, version="1.9.8"[package=="data.table"], - use=getOption("h2o.use.data.table", FALSE)[package=="data.table"]) { + use=getOption("h2o.use.data.table", TRUE)[package=="data.table"]) { ## methods that depends on use.package default arguments (to have control in single place): # as.h2o.data.frame # as.data.frame.H2OFrame From 1436a66dc134a34c7830b7aff32286a284c80c51 Mon Sep 17 00:00:00 2001 From: Jan Sterba Date: Fri, 14 Feb 2020 22:34:08 +0100 Subject: [PATCH 04/14] fix R CRAN check --- h2o-r/h2o-package/R/frame.R | 1 + 1 file changed, 1 insertion(+) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index 9c4ee5422dd3..38e21a32aa0d 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -3273,6 +3273,7 @@ destination_frame.guess <- function(x) { #' It is possible to control just \code{\link[data.table]{fread}} or \code{\link[data.table]{fwrite}} with \code{options("h2o.fread"=FALSE, "h2o.fwrite"=FALSE)}. #' \code{h2o.fread} and \code{h2o.fwrite} options are not handled in this function but next to \emph{fread} and \emph{fwrite} calls. 
#' @export +#' @importFrom utils installed.packages #' @seealso \code{\link{as.h2o.data.frame}}, \code{\link{as.data.frame.H2OFrame}} #' @examples #' op <- options("h2o.use.data.table" = TRUE) From 004057588ccf6940ac37b11169fbd39635e537f7 Mon Sep 17 00:00:00 2001 From: Jan Sterba Date: Fri, 21 Feb 2020 10:35:04 +0100 Subject: [PATCH 05/14] fix as.h2o.Matrix --- h2o-r/h2o-package/R/frame.R | 46 +++++++++++++------------------------ 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index 38e21a32aa0d..54d165763623 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -1559,7 +1559,7 @@ NULL } if( is1by1 ) .fetch.data(data,1L)[[1]] - else data + else data } #' @rdname H2OFrame-Extract @@ -3420,21 +3420,17 @@ as.h2o.Matrix <- function(x, destination_frame="", ...) { } .key.validate(destination_frame) + tmpf <- tempfile(fileext = ".svm") if (use.package("data.table") && use.package("slam", version="0.1.40", TRUE)) { drs <- slam::as.simple_triplet_matrix(x)# need to convert sparse matrix x to a simple triplet matrix format - thefile <- tempfile() - .h2o.write_stm_svm(drs, file = thefile) - h2f <<- h2o.uploadFile(thefile, parse_type = "SVMLight", destination_frame=destination_frame) - unlink(thefile) - h2f[, -1] # remove the first column + .h2o.write_stm_svm(drs, file = tmpf) } else { - warning("as.h2o can be slow for large sparse matrices. Install packages data.table and slam to speed up as.h2o.") - tmpf <- tempfile(fileext = ".svm") + warning("as.h2o can be slow for large sparse matrices. 
Install packages data.table and slam to speed up as.h2o.") .h2o.write.matrix.svmlight(x, file = tmpf) - h2f <- .h2o.readSVMLight(tmpf, destination_frame = destination_frame) - file.remove(tmpf) - h2f } + h2f <- .h2o.readSVMLight(tmpf, destination_frame = destination_frame) + file.remove(tmpf) + h2f # remove the first column } .h2o.write.matrix.svmlight <- function(matrix, file) { @@ -3451,21 +3447,13 @@ as.h2o.Matrix <- function(x, destination_frame="", ...) { }) } -.h2o.calc_stm_svm <- function(stm, y){ +.h2o.calc_stm_svm <- function(stm) { # Convert a simple triplet matrix to svm format - # author Peter Ellis - # return a character vector of length n - # fixed bug to return rows of zeros instead of repeating other rows - # returns a character vector of length y ready for writing in svm format + # returns a character vector of length n ready for writing in svm format if(!"simple_triplet_matrix" %in% class(stm)){ stop("stm must be a simple triple matrix") } - if(!is.vector(y) | nrow(stm) != length(y)){ - stop("y should be a vector of length equal to number of rows of stm") - } - n <- length(y) - - # data table solution thanks to roland + n <- nrow(stm) rowLeft <- setdiff(c(1:n), unique(stm$i)) nrowLeft <- length(rowLeft) i=NULL # serves no purpose except to pass the R cmd cran check @@ -3473,19 +3461,17 @@ as.h2o.Matrix <- function(x, destination_frame="", ...) 
{ v=NULL jv=NULL stm2 <- data.table::data.table(i = c(stm$i,rowLeft), j = c(stm$j,rep(1,nrowLeft)), v = c(stm$v,rep(0,nrowLeft))) - res <- stm2[, list(i, jv = paste(j, v, sep = ":"))][order(i), list(res = paste(jv, collapse = " ")), by = i][["res"]] - - out <- paste(y, res) - - return(out) + res <- stm2[, list(i, jv = ifelse(j==1,v,paste(j-1, v, sep = ":"))) + ][order(i), list(res = paste(jv, collapse = " ")), by = i + ][["res"]] + return(res) } -.h2o.write_stm_svm <- function(stm, y = rep(1, nrow(stm)), file){ +.h2o.write_stm_svm <- function(stm, file) { # param stm a simple triplet matrix (class exported slam) of features (ie explanatory variables) - # param y a vector of labels. If not provided, a dummy of 1s is provided # param file file to write to. # author Peter Ellis - out <- .h2o.calc_stm_svm(stm, y) + out <- .h2o.calc_stm_svm(stm) writeLines(out, con = file) } From 7484cd57dff5187e642589e30ab43af10ca7f093 Mon Sep 17 00:00:00 2001 From: Jan Sterba Date: Fri, 21 Feb 2020 11:14:40 +0100 Subject: [PATCH 06/14] bring back lost change also try to fix as.data.frame --- h2o-r/h2o-package/R/frame.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index 54d165763623..09ce36c75d42 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -35,6 +35,10 @@ #` E$nrow <- the row count (total size, generally much larger than the local cached rows) #` E$types <- the H2O column types +# since we only import data.table via requireNamespace this is required for data.table calls to +# stop pretending to being data.frame and start behaving as data.table +.datatable.aware = TRUE + #----------------------------------------------------------------------------------------------------------------------- # Private/Internal Functions #----------------------------------------------------------------------------------------------------------------------- @@ -3560,7 +3564,7 @@ 
as.data.frame.H2OFrame <- function(x, ...) { if (verbose) pt <- proc.time()[[3]] if (getOption("h2o.fread", TRUE) && use.package("data.table")) { if (identical(colClasses, NA_character_) || identical(colClasses, "")) colClasses <- NULL # workaround for data.table length-1 bug #4237 fixed in v1.12.9 - df <- data.table::fread(ttt, blank.lines.skip = FALSE, na.strings = "", colClasses = colClasses, showProgress=FALSE, data.table=FALSE, ...) + df <- data.table::fread(ttt, sep = ",", blank.lines.skip = FALSE, na.strings = "", colClasses = colClasses, showProgress=FALSE, data.table=FALSE, ...) if (sum(dates)) for (i in which(dates)) data.table::setattr(df[[i]], "class", "POSIXct") fun <- "fread" From 22f0f1cb9c353fd8f829ee94a928ae6f9def3661 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sat, 22 Feb 2020 04:35:50 -0700 Subject: [PATCH 07/14] debugged and fixed changes in this PR to .h2o.calc_stm_svm by minimally reverting; unrelated to data.table per se --- h2o-r/h2o-package/R/frame.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index 09ce36c75d42..c9d3823e67ce 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -3434,7 +3434,7 @@ as.h2o.Matrix <- function(x, destination_frame="", ...) { } h2f <- .h2o.readSVMLight(tmpf, destination_frame = destination_frame) file.remove(tmpf) - h2f # remove the first column + h2f[,-1] # remove the first column } .h2o.write.matrix.svmlight <- function(matrix, file) { @@ -3465,10 +3465,10 @@ as.h2o.Matrix <- function(x, destination_frame="", ...) 
{ v=NULL jv=NULL stm2 <- data.table::data.table(i = c(stm$i,rowLeft), j = c(stm$j,rep(1,nrowLeft)), v = c(stm$v,rep(0,nrowLeft))) - res <- stm2[, list(i, jv = ifelse(j==1,v,paste(j-1, v, sep = ":"))) - ][order(i), list(res = paste(jv, collapse = " ")), by = i - ][["res"]] - return(res) + res <- stm2[, list(i, #jv = ifelse(j==1,v,paste(j-1, v, sep = ":"))) + jv = paste(j, v, sep = ":")) + ][order(i), list(res = paste(jv, collapse = " ")), by = i] + return(paste(res[["i"]], res[["res"]])) } .h2o.write_stm_svm <- function(stm, file) { From 5d8a723ee26482b312f318ad1d784da843702d5d Mon Sep 17 00:00:00 2001 From: Jan Sterba Date: Sat, 22 Feb 2020 14:33:04 +0100 Subject: [PATCH 08/14] Revert "debugged and fixed changes in this PR to .h2o.calc_stm_svm by minimally reverting; unrelated to data.table per se" This reverts commit 668b7dc78f6279c69e0a23bd6ae2bc3fee3e6a85. --- h2o-r/h2o-package/R/frame.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index c9d3823e67ce..09ce36c75d42 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -3434,7 +3434,7 @@ as.h2o.Matrix <- function(x, destination_frame="", ...) { } h2f <- .h2o.readSVMLight(tmpf, destination_frame = destination_frame) file.remove(tmpf) - h2f[,-1] # remove the first column + h2f # remove the first column } .h2o.write.matrix.svmlight <- function(matrix, file) { @@ -3465,10 +3465,10 @@ as.h2o.Matrix <- function(x, destination_frame="", ...) 
{ v=NULL jv=NULL stm2 <- data.table::data.table(i = c(stm$i,rowLeft), j = c(stm$j,rep(1,nrowLeft)), v = c(stm$v,rep(0,nrowLeft))) - res <- stm2[, list(i, #jv = ifelse(j==1,v,paste(j-1, v, sep = ":"))) - jv = paste(j, v, sep = ":")) - ][order(i), list(res = paste(jv, collapse = " ")), by = i] - return(paste(res[["i"]], res[["res"]])) + res <- stm2[, list(i, jv = ifelse(j==1,v,paste(j-1, v, sep = ":"))) + ][order(i), list(res = paste(jv, collapse = " ")), by = i + ][["res"]] + return(res) } .h2o.write_stm_svm <- function(stm, file) { From 7b91d9d1ea910e53963682d6d39f584611ec7609 Mon Sep 17 00:00:00 2001 From: Jan Sterba Date: Sat, 22 Feb 2020 15:18:37 +0100 Subject: [PATCH 09/14] fix matrix to svm serialization --- h2o-r/h2o-package/R/frame.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index 09ce36c75d42..a3aa728493b6 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -3426,7 +3426,7 @@ as.h2o.Matrix <- function(x, destination_frame="", ...) { tmpf <- tempfile(fileext = ".svm") if (use.package("data.table") && use.package("slam", version="0.1.40", TRUE)) { - drs <- slam::as.simple_triplet_matrix(x)# need to convert sparse matrix x to a simple triplet matrix format + drs <- slam::as.simple_triplet_matrix(x) .h2o.write_stm_svm(drs, file = tmpf) } else { warning("as.h2o can be slow for large sparse matrices. Install packages data.table and slam to speed up as.h2o.") @@ -3465,6 +3465,11 @@ as.h2o.Matrix <- function(x, destination_frame="", ...) 
{ v=NULL jv=NULL stm2 <- data.table::data.table(i = c(stm$i,rowLeft), j = c(stm$j,rep(1,nrowLeft)), v = c(stm$v,rep(0,nrowLeft))) + all.rows <- 1:max(stm2$i) + rows.having.first.col <- stm2$i[which(stm2$j == 1)] + rows.missing.first.col <- setdiff(all.rows, rows.having.first.col) + stm2.fill <- data.table::data.table(i = rows.missing.first.col, j = 1, v = 0) + stm2 <- rbind(stm2.fill, stm2) res <- stm2[, list(i, jv = ifelse(j==1,v,paste(j-1, v, sep = ":"))) ][order(i), list(res = paste(jv, collapse = " ")), by = i ][["res"]] From d84c8c4f23e77137a21c950d6970a5bef204d85d Mon Sep 17 00:00:00 2001 From: Jan Sterba Date: Sun, 23 Feb 2020 22:57:49 +0100 Subject: [PATCH 10/14] more fixes for data.table --- .../main/java/water/api/DatasetServlet.java | 13 +++--- h2o-core/src/main/java/water/fvec/Frame.java | 11 ++++- h2o-r/h2o-package/R/frame.R | 12 +++--- h2o-r/tests/testdir_misc/runit_as.frame.R | 42 ++++++++----------- 4 files changed, 38 insertions(+), 40 deletions(-) diff --git a/h2o-core/src/main/java/water/api/DatasetServlet.java b/h2o-core/src/main/java/water/api/DatasetServlet.java index 3d4a3bb837c1..00c860c81b18 100644 --- a/h2o-core/src/main/java/water/api/DatasetServlet.java +++ b/h2o-core/src/main/java/water/api/DatasetServlet.java @@ -20,18 +20,17 @@ public class DatasetServlet extends HttpServlet { protected void doGet(HttpServletRequest request, HttpServletResponse response) { String uri = ServletUtils.getDecodedUri(request); try { - boolean use_hex = false; String f_name = request.getParameter("frame_id"); String hex_string = request.getParameter("hex_string"); + String escape_quotes_string = request.getParameter("escape_quotes"); if (f_name == null) { - throw new RuntimeException("Cannot find value for parameter \'frame_id\'"); + throw new RuntimeException("Cannot find value for parameter 'frame_id'"); } - if (hex_string != null && hex_string.toLowerCase().equals("true")) { - use_hex = true; - } - Frame dataset = DKV.getGet(f_name); - InputStream is = 
dataset.toCSV(new Frame.CSVStreamParams().setHexString(use_hex)); + Frame.CSVStreamParams parms = new Frame.CSVStreamParams() + .setHexString(Boolean.parseBoolean(hex_string)) + .setEscapeQuotes(Boolean.parseBoolean(escape_quotes_string)); + InputStream is = dataset.toCSV(parms); response.setContentType("application/octet-stream"); // Clean up the file name int x = f_name.length() - 1; diff --git a/h2o-core/src/main/java/water/fvec/Frame.java b/h2o-core/src/main/java/water/fvec/Frame.java index 0ccfd0622a55..5a6a37b4ec17 100644 --- a/h2o-core/src/main/java/water/fvec/Frame.java +++ b/h2o-core/src/main/java/water/fvec/Frame.java @@ -1571,6 +1571,7 @@ public static class CSVStreamParams extends Iced { boolean _headers = true; boolean _hex_string = false; + boolean _escape_quotes = false; char _separator = DEFAULT_SEPARATOR; public CSVStreamParams setHeaders(boolean headers) { @@ -1587,6 +1588,11 @@ public CSVStreamParams setSeparator(byte separator) { _separator = (char) separator; return this; } + + public CSVStreamParams setEscapeQuotes(boolean backslash_escape) { + _escape_quotes = backslash_escape; + return this; + } } public static class CSVStream extends InputStream { @@ -1719,13 +1725,14 @@ else if (v.isString()) { * @param unescapedString An unescaped {@link String} to escape * @return String with escaped double-quotes, if found. */ - private static String escapeQuotesForCsv(final String unescapedString) { + private String escapeQuotesForCsv(final String unescapedString) { + if (!_parms._escape_quotes) return unescapedString; final Matcher matcher = DOUBLE_QUOTE_PATTERN.matcher(unescapedString); return matcher.replaceAll("\"\""); } @Override - public int available() throws IOException { + public int available() { // Case 1: There is more data left to read from the current line. 
if (_position != _line.length) { return _line.length - _position; diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index a3aa728493b6..f589408215ce 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -3530,10 +3530,11 @@ as.data.frame.H2OFrame <- function(x, ...) { # Versions of R prior to 3.1 should not use hex string. # Versions of R including 3.1 and later should use hex string. useHexString <- getRversion() >= "3.1" - + useDataTable <- getOption("h2o.fread", FALSE) && use.package("data.table") urlSuffix <- paste0('DownloadDataset', - '?frame_id=', URLencode( h2o.getId(x)), - '&hex_string=', as.numeric(useHexString)) + '?frame_id=', URLencode(h2o.getId(x)), + '&hex_string=', ifelse(useHexString, "true", "false"), + '&escape_quotes=', ifelse(useDataTable, "false", "true")) verbose <- getOption("h2o.verbose", FALSE) @@ -3565,9 +3566,8 @@ as.data.frame.H2OFrame <- function(x, ...) { ttt <- .writeBinToTmpFile(payload) } if (verbose) cat(sprintf("fetching from h2o frame to R using '.h2o.doSafeGET' took %.2fs\n", proc.time()[[3]]-pt)) - if (verbose) pt <- proc.time()[[3]] - if (getOption("h2o.fread", TRUE) && use.package("data.table")) { + if (useDataTable) { if (identical(colClasses, NA_character_) || identical(colClasses, "")) colClasses <- NULL # workaround for data.table length-1 bug #4237 fixed in v1.12.9 df <- data.table::fread(ttt, sep = ",", blank.lines.skip = FALSE, na.strings = "", colClasses = colClasses, showProgress=FALSE, data.table=FALSE, ...) if (sum(dates)) @@ -3575,7 +3575,7 @@ as.data.frame.H2OFrame <- function(x, ...) { fun <- "fread" } else { # Substitute NAs for blank cells rather than skipping - if(useCon){ + if (useCon) { df <- read.csv((tcon <- textConnection(ttt)), blank.lines.skip = FALSE, na.strings = "", colClasses = colClasses, ...) 
close(tcon) } else { diff --git a/h2o-r/tests/testdir_misc/runit_as.frame.R b/h2o-r/tests/testdir_misc/runit_as.frame.R index d079f8a6d776..d4e788b606b5 100644 --- a/h2o-r/tests/testdir_misc/runit_as.frame.R +++ b/h2o-r/tests/testdir_misc/runit_as.frame.R @@ -23,32 +23,24 @@ test <- function() { print(sprintf("nrow(Nhex): %d", nrow(Nhex))) print(sprintf("nrow(x): %d", nrow(x))) expect_that(nrow(Nhex), equals(nrow(x))) - - # Quote writing - original <- data.frame( - ngram = c( - "SIRET:417 653 698", - "SIRET:417 653 698 00031", - "Sans", - "Sans esc.", - "Sans esc. jusqu\"\"au", # Two quotes in line - "Sans esc. jusqu\"au 15.11.2018" - ) - ) - print("Original data") - print(original) - - h2o_fr <- as.h2o(original) - print("H2O Frame") - print(h2o_fr) - - as_df <- as.data.frame(h2o_fr) - print("As data frame:") - print(as_df) - - expect_true(all(as_df == original)) - + + df <- data.frame( + c1 = c(1.1, 2.22, 3.345, 4.678, 5.098765), + c2 = c("one", "with, sep", "with\"\"quotes\"", "\"", "quoted\",\"sep") + ) + + # options(h2o.fread=TRUE) # uncomment to test with data-table but it will fail + frames <- list(df, data.frame(c=df[, 2])) + for (original in frames) { + print("Original:") + print(original) + h2o_fr <- as.h2o(original) + as_df <- as.data.frame(h2o_fr) + print("Converted:") + print(as_df) + expect_true(all(as_df == original)) + } } doTest("Test data frame", test) From 0c727481df386aea8b582cda7a54aa807326681c Mon Sep 17 00:00:00 2001 From: Jan Sterba Date: Sun, 23 Feb 2020 22:59:21 +0100 Subject: [PATCH 11/14] comment data.table limitations --- h2o-r/h2o-package/R/frame.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index f589408215ce..2ce023cfa848 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -3530,6 +3530,8 @@ as.data.frame.H2OFrame <- function(x, ...) { # Versions of R prior to 3.1 should not use hex string. 
# Versions of R including 3.1 and later should use hex string. useHexString <- getRversion() >= "3.1" + # We cannot use data.table by default since its handling of escaping inside quoted csv values is not very good; + # in some edge cases it is simply impossible to load data in the correct format without additional post-processing useDataTable <- getOption("h2o.fread", FALSE) && use.package("data.table") urlSuffix <- paste0('DownloadDataset', '?frame_id=', URLencode(h2o.getId(x)), From 3dd94567a67037d037e5a14bb303b269fe431c89 Mon Sep 17 00:00:00 2001 From: Jan Sterba Date: Sun, 23 Feb 2020 23:07:26 +0100 Subject: [PATCH 12/14] fix as.h2o test --- h2o-r/h2o-package/R/frame.R | 6 ++++-- h2o-r/tests/testdir_misc/runit_as.h2o.R | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index 2ce023cfa848..359887f49e44 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -3468,8 +3468,10 @@ as.h2o.Matrix <- function(x, destination_frame="", ...) 
{ all.rows <- 1:max(stm2$i) rows.having.first.col <- stm2$i[which(stm2$j == 1)] rows.missing.first.col <- setdiff(all.rows, rows.having.first.col) - stm2.fill <- data.table::data.table(i = rows.missing.first.col, j = 1, v = 0) - stm2 <- rbind(stm2.fill, stm2) + if (length(rows.missing.first.col) > 0) { + stm2.fill <- data.table::data.table(i = rows.missing.first.col, j = 1, v = 0) + stm2 <- rbind(stm2.fill, stm2) + } res <- stm2[, list(i, jv = ifelse(j==1,v,paste(j-1, v, sep = ":"))) ][order(i), list(res = paste(jv, collapse = " ")), by = i ][["res"]] diff --git a/h2o-r/tests/testdir_misc/runit_as.h2o.R b/h2o-r/tests/testdir_misc/runit_as.h2o.R index a980d3e3574f..278a2e1b8353 100644 --- a/h2o-r/tests/testdir_misc/runit_as.h2o.R +++ b/h2o-r/tests/testdir_misc/runit_as.h2o.R @@ -45,7 +45,6 @@ test.as.h2o.destination_frame <- function() { #no dest converted <- as.h2o(dummy_matrix) expect_match(attr(converted, 'id'), "^dummy_matrix_\\w+$") - } doTest("Test as.h2o methods with/without destination frame", test.as.h2o.destination_frame) From 559fafad2366932d78ae0073b9f4cd49b7ad72ab Mon Sep 17 00:00:00 2001 From: Jan Sterba Date: Mon, 24 Feb 2020 14:15:50 +0100 Subject: [PATCH 13/14] fix test --- h2o-r/h2o-package/R/frame.R | 2 +- h2o-r/tests/testdir_jira/runit_pubdev_2844.R | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index 359887f49e44..02ff4f29577b 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -3273,7 +3273,7 @@ destination_frame.guess <- function(x) { #' @param use logical scalar, extra escape option, to be used as global option. #' @details #' We use this function to control csv read/write with optional \link[data.table]{data.table} package. -#' Currently data.table is disabled by default, to enable it set \code{options("h2o.use.data.table"=TRUE)}. 
+#' Currently data.table is enabled by default for some operations, to disable it set \code{options("h2o.use.data.table"=FALSE)}. #' It is possible to control just \code{\link[data.table]{fread}} or \code{\link[data.table]{fwrite}} with \code{options("h2o.fread"=FALSE, "h2o.fwrite"=FALSE)}. #' \code{h2o.fread} and \code{h2o.fwrite} options are not handled in this function but next to \emph{fread} and \emph{fwrite} calls. #' @export diff --git a/h2o-r/tests/testdir_jira/runit_pubdev_2844.R b/h2o-r/tests/testdir_jira/runit_pubdev_2844.R index ef15db2a0b40..cea1a8b6fd42 100644 --- a/h2o-r/tests/testdir_jira/runit_pubdev_2844.R +++ b/h2o-r/tests/testdir_jira/runit_pubdev_2844.R @@ -5,7 +5,9 @@ test.pubdev_2844 <- function() { df1 <- iris h2o.no_progress() - + # enable using of data.table for as.data.frame + options(h2o.fread=TRUE) + # as.h2o op <- options("datatable.verbose"=TRUE, "h2o.use.data.table"=TRUE) co <- capture.output( From f144dbce1aacc92f3483a788aa4bd929bdb5a176 Mon Sep 17 00:00:00 2001 From: Jan Sterba Date: Thu, 27 Feb 2020 19:00:40 +0100 Subject: [PATCH 14/14] make test isolated --- h2o-r/tests/testdir_jira/runit_pubdev_2844.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/h2o-r/tests/testdir_jira/runit_pubdev_2844.R b/h2o-r/tests/testdir_jira/runit_pubdev_2844.R index cea1a8b6fd42..0109712efcbe 100644 --- a/h2o-r/tests/testdir_jira/runit_pubdev_2844.R +++ b/h2o-r/tests/testdir_jira/runit_pubdev_2844.R @@ -6,7 +6,7 @@ test.pubdev_2844 <- function() { df1 <- iris h2o.no_progress() # enable using of data.table for as.data.frame - options(h2o.fread=TRUE) + op.original <- options(h2o.fread=TRUE) # as.h2o op <- options("datatable.verbose"=TRUE, "h2o.use.data.table"=TRUE) @@ -80,7 +80,7 @@ test.pubdev_2844 <- function() { } else { expect_true(length(co) && sum(grepl("read.csv", co)), label="as.data.frame.H2OFrame should produce 'read.csv' in timing message when h2o.verbose=TRUE and data.table not used.") } - options(op) + 
options(op.original) } doTest("PUBDEV-2844", test.pubdev_2844)