Closes #713. And minor tweaks and tests for 'subset' argument.

Rdatatable · Jul 3, 2014 · aff5621 · aff5621
1 parent 8441e0f
commit aff5621
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 23 deletions.
diff --git a/R/fcast.R b/R/fcast.R
@@ -18,7 +18,7 @@ dcast.data.table <- function(data, formula, fun.aggregate = NULL, ..., margins =
     if (is.character(formula)) {
         ff <- strsplit(strip(formula), "~", fixed=TRUE)[[1]]
         if (length(ff) > 2)
-            stop("Cast formula of length > 2 detected. Data.table has at most two output dimensions.")
+            stop("Cast formula length is > 2, must be = 2.")
         ff <- strsplit(ff, "+", fixed=TRUE)
         setattr(ff, 'names', c("ll", "rr"))
         ff <- lapply(ff, function(x) x[x != "."])
@@ -43,17 +43,20 @@ dcast.data.table <- function(data, formula, fun.aggregate = NULL, ..., margins =
     drop <- as.logical(drop[1])
     if (is.na(drop)) stop("'drop' must be TRUE/FALSE")
 
-    # deal with 'subset' first
+    # subset
     m <- as.list(match.call()[-1])
     subset <- m$subset[[2]]
-    if (!is.null(subset)) data = data[eval(subset), unique(c(ff_, value.var)), with=FALSE] # TODO: revisit. Maybe too costly on large data
-
-    # if original or subset'd data.table has 0 rows or cols, error.
+    if (!is.null(subset)) {
+        if (is.name(subset)) subset = as.call(list(quote(`(`), subset))
+        data = data[eval(subset, data, parent.frame()), unique(c(ff_, value.var)), with=FALSE]
+    }
     if (nrow(data) == 0L || ncol(data) == 0L) stop("Can't 'cast' on an empty data.table")
 
-    # next, check and set 'fun.aggregate = length' it's null but at least one group size is > 1.
+    # set 'fun.aggregate = length' if max group size > 1
+    fun.null=FALSE
     if (is.null(fun.aggregate)) {
-        oo = forderv(data, by=ff_, retGrp=TRUE) # to check if the maximum group size is > 1 and is TRUE set fun.aggregate to length if it's NULL
+        fun.null=TRUE
+        oo = forderv(data, by=ff_, retGrp=TRUE)
         if (attr(oo, 'maxgrpn') > 1L) {
             message("Aggregate function missing, defaulting to 'length'")
             fun.aggregate <- length
@@ -65,41 +68,45 @@ dcast.data.table <- function(data, formula, fun.aggregate = NULL, ..., margins =
         fill.default = fun.aggregate(data[[value.var]][0], ...)
         args <- c("data", "formula", "margins", "subset", "fill", "value.var", "verbose", "drop")
         m <- m[setdiff(names(m), args)]
-        fun.aggregate <- as.call(c(m[1], as.name(value.var), m[-1]))
+        .CASTfun = fun.aggregate # issues/713
+        fun.aggregate <- as.call(c(quote(.CASTfun), as.name(value.var), m[-1]))
         fun.aggregate <- as.call(c(as.name("list"), setattr(list(fun.aggregate), 'names', value.var)))
-        # checking for #5191 (until fixed, this is a workaround
-        if (length(intersect(value.var, ff_))) fun.aggregate = as.call(list(as.name("{"), as.name(".SD"), fun.aggregate))
+        # workaround until #5191 (issues/497) is fixed
+        if (length(intersect(value.var, ff_))) 
+            fun.aggregate = as.call(list(as.name("{"), as.name(".SD"), fun.aggregate))
     }
-    if (length(ff$rr) == 0) { # take care of special case
-        if (is.null(fun.aggregate)) 
+    # special case
+    if (length(ff$rr) == 0) {
+        if (is.null(fun.aggregate))
             ans = data[, c(ff$ll, value.var), with=FALSE]
         else {
-            # checking for #5191 (until fixed, this is a workaround
+            # workaround until #5191 (issues/497) is fixed
             if (length(intersect(value.var, ff_))) ans = data[, eval(fun.aggregate), by=c(ff$ll), .SDcols=value.var]
             else ans = data[, eval(fun.aggregate), by=c(ff$ll)]
         }
-        if (any(duplicated(names(ans)))) {
-            message("Duplicate column names found in cast data.table. Setting unique names using 'make.names'")   
+        if (anyDuplicated(names(ans))) {
+            message("Duplicate column names found in cast data.table. Setting unique names using 'make.unique'")   
             setnames(ans, make.unique(names(ans)))
         }
         if (!identical(key(ans), ff$ll)) setkeyv(ans, names(ans)[seq_along(ff$ll)])
         return(ans)
     }
-    # if fun.aggregate exists, then aggregate in R-side (now that 'adhoc-by' is extremely fast!)
+    # aggregation moved to R now that 'adhoc-by' is crazy fast!
     if (!is.null(fun.aggregate)) {
         if (length(intersect(value.var, ff_))) {
             data = data[, eval(fun.aggregate), by=c(ff_), .SDcols=value.var]
             value.var = tail(make.unique(names(data)), 1L)
             setnames(data, ncol(data), value.var)
         }
         else data = data[, eval(fun.aggregate), by=c(ff_)]
-        setkeyv(data, ff_) # can't use 'oo' here, but should be faster as it's uncommon to have huge number of groups.
+        setkeyv(data, ff_)
+        # issues/693
         fun_agg_chk <- function(x) {
-            pos = uniqlist(as.list(x)[key(x)]) # as.list shallow copies, I believe
-            len = uniqlengths(pos, nrow(x))
-            any(len != 1L)
+            # sorted now, 'forderv' should be as fast as uniqlist+uniqlengths
+            oo = forderv(data, by=key(data), retGrp=TRUE)
+            attr(oo, 'maxgrpn') > 1L
         }
-        if (fun_agg_chk(data))
+        if (!fun.null && fun_agg_chk(data))
             stop("Aggregating function provided to argument 'fun.aggregate' should return a length 1 vector for each group, but returns length != 1 for atleast one group. Please have a look at the DETAILS section of ?dcast.data.table ")
     } else {
         if (is.null(subset))
@@ -112,8 +119,8 @@ dcast.data.table <- function(data, formula, fun.aggregate = NULL, ..., margins =
     assign("CJ", CJ, .CASTenv)
     ans <- .Call("Cfcast", data, ff$ll, ff$rr, value.var, fill, fill.default, is.null(fun.aggregate), .CASTenv, drop)
     setDT(ans)
-    if (any(duplicated(names(ans)))) {
-        message("Duplicate column names found in cast data.table. Setting unique names using 'make.names'")   
+    if (anyDuplicated(names(ans))) {
+        message("Duplicate column names found in cast data.table. Setting unique names using 'make.unique'")
         setnames(ans, make.unique(names(ans)))
     }
     setattr(ans, 'sorted', names(ans)[seq_along(ff$ll)])

diff --git a/README.md b/README.md
@@ -226,6 +226,8 @@ We moved from R-Forge to GitHub on 9 June 2014, including history.
 
   33.  `DT[, list(list(.)), by=.]` returns correct results in R >=3.1.0 as well. The bug was due to recent (welcoming) changes in R v3.1.0 where `list(.)` does not result in a *copy*. Closes [#481](https://github.com/Rdatatable/data.table/issues/481). 
 
+  34.  `dcast.data.table` handles `fun.aggregate` argument properly when called from within a function that accepts `fun.aggregate` argument and passes to `dcast.data.table()`. Closes [#713](https://github.com/Rdatatable/data.table/issues/713). Thanks to mathematicalcoffee for reporting [here](http://stackoverflow.com/q/24542976/559784) on SO.
+
 #### NOTES
 
   1.  Reminder: using `rolltolast` still works but since v1.9.2 now issues the following warning:  

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -4852,6 +4852,21 @@ test(1343, fread("A,B\n1,TRUE\n2,\n3,F"), data.table(A=1:3, B=c(TRUE,NA,FALSE)))
 test(1344, fread("A,B\n1,T\n2,NA\n3,"), data.table(A=1:3, B=c(TRUE,NA,NA)))
 
 
+# issues/713 - dcast.data.table and fun.aggregate
+DT <- data.table(id=rep(1:2, c(3,4)), k=c(rep(letters[1:3], 2), 'c'), v=1:7)
+foo <- function (tbl, fun.aggregate) {
+    dcast.data.table(tbl, id ~ k, value.var='v', fun.aggregate=fun.aggregate)
+}
+test(1345, foo(DT, last), dcast.data.table(DT, id ~ k, value.var='v', fun.aggregate=last))
+
+# more minor changes to dcast.data.table (subset argument handling symbol - removing any surprises with data.table's typical scoping rules) - test for that.
+DT <- data.table(id=rep(1:2, c(3,4)), k=c(rep(letters[1:3], 2), 'c'), v=1:7)
+bla <- c(TRUE, rep(FALSE, 6L)) 
+# calling `subset=.(bla)` gives eval error when testing... not sure what's happeing! using values directly instead for now.
+test(1346.1, dcast.data.table(DT, id ~ k, value.var="v", subset=.(c(TRUE, rep(FALSE, 6L)))), dcast.data.table(DT[1L], id ~ k, value.var="v"))
+DT[, bla := !bla]
+test(1346.2, dcast.data.table(DT, id ~ k, value.var="v", subset=.(bla), fun.aggregate=length), dcast.data.table(DT[(bla)], id ~ k, value.var="v", fun.aggregate=length))
+
 ##########################