Skip to content

Commit

Permalink
Closes #713. And minor tweaks and tests for 'subset' argument.
Browse files Browse the repository at this point in the history
  • Loading branch information
arunsrinivasan committed Jul 3, 2014
1 parent 8441e0f commit aff5621
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 23 deletions.
53 changes: 30 additions & 23 deletions R/fcast.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ dcast.data.table <- function(data, formula, fun.aggregate = NULL, ..., margins =
if (is.character(formula)) {
ff <- strsplit(strip(formula), "~", fixed=TRUE)[[1]]
if (length(ff) > 2)
stop("Cast formula of length > 2 detected. Data.table has at most two output dimensions.")
stop("Cast formula length is > 2, must be = 2.")
ff <- strsplit(ff, "+", fixed=TRUE)
setattr(ff, 'names', c("ll", "rr"))
ff <- lapply(ff, function(x) x[x != "."])
Expand All @@ -43,17 +43,20 @@ dcast.data.table <- function(data, formula, fun.aggregate = NULL, ..., margins =
drop <- as.logical(drop[1])
if (is.na(drop)) stop("'drop' must be TRUE/FALSE")

# deal with 'subset' first
# subset
m <- as.list(match.call()[-1])
subset <- m$subset[[2]]
if (!is.null(subset)) data = data[eval(subset), unique(c(ff_, value.var)), with=FALSE] # TODO: revisit. Maybe too costly on large data

# if original or subset'd data.table has 0 rows or cols, error.
if (!is.null(subset)) {
if (is.name(subset)) subset = as.call(list(quote(`(`), subset))
data = data[eval(subset, data, parent.frame()), unique(c(ff_, value.var)), with=FALSE]
}
if (nrow(data) == 0L || ncol(data) == 0L) stop("Can't 'cast' on an empty data.table")

# next, check and set 'fun.aggregate = length' it's null but at least one group size is > 1.
# set 'fun.aggregate = length' if max group size > 1
fun.null=FALSE
if (is.null(fun.aggregate)) {
oo = forderv(data, by=ff_, retGrp=TRUE) # to check if the maximum group size is > 1 and is TRUE set fun.aggregate to length if it's NULL
fun.null=TRUE
oo = forderv(data, by=ff_, retGrp=TRUE)
if (attr(oo, 'maxgrpn') > 1L) {
message("Aggregate function missing, defaulting to 'length'")
fun.aggregate <- length
Expand All @@ -65,41 +68,45 @@ dcast.data.table <- function(data, formula, fun.aggregate = NULL, ..., margins =
fill.default = fun.aggregate(data[[value.var]][0], ...)
args <- c("data", "formula", "margins", "subset", "fill", "value.var", "verbose", "drop")
m <- m[setdiff(names(m), args)]
fun.aggregate <- as.call(c(m[1], as.name(value.var), m[-1]))
.CASTfun = fun.aggregate # issues/713
fun.aggregate <- as.call(c(quote(.CASTfun), as.name(value.var), m[-1]))
fun.aggregate <- as.call(c(as.name("list"), setattr(list(fun.aggregate), 'names', value.var)))
# checking for #5191 (until fixed, this is a workaround
if (length(intersect(value.var, ff_))) fun.aggregate = as.call(list(as.name("{"), as.name(".SD"), fun.aggregate))
# workaround until #5191 (issues/497) is fixed
if (length(intersect(value.var, ff_)))
fun.aggregate = as.call(list(as.name("{"), as.name(".SD"), fun.aggregate))
}
if (length(ff$rr) == 0) { # take care of special case
if (is.null(fun.aggregate))
# special case
if (length(ff$rr) == 0) {
if (is.null(fun.aggregate))
ans = data[, c(ff$ll, value.var), with=FALSE]
else {
# checking for #5191 (until fixed, this is a workaround
# workaround until #5191 (issues/497) is fixed
if (length(intersect(value.var, ff_))) ans = data[, eval(fun.aggregate), by=c(ff$ll), .SDcols=value.var]
else ans = data[, eval(fun.aggregate), by=c(ff$ll)]
}
if (any(duplicated(names(ans)))) {
message("Duplicate column names found in cast data.table. Setting unique names using 'make.names'")
if (anyDuplicated(names(ans))) {
message("Duplicate column names found in cast data.table. Setting unique names using 'make.unique'")
setnames(ans, make.unique(names(ans)))
}
if (!identical(key(ans), ff$ll)) setkeyv(ans, names(ans)[seq_along(ff$ll)])
return(ans)
}
# if fun.aggregate exists, then aggregate in R-side (now that 'adhoc-by' is extremely fast!)
# aggregation moved to R now that 'adhoc-by' is crazy fast!
if (!is.null(fun.aggregate)) {
if (length(intersect(value.var, ff_))) {
data = data[, eval(fun.aggregate), by=c(ff_), .SDcols=value.var]
value.var = tail(make.unique(names(data)), 1L)
setnames(data, ncol(data), value.var)
}
else data = data[, eval(fun.aggregate), by=c(ff_)]
setkeyv(data, ff_) # can't use 'oo' here, but should be faster as it's uncommon to have huge number of groups.
setkeyv(data, ff_)
# issues/693
fun_agg_chk <- function(x) {
pos = uniqlist(as.list(x)[key(x)]) # as.list shallow copies, I believe
len = uniqlengths(pos, nrow(x))
any(len != 1L)
# sorted now, 'forderv' should be as fast as uniqlist+uniqlengths
oo = forderv(data, by=key(data), retGrp=TRUE)
attr(oo, 'maxgrpn') > 1L
}
if (fun_agg_chk(data))
if (!fun.null && fun_agg_chk(data))
stop("Aggregating function provided to argument 'fun.aggregate' should return a length 1 vector for each group, but returns length != 1 for atleast one group. Please have a look at the DETAILS section of ?dcast.data.table ")
} else {
if (is.null(subset))
Expand All @@ -112,8 +119,8 @@ dcast.data.table <- function(data, formula, fun.aggregate = NULL, ..., margins =
assign("CJ", CJ, .CASTenv)
ans <- .Call("Cfcast", data, ff$ll, ff$rr, value.var, fill, fill.default, is.null(fun.aggregate), .CASTenv, drop)
setDT(ans)
if (any(duplicated(names(ans)))) {
message("Duplicate column names found in cast data.table. Setting unique names using 'make.names'")
if (anyDuplicated(names(ans))) {
message("Duplicate column names found in cast data.table. Setting unique names using 'make.unique'")
setnames(ans, make.unique(names(ans)))
}
setattr(ans, 'sorted', names(ans)[seq_along(ff$ll)])
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,8 @@ We moved from R-Forge to GitHub on 9 June 2014, including history.
33. `DT[, list(list(.)), by=.]` returns correct results in R >=3.1.0 as well. The bug was due to recent (welcoming) changes in R v3.1.0 where `list(.)` does not result in a *copy*. Closes [#481](https://github.com/Rdatatable/data.table/issues/481).
34. `dcast.data.table` handles `fun.aggregate` argument properly when called from within a function that accepts `fun.aggregate` argument and passes to `dcast.data.table()`. Closes [#713](https://github.com/Rdatatable/data.table/issues/713). Thanks to mathematicalcoffee for reporting [here](http://stackoverflow.com/q/24542976/559784) on SO.
#### NOTES
1. Reminder: using `rolltolast` still works but since v1.9.2 now issues the following warning:
Expand Down
15 changes: 15 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -4852,6 +4852,21 @@ test(1343, fread("A,B\n1,TRUE\n2,\n3,F"), data.table(A=1:3, B=c(TRUE,NA,FALSE)))
test(1344, fread("A,B\n1,T\n2,NA\n3,"), data.table(A=1:3, B=c(TRUE,NA,NA)))


# issues/713 - dcast.data.table and fun.aggregate
DT <- data.table(id=rep(1:2, c(3,4)), k=c(rep(letters[1:3], 2), 'c'), v=1:7)
foo <- function (tbl, fun.aggregate) {
dcast.data.table(tbl, id ~ k, value.var='v', fun.aggregate=fun.aggregate)
}
test(1345, foo(DT, last), dcast.data.table(DT, id ~ k, value.var='v', fun.aggregate=last))

# more minor changes to dcast.data.table (subset argument handling symbol - removing any surprises with data.table's typical scoping rules) - test for that.
DT <- data.table(id=rep(1:2, c(3,4)), k=c(rep(letters[1:3], 2), 'c'), v=1:7)
bla <- c(TRUE, rep(FALSE, 6L))
# calling `subset=.(bla)` gives eval error when testing... not sure what's happeing! using values directly instead for now.
test(1346.1, dcast.data.table(DT, id ~ k, value.var="v", subset=.(c(TRUE, rep(FALSE, 6L)))), dcast.data.table(DT[1L], id ~ k, value.var="v"))
DT[, bla := !bla]
test(1346.2, dcast.data.table(DT, id ~ k, value.var="v", subset=.(bla), fun.aggregate=length), dcast.data.table(DT[(bla)], id ~ k, value.var="v", fun.aggregate=length))

##########################


Expand Down

0 comments on commit aff5621

Please sign in to comment.