From 55031d79f84d13c91e3488425fefda0638a5ed33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 27 Apr 2020 15:47:15 +0200 Subject: [PATCH 01/29] implement AbstractDataFrame functionality --- src/abstractdataframe/selection.jl | 89 +++++++++++++++++++----------- 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 5aa917eb14..4fcca42d30 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -161,7 +161,7 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable}, col_idx, (fun, newname) = nc # It is allowed to request a tranformation operation into a newname column # only once. This is ensured by the logic related to transformed_cols dictionaly - # in _select, therefore in select_transform! such a duplicate should not happen + # in _process, therefore in select_transform! such a duplicate should not happen @assert !hasproperty(newdf, newname) cdf = eachcol(df) if col_idx isa Int @@ -185,6 +185,13 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable}, newdfcols[i] = fill!(similar(col, length(res)), first(col)) end end + + # this means that we use `select` or `transform` not `combine` + if !allow_resizing_newdf[] && ncol(newdf) == 0 && length(res) != nrow(df) + throw(ArgumentError("length $(length(res)) of vector returned from " * + "function $fun is different than number of rows" * + "$(nrow(df)) of the source data frame.")) + end allow_resizing_newdf[] = false respar = parent(res) parent_cols = col_idx isa AsTable ? col_idx.cols : col_idx @@ -196,9 +203,14 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable}, end else res_unwrap = res isa Union{AbstractArray{<:Any, 0}, Ref} ? res[] : res - # allow squashing a scalar to 0 rows - newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(res_unwrap), - ncol(newdf) == 0 ? 
1 : nrow(newdf)), + if ncol(newdf) == 0 + # if allow_resizing_newdf[] is false we know this is select or transform + rows = allow_resizing_newdf[] ? 1 : nrow(df) + else + # allow squashing a scalar to 0 rows + rows = nrow(newdf) + end + newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(res_unwrap), rows), res_unwrap) end # mark that column transformation was applied @@ -518,22 +530,42 @@ julia> select(df, AsTable(:) => ByRow(mean)) ``` """ -select(df::DataFrame, args::AbstractVector{Int}; copycols::Bool=true) = +select(df::AbstractDataFrame, args...; copycols::Bool=true) = + _select(df, args..., copycols=copycols, keeprows=true) + +""" + transform(df::AbstractDataFrame, args...; copycols::Bool=true) + +Create a new data frame that contains columns from `df` and adds columns +specified by `args` and return it. +Equivalent to `select(df, :, args..., copycols=copycols)`. + +See [`select`](@ref) for detailed rules regarding accepted values for `args`. +""" +transform(df::AbstractDataFrame, args...; copycols::Bool=true) = + select(df, :, args..., copycols=copycols) + +combine(df::AbstractDataFrame, args...) 
= + _select(df, args..., copycols=true, keeprows=false) + +combine(arg, df::AbstractDataFrame) = combine(arg, groupby(df, [])) + +_select(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) = DataFrame(_columns(df)[args], Index(_names(df)[args]), copycols=copycols) -function select(df::DataFrame, c::MultiColumnIndex; copycols::Bool=true) +function _select(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool) if c isa AbstractVector{<:Pair} - return select(df, c..., copycols=copycols) + return _select(df, c..., copycols=copycols, keeprows=keeprows) else - return select(df, index(df)[c], copycols=copycols) + return _select(df, index(df)[c], copycols=copycols, keeprows=keeprows) end end -select(df::DataFrame, c::ColumnIndex; copycols::Bool=true) = - select(df, [c], copycols=copycols) +_select(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) = + _select(df, [c], copycols=copycols, keeprows=keeprows) -function select(df::DataFrame, cs...; copycols::Bool=true) +function _select(df::DataFrame, cs...; copycols::Bool, keeprows::Bool) cs_vec = [] for v in cs if v isa AbstractVector{<:Pair} @@ -542,10 +574,11 @@ function select(df::DataFrame, cs...; copycols::Bool=true) push!(cs_vec, v) end end - _select(df, [normalize_selection(index(df), c) for c in cs_vec], copycols) + return _process(df, [normalize_selection(index(df), c) for c in cs_vec], + copycols, keeprows) end -function _select(df::AbstractDataFrame, normalized_cs, copycols::Bool) +function _process(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows::Bool) @assert !(df isa SubDataFrame && copycols==false) newdf = DataFrame() # the role of transformed_cols is the following @@ -593,7 +626,9 @@ function _select(df::AbstractDataFrame, normalized_cs, copycols::Bool) end # we allow resizing newdf only if up to some point only scalars were put # in it. 
The moment we put any vector into newdf its number of rows becomes fixed - allow_resizing_newdf = Ref(true) + # Also if keeprows is true then we make sure to rpoduce nrow(df) rows so resizing + # is not allowed + allow_resizing_newdf = Ref(!keeprows) for nc in normalized_cs if nc isa AbstractVector{Int} allunique(nc) || throw(ArgumentError("duplicate column names selected")) @@ -621,6 +656,7 @@ function _select(df::AbstractDataFrame, normalized_cs, copycols::Bool) newdfcols[i] = fill!(similar(col, nrow(df)), first(col)) end end + # here even if keeprows is true all is OK newdf[!, newname] = copycols ? df[:, i] : df[!, i] allow_resizing_newdf[] = false end @@ -643,18 +679,19 @@ function _select(df::AbstractDataFrame, normalized_cs, copycols::Bool) return newdf end -select(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool=true) = - select(dfv, [ind], copycols=copycols) +_select(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) = + _select(dfv, [ind], copycols=copycols, keeprows=keeprows) -function select(dfv::SubDataFrame, args::MultiColumnIndex; copycols::Bool=true) +function _select(dfv::SubDataFrame, args::MultiColumnIndex; + copycols::Bool, keeprows::Bool) if args isa AbstractVector{<:Pair} - return select(dfv, args..., copycols=copycols) + return _select(dfv, args..., copycols=copycols, keeprows=keeprows) else return copycols ? 
dfv[:, args] : view(dfv, :, args) end end -function select(dfv::SubDataFrame, args...; copycols::Bool=true) +function select(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool) if copycols cs_vec = [] for v in args @@ -664,7 +701,7 @@ function select(dfv::SubDataFrame, args...; copycols::Bool=true) push!(cs_vec, v) end end - return _select(dfv, [normalize_selection(index(dfv), c) for c in cs_vec], true) + return _process(dfv, [normalize_selection(index(dfv), c) for c in cs_vec], true, true) else # we do not support transformations here # newinds contains only indexing; making it Vector{Any} avoids some compilation @@ -692,15 +729,3 @@ function select(dfv::SubDataFrame, args...; copycols::Bool=true) return view(dfv, :, isempty(newinds) ? [] : All(newinds...)) end end - -""" - transform(df::AbstractDataFrame, args...; copycols::Bool=true) - -Create a new data frame that contains columns from `df` and adds columns -specified by `args` and return it. -Equivalent to `select(df, :, args..., copycols=copycols)`. - -See [`select`](@ref) for detailed rules regarding accepted values for `args`. 
-""" -transform(df::AbstractDataFrame, args...; copycols::Bool=true) = - select(df, :, args..., copycols=copycols) From 55bde129b56808f521235e114661ec41642d3a52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 27 Apr 2020 23:41:40 +0200 Subject: [PATCH 02/29] preparation in grouping, rename to _mutate in non-grouping --- src/abstractdataframe/selection.jl | 28 +++++++++++------------ src/groupeddataframe/splitapplycombine.jl | 17 ++++++++++---- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 4fcca42d30..5e3f91fcf1 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -531,7 +531,7 @@ julia> select(df, AsTable(:) => ByRow(mean)) """ select(df::AbstractDataFrame, args...; copycols::Bool=true) = - _select(df, args..., copycols=copycols, keeprows=true) + _manipulate(df, args..., copycols=copycols, keeprows=true) """ transform(df::AbstractDataFrame, args...; copycols::Bool=true) @@ -546,26 +546,26 @@ transform(df::AbstractDataFrame, args...; copycols::Bool=true) = select(df, :, args..., copycols=copycols) combine(df::AbstractDataFrame, args...) 
= - _select(df, args..., copycols=true, keeprows=false) + _manipulate(df, args..., copycols=true, keeprows=false) combine(arg, df::AbstractDataFrame) = combine(arg, groupby(df, [])) -_select(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) = +_manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) = DataFrame(_columns(df)[args], Index(_names(df)[args]), copycols=copycols) -function _select(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool) +function _manipulate(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool) if c isa AbstractVector{<:Pair} - return _select(df, c..., copycols=copycols, keeprows=keeprows) + return _manipulate(df, c..., copycols=copycols, keeprows=keeprows) else - return _select(df, index(df)[c], copycols=copycols, keeprows=keeprows) + return _manipulate(df, index(df)[c], copycols=copycols, keeprows=keeprows) end end -_select(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) = - _select(df, [c], copycols=copycols, keeprows=keeprows) +_manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) = + _manipulate(df, [c], copycols=copycols, keeprows=keeprows) -function _select(df::DataFrame, cs...; copycols::Bool, keeprows::Bool) +function _manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool) cs_vec = [] for v in cs if v isa AbstractVector{<:Pair} @@ -679,19 +679,19 @@ function _process(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows return newdf end -_select(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) = - _select(dfv, [ind], copycols=copycols, keeprows=keeprows) +_manipulate(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) = + _manipulate(dfv, [ind], copycols=copycols, keeprows=keeprows) -function _select(dfv::SubDataFrame, args::MultiColumnIndex; +function _manipulate(dfv::SubDataFrame, args::MultiColumnIndex; copycols::Bool, keeprows::Bool) if args isa 
AbstractVector{<:Pair} - return _select(dfv, args..., copycols=copycols, keeprows=keeprows) + return _manipulate(dfv, args..., copycols=copycols, keeprows=keeprows) else return copycols ? dfv[:, args] : view(dfv, :, args) end end -function select(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool) +function _manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool) if copycols cs_vec = [] for v in args diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 566b02171a..1a1aa69083 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -173,6 +173,15 @@ function groupby(df::AbstractDataFrame, cols; return gd end +function _check_cannonical(gdf::GroupedDataFrame) + @assert length(gdf.starts) == length(gdf.ends) + (gdf.starts[1] != 1 || gdf.ends[end] != nrow(parent(gdf))) && return false + for i in 2:length(gdf.starts) + gdf.starts[i] - gdf.ends[i-1] != 1 && return false + end + return true +end + const F_TYPE_RULES = """ `fun` can return a single value, a row, a vector, or multiple rows. @@ -574,14 +583,12 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum) See [`by`](@ref) for more examples. 
""" -combine(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame; keepkeys::Bool=true) = - combine(gd, f, keepkeys=keepkeys) -combine(gd::GroupedDataFrame, f::Base.Callable; keepkeys::Bool=true) = +combine(f::Base.Callable, gd::GroupedDataFrame; keepkeys::Bool=true) = combine_helper(f, gd, keepkeys=keepkeys) -combine(gd::GroupedDataFrame, f::typeof(nrow); keepkeys::Bool=true) = +combine(f::typeof(nrow), gd::GroupedDataFrame; keepkeys::Bool=true) = combine(gd, [nrow => :nrow], keepkeys=keepkeys) -function combine(gd::GroupedDataFrame, p::Pair; keepkeys::Bool=true) +function combine(p::Pair, gd::GroupedDataFrame; keepkeys::Bool=true) # move handling of aggregate to specialized combine p_from, p_to = p From 2f81c63ba8b54b8225be91c709a64497fcf3220f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 28 Apr 2020 00:50:38 +0200 Subject: [PATCH 03/29] tentative rework of _combine that should be able to support select and transform efficiently --- src/abstractdataframe/selection.jl | 10 ++--- src/groupeddataframe/splitapplycombine.jl | 45 +++++++++++++++++++++-- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 5e3f91fcf1..f211ee09b8 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -3,12 +3,10 @@ # normalize_selection function makes sure that whatever input format of idx is it # will end up in one of four canonical forms -# 1) Int -# 2) AbstractVector{Int} -# 4) Pair{Int, <:Pair{<:Base.Callable, Symbol}} -# 5) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, Symbol}} -# 6) Pair{Int, Pair{ByRow, Symbol}} -# 7) Pair{AbstractVector{Int}, Pair{ByRow, Symbol}} +# 1) AbstractVector{Int} +# 2) Pair{Int, <:Pair{<:Base.Callable, Symbol}} +# 3) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, Symbol}} +# 4) Pair{AsTable, <:Pair{<:Base.Callable, Symbol}} """ ByRow diff --git a/src/groupeddataframe/splitapplycombine.jl 
b/src/groupeddataframe/splitapplycombine.jl index 1a1aa69083..8aed872aa2 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -1097,10 +1097,32 @@ function _agg2idx_map_helper(idx, idx_agg) end function _combine(f::AbstractVector{<:Pair}, - gd::GroupedDataFrame, nms::AbstractVector{Symbol}) + gd::GroupedDataFrame, nms::AbstractVector{Symbol}, + copycols::Bool=true, keeprows::Bool=false) # TODO: remove these defaults # here f should be normalized and in a form of source_cols => fun @assert all(x -> first(x) isa Union{Int, AbstractVector{Int}, AsTable}, f) @assert all(x -> last(x) isa Union{Base.Callable, ByRow}, f) + + if keeprows + if !_check_cannonical(gd) + throw(ArgumentError("select or transform functions require that" * + "GroupedDataFrame is not sorted or subsetted")) + end + idx_keeprows = Vector{Int}(undef, nrow(parent(gd))) + let i = 0 + for (s, e) in zip(gd.starts, gd.ends) + v = gd.idx[s] + for k in s:e + i += 1 + idx_keeprows[i] = v + end + end + @assert i == nrow(parent(gd)) + end + else + idx_keeprows = nothing # should not be used but do not leave it unassigned + end + idx_agg = nothing if any(isagg, f) # Compute indices of representative rows only once for all AbstractAggregates @@ -1120,6 +1142,11 @@ function _combine(f::AbstractVector{<:Pair}, agg = check_aggregate(last(p)) outcol = agg(incol, gd) res[i] = idx_agg, outcol + elseif keeprows && fun isa identity && !(source_cols isa AsTable) + @assert source_cols isa Union{Int, AbstractVector{Int}} + @assert length(source_cols) == 1 + outcol = parentdf[!, first(source_cols)] + res[i] = idx_keeprows, copycols ? 
copy(outcol) : outcol else if source_cols isa Int incols = (parentdf[!, source_cols],) @@ -1160,11 +1187,15 @@ function _combine(f::AbstractVector{<:Pair}, # idx_agg === nothing then we have only functions that # returned multiple rows and idx_loc = 1 idx_loc = findfirst(x -> x[1] !== idx_agg, res) - if isnothing(idx_loc) + if !keeprows && isnothing(idx_loc) @assert !isnothing(idx_agg) idx = idx_agg else - idx = res[idx_loc][1] + if keeprows + idx = idx_keeprows + else + idx = res[idx_loc][1] + end agg2idx_map = nothing for i in 1:length(res) if res[i][1] !== idx && res[i][1] != idx @@ -1176,7 +1207,13 @@ function _combine(f::AbstractVector{<:Pair}, end res[i] = idx, res[i][2][agg2idx_map] elseif idx != res[i][1] - throw(ArgumentError("all functions must return vectors of the same length")) + if keeprows + throw(ArgumentError("all functions must return vectors of " * + "the length equal to the group rows count " * + "in the source GroupedDataFrame")) + else + throw(ArgumentError("all functions must return vectors of the same length")) + end end end end From fd951c51f9af0aebace945b5d4b4001385dcb72b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 28 Apr 2020 12:02:27 +0200 Subject: [PATCH 04/29] continue grouping --- src/groupeddataframe/splitapplycombine.jl | 26 ++++++++++++++--------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 8aed872aa2..1a9895de5c 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -583,12 +583,15 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum) See [`by`](@ref) for more examples. 
""" -combine(f::Base.Callable, gd::GroupedDataFrame; keepkeys::Bool=true) = - combine_helper(f, gd, keepkeys=keepkeys) -combine(f::typeof(nrow), gd::GroupedDataFrame; keepkeys::Bool=true) = - combine(gd, [nrow => :nrow], keepkeys=keepkeys) - -function combine(p::Pair, gd::GroupedDataFrame; keepkeys::Bool=true) +combine(f::Base.Callable, gd::GroupedDataFrame; + keepkeys::Bool=true, regroup::Bool=false) = + combine_helper(f, gd, keepkeys=keepkeys, regroup=regroup) +combine(f::typeof(nrow), gd::GroupedDataFrame; + keepkeys::Bool=true, regroup::Bool=false) = + combine(gd, [nrow => :nrow], keepkeys=keepkeys, regroup=regroup) + +function combine(p::Pair, gd::GroupedDataFrame; + keepkeys::Bool=true, regroup::Bool=false) # move handling of aggregate to specialized combine p_from, p_to = p @@ -605,13 +608,13 @@ function combine(p::Pair, gd::GroupedDataFrame; keepkeys::Bool=true) else cs = p_from end - return combine_helper(cs => p_to, gd, keepkeys=keepkeys) + return combine_helper(cs => p_to, gd, keepkeys=keepkeys, regroup=regroup) end function combine(gd::GroupedDataFrame, @nospecialize(cs::Union{Pair, typeof(nrow), ColumnIndex, MultiColumnIndex}...); - keepkeys::Bool=true) + keepkeys::Bool=true, regroup::Bool=false) @assert !isempty(cs) cs_vec = [] for p in cs @@ -684,7 +687,7 @@ function combine(gd::GroupedDataFrame, end f = Pair[first(x) => first(last(x)) for x in cs_norm] nms = Symbol[last(last(x)) for x in cs_norm] - return combine_helper(f, gd, nms, keepkeys=keepkeys) + return combine_helper(f, gd, nms, keepkeys=keepkeys, regroup=regroup) end function combine(gd::GroupedDataFrame; f...) 
@@ -700,7 +703,10 @@ end function combine_helper(f, gd::GroupedDataFrame, nms::Union{AbstractVector{Symbol},Nothing}=nothing; - keepkeys::Bool=true) + keepkeys::Bool, regroup::Bool) + if regroup && !keepkeys + throw(ArgumentError("keepkeys=false when regroup=true is not allowed")) + end if length(gd) > 0 idx, valscat = _combine(f, gd, nms) keepkeys || return valscat From eb9ace938a92cf72b42b62940071964ac79f4b8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 28 Apr 2020 14:48:28 +0200 Subject: [PATCH 05/29] implement select, transform, select! and transform! for GroupedDataFrame, fix bug in map --- src/deprecated.jl | 22 + src/groupeddataframe/splitapplycombine.jl | 503 ++++++---------------- 2 files changed, 164 insertions(+), 361 deletions(-) diff --git a/src/deprecated.jl b/src/deprecated.jl index 13ba76c3c7..e95b82aa6e 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -444,3 +444,25 @@ function aggregate(d::AbstractDataFrame, cols, fs::AbstractVector; end @deprecate deleterows!(df::DataFrame, inds) delete!(df, inds) + +@deprecate by(f::Union{Base.Callable, Pair}, d::AbstractDataFrame, cols::Any; + sort::Bool=false, skipmissing::Bool=false, + keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), + f, keepkeys=keepkeys) +@deprecate by(d::AbstractDataFrame, cols::Any, f::Base.Callable; + sort::Bool=false, skipmissing::Bool=false, + keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), + f, keepkeys=keepkeys) +@deprecate by(d::AbstractDataFrame, cols::Any, f::Pair; + sort::Bool=false, skipmissing::Bool=false, + keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), + f, keepkeys=keepkeys) + +@deprecate by(d::AbstractDataFrame, cols::Any, f::Union{Pair, typeof(nrow), + ColumnIndex, MultiColumnIndex}...; + sort::Bool=false, skipmissing::Bool=false, + keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), + f..., 
keepkeys=keepkeys) + +import Base: map +@deprecate map(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame) combine(f, gd, regroup=true) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 1a9895de5c..e5cb15bb4a 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -174,11 +174,15 @@ function groupby(df::AbstractDataFrame, cols; end function _check_cannonical(gdf::GroupedDataFrame) - @assert length(gdf.starts) == length(gdf.ends) + gmin, gmax = extrema(gdf.groups) + @assert length(gdf.starts) == length(gdf.ends) == gmax + @assert gmin <= 1 (gdf.starts[1] != 1 || gdf.ends[end] != nrow(parent(gdf))) && return false for i in 2:length(gdf.starts) gdf.starts[i] - gdf.ends[i-1] != 1 && return false end + # gmin == 0 means we have dropped groups which is not possible here + @assert gmin == 1 return true end @@ -219,160 +223,6 @@ const F_TYPE_RULES = named `x1`, `x2` and so on. """ -""" - map(fun::Union{Function, Type}, gd::GroupedDataFrame) - map(pair::Pair, gd::GroupedDataFrame) - -Apply `fun` or `pair` to each group of rows and return a [`GroupedDataFrame`](@ref). - -If `fun` is specified it must be a function, and it is passed a [`SubDataFrame`](@ref) -view for each group and can return any return value defined below. -Note that this form is slower than `pair` due to type instability. - -If `pair` is passed then it must follow the rules specified for transformations in -[`select`](@ref) and have the form `source_cols => fun`, -`source_cols => fun => target_col`, or `source_col => target_col`. -Function defined by `fun` is passed `SubArray` views as positional arguments for -each column specified to be selected and can return any return value defined below, -or a `NamedTuple` containing these `SubArray`s if `source_cols` is an `AsTable` selector. 
-As a special case `nrow` or `nrow => target_col` can be passed without specifying -input columns to efficiently calculate number of rows in each group. -If `nrow` is passed the resulting column name is `:nrow`. - - -$F_TYPE_RULES - -See also [`combine`](@ref) that returns a `DataFrame` rather than a `GroupedDataFrame`. - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]), - b = repeat([2, 1], outer=[4]), - c = 1:8); - -julia> gd = groupby(df, :a); - -julia> map(sdf -> sum(sdf.c), gd) -GroupedDataFrame{DataFrame} with 4 groups based on key: :a -First Group: 1 row -│ Row │ a │ x1 │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 1 │ 6 │ -⋮ -Last Group: 1 row -│ Row │ a │ x1 │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 4 │ 12 │ - -julia> map(:c => sum, gd) -GroupedDataFrame with 4 groups based on key: a -First Group (1 row): a = 1 -│ Row │ a │ c_sum │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 1 │ 6 │ -⋮ -Last Group (1 row): a = 4 -│ Row │ a │ c_sum │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 4 │ 12 │ - -julia> map(nrow, gd) -GroupedDataFrame with 4 groups based on key: a -First Group (1 row): a = 1 -│ Row │ a │ nrow │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 1 │ 2 │ -⋮ -Last Group (1 row): a = 4 -│ Row │ a │ nrow │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 4 │ 2 │ - -julia> map(AsTable(valuecols(gd)) => sum, gd) -GroupedDataFrame with 4 groups based on key: a -First Group (2 rows): a = 1 -│ Row │ a │ b_c_sum │ -│ │ Int64 │ Int64 │ -├─────┼───────┼─────────┤ -│ 1 │ 1 │ 3 │ -│ 2 │ 1 │ 7 │ -⋮ -Last Group (2 rows): a = 4 -│ Row │ a │ b_c_sum │ -│ │ Int64 │ Int64 │ -├─────┼───────┼─────────┤ -│ 1 │ 4 │ 5 │ -│ 2 │ 4 │ 9 │ -``` - -See [`by`](@ref) for more examples. 
-""" -function Base.map(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame) - if length(gd) > 0 - # here we know that parent(gd) has at least 1 column - if f isa Pair || f === nrow - if f isa Pair && first(f) isa Tuple - Base.depwarn("passing a Tuple $(first(f)) as column selector is deprecated" * - ", use a vector $(collect(first(f))) instead", :combine) - source_cols, (fun, out_col) = normalize_selection(index(parent(gd)), - collect(first(f)) => last(f)) - else - source_cols, (fun, out_col) = normalize_selection(index(parent(gd)), f) - end - # verify if it is not better to use a fast path, which we achieve by - # calling _combine(::AbstractVector, ::GroupedDataFrame, ::AbstractVector) - # as _combine(::Pair, ::GroupedDataFrame, ::Nothing) does not support it - if isagg(source_cols => fun) - idx, valscat = _combine([source_cols => fun], gd, [out_col]) - else - idx, valscat = _combine(source_cols => last(f), gd, nothing) - end - else - idx, valscat = _combine(f, gd, nothing) - end - keys = _names(parent(gd))[gd.cols] - for key in keys - if hasproperty(valscat, key) && - !isequal(valscat[!, key], view(parent(gd)[!, key], idx)) - throw(ArgumentError("column :$key in returned data frame " * - "is not equal to grouping key :$key")) - end - end - newparent = hcat!(parent(gd)[idx, gd.cols], - select(valscat, Not(intersect(keys, _names(valscat))), copycols=false)) - if length(idx) == 0 - return GroupedDataFrame(newparent, collect(1:length(gd.cols)), idx, - Int[], Int[], Int[], 0, Dict{Any,Int}()) - end - starts = Vector{Int}(undef, length(gd)) - ends = Vector{Int}(undef, length(gd)) - starts[1] = 1 - j = 1 - @inbounds for i in 2:length(idx) - if idx[i] != idx[i-1] - j += 1 - starts[j] = i - ends[j-1] = i - 1 - end - end - # In case some groups have to be dropped - resize!(starts, j) - resize!(ends, j) - ends[end] = length(idx) - return GroupedDataFrame(newparent, collect(1:length(gd.cols)), idx, - collect(1:length(idx)), starts, ends, j, nothing) - else - return 
GroupedDataFrame(parent(gd)[1:0, gd.cols], collect(1:length(gd.cols)), - Int[], Int[], Int[], Int[], 0, Dict{Any,Int}()) - end -end - const F_ARGUMENT_RULES = """ @@ -583,9 +433,12 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum) See [`by`](@ref) for more examples. """ -combine(f::Base.Callable, gd::GroupedDataFrame; - keepkeys::Bool=true, regroup::Bool=false) = - combine_helper(f, gd, keepkeys=keepkeys, regroup=regroup) +function combine(f::Base.Callable, gd::GroupedDataFrame; + keepkeys::Bool=true, regroup::Bool=false) + return combine_helper(f, gd, keepkeys=keepkeys, regroup=regroup, + copycols=true, keeprows=false) +end + combine(f::typeof(nrow), gd::GroupedDataFrame; keepkeys::Bool=true, regroup::Bool=false) = combine(gd, [nrow => :nrow], keepkeys=keepkeys, regroup=regroup) @@ -608,13 +461,20 @@ function combine(p::Pair, gd::GroupedDataFrame; else cs = p_from end - return combine_helper(cs => p_to, gd, keepkeys=keepkeys, regroup=regroup) + return combine_helper(cs => p_to, gd, keepkeys=keepkeys, regroup=regroup, + copycols=true, keeprows=false) end -function combine(gd::GroupedDataFrame, - @nospecialize(cs::Union{Pair, typeof(nrow), - ColumnIndex, MultiColumnIndex}...); - keepkeys::Bool=true, regroup::Bool=false) +combine(gd::GroupedDataFrame, + cs::Union{Pair, typeof(nrow), ColumnIndex, MultiColumnIndex}...; + keepkeys::Bool=true, regroup::Bool=false) = + _combine_executor(gd, cs..., keepkeys=keepkeys, regroup=regroup, + copycols=true, keeprows=false) + +function _combine_executor(gd::GroupedDataFrame, + @nospecialize(cs::Union{Pair, typeof(nrow), + ColumnIndex, MultiColumnIndex}...); + keepkeys::Bool, regroup::Bool, copycols::Bool, keeprows::Bool) @assert !isempty(cs) cs_vec = [] for p in cs @@ -687,7 +547,8 @@ function combine(gd::GroupedDataFrame, end f = Pair[first(x) => first(last(x)) for x in cs_norm] nms = Symbol[last(last(x)) for x in cs_norm] - return combine_helper(f, gd, nms, keepkeys=keepkeys, regroup=regroup) + return combine_helper(f, gd, nms, 
keepkeys=keepkeys, regroup=regroup, + copycols=copycols, keeprows=keeprows) end function combine(gd::GroupedDataFrame; f...) @@ -703,25 +564,94 @@ end function combine_helper(f, gd::GroupedDataFrame, nms::Union{AbstractVector{Symbol},Nothing}=nothing; - keepkeys::Bool, regroup::Bool) + keepkeys::Bool, regroup::Bool, + copycols::Bool, keeprows::Bool) if regroup && !keepkeys throw(ArgumentError("keepkeys=false when regroup=true is not allowed")) end if length(gd) > 0 - idx, valscat = _combine(f, gd, nms) - keepkeys || return valscat + idx, valscat = _combine(f, gd, nms, copycols, keeprows) + keepkeys || regroup || return valscat keys = groupcols(gd) for key in keys - if hasproperty(valscat, key) && - !isequal(valscat[!, key], view(parent(gd)[!, key], idx)) - throw(ArgumentError("column :$key in returned data frame " * - "is not equal to grouping key :$key")) + if hasproperty(valscat, key) + if keeprows + isequal(valscat[!, key], parent(gd)[!, key]) || + throw(ArgumentError("column :$key in returned data frame " * + "is not equal to grouping key :$key")) + + else + isequal(valscat[!, key], view(parent(gd)[!, key], idx)) || + throw(ArgumentError("column :$key in returned data frame " * + "is not equal to grouping key :$key")) + end end end - return hcat!(parent(gd)[idx, gd.cols], - select(valscat, Not(intersect(keys, _names(valscat))), copycols=false)) + if keeprows + newparent = hcat!(select(parent(gd), gd.cols, copycols=copycols), + select(valscat, Not(intersect(keys, _names(valscat))), + copycols=false)) + else + newparent = hcat!(parent(gd)[idx, gd.cols], + select(valscat, Not(intersect(keys, _names(valscat))), + copycols=false)) + end + regroup || return newparent + + if length(idx) == 0 + @assert nrow(newparent) == 0 + return GroupedDataFrame(newparent, collect(1:length(gd.cols)), Int[], + Int[], Int[], Int[], 0, Dict{Any,Int}()) + end + if keeprows + # in this case we are sure that the result GroupedDataFrame has the + # same structure as the source + # we do 
not copy data as it should be safe - we never mutate fields of gd + if isnothing(getfield(gd, :keymap)) + return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx, + gd.starts, gd.ends, gd.ngroups, nothing) + else + return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx, + gd.starts, gd.ends, gd.ngroups, gd.keymap) + end + else + starts = Vector{Int}(undef, length(gd)) + ends = Vector{Int}(undef, length(gd)) + starts[1] = 1 + j = 1 + for i in 2:length(idx) + if idx[i] != idx[i-1] + j += 1 + starts[j] = i + ends[j-1] = i - 1 + end + end + # it is impossible to get more groups in the output than we had initially + @assert j <= length(gd) + # In case some groups have to be dropped + resize!(starts, j) + resize!(ends, j) + ends[end] = length(idx) + + groups = zeros(Int, length(idx)) + for i in 1:j + @inbounds for k in starts[i]:ends[i] + groups[k] = i + end + end + # all groups must be filled + @assert minimum(grouups) == 1 + + return GroupedDataFrame(newparent, collect(1:length(gd.cols)), groups, + collect(1:length(idx)), starts, ends, j, nothing) + end else - return keepkeys ? parent(gd)[1:0, gd.cols] : DataFrame() + if regroup + return GroupedDataFrame(parent(gd)[1:0, gd.cols], collect(1:length(gd.cols)), + Int[], Int[], Int[], Int[], 0, Dict{Any,Int}()) + else + return keepkeys ? 
parent(gd)[1:0, gd.cols] : DataFrame() + end end end @@ -1104,7 +1034,7 @@ end function _combine(f::AbstractVector{<:Pair}, gd::GroupedDataFrame, nms::AbstractVector{Symbol}, - copycols::Bool=true, keeprows::Bool=false) # TODO: remove these defaults + copycols::Bool, keeprows::Bool) # TODO: remove these defaults # here f should be normalized and in a form of source_cols => fun @assert all(x -> first(x) isa Union{Int, AbstractVector{Int}, AsTable}, f) @assert all(x -> last(x) isa Union{Base.Callable, ByRow}, f) @@ -1126,7 +1056,7 @@ function _combine(f::AbstractVector{<:Pair}, @assert i == nrow(parent(gd)) end else - idx_keeprows = nothing # should not be used but do not leave it unassigned + idx_keeprows = nothing end idx_agg = nothing @@ -1148,7 +1078,7 @@ function _combine(f::AbstractVector{<:Pair}, agg = check_aggregate(last(p)) outcol = agg(incol, gd) res[i] = idx_agg, outcol - elseif keeprows && fun isa identity && !(source_cols isa AsTable) + elseif keeprows && fun === identity && !(source_cols isa AsTable) @assert source_cols isa Union{Int, AbstractVector{Int}} @assert length(source_cols) == 1 outcol = parentdf[!, first(source_cols)] @@ -1224,6 +1154,17 @@ function _combine(f::AbstractVector{<:Pair}, end end end + + for (i, (col_idx, col)) in enumerate(res) + if keeprows && res[i][1] !== idx_keeprows # we need to reorder the column + newcol = similar(col) + # we can probably make it more efficient, but I leave it as an optimization for the future + for i in axes(col, 1) + newcol[gd.idx[i]] = col[i] + end + res[i] = (col_idx, newcol) + end + end outcols = map(x -> x[2], res) # this check is redundant given we check idx above # but it is safer to double check and it is cheap @@ -1496,188 +1437,28 @@ function _combine_tables_with_first!(first::Union{AbstractDataFrame, return outcols, colnames end -""" - by(d::AbstractDataFrame, cols::Any, args...; - sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) - by(fun::Union{Function, Type}, 
d::AbstractDataFrame, cols::Any; - sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) - by(pair::Pair, d::AbstractDataFrame, cols::Any; - sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) - by(d::AbstractDataFrame, cols::Any, fun::Union{Function, Type}; - sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) - by(d::AbstractDataFrame, cols::Any, pair::Pair; - sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) - -Split-apply-combine in one step: apply `fun`, `pair` or `args` to each grouping -in `df` based on grouping columns `cols`, and return a `DataFrame`. -This is a shorthand for `combine` called on -`groupby(df, cols, sort=sort, skipmissing=skipmissing)`. - -$F_ARGUMENT_RULES - -$F_TYPE_RULES - -$KWARG_PROCESSING_RULES - -The resulting data frame will be sorted if `sort=true` is passed. -Otherwise, ordering of rows is undefined. - -If `skipmissing=true` rows with `missing` values in one of the grouping columns -`cols` will be skipped. - -See [`groupby`](@ref) and [`combine`](@ref) and for details and more examples. 
- -# Examples -```jldoctest -julia> df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]), - b = repeat([2, 1], outer=[4]), - c = 1:8); - -julia> by(df, :a, :c => sum, nrow) -4×3 DataFrame -│ Row │ a │ c_sum │ nrow │ -│ │ Int64 │ Int64 │ Int64 │ -├─────┼───────┼───────┼───────┤ -│ 1 │ 1 │ 6 │ 2 │ -│ 2 │ 2 │ 8 │ 2 │ -│ 3 │ 3 │ 10 │ 2 │ -│ 4 │ 4 │ 12 │ 2 │ - -julia> by(sdf -> sum(sdf.c), df, :a) # Slower variant -4×2 DataFrame -│ Row │ a │ x1 │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 1 │ 6 │ -│ 2 │ 2 │ 8 │ -│ 3 │ 3 │ 10 │ -│ 4 │ 4 │ 12 │ - -julia> by(df, :a) do d # do syntax for the slower variant - sum(d.c) - end -4×2 DataFrame -│ Row │ a │ x1 │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 1 │ 6 │ -│ 2 │ 2 │ 8 │ -│ 3 │ 3 │ 10 │ -│ 4 │ 4 │ 12 │ - -julia> by(df, :a, :c => (x -> sum(log, x)) => :sum_log_c) # specifying a name for target column -4×2 DataFrame -│ Row │ a │ sum_log_c │ -│ │ Int64 │ Float64 │ -├─────┼───────┼───────────┤ -│ 1 │ 1 │ 1.60944 │ -│ 2 │ 2 │ 2.48491 │ -│ 3 │ 3 │ 3.04452 │ -│ 4 │ 4 │ 3.46574 │ - -julia> by(df, :a, [:b, :c] .=> sum) # passing a vector of pairs -4×3 DataFrame -│ Row │ a │ b_sum │ c_sum │ -│ │ Int64 │ Int64 │ Int64 │ -├─────┼───────┼───────┼───────┤ -│ 1 │ 1 │ 4 │ 6 │ -│ 2 │ 2 │ 2 │ 8 │ -│ 3 │ 3 │ 4 │ 10 │ -│ 4 │ 4 │ 2 │ 12 │ - -julia> by(df, :a) do sdf # dropping group when DataFrame() is returned - sdf.c[1] != 1 ? 
sdf : DataFrame() - end -6×3 DataFrame -│ Row │ a │ b │ c │ -│ │ Int64 │ Int64 │ Int64 │ -├─────┼───────┼───────┼───────┤ -│ 1 │ 2 │ 1 │ 2 │ -│ 2 │ 2 │ 1 │ 6 │ -│ 3 │ 3 │ 2 │ 3 │ -│ 4 │ 3 │ 2 │ 7 │ -│ 5 │ 4 │ 1 │ 4 │ -│ 6 │ 4 │ 1 │ 8 │ - -julia> by(df, :a, :b => :b1, :c => :c1, - [:b, :c] => +, keepkeys=false) # auto-splatting, renaming and keepkeys -8×3 DataFrame -│ Row │ b1 │ c1 │ b_c_+ │ -│ │ Int64 │ Int64 │ Int64 │ -├─────┼───────┼───────┼───────┤ -│ 1 │ 2 │ 1 │ 3 │ -│ 2 │ 2 │ 5 │ 7 │ -│ 3 │ 1 │ 2 │ 3 │ -│ 4 │ 1 │ 6 │ 7 │ -│ 5 │ 2 │ 3 │ 5 │ -│ 6 │ 2 │ 7 │ 9 │ -│ 7 │ 1 │ 4 │ 5 │ -│ 8 │ 1 │ 8 │ 9 │ - -julia> by(df, :a, :b, :c => sum) # passing columns and broadcasting -8×3 DataFrame -│ Row │ a │ b │ c_sum │ -│ │ Int64 │ Int64 │ Int64 │ -├─────┼───────┼───────┼───────┤ -│ 1 │ 1 │ 2 │ 6 │ -│ 2 │ 1 │ 2 │ 6 │ -│ 3 │ 2 │ 1 │ 8 │ -│ 4 │ 2 │ 1 │ 8 │ -│ 5 │ 3 │ 2 │ 10 │ -│ 6 │ 3 │ 2 │ 10 │ -│ 7 │ 4 │ 1 │ 12 │ -│ 8 │ 4 │ 1 │ 12 │ - -julia> by(df, :a, [:b, :c] .=> Ref) -4×3 DataFrame -│ Row │ a │ b_Ref │ c_Ref │ -│ │ Int64 │ SubArra… │ SubArra… │ -├─────┼───────┼──────────┼──────────┤ -│ 1 │ 1 │ [2, 2] │ [1, 5] │ -│ 2 │ 2 │ [1, 1] │ [2, 6] │ -│ 3 │ 3 │ [2, 2] │ [3, 7] │ -│ 4 │ 4 │ [1, 1] │ [4, 8] │ - -julia> by(df, :a, AsTable(:) => Ref) -4×2 DataFrame -│ Row │ a │ a_b_c_Ref │ -│ │ Int64 │ NamedTuple… │ -├─────┼───────┼──────────────────────────────────────┤ -│ 1 │ 1 │ (a = [1, 1], b = [2, 2], c = [1, 5]) │ -│ 2 │ 2 │ (a = [2, 2], b = [1, 1], c = [2, 6]) │ -│ 3 │ 3 │ (a = [3, 3], b = [2, 2], c = [3, 7]) │ -│ 4 │ 4 │ (a = [4, 4], b = [1, 1], c = [4, 8]) │ +select(gd::GroupedDataFrame, args...; + copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) = + _combine_executor(gd, args..., copycols=copycols, keepkeys=keepkeys, + regroup=regroup, keeprows=true) + +DataFrames.transform(gd::GroupedDataFrame, args...; + copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) = + select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, + regroup=regroup) + +function 
select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) + newdf = select(gd, args..., copycols=false, regroup=false) + df = parent(gd) + copy!(_columns(df), _columns(newdf)) + x = index(df) + copy!(_names(x), _names(newdf)) + empty!(x.lookup) + for (i, n) in enumerate(x.names) + x.lookup[n] = i + end + return regroup ? gd : df +end -julia> by(df, :a, :, AsTable(Not(:a)) => sum) -8×4 DataFrame -│ Row │ a │ b │ c │ b_c_sum │ -│ │ Int64 │ Int64 │ Int64 │ Int64 │ -├─────┼───────┼───────┼───────┼─────────┤ -│ 1 │ 1 │ 2 │ 1 │ 3 │ -│ 2 │ 1 │ 2 │ 5 │ 7 │ -│ 3 │ 2 │ 1 │ 2 │ 3 │ -│ 4 │ 2 │ 1 │ 6 │ 7 │ -│ 5 │ 3 │ 2 │ 3 │ 5 │ -│ 6 │ 3 │ 2 │ 7 │ 9 │ -│ 7 │ 4 │ 1 │ 4 │ 5 │ -│ 8 │ 4 │ 1 │ 8 │ 9 │ -``` -""" -by(f::Union{Base.Callable, Pair}, d::AbstractDataFrame, cols::Any; - sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) = - combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), f, - keepkeys=keepkeys) -by(d::AbstractDataFrame, cols::Any, f::Base.Callable; - sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) = - combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), f, - keepkeys=keepkeys) -by(d::AbstractDataFrame, cols::Any, f::Pair; - sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) = - combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), f, - keepkeys=keepkeys) - -by(d::AbstractDataFrame, cols::Any, f::Union{Pair, typeof(nrow), - ColumnIndex, MultiColumnIndex}...; - sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) = - combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), - f..., keepkeys=keepkeys) +transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) = + select!(gd, :, args..., regroup=regroup) From 6908ee89d02d4d04763ea3dfcaea12a6159a91a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 28 Apr 2020 14:54:28 +0200 Subject: [PATCH 06/29] update DataFrame constructor --- src/groupeddataframe/groupeddataframe.jl | 8 
++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index 6de06309d2..f04888cad9 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -79,7 +79,7 @@ Base.names(gd::GroupedDataFrame) = names(gd.parent) Base.names(gd::GroupedDataFrame, cols) = names(gd.parent, cols) _names(gd::GroupedDataFrame) = _names(gd.parent) -function DataFrame(gd::GroupedDataFrame; copycols::Bool=true) +function DataFrame(gd::GroupedDataFrame; copycols::Bool=true, keepkeys::Bool=true) if !copycols throw(ArgumentError("It is not possible to construct a `DataFrame`" * "from GroupedDataFrame with `copycols=false`")) @@ -94,7 +94,11 @@ function DataFrame(gd::GroupedDataFrame; copycols::Bool=true) doff += n end resize!(idx, doff - 1) - parent(gd)[idx, :] + if keepkeys + return parent(gd)[idx, :] + else + return parent(gd)[idx, Not(gd.cols)] + end end From 7b644dde03b55cef63b53a15472cf132c99084ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 28 Apr 2020 15:04:05 +0200 Subject: [PATCH 07/29] fix handling of aggregates --- src/groupeddataframe/splitapplycombine.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index e5cb15bb4a..a73eff4535 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -1,7 +1,3 @@ -# -# groupby(), map(), combine(), by() and related -# - """ groupby(d::AbstractDataFrame, cols; sort=false, skipmissing=false) @@ -1141,7 +1137,7 @@ function _combine(f::AbstractVector{<:Pair}, if isnothing(agg2idx_map) agg2idx_map = _agg2idx_map_helper(idx, idx_agg) end - res[i] = idx, res[i][2][agg2idx_map] + res[i] = idx_agg, res[i][2][agg2idx_map] elseif idx != res[i][1] if keeprows throw(ArgumentError("all functions must return vectors of " * 
@@ -1155,6 +1151,10 @@ function _combine(f::AbstractVector{<:Pair}, end end + # remember that here first field in res[i] is not useful - it is just needed + # to keep track how the column was generated + # a correct index is stored in idx variable + for (i, (col_idx, col)) in enumerate(res) if keeprows && res[i][1] !== idx_keeprows # we need to reorder the column newcol = similar(col) From 27532350b7068be01ac5337701b6b9c84ec9b421 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 28 Apr 2020 17:25:00 +0200 Subject: [PATCH 08/29] code cleanup --- src/groupeddataframe/splitapplycombine.jl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index a73eff4535..144f84f505 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -636,7 +636,7 @@ function combine_helper(f, gd::GroupedDataFrame, end end # all groups must be filled - @assert minimum(grouups) == 1 + @assert minimum(groups) == 1 return GroupedDataFrame(newparent, collect(1:length(gd.cols)), groups, collect(1:length(idx)), starts, ends, j, nothing) @@ -1172,16 +1172,20 @@ function _combine(f::AbstractVector{<:Pair}, return idx, DataFrame(collect(AbstractVector, outcols), nms) end -function _combine(fun::Base.Callable, gd::GroupedDataFrame, ::Nothing) +function _combine(fun::Base.Callable, gd::GroupedDataFrame, ::Nothing, + copycols::Bool, keeprows::Bool) + @assert copycols && !keeprows firstres = fun(gd[1]) idx, outcols, nms = _combine_multicol(firstres, fun, gd, nothing) valscat = DataFrame(collect(AbstractVector, outcols), nms) return idx, valscat end -function _combine(p::Pair, gd::GroupedDataFrame, ::Nothing) +function _combine(p::Pair, gd::GroupedDataFrame, ::Nothing, + copycols::Bool, keeprows::Bool) # here p should not be normalized as we allow tabular return value from fun # map and combine should not dispatch here 
if p is isagg + @assert copycols && !keeprows source_cols, (fun, out_col) = normalize_selection(index(parent(gd)), p) parentdf = parent(gd) if source_cols isa Int From 2a03190e1be0d4cffe56d1908754faa057d8b792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 28 Apr 2020 19:54:25 +0200 Subject: [PATCH 09/29] improve canonical check + start rewriting tests --- src/groupeddataframe/groupeddataframe.jl | 4 - src/groupeddataframe/splitapplycombine.jl | 17 ++- test/grouping.jl | 124 ++++++++++++---------- test/string.jl | 25 ++--- 4 files changed, 84 insertions(+), 86 deletions(-) diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index f04888cad9..5a4349845e 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -1,7 +1,3 @@ -# -# Type definition and basic methods -# - """ GroupedDataFrame diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 144f84f505..b9123f88dd 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -169,16 +169,15 @@ function groupby(df::AbstractDataFrame, cols; return gd end -function _check_cannonical(gdf::GroupedDataFrame) - gmin, gmax = extrema(gdf.groups) - @assert length(gdf.starts) == length(gdf.ends) == gmax - @assert gmin <= 1 - (gdf.starts[1] != 1 || gdf.ends[end] != nrow(parent(gdf))) && return false - for i in 2:length(gdf.starts) - gdf.starts[i] - gdf.ends[i-1] != 1 && return false +function _check_cannonical(gd::GroupedDataFrame) + groups = gd.groups + isempty(groups) && return true + maxseen = 1 + for g in groups + 1 <= g <= maxseen + 1 || return false + maxseen = max(maxseen, g) end - # gmin == 0 means we have dropped groups which is not possible here - @assert gmin == 1 + @assert maxseen == gd.ngroups return true end diff --git a/test/grouping.jl b/test/grouping.jl index c9587302c6..349b06b98b 100644 --- 
a/test/grouping.jl +++ b/test/grouping.jl @@ -28,12 +28,20 @@ function _levels!(x::PooledArray, levels::AbstractVector) end _levels!(x::CategoricalArray, levels::AbstractVector) = levels!(x, levels) -function groupby_checked(df::AbstractDataFrame, keys, args...; kwargs...) - ogd = groupby(df, keys, args...; kwargs...) - +function validate_gdf(ogd::GroupedDataFrame) # To return original object to test when indices have not been computed gd = deepcopy(ogd) + @assert allunique(gd.cols) + @assert minimum(gd.cols) >= 1 + @assert maximum(gd.cols) <= ncol(parent(gd)) + + g = sort!(unique(gd.groups)) + @assert 0 <= g[1] <= 1 + @assert g == g[1]:g[end] + @assert length(gd.starts) == length(gd.ends) == g[end] + @assert isperm(gd.idx) + # checking that groups field is consistent with other fields # (since == and isequal do not use it) # and that idx is increasing per group @@ -54,9 +62,6 @@ function groupby_checked(df::AbstractDataFrame, keys, args...; kwargs...) # correct start-end relations for i in eachindex(se) - firstkeys = gd.parent[gd.idx[se[i][1]], gd.cols] - # all grouping keys must be equal within a group - @assert all(j -> gd.parent[gd.idx[j], gd.cols] ≅ firstkeys, se[i][1]:se[i][2]) @assert se[i][1] <= se[i][2] if i > 1 # the blocks returned by groupby must be continuous @@ -73,7 +78,12 @@ function groupby_checked(df::AbstractDataFrame, keys, args...; kwargs...) @test allunique(eachrow(gd.parent[gd.idx[gd.starts], gd.cols])) end - ogd +end + +function groupby_checked(df::AbstractDataFrame, keys, args...; kwargs...) + ogd = groupby(df, keys, args...; kwargs...) 
+ validate_gdf(ogd) + return ogd end @testset "parent" begin @@ -86,11 +96,11 @@ end @testset "consistency" begin df = DataFrame(a = [1, 1, 2, 2], b = [5, 6, 7, 8], c = 1:4) push!(df.c, 5) - @test_throws AssertionError gd = groupby(df, :a) + @test_throws AssertionError groupby(df, :a) df = DataFrame(a = [1, 1, 2, 2], b = [5, 6, 7, 8], c = 1:4) push!(DataFrames._columns(df), df[:, :a]) - @test_throws AssertionError gd = groupby(df, :a) + @test_throws AssertionError groupby(df, :a) end @testset "accepted columns" begin @@ -142,40 +152,13 @@ end sres3 = sort(res3, colssym) sres4 = sort(res4, colssym) - # by() without groups sorting - @test sort(by(identity, df, cols), colssym) == shcatdf - @test sort(by(df -> df[1, :], df, cols), colssym) == - shcatdf[.!nonunique(shcatdf, colssym), :] - @test by(f1, df, cols) == res - @test by(f2, df, cols) == res - @test rename(by(f3, df, cols), :x1 => :xmax) == res - @test by(f4, df, cols) == res2 - @test by(f5, df, cols) == res2 - @test by(f6, df, cols) == res3 - @test sort(by(f7, df, cols), colssym) == sres4 - @test sort(by(f8, df, cols), colssym) == sres4 - - # by() with groups sorting - @test by(identity, df, cols, sort=true) == shcatdf - @test by(df -> df[1, :], df, cols, sort=true) == - shcatdf[.!nonunique(shcatdf, colssym), :] - @test by(f1, df, cols, sort=true) == sres - @test by(f2, df, cols, sort=true) == sres - @test rename(by(f3, df, cols, sort=true), :x1 => :xmax) == sres - @test by(f4, df, cols, sort=true) == sres2 - @test by(f5, df, cols, sort=true) == sres2 - @test by(f6, df, cols, sort=true) == sres3 - @test by(f7, df, cols, sort=true) == sres4 - @test by(f8, df, cols, sort=true) == sres4 - - @test by(f1, df, [:a]) == by(f1, df, :a) - @test by(f1, df, [:a], sort=true) == by(f1, df, :a, sort=true) - # groupby() without groups sorting gd = groupby_checked(df, cols) @test names(parent(gd))[gd.cols] == string.(colssym) df_comb = combine(identity, gd) @test sort(df_comb, colssym) == shcatdf + @test sort(combine(df -> 
df[1, :], gd), colssym) == + shcatdf[.!nonunique(shcatdf, colssym), :] df_ref = DataFrame(gd) @test sort(hcat(df_ref[!, cols], df_ref[!, Not(cols)]), colssym) == shcatdf @test df_ref.x == df_comb.x @@ -196,6 +179,8 @@ end @test all(gd[i][!, colssym[2]] .== sres[i, colssym[2]]) end @test combine(identity, gd) == shcatdf + @test combine(df -> df[1, :], gd, cols, sort=true) == + shcatdf[.!nonunique(shcatdf, colssym), :] df_ref = DataFrame(gd) @test hcat(df_ref[!, cols], df_ref[!, Not(cols)]) == shcatdf @test combine(f1, gd) == sres @@ -207,10 +192,10 @@ end @test combine(f7, gd) == sres4 @test combine(f8, gd) == sres4 - # map() without and with groups sorting + # combine() with regroup without and with groups sorting for sort in (false, true) gd = groupby_checked(df, cols, sort=sort) - v = map(d -> d[:, [:x]], gd) + v = combine(d -> d[:, [:x]], gd, regroup=true) @test length(gd) == length(v) nms = [colssym; :x] @test v[1] == gd[1][:, nms] @@ -219,24 +204,33 @@ end v[3] == gd[3][:, nms] && v[4] == gd[4][:, nms] @test names(parent(v))[v.cols] == string.(colssym) - v = map(f1, gd) - @test vcat(v[1], v[2], v[3], v[4]) == by(f1, df, cols, sort=sort) - v = map(f2, gd) - @test vcat(v[1], v[2], v[3], v[4]) == by(f2, df, cols, sort=sort) - v = map(f3, gd) - @test vcat(v[1], v[2], v[3], v[4]) == by(f3, df, cols, sort=sort) - v = map(f4, gd) - @test vcat(v[1], v[2], v[3], v[4]) == by(f4, df, cols, sort=sort) - v = map(f5, gd) - @test vcat(v[1], v[2], v[3], v[4]) == by(f5, df, cols, sort=sort) - v = map(f5, gd) - @test vcat(v[1], v[2], v[3], v[4]) == by(f5, df, cols, sort=sort) - v = map(f6, gd) - @test vcat(v[1], v[2], v[3], v[4]) == by(f6, df, cols, sort=sort) - v = map(f7, gd) - @test vcat(v[1], v[2], v[3], v[4]) == by(f7, df, cols, sort=sort) - v = map(f8, gd) - @test vcat(v[1], v[2], v[3], v[4]) == by(f8, df, cols, sort=sort) + v = combine(f1, gd, regroup=true) + @test extrema(v.grous) == extrema(gd.groups) + @test vcat(v[1], v[2], v[3], v[4]) == combine(f1, gd) + v = 
combine(f2, gd, regroup=true) + @test extrema(v.grous) == extrema(gd.groups) + @test vcat(v[1], v[2], v[3], v[4]) == combine(f2, gd) + v = combine(f3, gd, regroup=true) + @test extrema(v.grous) == extrema(gd.groups) + @test vcat(v[1], v[2], v[3], v[4]) == combine(f3, gd) + v = combine(f4, gd, regroup=true) + @test extrema(v.grous) == extrema(gd.groups) + @test vcat(v[1], v[2], v[3], v[4]) == combine(f4, gd) + v = combine(f5, gd, regroup=true) + @test extrema(v.grous) == extrema(gd.groups) + @test vcat(v[1], v[2], v[3], v[4]) == combine(f5, gd) + v = combine(f5, gd, regroup=true) + @test extrema(v.grous) == extrema(gd.groups) + @test vcat(v[1], v[2], v[3], v[4]) == combine(f5, gd) + v = combine(f6, gd, regroup=true) + @test extrema(v.grous) == extrema(gd.groups) + @test vcat(v[1], v[2], v[3], v[4]) == combine(f6, gd) + v = combine(f7, gd, regroup=true) + @test extrema(v.grous) == extrema(gd.groups) + @test vcat(v[1], v[2], v[3], v[4]) == combine(f7, gd) + v = combine(f8, gd, regroup=true) + @test extrema(v.grous) == extrema(gd.groups) + @test vcat(v[1], v[2], v[3], v[4]) == combine(f8, gd) end end @@ -259,7 +253,7 @@ end df = DataFrame(v1=x, v2=x) groupby_checked(df, [:v1, :v2]) - df2 = by(e->1, DataFrame(x=Int64[]), :x) + df2 = combine(e->1, groupby(DataFrame(x=Int64[]), :x)) @test size(df2) == (0, 1) @test sum(df2.x) == 0 @@ -1972,4 +1966,16 @@ end @test_throws ArgumentError by(df, :g, AsTable([:x, :y]) => ByRow(x -> df[1, :])) end +@testset "test correctness of regrouping" begin + df = DataFrame(g=[2,2,1,3,1,2,1,2,3]) + gdf = groupby(df, :g) + gdf2 = combine(identity, gdf, regroup=true) + @test combine(gdf, :g => sum) == combine(gdf2, :g => sum) + + df.id = 1:9 + @test select(gdf, :g => sum) == + sort!(combine(gdf, :g => sum, :id), :id)[:, Not(end)] + @test select(gdf2, :g => sum) == combine(gdf2, :g => sum, :g) +end + end # module diff --git a/test/string.jl b/test/string.jl index 817dc830aa..233b9cd8b8 100644 --- a/test/string.jl +++ b/test/string.jl @@ -166,25 
+166,22 @@ end @test haskey(k[1], :a) == haskey(k[1], "a") == false @test k[1].g == k[1]."g" == k[1][:g] == k[1]["g"] - @test by(df, :g, :a) == by(df, "g", "a") == combine(gdf, :a) == combine(gdf, "a") == - by(df, :g, [:a]) == by(df, "g", ["a"]) == combine(gdf, [:a]) == combine(gdf, ["a"]) + @test combine(gdf, :a) == combine(gdf, "a") == + combine(gdf, [:a]) == combine(gdf, ["a"]) - @test map("a" => identity, gdf) == map(:a => identity, gdf) - @test map(["a"] => identity, gdf) == map([:a] => identity, gdf) - @test map(nrow => :n, gdf) == map(nrow => "n", gdf) + @test combine("a" => identity, gdf, regroup=true) == + combine(:a => identity, gdf, regroup=true) + @test combine(["a"] => identity, gdf, regroup=true) == + combine([:a] => identity, gdf, regroup=true) + @test combine(nrow => :n, gdf, regroup=true) == + combine(nrow => "n", gdf, regroup=true) @test combine("a" => identity, gdf) == combine(:a => identity, gdf) == - combine(gdf, "a" => identity) == combine(gdf, :a => identity) == - by("a" => identity, df, :g) == by(:a => identity, df, :g) == - by(df, :g, "a" => identity) == by(df, :g, :a => identity) + combine(gdf, "a" => identity) == combine(gdf, :a => identity) @test combine(["a"] => identity, gdf) == combine([:a] => identity, gdf) == - combine(gdf, ["a"] => identity) == combine(gdf, [:a] => identity) == - by(["a"] => identity, df, :g) == by([:a] => identity, df, :g) == - by(df, :g, ["a"] => identity) == by(df, :g, [:a] => identity) + combine(gdf, ["a"] => identity) == combine(gdf, [:a] => identity) @test combine(nrow => :n, gdf) == combine(nrow => "n", gdf) == - combine(gdf, nrow => :n) == combine(gdf, nrow => "n") == - by(nrow => :n, df, :g) == by(nrow => "n", df, :g) == - by(df, :g, nrow => :n) == by(df, :g, nrow => "n") + combine(gdf, nrow => :n) == combine(gdf, nrow => "n") end @testset "DataFrameRow" begin From 7b86eb8244f8bff0c2aef030215b9c1ae662ca15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 28 Apr 2020 
23:52:19 +0200 Subject: [PATCH 10/29] allow changing sort order of groups in cannonical test --- src/groupeddataframe/splitapplycombine.jl | 23 ++------ test/grouping.jl | 65 ++++++++++++----------- 2 files changed, 37 insertions(+), 51 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index b9123f88dd..13d7cc0db1 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -169,17 +169,7 @@ function groupby(df::AbstractDataFrame, cols; return gd end -function _check_cannonical(gd::GroupedDataFrame) - groups = gd.groups - isempty(groups) && return true - maxseen = 1 - for g in groups - 1 <= g <= maxseen + 1 || return false - maxseen = max(maxseen, g) - end - @assert maxseen == gd.ngroups - return true -end +_check_cannonical(gd::GroupedDataFrame) = !any(==(0), gd.groups) const F_TYPE_RULES = """ @@ -602,13 +592,8 @@ function combine_helper(f, gd::GroupedDataFrame, # in this case we are sure that the result GroupedDataFrame has the # same structure as the source # we do not copy data as it should be safe - we never mutate fields of gd - if isnothing(getfield(gd, :keymap)) - return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx, - gd.starts, gd.ends, gd.ngroups, nothing) - else - return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx, - gd.starts, gd.ends, gd.ngroups, gd.keymap) - end + return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx, + gd.starts, gd.ends, gd.ngroups, getfield(gd, :keymap)) else starts = Vector{Int}(undef, length(gd)) ends = Vector{Int}(undef, length(gd)) @@ -1036,7 +1021,7 @@ function _combine(f::AbstractVector{<:Pair}, if keeprows if !_check_cannonical(gd) - throw(ArgumentError("select or transform functions require that" * + throw(ArgumentError("select or transform functions require that " * "GroupedDataFrame is not sorted or subsetted")) end idx_keeprows = Vector{Int}(undef, nrow(parent(gd))) diff --git 
a/test/grouping.jl b/test/grouping.jl index 349b06b98b..9888728410 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -41,6 +41,7 @@ function validate_gdf(ogd::GroupedDataFrame) @assert g == g[1]:g[end] @assert length(gd.starts) == length(gd.ends) == g[end] @assert isperm(gd.idx) + @assert length(gd.idx) == length(gd.groups) == nrow(parent(gd)) # checking that groups field is consistent with other fields # (since == and isequal do not use it) @@ -88,7 +89,7 @@ end @testset "parent" begin df = DataFrame(a = [1, 1, 2, 2], b = [5, 6, 7, 8]) - gd = groupby(df, :a) + gd = groupby_checked(df, :a) @test parent(gd) === df @test_throws ArgumentError identity.(gd) end @@ -96,20 +97,20 @@ end @testset "consistency" begin df = DataFrame(a = [1, 1, 2, 2], b = [5, 6, 7, 8], c = 1:4) push!(df.c, 5) - @test_throws AssertionError groupby(df, :a) + @test_throws AssertionError groupby_checked(df, :a) df = DataFrame(a = [1, 1, 2, 2], b = [5, 6, 7, 8], c = 1:4) push!(DataFrames._columns(df), df[:, :a]) - @test_throws AssertionError groupby(df, :a) + @test_throws AssertionError groupby_checked(df, :a) end @testset "accepted columns" begin df = DataFrame(A=[1,1,1,2,2,2], B=[1,2,1,2,1,2], C=1:6) - @test groupby(df, [1,2]) == groupby(df, 1:2) == groupby(df, [:A, :B]) - @test groupby(df, [2,1]) == groupby(df, 2:-1:1) == groupby(df, [:B, :A]) + @test groupby_checked(df, [1,2]) == groupby_checked(df, 1:2) == groupby_checked(df, [:A, :B]) + @test groupby_checked(df, [2,1]) == groupby_checked(df, 2:-1:1) == groupby_checked(df, [:B, :A]) end -@testset "by, groupby and map(::Function, ::GroupedDataFrame)" begin +@testset "groupby and combine(::Function, ::GroupedDataFrame)" begin Random.seed!(1) df = DataFrame(a = repeat(Union{Int, Missing}[1, 3, 2, 4], outer=[2]), b = repeat(Union{Int, Missing}[2, 1], outer=[4]), @@ -152,7 +153,7 @@ end sres3 = sort(res3, colssym) sres4 = sort(res4, colssym) - # groupby() without groups sorting + # groupby_checked() without groups sorting gd = 
groupby_checked(df, cols) @test names(parent(gd))[gd.cols] == string.(colssym) df_comb = combine(identity, gd) @@ -171,7 +172,7 @@ end @test sort(combine(f7, gd), colssym) == sort(res4, colssym) @test sort(combine(f8, gd), colssym) == sort(res4, colssym) - # groupby() with groups sorting + # groupby_checked() with groups sorting gd = groupby_checked(df, cols, sort=true) @test names(parent(gd))[gd.cols] == string.(colssym) for i in 1:length(gd) @@ -253,7 +254,7 @@ end df = DataFrame(v1=x, v2=x) groupby_checked(df, [:v1, :v2]) - df2 = combine(e->1, groupby(DataFrame(x=Int64[]), :x)) + df2 = combine(e->1, groupby_checked(DataFrame(x=Int64[]), :x)) @test size(df2) == (0, 1) @test sum(df2.x) == 0 @@ -349,7 +350,7 @@ end df = DataFrame(x = [1, 2, 3], y = [2, 3, 1]) # Test function returning DataFrameRow - res = by(d -> DataFrameRow(d, 1, :), df, :x) + res = combine(d -> DataFrameRow(d, 1, :), groupby_checked(df, :x)) @test res == DataFrame(x=df.x, y=df.y) # Test function returning Tuple @@ -359,7 +360,7 @@ end # Test with some groups returning empty data frames @test by(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), df, :x) == DataFrame(x=[2, 3], z=[1, 1]) - v = map(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), groupby_checked(df, :x)) + v = combine(d -> d.x == [1] ? 
DataFrame(z=[]) : DataFrame(z=1), groupby_checked(df, :x), regroup=true) @test length(v) == 2 @test vcat(v[1], v[2]) == DataFrame(x=[2, 3], z=[1, 1]) @@ -692,7 +693,7 @@ end @test_throws ArgumentError by(df, :a, nrow, nrow) @test_throws ArgumentError by(df, :a, [nrow]) - gd = groupby(df, :a) + gd = groupby_checked(df, :a) # Only test that different combine syntaxes work, # and rely on tests below for deeper checks @@ -958,7 +959,7 @@ end @testset "combine and map with columns named like grouping keys" begin df = DataFrame(x=["a", "a", "b", missing], y=1:4) - gd = groupby(df, :x) + gd = groupby_checked(df, :x) @test combine(identity, gd) ≅ df @test combine(d -> d[:, [2, 1]], gd) ≅ df @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) @@ -966,7 +967,7 @@ end @test map(d -> d[:, [2, 1]], gd) ≅ gd @test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) - gd = groupby(df, :x, skipmissing=true) + gd = groupby_checked(df, :x, skipmissing=true) @test combine(identity, gd) == df[1:3, :] @test combine(d -> d[:, [2, 1]], gd) == df[1:3, :] @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) @@ -1199,7 +1200,7 @@ end \\end{tabular} """ - gd = groupby(DataFrame(a=[Symbol("&")], b=["&"]), [1,2]) + gd = groupby_checked(DataFrame(a=[Symbol("&")], b=["&"]), [1,2]) summary_str = summary(gd) @test summary_str == "$GroupedDataFrame with 1 group based on keys: a, b" @test sprint(show, gd) === """ @@ -1231,7 +1232,7 @@ end \\end{tabular} """ - gd = groupby(DataFrame(a = [1,2], b = [1.0, 2.0]), :a) + gd = groupby_checked(DataFrame(a = [1,2], b = [1.0, 2.0]), :a) @test sprint(show, "text/csv", gd) == """ "a","b" 1,1.0 @@ -1331,7 +1332,7 @@ end df = DataFrame(a=[2, 2, missing, missing, 1, 1, 3, 3], b=1:8) for dosort in (false, true), doskipmissing in (false, true) @test by(df, :a, :b=>sum, sort=dosort, skipmissing=doskipmissing) ≅ - combine(groupby(df, :a, sort=dosort, skipmissing=doskipmissing), :b=>sum) + 
combine(groupby_checked(df, :a, sort=dosort, skipmissing=doskipmissing), :b=>sum) end end @@ -1376,7 +1377,7 @@ end "│ 2 │ 2 │ 1 │ 2 │\n│ 3 │ 2 │ 2 │ 3 │" df = DataFrame(a=[1, 1, 2, 2, 2], b=1:5) - gd = groupby(df, :a) + gd = groupby_checked(df, :a) @test_throws ArgumentError combine(gd) end @@ -1486,8 +1487,8 @@ end @test cnt == length(gd) # Indexing using another GroupedDataFrame instance should fail - gd2 = groupby(df, cols, skipmissing=true) - gd3 = groupby(df, cols, skipmissing=true) + gd2 = groupby_checked(df, cols, skipmissing=true) + gd3 = groupby_checked(df, cols, skipmissing=true) @test gd2 == gd3 # Use GDF's without missing so they compare equal @test_throws ErrorException gd3[first(keys(gd2))] @@ -1501,7 +1502,7 @@ end b = repeat(1:2, outer=[6]), c = 1:12) - gd = groupby(df, [:a, :b]) + gd = groupby_checked(df, [:a, :b]) @test map(repr, keys(gd)) == [ "GroupKey: (a = :foo, b = 1)", @@ -1649,7 +1650,7 @@ end end @testset "haskey for GroupKey" begin - gdf = groupby(DataFrame(a=1, b=2, c=3), [:a, :b]) + gdf = groupby_checked(DataFrame(a=1, b=2, c=3), [:a, :b]) k = keys(gdf)[1] @test !haskey(k, 0) @test haskey(k, 1) @@ -1666,7 +1667,7 @@ end @test_throws MethodError haskey(gdf, true) @test haskey(gdf, k) - @test_throws ArgumentError haskey(gdf, keys(groupby(DataFrame(a=1,b=2,c=3), [:a, :b]))[1]) + @test_throws ArgumentError haskey(gdf, keys(groupby_checked(DataFrame(a=1,b=2,c=3), [:a, :b]))[1]) @test_throws BoundsError haskey(gdf, DataFrames.GroupKey(gdf, 0)) @test_throws BoundsError haskey(gdf, DataFrames.GroupKey(gdf, 2)) @test haskey(gdf, (1,2)) @@ -1733,11 +1734,11 @@ end @test by(df, :g, :x1 => :z) == by(df, :g, [:x1 => :z]) == by(:x1 => :z, df, :g) == - combine(groupby(df, :g), :x1 => :z) == - combine(groupby(df, :g), [:x1 => :z]) == - combine(:x1 => :z, groupby(df, :g)) == + combine(groupby_checked(df, :g), :x1 => :z) == + combine(groupby_checked(df, :g), [:x1 => :z]) == + combine(:x1 => :z, groupby_checked(df, :g)) == DataFrame(g=[1,1,1,2,2,2], 
z=1:6) - @test map(:x1 => :z, groupby(df, :g)) == groupby(DataFrame(g=[1,1,1,2,2,2], z=1:6), :g) + @test map(:x1 => :z, groupby_checked(df, :g)) == groupby_checked(DataFrame(g=[1,1,1,2,2,2], z=1:6), :g) end @testset "hard tabular return value cases" begin @@ -1810,7 +1811,7 @@ end @testset "additional do_call tests" begin Random.seed!(1234) df = DataFrame(g = rand(1:10, 100), x1 = rand(1:1000, 100)) - gdf = groupby(df, :g) + gdf = groupby_checked(df, :g) @test combine(gdf, [] => () -> 1, :x1 => length) == combine(gdf) do sdf (;[:function => 1, :x1_length => nrow(sdf)]...) @@ -1927,7 +1928,7 @@ end @testset "AsTable tests" begin df = DataFrame(g=[1,1,1,2,2], x=1:5, y=6:10) - gdf = groupby(df, :g) + gdf = groupby_checked(df, :g) # whole column 4 options of single pair passed @test by(df, :g , AsTable([:x, :y]) => Ref) == @@ -1936,7 +1937,7 @@ end combine(AsTable([:x, :y]) => Ref, gdf) == DataFrame(g=1:2, x_y_Ref=[(x=[1,2,3], y=[6,7,8]), (x=[4,5], y=[9,10])]) @test map(AsTable([:x, :y]) => Ref, gdf) == - groupby(by(df, :g , AsTable([:x, :y]) => Ref), :g) + groupby_checked(by(df, :g , AsTable([:x, :y]) => Ref), :g) @test by(df, :g, AsTable(1) => Ref) == combine(gdf, AsTable(1) => Ref) == @@ -1951,7 +1952,7 @@ end DataFrame(g=[1,1,1,2,2], x_y_function=[[(x=1,y=6)], [(x=2,y=7)], [(x=3,y=8)], [(x=4,y=9)], [(x=5,y=10)]]) @test map(AsTable([:x, :y]) => ByRow(x -> [x]), gdf) == - groupby(by(df, :g, AsTable([:x, :y]) => ByRow(x -> [x])), :g) + groupby_checked(by(df, :g, AsTable([:x, :y]) => ByRow(x -> [x])), :g) # whole column and ByRow test for multiple pairs passed @test by(df, :g, [:x, :y], [AsTable(v) => (x -> -x[1]) for v in [:x, :y]]) == @@ -1968,7 +1969,7 @@ end @testset "test correctness of regrouping" begin df = DataFrame(g=[2,2,1,3,1,2,1,2,3]) - gdf = groupby(df, :g) + gdf = groupby_checked(df, :g) gdf2 = combine(identity, gdf, regroup=true) @test combine(gdf, :g => sum) == combine(gdf2, :g => sum) From cb94903eb021076134ea02887a48b02e71424c28 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 29 Apr 2020 12:25:29 +0200 Subject: [PATCH 11/29] make old tests pass --- src/abstractdataframe/selection.jl | 5 +- src/deprecated.jl | 18 +- src/groupeddataframe/splitapplycombine.jl | 2 +- test/deprecated.jl | 8 + test/grouping.jl | 607 ++++++++++------------ test/select.jl | 68 ++- 6 files changed, 348 insertions(+), 360 deletions(-) diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index f211ee09b8..830565791c 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -188,7 +188,7 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable}, if !allow_resizing_newdf[] && ncol(newdf) == 0 && length(res) != nrow(df) throw(ArgumentError("length $(length(res)) of vector returned from " * "function $fun is different than number of rows" * - "$(nrow(df)) of the source data frame.")) + " $(nrow(df)) of the source data frame.")) end allow_resizing_newdf[] = false respar = parent(res) @@ -699,7 +699,8 @@ function _manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool) push!(cs_vec, v) end end - return _process(dfv, [normalize_selection(index(dfv), c) for c in cs_vec], true, true) + return _process(dfv, [normalize_selection(index(dfv), c) for c in cs_vec], + true, keeprows) else # we do not support transformations here # newinds contains only indexing; making it Vector{Any} avoids some compilation diff --git a/src/deprecated.jl b/src/deprecated.jl index e95b82aa6e..b7d713e917 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -360,28 +360,28 @@ end export aggregate function aggregate(d::AbstractDataFrame, f::Any; sort::Bool=false) - df = select(d, names(d) .=> [f]) + df = combine(d, names(d) .=> [f]) if sort Base.depwarn("`aggregate(d, f, sort=true)` is deprecated. 
" * - "Instead use `sort!(select(d, names(d) .=> f))`.", :aggregate) + "Instead use `sort!(combine(d, names(d) .=> f))`.", :aggregate) sort!(df) else Base.depwarn("`aggregate(d, f)` is deprecated. " * - "Instead use `select(d, names(d) .=> f)`.", :aggregate) + "Instead use `combine(d, names(d) .=> f)`.", :aggregate) end return df end function aggregate(d::AbstractDataFrame, fs::AbstractVector; sort::Bool=false) - df = hcat([select(d, names(d) .=> [f]) for f in fs]..., makeunique=true) + df = hcat([combine(d, names(d) .=> [f]) for f in fs]..., makeunique=true) if sort Base.depwarn("`aggregate(d, fs, sort=true)` is deprecated. Instead" * - " use `sort!(select(d, [names(d) .=> f for f in fs]...))` " * + " use `sort!(combine(d, [names(d) .=> f for f in fs]...))` " * "if functions in `fs` have unique names.", :aggregate) sort!(df) else Base.depwarn("`aggregate(d, fs)` is deprecated. Instead use " * - "`select(d, [names(d) .=> f for f in fs]...)` if functions " * + "`combine(d, [names(d) .=> f for f in fs]...)` if functions " * "in `fs` have unique names.", :aggregate) end return df @@ -424,7 +424,8 @@ function aggregate(d::AbstractDataFrame, cols, f::Any; sort::Bool=false, skipmissing::Bool=false) Base.depwarn("`aggregate(d, cols, f, sort=$sort, skipmissing=$skipmissing)` " * "is deprecated. Instead use " * - "by(gd, cols, names(gd) .=> f, sort=$sort, skipmissing=$skipmissing)`", + "combine(groupby(d, cols, sort=$sort, skipmissing=$skipmissing)," * + " names(d, Not(cols)) .=> f)`", :aggregate) gd = groupby(d, cols, sort=sort, skipmissing=skipmissing) df = combine(gd, valuecols(gd) .=> [f]) @@ -435,7 +436,8 @@ function aggregate(d::AbstractDataFrame, cols, fs::AbstractVector; sort::Bool=false, skipmissing::Bool=false) Base.depwarn("`aggregate(d, cols, fs, sort=$sort, skipmissing=$skipmissing)` " * " is deprecated. 
Instead use " * - "by(gd, cols, [names(gd) .=> f for f in fs]..., sort=$sort, skipmissing=$skipmissing)`" * + "combine(groupby(d, cols, sort=$sort, skipmissing=$skipmissing), "* + "[names(d, Not(cols)) .=> f for f in fs]...)`" * " if functions in `fs` have unique names.", :aggregate) gd = groupby(d, cols, sort=sort, skipmissing=skipmissing) df = hcat([combine(gd, valuecols(gd) .=> [f], keepkeys=i==1) for (i, f) in enumerate(fs)]..., diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 13d7cc0db1..5ebc609c5e 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -436,7 +436,7 @@ function combine(p::Pair, gd::GroupedDataFrame; # verify if it is not better to use a fast path, which we achieve # by moving to combine(::GroupedDataFrame, ::AbstractVector) method if isagg(p_from => (p_to isa Pair ? first(p_to) : p_to)) || p_from === nrow - return combine(gd, [p], keepkeys=keepkeys) + return combine(gd, [p], keepkeys=keepkeys, regroup=regroup) end if p_from isa Tuple diff --git a/test/deprecated.jl b/test/deprecated.jl index 39236be972..11550a2b2e 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -696,6 +696,14 @@ end deleterows!(DataFrame(x=[1, 2]), [true, false]) == DataFrame(x=[2]) end +@testset "by skipmissing and sort" begin + df = DataFrame(a=[2, 2, missing, missing, 1, 1, 3, 3], b=1:8) + for dosort in (false, true), doskipmissing in (false, true) + @test by(df, :a, :b=>sum, sort=dosort, skipmissing=doskipmissing) ≅ + combine(groupby(df, :a, sort=dosort, skipmissing=doskipmissing), :b=>sum) + end +end + global_logger(old_logger) end # module diff --git a/test/grouping.jl b/test/grouping.jl index 9888728410..a0285dac76 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -32,14 +32,20 @@ function validate_gdf(ogd::GroupedDataFrame) # To return original object to test when indices have not been computed gd = deepcopy(ogd) - @assert allunique(gd.cols) - 
@assert minimum(gd.cols) >= 1 - @assert maximum(gd.cols) <= ncol(parent(gd)) + if !isempty(gd.cols) + @assert allunique(gd.cols) + @assert minimum(gd.cols) >= 1 + @assert maximum(gd.cols) <= ncol(parent(gd)) + end g = sort!(unique(gd.groups)) - @assert 0 <= g[1] <= 1 - @assert g == g[1]:g[end] - @assert length(gd.starts) == length(gd.ends) == g[end] + if length(gd) > 0 + @assert 0 <= g[1] <= 1 + @assert g == g[1]:g[end] + @assert length(gd.starts) == length(gd.ends) == g[end] + else + @assert length(gd.starts) == length(gd.ends) == 0 + end @assert isperm(gd.idx) @assert length(gd.idx) == length(gd.groups) == nrow(parent(gd)) @@ -78,7 +84,7 @@ function validate_gdf(ogd::GroupedDataFrame) # all groups have different grouping keys @test allunique(eachrow(gd.parent[gd.idx[gd.starts], gd.cols])) end - + return ogd end function groupby_checked(df::AbstractDataFrame, keys, args...; kwargs...) @@ -106,8 +112,10 @@ end @testset "accepted columns" begin df = DataFrame(A=[1,1,1,2,2,2], B=[1,2,1,2,1,2], C=1:6) - @test groupby_checked(df, [1,2]) == groupby_checked(df, 1:2) == groupby_checked(df, [:A, :B]) - @test groupby_checked(df, [2,1]) == groupby_checked(df, 2:-1:1) == groupby_checked(df, [:B, :A]) + @test groupby_checked(df, [1,2]) == groupby_checked(df, 1:2) == + groupby_checked(df, [:A, :B]) + @test groupby_checked(df, [2,1]) == groupby_checked(df, 2:-1:1) == + groupby_checked(df, [:B, :A]) end @testset "groupby and combine(::Function, ::GroupedDataFrame)" begin @@ -180,7 +188,7 @@ end @test all(gd[i][!, colssym[2]] .== sres[i, colssym[2]]) end @test combine(identity, gd) == shcatdf - @test combine(df -> df[1, :], gd, cols, sort=true) == + @test combine(df -> df[1, :], gd) == shcatdf[.!nonunique(shcatdf, colssym), :] df_ref = DataFrame(gd) @test hcat(df_ref[!, cols], df_ref[!, Not(cols)]) == shcatdf @@ -194,43 +202,41 @@ end @test combine(f8, gd) == sres4 # combine() with regroup without and with groups sorting - for sort in (false, true) - gd = groupby_checked(df, cols, 
sort=sort) - v = combine(d -> d[:, [:x]], gd, regroup=true) + for dosort in (false, true) + gd = groupby_checked(df, cols, sort=dosort) + v = validate_gdf(combine(d -> d[:, [:x]], gd, regroup=true)) @test length(gd) == length(v) nms = [colssym; :x] @test v[1] == gd[1][:, nms] - @test v[1] == gd[1][:, nms] && - v[2] == gd[2][:, nms] && - v[3] == gd[3][:, nms] && - v[4] == gd[4][:, nms] + @test v[1] == gd[1][:, nms] && v[2] == gd[2][:, nms] && + v[3] == gd[3][:, nms] && v[4] == gd[4][:, nms] @test names(parent(v))[v.cols] == string.(colssym) - v = combine(f1, gd, regroup=true) - @test extrema(v.grous) == extrema(gd.groups) + v = validate_gdf(combine(f1, gd, regroup=true)) + @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f1, gd) - v = combine(f2, gd, regroup=true) - @test extrema(v.grous) == extrema(gd.groups) + v = validate_gdf(combine(f2, gd, regroup=true)) + @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f2, gd) - v = combine(f3, gd, regroup=true) - @test extrema(v.grous) == extrema(gd.groups) + v = validate_gdf(combine(f3, gd, regroup=true)) + @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f3, gd) - v = combine(f4, gd, regroup=true) - @test extrema(v.grous) == extrema(gd.groups) + v = validate_gdf(combine(f4, gd, regroup=true)) + @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f4, gd) - v = combine(f5, gd, regroup=true) - @test extrema(v.grous) == extrema(gd.groups) + v = validate_gdf(combine(f5, gd, regroup=true)) + @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f5, gd) - v = combine(f5, gd, regroup=true) - @test extrema(v.grous) == extrema(gd.groups) + v = validate_gdf(combine(f5, gd, regroup=true)) + @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f5, gd) - v = combine(f6, gd, regroup=true) - @test 
extrema(v.grous) == extrema(gd.groups) + v = validate_gdf(combine(f6, gd, regroup=true)) + @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f6, gd) - v = combine(f7, gd, regroup=true) - @test extrema(v.grous) == extrema(gd.groups) + v = validate_gdf(combine(f7, gd, regroup=true)) + @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f7, gd) - v = combine(f8, gd, regroup=true) - @test extrema(v.grous) == extrema(gd.groups) + v = validate_gdf(combine(f8, gd, regroup=true)) + @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f8, gd) end end @@ -322,112 +328,109 @@ end df = DataFrame(Key1 = CategoricalArray(["A", "A", "B", "B", "B", "A"]), Key2 = CategoricalArray(["A", "B", "A", "B", "B", "A"]), Value = 1:6) - + gdf = groupby_checked(df, :Key1) # Check that CategoricalArray column is preserved when returning a value... - res = combine(d -> DataFrame(x=d[1, :Key2]), groupby_checked(df, :Key1)) + res = combine(d -> DataFrame(x=d[1, :Key2]), gdf) @test typeof(res.x) == typeof(df.Key2) - res = combine(d -> (x=d[1, :Key2],), groupby_checked(df, :Key1)) + res = combine(d -> (x=d[1, :Key2],), gdf) @test typeof(res.x) == typeof(df.Key2) # ...and when returning an array - res = combine(d -> DataFrame(x=d.Key1), groupby_checked(df, :Key1)) + res = combine(d -> DataFrame(x=d.Key1), gdf) @test typeof(res.x) == typeof(df.Key1) # Check that CategoricalArray and String give a String... - res = combine(d -> d.Key1 == ["A", "A"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"), - groupby_checked(df, :Key1)) + res = combine(d -> d.Key1 == ["A", "A"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"), gdf) @test res.x isa Vector{String} - res = combine(d -> d.Key1 == ["A", "A"] ? (x=d[1, :Key1],) : (x="C",), - groupby_checked(df, :Key1)) + res = combine(d -> d.Key1 == ["A", "A"] ? 
(x=d[1, :Key1],) : (x="C",), gdf) @test res.x isa Vector{String} # ...even when CategoricalValue comes second - res = combine(d -> d.Key1 == ["B", "B"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"), - groupby_checked(df, :Key1)) + res = combine(d -> d.Key1 == ["B", "B"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"), gdf) @test res.x isa Vector{String} - res = combine(d -> d.Key1 == ["B", "B"] ? (x=d[1, :Key1],) : (x="C",), - groupby_checked(df, :Key1)) + res = combine(d -> d.Key1 == ["B", "B"] ? (x=d[1, :Key1],) : (x="C",), gdf) @test res.x isa Vector{String} df = DataFrame(x = [1, 2, 3], y = [2, 3, 1]) - + gdf = groupby_checked(df, :x) # Test function returning DataFrameRow - res = combine(d -> DataFrameRow(d, 1, :), groupby_checked(df, :x)) + res = combine(d -> DataFrameRow(d, 1, :), gdf) @test res == DataFrame(x=df.x, y=df.y) # Test function returning Tuple - res = by(d -> (sum(d.y),), df, :x) + res = combine(d -> (sum(d.y),), gdf) @test res == DataFrame(x=df.x, x1=tuple.([2, 3, 1])) # Test with some groups returning empty data frames - @test by(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), df, :x) == + @test combine(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), gdf) == DataFrame(x=[2, 3], z=[1, 1]) - v = combine(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), groupby_checked(df, :x), regroup=true) + v = validate_gdf(combine(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), + groupby_checked(df, :x), regroup=true)) @test length(v) == 2 @test vcat(v[1], v[2]) == DataFrame(x=[2, 3], z=[1, 1]) # Test that returning values of different types works with NamedTuple - res = by(d -> d.x == [1] ? 1 : 2.0, df, :x) + res = combine(d -> d.x == [1] ? 1 : 2.0, gdf) @test res.x1 isa Vector{Float64} @test res.x1 == [1, 2, 2] # Two columns need to be widened at different times - res = by(d -> (a=d.x == [1] ? 1 : 2.0, b=d.x == [3] ? missing : "a"), df, :x) + res = combine(d -> (a=d.x == [1] ? 1 : 2.0, b=d.x == [3] ? 
missing : "a"), gdf) @test res.a isa Vector{Float64} @test res.a == [1, 2, 2] @test res.b isa Vector{Union{String,Missing}} @test res.b ≅ ["a", "a", missing] # Corner case: two columns need to be widened at the same time - res = by(d -> (a=d.x == [1] ? 1 : 2.0, b=d.x == [1] ? missing : "a"), df, :x) + res = combine(d -> (a=d.x == [1] ? 1 : 2.0, b=d.x == [1] ? missing : "a"), gdf) @test res.a isa Vector{Float64} @test res.a == [1, 2, 2] @test res.b isa Vector{Union{String,Missing}} @test res.b ≅ [missing, "a", "a"] # Test that returning values of different types works with DataFrame - res = by(d -> DataFrame(x1 = d.x == [1] ? 1 : 2.0), df, :x) + res = combine(d -> DataFrame(x1 = d.x == [1] ? 1 : 2.0), gdf) @test res.x1 isa Vector{Float64} @test res.x1 == [1, 2, 2] # Two columns need to be widened at different times - res = by(d -> DataFrame(a=d.x == [1] ? 1 : 2.0, b=d.x == [3] ? missing : "a"), df, :x) + res = combine(d -> DataFrame(a=d.x == [1] ? 1 : 2.0, b=d.x == [3] ? missing : "a"), gdf) @test res.a isa Vector{Float64} @test res.a == [1, 2, 2] @test res.b isa Vector{Union{String,Missing}} @test res.b ≅ ["a", "a", missing] # Corner case: two columns need to be widened at the same time - res = by(d -> DataFrame(a=d.x == [1] ? 1 : 2.0, b=d.x == [1] ? missing : "a"), df, :x) + res = combine(d -> DataFrame(a=d.x == [1] ? 1 : 2.0, b=d.x == [1] ? missing : "a"), gdf) @test res.a isa Vector{Float64} @test res.a == [1, 2, 2] @test res.b isa Vector{Union{String,Missing}} @test res.b ≅ [missing, "a", "a"] # Test return values with columns in different orders - @test by(d -> d.x == [1] ? (x1=1, x2=3) : (x2=2, x1=4), df, :x) == + @test combine(d -> d.x == [1] ? (x1=1, x2=3) : (x2=2, x1=4), gdf) == DataFrame(x=1:3, x1=[1, 4, 4], x2=[3, 2, 2]) - @test by(d -> d.x == [1] ? DataFrame(x1=1, x2=3) : DataFrame(x2=2, x1=4), df, :x) == + @test combine(d -> d.x == [1] ? 
DataFrame(x1=1, x2=3) : DataFrame(x2=2, x1=4), gdf) == DataFrame(x=1:3, x1=[1, 4, 4], x2=[3, 2, 2]) # Test with NamedTuple with columns of incompatible lengths - @test_throws DimensionMismatch by(d -> (x1=[1], x2=[3, 4]), df, :x) - @test_throws DimensionMismatch by(d -> d.x == [1] ? (x1=[1], x2=[3]) : - (x1=[1], x2=[3, 4]), df, :x) + @test_throws DimensionMismatch combine(d -> (x1=[1], x2=[3, 4]), gdf) + @test_throws DimensionMismatch combine(d -> d.x == [1] ? (x1=[1], x2=[3]) : + (x1=[1], x2=[3, 4]), gdf) # Test with incompatible return values - @test_throws ArgumentError by(d -> d.x == [1] ? (x1=1,) : DataFrame(x1=1), df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? DataFrame(x1=1) : (x1=1,), df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? NamedTuple() : (x1=1), df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? (x1=1) : NamedTuple(), df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? 1 : DataFrame(x1=1), df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? DataFrame(x1=1) : 1, df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? (x1=1) : (x1=[1]), df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? (x1=[1]) : (x1=1), df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? 1 : [1], df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? [1] : 1, df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? (x1=1, x2=1) : (x1=[1], x2=1), df, :x) - @test_throws ArgumentError by(d -> d.x == [1] ? (x1=[1], x2=1) : (x1=1, x2=1), df, :x) + @test_throws ArgumentError combine(d -> d.x == [1] ? (x1=1,) : DataFrame(x1=1), gdf) + @test_throws ArgumentError combine(d -> d.x == [1] ? DataFrame(x1=1) : (x1=1,), gdf) + @test_throws ArgumentError combine(d -> d.x == [1] ? NamedTuple() : (x1=1), gdf) + @test_throws ArgumentError combine(d -> d.x == [1] ? (x1=1) : NamedTuple(), gdf) + @test_throws ArgumentError combine(d -> d.x == [1] ? 1 : DataFrame(x1=1), gdf) + @test_throws ArgumentError combine(d -> d.x == [1] ? 
DataFrame(x1=1) : 1, gdf) + @test_throws ArgumentError combine(d -> d.x == [1] ? (x1=1) : (x1=[1]), gdf) + @test_throws ArgumentError combine(d -> d.x == [1] ? (x1=[1]) : (x1=1), gdf) + @test_throws ArgumentError combine(d -> d.x == [1] ? 1 : [1], gdf) + @test_throws ArgumentError combine(d -> d.x == [1] ? [1] : 1, gdf) + @test_throws ArgumentError combine(d -> d.x == [1] ? (x1=1, x2=1) : (x1=[1], x2=1), gdf) + @test_throws ArgumentError combine(d -> d.x == [1] ? (x1=[1], x2=1) : (x1=1, x2=1), gdf) # Special case allowed due to how implementation works - @test by(d -> d.x == [1] ? 1 : (x1=1), df, :x) == by(d -> 1, df, :x) + @test combine(d -> d.x == [1] ? 1 : (x1=1), gdf) == combine(d -> 1, gdf) # Test that columns names and types are respected for empty input df = DataFrame(x=Int[], y=String[]) - res = by(d -> 1, df, :x) + res = combine(d -> 1, groupby_checked(df, :x)) @test size(res) == (0, 1) @test res.x isa Vector{Int} @@ -435,14 +438,14 @@ end df = DataFrame(x=[], y=[]) gd = groupby_checked(df, :x) @test combine(df -> sum(df.x), gd) == DataFrame(x=[]) - res = map(df -> sum(df.x), gd) + res = validate_gdf(combine(df -> sum(df.x), gd, regroup=true)) @test length(res) == 0 @test res.parent == DataFrame(x=[]) # Test with zero groups in output df = DataFrame(A = [1, 2]) gd = groupby_checked(df, :A) - gd2 = map(d -> DataFrame(), gd) + gd2 = validate_gdf(combine(d -> DataFrame(), gd, regroup=true)) @test length(gd2) == 0 @test gd.cols == [1] @test isempty(gd2.groups) @@ -452,7 +455,7 @@ end @test parent(gd2) == DataFrame(A=[]) @test eltype.(eachcol(parent(gd2))) == [Int] - gd2 = map(d -> DataFrame(X=Int[]), gd) + gd2 = validate_gdf(combine(d -> DataFrame(X=Int[]), gd, regroup=true)) @test length(gd2) == 0 @test gd.cols == [1] @test isempty(gd2.groups) @@ -622,77 +625,13 @@ end @test DataFrame(df) == df end -@testset "by, combine and map with pair interface" begin +@testset "combine with pair interface" begin vexp = x -> exp.(x) Random.seed!(1) df = DataFrame(a = 
repeat([1, 3, 2, 4], outer=[2]), b = repeat([2, 1], outer=[4]), c = rand(Int, 8)) - # Only test that different by syntaxes work, - # and rely on tests below for deeper checks - @test by(df, :a, :c => sum) == - by(:c => sum, df, :a) == - by(df, :a, :c => sum => :c_sum) == - by(:c => sum => :c_sum, df, :a) == - by(df, :a, [:c => sum]) == - by(df, :a, [:c => sum => :c_sum]) == - by(d -> (c_sum=sum(d.c),), df, :a) == - by(df, :a, d -> (c_sum=sum(d.c),)) - - @test by(df, :a, :c => vexp) == - by(:c => vexp, df, :a) == - by(df, :a, :c => vexp => :c_function) == - by(:c => vexp => :c_function, df, :a) == - by(:c => c -> (c_function = vexp(c),), df, :a) == - by(df, :a, :c => c -> (c_function = vexp(c),)) == - by(df, :a, [:c => vexp]) == - by(df, :a, [:c => vexp => :c_function]) == - by(d -> (c_function=vexp(d.c),), df, :a) == - by(df, :a, d -> (c_function=vexp(d.c),)) - - @test by(df, :a, :b => sum, :c => sum) == - by(df, :a, :b => sum => :b_sum, :c => sum => :c_sum) == - by(df, :a, [:b => sum, :c => sum]) == - by(df, :a, [:b => sum => :b_sum, :c => sum => :c_sum]) == - by(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), df, :a) == - by(df, :a, d -> (b_sum=sum(d.b), c_sum=sum(d.c))) - - @test by(df, :a, :b => vexp, :c => identity) == - by(df, :a, :b => vexp => :b_function, :c => identity => :c_identity) == - by(df, :a, [:b => vexp, :c => identity]) == - by(df, :a, [:b => vexp => :b_function, :c => identity => :c_identity]) == - by(d -> (b_function=vexp(d.b), c_identity=identity(d.c)), df, :a) == - by(df, :a, d -> (b_function=vexp(d.b), c_identity=identity(d.c))) == - by(df, :a, [:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=identity(c))) == - by([:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=identity(c)), df, :a) - - @test by(x -> extrema(x.c), df, :a) == by(:c => (x -> extrema(x)) => :x1, df, :a) - @test by(x -> x.b+x.c, df, :a) == by([:b,:c] => (+) => :x1, df, :a) - @test by(x -> (p=x.b, q=x.c), df, :a) == - by([:b,:c] => (b,c) -> (p=b,q=c), df, :a) == - by(df, :a, x 
-> (p=x.b, q=x.c)) == - by(df, :a, [:b,:c] => (b,c) -> (p=b,q=c)) - @test by(x -> DataFrame(p=x.b, q=x.c), df, :a) == - by([:b,:c] => (b,c) -> DataFrame(p=b,q=c), df, :a) == - by(df, :a, x -> DataFrame(p=x.b, q=x.c)) == - by(df, :a, [:b,:c] => (b,c) -> DataFrame(p=b,q=c)) - @test by(x -> [1 2; 3 4], df, :a) == - by([:b,:c] => (b,c) -> [1 2; 3 4], df, :a) == - by(df, :a, x -> [1 2; 3 4]) == - by(df, :a, [:b,:c] => (b,c) -> [1 2; 3 4]) - @test by(nrow, df, :a) == by(df, :a, nrow) == by(df, :a, [nrow => :nrow]) == - by(df, :a, 1 => length => :nrow) - @test by(nrow => :res, df, :a) == by(df, :a, nrow => :res) == - by(df, :a, [nrow => :res]) == by(df, :a, 1 => length => :res) - @test by(df, :a, nrow => :res, nrow, [nrow => :res2]) == - by(df, :a, 1 => length => :res, 1 => length => :nrow, 1 => length => :res2) - - @test_throws ArgumentError by([:b,:c] => ((b,c) -> [1 2; 3 4]) => :xxx, df, :a) - @test_throws ArgumentError by(df, :a, [:b,:c] => ((b,c) -> [1 2; 3 4]) => :xxx) - @test_throws ArgumentError by(df, :a, nrow, nrow) - @test_throws ArgumentError by(df, :a, [nrow]) - gd = groupby_checked(df, :a) # Only test that different combine syntaxes work, @@ -703,50 +642,53 @@ end combine(:c => sum => :c_sum, gd) == combine(gd, [:c => sum]) == combine(gd, [:c => sum => :c_sum]) == - combine(d -> (c_sum=sum(d.c),), gd) == - combine(gd, d -> (c_sum=sum(d.c),)) + combine(d -> (c_sum=sum(d.c),), gd) + @test_throws MethodError combine(gd, d -> (c_sum=sum(d.c),)) @test combine(gd, :c => vexp) == combine(:c => vexp, gd) == combine(gd, :c => vexp => :c_function) == combine(:c => vexp => :c_function, gd) == combine(:c => c -> (c_function = vexp(c),), gd) == - combine(gd, :c => c -> (c_function = vexp(c),)) == combine(gd, [:c => vexp]) == combine(gd, [:c => vexp => :c_function]) == - combine(d -> (c_function=exp.(d.c),), gd) == - combine(gd, d -> (c_function=exp.(d.c),)) + combine(d -> (c_function=exp.(d.c),), gd) + @test_throws ArgumentError combine(gd, :c => c -> (c_function = 
vexp(c),)) + @test_throws MethodError combine(gd, d -> (c_function=exp.(d.c),)) @test combine(gd, :b => sum, :c => sum) == combine(gd, :b => sum => :b_sum, :c => sum => :c_sum) == combine(gd, [:b => sum, :c => sum]) == combine(gd, [:b => sum => :b_sum, :c => sum => :c_sum]) == - combine(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), gd) == - combine(gd, d -> (b_sum=sum(d.b), c_sum=sum(d.c))) + combine(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), gd) + @test_throws MethodError combine(gd, d -> (b_sum=sum(d.b), c_sum=sum(d.c))) @test combine(gd, :b => vexp, :c => identity) == combine(gd, :b => vexp => :b_function, :c => identity => :c_identity) == combine(gd, [:b => vexp, :c => identity]) == combine(gd, [:b => vexp => :b_function, :c => identity => :c_identity]) == combine(d -> (b_function=vexp(d.b), c_identity=d.c), gd) == - combine(gd, d -> (b_function=vexp(d.b), c_identity=d.c)) == - combine([:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=c), gd) == - combine(gd, [:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=c)) + combine([:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=c), gd) + @test_throws MethodError combine(gd, d -> (b_function=vexp(d.b), c_identity=d.c)) + @test_throws ArgumentError combine(gd, [:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=c)) @test combine(x -> extrema(x.c), gd) == combine(:c => (x -> extrema(x)) => :x1, gd) @test combine(x -> x.b+x.c, gd) == combine([:b,:c] => (+) => :x1, gd) @test combine(x -> (p=x.b, q=x.c), gd) == - combine([:b,:c] => (b,c) -> (p=b,q=c), gd) == - combine(gd, x -> (p=x.b, q=x.c)) == - combine(gd, [:b,:c] => (b,c) -> (p=b,q=c)) + combine([:b,:c] => (b,c) -> (p=b,q=c), gd) + @test_throws MethodError combine(gd, x -> (p=x.b, q=x.c)) + @test_throws ArgumentError combine(gd, [:b,:c] => (b,c) -> (p=b,q=c)) + @test combine(x -> DataFrame(p=x.b, q=x.c), gd) == - combine([:b,:c] => (b,c) -> DataFrame(p=b,q=c), gd) == - combine(gd, x -> DataFrame(p=x.b, q=x.c)) == - combine(gd, [:b,:c] => (b,c) -> DataFrame(p=b,q=c)) 
+ combine([:b,:c] => (b,c) -> DataFrame(p=b,q=c), gd) + @test_throws MethodError combine(gd, x -> DataFrame(p=x.b, q=x.c)) + @test_throws ArgumentError combine(gd, [:b,:c] => (b,c) -> DataFrame(p=b,q=c)) + @test combine(x -> [1 2; 3 4], gd) == - combine([:b,:c] => (b,c) -> [1 2; 3 4], gd) == - combine(gd, x -> [1 2; 3 4]) == - combine(gd, [:b,:c] => (b,c) -> [1 2; 3 4]) + combine([:b,:c] => (b,c) -> [1 2; 3 4], gd) + @test_throws MethodError combine(gd, x -> [1 2; 3 4]) + @test_throws ArgumentError combine(gd, [:b,:c] => (b,c) -> [1 2; 3 4]) + @test combine(nrow, gd) == combine(gd, nrow) == combine(gd, [nrow => :nrow]) == combine(gd, 1 => length => :nrow) @test combine(nrow => :res, gd) == combine(gd, nrow => :res) == @@ -758,62 +700,65 @@ end @test_throws ArgumentError combine(gd, nrow, nrow) @test_throws ArgumentError combine(gd, [nrow]) - for f in (map, combine) - for col in (:c, 3) - @test f(col => sum, gd) == f(d -> (c_sum=sum(d.c),), gd) - @test f(col => x -> sum(x), gd) == f(d -> (c_function=sum(d.c),), gd) - @test f(col => x -> (z=sum(x),), gd) == f(d -> (z=sum(d.c),), gd) - @test f(col => x -> DataFrame(z=sum(x),), gd) == f(d -> (z=sum(d.c),), gd) - @test f(col => identity, gd) == f(d -> (c_identity=d.c,), gd) - @test f(col => x -> (z=x,), gd) == f(d -> (z=d.c,), gd) - - @test f(col => sum => :xyz, gd) == - f(d -> (xyz=sum(d.c),), gd) - @test f(col => (x -> sum(x)) => :xyz, gd) == - f(d -> (xyz=sum(d.c),), gd) - @test f(col => (x -> (sum(x),)) => :xyz, gd) == - f(d -> (xyz=(sum(d.c),),), gd) - @test f(nrow, gd) == f(d -> (nrow=length(d.c),), gd) - @test f(nrow => :res, gd) == f(d -> (res=length(d.c),), gd) - @test f(col => sum => :res, gd) == f(d -> (res=sum(d.c),), gd) - @test f(col => (x -> sum(x)) => :res, gd) == f(d -> (res=sum(d.c),), gd) - @test_throws ArgumentError f(col => (x -> (z=sum(x),)) => :xyz, gd) - @test_throws ArgumentError f(col => (x -> DataFrame(z=sum(x),)) => :xyz, gd) - @test_throws ArgumentError f(col => (x -> (z=x,)) => :xyz, gd) - 
@test_throws ArgumentError f(col => x -> (z=1, xzz=[1]), gd) + for col in (:c, 3) + @test combine(col => sum, gd) == combine(d -> (c_sum=sum(d.c),), gd) + @test combine(col => x -> sum(x), gd) == combine(d -> (c_function=sum(d.c),), gd) + @test combine(col => x -> (z=sum(x),), gd) == combine(d -> (z=sum(d.c),), gd) + @test combine(col => x -> DataFrame(z=sum(x),), gd) == combine(d -> (z=sum(d.c),), gd) + @test combine(col => identity, gd) == combine(d -> (c_identity=d.c,), gd) + @test combine(col => x -> (z=x,), gd) == combine(d -> (z=d.c,), gd) + + @test combine(col => sum => :xyz, gd) == + combine(d -> (xyz=sum(d.c),), gd) + @test combine(col => (x -> sum(x)) => :xyz, gd) == + combine(d -> (xyz=sum(d.c),), gd) + @test combine(col => (x -> (sum(x),)) => :xyz, gd) == + combine(d -> (xyz=(sum(d.c),),), gd) + @test combine(nrow, gd) == combine(d -> (nrow=length(d.c),), gd) + @test combine(nrow => :res, gd) == combine(d -> (res=length(d.c),), gd) + @test combine(col => sum => :res, gd) == combine(d -> (res=sum(d.c),), gd) + @test combine(col => (x -> sum(x)) => :res, gd) == combine(d -> (res=sum(d.c),), gd) + @test_throws ArgumentError combine(col => (x -> (z=sum(x),)) => :xyz, gd) + @test_throws ArgumentError combine(col => (x -> DataFrame(z=sum(x),)) => :xyz, gd) + @test_throws ArgumentError combine(col => (x -> (z=x,)) => :xyz, gd) + @test_throws ArgumentError combine(col => x -> (z=1, xzz=[1]), gd) + end + for cols in ([:b, :c], 2:3, [2, 3], [false, true, true]), regroup in (true, false) + @test combine(cols => (b,c) -> (y=exp.(b), z=c), gd, regroup=regroup) == + combine(d -> (y=exp.(d.b), z=d.c), gd, regroup=regroup) + @test combine(cols => (b,c) -> [exp.(b) c], gd, regroup=regroup) == + combine(d -> [exp.(d.b) d.c], gd, regroup=regroup) + @test combine(cols => ((b,c) -> sum(b) + sum(c)) => :xyz, gd, regroup=regroup) == + combine(d -> (xyz=sum(d.b) + sum(d.c),), gd, regroup=regroup) + if eltype(cols) === Bool + cols2 = [[false, true, false], [false, false, true]] 
+ @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[2] => sum), + gd, regroup=regroup) + @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[1] => sum), + gd, regroup=regroup) + @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[2] => x -> first(x)), + gd, regroup=regroup) + else + cols2 = cols + @test combine(gd, cols2[1] => sum => :xyz, cols2[2] => sum => :xzz, regroup=regroup) == + combine(d -> (xyz=sum(d.b), xzz=sum(d.c)), gd, regroup=regroup) + @test combine(gd, cols2[1] => sum => :xyz, cols2[1] => sum => :xzz, regroup=regroup) == + combine(d -> (xyz=sum(d.b), xzz=sum(d.b)), gd, regroup=regroup) + @test combine(gd, cols2[1] => sum => :xyz, + cols2[2] => (x -> first(x)) => :xzz, regroup=regroup) == + combine(d -> (xyz=sum(d.b), xzz=first(d.c)), gd, regroup=regroup) + @test combine(gd, cols2[1] => vexp => :xyz, + cols2[2] => sum => :xzz, regroup=regroup) == + combine(d -> (xyz=vexp(d.b), xzz=fill(sum(d.c), length(vexp(d.b)))), + gd, regroup=regroup) end - for cols in ([:b, :c], 2:3, [2, 3], [false, true, true]) - @test f(cols => (b,c) -> (y=exp.(b), z=c), gd) == - f(d -> (y=exp.(d.b), z=d.c), gd) - @test f(cols => (b,c) -> [exp.(b) c], gd) == - f(d -> [exp.(d.b) d.c], gd) - @test f(cols => ((b,c) -> sum(b) + sum(c)) => :xyz, gd) == - f(d -> (xyz=sum(d.b) + sum(d.c),), gd) - if eltype(cols) === Bool - cols2 = [[false, true, false], [false, false, true]] - @test_throws MethodError f((xyz = cols[1] => sum, xzz = cols2[2] => sum), gd) - @test_throws MethodError f((xyz = cols[1] => sum, xzz = cols2[1] => sum), gd) - @test_throws MethodError f((xyz = cols[1] => sum, xzz = cols2[2] => x -> first(x)), gd) - else - cols2 = cols - if f === combine - @test f(gd, cols2[1] => sum => :xyz, cols2[2] => sum => :xzz) == - f(d -> (xyz=sum(d.b), xzz=sum(d.c)), gd) - @test f(gd, cols2[1] => sum => :xyz, cols2[1] => sum => :xzz) == - f(d -> (xyz=sum(d.b), xzz=sum(d.b)), gd) - @test f(gd, cols2[1] => sum => :xyz, - cols2[2] => (x -> 
first(x)) => :xzz) == - f(d -> (xyz=sum(d.b), xzz=first(d.c)), gd) - @test f(gd, cols2[1] => vexp => :xyz, - cols2[2] => sum => :xzz) == - f(d -> (xyz=vexp(d.b), xzz=fill(sum(d.c), length(vexp(d.b)))), gd) - end - end - @test_throws ArgumentError f(cols => (b,c) -> (y=exp.(b), z=sum(c)), gd) - @test_throws ArgumentError f(cols2 => ((b,c) -> DataFrame(y=exp.(b), z=sum(c))) => :xyz, gd) - @test_throws ArgumentError f(cols2 => ((b,c) -> [exp.(b) c]) => :xyz, gd) - end + @test_throws ArgumentError combine(cols => (b,c) -> (y=exp.(b), z=sum(c)), + gd, regroup=regroup) + @test_throws ArgumentError combine(cols2 => ((b,c) -> DataFrame(y=exp.(b), + z=sum(c))) => :xyz, gd, regroup=regroup) + @test_throws ArgumentError combine(cols2 => ((b,c) -> [exp.(b) c]) => :xyz, + gd, regroup=regroup) end end @@ -942,9 +887,9 @@ Base.isless(::TestType, ::TestType) = false end df = DataFrame(x = [1, 1, 2, 2], y = Any[1, 2.0, 3.0, 4.0]) - res = by(df, :x, :y => maximum => :z) + res = combine(groupby_checked(df, :x), :y => maximum => :z) @test res.z isa Vector{Float64} - @test res.z == by(df, :x, :y => (x -> maximum(x)) => :z).z + @test res.z == combine(groupby_checked(df, :x), :y => (x -> maximum(x)) => :z).z # Test maximum when no promotion rule exists df = DataFrame(x = [1, 1, 2, 2], y = [1, TestType(), TestType(), TestType()]) @@ -953,27 +898,29 @@ Base.isless(::TestType, ::TestType) = false for f in (maximum, minimum) res = combine(gd, :y => maximum => :z) @test res.z isa Vector{Any} - @test res.z == by(df, :x, :y => (x -> maximum(x)) => :z).z + @test res.z == combine(gd, :y => (x -> maximum(x)) => :z).z end end -@testset "combine and map with columns named like grouping keys" begin +@testset "combine with columns named like grouping keys" begin df = DataFrame(x=["a", "a", "b", missing], y=1:4) gd = groupby_checked(df, :x) @test combine(identity, gd) ≅ df @test combine(d -> d[:, [2, 1]], gd) ≅ df @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) - @test 
map(identity, gd) ≅ gd - @test map(d -> d[:, [2, 1]], gd) ≅ gd - @test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) + @test validate_gdf(combine(identity, gd, regroup=true)) ≅ gd + @test combine(d -> d[:, [2, 1]], gd, regroup=true) ≅ gd + @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd, + regroup=true) gd = groupby_checked(df, :x, skipmissing=true) @test combine(identity, gd) == df[1:3, :] @test combine(d -> d[:, [2, 1]], gd) == df[1:3, :] @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) - @test map(identity, gd) == gd - @test map(d -> d[:, [2, 1]], gd) == gd - @test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) + @test validate_gdf(combine(identity, gd, regroup=true)) == gd + @test validate_gdf(combine(d -> d[:, [2, 1]], gd, regroup=true)) == gd + @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd, + regroup=true) end @testset "iteration protocol" begin @@ -1328,14 +1275,6 @@ end @test valuecols(gd) == [:C] end -@testset "by skipmissing and sort" begin - df = DataFrame(a=[2, 2, missing, missing, 1, 1, 3, 3], b=1:8) - for dosort in (false, true), doskipmissing in (false, true) - @test by(df, :a, :b=>sum, sort=dosort, skipmissing=doskipmissing) ≅ - combine(groupby_checked(df, :a, sort=dosort, skipmissing=doskipmissing), :b=>sum) - end -end - @testset "non standard cols arguments" begin df = DataFrame(x1=Int64[1,2,2], x2=Int64[1,1,2], y=Int64[1,2,3]) gdf = groupby_checked(df, r"x") @@ -1358,7 +1297,8 @@ end @test valuecols(gdf) == [:x1, :x2, :y] @test groupindices(gdf) == [1,1,1] - @test by(df, [], :x1 => sum => :a, :x2=>length => :b) == DataFrame(a=5, b=3) + @test combine(groupby_checked(df, []), + :x1 => sum => :a, :x2=>length => :b) == DataFrame(a=5, b=3) gdf = groupby_checked(df, []) @test gdf[1] == df @@ -1366,8 +1306,9 @@ end @test gdf[:] == gdf @test gdf[1:1] == gdf - @test map(nrow => :x1, gdf) == 
groupby_checked(DataFrame(x1=3), []) - @test map(:x2 => identity => :x2_identity, gdf) == + @test validate_gdf(combine(nrow => :x1, gdf, regroup=true)) == + groupby_checked(DataFrame(x1=3), []) + @test validate_gdf(combine(:x2 => identity => :x2_identity, gdf, regroup=true)) == groupby_checked(DataFrame(x2_identity=[1,1,2]), []) @test DataFrame(gdf) == df @@ -1683,10 +1624,12 @@ end @testset "Check aggregation of DataFrameRow" begin df = DataFrame(a=1) dfr = DataFrame(x=1, y="1")[1, 2:2] - @test by(sdf -> dfr, df, :a) == DataFrame(a=1, y="1") + gdf = groupby_checked(df, :a) + @test combine(sdf -> dfr, gdf) == DataFrame(a=1, y="1") df = DataFrame(a=[1,1,2,2,3,3], b='a':'f', c=string.(1:6)) - @test by(sdf -> sdf[1, [3,2,1]], df, :a) == df[1:2:5, [1,3,2]] + gdf = groupby_checked(df, :a) + @test combine(sdf -> sdf[1, [3,2,1]], gdf) == df[1:2:5, [1,3,2]] end @testset "Allow returning DataFrame() or NamedTuple() to drop group" begin @@ -1700,12 +1643,14 @@ end fr in (DataFrame(x1=[true]), (x1=[true],)) df = DataFrame(a = 1:N, x1 = x1) - res = by(sdf -> sdf.x1[1] ? fr : er, df, :a) - @test res == DataFrame(map(sdf -> sdf.x1[1] ? fr : er, groupby_checked(df, :a))) + gdf = groupby_checked(df, :a) + res = combine(sdf -> sdf.x1[1] ? fr : er, gdf) + @test res == DataFrame(validate_gdf(combine(sdf -> sdf.x1[1] ? fr : er, + groupby_checked(df, :a), regroup=true))) if fr isa AbstractVector && df.x1[1] - @test res == by(:x1 => (x1 -> x1[1] ? fr : er) => :x1, df, :a) + @test res == combine(:x1 => (x1 -> x1[1] ? fr : er) => :x1, gdf) else - @test res == by(:x1 => x1 -> x1[1] ? fr : er, df, :a) + @test res == combine(:x1 => x1 -> x1[1] ? fr : er, gdf) end if nrow(res) == 0 && length(propertynames(er)) == 0 && er != rand(0, 1) @test res == DataFrame(a=[]) @@ -1714,67 +1659,65 @@ end @test res == df[df.x1, :] end if 1 < i < 2^N - @test_throws ArgumentError by(sdf -> sdf.x1[1] ? (x1=true,) : er, df, :a) + @test_throws ArgumentError combine(sdf -> sdf.x1[1] ? 
(x1=true,) : er, gdf) if df.x1[1] || !(fr isa AbstractVector) - @test_throws ArgumentError by(sdf -> sdf.x1[1] ? fr : (x2=[true],), df, :a) + @test_throws ArgumentError combine(sdf -> sdf.x1[1] ? fr : (x2=[true],), gdf) else - res = by(sdf -> sdf.x1[1] ? fr : (x2=[true],), df, :a) + res = combine(sdf -> sdf.x1[1] ? fr : (x2=[true],), gdf) @test names(res) == ["a", "x2"] end - @test_throws ArgumentError by(sdf -> sdf.x1[1] ? true : er, df, :a) + @test_throws ArgumentError combine(sdf -> sdf.x1[1] ? true : er, gdf) end end end @testset "auto-splatting, ByRow, and column renaming" begin df = DataFrame(g=[1,1,1,2,2,2], x1=1:6, x2=1:6) - @test by(df, :g, r"x" => cor) == DataFrame(g=[1,2], x1_x2_cor = [1.0, 1.0]) - @test by(df, :g, Not(:g) => ByRow(/)) == DataFrame(:g => [1,1,1,2,2,2], Symbol("x1_x2_/") => 1.0) - @test by(df, :g, Between(:x2, :x1) => () -> 1) == DataFrame(:g => 1:2, Symbol("function") => 1) - @test by(df, :g, :x1 => :z) == - by(df, :g, [:x1 => :z]) == - by(:x1 => :z, df, :g) == - combine(groupby_checked(df, :g), :x1 => :z) == - combine(groupby_checked(df, :g), [:x1 => :z]) == - combine(:x1 => :z, groupby_checked(df, :g)) == + gdf = groupby_checked(df, :g) + @test combine(gdf, r"x" => cor) == DataFrame(g=[1,2], x1_x2_cor = [1.0, 1.0]) + @test combine(gdf, Not(:g) => ByRow(/)) == DataFrame(:g => [1,1,1,2,2,2], Symbol("x1_x2_/") => 1.0) + @test combine(gdf, Between(:x2, :x1) => () -> 1) == DataFrame(:g => 1:2, Symbol("function") => 1) + @test combine(gdf, :x1 => :z) == combine(gdf, [:x1 => :z]) == combine(:x1 => :z, gdf) == DataFrame(g=[1,1,1,2,2,2], z=1:6) - @test map(:x1 => :z, groupby_checked(df, :g)) == groupby_checked(DataFrame(g=[1,1,1,2,2,2], z=1:6), :g) + @test validate_gdf(combine(:x1 => :z, groupby_checked(df, :g), regroup=true)) == + groupby_checked(DataFrame(g=[1,1,1,2,2,2], z=1:6), :g) end @testset "hard tabular return value cases" begin Random.seed!(1) df = DataFrame(b = repeat([2, 1], outer=[4]), x = randn(8)) - res = by(sdf -> sdf.x[1:2], 
df, :b) + gdf = groupby_checked(df, :b) + res = combine(sdf -> sdf.x[1:2], gdf) @test names(res) == ["b", "x1"] - res2 = by(:x => x -> x[1:2], df, :b) + res2 = combine(:x => x -> x[1:2], gdf) @test names(res2) == ["b", "x_function"] @test Matrix(res) == Matrix(res2) - res2 = by(:x => (x -> x[1:2]) => :z, df, :b) + res2 = combine(:x => (x -> x[1:2]) => :z, gdf) @test names(res2) == ["b", "z"] @test Matrix(res) == Matrix(res2) - @test_throws ArgumentError by(df, :b) do sdf + @test_throws ArgumentError combine(gdf) do sdf if sdf.b[1] == 2 return (c=sdf.x[1:2],) else return sdf.x[1:2] end end - @test_throws ArgumentError by(df, :b) do sdf + @test_throws ArgumentError combine(gdf) do sdf if sdf.b[1] == 1 return (c=sdf.x[1:2],) else return sdf.x[1:2] end end - @test_throws ArgumentError by(df, :b) do sdf + @test_throws ArgumentError combine(gdf) do sdf if sdf.b[1] == 2 return (c=sdf.x[1],) else return sdf.x[1] end end - @test_throws ArgumentError by(df, :b) do sdf + @test_throws ArgumentError combine(gdf) do sdf if sdf.b[1] == 1 return (c=sdf.x[1],) else @@ -1783,26 +1726,25 @@ end end for i in 1:2, v1 in [1, 1:2], v2 in [1, 1:2] - @test_throws ArgumentError by([:b, :x] => ((b,x) -> b[1] == i ? x[v1] : (c=x[v2],)) => :v, df, :b) - @test_throws ArgumentError by([:b, :x] => ((b,x) -> b[1] == i ? x[v1] : (v=x[v2],)) => :v, df, :b) + @test_throws ArgumentError combine([:b, :x] => ((b,x) -> b[1] == i ? x[v1] : (c=x[v2],)) => :v, gdf) + @test_throws ArgumentError combine([:b, :x] => ((b,x) -> b[1] == i ? 
x[v1] : (v=x[v2],)) => :v, gdf) end end @testset "last Pair interface with multiple return values" begin df = DataFrame(g=[1,1,1,2,2,2], x1=1:6) - @test by(df, :g, :x1 => x -> DataFrame()) == by(:x1 => x -> DataFrame(), df, :g) - @test by(df, :g, :x1 => x -> (x=1, y=2)) == by(:x1 => x -> (x=1, y=2), df, :g) - @test by(df, :g, :x1 => x -> (x=[1], y=[2])) == by(:x1 => x -> (x=[1], y=[2]), df, :g) - @test_throws ArgumentError by(df, :g, :x1 => x -> (x=[1],y=2)) - @test_throws ArgumentError by(:x1 => x -> (x=[1], y=2), df, :g) - @test by(df, :g, :x1 => x -> ones(2, 2)) == by(:x1 => x -> ones(2, 2), df, :g) - @test by(df, :g, :x1 => x -> df[1, Not(:g)]) == by(:x1 => x -> df[1, Not(:g)], df, :g) + gdf = groupby_checked(df, :g) + @test_throws ArgumentError combine(gdf, :x1 => x -> DataFrame()) + @test_throws ArgumentError combine(gdf, :x1 => x -> (x=1, y=2)) + @test_throws ArgumentError combine(gdf, :x1 => x -> (x=[1], y=[2])) + @test_throws ArgumentError combine(gdf, :x1 => x -> (x=[1],y=2)) + @test_throws ArgumentError combine(:x1 => x -> (x=[1], y=2), gdf) + @test_throws ArgumentError combine(gdf, :x1 => x -> ones(2, 2)) + @test_throws ArgumentError combine(gdf, :x1 => x -> df[1, Not(:g)]) end @testset "keepkeys" begin df = DataFrame(g=[1,1,1,2,2,2], x1=1:6) - @test by(df, :g, :x1 => identity, keepkeys=false) == DataFrame(x1_identity=1:6) - @test by(x -> DataFrame(g=x.x1), df, :g, keepkeys=false) == DataFrame(g=1:6) gdf = groupby_checked(df, :g) @test combine(gdf, :x1 => identity => :g, keepkeys=false) == DataFrame(g=1:6) @test combine(x -> (z=x.x1,), gdf, keepkeys=false) == DataFrame(z=1:6) @@ -1832,28 +1774,31 @@ end end @testset "mixing of different return lengths and pseudo-broadcasting" begin - df = DataFrame(g=[1,1,1,2,2]); - f1(i) = i[1] == 1 ? ["a", "b"] : ["c"]; - f2(i) = i[1] == 1 ? ["d"] : ["e", "f"]; - @test_throws ArgumentError by(df, :g, :g => f1, :g => f2) - - f1(i) = i[1] == 1 ? ["a"] : ["c"]; - f2(i) = i[1] == 1 ? 
"d" : "e"; - @test by(df, :g, :g => f1, :g => f2) == + df = DataFrame(g=[1,1,1,2,2]) + gdf = groupby_checked(df, :g) + + f1(i) = i[1] == 1 ? ["a", "b"] : ["c"] + f2(i) = i[1] == 1 ? ["d"] : ["e", "f"] + @test_throws ArgumentError combine(gdf, :g => f1, :g => f2) + + f1(i) = i[1] == 1 ? ["a"] : ["c"] + f2(i) = i[1] == 1 ? "d" : "e" + @test combine(gdf, :g => f1, :g => f2) == DataFrame(g=[1,2], g_f1=["a", "c"], g_f2 = ["d", "e"]) - f1(i) = i[1] == 1 ? ["a","c"] : []; - f2(i) = i[1] == 1 ? "d" : "e"; - @test by(df, :g, :g => f1, :g => f2) == + f1(i) = i[1] == 1 ? ["a","c"] : [] + f2(i) = i[1] == 1 ? "d" : "e" + @test combine(gdf, :g => f1, :g => f2) == DataFrame(g = [1,1], g_f1 = ["a", "c"], g_f2 = ["d", "d"]) - @test by(df, :g, :g => Ref) == DataFrame(g=[1,2], g_Ref=[[1,1,1], [2,2]]) - @test by(df, :g, :g => x -> view([x],1)) == DataFrame(g=[1,2], g_function=[[1,1,1], [2,2]]) + @test combine(gdf, :g => Ref) == DataFrame(g=[1,2], g_Ref=[[1,1,1], [2,2]]) + @test combine(gdf, :g => x -> view([x],1)) == DataFrame(g=[1,2], g_function=[[1,1,1], [2,2]]) Random.seed!(1234) df = DataFrame(g=1:100) + gdf = groupby_checked(df, :g) for i in 1:10 - @test by(df, :g, :g => x -> rand([x[1], Ref(x[1]), view(x, 1)])) == + @test combine(gdf, :g => x -> rand([x[1], Ref(x[1]), view(x, 1)])) == DataFrame(g=1:100, g_function=1:100) end @@ -1862,12 +1807,13 @@ end for i in 0:nrow(df_ref), dosort in [true, false], dokeepkeys in [true, false] df = df_ref[1:i, :] - @test by(df, :g, :x1 => sum => :x1, :x2 => identity => :x2, - :x3 => (x -> Ref(sum(x))) => :x3, nrow, :x4 => ByRow(sin) => :x4, - sort=dosort, keepkeys=dokeepkeys) == - by(df, :g, sort=dosort, keepkeys=dokeepkeys) do sdf - DataFrame(x1 = sum(sdf.x1), x2 = sdf.x2, x3 = sum(sdf.x3), - nrow = nrow(sdf), x4 = sin.(sdf.x4)) + gdf = groupby_checked(df, :g, sort=dosort) + @test combine(gdf, :x1 => sum => :x1, :x2 => identity => :x2, + :x3 => (x -> Ref(sum(x))) => :x3, nrow, :x4 => ByRow(sin) => :x4, + keepkeys=dokeepkeys) == + combine(gdf, 
keepkeys=dokeepkeys) do sdf + DataFrame(x1 = sum(sdf.x1), x2 = sdf.x2, x3 = sum(sdf.x3), + nrow = nrow(sdf), x4 = sin.(sdf.x4)) end end end @@ -1875,54 +1821,56 @@ end @testset "passing columns" begin df = DataFrame(rand(10, 4)) df.g = shuffle!([1,2,2,3,3,3,4,4,4,4]) + gdf = groupby_checked(df, :g) for selector in [All(), :, r"x", Between(:x1, :x4), Not(:g), [:x1, :x2, :x3, :x4], [1, 2, 3, 4], [true, true, true, true, false]] - @test by(df, :g, selector, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3) == - by(df, :g) do sdf + @test combine(gdf, selector, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3) == + combine(gdf) do sdf DataFrame(x1 = sin.(sdf.x1), x2 = sdf.x2, x3 = sin.(sdf.x2), x4 = sdf.x4) end end for selector in [All(), :, r"x", Between(:x1, :x4), Not(:g), [:x1, :x2, :x3, :x4], [1, 2, 3, 4], [true, true, true, true, false]] - @test by(df, :g, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3, selector) == - by(df, :g) do sdf + @test combine(gdf, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3, selector) == + combine(gdf) do sdf DataFrame(x1 = sin.(sdf.x1), x3 = sin.(sdf.x2), x2 = sdf.x2, x4 = sdf.x4) end end for selector in [Between(:x1, :x3), Not(:x4), [:x1, :x2, :x3], [1, 2, 3], [true, true, true, false, false]] - @test by(df, :g, :x2 => ByRow(sin) => :x3, selector, :x1 => ByRow(sin) => :x1) == - by(df, :g) do sdf + @test combine(gdf, :x2 => ByRow(sin) => :x3, selector, :x1 => ByRow(sin) => :x1) == + combine(gdf) do sdf DataFrame(x3 = sin.(sdf.x2), x1 = sin.(sdf.x1), x2 = sdf.x2) end end - @test by(df, :g, 4, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3, :x2) == - by(df, :g) do sdf + @test combine(gdf, 4, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3, :x2) == + combine(gdf) do sdf DataFrame(x4 = sdf.x4, x1 = sin.(sdf.x1), x3 = sin.(sdf.x2), x2 = sdf.x2) end - @test by(df, :g, 4 => :h, :x1 => ByRow(sin) => :z, :x2 => ByRow(sin) => :x3, :x2) == - by(df, :g) do sdf + @test combine(gdf, 4 => :h, :x1 => ByRow(sin) => :z, :x2 => 
ByRow(sin) => :x3, :x2) == + combine(gdf) do sdf DataFrame(h = sdf.x4, z = sin.(sdf.x1), x3 = sin.(sdf.x2), x2 = sdf.x2) end - @test_throws ArgumentError by(df, :g, 4 => :h, :x1 => ByRow(sin) => :h) - @test_throws ArgumentError by(df, :g, :x1 => :x1_sin, :x1 => ByRow(sin)) - @test_throws ArgumentError by(df, :g, 1, :x1 => ByRow(sin) => :x1) + @test_throws ArgumentError combine(gdf, 4 => :h, :x1 => ByRow(sin) => :h) + @test_throws ArgumentError combine(gdf, :x1 => :x1_sin, :x1 => ByRow(sin)) + @test_throws ArgumentError combine(gdf, 1, :x1 => ByRow(sin) => :x1) end @testset "correct dropping of groups" begin df = DataFrame(g = 10:-1:1) - + gdf = groupby_checked(df, :g) + sgdf = groupby_checked(df, :g, sort=true) for keep in [[3,2,1], [5,3,1], [9], Int[]] - @test by(df, :g, :g => first => :keep, :g => x -> x[1] in keep ? x : Int[]) == + @test combine(gdf, :g => first => :keep, :g => x -> x[1] in keep ? x : Int[]) == DataFrame(g=keep, keep=keep, g_function=keep) - @test by(df, :g, :g => first => :keep, :g => x -> x[1] in keep ? x : Int[], - sort=true) == sort(DataFrame(g=keep, keep=keep, g_function=keep)) + @test combine(sgdf, :g => first => :keep, :g => x -> x[1] in keep ? 
x : Int[]) == + sort(DataFrame(g=keep, keep=keep, g_function=keep)) end end @@ -1931,46 +1879,39 @@ end gdf = groupby_checked(df, :g) # whole column 4 options of single pair passed - @test by(df, :g , AsTable([:x, :y]) => Ref) == - by(AsTable([:x, :y]) => Ref, df, :g) == - combine(gdf , AsTable([:x, :y]) => Ref) == + @test combine(gdf , AsTable([:x, :y]) => Ref) == combine(AsTable([:x, :y]) => Ref, gdf) == DataFrame(g=1:2, x_y_Ref=[(x=[1,2,3], y=[6,7,8]), (x=[4,5], y=[9,10])]) - @test map(AsTable([:x, :y]) => Ref, gdf) == - groupby_checked(by(df, :g , AsTable([:x, :y]) => Ref), :g) + @test validate_gdf(combine(AsTable([:x, :y]) => Ref, gdf, regroup=true)) == + groupby_checked(combine(gdf, AsTable([:x, :y]) => Ref), :g) - @test by(df, :g, AsTable(1) => Ref) == - combine(gdf, AsTable(1) => Ref) == + @test combine(gdf, AsTable(1) => Ref) == DataFrame(g=1:2, g_Ref=[(g=[1,1,1],),(g=[2,2],)]) # ByRow 4 options of single pair passed - @test by(df, :g, AsTable([:x, :y]) => ByRow(x -> [x])) == - by(AsTable([:x, :y]) => ByRow(x -> [x]), df, :g) == - combine(gdf, AsTable([:x, :y]) => ByRow(x -> [x])) == + @test combine(gdf, AsTable([:x, :y]) => ByRow(x -> [x])) == combine(AsTable([:x, :y]) => ByRow(x -> [x]), gdf) == DataFrame(g=[1,1,1,2,2], x_y_function=[[(x=1,y=6)], [(x=2,y=7)], [(x=3,y=8)], [(x=4,y=9)], [(x=5,y=10)]]) - @test map(AsTable([:x, :y]) => ByRow(x -> [x]), gdf) == - groupby_checked(by(df, :g, AsTable([:x, :y]) => ByRow(x -> [x])), :g) + @test validate_gdf(combine(AsTable([:x, :y]) => ByRow(x -> [x]), gdf, regroup=true)) == + groupby_checked(combine(gdf, AsTable([:x, :y]) => ByRow(x -> [x])), :g) # whole column and ByRow test for multiple pairs passed - @test by(df, :g, [:x, :y], [AsTable(v) => (x -> -x[1]) for v in [:x, :y]]) == - combine(gdf, [:x, :y], [AsTable(v) => (x -> -x[1]) for v in [:x, :y]]) == + @test combine(gdf, [:x, :y], [AsTable(v) => (x -> -x[1]) for v in [:x, :y]]) == [df DataFrame(x_function=-df.x, y_function=-df.y)] - @test by(df, :g, [:x, :y], 
[AsTable(v) => ByRow(x -> (-x[1],)) for v in [:x, :y]]) == - combine(gdf, [:x, :y], [AsTable(v) => ByRow(x -> (-x[1],)) for v in [:x, :y]]) == + @test combine(gdf, [:x, :y], [AsTable(v) => ByRow(x -> (-x[1],)) for v in [:x, :y]]) == [df DataFrame(x_function=[(-1,), (-2,) ,(-3,) ,(-4,) ,(-5,)], y_function=[(-6,), (-7,) ,(-8,) ,(-9,) ,(-10,)])] - @test_throws ArgumentError by(df, :g, AsTable([:x, :y]) => ByRow(identity)) - @test_throws ArgumentError by(df, :g, AsTable([:x, :y]) => ByRow(x -> df[1, :])) + @test_throws ArgumentError combine(gdf, AsTable([:x, :y]) => ByRow(identity)) + @test_throws ArgumentError combine(gdf, AsTable([:x, :y]) => ByRow(x -> df[1, :])) end @testset "test correctness of regrouping" begin df = DataFrame(g=[2,2,1,3,1,2,1,2,3]) gdf = groupby_checked(df, :g) - gdf2 = combine(identity, gdf, regroup=true) + gdf2 = validate_gdf(combine(identity, gdf, regroup=true)) @test combine(gdf, :g => sum) == combine(gdf2, :g => sum) df.id = 1:9 diff --git a/test/select.jl b/test/select.jl index 005b87e222..0416a5e0b7 100644 --- a/test/select.jl +++ b/test/select.jl @@ -705,6 +705,8 @@ end df_ref[1:2, []], view(df_ref, 1:2, []), df_ref[[], 1:2], view(df_ref, [], 1:2)] @test select(df, nrow => :z, nrow, [nrow => :z2]) == + repeat(DataFrame(z=nrow(df), nrow=nrow(df), z2=nrow(df)), nrow(df)) + @test combine(df, nrow => :z, nrow, [nrow => :z2]) == DataFrame(z=nrow(df), nrow=nrow(df), z2=nrow(df)) @test_throws ArgumentError select(df, nrow, nrow) @test_throws ArgumentError select(df, [nrow]) @@ -743,10 +745,13 @@ end @testset "select and select! 
empty selection" begin df = DataFrame(rand(10, 4)) - x = [1,2,3] + x = [1:10;] + y = [1,2,3] @test select(df, r"z") == DataFrame() @test select(df, r"z" => () -> x) == DataFrame(:function => x) + @test_throws ArgumentError select(df, r"z" => () -> y) + @test combine(df, r"z" => () -> y) == DataFrame(:function => y) @test select(df, r"z" => () -> x)[!, 1] === x # no copy even for copycols=true @test_throws MethodError select(df, r"z" => x -> 1) @test_throws ArgumentError select(df, r"z" => ByRow(rand)) @@ -890,10 +895,16 @@ end end @test_throws ArgumentError select(df, [] => (() -> [9]) => :a, :) @test_throws ArgumentError select(df, :, [] => (() -> [9]) => :a) - @test transform(df, names(df) .=> (x -> 9) .=> names(df)) == DataFrame([9 9 9]) + @test transform(df, names(df) .=> (x -> 9) .=> names(df)) == + repeat(DataFrame([9 9 9]), nrow(df)) + @test combine(df, names(df) .=> (x -> 9) .=> names(df)) == + DataFrame([9 9 9]) @test transform(df, names(df) .=> (x -> 9) .=> names(df), :x1 => :x4) == DataFrame([9 9 9 1; 9 9 9 4]) - @test transform(df3, names(df3) .=> (x -> 9) .=> names(df3)) == DataFrame([9 9 9]) + @test transform(df3, names(df3) .=> (x -> 9) .=> names(df3)) == + repeat(DataFrame([9 9 9]), nrow(df3)) + @test combine(df3, names(df3) .=> (x -> 9) .=> names(df3)) == + DataFrame([9 9 9]) @test transform(df3, names(df3) .=> (x -> 9) .=> names(df3), :x1 => :x4) == DataFrame(ones(0, 4)) @@ -901,6 +912,14 @@ end x3=[missing,2], x4=categorical([missing, 2])) df2 = select(df, names(df) .=> first) + @test df2 ≅ repeat(DataFrame(x1_first=1, x2_first=1, x3_first=missing, + x4_first=missing), nrow(df2)) + @test df2.x1_first isa Vector{Int} + @test df2.x2_first isa CategoricalVector{Int} + @test df2.x3_first isa Vector{Missing} + @test df2.x4_first isa Vector{Missing} + + df2 = combine(df, names(df) .=> first) @test df2 ≅ DataFrame(x1_first=1, x2_first=1, x3_first=missing, x4_first=missing) @test df2.x1_first isa Vector{Int} @@ -909,6 +928,14 @@ end @test df2.x4_first 
isa Vector{Missing} df2 = select(df, names(df) .=> last) + @test df2 ≅ repeat(DataFrame(x1_last=2, x2_last=2, x3_last=2, + x4_last=2), nrow(df2)) + @test df2.x1_last isa Vector{Int} + @test df2.x2_last isa CategoricalVector{Int} + @test df2.x3_last isa Vector{Int} + @test df2.x4_last isa CategoricalVector{Int} + + df2 = combine(df, names(df) .=> last) @test df2 ≅ DataFrame(x1_last=2, x2_last=2, x3_last=2, x4_last=2) @test df2.x1_last isa Vector{Int} @@ -953,31 +980,32 @@ end @test df2.x4_last isa CategoricalVector{Int} end - df2 = select(df, names(df) .=> first, [] => (() -> Int[]) => :x1) + @test_throws ArgumentError select(df, names(df) .=> first, [] => (() -> Int[]) => :x1) + df2 = combine(df, names(df) .=> first, [] => (() -> Int[]) => :x1) @test size(df2) == (0, 5) @test df2.x1_first isa Vector{Int} @test df2.x2_first isa CategoricalVector{Int} @test df2.x3_first isa Vector{Missing} @test df2.x4_first isa Vector{Missing} - - df2 = select(df, names(df) .=> last, [] => (() -> Int[]) => :x1) + @test_throws ArgumentError select(df, names(df) .=> last, [] => (() -> Int[]) => :x1) + df2 = combine(df, names(df) .=> last, [] => (() -> Int[]) => :x1) @test size(df2) == (0, 5) @test df2.x1_last isa Vector{Int} @test df2.x2_last isa CategoricalVector{Int} @test df2.x3_last isa Vector{Int} @test df2.x4_last isa CategoricalVector{Int} - - df2 = select(df, [] => (() -> Int[]) => :x1, names(df) .=> first) + @test_throws ArgumentError select(df, [] => (() -> Int[]) => :x1, names(df) .=> first) + df2 = combine(df, [] => (() -> Int[]) => :x1, names(df) .=> first) @test size(df2) == (0, 5) @test df2.x1_first isa Vector{Int} @test df2.x2_first isa CategoricalVector{Int} @test df2.x3_first isa Vector{Missing} @test df2.x4_first isa Vector{Missing} - - df2 = select(df, [] => (() -> Int[]) => :x1, names(df) .=> last) + @test_throws ArgumentError select(df, [] => (() -> Int[]) => :x1, names(df) .=> last) + df2 = combine(df, [] => (() -> Int[]) => :x1, names(df) .=> last) @test 
size(df2) == (0, 5) @test df2.x1_last isa Vector{Int} @test df2.x2_last isa CategoricalVector{Int} @@ -988,7 +1016,8 @@ end @testset "copycols special cases" begin df = DataFrame(a=1:3, b=4:6) c = [7, 8] - df2 = select(df, :a => (x -> c) => :c1, :b => (x -> c) => :c2) + @test_throws ArgumentError select(df, :a => (x -> c) => :c1, :b => (x -> c) => :c2) + df2 = combine(df, :a => (x -> c) => :c1, :b => (x -> c) => :c2) @test df2.c1 === df2.c2 df2 = select(df, :a => identity => :c1, :a => :c2) @test df2.c1 !== df2.c2 @@ -996,9 +1025,11 @@ end @test df2.c1 !== df.a df2 = select(df, :a => (x -> df.b) => :c1) @test df2.c1 === df.b - df2 = select(view(df, 1:2, :), :a => parent => :c1) + @test_throws ArgumentError select(view(df, 1:2, :), :a => parent => :c1) + df2 = combine(view(df, 1:2, :), :a => parent => :c1) @test df2.c1 !== df.a - df2 = select(view(df, 1:2, :), :a => (x -> view(x, 1:1)) => :c1) + @test_throws ArgumentError select(view(df, 1:2, :), :a => (x -> view(x, 1:1)) => :c1) + df2 = combine(view(df, 1:2, :), :a => (x -> view(x, 1:1)) => :c1) @test df2.c1 isa Vector df2 = select(df, :a, :a => :b, :a => identity => :c, copycols=false) @test df2.b === df2.c === df.a @@ -1059,14 +1090,16 @@ end df_ref = DataFrame(a=1:3, b=4:6) for df in [df_ref, view(df_ref, :, :)] @test select(df, [] .=> sum) == DataFrame() - @test select(df, names(df) .=> sum) == DataFrame(a_sum=6, b_sum=15) + @test select(df, names(df) .=> sum) == repeat(DataFrame(a_sum=6, b_sum=15), nrow(df)) + @test combine(df, names(df) .=> sum) == DataFrame(a_sum=6, b_sum=15) @test transform(df, names(df) .=> ByRow(-)) == DataFrame(:a => 1:3, :b => 4:6, Symbol("a_-") => -1:-1:-3, Symbol("b_-") => -4:-1:-6) @test select(df, :a, [] .=> sum, :b => :x, [:b, :a] .=> identity) == DataFrame(a=1:3, x=4:6, b_identity=4:6, a_identity=1:3) - @test select(df, names(df) .=> sum .=> [:A, :B]) == DataFrame(A=6, B=15) + @test select(df, names(df) .=> sum .=> [:A, :B]) == repeat(DataFrame(A=6, B=15), nrow(df)) + @test 
combine(df, names(df) .=> sum .=> [:A, :B]) == DataFrame(A=6, B=15) @test Base.broadcastable(ByRow(+)) isa Base.RefValue{ByRow{typeof(+)}} @test identity.(ByRow(+)) == ByRow(+) end @@ -1079,6 +1112,8 @@ end @test transform(df, AsTable(:) => sum) == DataFrame(a=1:3, b=4:6, c=7:9, a_b_c_sum=map(sum, eachrow(df))) @test select(df, AsTable(:) => sum ∘ sum) == + repeat(DataFrame(a_b_c_function=45), nrow(df)) + @test combine(df, AsTable(:) => sum ∘ sum) == DataFrame(a_b_c_function=45) @test transform(df, AsTable(:) => sum ∘ sum) == DataFrame(a=1:3, b=4:6, c=7:9, a_b_c_function=45) @@ -1095,7 +1130,8 @@ end @test_throws ArgumentError select(df, AsTable(:) => ByRow(x -> df[1, :])) @test_throws ArgumentError transform(df, AsTable(Not(:)) => ByRow(identity)) - @test select(df, AsTable(Not(:)) => Ref) == DataFrame(Ref = NamedTuple()) + @test select(df, AsTable(Not(:)) => Ref) == repeat(DataFrame(Ref = NamedTuple()), nrow(df)) + @test combine(df, AsTable(Not(:)) => Ref) == DataFrame(Ref = NamedTuple()) @test transform(df, AsTable(Not(:)) => Ref) == DataFrame(a=1:3, b=4:6, c=7:9, Ref=NamedTuple()) From 384c0b1b8275ad077f7b1dd8a3da2072544bf338 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 29 Apr 2020 15:14:38 +0200 Subject: [PATCH 12/29] change error thrown on Julia 1.0 --- test/select.jl | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/test/select.jl b/test/select.jl index 0416a5e0b7..584b43fa90 100644 --- a/test/select.jl +++ b/test/select.jl @@ -764,12 +764,7 @@ end @test_throws MethodError select!(df, r"z" => x -> 1) @test_throws ArgumentError select!(df, r"z" => ByRow(rand)) - - if VERSION >= v"1.4" - @test_throws MethodError select!(df, r"z" => () -> x, copycols=false) - else - @test_throws ErrorException select!(df, r"z" => () -> x, copycols=false) - end + @test_throws MethodError select!(df, r"z" => () -> x, copycols=false) select!(df, r"z" => () -> x) @test df == DataFrame(:function => x) From 
ea574c4f577fc12461cca0ce331eb5ad1c27b44b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 29 Apr 2020 16:19:40 +0200 Subject: [PATCH 13/29] done tests of combine --- test/select.jl | 168 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/test/select.jl b/test/select.jl index 584b43fa90..9a7a7a0784 100644 --- a/test/select.jl +++ b/test/select.jl @@ -1158,4 +1158,172 @@ end @test propertynames(df) == [:a,] end +@testset "combine AbstractDataFrame" begin + df = DataFrame(x=1:3, y=4:6) + + @test combine(x -> Matrix(x), df) == rename(df, [:x1, :x2]) + @test combine(x -> Ref(1:3), df) == DataFrame(x1=[1:3]) + @test_throws ArgumentError combine(df, x -> Ref(1:3)) + + @test combine(AsTable(:) => identity, df) == df + @test combine((:) => cor, df) == DataFrame(x_y_cor = 1.0) + @test combine(:x => x -> Ref(1:3), df) == DataFrame(x_function=[1:3]) + @test_throws ArgumentError combine(df, :x => x -> ones(1,1)) + + df2 = combine(df, :x => identity) + @test df2[:, 1] == df.x + @test df2[:, 1] !== df.x + + @test combine(df, :x => sum, :y => collect ∘ extrema) == + DataFrame(x_sum=[6, 6], y_function = [4, 6]) + @test combine(df, :y => collect ∘ extrema, :x => sum) == + DataFrame(y_function = [4, 6], x_sum=[6, 6]) + @test combine(df, :x => sum, :y => x -> []) == + DataFrame(x_sum=[], y_function = []) + @test combine(df, :y => x -> [], :x => sum) == + DataFrame(y_function = [], x_sum=[]) + + dfv = view(df, [2, 1], [2, 1]) + + @test combine(x -> Matrix(x), dfv) == rename(dfv, [:x1, :x2]) + + @test combine(AsTable(:) => identity, dfv) == dfv + @test combine((:) => cor, dfv) == DataFrame(y_x_cor = 1.0) + + df2 = combine(dfv, :x => identity) + @test df2[:, 1] == dfv.x + @test df2[:, 1] !== dfv.x + + @test combine(dfv, :x => sum, :y => collect ∘ extrema) == + DataFrame(x_sum=[3, 3], y_function = [4, 5]) + @test combine(dfv, :y => collect ∘ extrema, :x => sum) == + DataFrame(y_function = [4, 5], x_sum=[3, 3]) 
+end + +@testset "combine GroupedDataFrame" begin + for df in (DataFrame(g=[3,1,1,missing],x=1:4, y=5:8), + DataFrame(g=categorical([3,1,1,missing]),x=1:4, y=5:8)) + if !(df.g isa CategoricalVector) + gdf = groupby(df, :g, sort=false, skipmissing=false) + @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == + DataFrame(x_sum = [1, 5, 4]) + @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) + @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) + gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:3 + @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) + + @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + DataFrame(x_sum = [1, 5, 5, 4], g = [3, 1, 1, missing]) + @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4]) + gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == [1, 2, 2, 3] + @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4]) + + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == + DataFrame(x_sum = [1, 5, 4]) + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ + DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) + gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:3 + @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) + + gdf = groupby(df, :g, sort=false, skipmissing=true) + + @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == + DataFrame(x_sum = [1, 5]) + @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) + @test combine(gdf, :x => sum, keepkeys=true, 
regroup=false) ≅ + DataFrame(g = [3, 1], x_sum = [1, 5]) + gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:2 + @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5]) + + @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + DataFrame(x_sum = [1, 5, 5], g = [3, 1, 1]) + @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5]) + gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == [1, 2, 2] + @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5]) + + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == + DataFrame(x_sum = [1, 5]) + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ + DataFrame(g = [3, 1], x_sum = [1, 5]) + gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:2 + @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5]) + end + + gdf = groupby(df, :g, sort=true, skipmissing=false) + + @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == + DataFrame(x_sum = [5, 1, 4]) + @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) + @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) + gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:3 + @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) + + @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + DataFrame(x_sum = [5, 5, 1, 4], g = [1, 1, 3, missing]) + @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4]) + gdf2 = combine(gdf, :x => 
sum, :g, keepkeys=true, regroup=true) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == [1, 1, 2, 3] + @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4]) + + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == + DataFrame(x_sum = [5, 1, 4]) + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ + DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) + gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:3 + @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) + + gdf = groupby(df, :g, sort=true, skipmissing=true) + + @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == + DataFrame(x_sum = [5, 1]) + @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) + @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + DataFrame(g = [1, 3], x_sum = [5, 1]) + gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:2 + @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) + + @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + DataFrame(x_sum = [5, 5, 1], g = [1, 1, 3]) + @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1]) + gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == [1, 1, 2] + @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1]) + + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == + DataFrame(x_sum = [5, 1]) + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ + DataFrame(g = [1, 3], x_sum = [5, 1]) + gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true) + @test gdf2 isa 
GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:2 + @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) + end +end + end # module From 8977017c0726c3035a01a35606b1b69fe6aadde9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 30 Apr 2020 00:28:50 +0200 Subject: [PATCH 14/29] finish tests and documentation --- docs/src/man/getting_started.md | 30 ++- docs/src/man/split_apply_combine.md | 148 ++++++++++-- src/abstractdataframe/selection.jl | 45 +++- src/dataframe/dataframe.jl | 5 +- src/groupeddataframe/splitapplycombine.jl | 92 ++++++-- test/grouping.jl | 276 ++++++++++++++++++++++ test/select.jl | 201 ++++++---------- 7 files changed, 623 insertions(+), 174 deletions(-) diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md index 2ae81786cf..5cb011a4f8 100644 --- a/docs/src/man/getting_started.md +++ b/docs/src/man/getting_started.md @@ -792,7 +792,9 @@ julia> mean(df.A) 2.5 ``` -We can also apply a function to each column of a `DataFrame` using `select`. For example: +We can also apply a function to each column of a `DataFrame` using `select`. +`select` always returns the same number of rows in the result as the source +data frame. 
For example: ```jldoctest dataframe julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0) 4×2 DataFrame @@ -805,13 +807,37 @@ julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0) │ 4 │ 4 │ 1.0 │ julia> select(df, names(df) .=> sum) -1×2 DataFrame +4×2 DataFrame │ Row │ A_sum │ B_sum │ │ │ Int64 │ Float64 │ ├─────┼───────┼─────────┤ │ 1 │ 10 │ 10.0 │ +│ 2 │ 10 │ 10.0 │ +│ 3 │ 10 │ 10.0 │ +│ 4 │ 10 │ 10.0 │ julia> select(df, names(df) .=> sum, names(df) .=> prod) +4×4 DataFrame +│ Row │ A_sum │ B_sum │ A_prod │ B_prod │ +│ │ Int64 │ Float64 │ Int64 │ Float64 │ +├─────┼───────┼─────────┼────────┼─────────┤ +│ 1 │ 10 │ 10.0 │ 24 │ 24.0 │ +│ 2 │ 10 │ 10.0 │ 24 │ 24.0 │ +│ 3 │ 10 │ 10.0 │ 24 │ 24.0 │ +│ 4 │ 10 │ 10.0 │ 24 │ 24.0 │ +``` + +If instead you prefer to get a result collapsed to the number of rows returned +by the applied functions use the `combine` function: +``` +julia> combine(df, names(df) .=> sum) +1×2 DataFrame +│ Row │ A_sum │ B_sum │ +│ │ Int64 │ Float64 │ +├─────┼───────┼─────────┤ +│ 1 │ 10 │ 10.0 │ + +julia> combine(df, names(df) .=> sum, names(df) .=> prod) 1×4 DataFrame │ Row │ A_sum │ B_sum │ A_prod │ B_prod │ │ │ Int64 │ Float64 │ Int64 │ Float64 │ diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index 588f2b7fcc..99af65a2e6 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -6,10 +6,24 @@ framework for handling this sort of computation is described in the paper "[The Split-Apply-Combine Strategy for Data Analysis](http://www.jstatsoft.org/v40/i01)", written by Hadley Wickham. -The DataFrames package supports the split-apply-combine strategy through the `by` -function, which is a shorthand for `groupby` followed by `map` and/or `combine`. 
-`by` takes in three arguments: (1) a `DataFrame`, (2) one or more columns to split -the `DataFrame` on, and (3) a specification of one or more functions to apply to +The DataFrames package supports the split-apply-combine strategy through the +`combine`, `select`/`select!` and `transform`/`transform!` functions. + +In order to perform operations by groups you first need to create a `GroupedDataFrame` +object from your data frame using `groupby` function that takes two arguments: +(1) a data frame to be grouped, and (2) a set of columns to group by. + +The differences between the above functions are the following: +* `select`: return a data frame with the number and order of rows exactly the same + as the source, preserve only columns that have been calculated; +* `transform`: return a data frame with the number and order of rows exactly the same + as the source, preserve all columns from the source and columns that have been calculated; +* `select!`: is an in-place version of `select`; +* `transform!`: is an in-place version of `transform`; +* `combine`: does not put restrictions on number of rows returned, the order of rows + is specified by the order of groups in `GroupedDataFrame`. + +All these functions take a specification of one or more functions to apply to each subset of the `DataFrame`. This specification can be of the following forms: 1. standard column selectors (integers, symbols, vectors of integers, vectors of symbols, `All`, `:`, `Between`, `Not` and regular expressions) @@ -27,19 +41,20 @@ each subset of the `DataFrame`. This specification can be of the following forms number of columns are processed (in which case `SubDataFrame` avoids excessive compilation) -All forms except 1 and 6 can be also passed as the first argument to `map`. - As a special rule that applies to `cols => function` syntax, if `cols` is wrapped in an `AsTable` object then a `NamedTuple` containing columns selected by `cols` is passed to `function`. 
In all of these cases, `function` can return either a single row or multiple rows. `function` can always generate a single column by returning a single value or a vector. -Additionally, if `by` is passed exactly one `function` and `target_col` is not specified, +Additionally, if `combine` is passed exactly one `function` as a first argument +and `target_col` is not specified, `function` can return multiple columns in the form of an `AbstractDataFrame`, `AbstractMatrix`, `NamedTuple` or `DataFrameRow`. -Here are the rules specifying the shape of the resulting `DataFrame`: +Here are the rules specifying the shape of the resulting `DataFrame` in `combine` +(in `select`/`select!` and `transform`/`transform!` the result has the number +and order of rows equal to the source): - a single value produces a single row and column per group - a named tuple or `DataFrameRow` produces a single row and one column per field - a vector produces a single column with one row per entry @@ -87,7 +102,51 @@ julia> iris = DataFrame(CSV.File(joinpath(dirname(pathof(DataFrames)), "../docs/ │ 149 │ 6.2 │ 3.4 │ 5.4 │ 2.3 │ Iris-virginica │ │ 150 │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ Iris-virginica │ -julia> by(iris, :Species, :PetalLength => mean) +julia> gdf = groupby(iris, :Species) +GroupedDataFrame with 3 groups based on key: Species +First Group (50 rows): Species = "Iris-setosa" +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +│ │ Float64 │ Float64 │ Float64 │ Float64 │ String │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────────┤ +│ 1 │ 5.1 │ 3.5 │ 1.4 │ 0.2 │ Iris-setosa │ +│ 2 │ 4.9 │ 3.0 │ 1.4 │ 0.2 │ Iris-setosa │ +│ 3 │ 4.7 │ 3.2 │ 1.3 │ 0.2 │ Iris-setosa │ +│ 4 │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ Iris-setosa │ +│ 5 │ 5.0 │ 3.6 │ 1.4 │ 0.2 │ Iris-setosa │ +│ 6 │ 5.4 │ 3.9 │ 1.7 │ 0.4 │ Iris-setosa │ +│ 7 │ 4.6 │ 3.4 │ 1.4 │ 0.3 │ Iris-setosa │ +⋮ +│ 43 │ 4.4 │ 3.2 │ 1.3 │ 0.2 │ Iris-setosa │ +│ 44 │ 5.0 │ 3.5 │ 1.6 │ 0.6 │ Iris-setosa │ +│ 45 │ 5.1 │ 
3.8 │ 1.9 │ 0.4 │ Iris-setosa │ +│ 46 │ 4.8 │ 3.0 │ 1.4 │ 0.3 │ Iris-setosa │ +│ 47 │ 5.1 │ 3.8 │ 1.6 │ 0.2 │ Iris-setosa │ +│ 48 │ 4.6 │ 3.2 │ 1.4 │ 0.2 │ Iris-setosa │ +│ 49 │ 5.3 │ 3.7 │ 1.5 │ 0.2 │ Iris-setosa │ +│ 50 │ 5.0 │ 3.3 │ 1.4 │ 0.2 │ Iris-setosa │ +⋮ +Last Group (50 rows): Species = "Iris-virginica" +│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │ +│ │ Float64 │ Float64 │ Float64 │ Float64 │ String │ +├─────┼─────────────┼────────────┼─────────────┼────────────┼────────────────┤ +│ 1 │ 6.3 │ 3.3 │ 6.0 │ 2.5 │ Iris-virginica │ +│ 2 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ Iris-virginica │ +│ 3 │ 7.1 │ 3.0 │ 5.9 │ 2.1 │ Iris-virginica │ +│ 4 │ 6.3 │ 2.9 │ 5.6 │ 1.8 │ Iris-virginica │ +│ 5 │ 6.5 │ 3.0 │ 5.8 │ 2.2 │ Iris-virginica │ +│ 6 │ 7.6 │ 3.0 │ 6.6 │ 2.1 │ Iris-virginica │ +│ 7 │ 4.9 │ 2.5 │ 4.5 │ 1.7 │ Iris-virginica │ +⋮ +│ 43 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ Iris-virginica │ +│ 44 │ 6.8 │ 3.2 │ 5.9 │ 2.3 │ Iris-virginica │ +│ 45 │ 6.7 │ 3.3 │ 5.7 │ 2.5 │ Iris-virginica │ +│ 46 │ 6.7 │ 3.0 │ 5.2 │ 2.3 │ Iris-virginica │ +│ 47 │ 6.3 │ 2.5 │ 5.0 │ 1.9 │ Iris-virginica │ +│ 48 │ 6.5 │ 3.0 │ 5.2 │ 2.0 │ Iris-virginica │ +│ 49 │ 6.2 │ 3.4 │ 5.4 │ 2.3 │ Iris-virginica │ +│ 50 │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ Iris-virginica │ + +julia> combine(gdf, :PetalLength => mean) 3×2 DataFrame │ Row │ Species │ PetalLength_mean │ │ │ String │ Float64 │ @@ -96,7 +155,7 @@ julia> by(iris, :Species, :PetalLength => mean) │ 2 │ Iris-versicolor │ 4.26 │ │ 3 │ Iris-virginica │ 5.552 │ -julia> by(iris, :Species, nrow) +julia> combine(gdf, nrow) 3×2 DataFrame │ Row │ Species │ nrow │ │ │ String │ Int64 │ @@ -105,7 +164,7 @@ julia> by(iris, :Species, nrow) │ 2 │ Iris-versicolor │ 50 │ │ 3 │ Iris-virginica │ 50 │ -julia> by(iris, :Species, nrow, :PetalLength => mean => :mean) +julia> combine(gdf, nrow, :PetalLength => mean => :mean) 3×3 DataFrame │ Row │ Species │ nrow │ mean │ │ │ String │ Int64 │ Float64 │ @@ -114,9 +173,8 @@ julia> by(iris, :Species, nrow, :PetalLength => 
mean => :mean) │ 2 │ Iris-versicolor │ 50 │ 4.26 │ │ 3 │ Iris-virginica │ 50 │ 5.552 │ -julia> by(iris, :Species, - [:PetalLength, :SepalLength] => - (p, s) -> (a=mean(p)/mean(s), b=sum(p))) # multiple columns are passed as arguments +julia> combine([:PetalLength, :SepalLength] => (p, s) -> (a=mean(p)/mean(s), b=sum(p)), + gdf) # multiple columns are passed as arguments 3×3 DataFrame │ Row │ Species │ a │ b │ │ │ String │ Float64 │ Float64 │ @@ -125,9 +183,9 @@ julia> by(iris, :Species, │ 2 │ Iris-versicolor │ 0.717655 │ 213.0 │ │ 3 │ Iris-virginica │ 0.842744 │ 277.6 │ -julia> by(iris, :Species, - AsTable([:PetalLength, :SepalLength]) => - x -> std(x.PetalLength) / std(x.SepalLength)) # passing a NamedTuple +julia> combine(gdf, + AsTable([:PetalLength, :SepalLength]) => + x -> std(x.PetalLength) / std(x.SepalLength)) # passing a NamedTuple 3×2 DataFrame │ Row │ Species │ PetalLength_SepalLength_function │ │ │ String │ Float64 │ @@ -136,7 +194,7 @@ julia> by(iris, :Species, │ 2 │ Iris-versicolor │ 0.910378 │ │ 3 │ Iris-virginica │ 0.867923 │ -julia> by(iris, :Species, 1:2 => cor, nrow) +julia> combine(gdf, 1:2 => cor, nrow) 3×3 DataFrame │ Row │ Species │ SepalLength_SepalWidth_cor │ nrow │ │ │ String │ Float64 │ Int64 │ @@ -147,11 +205,61 @@ julia> by(iris, :Species, 1:2 => cor, nrow) ``` -The `by` function also supports the `do` block form. However, as noted above, +If we use `select` or `transform` instead of `combine` we always obtain the number +and order of rows in the result equal to the source.
In the example below +the return values in columns `:SepalLength_SepalWidth_cor` and `:nrow` are +broadcasted to match the number of elements in each group: +``` +julia> select(gdf, 1:2 => cor, nrow) +150×3 DataFrame +│ Row │ Species │ SepalLength_SepalWidth_cor │ nrow │ +│ │ String │ Float64 │ Int64 │ +├─────┼────────────────┼────────────────────────────┼───────┤ +│ 1 │ Iris-setosa │ 0.74678 │ 50 │ +│ 2 │ Iris-setosa │ 0.74678 │ 50 │ +│ 3 │ Iris-setosa │ 0.74678 │ 50 │ +│ 4 │ Iris-setosa │ 0.74678 │ 50 │ +│ 5 │ Iris-setosa │ 0.74678 │ 50 │ +│ 6 │ Iris-setosa │ 0.74678 │ 50 │ +│ 7 │ Iris-setosa │ 0.74678 │ 50 │ +⋮ +│ 143 │ Iris-virginica │ 0.457228 │ 50 │ +│ 144 │ Iris-virginica │ 0.457228 │ 50 │ +│ 145 │ Iris-virginica │ 0.457228 │ 50 │ +│ 146 │ Iris-virginica │ 0.457228 │ 50 │ +│ 147 │ Iris-virginica │ 0.457228 │ 50 │ +│ 148 │ Iris-virginica │ 0.457228 │ 50 │ +│ 149 │ Iris-virginica │ 0.457228 │ 50 │ +│ 150 │ Iris-virginica │ 0.457228 │ 50 │ + +julia> transform(gdf, nrow) +150×6 DataFrame +│ Row │ Species │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ nrow │ +│ │ String │ Float64 │ Float64 │ Float64 │ Float64 │ Int64 │ +├─────┼────────────────┼─────────────┼────────────┼─────────────┼────────────┼───────┤ +│ 1 │ Iris-setosa │ 5.1 │ 3.5 │ 1.4 │ 0.2 │ 50 │ +│ 2 │ Iris-setosa │ 4.9 │ 3.0 │ 1.4 │ 0.2 │ 50 │ +│ 3 │ Iris-setosa │ 4.7 │ 3.2 │ 1.3 │ 0.2 │ 50 │ +│ 4 │ Iris-setosa │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ 50 │ +│ 5 │ Iris-setosa │ 5.0 │ 3.6 │ 1.4 │ 0.2 │ 50 │ +│ 6 │ Iris-setosa │ 5.4 │ 3.9 │ 1.7 │ 0.4 │ 50 │ +│ 7 │ Iris-setosa │ 4.6 │ 3.4 │ 1.4 │ 0.3 │ 50 │ +⋮ +│ 143 │ Iris-virginica │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ 50 │ +│ 144 │ Iris-virginica │ 6.8 │ 3.2 │ 5.9 │ 2.3 │ 50 │ +│ 145 │ Iris-virginica │ 6.7 │ 3.3 │ 5.7 │ 2.5 │ 50 │ +│ 146 │ Iris-virginica │ 6.7 │ 3.0 │ 5.2 │ 2.3 │ 50 │ +│ 147 │ Iris-virginica │ 6.3 │ 2.5 │ 5.0 │ 1.9 │ 50 │ +│ 148 │ Iris-virginica │ 6.5 │ 3.0 │ 5.2 │ 2.0 │ 50 │ +│ 149 │ Iris-virginica │ 6.2 │ 3.4 │ 5.4 │ 2.3 │ 50 │ +│ 150 │ Iris-virginica │ 
5.9 │ 3.0 │ 5.1 │ 1.8 │ 50 │ +``` + +The `combine` function also supports the `do` block form. However, as noted above, this form is slow and should therefore be avoided when performance matters. ```jldoctest sac -julia> by(iris, :Species) do df +julia> combine(gdf) do df (m = mean(df.PetalLength), s² = var(df.PetalLength)) end 3×3 DataFrame diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index 52ea1b71cd..dec34a49c5 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -277,6 +277,7 @@ SELECT_ARG_RULES = select!(df::DataFrame, args...) Mutate `df` in place to retain only columns specified by `args...` and return it. +The result is guaranteed to have the same number of rows as `df`. $SELECT_ARG_RULES @@ -330,11 +331,13 @@ julia> df = DataFrame(a=1:3, b=4:6); julia> select!(df, names(df) .=> sum); julia> df -1×2 DataFrame +3×2 DataFrame │ Row │ a_sum │ b_sum │ │ │ Int64 │ Int64 │ ├─────┼───────┼───────┤ │ 1 │ 6 │ 15 │ +│ 2 │ 6 │ 15 │ +│ 3 │ 6 │ 15 │ julia> df = DataFrame(a=1:3, b=4:6); @@ -403,6 +406,7 @@ end transform!(df::DataFrame, args...) Mutate `df` in place to add columns specified by `args...` and return it. +The result is guaranteed to have the same number of rows as `df`. Equivalent to `select!(df, :, args...)`. See [`select!`](@ref) for detailed rules regarding accepted values for `args`. @@ -413,7 +417,7 @@ transform!(df::DataFrame, args...) = select!(df, :, args...) select(df::AbstractDataFrame, args...; copycols::Bool=true) Create a new data frame that contains columns from `df` specified by `args` and -return it. +return it. The result is guaranteed to have the same number of rows as `df`. If `df` is a `DataFrame` or `copycols=true` then column renaming and transformations are supported. 
@@ -500,18 +504,22 @@ julia> select(df, :, [:a, :b] => (a,b) -> a .+ b .- sum(b)/length(b)) │ 3 │ 3 │ 6 │ 4.0 │ julia> select(df, names(df) .=> sum) -1×2 DataFrame +3×2 DataFrame │ Row │ a_sum │ b_sum │ │ │ Int64 │ Int64 │ ├─────┼───────┼───────┤ │ 1 │ 6 │ 15 │ +│ 2 │ 6 │ 15 │ +│ 3 │ 6 │ 15 │ julia> select(df, names(df) .=> sum .=> [:A, :B]) -1×2 DataFrame +3×2 DataFrame │ Row │ A │ B │ │ │ Int64 │ Int64 │ ├─────┼───────┼───────┤ │ 1 │ 6 │ 15 │ +│ 2 │ 6 │ 15 │ +│ 3 │ 6 │ 15 │ julia> select(df, AsTable(:) => ByRow(mean)) 3×1 DataFrame @@ -532,6 +540,7 @@ select(df::AbstractDataFrame, args...; copycols::Bool=true) = Create a new data frame that contains columns from `df` and adds columns specified by `args` and return it. +The result is guaranteed to have the same number of rows as `df`. Equivalent to `select(df, :, args..., copycols=copycols)`. See [`select`](@ref) for detailed rules regarding accepted values for `args`. @@ -539,6 +548,34 @@ See [`select`](@ref) for detailed rules regarding accepted values for `args`. transform(df::AbstractDataFrame, args...; copycols::Bool=true) = select(df, :, args..., copycols=copycols) + +""" + combine(df::AbstractDataFrame, args...) + +Create a new data frame that contains columns from `df` specified by `args` and +return it. The result can have any number of rows that is determined by the +passed transformations. + +See [`select`](@ref) for detailed rules regarding accepted values for `args`. + +# Examples +```jldoctest +julia> df = DataFrame(a=1:3, b=4:6) +3×2 DataFrame +│ Row │ a │ b │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 4 │ +│ 2 │ 2 │ 5 │ +│ 3 │ 3 │ 6 │ + +julia> combine(df, :a => sum, nrow) +1×2 DataFrame +│ Row │ a_sum │ nrow │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 6 │ 3 │ +""" combine(df::AbstractDataFrame, args...) 
= _manipulate(df, args..., copycols=true, keeprows=false) diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 42864a3bd2..3d1fa9eff6 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -31,7 +31,7 @@ DataFrame(column_eltypes::AbstractVector, names::AbstractVector{<:AbstractString DataFrame(ds::AbstractDict; copycols::Bool=true) DataFrame(table; makeunique::Bool=false, copycols::Bool=true) DataFrame(::Union{DataFrame, SubDataFrame}; copycols::Bool=true) -DataFrame(::GroupedDataFrame) +DataFrame(::GroupedDataFrame; keepkeys::Bool=true) ``` # Arguments @@ -65,6 +65,9 @@ to fill a new vector of the appropriate length. As a particular rule values stored in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and treated in the same way. +Additionally `DataFrame` can be used to collect [`GroupedDataFrame`](@ref) +into a `DataFrame`. + # Notes The `DataFrame` constructor by default copies all columns vectors passed to it. Pass `copycols=false` to reuse vectors without copying them diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 9d0df8791f..91ce4a0e91 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -8,9 +8,10 @@ into row groups. - `df` : an `AbstractDataFrame` to split - `cols` : data frame columns to group by. Can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). 
-- `sort` : whether to sort rows according to the values of the grouping columns - `cols` -- `skipmissing` : whether to skip rows with `missing` values in one of the +- `sort` : whether to sort groups according to the values of the grouping columns + `cols`; if all `cols` are `CategoricalVector` then groups are always sorted + irrespective of the value of `sort` +- `skipmissing` : whether to skip groups with `missing` values in one of the grouping columns `cols` # Details @@ -217,7 +218,6 @@ const F_ARGUMENT_RULES = * Column transformation operations using the `Pair` notation that is described below and vectors of such pairs. - Transformations allowed using `Pair`s follow the rules specified for [`select`](@ref) and have the form `source_cols => fun`, `source_cols => fun => target_col`, or `source_col => target_col`. @@ -251,31 +251,33 @@ const KWARG_PROCESSING_RULES = in addition to those generated. In this case if the returned value contains columns with the same names as the grouping columns, they are required to be equal. + + If `regroup=false` the returned value is a `DataFrame`; if `regroup=true` it is a + `GroupedDataFrame` grouped using `keycols(gdf)`. """ """ - combine(gd::GroupedDataFrame, args...; keepkeys::Bool=true) - combine(fun::Union{Function, Type}, gd::GroupedDataFrame; keepkeys::Bool=true) - combine(pair::Pair, gd::GroupedDataFrame; keepkeys::Bool=true) - combine(gd::GroupedDataFrame, fun::Union{Function, Type}; keepkeys::Bool=true) - combine(gd::GroupedDataFrame, pair::Pair; keepkeys::Bool=true) + combine(gd::GroupedDataFrame, args...; keepkeys::Bool=true, regroup::Bool=false) + combine(fun::Union{Function, Type}, gd::GroupedDataFrame; + keepkeys::Bool=true, regroup::Bool=false) + combine(pair::Pair, gd::GroupedDataFrame; keepkeys::Bool=true, regroup::Bool=false) + combine(fun::Union{Function, Type}, df::AbstractDataFrame) + combine(pair::Pair, df::AbstractDataFrame) Transform a [`GroupedDataFrame`](@ref) into a `DataFrame`.
+As a special case if `combine` is passed an `AbstractDataFrame` it applies `fun` +or `pair` to the passed data frame as a whole. + $F_ARGUMENT_RULES $F_TYPE_RULES $KWARG_PROCESSING_RULES -The resulting data frame will be sorted if `sort=true` was passed to the +The resulting data frame will be sorted by `keycols(gd)` if `sort=true` was passed to the [`groupby`](@ref) call from which `gd` was constructed. Otherwise, ordering of rows -is undefined. - -See also: -- [`by(f, df, cols)`](@ref) is a shorthand for `combine(f, groupby(df, cols))`. -- [`map`](@ref): `combine(f, groupby(df, cols))` is a more efficient equivalent - of `combine(map(f, groupby(df, cols)))`. +follows the order of groups in `gd`. # Examples ```jldoctest @@ -295,6 +297,20 @@ julia> combine(gd, :c => sum, nrow) │ 3 │ 3 │ 10 │ 2 │ │ 4 │ 4 │ 12 │ 2 │ +julia> combine(gd, :c => sum, nrow, regroup=true) +GroupedDataFrame with 4 groups based on key: a +First Group (1 row): a = 1 +│ Row │ a │ c_sum │ nrow │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 1 │ 6 │ 2 │ +⋮ +Last Group (1 row): a = 4 +│ Row │ a │ c_sum │ nrow │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 4 │ 12 │ 2 │ + julia> combine(sdf -> sum(sdf.c), gd) # Slower variant 4×2 DataFrame │ Row │ a │ x1 │ @@ -305,7 +321,7 @@ julia> combine(sdf -> sum(sdf.c), gd) # Slower variant │ 3 │ 3 │ 10 │ │ 4 │ 4 │ 12 │ -julia> by(df, :a) do d # do syntax for the slower variant +julia> combine(gd) do d # do syntax for the slower variant sum(d.c) end 4×2 DataFrame @@ -415,8 +431,6 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum) │ 7 │ 4 │ 1 │ 4 │ 5 │ │ 8 │ 4 │ 1 │ 8 │ 9 │ ``` - -See [`by`](@ref) for more examples.
""" function combine(f::Base.Callable, gd::GroupedDataFrame; keepkeys::Bool=true, regroup::Bool=false) @@ -575,11 +589,11 @@ function combine_helper(f, gd::GroupedDataFrame, if keeprows newparent = hcat!(select(parent(gd), gd.cols, copycols=copycols), select(valscat, Not(intersect(keys, _names(valscat))), - copycols=false)) + copycols=false), copycols=false) else newparent = hcat!(parent(gd)[idx, gd.cols], select(valscat, Not(intersect(keys, _names(valscat))), - copycols=false)) + copycols=false), copycols=false) end regroup || return newparent @@ -1022,7 +1036,7 @@ function _combine(f::AbstractVector{<:Pair}, if keeprows if !_check_cannonical(gd) throw(ArgumentError("select or transform functions require that " * - "GroupedDataFrame is not sorted or subsetted")) + "GroupedDataFrame is not subsetted")) end idx_keeprows = Vector{Int}(undef, nrow(parent(gd))) let i = 0 @@ -1153,7 +1167,7 @@ function _combine(f::AbstractVector{<:Pair}, # this check is redundant given we check idx above # but it is safer to double check and it is cheap @assert all(x -> length(x) == length(outcols[1]), outcols) - return idx, DataFrame(collect(AbstractVector, outcols), nms) + return idx, DataFrame!(collect(AbstractVector, outcols), nms) end function _combine(fun::Base.Callable, gd::GroupedDataFrame, ::Nothing, @@ -1425,16 +1439,39 @@ function _combine_tables_with_first!(first::Union{AbstractDataFrame, return outcols, colnames end +""" + select(gd::GroupedDataFrame, args...; + copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) + +Apply `args` to `gd` following the rules described in [`combine`](@ref). +Ensure that the return value has number of rows equal to `nrow(parent(gd))`. + +If `copycols=false` then do not perform copying of columns that are not transformed. 
+""" select(gd::GroupedDataFrame, args...; copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) = _combine_executor(gd, args..., copycols=copycols, keepkeys=keepkeys, regroup=regroup, keeprows=true) -DataFrames.transform(gd::GroupedDataFrame, args...; +""" + transform(gd::GroupedDataFrame, args...; + copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) + +An equivalent of +`select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, regroup=regroup)` +""" +transform(gd::GroupedDataFrame, args...; copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) = select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, regroup=regroup) +""" + select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) + +An equivalent of +`select(gd, args..., copycols=false, keepkeys=true, regroup=regroup)` +but updates the `parent(gd)` in place. +""" function select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) newdf = select(gd, args..., copycols=false, regroup=false) df = parent(gd) @@ -1448,5 +1485,12 @@ function select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) return regroup ? gd : df end +""" + transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) + +An equivalent of +`transform(gd, args..., copycols=false, keepkeys=true, regroup=regroup)` +but updates the `parent(gd)` in place. 
+""" transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) = select!(gd, :, args..., regroup=regroup) diff --git a/test/grouping.jl b/test/grouping.jl index a0285dac76..2b4a5feba8 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -1920,4 +1920,280 @@ end @test select(gdf2, :g => sum) == combine(gdf2, :g => sum, :g) end +@testset "combine GroupedDataFrame" begin + for df in (DataFrame(g=[3,1,1,missing],x=1:4, y=5:8), + DataFrame(g=categorical([3,1,1,missing]),x=1:4, y=5:8)) + if !(df.g isa CategoricalVector) + gdf = groupby_checked(df, :g, sort=false, skipmissing=false) + + @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == + DataFrame(x_sum = [1, 5, 4]) + @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) + @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) + gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:3 + @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) + @test DataFrame(gdf2, keepkeys=false) == DataFrame(x_sum = [1, 5, 4]) + + @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + DataFrame(x_sum = [1, 5, 5, 4], g = [3, 1, 1, missing]) + @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4]) + gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == [1, 2, 2, 3] + @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4]) + @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 5, 4]) + + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == + DataFrame(x_sum = [1, 5, 4]) + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ + DataFrame(g = [3, 1, missing], x_sum 
= [1, 5, 4]) + gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:3 + @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) + @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 4]) + + gdf = groupby_checked(df, :g, sort=false, skipmissing=true) + + @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == + DataFrame(x_sum = [1, 5]) + @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) + @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + DataFrame(g = [3, 1], x_sum = [1, 5]) + gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:2 + @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5]) + @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5]) + + @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + DataFrame(x_sum = [1, 5, 5], g = [3, 1, 1]) + @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5]) + gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == [1, 2, 2] + @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5]) + @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 5]) + + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == + DataFrame(x_sum = [1, 5]) + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ + DataFrame(g = [3, 1], x_sum = [1, 5]) + gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:2 + @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5]) + @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = 
[1, 5]) + end + + gdf = groupby_checked(df, :g, sort=true, skipmissing=false) + + @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == + DataFrame(x_sum = [5, 1, 4]) + @test_throws ArgumentError validate_gdf(combine(gdf, :x => sum, keepkeys=false, regroup=true)) + @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) + gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:3 + @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) + @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1, 4]) + + @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + DataFrame(x_sum = [5, 5, 1, 4], g = [1, 1, 3, missing]) + @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4]) + gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == [1, 1, 2, 3] + @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4]) + @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 5, 1, 4]) + + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == + DataFrame(x_sum = [5, 1, 4]) + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ + DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) + gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:3 + @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) + @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1, 4]) + + gdf = groupby_checked(df, :g, sort=true, skipmissing=true) + + @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == + DataFrame(x_sum = [5, 1]) + @test_throws ArgumentError 
combine(gdf, :x => sum, keepkeys=false, regroup=true) + @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + DataFrame(g = [1, 3], x_sum = [5, 1]) + gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:2 + @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) + @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1]) + + @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + DataFrame(x_sum = [5, 5, 1], g = [1, 1, 3]) + @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1]) + gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == [1, 1, 2] + @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1]) + @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 5, 1]) + + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == + DataFrame(x_sum = [5, 1]) + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ + DataFrame(g = [1, 3], x_sum = [5, 1]) + gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == 1:2 + @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) + @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1]) + end +end + +@testset "select and transform GroupedDataFrame" begin + for df in (DataFrame(g=[3,1,1,missing],x=1:4, y=5:8), + DataFrame(g=categorical([3,1,1,missing]),x=1:4, y=5:8)), + dosort in (true, false) + + gdf = groupby_checked(df, :g, sort=dosort, skipmissing=false) + + @test select(gdf, :x => sum, keepkeys=false, regroup=false) == + DataFrame(x_sum = [1, 5, 5, 4]) + @test_throws ArgumentError select(gdf, :x => sum, keepkeys=false, regroup=true) + @test select(gdf, :x => sum, keepkeys=true, 
regroup=false) ≅ + DataFrame(g = df.g, x_sum = [1, 5, 5, 4]) + gdf2 = validate_gdf(select(gdf, :x => sum, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == gdf.groups + @test parent(gdf2).g ≅ df.g + @test parent(gdf2).g !== df.g + + @test select(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + DataFrame(x_sum = [1, 5, 5, 4], g = df.g) + @test select(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + DataFrame(g = df.g, x_sum = [1, 5, 5, 4]) + gdf2 = validate_gdf(select(gdf, :x => sum, :g, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == gdf.groups + @test parent(gdf2).g ≅ df.g + @test parent(gdf2).g !== df.g + + @test transform(gdf, :x => sum, keepkeys=false, regroup=false) ≅ + [df DataFrame(x_sum = [1, 5, 5, 4])] + @test_throws ArgumentError transform(gdf, :x => sum, keepkeys=false, regroup=true) + @test transform(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + DataFrame(g = df.g, x = df.x, y = df.y, x_sum = [1, 5, 5, 4]) + gdf2 = validate_gdf(transform(gdf, :x => sum, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == gdf.groups + @test parent(gdf2).g ≅ df.g + @test parent(gdf2).x ≅ df.x + @test parent(gdf2).y ≅ df.y + @test parent(gdf2).g !== df.g + + @test transform(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + [df DataFrame(x_sum = [1, 5, 5, 4])] + @test transform(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + [df DataFrame(x_sum = [1, 5, 5, 4])] + gdf2 = validate_gdf(transform(gdf, :x => sum, :g, keepkeys=true, regroup=true)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == gdf.groups + @test parent(gdf2).g ≅ df.g + @test parent(gdf2).x ≅ df.x + @test parent(gdf2).y ≅ df.y + @test parent(gdf2).g !== df.g + + df2 = transform(gdf, :x => sum, :g, keepkeys=false, regroup=false, copycols=false) + @test df2 ≅ [df DataFrame(x_sum = [1, 5, 5, 4])] + @test df2.g === df.g + @test 
df2.x === df.x + @test df2.y === df.y + df2 = transform(gdf, :x => sum, :g, keepkeys=true, regroup=false, copycols=false) + @test df2 ≅ [df DataFrame(x_sum = [1, 5, 5, 4])] + @test df2.g === df.g + @test df2.x === df.x + @test df2.y === df.y + gdf2 = validate_gdf(transform(gdf, :x => sum, :g, keepkeys=true, regroup=true, copycols=false)) + @test gdf2 isa GroupedDataFrame{DataFrame} + @test gdf2.groups == gdf.groups + @test parent(gdf2).g ≅ df.g + @test parent(gdf2).x ≅ df.x + @test parent(gdf2).y ≅ df.y + @test parent(gdf2).g === df.g + + gdf = groupby_checked(df, :g, sort=dosort, skipmissing=true) + @test_throws ArgumentError select(gdf, :x => sum) + @test_throws ArgumentError select(gdf, :x => sum, regroup=true) + @test_throws ArgumentError transform(gdf, :x => sum) + @test_throws ArgumentError transform(gdf, :x => sum, regroup=true) + end +end + +@testset "select! and transform! GroupedDataFrame" begin + for df in (DataFrame(g=[3,1,1,missing],x=1:4, y=5:8), + DataFrame(g=categorical([3,1,1,missing]),x=1:4, y=5:8)), + dosort in (true, false) + + @test_throws MethodError select!(groupby_checked(view(df, :, :), :g), :x) + @test_throws MethodError transform!(groupby_checked(view(df, :, :), :g), :x) + + dfc = copy(df) + g = dfc.g + gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false) + @test select!(gdf, :x => sum) === dfc + @test dfc.g === g + @test dfc.x_sum == [1, 5, 5, 4] + @test propertynames(dfc) == [:g, :x_sum] + + dfc = copy(df) + g = dfc.g + x = dfc.x + y = dfc.y + gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false) + @test transform!(gdf, :g => first => :g, :x => first) === dfc + @test dfc.g === g + @test dfc.x === x + @test dfc.y === y + @test dfc.x_first == [1, 2, 2, 4] + @test propertynames(dfc) == [:g, :x, :y, :x_first] + + dfc = copy(df) + g = dfc.g + gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false) + @test select!(gdf, :x => sum, regroup=true) === gdf + @test dfc.g === g + @test dfc.x_sum == [1, 5, 5, 4] + @test 
propertynames(dfc) == [:g, :x_sum] + + dfc = copy(df) + g = dfc.g + x = dfc.x + y = dfc.y + gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false) + @test transform!(gdf, :g => first => :g, :x => first, regroup=true) === gdf + @test dfc.g === g + @test dfc.x === x + @test dfc.y === y + @test dfc.x_first == [1, 2, 2, 4] + @test propertynames(dfc) == [:g, :x, :y, :x_first] + + dfc = copy(df) + gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=true) + @test_throws ArgumentError select!(gdf, :x => sum) + @test_throws ArgumentError select!(gdf, :x => sum, regroup=true) + @test_throws ArgumentError transform!(gdf, :x => sum) + @test_throws ArgumentError transform!(gdf, :x => sum, regroup=true) + @test dfc ≅ df + end +end + end # module diff --git a/test/select.jl b/test/select.jl index 9a7a7a0784..a63a30e329 100644 --- a/test/select.jl +++ b/test/select.jl @@ -1200,130 +1200,85 @@ end DataFrame(y_function = [4, 5], x_sum=[3, 3]) end -@testset "combine GroupedDataFrame" begin - for df in (DataFrame(g=[3,1,1,missing],x=1:4, y=5:8), - DataFrame(g=categorical([3,1,1,missing]),x=1:4, y=5:8)) - if !(df.g isa CategoricalVector) - gdf = groupby(df, :g, sort=false, skipmissing=false) - @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == - DataFrame(x_sum = [1, 5, 4]) - @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) - @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ - DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) - gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true) - @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:3 - @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) - - @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ - DataFrame(x_sum = [1, 5, 5, 4], g = [3, 1, 1, missing]) - @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ - DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4]) - gdf2 = combine(gdf, :x => sum, :g, 
keepkeys=true, regroup=true) - @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == [1, 2, 2, 3] - @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4]) - - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == - DataFrame(x_sum = [1, 5, 4]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ - DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) - gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true) - @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:3 - @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) - - gdf = groupby(df, :g, sort=false, skipmissing=true) - - @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == - DataFrame(x_sum = [1, 5]) - @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) - @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ - DataFrame(g = [3, 1], x_sum = [1, 5]) - gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true) - @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:2 - @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5]) - - @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ - DataFrame(x_sum = [1, 5, 5], g = [3, 1, 1]) - @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ - DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5]) - gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true) - @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == [1, 2, 2] - @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5]) - - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == - DataFrame(x_sum = [1, 5]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ - DataFrame(g = [3, 1], x_sum = [1, 5]) - gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true) - @test gdf2 isa GroupedDataFrame{DataFrame} - 
@test gdf2.groups == 1:2 - @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5]) - end +@testset "select and transform AbstractDataFrame" begin + df = DataFrame(x=1:3, y=4:6) + @test select(df, :x => first) == DataFrame(x_first=fill(1,3)) + df2 = select(df, :x, :x => first, copycols=true) + @test df2 == DataFrame(x=df.x, x_first=fill(1,3)) + @test df2.x !== df.x + df2 = select(df, :x, :x => first, copycols=false) + @test df2 == DataFrame(x=df.x, x_first=fill(1,3)) + @test df2.x === df.x + @test_throws ArgumentError select(df, :x => x -> [first(x)], copycols=true) + @test_throws ArgumentError select(df, :x => x -> [first(x)], copycols=false) + + df2 = transform(df, :x => first, copycols=true) + @test df2 == [df DataFrame(x_first=fill(1,3))] + @test df2.x !== df.x + @test df2.y !== df.y + df2 = transform(df, :x => first, copycols=false) + @test df2 == [df DataFrame(x_first=fill(1,3))] + @test df2.x === df.x + @test df2.y === df.y + @test transform(df, names(df) .=> first .=> names(df)) == + DataFrame(x=fill(1, 3), y=fill(4, 3)) + @test_throws ArgumentError transform(df, :x => x -> [first(x)], copycols=true) + @test_throws ArgumentError transform(df, :x => x -> [first(x)], copycols=false) - gdf = groupby(df, :g, sort=true, skipmissing=false) - - @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == - DataFrame(x_sum = [5, 1, 4]) - @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) - @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ - DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) - gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true) - @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:3 - @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) - - @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ - DataFrame(x_sum = [5, 5, 1, 4], g = [1, 1, 3, missing]) - @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ - DataFrame(g = [1, 1, 3, 
missing], x_sum = [5, 5, 1, 4]) - gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true) - @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == [1, 1, 2, 3] - @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4]) - - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == - DataFrame(x_sum = [5, 1, 4]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ - DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) - gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true) - @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:3 - @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) - - gdf = groupby(df, :g, sort=true, skipmissing=true) - - @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == - DataFrame(x_sum = [5, 1]) - @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) - @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ - DataFrame(g = [1, 3], x_sum = [5, 1]) - gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true) - @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:2 - @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) - - @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ - DataFrame(x_sum = [5, 5, 1], g = [1, 1, 3]) - @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ - DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1]) - gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true) - @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == [1, 1, 2] - @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1]) - - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == - DataFrame(x_sum = [5, 1]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ - DataFrame(g = [1, 3], x_sum = [5, 1]) - gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, 
keepkeys=true, regroup=true) - @test gdf2 isa GroupedDataFrame{DataFrame} - @test gdf2.groups == 1:2 - @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) - end + dfv = view(df, [2, 1], [2, 1]) + @test select(dfv, :x => first) == DataFrame(x_first=fill(2,2)) + df2 = select(dfv, :x, :x => first, copycols=true) + @test df2 == DataFrame(x=dfv.x, x_first=fill(2,2)) + @test df2.x !== dfv.x + @test_throws ArgumentError select(dfv, :x, :x => first, copycols=false) + @test_throws ArgumentError select(dfv, :x => x -> [first(x)], copycols=true) + @test_throws ArgumentError select(dfv, :x => x -> [first(x)], copycols=false) + + df2 = transform(dfv, :x => first, copycols=true) + @test df2 == [dfv DataFrame(x_first=fill(2,2))] + @test df2.x !== dfv.x + @test df2.y !== dfv.y + @test_throws ArgumentError transform(dfv, :x => first, copycols=false) + @test transform(dfv, names(dfv) .=> first .=> names(dfv)) == + DataFrame(y=fill(5, 2), x=fill(2, 2)) + @test_throws ArgumentError transform(df, :x => x -> [first(x)], copycols=true) + @test_throws ArgumentError transform(df, :x => x -> [first(x)], copycols=false) +end + +@testset "select! and transform! AbstractDataFrame" begin + df = DataFrame(x=1:3, y=4:6) + select!(df, :x => first) + @test df == DataFrame(x_first = fill(1,3)) + + # if we select! 
we do copycols=false, so we can get aliases + df = DataFrame(x=1:3, y=4:6) + x = df.x + select!(df, :x => (x->x), :x) + @test x === df.x_function === df.x + + df = DataFrame(x=1:3, y=4:6) + @test_throws ArgumentError select!(df, :x => x -> [1]) + @test df == DataFrame(x=1:3, y=4:6) + + df = DataFrame(x=1:3, y=4:6) + x = df.x + y = df.y + transform!(df, :x => first) + @test df == DataFrame(x=x, y=y, x_first=fill(1,3)) + @test df.x == x + @test df.y == y + + df = DataFrame(x=1:3, y=4:6) + transform!(df, names(df) .=> first .=> names(df)) + @test df == DataFrame(x=fill(1,3), y=fill(4,3)) + + df = DataFrame(x=1:3, y=4:6) + @test_throws ArgumentError transform!(df, :x => x -> [1]) + @test df == DataFrame(x=1:3, y=4:6) + + dfv = view(df, [2, 1], [2, 1]) + @test_throws MethodError select!(dfv, 1) + @test_throws MethodError transform!(dfv, 1) end end # module From d51f3f8767c8862a7c12d7e4f112408517c66ce5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 30 Apr 2020 09:29:29 +0200 Subject: [PATCH 15/29] updates after review comments --- docs/src/man/split_apply_combine.md | 4 ++-- test/grouping.jl | 25 +++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index 99af65a2e6..a9287084fc 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -16,10 +16,10 @@ object from your data frame using `groupby` function that takes two arguments: The differences between the above functions are the following: * `select`: return a data frame with the number and order of rows exactly the same as the source, preserve only columns that have been calculated; + `select!`: is an in-place version of `select`; * `transform`: return a data frame with the number and order of rows exactly the same as the source, preserve all columns from the source and columns that have been calculated; -* `select!`: is an in-place version 
of `select`; -* `transform!`: is an in-place version of `transform`; + `transform!`: is an in-place version of `transform`; * `combine`: does not put restrictions on number of rows returned, the order of rows is specified by the order of groups in `GroupedDataFrame`. diff --git a/test/grouping.jl b/test/grouping.jl index 2b4a5feba8..7e0144e9e9 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -2136,6 +2136,27 @@ end @test_throws ArgumentError transform(gdf, :x => sum) @test_throws ArgumentError transform(gdf, :x => sum, regroup=true) end + + # show the difference between the ordering of rows in select and combine + Random.seed!(1) + for df in (DataFrame(g=rand(1:20, 1000), x=rand(1000), id=1:1000), + DataFrame(g=categorical(rand(1:20, 1000)), x=rand(1000), id=1:1000)), + dosort in (true, false) + + gdf = groupby(df, :g, sort=dosort) + + res1 = select(gdf, :x => mean, :x => x -> x .- mean(x), :id) + @test res1.g == df.g + @test res1.id == df.id + @test res1.x_mean + res1.x_function ≈ df.x + + res2 = combine(gdf, :x => mean, :x => x -> x .- mean(x), :id) + @test unique(res2.g) == + (dosort || df.g isa CategoricalVector ? sort! : identity)(unique(df.g)) + for i in unique(res2.g) + @test issorted(filter(:g => x -> x == i, res2).id) + end + end end @testset "select! and transform! 
GroupedDataFrame" begin @@ -2169,7 +2190,7 @@ end dfc = copy(df) g = dfc.g gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false) - @test select!(gdf, :x => sum, regroup=true) === gdf + @test validate_gdf(select!(gdf, :x => sum, regroup=true)) === gdf @test dfc.g === g @test dfc.x_sum == [1, 5, 5, 4] @test propertynames(dfc) == [:g, :x_sum] @@ -2179,7 +2200,7 @@ end x = dfc.x y = dfc.y gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false) - @test transform!(gdf, :g => first => :g, :x => first, regroup=true) === gdf + @test validate_gdf(transform!(gdf, :g => first => :g, :x => first, regroup=true)) === gdf @test dfc.g === g @test dfc.x === x @test dfc.y === y From ef461e6767ca4ea21b072f2f057c29751df49d8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 May 2020 10:20:45 +0200 Subject: [PATCH 16/29] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- docs/src/man/getting_started.md | 6 +-- docs/src/man/split_apply_combine.md | 26 ++++++----- src/abstractdataframe/selection.jl | 8 ++-- src/dataframe/dataframe.jl | 2 +- src/groupeddataframe/splitapplycombine.jl | 57 +++++++++-------------- 5 files changed, 45 insertions(+), 54 deletions(-) diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md index 5cb011a4f8..bd6b7890fa 100644 --- a/docs/src/man/getting_started.md +++ b/docs/src/man/getting_started.md @@ -792,9 +792,9 @@ julia> mean(df.A) 2.5 ``` -We can also apply a function to each column of a `DataFrame` using `select`. -`select` always returns the same number of rows in the result as the source -data frame. For example: +We can also apply a function to each column of a `DataFrame` using `select`, +which always returns the same number of rows in the result as the source +data frame (repeating values as necessary). 
For example: ```jldoctest dataframe julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0) 4×2 DataFrame diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index a9287084fc..ed2252009c 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -7,19 +7,19 @@ framework for handling this sort of computation is described in the paper written by Hadley Wickham. The DataFrames package supports the split-apply-combine strategy through the -`combine`, `select`/`select!` and `transform`/`transform!` functions. +`groupby` function followed by `combine`, `select`/`select!` or `transform`/`transform!`. In order to perform operations by groups you first need to create a `GroupedDataFrame` -object from your data frame using `groupby` function that takes two arguments: +object from your data frame using the `groupby` function that takes two arguments: (1) a data frame to be grouped, and (2) a set of columns to group by. -The differences between the above functions are the following: +Operations can then be applied on each group using one of the following functions: * `select`: return a data frame with the number and order of rows exactly the same - as the source, preserve only columns that have been calculated; - `select!`: is an in-place version of `select`; + as the source data frame, including only new calculated columns; + `select!` is an in-place version of `select`; * `transform`: return a data frame with the number and order of rows exactly the same - as the source, preserve all columns from the source and columns that have been calculated; - `transform!`: is an in-place version of `transform`; + as the source data frame, including all columns from the source and new calculated columns; + `transform!` is an in-place version of `transform`; * `combine`: does not put restrictions on number of rows returned, the order of rows is specified by the order of groups in `GroupedDataFrame`.
@@ -52,9 +52,10 @@ and `target_col` is not specified, `function` can return multiple columns in the form of an `AbstractDataFrame`, `AbstractMatrix`, `NamedTuple` or `DataFrameRow`. -Here are the rules specifying the shape of the resulting `DataFrame` in `combine` -(in `select`/`select!` and `transform`/`transform!` the result has the number -and order of rows equal to the source): +`select`/`select!` and `transform`/`transform!` always return a `DataFrame` +with the same number of rows as the source. +For `combine`, the shape of the resulting `DataFrame` is determined +according to the following rules: - a single value produces a single row and column per group - a named tuple or `DataFrameRow` produces a single row and one column per field - a vector produces a single column with one row per entry @@ -205,8 +206,9 @@ julia> combine(gdf, 1:2 => cor, nrow) ``` -If we use `select` or `transform` instead of `combine` we always obtain the number -and of order of rows in the result equal to the source. In the example below +Contrary to `combine`, the `select` and `transform` functions always return +a data frame with the same number and order of rows as the source. 
+In the example below the return values in columns `:SepalLength_SepalWidth_cor` and `:nrow` are broadcasted to match the number of elements in each group: ``` diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index dec34a49c5..e86d10ed70 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -183,8 +183,8 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable}, # this means that we use `select` or `transform` not `combine` if !allow_resizing_newdf[] && ncol(newdf) == 0 && length(res) != nrow(df) throw(ArgumentError("length $(length(res)) of vector returned from " * - "function $fun is different than number of rows" * - " $(nrow(df)) of the source data frame.")) + "function $fun is different from number of rows " * + "$(nrow(df)) of the source data frame.")) end allow_resizing_newdf[] = false respar = parent(res) @@ -554,7 +554,7 @@ transform(df::AbstractDataFrame, args...; copycols::Bool=true) = Create a new data frame that contains columns from `df` specified by `args` and return it. The result can have any number of rows that is determined by the -passed transformations. +values returned by passed transformations. See [`select`](@ref) for detailed rules regarding accepted values for `args`. @@ -657,7 +657,7 @@ function _process(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows end # we allow resizing newdf only if up to some point only scalars were put # in it. 
The moment we put any vector into newdf its number of rows becomes fixed - # Also if keeprows is true then we make sure to rpoduce nrow(df) rows so resizing + # Also if keeprows is true then we make sure to produce nrow(df) rows so resizing # is not allowed allow_resizing_newdf = Ref(!keeprows) for nc in normalized_cs diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 3d1fa9eff6..71fffd6923 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -65,7 +65,7 @@ to fill a new vector of the appropriate length. As a particular rule values stored in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and treated in the same way. -Additionally `DataFrame` can be used to collect [`GroupedDataFrame`](@ref) +Additionally `DataFrame` can be used to collect a [`GroupedDataFrame`](@ref) into a `DataFrame`. # Notes diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 91ce4a0e91..7e6955d4f6 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -9,7 +9,7 @@ into row groups. - `cols` : data frame columns to group by. Can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). - `sort` : whether to sort groups according to the values of the grouping columns - `cols`; if all `cols` are `CategoricalVector` then groups are always sorted + `cols`; if all `cols` are `CategoricalVector`s then groups are always sorted irrespective of the value of `sort` - `skipmissing` : whether to skip groups with `missing` values in one of the grouping columns `cols` @@ -252,7 +252,7 @@ const KWARG_PROCESSING_RULES = value contains columns with the same names as the grouping columns, they are required to be equal. - If `regroup=false`, if the returned value should be a `DataFrame` or a + If `regroup=true`, the returned value must be a `DataFrame` or a `GroupedDataFrame` grouped using `keycols(gdf)`. 
""" @@ -264,10 +264,9 @@ const KWARG_PROCESSING_RULES = combine(fun::Union{Function, Type}, df::AbstractDataFrame) combine(pair::Pair, df::AbstractDataFrame) -Transform a [`GroupedDataFrame`](@ref) into a `DataFrame`. - -As a special case if `combine` is passed an `AbstractDataFrame` it applies `fun` -or `pair` to the passed data frame as a whole. +Apply operations to each group in a [`GroupedDataFrame`](@ref) and return +the combined result as a `DataFrame`. +If an `AbstractDataFrame` is passed, apply operations to the data frame as a whole. $F_ARGUMENT_RULES @@ -574,27 +573,20 @@ function combine_helper(f, gd::GroupedDataFrame, keys = groupcols(gd) for key in keys if hasproperty(valscat, key) - if keeprows - isequal(valscat[!, key], parent(gd)[!, key]) || - throw(ArgumentError("column :$key in returned data frame " * - "is not equal to grouping key :$key")) - - else - isequal(valscat[!, key], view(parent(gd)[!, key], idx)) || + if (keeprows && !isequal(valscat[!, key], parent(gd)[!, key])) || + (!keeprows && !isequal(valscat[!, key], view(parent(gd)[!, key], idx))) throw(ArgumentError("column :$key in returned data frame " * "is not equal to grouping key :$key")) end end end if keeprows - newparent = hcat!(select(parent(gd), gd.cols, copycols=copycols), - select(valscat, Not(intersect(keys, _names(valscat))), - copycols=false), copycols=false) + newparent = select(parent(gd), gd.cols, copycols=copycols) else - newparent = hcat!(parent(gd)[idx, gd.cols], - select(valscat, Not(intersect(keys, _names(valscat))), - copycols=false), copycols=false) + newparent = parent(gd)[idx, gd.cols] end + hcat!(newparent, select(valscat, Not(intersect(keys, _names(valscat))), copycols=false), + copycols=false) regroup || return newparent if length(idx) == 0 @@ -1035,8 +1027,9 @@ function _combine(f::AbstractVector{<:Pair}, if keeprows if !_check_cannonical(gd) - throw(ArgumentError("select or transform functions require that " * - "GroupedDataFrame is not subsetted")) + 
throw(ArgumentError("select and transform do not support " * + "GroupedDataFrames from which some groups have been dropped "* + "(including skipmissing=true)"))) end idx_keeprows = Vector{Int}(undef, nrow(parent(gd))) let i = 0 @@ -1121,11 +1114,7 @@ function _combine(f::AbstractVector{<:Pair}, @assert !isnothing(idx_agg) idx = idx_agg else - if keeprows - idx = idx_keeprows - else - idx = res[idx_loc][1] - end + idx = keeprows ? idx_keeprows : res[idx_loc][1] agg2idx_map = nothing for i in 1:length(res) if res[i][1] !== idx && res[i][1] != idx @@ -1138,9 +1127,8 @@ function _combine(f::AbstractVector{<:Pair}, res[i] = idx_agg, res[i][2][agg2idx_map] elseif idx != res[i][1] if keeprows - throw(ArgumentError("all functions must return vectors of " * - "the length equal to the group rows count " * - "in the source GroupedDataFrame")) + throw(ArgumentError("all functions must return vectors with " * + "as many values as rows in each group")) else throw(ArgumentError("all functions must return vectors of the same length")) end @@ -1157,8 +1145,9 @@ function _combine(f::AbstractVector{<:Pair}, if keeprows && res[i][1] !== idx_keeprows # we need to reorder the column newcol = similar(col) # we can probably make it more efficient, but I leave it as an optimization for the future - for i in axes(col, 1) - newcol[gd.idx[i]] = col[i] + gd_idx = gd.idx + for j in eachindex(gd.idx, col) + newcol[gd_idx[j]] = col[j] end res[i] = (col_idx, newcol) end @@ -1470,10 +1459,10 @@ transform(gd::GroupedDataFrame, args...; An equivalent of `select(gd, args..., copycols=false, keepkeys=true, regroup=regroup)` -but updates the `parent(gd)` in place. +but updates `parent(gd)` in place. 
""" function select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) - newdf = select(gd, args..., copycols=false, regroup=false) + newdf = select(gd, args..., copycols=false) df = parent(gd) copy!(_columns(df), _columns(newdf)) x = index(df) @@ -1490,7 +1479,7 @@ end An equivalent of `transform(gd, args..., copycols=false, keepkeys=true, regroup=regroup)` -but updates the `parent(gd)` in place. +but updates `parent(gd)` in place. """ transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) = select!(gd, :, args..., regroup=regroup) From 245714d8bfa6269a16bd3e070e9baa1f064466f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 May 2020 11:41:57 +0200 Subject: [PATCH 17/29] fixes after code review --- docs/src/man/getting_started.md | 35 +--- docs/src/man/split_apply_combine.md | 89 +++++----- src/abstractdataframe/selection.jl | 89 +++------- src/dataframe/dataframe.jl | 11 +- src/deprecated.jl | 20 +-- src/groupeddataframe/splitapplycombine.jl | 205 ++++++++++++++++++---- 6 files changed, 260 insertions(+), 189 deletions(-) diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md index bd6b7890fa..d8d22f0bce 100644 --- a/docs/src/man/getting_started.md +++ b/docs/src/man/getting_started.md @@ -773,8 +773,8 @@ julia> describe(df) ``` -If you are interested in describing only a subset of columns then the easiest way to do it is to -pass a subset of an original data frame to `describe` like this: +If you are interested in describing only a subset of columns then the easiest way +to do it is to pass a subset of an original data frame to `describe` like this: ```jldoctest dataframe julia> describe(df[!, [:A])) 1×8 DataFrame @@ -792,9 +792,7 @@ julia> mean(df.A) 2.5 ``` -We can also apply a function to each column of a `DataFrame` using `select`, -which always returns the same number of rows in the result as the source -data frame (repeating values as necessary). 
For example: +We can also apply a function to each column of a `DataFrame` using `combine`. For example: ```jldoctest dataframe julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0) 4×2 DataFrame @@ -806,30 +804,6 @@ julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0) │ 3 │ 3 │ 2.0 │ │ 4 │ 4 │ 1.0 │ -julia> select(df, names(df) .=> sum) -4×2 DataFrame -│ Row │ A_sum │ B_sum │ -│ │ Int64 │ Float64 │ -├─────┼───────┼─────────┤ -│ 1 │ 10 │ 10.0 │ -│ 2 │ 10 │ 10.0 │ -│ 3 │ 10 │ 10.0 │ -│ 4 │ 10 │ 10.0 │ - -julia> select(df, names(df) .=> sum, names(df) .=> prod) -4×4 DataFrame -│ Row │ A_sum │ B_sum │ A_prod │ B_prod │ -│ │ Int64 │ Float64 │ Int64 │ Float64 │ -├─────┼───────┼─────────┼────────┼─────────┤ -│ 1 │ 10 │ 10.0 │ 24 │ 24.0 │ -│ 2 │ 10 │ 10.0 │ 24 │ 24.0 │ -│ 3 │ 10 │ 10.0 │ 24 │ 24.0 │ -│ 4 │ 10 │ 10.0 │ 24 │ 24.0 │ -``` - -If instead you prefer to get a result collapsed to the number of rows returned -by the applied functions use the `combine` function: -``` julia> combine(df, names(df) .=> sum) 1×2 DataFrame │ Row │ A_sum │ B_sum │ @@ -845,6 +819,9 @@ julia> combine(df, names(df) .=> sum, names(df) .=> prod) │ 1 │ 10 │ 10.0 │ 24 │ 24.0 │ ``` +If you would prefer the result to have the same number of rows as the source data +frame use `select` instead of `combine`. + ### Handling of Columns Stored in a `DataFrame` Functions that transform a `DataFrame` to produce a diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index ed2252009c..96375c9f77 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -7,21 +7,22 @@ framework for handling this sort of computation is described in the paper written by Hadley Wickham. The DataFrames package supports the split-apply-combine strategy through the -`groupby` function followed by `combine`, `select`/`select!` or transform`/`transform!`. +`groupby` function followed by `combine`, `select`/`select!` or `transform`/`transform!`. 
In order to perform operations by groups you first need to create a `GroupedDataFrame` object from your data frame using the `groupby` function that takes two arguments: (1) a data frame to be grouped, and (2) a set of columns to group by. Operations can then be applied on each group using one of the following functions: +* `combine`: does not put restrictions on number of rows returned, the order of rows + is specified by the order of groups in `GroupedDataFrame`; it is typically used + to compute summary statistics by group; * `select`: return a data frame with the number and order of rows exactly the same as the source data frame, including only new calculated columns; `select!` is an in-place version of `select`; * `transform`: return a data frame with the number and order of rows exactly the same as the source data frame, including all columns from the source and new calculated columns; - `transform!` is an in-place version of `transform`; -* `combine`: does not put restrictions on number of rows returned, the order of rows - is specified by the order of groups in `GroupedDataFrame`. + `transform!` is an in-place version of `transform`. All these functions take a specification of one or more functions to apply to each subset of the `DataFrame`. 
This specification can be of the following forms: @@ -212,49 +213,49 @@ In the example below the return values in columns `:SepalLength_SepalWidth_cor` and `:nrow` are broadcasted to match the number of elements in each group: ``` -julia> select(gdf, 1:2 => cor, nrow) -150×3 DataFrame -│ Row │ Species │ SepalLength_SepalWidth_cor │ nrow │ -│ │ String │ Float64 │ Int64 │ -├─────┼────────────────┼────────────────────────────┼───────┤ -│ 1 │ Iris-setosa │ 0.74678 │ 50 │ -│ 2 │ Iris-setosa │ 0.74678 │ 50 │ -│ 3 │ Iris-setosa │ 0.74678 │ 50 │ -│ 4 │ Iris-setosa │ 0.74678 │ 50 │ -│ 5 │ Iris-setosa │ 0.74678 │ 50 │ -│ 6 │ Iris-setosa │ 0.74678 │ 50 │ -│ 7 │ Iris-setosa │ 0.74678 │ 50 │ +julia> select(gdf, 1:2 => cor) +150×2 DataFrame +│ Row │ Species │ SepalLength_SepalWidth_cor │ +│ │ String │ Float64 │ +├─────┼────────────────┼────────────────────────────┤ +│ 1 │ Iris-setosa │ 0.74678 │ +│ 2 │ Iris-setosa │ 0.74678 │ +│ 3 │ Iris-setosa │ 0.74678 │ +│ 4 │ Iris-setosa │ 0.74678 │ +│ 5 │ Iris-setosa │ 0.74678 │ +│ 6 │ Iris-setosa │ 0.74678 │ +│ 7 │ Iris-setosa │ 0.74678 │ ⋮ -│ 143 │ Iris-virginica │ 0.457228 │ 50 │ -│ 144 │ Iris-virginica │ 0.457228 │ 50 │ -│ 145 │ Iris-virginica │ 0.457228 │ 50 │ -│ 146 │ Iris-virginica │ 0.457228 │ 50 │ -│ 147 │ Iris-virginica │ 0.457228 │ 50 │ -│ 148 │ Iris-virginica │ 0.457228 │ 50 │ -│ 149 │ Iris-virginica │ 0.457228 │ 50 │ -│ 150 │ Iris-virginica │ 0.457228 │ 50 │ - -julia> transform(gdf, nrow) +│ 143 │ Iris-virginica │ 0.457228 │ +│ 144 │ Iris-virginica │ 0.457228 │ +│ 145 │ Iris-virginica │ 0.457228 │ +│ 146 │ Iris-virginica │ 0.457228 │ +│ 147 │ Iris-virginica │ 0.457228 │ +│ 148 │ Iris-virginica │ 0.457228 │ +│ 149 │ Iris-virginica │ 0.457228 │ +│ 150 │ Iris-virginica │ 0.457228 │ + +julia> transform(gdf, :Species => x -> chop.(x, head=5, tail=0)) 150×6 DataFrame -│ Row │ Species │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ nrow │ -│ │ String │ Float64 │ Float64 │ Float64 │ Float64 │ Int64 │ 
-├─────┼────────────────┼─────────────┼────────────┼─────────────┼────────────┼───────┤ -│ 1 │ Iris-setosa │ 5.1 │ 3.5 │ 1.4 │ 0.2 │ 50 │ -│ 2 │ Iris-setosa │ 4.9 │ 3.0 │ 1.4 │ 0.2 │ 50 │ -│ 3 │ Iris-setosa │ 4.7 │ 3.2 │ 1.3 │ 0.2 │ 50 │ -│ 4 │ Iris-setosa │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ 50 │ -│ 5 │ Iris-setosa │ 5.0 │ 3.6 │ 1.4 │ 0.2 │ 50 │ -│ 6 │ Iris-setosa │ 5.4 │ 3.9 │ 1.7 │ 0.4 │ 50 │ -│ 7 │ Iris-setosa │ 4.6 │ 3.4 │ 1.4 │ 0.3 │ 50 │ +│ Row │ Species │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species_function │ +│ │ String │ Float64 │ Float64 │ Float64 │ Float64 │ SubString… │ +├─────┼────────────────┼─────────────┼────────────┼─────────────┼────────────┼──────────────────┤ +│ 1 │ Iris-setosa │ 5.1 │ 3.5 │ 1.4 │ 0.2 │ setosa │ +│ 2 │ Iris-setosa │ 4.9 │ 3.0 │ 1.4 │ 0.2 │ setosa │ +│ 3 │ Iris-setosa │ 4.7 │ 3.2 │ 1.3 │ 0.2 │ setosa │ +│ 4 │ Iris-setosa │ 4.6 │ 3.1 │ 1.5 │ 0.2 │ setosa │ +│ 5 │ Iris-setosa │ 5.0 │ 3.6 │ 1.4 │ 0.2 │ setosa │ +│ 6 │ Iris-setosa │ 5.4 │ 3.9 │ 1.7 │ 0.4 │ setosa │ +│ 7 │ Iris-setosa │ 4.6 │ 3.4 │ 1.4 │ 0.3 │ setosa │ ⋮ -│ 143 │ Iris-virginica │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ 50 │ -│ 144 │ Iris-virginica │ 6.8 │ 3.2 │ 5.9 │ 2.3 │ 50 │ -│ 145 │ Iris-virginica │ 6.7 │ 3.3 │ 5.7 │ 2.5 │ 50 │ -│ 146 │ Iris-virginica │ 6.7 │ 3.0 │ 5.2 │ 2.3 │ 50 │ -│ 147 │ Iris-virginica │ 6.3 │ 2.5 │ 5.0 │ 1.9 │ 50 │ -│ 148 │ Iris-virginica │ 6.5 │ 3.0 │ 5.2 │ 2.0 │ 50 │ -│ 149 │ Iris-virginica │ 6.2 │ 3.4 │ 5.4 │ 2.3 │ 50 │ -│ 150 │ Iris-virginica │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ 50 │ +│ 143 │ Iris-virginica │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ virginica │ +│ 144 │ Iris-virginica │ 6.8 │ 3.2 │ 5.9 │ 2.3 │ virginica │ +│ 145 │ Iris-virginica │ 6.7 │ 3.3 │ 5.7 │ 2.5 │ virginica │ +│ 146 │ Iris-virginica │ 6.7 │ 3.0 │ 5.2 │ 2.3 │ virginica │ +│ 147 │ Iris-virginica │ 6.3 │ 2.5 │ 5.0 │ 1.9 │ virginica │ +│ 148 │ Iris-virginica │ 6.5 │ 3.0 │ 5.2 │ 2.0 │ virginica │ +│ 149 │ Iris-virginica │ 6.2 │ 3.4 │ 5.4 │ 2.3 │ virginica │ +│ 150 │ Iris-virginica │ 5.9 │ 3.0 │ 5.1 │ 1.8 │ 
virginica │ ``` The `combine` function also supports the `do` block form. However, as noted above, diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index e86d10ed70..df252a2b97 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -155,7 +155,7 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable}, col_idx, (fun, newname) = nc # It is allowed to request a tranformation operation into a newname column # only once. This is ensured by the logic related to transformed_cols dictionaly - # in _process, therefore in select_transform! such a duplicate should not happen + # in _manipulate, therefore in select_transform! such a duplicate should not happen @assert !hasproperty(newdf, newname) cdf = eachcol(df) if col_idx isa Int @@ -180,7 +180,8 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable}, end end - # this means that we use `select` or `transform` not `combine` + # !allow_resizing_newdf[] && ncol(newdf) == 0 + # means that we use `select` or `transform` not `combine` if !allow_resizing_newdf[] && ncol(newdf) == 0 && length(res) != nrow(df) throw(ArgumentError("length $(length(res)) of vector returned from " * "function $fun is different from number of rows " * @@ -354,53 +355,8 @@ julia> select!(df, AsTable(:) => ByRow(mean)) ``` """ -function select!(df::DataFrame, args::AbstractVector{Int}) - if isempty(args) - empty!(_columns(df)) - empty!(index(df)) - return df - end - indmin, indmax = extrema(args) - if indmin < 1 - throw(ArgumentError("indices must be positive")) - end - if indmax > ncol(df) - throw(ArgumentError("indices must not be greater than number of columns")) - end - if !allunique(args) - throw(ArgumentError("indices must not contain duplicates")) - end - copy!(_columns(df), _columns(df)[args]) - x = index(df) - copy!(_names(x), _names(df)[args]) - empty!(x.lookup) - for (i, n) in enumerate(x.names) - x.lookup[n] = i - end - 
return df -end - -select!(df::DataFrame, c::Int) = select!(df, [c]) - -function select!(df::DataFrame, c::MultiColumnIndex) - if c isa AbstractVector{<:Pair} - return select!(df, c...) - else - return select!(df, index(df)[c]) - end -end - -function select!(df::DataFrame, cs...) - newdf = select(df, cs..., copycols=false) - copy!(_columns(df), _columns(newdf)) - x = index(df) - copy!(_names(x), _names(newdf)) - empty!(x.lookup) - for (i, n) in enumerate(x.names) - x.lookup[n] = i - end - return df -end +select!(df::DataFrame, args...) = + _replace_columns!(df, select(df, args..., copycols=false)) """ transform!(df::DataFrame, args...) @@ -533,7 +489,7 @@ julia> select(df, AsTable(:) => ByRow(mean)) """ select(df::AbstractDataFrame, args...; copycols::Bool=true) = - _manipulate(df, args..., copycols=copycols, keeprows=true) + manipulate(df, args..., copycols=copycols, keeprows=true) """ transform(df::AbstractDataFrame, args...; copycols::Bool=true) @@ -548,7 +504,6 @@ See [`select`](@ref) for detailed rules regarding accepted values for `args`. transform(df::AbstractDataFrame, args...; copycols::Bool=true) = select(df, :, args..., copycols=copycols) - """ combine(df::AbstractDataFrame, args...) @@ -577,26 +532,26 @@ julia> combine(df, :a => sum, nrow) │ 1 │ 6 │ 3 │ """ combine(df::AbstractDataFrame, args...) 
= - _manipulate(df, args..., copycols=true, keeprows=false) + manipulate(df, args..., copycols=true, keeprows=false) combine(arg, df::AbstractDataFrame) = combine(arg, groupby(df, [])) -_manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) = +manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) = DataFrame(_columns(df)[args], Index(_names(df)[args]), copycols=copycols) -function _manipulate(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool) +function manipulate(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool) if c isa AbstractVector{<:Pair} - return _manipulate(df, c..., copycols=copycols, keeprows=keeprows) + return manipulate(df, c..., copycols=copycols, keeprows=keeprows) else - return _manipulate(df, index(df)[c], copycols=copycols, keeprows=keeprows) + return manipulate(df, index(df)[c], copycols=copycols, keeprows=keeprows) end end -_manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) = - _manipulate(df, [c], copycols=copycols, keeprows=keeprows) +manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) = + manipulate(df, [c], copycols=copycols, keeprows=keeprows) -function _manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool) +function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool) cs_vec = [] for v in cs if v isa AbstractVector{<:Pair} @@ -605,11 +560,11 @@ function _manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool) push!(cs_vec, v) end end - return _process(df, [normalize_selection(index(df), c) for c in cs_vec], + return _manipulate(df, [normalize_selection(index(df), c) for c in cs_vec], copycols, keeprows) end -function _process(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows::Bool) +function _manipulate(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows::Bool) @assert !(df isa SubDataFrame && copycols==false) newdf = DataFrame() # the role of 
transformed_cols is the following @@ -710,19 +665,19 @@ function _process(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows return newdf end -_manipulate(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) = - _manipulate(dfv, [ind], copycols=copycols, keeprows=keeprows) +manipulate(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) = + manipulate(dfv, [ind], copycols=copycols, keeprows=keeprows) -function _manipulate(dfv::SubDataFrame, args::MultiColumnIndex; +function manipulate(dfv::SubDataFrame, args::MultiColumnIndex; copycols::Bool, keeprows::Bool) if args isa AbstractVector{<:Pair} - return _manipulate(dfv, args..., copycols=copycols, keeprows=keeprows) + return manipulate(dfv, args..., copycols=copycols, keeprows=keeprows) else return copycols ? dfv[:, args] : view(dfv, :, args) end end -function _manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool) +function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool) if copycols cs_vec = [] for v in args @@ -732,7 +687,7 @@ function _manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool) push!(cs_vec, v) end end - return _process(dfv, [normalize_selection(index(dfv), c) for c in cs_vec], + return _manipulate(dfv, [normalize_selection(index(dfv), c) for c in cs_vec], true, keeprows) else # we do not support transformations here diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 71fffd6923..06ebd27d53 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -66,7 +66,8 @@ stored in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and treated in the same way. Additionally `DataFrame` can be used to collect a [`GroupedDataFrame`](@ref) -into a `DataFrame`. +into a `DataFrame`. In this case the row order of the result follows the order +of groups in the `GroupedDataFrame` passed. # Notes The `DataFrame` constructor by default copies all columns vectors passed to it. 
@@ -1671,3 +1672,11 @@ function repeat!(df::DataFrame, count::Integer) count < 0 && throw(ArgumentError("count must be non-negative")) return mapcols!(x -> repeat(x, count), df) end + +# it is not exactly copy! as in general we allow axes to be different +function _replace_columns!(df::DataFrame, newdf::DataFrame) + copy!(_columns(df), _columns(newdf)) + copy!(_names(index(df)), _names(newdf)) + copy!(index(df).lookup, index(newdf).lookup) + return df +end diff --git a/src/deprecated.jl b/src/deprecated.jl index b7d713e917..59fe83f214 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -448,23 +448,19 @@ end @deprecate deleterows!(df::DataFrame, inds) delete!(df, inds) @deprecate by(f::Union{Base.Callable, Pair}, d::AbstractDataFrame, cols::Any; - sort::Bool=false, skipmissing::Bool=false, - keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), - f, keepkeys=keepkeys) + sort::Bool=false, skipmissing::Bool=false) combine(groupby(d, cols, sort=sort, + skipmissing=skipmissing), f) @deprecate by(d::AbstractDataFrame, cols::Any, f::Base.Callable; - sort::Bool=false, skipmissing::Bool=false, - keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), - f, keepkeys=keepkeys) + sort::Bool=false, skipmissing::Bool=false) combine(groupby(d, cols, sort=sort, + skipmissing=skipmissing), f) @deprecate by(d::AbstractDataFrame, cols::Any, f::Pair; - sort::Bool=false, skipmissing::Bool=false, - keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), - f, keepkeys=keepkeys) + sort::Bool=false, skipmissing::Bool=false) combine(groupby(d, cols, sort=sort, + skipmissing=skipmissing), f) @deprecate by(d::AbstractDataFrame, cols::Any, f::Union{Pair, typeof(nrow), ColumnIndex, MultiColumnIndex}...; - sort::Bool=false, skipmissing::Bool=false, - keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), - f..., keepkeys=keepkeys) + sort::Bool=false, skipmissing::Bool=false) 
combine(groupby(d, cols, sort=sort, + skipmissing=skipmissing), f...) import Base: map @deprecate map(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame) combine(f, gd, regroup=true) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 7e6955d4f6..e37bc9bf03 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -28,14 +28,6 @@ indexing by groups, `map` (which applies a function to each group) and `combine` (which applies a function to each group and combines the result into a data frame). -See the following for additional split-apply-combine operations: - -* [`by`](@ref) : split-apply-combine using functions -* [`map`](@ref) : apply a function to each group of a `GroupedDataFrame` - (without combining) -* [`combine`](@ref) : combine a `GroupedDataFrame`, optionally applying - a function to each group - `GroupedDataFrame` also supports the dictionary interface. The keys are [`GroupKey`](@ref) objects returned by [`keys(::GroupedDataFrame)`](@ref), which can also be used to get the values of the grouping columns for each group. @@ -43,6 +35,10 @@ which can also be used to get the values of the grouping columns for each group. same order as the `cols` argument) are also accepted as indices, but this will be slower than using the equivalent `GroupKey`. +# See also + +[`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref), [`transform!`](@ref) + # Examples ```julia julia> df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]), @@ -170,8 +166,6 @@ function groupby(df::AbstractDataFrame, cols; return gd end -_check_cannonical(gd::GroupedDataFrame) = !any(==(0), gd.groups) - const F_TYPE_RULES = """ `fun` can return a single value, a row, a vector, or multiple rows. @@ -252,8 +246,8 @@ const KWARG_PROCESSING_RULES = value contains columns with the same names as the grouping columns, they are required to be equal. 
- If `regroup=true`, the returned value must be a `DataFrame` or a - `GroupedDataFrame` grouped using `keycols(gdf)`. + If `regroup=false` (the default) a `DataFrame` is returned. + If `regroup=true` a `GroupedDataFrame` grouped using `keycols(gdf)` is returned. """ """ @@ -274,9 +268,11 @@ $F_TYPE_RULES $KWARG_PROCESSING_RULES -The resulting data frame will be sorted by `keycols(gdf)` if `sort=true` was passed to the -[`groupby`](@ref) call from which `gd` was constructed. Otherwise, ordering of rows -follows the order of groups in `gdf`. +Ordering of rows follows the order of groups in `gdf`. + +# See also + +[`groupby`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref), [`transform!`](@ref) # Examples ```jldoctest @@ -466,12 +462,12 @@ end combine(gd::GroupedDataFrame, cs::Union{Pair, typeof(nrow), ColumnIndex, MultiColumnIndex}...; keepkeys::Bool=true, regroup::Bool=false) = - _combine_executor(gd, cs..., keepkeys=keepkeys, regroup=regroup, - copycols=true, keeprows=false) + _combine_prepare(gd, cs..., keepkeys=keepkeys, regroup=regroup, + copycols=true, keeprows=false) -function _combine_executor(gd::GroupedDataFrame, - @nospecialize(cs::Union{Pair, typeof(nrow), - ColumnIndex, MultiColumnIndex}...); +function _combine_prepare(gd::GroupedDataFrame, + @nospecialize(cs::Union{Pair, typeof(nrow), + ColumnIndex, MultiColumnIndex}...); keepkeys::Bool, regroup::Bool, copycols::Bool, keeprows::Bool) @assert !isempty(cs) cs_vec = [] @@ -1020,16 +1016,16 @@ end function _combine(f::AbstractVector{<:Pair}, gd::GroupedDataFrame, nms::AbstractVector{Symbol}, - copycols::Bool, keeprows::Bool) # TODO: remove these defaults + copycols::Bool, keeprows::Bool) # here f should be normalized and in a form of source_cols => fun @assert all(x -> first(x) isa Union{Int, AbstractVector{Int}, AsTable}, f) @assert all(x -> last(x) isa Base.Callable, f) if keeprows - if !_check_cannonical(gd) + if minimum(gd.groups) == 0 throw(ArgumentError("select and transform do not 
support " * - "GroupedDataFrames from which some groups have been dropped "* - "(including skipmissing=true)"))) + "`GroupedDataFrame`s from which some groups have "* + "been dropped (including skipmissing=true)")) end idx_keeprows = Vector{Int}(undef, nrow(parent(gd))) let i = 0 @@ -1137,8 +1133,7 @@ function _combine(f::AbstractVector{<:Pair}, end end - # remember that here first field in res[i] is not useful - it is just needed - # to keep track how the column was generated + # here first field in res[i] is used to keep track how the column was generated # a correct index is stored in idx variable for (i, (col_idx, col)) in enumerate(res) @@ -1433,14 +1428,145 @@ end copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) Apply `args` to `gd` following the rules described in [`combine`](@ref). -Ensure that the return value has number of rows equal to `nrow(parent(gd))`. +The return value has number of rows equal to `nrow(parent(gd))` +(if single value is returned it is always broadcasted to have this number of rows). If `copycols=false` then do not perform copying of columns that are not transformed. 
+ +# See also + +[`groupby`](@ref), [`combine`](@ref), [`select!`](@ref), [`transform`](@ref), [`transform!`](@ref) + +# Examples +```jldoctest +julia> df = DataFrame(a = [1, 1, 1, 2, 2, 1, 1, 2], + b = repeat([2, 1], outer=[4]), + c = 1:8) +8×3 DataFrame +│ Row │ a │ b │ c │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 1 │ 2 │ 1 │ +│ 2 │ 1 │ 1 │ 2 │ +│ 3 │ 1 │ 2 │ 3 │ +│ 4 │ 2 │ 1 │ 4 │ +│ 5 │ 2 │ 2 │ 5 │ +│ 6 │ 1 │ 1 │ 6 │ +│ 7 │ 1 │ 2 │ 7 │ +│ 8 │ 2 │ 1 │ 8 │ + +julia> gd = groupby(df, :a); + +julia> select(gd, :c => sum, nrow) +8×3 DataFrame +│ Row │ a │ c_sum │ nrow │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 1 │ 19 │ 5 │ +│ 2 │ 1 │ 19 │ 5 │ +│ 3 │ 1 │ 19 │ 5 │ +│ 4 │ 2 │ 17 │ 3 │ +│ 5 │ 2 │ 17 │ 3 │ +│ 6 │ 1 │ 19 │ 5 │ +│ 7 │ 1 │ 19 │ 5 │ +│ 8 │ 2 │ 17 │ 3 │ + +julia> select(gd, :c => sum, nrow, regroup=true) +GroupedDataFrame with 2 groups based on key: a +First Group (5 rows): a = 1 +│ Row │ a │ c_sum │ nrow │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 1 │ 19 │ 5 │ +│ 2 │ 1 │ 19 │ 5 │ +│ 3 │ 1 │ 19 │ 5 │ +│ 4 │ 1 │ 19 │ 5 │ +│ 5 │ 1 │ 19 │ 5 │ +⋮ +Last Group (3 rows): a = 2 +│ Row │ a │ c_sum │ nrow │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 2 │ 17 │ 3 │ +│ 2 │ 2 │ 17 │ 3 │ +│ 3 │ 2 │ 17 │ 3 │ + +julia> select(gd, :c => (x -> sum(log, x)) => :sum_log_c) # specifying a name for target column +8×2 DataFrame +│ Row │ a │ sum_log_c │ +│ │ Int64 │ Float64 │ +├─────┼───────┼───────────┤ +│ 1 │ 1 │ 5.52943 │ +│ 2 │ 1 │ 5.52943 │ +│ 3 │ 1 │ 5.52943 │ +│ 4 │ 2 │ 5.07517 │ +│ 5 │ 2 │ 5.07517 │ +│ 6 │ 1 │ 5.52943 │ +│ 7 │ 1 │ 5.52943 │ +│ 8 │ 2 │ 5.07517 │ + +julia> select(gd, [:b, :c] .=> sum) # passing a vector of pairs +8×3 DataFrame +│ Row │ a │ b_sum │ c_sum │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 1 │ 8 │ 19 │ +│ 2 │ 1 │ 8 │ 19 │ +│ 3 │ 1 │ 8 │ 19 │ +│ 4 │ 2 │ 4 │ 17 │ +│ 5 │ 2 │ 4 │ 17 │ +│ 6 │ 1 │ 8 │ 19 │ +│ 7 │ 1 │ 8 │ 19 │ +│ 
8 │ 2 │ 4 │ 17 │ + +julia> select(gd, :b => :b1, :c => :c1, + [:b, :c] => +, keepkeys=false) # auto-splatting, renaming and keepkeys +8×3 DataFrame +│ Row │ b1 │ c1 │ b_c_+ │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 2 │ 1 │ 3 │ +│ 2 │ 1 │ 2 │ 3 │ +│ 3 │ 2 │ 3 │ 5 │ +│ 4 │ 1 │ 4 │ 5 │ +│ 5 │ 2 │ 5 │ 7 │ +│ 6 │ 1 │ 6 │ 7 │ +│ 7 │ 2 │ 7 │ 9 │ +│ 8 │ 1 │ 8 │ 9 │ + +julia> select(gd, :b, :c => sum) # passing columns and broadcasting +8×3 DataFrame +│ Row │ a │ b │ c_sum │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 1 │ 2 │ 19 │ +│ 2 │ 1 │ 1 │ 19 │ +│ 3 │ 1 │ 2 │ 19 │ +│ 4 │ 2 │ 1 │ 17 │ +│ 5 │ 2 │ 2 │ 17 │ +│ 6 │ 1 │ 1 │ 19 │ +│ 7 │ 1 │ 2 │ 19 │ +│ 8 │ 2 │ 1 │ 17 │ + +julia> select(gd, :, AsTable(Not(:a)) => sum) +8×4 DataFrame +│ Row │ a │ b │ c │ b_c_sum │ +│ │ Int64 │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┼─────────┤ +│ 1 │ 1 │ 2 │ 1 │ 3 │ +│ 2 │ 1 │ 1 │ 2 │ 3 │ +│ 3 │ 1 │ 2 │ 3 │ 5 │ +│ 4 │ 2 │ 1 │ 4 │ 5 │ +│ 5 │ 2 │ 2 │ 5 │ 7 │ +│ 6 │ 1 │ 1 │ 6 │ 7 │ +│ 7 │ 1 │ 2 │ 7 │ 9 │ +│ 8 │ 2 │ 1 │ 8 │ 9 │ +``` """ select(gd::GroupedDataFrame, args...; copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) = - _combine_executor(gd, args..., copycols=copycols, keepkeys=keepkeys, - regroup=regroup, keeprows=true) + _combine_prepare(gd, args..., copycols=copycols, keepkeys=keepkeys, + regroup=regroup, keeprows=true) """ transform(gd::GroupedDataFrame, args...; @@ -1448,6 +1574,10 @@ select(gd::GroupedDataFrame, args...; An equivalent of `select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, regroup=regroup)` + +# See also + +[`groupby](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform!`](@ref) """ transform(gd::GroupedDataFrame, args...; copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) = @@ -1460,17 +1590,15 @@ transform(gd::GroupedDataFrame, args...; An equivalent of `select(gd, args..., copycols=false, keepkeys=true, regroup=regroup)` but updates 
`parent(gd)` in place. + +# See also + +[`groupby](@ref), [`combine`](@ref), [`select`](@ref), [`transform`](@ref), [`transform!`](@ref) """ function select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) newdf = select(gd, args..., copycols=false) df = parent(gd) - copy!(_columns(df), _columns(newdf)) - x = index(df) - copy!(_names(x), _names(newdf)) - empty!(x.lookup) - for (i, n) in enumerate(x.names) - x.lookup[n] = i - end + _replace_columns!(df, newdf) return regroup ? gd : df end @@ -1480,6 +1608,11 @@ end An equivalent of `transform(gd, args..., copycols=false, keepkeys=true, regroup=regroup)` but updates `parent(gd)` in place. + + +# See also + +[`groupby](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref) """ transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) = select!(gd, :, args..., regroup=regroup) From 2bd31ff611a86ed65c587b948c9acd470d5a1cf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 May 2020 11:44:37 +0200 Subject: [PATCH 18/29] add deprecated map tests --- test/deprecated.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/deprecated.jl b/test/deprecated.jl index 11550a2b2e..d9532f63e1 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -704,6 +704,15 @@ end end end +@testset "map skipmissing and sort" begin + df = DataFrame(a=[2, 2, missing, missing, 1, 1, 3, 3], b=1:8) + for dosort in (false, true), doskipmissing in (false, true) + gdf = groupby(df, :a, sort=dosort, skipmissing=doskipmissing) + @test map(identity, gdf) ≅ combine(identity, gdf, regroup=true) + @test map(:b => sum, gdf) ≅ combine(:b => sum, gdf, regroup=true) + end +end + global_logger(old_logger) end # module From 9d1b20d69a245f4fa70b381e6fd1d0189bd7d63e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 May 2020 11:49:44 +0200 Subject: [PATCH 19/29] fix error types in select --- test/select.jl | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/test/select.jl b/test/select.jl index a63a30e329..b76c91f3d2 100644 --- a/test/select.jl +++ b/test/select.jl @@ -239,8 +239,8 @@ end @testset "select!" begin df = DataFrame(a=1, b=2, c=3, d=4, e=5) - @test_throws ArgumentError select!(df, 0) - @test_throws ArgumentError select!(df, 6) + @test_throws BoundsError select!(df, 0) + @test_throws BoundsError select!(df, 6) @test_throws ArgumentError select!(df, [1, 1]) @test_throws ArgumentError select!(df, :f) @test_throws BoundsError select!(df, [true, false]) @@ -552,7 +552,7 @@ end @test df == expected df = DataFrame(a=a, b=b, c=c) - @test_throws ArgumentError select!(df, 1:4) + @test_throws BoundsError select!(df, 1:4) @test_throws ArgumentError select!(df, [:a, :b, :c, :d]) @test_throws ArgumentError select!(df, [1, 2, 3, 1]) @test_throws ArgumentError select!(df, [:a, :b, :c, :a]) From 0f3d30900023b480b51b46b8f6c678191f652607 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 May 2020 15:53:12 +0200 Subject: [PATCH 20/29] avoid computing idx, starts and ends in combine if regroup=true --- src/groupeddataframe/splitapplycombine.jl | 34 ++++++----------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index e37bc9bf03..ef9eb053ff 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -597,35 +597,19 @@ function combine_helper(f, gd::GroupedDataFrame, return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx, gd.starts, gd.ends, gd.ngroups, getfield(gd, :keymap)) else - starts = Vector{Int}(undef, length(gd)) - ends = Vector{Int}(undef, length(gd)) - starts[1] = 1 + groups = zeros(Int, length(idx)) + groups[1] = 1 j = 1 - for i in 2:length(idx) - if idx[i] != idx[i-1] - j += 1 - starts[j] = i - ends[j-1] = i - 1 - end + last_idx = idx[1] + @inbounds for i in 2:length(idx) 
+ cur_idx = idx[i] + j += cur_idx != last_idx + last_idx = cur_idx + groups[i] = j end - # it is impossible to get more groups in the output than we had initially @assert j <= length(gd) - # In case some groups have to be dropped - resize!(starts, j) - resize!(ends, j) - ends[end] = length(idx) - - groups = zeros(Int, length(idx)) - for i in 1:j - @inbounds for k in starts[i]:ends[i] - groups[k] = i - end - end - # all groups must be filled - @assert minimum(groups) == 1 - return GroupedDataFrame(newparent, collect(1:length(gd.cols)), groups, - collect(1:length(idx)), starts, ends, j, nothing) + nothing, nothing, nothing, j, nothing) end else if regroup From 1d69fa3cacc6526e3f9160b2cbb4866485c4fa0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 May 2020 16:51:30 +0200 Subject: [PATCH 21/29] performance improvements --- src/groupeddataframe/splitapplycombine.jl | 55 +++++++++++++---------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index ef9eb053ff..d10e7e15fc 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -556,6 +556,20 @@ function combine(gd::GroupedDataFrame; f...) 
return combine(gd, [source_cols => fun => out_col for (out_col, (source_cols, fun)) in f]) end +function gen_groups(idx::Vector{Int}) + groups = zeros(Int, length(idx)) + groups[1] = 1 + j = 1 + last_idx = idx[1] + @inbounds @simd for i in 2:length(idx) + cur_idx = idx[i] + j += cur_idx != last_idx + last_idx = cur_idx + groups[i] = j + end + return groups +end + function combine_helper(f, gd::GroupedDataFrame, nms::Union{AbstractVector{Symbol},Nothing}=nothing; keepkeys::Bool, regroup::Bool, @@ -597,19 +611,10 @@ function combine_helper(f, gd::GroupedDataFrame, return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx, gd.starts, gd.ends, gd.ngroups, getfield(gd, :keymap)) else - groups = zeros(Int, length(idx)) - groups[1] = 1 - j = 1 - last_idx = idx[1] - @inbounds for i in 2:length(idx) - cur_idx = idx[i] - j += cur_idx != last_idx - last_idx = cur_idx - groups[i] = j - end - @assert j <= length(gd) + groups = gen_groups(idx) + @assert groups[end] <= length(gd) return GroupedDataFrame(newparent, collect(1:length(gd.cols)), groups, - nothing, nothing, nothing, j, nothing) + nothing, nothing, nothing, groups[end], nothing) end else if regroup @@ -998,6 +1003,20 @@ function _agg2idx_map_helper(idx, idx_agg) return agg2idx_map end +function prepare_idx_keeprows(idx, starts, ends, nrowparent) + idx_keeprows = Vector{Int}(undef, nrowparent) + i = 0 + for (s, e) in zip(starts, ends) + v = idx[s] + for k in s:e + i += 1 + idx_keeprows[i] = v + end + end + @assert i == nrowparent + return idx_keeprows +end + function _combine(f::AbstractVector{<:Pair}, gd::GroupedDataFrame, nms::AbstractVector{Symbol}, copycols::Bool, keeprows::Bool) @@ -1011,17 +1030,7 @@ function _combine(f::AbstractVector{<:Pair}, "`GroupedDataFrame`s from which some groups have "* "been dropped (including skipmissing=true)")) end - idx_keeprows = Vector{Int}(undef, nrow(parent(gd))) - let i = 0 - for (s, e) in zip(gd.starts, gd.ends) - v = gd.idx[s] - for k in s:e - i += 1 - idx_keeprows[i] = v 
- end - end - @assert i == nrow(parent(gd)) - end + idx_keeprows = prepare_idx_keeprows(gd.idx, gd.starts, gd.ends, nrow(parent(gd))) else idx_keeprows = nothing end From 5713194acdec6d314b95235f74c2d8f3bcd17b64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 May 2020 16:59:00 +0200 Subject: [PATCH 22/29] @simd did not improve the performance here --- src/groupeddataframe/splitapplycombine.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index d10e7e15fc..f58a21f5bb 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -561,7 +561,7 @@ function gen_groups(idx::Vector{Int}) groups[1] = 1 j = 1 last_idx = idx[1] - @inbounds @simd for i in 2:length(idx) + @inbounds for i in 2:length(idx) cur_idx = idx[i] j += cur_idx != last_idx last_idx = cur_idx From 1f34d55ee969c55d11da7bdbab3b4c38328c99e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 May 2020 17:36:27 +0200 Subject: [PATCH 23/29] Update docs/src/man/split_apply_combine.md Co-authored-by: pdeffebach <23196228+pdeffebach@users.noreply.github.com> --- docs/src/man/split_apply_combine.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index 96375c9f77..5c42a1de52 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -48,7 +48,7 @@ passed to `function`. In all of these cases, `function` can return either a single row or multiple rows. `function` can always generate a single column by returning a single value or a vector. 
-Additionally, if `combine` is passed exactly one `function` as a first argument +Additionally, if `combine` is passed exactly one `function`, `cols => function`, or `cols => function => outcol` as a first argument and `target_col` is not specified, `function` can return multiple columns in the form of an `AbstractDataFrame`, `AbstractMatrix`, `NamedTuple` or `DataFrameRow`. From 2201789c32b60b6bce3662fec310a25c96ec5c2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 1 May 2020 17:39:08 +0200 Subject: [PATCH 24/29] add an example of passing function as a first argument to combine --- docs/src/man/split_apply_combine.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index 5c42a1de52..b743d4250d 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -196,6 +196,15 @@ julia> combine(gdf, │ 2 │ Iris-versicolor │ 0.910378 │ │ 3 │ Iris-virginica │ 0.867923 │ +julia> combine(x -> std(x.PetalLength) / std(x.SepalLength), gdf) # passing a SubDataFrame +3×2 DataFrame +│ Row │ Species │ PetalLength_SepalLength_function │ +│ │ String │ Float64 │ +├─────┼─────────────────┼──────────────────────────────────┤ +│ 1 │ Iris-setosa │ 0.492245 │ +│ 2 │ Iris-versicolor │ 0.910378 │ +│ 3 │ Iris-virginica │ 0.867923 │ + julia> combine(gdf, 1:2 => cor, nrow) 3×3 DataFrame │ Row │ Species │ SepalLength_SepalWidth_cor │ nrow │ From 2aa9170b2b713188741b006b7ba826a105babe20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 2 May 2020 07:08:52 +0200 Subject: [PATCH 25/29] change regroup to ungroup --- src/deprecated.jl | 2 +- src/groupeddataframe/splitapplycombine.jl | 86 ++++---- test/deprecated.jl | 4 +- test/grouping.jl | 226 +++++++++++----------- test/string.jl | 12 +- 5 files changed, 165 insertions(+), 165 deletions(-) diff --git a/src/deprecated.jl b/src/deprecated.jl index 59fe83f214..c38150ad2e 
100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -463,4 +463,4 @@ end skipmissing=skipmissing), f...) import Base: map -@deprecate map(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame) combine(f, gd, regroup=true) +@deprecate map(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame) combine(f, gd, ungroup=false) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index f58a21f5bb..b6d2ddb4ac 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -246,17 +246,17 @@ const KWARG_PROCESSING_RULES = value contains columns with the same names as the grouping columns, they are required to be equal. - If `regroup=false` (the default) a `DataFrame` is returned. - If `regroup=true` a `GroupedDataFrame` grouped using `keycols(gdf)` is returned. + If `ungroup=true` (the default) a `DataFrame` is returned. + If `ungroup=false` a `GroupedDataFrame` grouped using `keycols(gdf)` is returned. """ """ - combine(gd::GroupedDataFrame, args...; keepkeys::Bool=true, regroup::Bool=false) + combine(gd::GroupedDataFrame, args...; keepkeys::Bool=true, ungroup::Bool=true) combine(fun::Union{Function, Type}, gd::GroupedDataFrame; - keepkeys::Bool=true, regroup::Bool=false) - combine(pair::Pair, gd::GroupedDataFrame; keepkeys::Bool=true, regroup::Bool=false) - combine(fun::Union{Function, Type}, df::AbstractDataFrame) - combine(pair::Pair, df::AbstractDataFrame) + keepkeys::Bool=true, ungroup::Bool=true) + combine(pair::Pair, gd::GroupedDataFrame; keepkeys::Bool=true, ungroup::Bool=true) + combine(fun::Union{Function, Type}, df::AbstractDataFrame, ungroup::Bool=true) + combine(pair::Pair, df::AbstractDataFrame, ungroup::Bool=true) Apply operations to each group in a [`GroupedDataFrame`](@ref) and return the combined result as a `DataFrame`. 
@@ -292,7 +292,7 @@ julia> combine(gd, :c => sum, nrow) │ 3 │ 3 │ 10 │ 2 │ │ 4 │ 4 │ 12 │ 2 │ -julia> combine(gd, :c => sum, nrow, regroup=true) +julia> combine(gd, :c => sum, nrow, ungroup=false) GroupedDataFrame with 4 groups based on key: a First Group (1 row): a = 1 │ Row │ a │ c_sum │ nrow │ @@ -428,24 +428,24 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum) ``` """ function combine(f::Base.Callable, gd::GroupedDataFrame; - keepkeys::Bool=true, regroup::Bool=false) - return combine_helper(f, gd, keepkeys=keepkeys, regroup=regroup, + keepkeys::Bool=true, ungroup::Bool=true) + return combine_helper(f, gd, keepkeys=keepkeys, ungroup=ungroup, copycols=true, keeprows=false) end combine(f::typeof(nrow), gd::GroupedDataFrame; - keepkeys::Bool=true, regroup::Bool=false) = - combine(gd, [nrow => :nrow], keepkeys=keepkeys, regroup=regroup) + keepkeys::Bool=true, ungroup::Bool=true) = + combine(gd, [nrow => :nrow], keepkeys=keepkeys, ungroup=ungroup) function combine(p::Pair, gd::GroupedDataFrame; - keepkeys::Bool=true, regroup::Bool=false) + keepkeys::Bool=true, ungroup::Bool=true) # move handling of aggregate to specialized combine p_from, p_to = p # verify if it is not better to use a fast path, which we achieve # by moving to combine(::GroupedDataFrame, ::AbstractVector) method if isagg(p_from => (p_to isa Pair ? 
first(p_to) : p_to)) || p_from === nrow - return combine(gd, [p], keepkeys=keepkeys, regroup=regroup) + return combine(gd, [p], keepkeys=keepkeys, ungroup=ungroup) end if p_from isa Tuple @@ -455,20 +455,20 @@ function combine(p::Pair, gd::GroupedDataFrame; else cs = p_from end - return combine_helper(cs => p_to, gd, keepkeys=keepkeys, regroup=regroup, + return combine_helper(cs => p_to, gd, keepkeys=keepkeys, ungroup=ungroup, copycols=true, keeprows=false) end combine(gd::GroupedDataFrame, cs::Union{Pair, typeof(nrow), ColumnIndex, MultiColumnIndex}...; - keepkeys::Bool=true, regroup::Bool=false) = - _combine_prepare(gd, cs..., keepkeys=keepkeys, regroup=regroup, + keepkeys::Bool=true, ungroup::Bool=true) = + _combine_prepare(gd, cs..., keepkeys=keepkeys, ungroup=ungroup, copycols=true, keeprows=false) function _combine_prepare(gd::GroupedDataFrame, @nospecialize(cs::Union{Pair, typeof(nrow), ColumnIndex, MultiColumnIndex}...); - keepkeys::Bool, regroup::Bool, copycols::Bool, keeprows::Bool) + keepkeys::Bool, ungroup::Bool, copycols::Bool, keeprows::Bool) @assert !isempty(cs) cs_vec = [] for p in cs @@ -541,7 +541,7 @@ function _combine_prepare(gd::GroupedDataFrame, end f = Pair[first(x) => first(last(x)) for x in cs_norm] nms = Symbol[last(last(x)) for x in cs_norm] - return combine_helper(f, gd, nms, keepkeys=keepkeys, regroup=regroup, + return combine_helper(f, gd, nms, keepkeys=keepkeys, ungroup=ungroup, copycols=copycols, keeprows=keeprows) end @@ -572,14 +572,14 @@ end function combine_helper(f, gd::GroupedDataFrame, nms::Union{AbstractVector{Symbol},Nothing}=nothing; - keepkeys::Bool, regroup::Bool, + keepkeys::Bool, ungroup::Bool, copycols::Bool, keeprows::Bool) - if regroup && !keepkeys - throw(ArgumentError("keepkeys=false when regroup=true is not allowed")) + if !ungroup && !keepkeys + throw(ArgumentError("keepkeys=false when ungroup=false is not allowed")) end if length(gd) > 0 idx, valscat = _combine(f, gd, nms, copycols, keeprows) - keepkeys || 
regroup || return valscat + !keepkeys && ungroup && return valscat keys = groupcols(gd) for key in keys if hasproperty(valscat, key) @@ -597,7 +597,7 @@ function combine_helper(f, gd::GroupedDataFrame, end hcat!(newparent, select(valscat, Not(intersect(keys, _names(valscat))), copycols=false), copycols=false) - regroup || return newparent + ungroup && return newparent if length(idx) == 0 @assert nrow(newparent) == 0 @@ -617,11 +617,11 @@ function combine_helper(f, gd::GroupedDataFrame, nothing, nothing, nothing, groups[end], nothing) end else - if regroup + if ungroup + return keepkeys ? parent(gd)[1:0, gd.cols] : DataFrame() + else return GroupedDataFrame(parent(gd)[1:0, gd.cols], collect(1:length(gd.cols)), Int[], Int[], Int[], Int[], 0, Dict{Any,Int}()) - else - return keepkeys ? parent(gd)[1:0, gd.cols] : DataFrame() end end end @@ -1418,7 +1418,7 @@ end """ select(gd::GroupedDataFrame, args...; - copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) + copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true) Apply `args` to `gd` following the rules described in [`combine`](@ref). 
The return value has number of rows equal to `nrow(parent(gd))` @@ -1464,7 +1464,7 @@ julia> select(gd, :c => sum, nrow) │ 7 │ 1 │ 19 │ 5 │ │ 8 │ 2 │ 17 │ 3 │ -julia> select(gd, :c => sum, nrow, regroup=true) +julia> select(gd, :c => sum, nrow, ungroup=false) GroupedDataFrame with 2 groups based on key: a First Group (5 rows): a = 1 │ Row │ a │ c_sum │ nrow │ @@ -1557,49 +1557,49 @@ julia> select(gd, :, AsTable(Not(:a)) => sum) ``` """ select(gd::GroupedDataFrame, args...; - copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) = + copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true) = _combine_prepare(gd, args..., copycols=copycols, keepkeys=keepkeys, - regroup=regroup, keeprows=true) + ungroup=ungroup, keeprows=true) """ transform(gd::GroupedDataFrame, args...; - copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) + copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true) An equivalent of -`select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, regroup=regroup)` +`select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, ungroup=ungroup)` # See also [`groupby](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform!`](@ref) """ transform(gd::GroupedDataFrame, args...; - copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) = + copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true) = select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, - regroup=regroup) + ungroup=ungroup) """ - select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) + select!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true) An equivalent of -`select(gd, args..., copycols=false, keepkeys=true, regroup=regroup)` +`select(gd, args..., copycols=false, keepkeys=true, ungroup=ungroup)` but updates `parent(gd)` in place. 
# See also [`groupby](@ref), [`combine`](@ref), [`select`](@ref), [`transform`](@ref), [`transform!`](@ref) """ -function select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) +function select!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true) newdf = select(gd, args..., copycols=false) df = parent(gd) _replace_columns!(df, newdf) - return regroup ? gd : df + return ungroup ? df : gd end """ - transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) + transform!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true) An equivalent of -`transform(gd, args..., copycols=false, keepkeys=true, regroup=regroup)` +`transform(gd, args..., copycols=false, keepkeys=true, ungroup=ungroup)` but updates `parent(gd)` in place. @@ -1607,5 +1607,5 @@ but updates `parent(gd)` in place. [`groupby](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref) """ -transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) = - select!(gd, :, args..., regroup=regroup) +transform!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true) = + select!(gd, :, args..., ungroup=ungroup) diff --git a/test/deprecated.jl b/test/deprecated.jl index d9532f63e1..ccd8f10c69 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -708,8 +708,8 @@ end df = DataFrame(a=[2, 2, missing, missing, 1, 1, 3, 3], b=1:8) for dosort in (false, true), doskipmissing in (false, true) gdf = groupby(df, :a, sort=dosort, skipmissing=doskipmissing) - @test map(identity, gdf) ≅ combine(identity, gdf, regroup=true) - @test map(:b => sum, gdf) ≅ combine(:b => sum, gdf, regroup=true) + @test map(identity, gdf) ≅ combine(identity, gdf, ungroup=false) + @test map(:b => sum, gdf) ≅ combine(:b => sum, gdf, ungroup=false) end end diff --git a/test/grouping.jl b/test/grouping.jl index 7e0144e9e9..85a572d76f 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -201,41 +201,41 @@ end @test combine(f7, gd) == sres4 @test combine(f8, 
gd) == sres4 - # combine() with regroup without and with groups sorting + # combine() with ungroup without and with groups sorting for dosort in (false, true) gd = groupby_checked(df, cols, sort=dosort) - v = validate_gdf(combine(d -> d[:, [:x]], gd, regroup=true)) + v = validate_gdf(combine(d -> d[:, [:x]], gd, ungroup=false)) @test length(gd) == length(v) nms = [colssym; :x] @test v[1] == gd[1][:, nms] @test v[1] == gd[1][:, nms] && v[2] == gd[2][:, nms] && v[3] == gd[3][:, nms] && v[4] == gd[4][:, nms] @test names(parent(v))[v.cols] == string.(colssym) - v = validate_gdf(combine(f1, gd, regroup=true)) + v = validate_gdf(combine(f1, gd, ungroup=false)) @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f1, gd) - v = validate_gdf(combine(f2, gd, regroup=true)) + v = validate_gdf(combine(f2, gd, ungroup=false)) @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f2, gd) - v = validate_gdf(combine(f3, gd, regroup=true)) + v = validate_gdf(combine(f3, gd, ungroup=false)) @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f3, gd) - v = validate_gdf(combine(f4, gd, regroup=true)) + v = validate_gdf(combine(f4, gd, ungroup=false)) @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f4, gd) - v = validate_gdf(combine(f5, gd, regroup=true)) + v = validate_gdf(combine(f5, gd, ungroup=false)) @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f5, gd) - v = validate_gdf(combine(f5, gd, regroup=true)) + v = validate_gdf(combine(f5, gd, ungroup=false)) @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f5, gd) - v = validate_gdf(combine(f6, gd, regroup=true)) + v = validate_gdf(combine(f6, gd, ungroup=false)) @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f6, gd) - v = validate_gdf(combine(f7, 
gd, regroup=true)) + v = validate_gdf(combine(f7, gd, ungroup=false)) @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f7, gd) - v = validate_gdf(combine(f8, gd, regroup=true)) + v = validate_gdf(combine(f8, gd, ungroup=false)) @test extrema(v.groups) == extrema(gd.groups) @test vcat(v[1], v[2], v[3], v[4]) == combine(f8, gd) end @@ -363,7 +363,7 @@ end @test combine(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), gdf) == DataFrame(x=[2, 3], z=[1, 1]) v = validate_gdf(combine(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), - groupby_checked(df, :x), regroup=true)) + groupby_checked(df, :x), ungroup=false)) @test length(v) == 2 @test vcat(v[1], v[2]) == DataFrame(x=[2, 3], z=[1, 1]) @@ -438,14 +438,14 @@ end df = DataFrame(x=[], y=[]) gd = groupby_checked(df, :x) @test combine(df -> sum(df.x), gd) == DataFrame(x=[]) - res = validate_gdf(combine(df -> sum(df.x), gd, regroup=true)) + res = validate_gdf(combine(df -> sum(df.x), gd, ungroup=false)) @test length(res) == 0 @test res.parent == DataFrame(x=[]) # Test with zero groups in output df = DataFrame(A = [1, 2]) gd = groupby_checked(df, :A) - gd2 = validate_gdf(combine(d -> DataFrame(), gd, regroup=true)) + gd2 = validate_gdf(combine(d -> DataFrame(), gd, ungroup=false)) @test length(gd2) == 0 @test gd.cols == [1] @test isempty(gd2.groups) @@ -455,7 +455,7 @@ end @test parent(gd2) == DataFrame(A=[]) @test eltype.(eachcol(parent(gd2))) == [Int] - gd2 = validate_gdf(combine(d -> DataFrame(X=Int[]), gd, regroup=true)) + gd2 = validate_gdf(combine(d -> DataFrame(X=Int[]), gd, ungroup=false)) @test length(gd2) == 0 @test gd.cols == [1] @test isempty(gd2.groups) @@ -723,42 +723,42 @@ end @test_throws ArgumentError combine(col => (x -> (z=x,)) => :xyz, gd) @test_throws ArgumentError combine(col => x -> (z=1, xzz=[1]), gd) end - for cols in ([:b, :c], 2:3, [2, 3], [false, true, true]), regroup in (true, false) - @test combine(cols => (b,c) -> (y=exp.(b), z=c), gd, 
regroup=regroup) == - combine(d -> (y=exp.(d.b), z=d.c), gd, regroup=regroup) - @test combine(cols => (b,c) -> [exp.(b) c], gd, regroup=regroup) == - combine(d -> [exp.(d.b) d.c], gd, regroup=regroup) - @test combine(cols => ((b,c) -> sum(b) + sum(c)) => :xyz, gd, regroup=regroup) == - combine(d -> (xyz=sum(d.b) + sum(d.c),), gd, regroup=regroup) + for cols in ([:b, :c], 2:3, [2, 3], [false, true, true]), ungroup in (true, false) + @test combine(cols => (b,c) -> (y=exp.(b), z=c), gd, ungroup=ungroup) == + combine(d -> (y=exp.(d.b), z=d.c), gd, ungroup=ungroup) + @test combine(cols => (b,c) -> [exp.(b) c], gd, ungroup=ungroup) == + combine(d -> [exp.(d.b) d.c], gd, ungroup=ungroup) + @test combine(cols => ((b,c) -> sum(b) + sum(c)) => :xyz, gd, ungroup=ungroup) == + combine(d -> (xyz=sum(d.b) + sum(d.c),), gd, ungroup=ungroup) if eltype(cols) === Bool cols2 = [[false, true, false], [false, false, true]] @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[2] => sum), - gd, regroup=regroup) + gd, ungroup=ungroup) @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[1] => sum), - gd, regroup=regroup) + gd, ungroup=ungroup) @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[2] => x -> first(x)), - gd, regroup=regroup) + gd, ungroup=ungroup) else cols2 = cols - @test combine(gd, cols2[1] => sum => :xyz, cols2[2] => sum => :xzz, regroup=regroup) == - combine(d -> (xyz=sum(d.b), xzz=sum(d.c)), gd, regroup=regroup) - @test combine(gd, cols2[1] => sum => :xyz, cols2[1] => sum => :xzz, regroup=regroup) == - combine(d -> (xyz=sum(d.b), xzz=sum(d.b)), gd, regroup=regroup) + @test combine(gd, cols2[1] => sum => :xyz, cols2[2] => sum => :xzz, ungroup=ungroup) == + combine(d -> (xyz=sum(d.b), xzz=sum(d.c)), gd, ungroup=ungroup) + @test combine(gd, cols2[1] => sum => :xyz, cols2[1] => sum => :xzz, ungroup=ungroup) == + combine(d -> (xyz=sum(d.b), xzz=sum(d.b)), gd, ungroup=ungroup) @test combine(gd, cols2[1] => sum => :xyz, - cols2[2] 
=> (x -> first(x)) => :xzz, regroup=regroup) == - combine(d -> (xyz=sum(d.b), xzz=first(d.c)), gd, regroup=regroup) + cols2[2] => (x -> first(x)) => :xzz, ungroup=ungroup) == + combine(d -> (xyz=sum(d.b), xzz=first(d.c)), gd, ungroup=ungroup) @test combine(gd, cols2[1] => vexp => :xyz, - cols2[2] => sum => :xzz, regroup=regroup) == + cols2[2] => sum => :xzz, ungroup=ungroup) == combine(d -> (xyz=vexp(d.b), xzz=fill(sum(d.c), length(vexp(d.b)))), - gd, regroup=regroup) + gd, ungroup=ungroup) end @test_throws ArgumentError combine(cols => (b,c) -> (y=exp.(b), z=sum(c)), - gd, regroup=regroup) + gd, ungroup=ungroup) @test_throws ArgumentError combine(cols2 => ((b,c) -> DataFrame(y=exp.(b), - z=sum(c))) => :xyz, gd, regroup=regroup) + z=sum(c))) => :xyz, gd, ungroup=ungroup) @test_throws ArgumentError combine(cols2 => ((b,c) -> [exp.(b) c]) => :xyz, - gd, regroup=regroup) + gd, ungroup=ungroup) end end @@ -908,19 +908,19 @@ end @test combine(identity, gd) ≅ df @test combine(d -> d[:, [2, 1]], gd) ≅ df @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) - @test validate_gdf(combine(identity, gd, regroup=true)) ≅ gd - @test combine(d -> d[:, [2, 1]], gd, regroup=true) ≅ gd + @test validate_gdf(combine(identity, gd, ungroup=false)) ≅ gd + @test combine(d -> d[:, [2, 1]], gd, ungroup=false) ≅ gd @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd, - regroup=true) + ungroup=false) gd = groupby_checked(df, :x, skipmissing=true) @test combine(identity, gd) == df[1:3, :] @test combine(d -> d[:, [2, 1]], gd) == df[1:3, :] @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd) - @test validate_gdf(combine(identity, gd, regroup=true)) == gd - @test validate_gdf(combine(d -> d[:, [2, 1]], gd, regroup=true)) == gd + @test validate_gdf(combine(identity, gd, ungroup=false)) == gd + @test validate_gdf(combine(d -> d[:, [2, 1]], gd, ungroup=false)) == gd @test_throws ArgumentError combine(f -> 
DataFrame(x=["a", "b"], z=[1, 1]), gd, - regroup=true) + ungroup=false) end @testset "iteration protocol" begin @@ -1306,9 +1306,9 @@ end @test gdf[:] == gdf @test gdf[1:1] == gdf - @test validate_gdf(combine(nrow => :x1, gdf, regroup=true)) == + @test validate_gdf(combine(nrow => :x1, gdf, ungroup=false)) == groupby_checked(DataFrame(x1=3), []) - @test validate_gdf(combine(:x2 => identity => :x2_identity, gdf, regroup=true)) == + @test validate_gdf(combine(:x2 => identity => :x2_identity, gdf, ungroup=false)) == groupby_checked(DataFrame(x2_identity=[1,1,2]), []) @test DataFrame(gdf) == df @@ -1646,7 +1646,7 @@ end gdf = groupby_checked(df, :a) res = combine(sdf -> sdf.x1[1] ? fr : er, gdf) @test res == DataFrame(validate_gdf(combine(sdf -> sdf.x1[1] ? fr : er, - groupby_checked(df, :a), regroup=true))) + groupby_checked(df, :a), ungroup=false))) if fr isa AbstractVector && df.x1[1] @test res == combine(:x1 => (x1 -> x1[1] ? fr : er) => :x1, gdf) else @@ -1679,7 +1679,7 @@ end @test combine(gdf, Between(:x2, :x1) => () -> 1) == DataFrame(:g => 1:2, Symbol("function") => 1) @test combine(gdf, :x1 => :z) == combine(gdf, [:x1 => :z]) == combine(:x1 => :z, gdf) == DataFrame(g=[1,1,1,2,2,2], z=1:6) - @test validate_gdf(combine(:x1 => :z, groupby_checked(df, :g), regroup=true)) == + @test validate_gdf(combine(:x1 => :z, groupby_checked(df, :g), ungroup=false)) == groupby_checked(DataFrame(g=[1,1,1,2,2,2], z=1:6), :g) end @@ -1882,7 +1882,7 @@ end @test combine(gdf , AsTable([:x, :y]) => Ref) == combine(AsTable([:x, :y]) => Ref, gdf) == DataFrame(g=1:2, x_y_Ref=[(x=[1,2,3], y=[6,7,8]), (x=[4,5], y=[9,10])]) - @test validate_gdf(combine(AsTable([:x, :y]) => Ref, gdf, regroup=true)) == + @test validate_gdf(combine(AsTable([:x, :y]) => Ref, gdf, ungroup=false)) == groupby_checked(combine(gdf, AsTable([:x, :y]) => Ref), :g) @test combine(gdf, AsTable(1) => Ref) == @@ -1894,7 +1894,7 @@ end combine(AsTable([:x, :y]) => ByRow(x -> [x]), gdf) == DataFrame(g=[1,1,1,2,2], 
x_y_function=[[(x=1,y=6)], [(x=2,y=7)], [(x=3,y=8)], [(x=4,y=9)], [(x=5,y=10)]]) - @test validate_gdf(combine(AsTable([:x, :y]) => ByRow(x -> [x]), gdf, regroup=true)) == + @test validate_gdf(combine(AsTable([:x, :y]) => ByRow(x -> [x]), gdf, ungroup=false)) == groupby_checked(combine(gdf, AsTable([:x, :y]) => ByRow(x -> [x])), :g) # whole column and ByRow test for multiple pairs passed @@ -1908,10 +1908,10 @@ end @test_throws ArgumentError combine(gdf, AsTable([:x, :y]) => ByRow(x -> df[1, :])) end -@testset "test correctness of regrouping" begin +@testset "test correctness of ungrouping" begin df = DataFrame(g=[2,2,1,3,1,2,1,2,3]) gdf = groupby_checked(df, :g) - gdf2 = validate_gdf(combine(identity, gdf, regroup=true)) + gdf2 = validate_gdf(combine(identity, gdf, ungroup=false)) @test combine(gdf, :g => sum) == combine(gdf2, :g => sum) df.id = 1:9 @@ -1926,32 +1926,32 @@ end if !(df.g isa CategoricalVector) gdf = groupby_checked(df, :g, sort=false, skipmissing=false) - @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == + @test combine(gdf, :x => sum, keepkeys=false, ungroup=true) == DataFrame(x_sum = [1, 5, 4]) - @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) - @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, ungroup=false) + @test combine(gdf, :x => sum, keepkeys=true, ungroup=true) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) - gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == 1:3 @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) @test DataFrame(gdf2, keepkeys=false) == DataFrame(x_sum = [1, 5, 4]) - @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + @test combine(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅ 
DataFrame(x_sum = [1, 5, 5, 4], g = [3, 1, 1, missing]) - @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + @test combine(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅ DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4]) - gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == [1, 2, 2, 3] @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 5, 4]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, ungroup=true) == DataFrame(x_sum = [1, 5, 4]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) - gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == 1:3 @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4]) @@ -1959,32 +1959,32 @@ end gdf = groupby_checked(df, :g, sort=false, skipmissing=true) - @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == + @test combine(gdf, :x => sum, keepkeys=false, ungroup=true) == DataFrame(x_sum = [1, 5]) - @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) - @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, ungroup=false) + @test combine(gdf, :x => sum, keepkeys=true, ungroup=true) ≅ DataFrame(g = [3, 1], x_sum = [1, 5]) - gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, 
regroup=true)) + gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == 1:2 @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5]) - @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + @test combine(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅ DataFrame(x_sum = [1, 5, 5], g = [3, 1, 1]) - @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + @test combine(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅ DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5]) - gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == [1, 2, 2] @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 5]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, ungroup=true) == DataFrame(x_sum = [1, 5]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true) ≅ DataFrame(g = [3, 1], x_sum = [1, 5]) - gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == 1:2 @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5]) @@ -1993,32 +1993,32 @@ end gdf = groupby_checked(df, :g, sort=true, skipmissing=false) - @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == + @test combine(gdf, :x => sum, keepkeys=false, ungroup=true) == DataFrame(x_sum = [5, 1, 4]) - @test_throws ArgumentError 
validate_gdf(combine(gdf, :x => sum, keepkeys=false, regroup=true)) - @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + @test_throws ArgumentError validate_gdf(combine(gdf, :x => sum, keepkeys=false, ungroup=false)) + @test combine(gdf, :x => sum, keepkeys=true, ungroup=true) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) - gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == 1:3 @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1, 4]) - @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + @test combine(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅ DataFrame(x_sum = [5, 5, 1, 4], g = [1, 1, 3, missing]) - @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + @test combine(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅ DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4]) - gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == [1, 1, 2, 3] @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 5, 1, 4]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, ungroup=true) == DataFrame(x_sum = [5, 1, 4]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) - gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)) + gdf2 = 
validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == 1:3 @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4]) @@ -2026,32 +2026,32 @@ end gdf = groupby_checked(df, :g, sort=true, skipmissing=true) - @test combine(gdf, :x => sum, keepkeys=false, regroup=false) == + @test combine(gdf, :x => sum, keepkeys=false, ungroup=true) == DataFrame(x_sum = [5, 1]) - @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true) - @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, ungroup=false) + @test combine(gdf, :x => sum, keepkeys=true, ungroup=true) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) - gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == 1:2 @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1]) - @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + @test combine(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅ DataFrame(x_sum = [5, 5, 1], g = [1, 1, 3]) - @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + @test combine(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅ DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1]) - gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == [1, 1, 2] @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1]) @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 5, 1]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) == + @test combine(x -> (x_sum = 
sum(x.x),), gdf, keepkeys=false, ungroup=true) == DataFrame(x_sum = [5, 1]) - @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅ + @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) - gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == 1:2 @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1]) @@ -2066,33 +2066,33 @@ end gdf = groupby_checked(df, :g, sort=dosort, skipmissing=false) - @test select(gdf, :x => sum, keepkeys=false, regroup=false) == + @test select(gdf, :x => sum, keepkeys=false, ungroup=true) == DataFrame(x_sum = [1, 5, 5, 4]) - @test_throws ArgumentError select(gdf, :x => sum, keepkeys=false, regroup=true) - @test select(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + @test_throws ArgumentError select(gdf, :x => sum, keepkeys=false, ungroup=false) + @test select(gdf, :x => sum, keepkeys=true, ungroup=true) ≅ DataFrame(g = df.g, x_sum = [1, 5, 5, 4]) - gdf2 = validate_gdf(select(gdf, :x => sum, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(select(gdf, :x => sum, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == gdf.groups @test parent(gdf2).g ≅ df.g @test parent(gdf2).g !== df.g - @test select(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + @test select(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅ DataFrame(x_sum = [1, 5, 5, 4], g = df.g) - @test select(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + @test select(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅ DataFrame(g = df.g, x_sum = [1, 5, 5, 4]) - gdf2 = validate_gdf(select(gdf, :x => sum, :g, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(select(gdf, :x => sum, :g, keepkeys=true, ungroup=false)) @test gdf2 isa 
GroupedDataFrame{DataFrame} @test gdf2.groups == gdf.groups @test parent(gdf2).g ≅ df.g @test parent(gdf2).g !== df.g - @test transform(gdf, :x => sum, keepkeys=false, regroup=false) ≅ + @test transform(gdf, :x => sum, keepkeys=false, ungroup=true) ≅ [df DataFrame(x_sum = [1, 5, 5, 4])] - @test_throws ArgumentError transform(gdf, :x => sum, keepkeys=false, regroup=true) - @test transform(gdf, :x => sum, keepkeys=true, regroup=false) ≅ + @test_throws ArgumentError transform(gdf, :x => sum, keepkeys=false, ungroup=false) + @test transform(gdf, :x => sum, keepkeys=true, ungroup=true) ≅ DataFrame(g = df.g, x = df.x, y = df.y, x_sum = [1, 5, 5, 4]) - gdf2 = validate_gdf(transform(gdf, :x => sum, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(transform(gdf, :x => sum, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == gdf.groups @test parent(gdf2).g ≅ df.g @@ -2100,11 +2100,11 @@ end @test parent(gdf2).y ≅ df.y @test parent(gdf2).g !== df.g - @test transform(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅ + @test transform(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅ [df DataFrame(x_sum = [1, 5, 5, 4])] - @test transform(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅ + @test transform(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅ [df DataFrame(x_sum = [1, 5, 5, 4])] - gdf2 = validate_gdf(transform(gdf, :x => sum, :g, keepkeys=true, regroup=true)) + gdf2 = validate_gdf(transform(gdf, :x => sum, :g, keepkeys=true, ungroup=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == gdf.groups @test parent(gdf2).g ≅ df.g @@ -2112,17 +2112,17 @@ end @test parent(gdf2).y ≅ df.y @test parent(gdf2).g !== df.g - df2 = transform(gdf, :x => sum, :g, keepkeys=false, regroup=false, copycols=false) + df2 = transform(gdf, :x => sum, :g, keepkeys=false, ungroup=true, copycols=false) @test df2 ≅ [df DataFrame(x_sum = [1, 5, 5, 4])] @test df2.g === df.g @test df2.x === df.x @test df2.y === df.y - df2 = 
transform(gdf, :x => sum, :g, keepkeys=true, regroup=false, copycols=false) + df2 = transform(gdf, :x => sum, :g, keepkeys=true, ungroup=true, copycols=false) @test df2 ≅ [df DataFrame(x_sum = [1, 5, 5, 4])] @test df2.g === df.g @test df2.x === df.x @test df2.y === df.y - gdf2 = validate_gdf(transform(gdf, :x => sum, :g, keepkeys=true, regroup=true, copycols=false)) + gdf2 = validate_gdf(transform(gdf, :x => sum, :g, keepkeys=true, ungroup=false, copycols=false)) @test gdf2 isa GroupedDataFrame{DataFrame} @test gdf2.groups == gdf.groups @test parent(gdf2).g ≅ df.g @@ -2132,9 +2132,9 @@ end gdf = groupby_checked(df, :g, sort=dosort, skipmissing=true) @test_throws ArgumentError select(gdf, :x => sum) - @test_throws ArgumentError select(gdf, :x => sum, regroup=true) + @test_throws ArgumentError select(gdf, :x => sum, ungroup=false) @test_throws ArgumentError transform(gdf, :x => sum) - @test_throws ArgumentError transform(gdf, :x => sum, regroup=true) + @test_throws ArgumentError transform(gdf, :x => sum, ungroup=false) end # show the difference between the ordering of rows in select and combine @@ -2190,7 +2190,7 @@ end dfc = copy(df) g = dfc.g gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false) - @test validate_gdf(select!(gdf, :x => sum, regroup=true)) === gdf + @test validate_gdf(select!(gdf, :x => sum, ungroup=false)) === gdf @test dfc.g === g @test dfc.x_sum == [1, 5, 5, 4] @test propertynames(dfc) == [:g, :x_sum] @@ -2200,7 +2200,7 @@ end x = dfc.x y = dfc.y gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false) - @test validate_gdf(transform!(gdf, :g => first => :g, :x => first, regroup=true)) === gdf + @test validate_gdf(transform!(gdf, :g => first => :g, :x => first, ungroup=false)) === gdf @test dfc.g === g @test dfc.x === x @test dfc.y === y @@ -2210,9 +2210,9 @@ end dfc = copy(df) gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=true) @test_throws ArgumentError select!(gdf, :x => sum) - @test_throws ArgumentError select!(gdf, :x 
=> sum, regroup=true) + @test_throws ArgumentError select!(gdf, :x => sum, ungroup=false) @test_throws ArgumentError transform!(gdf, :x => sum) - @test_throws ArgumentError transform!(gdf, :x => sum, regroup=true) + @test_throws ArgumentError transform!(gdf, :x => sum, ungroup=false) @test dfc ≅ df end end diff --git a/test/string.jl b/test/string.jl index 233b9cd8b8..589d4ca825 100644 --- a/test/string.jl +++ b/test/string.jl @@ -169,12 +169,12 @@ end @test combine(gdf, :a) == combine(gdf, "a") == combine(gdf, [:a]) == combine(gdf, ["a"]) - @test combine("a" => identity, gdf, regroup=true) == - combine(:a => identity, gdf, regroup=true) - @test combine(["a"] => identity, gdf, regroup=true) == - combine([:a] => identity, gdf, regroup=true) - @test combine(nrow => :n, gdf, regroup=true) == - combine(nrow => "n", gdf, regroup=true) + @test combine("a" => identity, gdf, ungroup=false) == + combine(:a => identity, gdf, ungroup=false) + @test combine(["a"] => identity, gdf, ungroup=false) == + combine([:a] => identity, gdf, ungroup=false) + @test combine(nrow => :n, gdf, ungroup=false) == + combine(nrow => "n", gdf, ungroup=false) @test combine("a" => identity, gdf) == combine(:a => identity, gdf) == combine(gdf, "a" => identity) == combine(gdf, :a => identity) From 333cca222319ef1fa9fe39fd0d8f28041c9302f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 5 May 2020 10:50:31 +0200 Subject: [PATCH 26/29] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- docs/src/man/split_apply_combine.md | 3 ++- src/dataframe/dataframe.jl | 4 ++-- src/groupeddataframe/splitapplycombine.jl | 11 +++++++---- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md index b743d4250d..1eb02f4889 100644 --- a/docs/src/man/split_apply_combine.md +++ b/docs/src/man/split_apply_combine.md @@ -48,7 +48,8 @@ passed to `function`. 
In all of these cases, `function` can return either a single row or multiple rows. `function` can always generate a single column by returning a single value or a vector. -Additionally, if `combine` is passed exactly one `function`, `cols => function`, or `cols => function => outcol` as a first argument +Additionally, if `combine` is passed exactly one `function`, `cols => function`, +or `cols => function => outcol` as a first argument and `target_col` is not specified, `function` can return multiple columns in the form of an `AbstractDataFrame`, `AbstractMatrix`, `NamedTuple` or `DataFrameRow`. diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 06ebd27d53..07d45426dd 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -66,7 +66,7 @@ stored in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and treated in the same way. Additionally `DataFrame` can be used to collect a [`GroupedDataFrame`](@ref) -into a `DataFrame`. In this case the row ofder of the result follows the order +into a `DataFrame`. In this case the order of rows in the result follows the order of groups in the `GroupedDataFrame` passed. # Notes @@ -1673,7 +1673,7 @@ function repeat!(df::DataFrame, count::Integer) return mapcols!(x -> repeat(x, count), df) end -# it is not exactly copy! as in general we alow axes to be different +# This is not exactly copy! 
as in general we allow axes to be different function _replace_columns!(df::DataFrame, newdf::DataFrame) copy!(_columns(df), _columns(newdf)) copy!(_names(index(df)), _names(newdf)) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index b6d2ddb4ac..86df05b953 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -1003,7 +1003,10 @@ function _agg2idx_map_helper(idx, idx_agg) return agg2idx_map end -function prepare_idx_keeprows(idx, starts, ends, nrowparent) +function prepare_idx_keeprows(idx::AbstractVector{<:Integer}, + starts::AbstractVector{<:Integer}, + ends::AbstractVector{<:Integer}, + nrowparent::Integer) idx_keeprows = Vector{Int}(undef, nrowparent) i = 0 for (s, e) in zip(starts, ends) @@ -1421,8 +1424,8 @@ end copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true) Apply `args` to `gd` following the rules described in [`combine`](@ref). -The return value has number of rows equal to `nrow(parent(gd))` -(if single value is returned it is always broadcasted to have this number of rows). +The returned object has as many rows as `parent(gd)`. +If an operation returns a single value it is always broadcasted to have this number of rows. If `copycols=false` then do not perform copying of columns that are not transformed. 
@@ -1513,7 +1516,7 @@ julia> select(gd, [:b, :c] .=> sum) # passing a vector of pairs │ 8 │ 2 │ 4 │ 17 │ julia> select(gd, :b => :b1, :c => :c1, - [:b, :c] => +, keepkeys=false) # auto-splatting, renaming and keepkeys + [:b, :c] => +, keepkeys=false) # multiple arguments, renaming and keepkeys 8×3 DataFrame │ Row │ b1 │ c1 │ b_c_+ │ │ │ Int64 │ Int64 │ Int64 │ From 10b947467d4b86c0bfa2d4b48c45eca11759d949 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 5 May 2020 11:17:07 +0200 Subject: [PATCH 27/29] update docs --- src/groupeddataframe/splitapplycombine.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 86df05b953..3671b618bb 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -1429,6 +1429,8 @@ If an operation returns a single value it is always broadcasted to have this num If `copycols=false` then do not perform copying of columns that are not transformed. 
+$KWARG_PROCESSING_RULES + # See also [`groupby](@ref), [`combine`](@ref), [`select!`](@ref), [`transform`](@ref), [`transform!`](@ref) From 792b57d9221b4169f27459e5eecaaa17460acd90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 5 May 2020 11:45:16 +0200 Subject: [PATCH 28/29] improve description of what gets returned in combine and select --- src/groupeddataframe/splitapplycombine.jl | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 3671b618bb..ef2f81be15 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -258,9 +258,11 @@ const KWARG_PROCESSING_RULES = combine(fun::Union{Function, Type}, df::AbstractDataFrame, ungroup::Bool=true) combine(pair::Pair, df::AbstractDataFrame, ungroup::Bool=true) -Apply operations to each group in a [`GroupedDataFrame`](@ref) and return -the combined result as a `DataFrame`. -If an `AbstractDataFrame` is passed, apply operations to the data frame as a whole. +Apply operations to each group in a [`GroupedDataFrame`](@ref) and return the combined +result as a `DataFrame` if `ungroup=true` or `GroupedDataFrame` if `ungroup=false`. + +If an `AbstractDataFrame` is passed, apply operations to the data frame as a whole +and a `DataFrame` is always returned. $F_ARGUMENT_RULES @@ -1423,9 +1425,15 @@ end select(gd::GroupedDataFrame, args...; copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true) -Apply `args` to `gd` following the rules described in [`combine`](@ref). -The returned object has as many rows as `parent(gd)`. -If an operation returns a single value it is always broadcasted to have this number of rows. +Apply `args` to `gd` following the rules described in [`combine`](@ref) and return the +result as a `DataFrame` if `ungroup=true` or `GroupedDataFrame` if `ungroup=false`. 
+ +The `parent` of the returned value has as many rows as `parent(gd)`. If an operation +in `args` returns a single value it is always broadcasted to have this number of rows. + +Apply operations to each group in a [`GroupedDataFrame`](@ref) and return the combined +result as a `DataFrame` if `ungroup=true` or `GroupedDataFrame` if `ungroup=false`. + If `copycols=false` then do not perform copying of columns that are not transformed. From f34873cbf773c06986d4f17b939a49ec9542d40e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Tue, 5 May 2020 12:24:10 +0200 Subject: [PATCH 29/29] fix repeated code --- src/groupeddataframe/splitapplycombine.jl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index ef2f81be15..28b8be5c23 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -1431,10 +1431,6 @@ result as a `DataFrame` if `ungroup=true` or `GroupedDataFrame` if `ungroup=fals The `parent` of the returned value has as many rows as `parent(gd)`. If an operation in `args` returns a single value it is always broadcasted to have this number of rows. -Apply operations to each group in a [`GroupedDataFrame`](@ref) and return the combined -result as a `DataFrame` if `ungroup=true` or `GroupedDataFrame` if `ungroup=false`. - - If `copycols=false` then do not perform copying of columns that are not transformed. $KWARG_PROCESSING_RULES