From 55031d79f84d13c91e3488425fefda0638a5ed33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 27 Apr 2020 15:47:15 +0200
Subject: [PATCH 01/29] implement AbstractDataFrame functionality

---
 src/abstractdataframe/selection.jl | 89 +++++++++++++++++++-----------
 1 file changed, 57 insertions(+), 32 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 5aa917eb14..4fcca42d30 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -161,7 +161,7 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable},
     col_idx, (fun, newname) = nc
     # It is allowed to request a tranformation operation into a newname column
     # only once. This is ensured by the logic related to transformed_cols dictionaly
-    # in _select, therefore in select_transform! such a duplicate should not happen
+    # in _process, therefore in select_transform! such a duplicate should not happen
     @assert !hasproperty(newdf, newname)
     cdf = eachcol(df)
     if col_idx isa Int
@@ -185,6 +185,13 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable},
                 newdfcols[i] = fill!(similar(col, length(res)), first(col))
             end
         end
+
+        # this means that we use `select` or `transform` not `combine`
+        if !allow_resizing_newdf[] && ncol(newdf) == 0 && length(res) != nrow(df)
+            throw(ArgumentError("length $(length(res)) of vector returned from " *
+                                "function $fun is different than number of rows" *
+                                "$(nrow(df)) of the source data frame."))
+        end
         allow_resizing_newdf[] = false
         respar = parent(res)
         parent_cols = col_idx isa AsTable ? col_idx.cols : col_idx
@@ -196,9 +203,14 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable},
         end
     else
         res_unwrap = res isa Union{AbstractArray{<:Any, 0}, Ref} ? res[] : res
-        # allow squashing a scalar to 0 rows
-        newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(res_unwrap),
-                                                        ncol(newdf) == 0 ? 1 : nrow(newdf)),
+        if ncol(newdf) == 0
+            # if allow_resizing_newdf[] is false we know this is select or transform
+            rows = allow_resizing_newdf[] ? 1 : nrow(df)
+        else
+            # allow squashing a scalar to 0 rows
+            rows = nrow(newdf)
+        end
+        newdf[!, newname] = fill!(Tables.allocatecolumn(typeof(res_unwrap), rows),
                                   res_unwrap)
     end
     # mark that column transformation was applied
@@ -518,22 +530,42 @@ julia> select(df, AsTable(:) => ByRow(mean))
 ```
 
 """
-select(df::DataFrame, args::AbstractVector{Int}; copycols::Bool=true) =
+select(df::AbstractDataFrame, args...; copycols::Bool=true) =
+    _select(df, args..., copycols=copycols, keeprows=true)
+
+"""
+    transform(df::AbstractDataFrame, args...; copycols::Bool=true)
+
+Create a new data frame that contains columns from `df` and adds columns
+specified by `args` and return it.
+Equivalent to `select(df, :, args..., copycols=copycols)`.
+
+See [`select`](@ref) for detailed rules regarding accepted values for `args`.
+"""
+transform(df::AbstractDataFrame, args...; copycols::Bool=true) =
+    select(df, :, args..., copycols=copycols)
+
+combine(df::AbstractDataFrame, args...) =
+    _select(df, args..., copycols=true, keeprows=false)
+
+combine(arg, df::AbstractDataFrame) = combine(arg, groupby(df, []))
+
+_select(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) =
     DataFrame(_columns(df)[args], Index(_names(df)[args]),
               copycols=copycols)
 
-function select(df::DataFrame, c::MultiColumnIndex; copycols::Bool=true)
+function _select(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool)
     if c isa AbstractVector{<:Pair}
-        return select(df, c..., copycols=copycols)
+        return _select(df, c..., copycols=copycols, keeprows=keeprows)
     else
-        return select(df, index(df)[c], copycols=copycols)
+        return _select(df, index(df)[c], copycols=copycols, keeprows=keeprows)
     end
 end
 
-select(df::DataFrame, c::ColumnIndex; copycols::Bool=true) =
-    select(df, [c], copycols=copycols)
+_select(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) =
+    _select(df, [c], copycols=copycols, keeprows=keeprows)
 
-function select(df::DataFrame, cs...; copycols::Bool=true)
+function _select(df::DataFrame, cs...; copycols::Bool, keeprows::Bool)
     cs_vec = []
     for v in cs
         if v isa AbstractVector{<:Pair}
@@ -542,10 +574,11 @@ function select(df::DataFrame, cs...; copycols::Bool=true)
             push!(cs_vec, v)
         end
     end
-    _select(df, [normalize_selection(index(df), c) for c in cs_vec], copycols)
+    return _process(df, [normalize_selection(index(df), c) for c in cs_vec],
+                    copycols, keeprows)
 end
 
-function _select(df::AbstractDataFrame, normalized_cs, copycols::Bool)
+function _process(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows::Bool)
     @assert !(df isa SubDataFrame && copycols==false)
     newdf = DataFrame()
     # the role of transformed_cols is the following
@@ -593,7 +626,9 @@ function _select(df::AbstractDataFrame, normalized_cs, copycols::Bool)
     end
     # we allow resizing newdf only if up to some point only scalars were put
     # in it. The moment we put any vector into newdf its number of rows becomes fixed
-    allow_resizing_newdf = Ref(true)
+    # Also if keeprows is true then we make sure to rpoduce nrow(df) rows so resizing
+    # is not allowed
+    allow_resizing_newdf = Ref(!keeprows)
     for nc in normalized_cs
         if nc isa AbstractVector{Int}
             allunique(nc) || throw(ArgumentError("duplicate column names selected"))
@@ -621,6 +656,7 @@ function _select(df::AbstractDataFrame, normalized_cs, copycols::Bool)
                                 newdfcols[i] = fill!(similar(col, nrow(df)), first(col))
                             end
                         end
+                        # here even if keeprows is true all is OK
                         newdf[!, newname] = copycols ? df[:, i] : df[!, i]
                         allow_resizing_newdf[] = false
                     end
@@ -643,18 +679,19 @@ function _select(df::AbstractDataFrame, normalized_cs, copycols::Bool)
     return newdf
 end
 
-select(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool=true) =
-    select(dfv, [ind], copycols=copycols)
+_select(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) =
+    _select(dfv, [ind], copycols=copycols, keeprows=keeprows)
 
-function select(dfv::SubDataFrame, args::MultiColumnIndex; copycols::Bool=true)
+function _select(dfv::SubDataFrame, args::MultiColumnIndex;
+                 copycols::Bool, keeprows::Bool)
     if args isa AbstractVector{<:Pair}
-        return select(dfv, args..., copycols=copycols)
+        return _select(dfv, args..., copycols=copycols, keeprows=keeprows)
     else
         return copycols ? dfv[:, args] : view(dfv, :, args)
     end
 end
 
-function select(dfv::SubDataFrame, args...; copycols::Bool=true)
+function select(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
     if copycols
         cs_vec = []
         for v in args
@@ -664,7 +701,7 @@ function select(dfv::SubDataFrame, args...; copycols::Bool=true)
                 push!(cs_vec, v)
             end
         end
-        return _select(dfv, [normalize_selection(index(dfv), c) for c in cs_vec], true)
+        return _process(dfv, [normalize_selection(index(dfv), c) for c in cs_vec], true, true)
     else
         # we do not support transformations here
         # newinds contains only indexing; making it Vector{Any} avoids some compilation
@@ -692,15 +729,3 @@ function select(dfv::SubDataFrame, args...; copycols::Bool=true)
         return view(dfv, :, isempty(newinds) ? [] : All(newinds...))
     end
 end
-
-"""
-    transform(df::AbstractDataFrame, args...; copycols::Bool=true)
-
-Create a new data frame that contains columns from `df` and adds columns
-specified by `args` and return it.
-Equivalent to `select(df, :, args..., copycols=copycols)`.
-
-See [`select`](@ref) for detailed rules regarding accepted values for `args`.
-"""
-transform(df::AbstractDataFrame, args...; copycols::Bool=true) =
-    select(df, :, args..., copycols=copycols)

From 55bde129b56808f521235e114661ec41642d3a52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 27 Apr 2020 23:41:40 +0200
Subject: [PATCH 02/29] preparation in grouping, rename to _manipulate in
 non-grouping

---
 src/abstractdataframe/selection.jl        | 28 +++++++++++------------
 src/groupeddataframe/splitapplycombine.jl | 17 ++++++++++----
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 4fcca42d30..5e3f91fcf1 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -531,7 +531,7 @@ julia> select(df, AsTable(:) => ByRow(mean))
 
 """
 select(df::AbstractDataFrame, args...; copycols::Bool=true) =
-    _select(df, args..., copycols=copycols, keeprows=true)
+    _manipulate(df, args..., copycols=copycols, keeprows=true)
 
 """
     transform(df::AbstractDataFrame, args...; copycols::Bool=true)
@@ -546,26 +546,26 @@ transform(df::AbstractDataFrame, args...; copycols::Bool=true) =
     select(df, :, args..., copycols=copycols)
 
 combine(df::AbstractDataFrame, args...) =
-    _select(df, args..., copycols=true, keeprows=false)
+    _manipulate(df, args..., copycols=true, keeprows=false)
 
 combine(arg, df::AbstractDataFrame) = combine(arg, groupby(df, []))
 
-_select(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) =
+_manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) =
     DataFrame(_columns(df)[args], Index(_names(df)[args]),
               copycols=copycols)
 
-function _select(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool)
+function _manipulate(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool)
     if c isa AbstractVector{<:Pair}
-        return _select(df, c..., copycols=copycols, keeprows=keeprows)
+        return _manipulate(df, c..., copycols=copycols, keeprows=keeprows)
     else
-        return _select(df, index(df)[c], copycols=copycols, keeprows=keeprows)
+        return _manipulate(df, index(df)[c], copycols=copycols, keeprows=keeprows)
     end
 end
 
-_select(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) =
-    _select(df, [c], copycols=copycols, keeprows=keeprows)
+_manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) =
+    _manipulate(df, [c], copycols=copycols, keeprows=keeprows)
 
-function _select(df::DataFrame, cs...; copycols::Bool, keeprows::Bool)
+function _manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool)
     cs_vec = []
     for v in cs
         if v isa AbstractVector{<:Pair}
@@ -679,19 +679,19 @@ function _process(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows
     return newdf
 end
 
-_select(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) =
-    _select(dfv, [ind], copycols=copycols, keeprows=keeprows)
+_manipulate(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) =
+    _manipulate(dfv, [ind], copycols=copycols, keeprows=keeprows)
 
-function _select(dfv::SubDataFrame, args::MultiColumnIndex;
+function _manipulate(dfv::SubDataFrame, args::MultiColumnIndex;
                  copycols::Bool, keeprows::Bool)
     if args isa AbstractVector{<:Pair}
-        return _select(dfv, args..., copycols=copycols, keeprows=keeprows)
+        return _manipulate(dfv, args..., copycols=copycols, keeprows=keeprows)
     else
         return copycols ? dfv[:, args] : view(dfv, :, args)
     end
 end
 
-function select(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
+function _manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
     if copycols
         cs_vec = []
         for v in args
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 566b02171a..1a1aa69083 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -173,6 +173,15 @@ function groupby(df::AbstractDataFrame, cols;
     return gd
 end
 
+function _check_cannonical(gdf::GroupedDataFrame)
+    @assert length(gdf.starts) == length(gdf.ends)
+    (gdf.starts[1] != 1 || gdf.ends[end] != nrow(parent(gdf))) && return false
+    for i in 2:length(gdf.starts)
+        gdf.starts[i] - gdf.ends[i-1] != 1 && return false
+    end
+    return true
+end
+
 const F_TYPE_RULES =
     """
     `fun` can return a single value, a row, a vector, or multiple rows.
@@ -574,14 +583,12 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum)
 
 See [`by`](@ref) for more examples.
 """
-combine(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame; keepkeys::Bool=true) =
-    combine(gd, f, keepkeys=keepkeys)
-combine(gd::GroupedDataFrame, f::Base.Callable; keepkeys::Bool=true) =
+combine(f::Base.Callable, gd::GroupedDataFrame; keepkeys::Bool=true) =
     combine_helper(f, gd, keepkeys=keepkeys)
-combine(gd::GroupedDataFrame, f::typeof(nrow); keepkeys::Bool=true) =
+combine(f::typeof(nrow), gd::GroupedDataFrame; keepkeys::Bool=true) =
     combine(gd, [nrow => :nrow], keepkeys=keepkeys)
 
-function combine(gd::GroupedDataFrame, p::Pair; keepkeys::Bool=true)
+function combine(p::Pair, gd::GroupedDataFrame; keepkeys::Bool=true)
     # move handling of aggregate to specialized combine
     p_from, p_to = p
 

From 2f81c63ba8b54b8225be91c709a64497fcf3220f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 28 Apr 2020 00:50:38 +0200
Subject: [PATCH 03/29] tentative rework of _combine that should be able to
 support select and transform efficiently

---
 src/abstractdataframe/selection.jl        | 10 ++---
 src/groupeddataframe/splitapplycombine.jl | 45 +++++++++++++++++++++--
 2 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 5e3f91fcf1..f211ee09b8 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -3,12 +3,10 @@
 
 # normalize_selection function makes sure that whatever input format of idx is it
 # will end up in one of four canonical forms
-# 1) Int
-# 2) AbstractVector{Int}
-# 4) Pair{Int, <:Pair{<:Base.Callable, Symbol}}
-# 5) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, Symbol}}
-# 6) Pair{Int, Pair{ByRow, Symbol}}
-# 7) Pair{AbstractVector{Int}, Pair{ByRow, Symbol}}
+# 1) AbstractVector{Int}
+# 2) Pair{Int, <:Pair{<:Base.Callable, Symbol}}
+# 3) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, Symbol}}
+# 4) Pair{AsTable, <:Pair{<:Base.Callable, Symbol}}
 
 """
     ByRow
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 1a1aa69083..8aed872aa2 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -1097,10 +1097,32 @@ function _agg2idx_map_helper(idx, idx_agg)
 end
 
 function _combine(f::AbstractVector{<:Pair},
-                  gd::GroupedDataFrame, nms::AbstractVector{Symbol})
+                  gd::GroupedDataFrame, nms::AbstractVector{Symbol},
+                  copycols::Bool=true, keeprows::Bool=false) # TODO: remove these defaults
     # here f should be normalized and in a form of source_cols => fun
     @assert all(x -> first(x) isa Union{Int, AbstractVector{Int}, AsTable}, f)
     @assert all(x -> last(x) isa Union{Base.Callable, ByRow}, f)
+
+    if keeprows
+        if !_check_cannonical(gd)
+            throw(ArgumentError("select or transform functions require that" *
+                                "GroupedDataFrame is not sorted or subsetted"))
+        end
+        idx_keeprows = Vector{Int}(undef, nrow(parent(gd)))
+        let i = 0
+            for (s, e) in zip(gd.starts, gd.ends)
+                v = gd.idx[s]
+                for k in s:e
+                    i += 1
+                    idx_keeprows[i] = v
+                end
+            end
+            @assert i == nrow(parent(gd))
+        end
+    else
+        idx_keeprows = nothing # should not be used but do not leave it unassigned
+    end
+
     idx_agg = nothing
     if any(isagg, f)
         # Compute indices of representative rows only once for all AbstractAggregates
@@ -1120,6 +1142,11 @@ function _combine(f::AbstractVector{<:Pair},
             agg = check_aggregate(last(p))
             outcol = agg(incol, gd)
             res[i] = idx_agg, outcol
+        elseif keeprows && fun isa identity && !(source_cols isa AsTable)
+            @assert source_cols isa Union{Int, AbstractVector{Int}}
+            @assert length(source_cols) == 1
+            outcol = parentdf[!, first(source_cols)]
+            res[i] = idx_keeprows, copycols ? copy(outcol) : outcol
         else
             if source_cols isa Int
                 incols = (parentdf[!, source_cols],)
@@ -1160,11 +1187,15 @@ function _combine(f::AbstractVector{<:Pair},
     # idx_agg === nothing then we have only functions that
     # returned multiple rows and idx_loc = 1
     idx_loc = findfirst(x -> x[1] !== idx_agg, res)
-    if isnothing(idx_loc)
+    if !keeprows && isnothing(idx_loc)
         @assert !isnothing(idx_agg)
         idx = idx_agg
     else
-        idx = res[idx_loc][1]
+        if keeprows
+            idx = idx_keeprows
+        else
+            idx = res[idx_loc][1]
+        end
         agg2idx_map = nothing
         for i in 1:length(res)
             if res[i][1] !== idx && res[i][1] != idx
@@ -1176,7 +1207,13 @@ function _combine(f::AbstractVector{<:Pair},
                     end
                     res[i] = idx, res[i][2][agg2idx_map]
                 elseif idx != res[i][1]
-                    throw(ArgumentError("all functions must return vectors of the same length"))
+                    if keeprows
+                        throw(ArgumentError("all functions must return vectors of " *
+                                            "the length equal to the group rows count " *
+                                            "in the source GroupedDataFrame"))
+                    else
+                        throw(ArgumentError("all functions must return vectors of the same length"))
+                    end
                 end
             end
         end

From fd951c51f9af0aebace945b5d4b4001385dcb72b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 28 Apr 2020 12:02:27 +0200
Subject: [PATCH 04/29] continue grouping

---
 src/groupeddataframe/splitapplycombine.jl | 26 ++++++++++++++---------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 8aed872aa2..1a9895de5c 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -583,12 +583,15 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum)
 
 See [`by`](@ref) for more examples.
 """
-combine(f::Base.Callable, gd::GroupedDataFrame; keepkeys::Bool=true) =
-    combine_helper(f, gd, keepkeys=keepkeys)
-combine(f::typeof(nrow), gd::GroupedDataFrame; keepkeys::Bool=true) =
-    combine(gd, [nrow => :nrow], keepkeys=keepkeys)
-
-function combine(p::Pair, gd::GroupedDataFrame; keepkeys::Bool=true)
+combine(f::Base.Callable, gd::GroupedDataFrame;
+        keepkeys::Bool=true, regroup::Bool=false) =
+    combine_helper(f, gd, keepkeys=keepkeys, regroup=regroup)
+combine(f::typeof(nrow), gd::GroupedDataFrame;
+        keepkeys::Bool=true, regroup::Bool=false) =
+    combine(gd, [nrow => :nrow], keepkeys=keepkeys, regroup=regroup)
+
+function combine(p::Pair, gd::GroupedDataFrame;
+                 keepkeys::Bool=true, regroup::Bool=false)
     # move handling of aggregate to specialized combine
     p_from, p_to = p
 
@@ -605,13 +608,13 @@ function combine(p::Pair, gd::GroupedDataFrame; keepkeys::Bool=true)
     else
         cs = p_from
     end
-    return combine_helper(cs => p_to, gd, keepkeys=keepkeys)
+    return combine_helper(cs => p_to, gd, keepkeys=keepkeys, regroup=regroup)
 end
 
 function combine(gd::GroupedDataFrame,
                  @nospecialize(cs::Union{Pair, typeof(nrow),
                                          ColumnIndex, MultiColumnIndex}...);
-                 keepkeys::Bool=true)
+                 keepkeys::Bool=true, regroup::Bool=false)
     @assert !isempty(cs)
     cs_vec = []
     for p in cs
@@ -684,7 +687,7 @@ function combine(gd::GroupedDataFrame,
     end
     f = Pair[first(x) => first(last(x)) for x in cs_norm]
     nms = Symbol[last(last(x)) for x in cs_norm]
-    return combine_helper(f, gd, nms, keepkeys=keepkeys)
+    return combine_helper(f, gd, nms, keepkeys=keepkeys, regroup=regroup)
 end
 
 function combine(gd::GroupedDataFrame; f...)
@@ -700,7 +703,10 @@ end
 
 function combine_helper(f, gd::GroupedDataFrame,
                         nms::Union{AbstractVector{Symbol},Nothing}=nothing;
-                        keepkeys::Bool=true)
+                        keepkeys::Bool, regroup::Bool)
+    if regroup && !keepkeys
+        throw(ArgumentError("keepkeys=false when regroup=true is not allowed"))
+    end
     if length(gd) > 0
         idx, valscat = _combine(f, gd, nms)
         keepkeys || return valscat

From eb9ace938a92cf72b42b62940071964ac79f4b8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 28 Apr 2020 14:48:28 +0200
Subject: [PATCH 05/29] implement select, transform, select! and transform! for
 GroupedDataFrame, fix bug in map

---
 src/deprecated.jl                         |  22 +
 src/groupeddataframe/splitapplycombine.jl | 503 ++++++----------------
 2 files changed, 164 insertions(+), 361 deletions(-)

diff --git a/src/deprecated.jl b/src/deprecated.jl
index 13ba76c3c7..e95b82aa6e 100644
--- a/src/deprecated.jl
+++ b/src/deprecated.jl
@@ -444,3 +444,25 @@ function aggregate(d::AbstractDataFrame, cols, fs::AbstractVector;
 end
 
 @deprecate deleterows!(df::DataFrame, inds) delete!(df, inds)
+
+@deprecate by(f::Union{Base.Callable, Pair}, d::AbstractDataFrame, cols::Any;
+   sort::Bool=false, skipmissing::Bool=false,
+   keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing),
+                                f, keepkeys=keepkeys)
+@deprecate by(d::AbstractDataFrame, cols::Any, f::Base.Callable;
+   sort::Bool=false, skipmissing::Bool=false,
+   keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing),
+                                f, keepkeys=keepkeys)
+@deprecate by(d::AbstractDataFrame, cols::Any, f::Pair;
+   sort::Bool=false, skipmissing::Bool=false,
+   keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing),
+                                f, keepkeys=keepkeys)
+
+@deprecate by(d::AbstractDataFrame, cols::Any, f::Union{Pair, typeof(nrow),
+                                             ColumnIndex, MultiColumnIndex}...;
+   sort::Bool=false, skipmissing::Bool=false,
+   keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing),
+                                f..., keepkeys=keepkeys)
+
+import Base: map
+@deprecate map(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame) combine(f, gd, regroup=true)
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 1a9895de5c..e5cb15bb4a 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -174,11 +174,15 @@ function groupby(df::AbstractDataFrame, cols;
 end
 
 function _check_cannonical(gdf::GroupedDataFrame)
-    @assert length(gdf.starts) == length(gdf.ends)
+    gmin, gmax = extrema(gdf.groups)
+    @assert length(gdf.starts) == length(gdf.ends) == gmax
+    @assert gmin <= 1
     (gdf.starts[1] != 1 || gdf.ends[end] != nrow(parent(gdf))) && return false
     for i in 2:length(gdf.starts)
         gdf.starts[i] - gdf.ends[i-1] != 1 && return false
     end
+    # gmin == 0 means we have dropped groups which is not possible here
+    @assert gmin == 1
     return true
 end
 
@@ -219,160 +223,6 @@ const F_TYPE_RULES =
     named `x1`, `x2` and so on.
     """
 
-"""
-    map(fun::Union{Function, Type}, gd::GroupedDataFrame)
-    map(pair::Pair, gd::GroupedDataFrame)
-
-Apply `fun` or `pair` to each group of rows and return a [`GroupedDataFrame`](@ref).
-
-If `fun` is specified it must be a function, and it is passed a [`SubDataFrame`](@ref)
-view for each group and can return any return value defined below.
-Note that this form is slower than `pair` due to type instability.
-
-If `pair` is passed then it must follow the rules specified for transformations in
-[`select`](@ref) and have the form `source_cols => fun`,
-`source_cols => fun => target_col`, or `source_col => target_col`.
-Function defined by `fun` is passed `SubArray` views as positional arguments for
-each column specified to be selected and can return any return value defined below,
-or a `NamedTuple` containing these `SubArray`s if `source_cols` is an `AsTable` selector.
-As a special case `nrow` or `nrow => target_col` can be passed without specifying
-input columns to efficiently calculate number of rows in each group.
-If `nrow` is passed the resulting column name is `:nrow`.
-
-
-$F_TYPE_RULES
-
-See also [`combine`](@ref) that returns a `DataFrame` rather than a `GroupedDataFrame`.
-
-# Examples
-```jldoctest
-julia> df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]),
-                      b = repeat([2, 1], outer=[4]),
-                      c = 1:8);
-
-julia> gd = groupby(df, :a);
-
-julia> map(sdf -> sum(sdf.c), gd)
-GroupedDataFrame{DataFrame} with 4 groups based on key: :a
-First Group: 1 row
-│ Row │ a     │ x1    │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 1     │ 6     │
-⋮
-Last Group: 1 row
-│ Row │ a     │ x1    │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 4     │ 12    │
-
-julia> map(:c => sum, gd)
-GroupedDataFrame with 4 groups based on key: a
-First Group (1 row): a = 1
-│ Row │ a     │ c_sum │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 1     │ 6     │
-⋮
-Last Group (1 row): a = 4
-│ Row │ a     │ c_sum │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 4     │ 12    │
-
-julia> map(nrow, gd)
-GroupedDataFrame with 4 groups based on key: a
-First Group (1 row): a = 1
-│ Row │ a     │ nrow  │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 1     │ 2     │
-⋮
-Last Group (1 row): a = 4
-│ Row │ a     │ nrow  │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 4     │ 2     │
-
-julia> map(AsTable(valuecols(gd)) => sum, gd)
-GroupedDataFrame with 4 groups based on key: a
-First Group (2 rows): a = 1
-│ Row │ a     │ b_c_sum │
-│     │ Int64 │ Int64   │
-├─────┼───────┼─────────┤
-│ 1   │ 1     │ 3       │
-│ 2   │ 1     │ 7       │
-⋮
-Last Group (2 rows): a = 4
-│ Row │ a     │ b_c_sum │
-│     │ Int64 │ Int64   │
-├─────┼───────┼─────────┤
-│ 1   │ 4     │ 5       │
-│ 2   │ 4     │ 9       │
-```
-
-See [`by`](@ref) for more examples.
-"""
-function Base.map(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame)
-    if length(gd) > 0
-        # here we know that parent(gd) has at least 1 column
-        if f isa Pair || f === nrow
-            if f isa Pair && first(f) isa Tuple
-                Base.depwarn("passing a Tuple $(first(f)) as column selector is deprecated" *
-                             ", use a vector $(collect(first(f))) instead", :combine)
-                source_cols, (fun, out_col) = normalize_selection(index(parent(gd)),
-                                                                  collect(first(f)) => last(f))
-            else
-                source_cols, (fun, out_col) = normalize_selection(index(parent(gd)), f)
-            end
-            # verify if it is not better to use a fast path, which we achieve by
-            # calling _combine(::AbstractVector, ::GroupedDataFrame, ::AbstractVector)
-            # as _combine(::Pair, ::GroupedDataFrame, ::Nothing) does not support it
-            if isagg(source_cols => fun)
-                idx, valscat = _combine([source_cols => fun], gd, [out_col])
-            else
-                idx, valscat = _combine(source_cols => last(f), gd, nothing)
-            end
-        else
-            idx, valscat = _combine(f, gd, nothing)
-        end
-        keys = _names(parent(gd))[gd.cols]
-        for key in keys
-            if hasproperty(valscat, key) &&
-               !isequal(valscat[!, key], view(parent(gd)[!, key], idx))
-               throw(ArgumentError("column :$key in returned data frame " *
-                                   "is not equal to grouping key :$key"))
-            end
-        end
-        newparent = hcat!(parent(gd)[idx, gd.cols],
-                          select(valscat, Not(intersect(keys, _names(valscat))), copycols=false))
-        if length(idx) == 0
-            return GroupedDataFrame(newparent, collect(1:length(gd.cols)), idx,
-                                    Int[], Int[], Int[], 0, Dict{Any,Int}())
-        end
-        starts = Vector{Int}(undef, length(gd))
-        ends = Vector{Int}(undef, length(gd))
-        starts[1] = 1
-        j = 1
-        @inbounds for i in 2:length(idx)
-            if idx[i] != idx[i-1]
-                j += 1
-                starts[j] = i
-                ends[j-1] = i - 1
-            end
-        end
-        # In case some groups have to be dropped
-        resize!(starts, j)
-        resize!(ends, j)
-        ends[end] = length(idx)
-        return GroupedDataFrame(newparent, collect(1:length(gd.cols)), idx,
-                                collect(1:length(idx)), starts, ends, j, nothing)
-    else
-        return GroupedDataFrame(parent(gd)[1:0, gd.cols], collect(1:length(gd.cols)),
-                                Int[], Int[], Int[], Int[], 0, Dict{Any,Int}())
-    end
-end
-
 const F_ARGUMENT_RULES =
     """
 
@@ -583,9 +433,12 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum)
 
 See [`by`](@ref) for more examples.
 """
-combine(f::Base.Callable, gd::GroupedDataFrame;
-        keepkeys::Bool=true, regroup::Bool=false) =
-    combine_helper(f, gd, keepkeys=keepkeys, regroup=regroup)
+function combine(f::Base.Callable, gd::GroupedDataFrame;
+                 keepkeys::Bool=true, regroup::Bool=false)
+    return combine_helper(f, gd, keepkeys=keepkeys, regroup=regroup,
+                          copycols=true, keeprows=false)
+end
+
 combine(f::typeof(nrow), gd::GroupedDataFrame;
         keepkeys::Bool=true, regroup::Bool=false) =
     combine(gd, [nrow => :nrow], keepkeys=keepkeys, regroup=regroup)
@@ -608,13 +461,20 @@ function combine(p::Pair, gd::GroupedDataFrame;
     else
         cs = p_from
     end
-    return combine_helper(cs => p_to, gd, keepkeys=keepkeys, regroup=regroup)
+    return combine_helper(cs => p_to, gd, keepkeys=keepkeys, regroup=regroup,
+                          copycols=true, keeprows=false)
 end
 
-function combine(gd::GroupedDataFrame,
-                 @nospecialize(cs::Union{Pair, typeof(nrow),
-                                         ColumnIndex, MultiColumnIndex}...);
-                 keepkeys::Bool=true, regroup::Bool=false)
+combine(gd::GroupedDataFrame,
+        cs::Union{Pair, typeof(nrow), ColumnIndex, MultiColumnIndex}...;
+        keepkeys::Bool=true, regroup::Bool=false) =
+    _combine_executor(gd, cs..., keepkeys=keepkeys, regroup=regroup,
+                      copycols=true, keeprows=false)
+
+function _combine_executor(gd::GroupedDataFrame,
+                           @nospecialize(cs::Union{Pair, typeof(nrow),
+                                                   ColumnIndex, MultiColumnIndex}...);
+                 keepkeys::Bool, regroup::Bool, copycols::Bool, keeprows::Bool)
     @assert !isempty(cs)
     cs_vec = []
     for p in cs
@@ -687,7 +547,8 @@ function combine(gd::GroupedDataFrame,
     end
     f = Pair[first(x) => first(last(x)) for x in cs_norm]
     nms = Symbol[last(last(x)) for x in cs_norm]
-    return combine_helper(f, gd, nms, keepkeys=keepkeys, regroup=regroup)
+    return combine_helper(f, gd, nms, keepkeys=keepkeys, regroup=regroup,
+                          copycols=copycols, keeprows=keeprows)
 end
 
 function combine(gd::GroupedDataFrame; f...)
@@ -703,25 +564,94 @@ end
 
 function combine_helper(f, gd::GroupedDataFrame,
                         nms::Union{AbstractVector{Symbol},Nothing}=nothing;
-                        keepkeys::Bool, regroup::Bool)
+                        keepkeys::Bool, regroup::Bool,
+                        copycols::Bool, keeprows::Bool)
     if regroup && !keepkeys
         throw(ArgumentError("keepkeys=false when regroup=true is not allowed"))
     end
     if length(gd) > 0
-        idx, valscat = _combine(f, gd, nms)
-        keepkeys || return valscat
+        idx, valscat = _combine(f, gd, nms, copycols, keeprows)
+        keepkeys || regroup || return valscat
         keys = groupcols(gd)
         for key in keys
-            if hasproperty(valscat, key) &&
-               !isequal(valscat[!, key], view(parent(gd)[!, key], idx))
-               throw(ArgumentError("column :$key in returned data frame " *
-                                   "is not equal to grouping key :$key"))
+            if hasproperty(valscat, key)
+                if keeprows
+                    isequal(valscat[!, key], parent(gd)[!, key]) ||
+                    throw(ArgumentError("column :$key in returned data frame " *
+                                        "is not equal to grouping key :$key"))
+
+                else
+                    isequal(valscat[!, key], view(parent(gd)[!, key], idx)) ||
+                    throw(ArgumentError("column :$key in returned data frame " *
+                                        "is not equal to grouping key :$key"))
+                end
             end
         end
-        return hcat!(parent(gd)[idx, gd.cols],
-                     select(valscat, Not(intersect(keys, _names(valscat))), copycols=false))
+        if keeprows
+            newparent = hcat!(select(parent(gd), gd.cols, copycols=copycols),
+                              select(valscat, Not(intersect(keys, _names(valscat))),
+                                     copycols=false))
+        else
+            newparent = hcat!(parent(gd)[idx, gd.cols],
+                              select(valscat, Not(intersect(keys, _names(valscat))),
+                                     copycols=false))
+        end
+        regroup || return newparent
+
+        if length(idx) == 0
+            @assert nrow(newparent) == 0
+            return GroupedDataFrame(newparent, collect(1:length(gd.cols)), Int[],
+                                    Int[], Int[], Int[], 0, Dict{Any,Int}())
+        end
+        if keeprows
+            # in this case we are sure that the result GroupedDataFrame has the
+            # same structure as the source
+            # we do not copy data as it should be safe - we never mutate fields of gd
+            if isnothing(getfield(gd, :keymap))
+                return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx,
+                                        gd.starts, gd.ends, gd.ngroups, nothing)
+            else
+                return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx,
+                                        gd.starts, gd.ends, gd.ngroups, gd.keymap)
+            end
+        else
+            starts = Vector{Int}(undef, length(gd))
+            ends = Vector{Int}(undef, length(gd))
+            starts[1] = 1
+            j = 1
+            for i in 2:length(idx)
+                if idx[i] != idx[i-1]
+                    j += 1
+                    starts[j] = i
+                    ends[j-1] = i - 1
+                end
+            end
+            # it is impossible to get more groups in the output than we had initially
+            @assert j <= length(gd)
+            # In case some groups have to be dropped
+            resize!(starts, j)
+            resize!(ends, j)
+            ends[end] = length(idx)
+
+            groups = zeros(Int, length(idx))
+            for i in 1:j
+                @inbounds for k in starts[i]:ends[i]
+                    groups[k] = i
+                end
+            end
+            # all groups must be filled
+            @assert minimum(grouups) == 1
+
+            return GroupedDataFrame(newparent, collect(1:length(gd.cols)), groups,
+                                    collect(1:length(idx)), starts, ends, j, nothing)
+        end
     else
-        return keepkeys ? parent(gd)[1:0, gd.cols] : DataFrame()
+        if regroup
+            return GroupedDataFrame(parent(gd)[1:0, gd.cols], collect(1:length(gd.cols)),
+                                    Int[], Int[], Int[], Int[], 0, Dict{Any,Int}())
+        else
+            return keepkeys ? parent(gd)[1:0, gd.cols] : DataFrame()
+        end
     end
 end
 
@@ -1104,7 +1034,7 @@ end
 
 function _combine(f::AbstractVector{<:Pair},
                   gd::GroupedDataFrame, nms::AbstractVector{Symbol},
-                  copycols::Bool=true, keeprows::Bool=false) # TODO: remove these defaults
+                  copycols::Bool, keeprows::Bool) # TODO: remove these defaults
     # here f should be normalized and in a form of source_cols => fun
     @assert all(x -> first(x) isa Union{Int, AbstractVector{Int}, AsTable}, f)
     @assert all(x -> last(x) isa Union{Base.Callable, ByRow}, f)
@@ -1126,7 +1056,7 @@ function _combine(f::AbstractVector{<:Pair},
             @assert i == nrow(parent(gd))
         end
     else
-        idx_keeprows = nothing # should not be used but do not leave it unassigned
+        idx_keeprows = nothing
     end
 
     idx_agg = nothing
@@ -1148,7 +1078,7 @@ function _combine(f::AbstractVector{<:Pair},
             agg = check_aggregate(last(p))
             outcol = agg(incol, gd)
             res[i] = idx_agg, outcol
-        elseif keeprows && fun isa identity && !(source_cols isa AsTable)
+        elseif keeprows && fun === identity && !(source_cols isa AsTable)
             @assert source_cols isa Union{Int, AbstractVector{Int}}
             @assert length(source_cols) == 1
             outcol = parentdf[!, first(source_cols)]
@@ -1224,6 +1154,17 @@ function _combine(f::AbstractVector{<:Pair},
             end
         end
     end
+
+    for (i, (col_idx, col)) in enumerate(res)
+        if keeprows && res[i][1] !== idx_keeprows # we need to reorder the column
+            newcol = similar(col)
+            # we can probably make it more efficient, but I leave it as an optimization for the future
+            for i in axes(col, 1)
+                newcol[gd.idx[i]] = col[i]
+            end
+            res[i] = (col_idx, newcol)
+        end
+    end
     outcols = map(x -> x[2], res)
     # this check is redundant given we check idx above
     # but it is safer to double check and it is cheap
@@ -1496,188 +1437,28 @@ function _combine_tables_with_first!(first::Union{AbstractDataFrame,
     return outcols, colnames
 end
 
-"""
-    by(d::AbstractDataFrame, cols::Any, args...;
-       sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true)
-    by(fun::Union{Function, Type}, d::AbstractDataFrame, cols::Any;
-       sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true)
-    by(pair::Pair, d::AbstractDataFrame, cols::Any;
-       sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true)
-    by(d::AbstractDataFrame, cols::Any, fun::Union{Function, Type};
-       sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true)
-    by(d::AbstractDataFrame, cols::Any, pair::Pair;
-       sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true)
-
-Split-apply-combine in one step: apply `fun`, `pair` or `args` to each grouping
-in `df` based on grouping columns `cols`, and return a `DataFrame`.
-This is a shorthand for `combine` called on
-`groupby(df, cols, sort=sort, skipmissing=skipmissing)`.
-
-$F_ARGUMENT_RULES
-
-$F_TYPE_RULES
-
-$KWARG_PROCESSING_RULES
-
-The resulting data frame will be sorted if `sort=true` is passed.
-Otherwise, ordering of rows is undefined.
-
-If `skipmissing=true` rows with `missing` values in one of the grouping columns
-`cols` will be skipped.
-
-See [`groupby`](@ref) and [`combine`](@ref) and for details and more examples.
-
-# Examples
-```jldoctest
-julia> df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]),
-                      b = repeat([2, 1], outer=[4]),
-                      c = 1:8);
-
-julia> by(df, :a, :c => sum, nrow)
-4×3 DataFrame
-│ Row │ a     │ c_sum │ nrow  │
-│     │ Int64 │ Int64 │ Int64 │
-├─────┼───────┼───────┼───────┤
-│ 1   │ 1     │ 6     │ 2     │
-│ 2   │ 2     │ 8     │ 2     │
-│ 3   │ 3     │ 10    │ 2     │
-│ 4   │ 4     │ 12    │ 2     │
-
-julia> by(sdf -> sum(sdf.c), df, :a) # Slower variant
-4×2 DataFrame
-│ Row │ a     │ x1    │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 1     │ 6     │
-│ 2   │ 2     │ 8     │
-│ 3   │ 3     │ 10    │
-│ 4   │ 4     │ 12    │
-
-julia> by(df, :a) do d # do syntax for the slower variant
-           sum(d.c)
-       end
-4×2 DataFrame
-│ Row │ a     │ x1    │
-│     │ Int64 │ Int64 │
-├─────┼───────┼───────┤
-│ 1   │ 1     │ 6     │
-│ 2   │ 2     │ 8     │
-│ 3   │ 3     │ 10    │
-│ 4   │ 4     │ 12    │
-
-julia> by(df, :a, :c => (x -> sum(log, x)) => :sum_log_c) # specifying a name for target column
-4×2 DataFrame
-│ Row │ a     │ sum_log_c │
-│     │ Int64 │ Float64   │
-├─────┼───────┼───────────┤
-│ 1   │ 1     │ 1.60944   │
-│ 2   │ 2     │ 2.48491   │
-│ 3   │ 3     │ 3.04452   │
-│ 4   │ 4     │ 3.46574   │
-
-julia> by(df, :a, [:b, :c] .=> sum) # passing a vector of pairs
-4×3 DataFrame
-│ Row │ a     │ b_sum │ c_sum │
-│     │ Int64 │ Int64 │ Int64 │
-├─────┼───────┼───────┼───────┤
-│ 1   │ 1     │ 4     │ 6     │
-│ 2   │ 2     │ 2     │ 8     │
-│ 3   │ 3     │ 4     │ 10    │
-│ 4   │ 4     │ 2     │ 12    │
-
-julia> by(df, :a) do sdf # dropping group when DataFrame() is returned
-          sdf.c[1] != 1 ? sdf : DataFrame()
-       end
-6×3 DataFrame
-│ Row │ a     │ b     │ c     │
-│     │ Int64 │ Int64 │ Int64 │
-├─────┼───────┼───────┼───────┤
-│ 1   │ 2     │ 1     │ 2     │
-│ 2   │ 2     │ 1     │ 6     │
-│ 3   │ 3     │ 2     │ 3     │
-│ 4   │ 3     │ 2     │ 7     │
-│ 5   │ 4     │ 1     │ 4     │
-│ 6   │ 4     │ 1     │ 8     │
-
-julia> by(df, :a, :b => :b1, :c => :c1,
-               [:b, :c] => +, keepkeys=false) # auto-splatting, renaming and keepkeys
-8×3 DataFrame
-│ Row │ b1    │ c1    │ b_c_+ │
-│     │ Int64 │ Int64 │ Int64 │
-├─────┼───────┼───────┼───────┤
-│ 1   │ 2     │ 1     │ 3     │
-│ 2   │ 2     │ 5     │ 7     │
-│ 3   │ 1     │ 2     │ 3     │
-│ 4   │ 1     │ 6     │ 7     │
-│ 5   │ 2     │ 3     │ 5     │
-│ 6   │ 2     │ 7     │ 9     │
-│ 7   │ 1     │ 4     │ 5     │
-│ 8   │ 1     │ 8     │ 9     │
-
-julia> by(df, :a, :b, :c => sum) # passing columns and broadcasting
-8×3 DataFrame
-│ Row │ a     │ b     │ c_sum │
-│     │ Int64 │ Int64 │ Int64 │
-├─────┼───────┼───────┼───────┤
-│ 1   │ 1     │ 2     │ 6     │
-│ 2   │ 1     │ 2     │ 6     │
-│ 3   │ 2     │ 1     │ 8     │
-│ 4   │ 2     │ 1     │ 8     │
-│ 5   │ 3     │ 2     │ 10    │
-│ 6   │ 3     │ 2     │ 10    │
-│ 7   │ 4     │ 1     │ 12    │
-│ 8   │ 4     │ 1     │ 12    │
-
-julia> by(df, :a, [:b, :c] .=> Ref)
-4×3 DataFrame
-│ Row │ a     │ b_Ref    │ c_Ref    │
-│     │ Int64 │ SubArra… │ SubArra… │
-├─────┼───────┼──────────┼──────────┤
-│ 1   │ 1     │ [2, 2]   │ [1, 5]   │
-│ 2   │ 2     │ [1, 1]   │ [2, 6]   │
-│ 3   │ 3     │ [2, 2]   │ [3, 7]   │
-│ 4   │ 4     │ [1, 1]   │ [4, 8]   │
-
-julia> by(df, :a, AsTable(:) => Ref)
-4×2 DataFrame
-│ Row │ a     │ a_b_c_Ref                            │
-│     │ Int64 │ NamedTuple…                          │
-├─────┼───────┼──────────────────────────────────────┤
-│ 1   │ 1     │ (a = [1, 1], b = [2, 2], c = [1, 5]) │
-│ 2   │ 2     │ (a = [2, 2], b = [1, 1], c = [2, 6]) │
-│ 3   │ 3     │ (a = [3, 3], b = [2, 2], c = [3, 7]) │
-│ 4   │ 4     │ (a = [4, 4], b = [1, 1], c = [4, 8]) │
+select(gd::GroupedDataFrame, args...;
+       copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) =
+    _combine_executor(gd, args..., copycols=copycols, keepkeys=keepkeys,
+                      regroup=regroup, keeprows=true)
+
+DataFrames.transform(gd::GroupedDataFrame, args...;
+          copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) =
+    select(gd, :, args..., copycols=copycols, keepkeys=keepkeys,
+           regroup=regroup)
+
+function select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false)
+    newdf = select(gd, args..., copycols=false, regroup=false)
+    df = parent(gd)
+    copy!(_columns(df), _columns(newdf))
+    x = index(df)
+    copy!(_names(x), _names(newdf))
+    empty!(x.lookup)
+    for (i, n) in enumerate(x.names)
+        x.lookup[n] = i
+    end
+    return regroup ? gd : df
+end
 
-julia> by(df, :a, :, AsTable(Not(:a)) => sum)
-8×4 DataFrame
-│ Row │ a     │ b     │ c     │ b_c_sum │
-│     │ Int64 │ Int64 │ Int64 │ Int64   │
-├─────┼───────┼───────┼───────┼─────────┤
-│ 1   │ 1     │ 2     │ 1     │ 3       │
-│ 2   │ 1     │ 2     │ 5     │ 7       │
-│ 3   │ 2     │ 1     │ 2     │ 3       │
-│ 4   │ 2     │ 1     │ 6     │ 7       │
-│ 5   │ 3     │ 2     │ 3     │ 5       │
-│ 6   │ 3     │ 2     │ 7     │ 9       │
-│ 7   │ 4     │ 1     │ 4     │ 5       │
-│ 8   │ 4     │ 1     │ 8     │ 9       │
-```
-"""
-by(f::Union{Base.Callable, Pair}, d::AbstractDataFrame, cols::Any;
-   sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) =
-    combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), f,
-            keepkeys=keepkeys)
-by(d::AbstractDataFrame, cols::Any, f::Base.Callable;
-   sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) =
-    combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), f,
-            keepkeys=keepkeys)
-by(d::AbstractDataFrame, cols::Any, f::Pair;
-   sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) =
-    combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), f,
-            keepkeys=keepkeys)
-
-by(d::AbstractDataFrame, cols::Any, f::Union{Pair, typeof(nrow),
-                                             ColumnIndex, MultiColumnIndex}...;
-   sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) =
-    combine(groupby(d, cols, sort=sort, skipmissing=skipmissing),
-            f..., keepkeys=keepkeys)
+transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) =
+    select!(gd, :, args..., regroup=regroup)

From 6908ee89d02d4d04763ea3dfcaea12a6159a91a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 28 Apr 2020 14:54:28 +0200
Subject: [PATCH 06/29] update DataFrame constructor

---
 src/groupeddataframe/groupeddataframe.jl | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
index 6de06309d2..f04888cad9 100644
--- a/src/groupeddataframe/groupeddataframe.jl
+++ b/src/groupeddataframe/groupeddataframe.jl
@@ -79,7 +79,7 @@ Base.names(gd::GroupedDataFrame) = names(gd.parent)
 Base.names(gd::GroupedDataFrame, cols) = names(gd.parent, cols)
 _names(gd::GroupedDataFrame) = _names(gd.parent)
 
-function DataFrame(gd::GroupedDataFrame; copycols::Bool=true)
+function DataFrame(gd::GroupedDataFrame; copycols::Bool=true, keepkeys::Bool=true)
     if !copycols
         throw(ArgumentError("It is not possible to construct a `DataFrame`" *
                             "from GroupedDataFrame with `copycols=false`"))
@@ -94,7 +94,11 @@ function DataFrame(gd::GroupedDataFrame; copycols::Bool=true)
         doff += n
     end
     resize!(idx, doff - 1)
-    parent(gd)[idx, :]
+    if keepkeys
+        return parent(gd)[idx, :]
+    else
+        return parent(gd)[idx, Not(gd.cols)]
+    end
 end
 
 

From 7b644dde03b55cef63b53a15472cf132c99084ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 28 Apr 2020 15:04:05 +0200
Subject: [PATCH 07/29] fix handling of aggregates

---
 src/groupeddataframe/splitapplycombine.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index e5cb15bb4a..a73eff4535 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -1,7 +1,3 @@
-#
-# groupby(), map(), combine(), by() and related
-#
-
 """
     groupby(d::AbstractDataFrame, cols; sort=false, skipmissing=false)
 
@@ -1141,7 +1137,7 @@ function _combine(f::AbstractVector{<:Pair},
                     if isnothing(agg2idx_map)
                         agg2idx_map = _agg2idx_map_helper(idx, idx_agg)
                     end
-                    res[i] = idx, res[i][2][agg2idx_map]
+                    res[i] = idx_agg, res[i][2][agg2idx_map]
                 elseif idx != res[i][1]
                     if keeprows
                         throw(ArgumentError("all functions must return vectors of " *
@@ -1155,6 +1151,10 @@ function _combine(f::AbstractVector{<:Pair},
         end
     end
 
+    # remember that here first field in res[i] is not useful - it is just needed
+    # to keep track how the column was generated
+    # a correct index is stored in idx variable
+
     for (i, (col_idx, col)) in enumerate(res)
         if keeprows && res[i][1] !== idx_keeprows # we need to reorder the column
             newcol = similar(col)

From 27532350b7068be01ac5337701b6b9c84ec9b421 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 28 Apr 2020 17:25:00 +0200
Subject: [PATCH 08/29] code cleanup

---
 src/groupeddataframe/splitapplycombine.jl | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index a73eff4535..144f84f505 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -636,7 +636,7 @@ function combine_helper(f, gd::GroupedDataFrame,
                 end
             end
             # all groups must be filled
-            @assert minimum(grouups) == 1
+            @assert minimum(groups) == 1
 
             return GroupedDataFrame(newparent, collect(1:length(gd.cols)), groups,
                                     collect(1:length(idx)), starts, ends, j, nothing)
@@ -1172,16 +1172,20 @@ function _combine(f::AbstractVector{<:Pair},
     return idx, DataFrame(collect(AbstractVector, outcols), nms)
 end
 
-function _combine(fun::Base.Callable, gd::GroupedDataFrame, ::Nothing)
+function _combine(fun::Base.Callable, gd::GroupedDataFrame, ::Nothing,
+                  copycols::Bool, keeprows::Bool)
+    @assert copycols && !keeprows
     firstres = fun(gd[1])
     idx, outcols, nms = _combine_multicol(firstres, fun, gd, nothing)
     valscat = DataFrame(collect(AbstractVector, outcols), nms)
     return idx, valscat
 end
 
-function _combine(p::Pair, gd::GroupedDataFrame, ::Nothing)
+function _combine(p::Pair, gd::GroupedDataFrame, ::Nothing,
+                  copycols::Bool, keeprows::Bool)
     # here p should not be normalized as we allow tabular return value from fun
     # map and combine should not dispatch here if p is isagg
+    @assert copycols && !keeprows
     source_cols, (fun, out_col) = normalize_selection(index(parent(gd)), p)
     parentdf = parent(gd)
     if source_cols isa Int

From 2a03190e1be0d4cffe56d1908754faa057d8b792 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 28 Apr 2020 19:54:25 +0200
Subject: [PATCH 09/29] improve canonical check + start rewriting tests

---
 src/groupeddataframe/groupeddataframe.jl  |   4 -
 src/groupeddataframe/splitapplycombine.jl |  17 ++-
 test/grouping.jl                          | 124 ++++++++++++----------
 test/string.jl                            |  25 ++---
 4 files changed, 84 insertions(+), 86 deletions(-)

diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
index f04888cad9..5a4349845e 100644
--- a/src/groupeddataframe/groupeddataframe.jl
+++ b/src/groupeddataframe/groupeddataframe.jl
@@ -1,7 +1,3 @@
-#
-# Type definition and basic methods
-#
-
 """
     GroupedDataFrame
 
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 144f84f505..b9123f88dd 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -169,16 +169,15 @@ function groupby(df::AbstractDataFrame, cols;
     return gd
 end
 
-function _check_cannonical(gdf::GroupedDataFrame)
-    gmin, gmax = extrema(gdf.groups)
-    @assert length(gdf.starts) == length(gdf.ends) == gmax
-    @assert gmin <= 1
-    (gdf.starts[1] != 1 || gdf.ends[end] != nrow(parent(gdf))) && return false
-    for i in 2:length(gdf.starts)
-        gdf.starts[i] - gdf.ends[i-1] != 1 && return false
+function _check_cannonical(gd::GroupedDataFrame)
+    groups = gd.groups
+    isempty(groups) && return true
+    maxseen = 1
+    for g in groups
+        1 <= g <= maxseen + 1 || return false
+        maxseen = max(maxseen, g)
     end
-    # gmin == 0 means we have dropped groups which is not possible here
-    @assert gmin == 1
+    @assert maxseen == gd.ngroups
     return true
 end
 
diff --git a/test/grouping.jl b/test/grouping.jl
index c9587302c6..349b06b98b 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -28,12 +28,20 @@ function _levels!(x::PooledArray, levels::AbstractVector)
 end
 _levels!(x::CategoricalArray, levels::AbstractVector) = levels!(x, levels)
 
-function groupby_checked(df::AbstractDataFrame, keys, args...; kwargs...)
-    ogd = groupby(df, keys, args...; kwargs...)
-
+function validate_gdf(ogd::GroupedDataFrame)
     # To return original object to test when indices have not been computed
     gd = deepcopy(ogd)
 
+    @assert allunique(gd.cols)
+    @assert minimum(gd.cols) >= 1
+    @assert maximum(gd.cols) <= ncol(parent(gd))
+
+    g = sort!(unique(gd.groups))
+    @assert 0 <= g[1] <= 1
+    @assert g == g[1]:g[end]
+    @assert length(gd.starts) == length(gd.ends) == g[end]
+    @assert isperm(gd.idx)
+
     # checking that groups field is consistent with other fields
     # (since == and isequal do not use it)
     # and that idx is increasing per group
@@ -54,9 +62,6 @@ function groupby_checked(df::AbstractDataFrame, keys, args...; kwargs...)
 
         # correct start-end relations
         for i in eachindex(se)
-            firstkeys = gd.parent[gd.idx[se[i][1]], gd.cols]
-            # all grouping keys must be equal within a group
-            @assert all(j -> gd.parent[gd.idx[j], gd.cols] ≅ firstkeys, se[i][1]:se[i][2])
             @assert se[i][1] <= se[i][2]
             if i > 1
                 # the blocks returned by groupby must be continuous
@@ -73,7 +78,12 @@ function groupby_checked(df::AbstractDataFrame, keys, args...; kwargs...)
         @test allunique(eachrow(gd.parent[gd.idx[gd.starts], gd.cols]))
     end
 
-    ogd
+end
+
+function groupby_checked(df::AbstractDataFrame, keys, args...; kwargs...)
+    ogd = groupby(df, keys, args...; kwargs...)
+    validate_gdf(ogd)
+    return ogd
 end
 
 @testset "parent" begin
@@ -86,11 +96,11 @@ end
 @testset "consistency" begin
     df = DataFrame(a = [1, 1, 2, 2], b = [5, 6, 7, 8], c = 1:4)
     push!(df.c, 5)
-    @test_throws AssertionError gd = groupby(df, :a)
+    @test_throws AssertionError groupby(df, :a)
 
     df = DataFrame(a = [1, 1, 2, 2], b = [5, 6, 7, 8], c = 1:4)
     push!(DataFrames._columns(df), df[:, :a])
-    @test_throws AssertionError gd = groupby(df, :a)
+    @test_throws AssertionError groupby(df, :a)
 end
 
 @testset "accepted columns" begin
@@ -142,40 +152,13 @@ end
         sres3 = sort(res3, colssym)
         sres4 = sort(res4, colssym)
 
-        # by() without groups sorting
-        @test sort(by(identity, df, cols), colssym) == shcatdf
-        @test sort(by(df -> df[1, :], df, cols), colssym) ==
-            shcatdf[.!nonunique(shcatdf, colssym), :]
-        @test by(f1, df, cols) == res
-        @test by(f2, df, cols) == res
-        @test rename(by(f3, df, cols), :x1 => :xmax) == res
-        @test by(f4, df, cols) == res2
-        @test by(f5, df, cols) == res2
-        @test by(f6, df, cols) == res3
-        @test sort(by(f7, df, cols), colssym) == sres4
-        @test sort(by(f8, df, cols), colssym) == sres4
-
-        # by() with groups sorting
-        @test by(identity, df, cols, sort=true) == shcatdf
-        @test by(df -> df[1, :], df, cols, sort=true) ==
-            shcatdf[.!nonunique(shcatdf, colssym), :]
-        @test by(f1, df, cols, sort=true) == sres
-        @test by(f2, df, cols, sort=true) == sres
-        @test rename(by(f3, df, cols, sort=true), :x1 => :xmax) == sres
-        @test by(f4, df, cols, sort=true) == sres2
-        @test by(f5, df, cols, sort=true) == sres2
-        @test by(f6, df, cols, sort=true) == sres3
-        @test by(f7, df, cols, sort=true) == sres4
-        @test by(f8, df, cols, sort=true) == sres4
-
-        @test by(f1, df, [:a]) == by(f1, df, :a)
-        @test by(f1, df, [:a], sort=true) == by(f1, df, :a, sort=true)
-
         # groupby() without groups sorting
         gd = groupby_checked(df, cols)
         @test names(parent(gd))[gd.cols] == string.(colssym)
         df_comb = combine(identity, gd)
         @test sort(df_comb, colssym) == shcatdf
+        @test sort(combine(df -> df[1, :], gd), colssym) ==
+            shcatdf[.!nonunique(shcatdf, colssym), :]
         df_ref = DataFrame(gd)
         @test sort(hcat(df_ref[!, cols], df_ref[!, Not(cols)]), colssym) == shcatdf
         @test df_ref.x == df_comb.x
@@ -196,6 +179,8 @@ end
             @test all(gd[i][!, colssym[2]] .== sres[i, colssym[2]])
         end
         @test combine(identity, gd) == shcatdf
+        @test combine(df -> df[1, :], gd) ==
+            shcatdf[.!nonunique(shcatdf, colssym), :]
         df_ref = DataFrame(gd)
         @test hcat(df_ref[!, cols], df_ref[!, Not(cols)]) == shcatdf
         @test combine(f1, gd) == sres
@@ -207,10 +192,10 @@ end
         @test combine(f7, gd) == sres4
         @test combine(f8, gd) == sres4
 
-        # map() without and with groups sorting
+        # combine() with regroup without and with groups sorting
         for sort in (false, true)
             gd = groupby_checked(df, cols, sort=sort)
-            v = map(d -> d[:, [:x]], gd)
+            v = combine(d -> d[:, [:x]], gd, regroup=true)
             @test length(gd) == length(v)
             nms = [colssym; :x]
             @test v[1] == gd[1][:, nms]
@@ -219,24 +204,33 @@ end
                 v[3] == gd[3][:, nms] &&
                 v[4] == gd[4][:, nms]
             @test names(parent(v))[v.cols] == string.(colssym)
-            v = map(f1, gd)
-            @test vcat(v[1], v[2], v[3], v[4]) == by(f1, df, cols, sort=sort)
-            v = map(f2, gd)
-            @test vcat(v[1], v[2], v[3], v[4]) == by(f2, df, cols, sort=sort)
-            v = map(f3, gd)
-            @test vcat(v[1], v[2], v[3], v[4]) == by(f3, df, cols, sort=sort)
-            v = map(f4, gd)
-            @test vcat(v[1], v[2], v[3], v[4]) == by(f4, df, cols, sort=sort)
-            v = map(f5, gd)
-            @test vcat(v[1], v[2], v[3], v[4]) == by(f5, df, cols, sort=sort)
-            v = map(f5, gd)
-            @test vcat(v[1], v[2], v[3], v[4]) == by(f5, df, cols, sort=sort)
-            v = map(f6, gd)
-            @test vcat(v[1], v[2], v[3], v[4]) == by(f6, df, cols, sort=sort)
-            v = map(f7, gd)
-            @test vcat(v[1], v[2], v[3], v[4]) == by(f7, df, cols, sort=sort)
-            v = map(f8, gd)
-            @test vcat(v[1], v[2], v[3], v[4]) == by(f8, df, cols, sort=sort)
+            v = combine(f1, gd, regroup=true)
+            @test extrema(v.groups) == extrema(gd.groups)
+            @test vcat(v[1], v[2], v[3], v[4]) == combine(f1, gd)
+            v = combine(f2, gd, regroup=true)
+            @test extrema(v.groups) == extrema(gd.groups)
+            @test vcat(v[1], v[2], v[3], v[4]) == combine(f2, gd)
+            v = combine(f3, gd, regroup=true)
+            @test extrema(v.groups) == extrema(gd.groups)
+            @test vcat(v[1], v[2], v[3], v[4]) == combine(f3, gd)
+            v = combine(f4, gd, regroup=true)
+            @test extrema(v.groups) == extrema(gd.groups)
+            @test vcat(v[1], v[2], v[3], v[4]) == combine(f4, gd)
+            v = combine(f5, gd, regroup=true)
+            @test extrema(v.groups) == extrema(gd.groups)
+            @test vcat(v[1], v[2], v[3], v[4]) == combine(f5, gd)
+            v = combine(f5, gd, regroup=true)
+            @test extrema(v.groups) == extrema(gd.groups)
+            @test vcat(v[1], v[2], v[3], v[4]) == combine(f5, gd)
+            v = combine(f6, gd, regroup=true)
+            @test extrema(v.groups) == extrema(gd.groups)
+            @test vcat(v[1], v[2], v[3], v[4]) == combine(f6, gd)
+            v = combine(f7, gd, regroup=true)
+            @test extrema(v.groups) == extrema(gd.groups)
+            @test vcat(v[1], v[2], v[3], v[4]) == combine(f7, gd)
+            v = combine(f8, gd, regroup=true)
+            @test extrema(v.groups) == extrema(gd.groups)
+            @test vcat(v[1], v[2], v[3], v[4]) == combine(f8, gd)
         end
     end
 
@@ -259,7 +253,7 @@ end
     df = DataFrame(v1=x, v2=x)
     groupby_checked(df, [:v1, :v2])
 
-    df2 = by(e->1, DataFrame(x=Int64[]), :x)
+    df2 = combine(e->1, groupby(DataFrame(x=Int64[]), :x))
     @test size(df2) == (0, 1)
     @test sum(df2.x) == 0
 
@@ -1972,4 +1966,16 @@ end
     @test_throws ArgumentError by(df, :g, AsTable([:x, :y]) => ByRow(x -> df[1, :]))
 end
 
+@testset "test correctness of regrouping" begin
+    df = DataFrame(g=[2,2,1,3,1,2,1,2,3])
+    gdf = groupby(df, :g)
+    gdf2 = combine(identity, gdf, regroup=true)
+    @test combine(gdf, :g => sum) == combine(gdf2, :g => sum)
+
+    df.id = 1:9
+    @test select(gdf, :g => sum) ==
+          sort!(combine(gdf, :g => sum, :id), :id)[:, Not(end)]
+    @test select(gdf2, :g => sum) == combine(gdf2, :g => sum, :g)
+end
+
 end # module
diff --git a/test/string.jl b/test/string.jl
index 817dc830aa..233b9cd8b8 100644
--- a/test/string.jl
+++ b/test/string.jl
@@ -166,25 +166,22 @@ end
     @test haskey(k[1], :a) == haskey(k[1], "a") == false
     @test  k[1].g == k[1]."g" == k[1][:g] == k[1]["g"]
 
-    @test by(df, :g, :a) == by(df, "g", "a") == combine(gdf, :a) == combine(gdf, "a") ==
-          by(df, :g, [:a]) == by(df, "g", ["a"]) == combine(gdf, [:a]) == combine(gdf, ["a"])
+    @test combine(gdf, :a) == combine(gdf, "a") ==
+          combine(gdf, [:a]) == combine(gdf, ["a"])
 
-    @test map("a" => identity, gdf) == map(:a => identity, gdf)
-    @test map(["a"] => identity, gdf) == map([:a] => identity, gdf)
-    @test map(nrow => :n, gdf) == map(nrow => "n", gdf)
+    @test combine("a" => identity, gdf, regroup=true) ==
+          combine(:a => identity, gdf, regroup=true)
+    @test combine(["a"] => identity, gdf, regroup=true) ==
+          combine([:a] => identity, gdf, regroup=true)
+    @test combine(nrow => :n, gdf, regroup=true) ==
+          combine(nrow => "n", gdf, regroup=true)
 
     @test combine("a" => identity, gdf) == combine(:a => identity, gdf) ==
-          combine(gdf, "a" => identity) == combine(gdf, :a => identity) ==
-          by("a" => identity, df, :g) == by(:a => identity, df, :g) ==
-          by(df, :g, "a" => identity) == by(df, :g, :a => identity)
+          combine(gdf, "a" => identity) == combine(gdf, :a => identity)
     @test combine(["a"] => identity, gdf) == combine([:a] => identity, gdf) ==
-          combine(gdf, ["a"] => identity) == combine(gdf, [:a] => identity) ==
-          by(["a"] => identity, df, :g) == by([:a] => identity, df, :g) ==
-          by(df, :g, ["a"] => identity) == by(df, :g, [:a] => identity)
+          combine(gdf, ["a"] => identity) == combine(gdf, [:a] => identity)
     @test combine(nrow => :n, gdf) == combine(nrow => "n", gdf) ==
-          combine(gdf, nrow => :n) == combine(gdf, nrow => "n") ==
-          by(nrow => :n, df, :g) == by(nrow => "n", df, :g) ==
-          by(df, :g, nrow => :n) == by(df, :g, nrow => "n")
+          combine(gdf, nrow => :n) == combine(gdf, nrow => "n")
 end
 
 @testset "DataFrameRow" begin

From 7b86eb8244f8bff0c2aef030215b9c1ae662ca15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 28 Apr 2020 23:52:19 +0200
Subject: [PATCH 10/29] allow changing sort order of groups in canonical test

---
 src/groupeddataframe/splitapplycombine.jl | 23 ++------
 test/grouping.jl                          | 65 ++++++++++++-----------
 2 files changed, 37 insertions(+), 51 deletions(-)

diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index b9123f88dd..13d7cc0db1 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -169,17 +169,7 @@ function groupby(df::AbstractDataFrame, cols;
     return gd
 end
 
-function _check_cannonical(gd::GroupedDataFrame)
-    groups = gd.groups
-    isempty(groups) && return true
-    maxseen = 1
-    for g in groups
-        1 <= g <= maxseen + 1 || return false
-        maxseen = max(maxseen, g)
-    end
-    @assert maxseen == gd.ngroups
-    return true
-end
+_check_cannonical(gd::GroupedDataFrame) = !any(==(0), gd.groups)
 
 const F_TYPE_RULES =
     """
@@ -602,13 +592,8 @@ function combine_helper(f, gd::GroupedDataFrame,
             # in this case we are sure that the result GroupedDataFrame has the
             # same structure as the source
             # we do not copy data as it should be safe - we never mutate fields of gd
-            if isnothing(getfield(gd, :keymap))
-                return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx,
-                                        gd.starts, gd.ends, gd.ngroups, nothing)
-            else
-                return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx,
-                                        gd.starts, gd.ends, gd.ngroups, gd.keymap)
-            end
+            return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx,
+                                    gd.starts, gd.ends, gd.ngroups, getfield(gd, :keymap))
         else
             starts = Vector{Int}(undef, length(gd))
             ends = Vector{Int}(undef, length(gd))
@@ -1036,7 +1021,7 @@ function _combine(f::AbstractVector{<:Pair},
 
     if keeprows
         if !_check_cannonical(gd)
-            throw(ArgumentError("select or transform functions require that" *
+            throw(ArgumentError("select or transform functions require that " *
                                 "GroupedDataFrame is not sorted or subsetted"))
         end
         idx_keeprows = Vector{Int}(undef, nrow(parent(gd)))
diff --git a/test/grouping.jl b/test/grouping.jl
index 349b06b98b..9888728410 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -41,6 +41,7 @@ function validate_gdf(ogd::GroupedDataFrame)
     @assert g == g[1]:g[end]
     @assert length(gd.starts) == length(gd.ends) == g[end]
     @assert isperm(gd.idx)
+    @assert length(gd.idx) == length(gd.groups) == nrow(parent(gd))
 
     # checking that groups field is consistent with other fields
     # (since == and isequal do not use it)
@@ -88,7 +89,7 @@ end
 
 @testset "parent" begin
     df = DataFrame(a = [1, 1, 2, 2], b = [5, 6, 7, 8])
-    gd = groupby(df, :a)
+    gd = groupby_checked(df, :a)
     @test parent(gd) === df
     @test_throws ArgumentError identity.(gd)
 end
@@ -96,20 +97,20 @@ end
 @testset "consistency" begin
     df = DataFrame(a = [1, 1, 2, 2], b = [5, 6, 7, 8], c = 1:4)
     push!(df.c, 5)
-    @test_throws AssertionError groupby(df, :a)
+    @test_throws AssertionError groupby_checked(df, :a)
 
     df = DataFrame(a = [1, 1, 2, 2], b = [5, 6, 7, 8], c = 1:4)
     push!(DataFrames._columns(df), df[:, :a])
-    @test_throws AssertionError groupby(df, :a)
+    @test_throws AssertionError groupby_checked(df, :a)
 end
 
 @testset "accepted columns" begin
     df = DataFrame(A=[1,1,1,2,2,2], B=[1,2,1,2,1,2], C=1:6)
-    @test groupby(df, [1,2]) == groupby(df, 1:2) == groupby(df, [:A, :B])
-    @test groupby(df, [2,1]) == groupby(df, 2:-1:1) == groupby(df, [:B, :A])
+    @test groupby_checked(df, [1,2]) == groupby_checked(df, 1:2) == groupby_checked(df, [:A, :B])
+    @test groupby_checked(df, [2,1]) == groupby_checked(df, 2:-1:1) == groupby_checked(df, [:B, :A])
 end
 
-@testset "by, groupby and map(::Function, ::GroupedDataFrame)" begin
+@testset "groupby and combine(::Function, ::GroupedDataFrame)" begin
     Random.seed!(1)
     df = DataFrame(a = repeat(Union{Int, Missing}[1, 3, 2, 4], outer=[2]),
                    b = repeat(Union{Int, Missing}[2, 1], outer=[4]),
@@ -152,7 +153,7 @@ end
         sres3 = sort(res3, colssym)
         sres4 = sort(res4, colssym)
 
-        # groupby() without groups sorting
+        # groupby_checked() without groups sorting
         gd = groupby_checked(df, cols)
         @test names(parent(gd))[gd.cols] == string.(colssym)
         df_comb = combine(identity, gd)
@@ -171,7 +172,7 @@ end
         @test sort(combine(f7, gd), colssym) == sort(res4, colssym)
         @test sort(combine(f8, gd), colssym) == sort(res4, colssym)
 
-        # groupby() with groups sorting
+        # groupby_checked() with groups sorting
         gd = groupby_checked(df, cols, sort=true)
         @test names(parent(gd))[gd.cols] == string.(colssym)
         for i in 1:length(gd)
@@ -253,7 +254,7 @@ end
     df = DataFrame(v1=x, v2=x)
     groupby_checked(df, [:v1, :v2])
 
-    df2 = combine(e->1, groupby(DataFrame(x=Int64[]), :x))
+    df2 = combine(e->1, groupby_checked(DataFrame(x=Int64[]), :x))
     @test size(df2) == (0, 1)
     @test sum(df2.x) == 0
 
@@ -349,7 +350,7 @@ end
     df = DataFrame(x = [1, 2, 3], y = [2, 3, 1])
 
     # Test function returning DataFrameRow
-    res = by(d -> DataFrameRow(d, 1, :), df, :x)
+    res = combine(d -> DataFrameRow(d, 1, :), groupby_checked(df, :x))
     @test res == DataFrame(x=df.x, y=df.y)
 
     # Test function returning Tuple
@@ -359,7 +360,7 @@ end
     # Test with some groups returning empty data frames
     @test by(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), df, :x) ==
         DataFrame(x=[2, 3], z=[1, 1])
-    v = map(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), groupby_checked(df, :x))
+    v = combine(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), groupby_checked(df, :x), regroup=true)
     @test length(v) == 2
     @test vcat(v[1], v[2]) == DataFrame(x=[2, 3], z=[1, 1])
 
@@ -692,7 +693,7 @@ end
     @test_throws ArgumentError by(df, :a, nrow, nrow)
     @test_throws ArgumentError by(df, :a, [nrow])
 
-    gd = groupby(df, :a)
+    gd = groupby_checked(df, :a)
 
     # Only test that different combine syntaxes work,
     # and rely on tests below for deeper checks
@@ -958,7 +959,7 @@ end
 
 @testset "combine and map with columns named like grouping keys" begin
     df = DataFrame(x=["a", "a", "b", missing], y=1:4)
-    gd = groupby(df, :x)
+    gd = groupby_checked(df, :x)
     @test combine(identity, gd) ≅ df
     @test combine(d -> d[:, [2, 1]], gd) ≅ df
     @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
@@ -966,7 +967,7 @@ end
     @test map(d -> d[:, [2, 1]], gd) ≅ gd
     @test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
 
-    gd = groupby(df, :x, skipmissing=true)
+    gd = groupby_checked(df, :x, skipmissing=true)
     @test combine(identity, gd) == df[1:3, :]
     @test combine(d -> d[:, [2, 1]], gd) == df[1:3, :]
     @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
@@ -1199,7 +1200,7 @@ end
         \\end{tabular}
         """
 
-    gd = groupby(DataFrame(a=[Symbol("&")], b=["&"]), [1,2])
+    gd = groupby_checked(DataFrame(a=[Symbol("&")], b=["&"]), [1,2])
     summary_str = summary(gd)
     @test summary_str == "$GroupedDataFrame with 1 group based on keys: a, b"
     @test sprint(show, gd) === """
@@ -1231,7 +1232,7 @@ end
         \\end{tabular}
         """
 
-        gd = groupby(DataFrame(a = [1,2], b = [1.0, 2.0]), :a)
+        gd = groupby_checked(DataFrame(a = [1,2], b = [1.0, 2.0]), :a)
         @test sprint(show, "text/csv", gd) == """
         "a","b"
         1,1.0
@@ -1331,7 +1332,7 @@ end
     df = DataFrame(a=[2, 2, missing, missing, 1, 1, 3, 3], b=1:8)
     for dosort in (false, true), doskipmissing in (false, true)
         @test by(df, :a, :b=>sum, sort=dosort, skipmissing=doskipmissing) ≅
-            combine(groupby(df, :a, sort=dosort, skipmissing=doskipmissing), :b=>sum)
+            combine(groupby_checked(df, :a, sort=dosort, skipmissing=doskipmissing), :b=>sum)
     end
 end
 
@@ -1376,7 +1377,7 @@ end
         "│ 2   │ 2     │ 1     │ 2     │\n│ 3   │ 2     │ 2     │ 3     │"
 
     df = DataFrame(a=[1, 1, 2, 2, 2], b=1:5)
-    gd = groupby(df, :a)
+    gd = groupby_checked(df, :a)
     @test_throws ArgumentError combine(gd)
 end
 
@@ -1486,8 +1487,8 @@ end
     @test cnt == length(gd)
 
     # Indexing using another GroupedDataFrame instance should fail
-    gd2 = groupby(df, cols, skipmissing=true)
-    gd3 = groupby(df, cols, skipmissing=true)
+    gd2 = groupby_checked(df, cols, skipmissing=true)
+    gd3 = groupby_checked(df, cols, skipmissing=true)
     @test gd2 == gd3  # Use GDF's without missing so they compare equal
     @test_throws ErrorException gd3[first(keys(gd2))]
 
@@ -1501,7 +1502,7 @@ end
                    b = repeat(1:2, outer=[6]),
                    c = 1:12)
 
-    gd = groupby(df, [:a, :b])
+    gd = groupby_checked(df, [:a, :b])
 
     @test map(repr, keys(gd)) == [
         "GroupKey: (a = :foo, b = 1)",
@@ -1649,7 +1650,7 @@ end
 end
 
 @testset "haskey for GroupKey" begin
-    gdf = groupby(DataFrame(a=1, b=2, c=3), [:a, :b])
+    gdf = groupby_checked(DataFrame(a=1, b=2, c=3), [:a, :b])
     k = keys(gdf)[1]
     @test !haskey(k, 0)
     @test haskey(k, 1)
@@ -1666,7 +1667,7 @@ end
     @test_throws MethodError haskey(gdf, true)
 
     @test haskey(gdf, k)
-    @test_throws ArgumentError haskey(gdf, keys(groupby(DataFrame(a=1,b=2,c=3), [:a, :b]))[1])
+    @test_throws ArgumentError haskey(gdf, keys(groupby_checked(DataFrame(a=1,b=2,c=3), [:a, :b]))[1])
     @test_throws BoundsError haskey(gdf, DataFrames.GroupKey(gdf, 0))
     @test_throws BoundsError haskey(gdf, DataFrames.GroupKey(gdf, 2))
     @test haskey(gdf, (1,2))
@@ -1733,11 +1734,11 @@ end
     @test by(df, :g, :x1 => :z) ==
           by(df, :g, [:x1 => :z]) ==
           by(:x1 => :z, df, :g) ==
-          combine(groupby(df, :g), :x1 => :z) ==
-          combine(groupby(df, :g), [:x1 => :z]) ==
-          combine(:x1 => :z, groupby(df, :g)) ==
+          combine(groupby_checked(df, :g), :x1 => :z) ==
+          combine(groupby_checked(df, :g), [:x1 => :z]) ==
+          combine(:x1 => :z, groupby_checked(df, :g)) ==
           DataFrame(g=[1,1,1,2,2,2], z=1:6)
-    @test map(:x1 => :z, groupby(df, :g)) == groupby(DataFrame(g=[1,1,1,2,2,2], z=1:6), :g)
+    @test map(:x1 => :z, groupby_checked(df, :g)) == groupby_checked(DataFrame(g=[1,1,1,2,2,2], z=1:6), :g)
 end
 
 @testset "hard tabular return value cases" begin
@@ -1810,7 +1811,7 @@ end
 @testset "additional do_call tests" begin
     Random.seed!(1234)
     df = DataFrame(g = rand(1:10, 100), x1 = rand(1:1000, 100))
-    gdf = groupby(df, :g)
+    gdf = groupby_checked(df, :g)
 
     @test combine(gdf, [] => () -> 1, :x1 => length) == combine(gdf) do sdf
         (;[:function => 1, :x1_length => nrow(sdf)]...)
@@ -1927,7 +1928,7 @@ end
 
 @testset "AsTable tests" begin
     df = DataFrame(g=[1,1,1,2,2], x=1:5, y=6:10)
-    gdf = groupby(df, :g)
+    gdf = groupby_checked(df, :g)
 
     # whole column 4 options of single pair passed
     @test by(df, :g , AsTable([:x, :y]) => Ref) ==
@@ -1936,7 +1937,7 @@ end
           combine(AsTable([:x, :y]) => Ref, gdf) ==
           DataFrame(g=1:2, x_y_Ref=[(x=[1,2,3], y=[6,7,8]), (x=[4,5], y=[9,10])])
     @test map(AsTable([:x, :y]) => Ref, gdf) ==
-          groupby(by(df, :g , AsTable([:x, :y]) => Ref), :g)
+          groupby_checked(by(df, :g , AsTable([:x, :y]) => Ref), :g)
 
     @test by(df, :g, AsTable(1) => Ref) ==
           combine(gdf, AsTable(1) => Ref) ==
@@ -1951,7 +1952,7 @@ end
           DataFrame(g=[1,1,1,2,2],
                     x_y_function=[[(x=1,y=6)], [(x=2,y=7)], [(x=3,y=8)], [(x=4,y=9)], [(x=5,y=10)]])
     @test map(AsTable([:x, :y]) => ByRow(x -> [x]), gdf) ==
-          groupby(by(df, :g, AsTable([:x, :y]) => ByRow(x -> [x])), :g)
+          groupby_checked(by(df, :g, AsTable([:x, :y]) => ByRow(x -> [x])), :g)
 
     # whole column and ByRow test for multiple pairs passed
     @test by(df, :g, [:x, :y], [AsTable(v) => (x -> -x[1]) for v in [:x, :y]]) ==
@@ -1968,7 +1969,7 @@ end
 
 @testset "test correctness of regrouping" begin
     df = DataFrame(g=[2,2,1,3,1,2,1,2,3])
-    gdf = groupby(df, :g)
+    gdf = groupby_checked(df, :g)
     gdf2 = combine(identity, gdf, regroup=true)
     @test combine(gdf, :g => sum) == combine(gdf2, :g => sum)
 

From cb94903eb021076134ea02887a48b02e71424c28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 29 Apr 2020 12:25:29 +0200
Subject: [PATCH 11/29] make old tests pass

---
 src/abstractdataframe/selection.jl        |   5 +-
 src/deprecated.jl                         |  18 +-
 src/groupeddataframe/splitapplycombine.jl |   2 +-
 test/deprecated.jl                        |   8 +
 test/grouping.jl                          | 607 ++++++++++------------
 test/select.jl                            |  68 ++-
 6 files changed, 348 insertions(+), 360 deletions(-)

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index f211ee09b8..830565791c 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -188,7 +188,7 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable},
         if !allow_resizing_newdf[] && ncol(newdf) == 0 && length(res) != nrow(df)
             throw(ArgumentError("length $(length(res)) of vector returned from " *
                                 "function $fun is different than number of rows" *
-                                "$(nrow(df)) of the source data frame."))
+                                " $(nrow(df)) of the source data frame."))
         end
         allow_resizing_newdf[] = false
         respar = parent(res)
@@ -699,7 +699,8 @@ function _manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
                 push!(cs_vec, v)
             end
         end
-        return _process(dfv, [normalize_selection(index(dfv), c) for c in cs_vec], true, true)
+        return _process(dfv, [normalize_selection(index(dfv), c) for c in cs_vec],
+                        true, keeprows)
     else
         # we do not support transformations here
         # newinds contains only indexing; making it Vector{Any} avoids some compilation
diff --git a/src/deprecated.jl b/src/deprecated.jl
index e95b82aa6e..b7d713e917 100644
--- a/src/deprecated.jl
+++ b/src/deprecated.jl
@@ -360,28 +360,28 @@ end
 export aggregate
 
 function aggregate(d::AbstractDataFrame, f::Any; sort::Bool=false)
-    df = select(d, names(d) .=> [f])
+    df = combine(d, names(d) .=> [f])
     if sort
         Base.depwarn("`aggregate(d, f, sort=true)` is deprecated. " *
-                     "Instead use `sort!(select(d, names(d) .=> f))`.", :aggregate)
+                     "Instead use `sort!(combine(d, names(d) .=> f))`.", :aggregate)
         sort!(df)
     else
         Base.depwarn("`aggregate(d, f)` is deprecated. " *
-                     "Instead use `select(d, names(d) .=> f)`.", :aggregate)
+                     "Instead use `combine(d, names(d) .=> f)`.", :aggregate)
     end
     return df
 end
 
 function aggregate(d::AbstractDataFrame, fs::AbstractVector; sort::Bool=false)
-    df = hcat([select(d, names(d) .=> [f]) for f in fs]..., makeunique=true)
+    df = hcat([combine(d, names(d) .=> [f]) for f in fs]..., makeunique=true)
     if sort
         Base.depwarn("`aggregate(d, fs, sort=true)` is deprecated. Instead" *
-                     " use `sort!(select(d, [names(d) .=> f for f in fs]...))` " *
+                     " use `sort!(combine(d, [names(d) .=> f for f in fs]...))` " *
                      "if functions in `fs` have unique names.", :aggregate)
         sort!(df)
     else
         Base.depwarn("`aggregate(d, fs)` is deprecated. Instead use " *
-                     "`select(d, [names(d) .=> f for f in fs]...)` if functions " *
+                     "`combine(d, [names(d) .=> f for f in fs]...)` if functions " *
                      "in `fs` have unique names.", :aggregate)
     end
     return df
@@ -424,7 +424,8 @@ function aggregate(d::AbstractDataFrame, cols, f::Any;
                    sort::Bool=false, skipmissing::Bool=false)
     Base.depwarn("`aggregate(d, cols, f, sort=$sort, skipmissing=$skipmissing)` " *
                  "is deprecated. Instead use " *
-                 "by(gd, cols, names(gd) .=> f, sort=$sort, skipmissing=$skipmissing)`",
+                 "combine(groupby(d, cols, sort=$sort, skipmissing=$skipmissing)," *
+                 " names(d, Not(cols)) .=> f)`",
                  :aggregate)
     gd = groupby(d, cols, sort=sort, skipmissing=skipmissing)
     df = combine(gd, valuecols(gd) .=> [f])
@@ -435,7 +436,8 @@ function aggregate(d::AbstractDataFrame, cols, fs::AbstractVector;
                    sort::Bool=false, skipmissing::Bool=false)
     Base.depwarn("`aggregate(d, cols, fs, sort=$sort, skipmissing=$skipmissing)` " *
                  " is deprecated. Instead use " *
-                 "by(gd, cols, [names(gd) .=> f for f in fs]..., sort=$sort, skipmissing=$skipmissing)`" *
+                 "combine(groupby(d, cols, sort=$sort, skipmissing=$skipmissing), "*
+                 "[names(d, Not(cols)) .=> f for f in fs]...)`" *
                  " if functions in `fs` have unique names.", :aggregate)
     gd = groupby(d, cols, sort=sort, skipmissing=skipmissing)
     df = hcat([combine(gd, valuecols(gd) .=> [f], keepkeys=i==1) for (i, f) in enumerate(fs)]...,
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 13d7cc0db1..5ebc609c5e 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -436,7 +436,7 @@ function combine(p::Pair, gd::GroupedDataFrame;
     # verify if it is not better to use a fast path, which we achieve
     # by moving to combine(::GroupedDataFrame, ::AbstractVector) method
     if isagg(p_from => (p_to isa Pair ? first(p_to) : p_to)) || p_from === nrow
-        return combine(gd, [p], keepkeys=keepkeys)
+        return combine(gd, [p], keepkeys=keepkeys, regroup=regroup)
     end
 
     if p_from isa Tuple
diff --git a/test/deprecated.jl b/test/deprecated.jl
index 39236be972..11550a2b2e 100644
--- a/test/deprecated.jl
+++ b/test/deprecated.jl
@@ -696,6 +696,14 @@ end
         deleterows!(DataFrame(x=[1, 2]), [true, false]) == DataFrame(x=[2])
 end
 
+@testset "by skipmissing and sort" begin
+    df = DataFrame(a=[2, 2, missing, missing, 1, 1, 3, 3], b=1:8)
+    for dosort in (false, true), doskipmissing in (false, true)
+        @test by(df, :a, :b=>sum, sort=dosort, skipmissing=doskipmissing) ≅
+            combine(groupby(df, :a, sort=dosort, skipmissing=doskipmissing), :b=>sum)
+    end
+end
+
 global_logger(old_logger)
 
 end # module
diff --git a/test/grouping.jl b/test/grouping.jl
index 9888728410..a0285dac76 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -32,14 +32,20 @@ function validate_gdf(ogd::GroupedDataFrame)
     # To return original object to test when indices have not been computed
     gd = deepcopy(ogd)
 
-    @assert allunique(gd.cols)
-    @assert minimum(gd.cols) >= 1
-    @assert maximum(gd.cols) <= ncol(parent(gd))
+    if !isempty(gd.cols)
+        @assert allunique(gd.cols)
+        @assert minimum(gd.cols) >= 1
+        @assert maximum(gd.cols) <= ncol(parent(gd))
+    end
 
     g = sort!(unique(gd.groups))
-    @assert 0 <= g[1] <= 1
-    @assert g == g[1]:g[end]
-    @assert length(gd.starts) == length(gd.ends) == g[end]
+    if length(gd) > 0
+        @assert 0 <= g[1] <= 1
+        @assert g == g[1]:g[end]
+        @assert length(gd.starts) == length(gd.ends) == g[end]
+    else
+        @assert length(gd.starts) == length(gd.ends) == 0
+    end
     @assert isperm(gd.idx)
     @assert length(gd.idx) == length(gd.groups) == nrow(parent(gd))
 
@@ -78,7 +84,7 @@ function validate_gdf(ogd::GroupedDataFrame)
         # all groups have different grouping keys
         @test allunique(eachrow(gd.parent[gd.idx[gd.starts], gd.cols]))
     end
-
+    return ogd
 end
 
 function groupby_checked(df::AbstractDataFrame, keys, args...; kwargs...)
@@ -106,8 +112,10 @@ end
 
 @testset "accepted columns" begin
     df = DataFrame(A=[1,1,1,2,2,2], B=[1,2,1,2,1,2], C=1:6)
-    @test groupby_checked(df, [1,2]) == groupby_checked(df, 1:2) == groupby_checked(df, [:A, :B])
-    @test groupby_checked(df, [2,1]) == groupby_checked(df, 2:-1:1) == groupby_checked(df, [:B, :A])
+    @test groupby_checked(df, [1,2]) == groupby_checked(df, 1:2) ==
+          groupby_checked(df, [:A, :B])
+    @test groupby_checked(df, [2,1]) == groupby_checked(df, 2:-1:1) ==
+          groupby_checked(df, [:B, :A])
 end
 
 @testset "groupby and combine(::Function, ::GroupedDataFrame)" begin
@@ -180,7 +188,7 @@ end
             @test all(gd[i][!, colssym[2]] .== sres[i, colssym[2]])
         end
         @test combine(identity, gd) == shcatdf
-        @test combine(df -> df[1, :], gd, cols, sort=true) ==
+        @test combine(df -> df[1, :], gd) ==
             shcatdf[.!nonunique(shcatdf, colssym), :]
         df_ref = DataFrame(gd)
         @test hcat(df_ref[!, cols], df_ref[!, Not(cols)]) == shcatdf
@@ -194,43 +202,41 @@ end
         @test combine(f8, gd) == sres4
 
         # combine() with regroup without and with groups sorting
-        for sort in (false, true)
-            gd = groupby_checked(df, cols, sort=sort)
-            v = combine(d -> d[:, [:x]], gd, regroup=true)
+        for dosort in (false, true)
+            gd = groupby_checked(df, cols, sort=dosort)
+            v = validate_gdf(combine(d -> d[:, [:x]], gd, regroup=true))
             @test length(gd) == length(v)
             nms = [colssym; :x]
             @test v[1] == gd[1][:, nms]
-            @test v[1] == gd[1][:, nms] &&
-                v[2] == gd[2][:, nms] &&
-                v[3] == gd[3][:, nms] &&
-                v[4] == gd[4][:, nms]
+            @test v[1] == gd[1][:, nms] && v[2] == gd[2][:, nms] &&
+                v[3] == gd[3][:, nms] && v[4] == gd[4][:, nms]
             @test names(parent(v))[v.cols] == string.(colssym)
-            v = combine(f1, gd, regroup=true)
-            @test extrema(v.grous) == extrema(gd.groups)
+            v = validate_gdf(combine(f1, gd, regroup=true))
+            @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f1, gd)
-            v = combine(f2, gd, regroup=true)
-            @test extrema(v.grous) == extrema(gd.groups)
+            v = validate_gdf(combine(f2, gd, regroup=true))
+            @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f2, gd)
-            v = combine(f3, gd, regroup=true)
-            @test extrema(v.grous) == extrema(gd.groups)
+            v = validate_gdf(combine(f3, gd, regroup=true))
+            @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f3, gd)
-            v = combine(f4, gd, regroup=true)
-            @test extrema(v.grous) == extrema(gd.groups)
+            v = validate_gdf(combine(f4, gd, regroup=true))
+            @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f4, gd)
-            v = combine(f5, gd, regroup=true)
-            @test extrema(v.grous) == extrema(gd.groups)
+            v = validate_gdf(combine(f5, gd, regroup=true))
+            @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f5, gd)
-            v = combine(f5, gd, regroup=true)
-            @test extrema(v.grous) == extrema(gd.groups)
+            v = validate_gdf(combine(f5, gd, regroup=true))
+            @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f5, gd)
-            v = combine(f6, gd, regroup=true)
-            @test extrema(v.grous) == extrema(gd.groups)
+            v = validate_gdf(combine(f6, gd, regroup=true))
+            @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f6, gd)
-            v = combine(f7, gd, regroup=true)
-            @test extrema(v.grous) == extrema(gd.groups)
+            v = validate_gdf(combine(f7, gd, regroup=true))
+            @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f7, gd)
-            v = combine(f8, gd, regroup=true)
-            @test extrema(v.grous) == extrema(gd.groups)
+            v = validate_gdf(combine(f8, gd, regroup=true))
+            @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f8, gd)
         end
     end
@@ -322,112 +328,109 @@ end
     df = DataFrame(Key1 = CategoricalArray(["A", "A", "B", "B", "B", "A"]),
                     Key2 = CategoricalArray(["A", "B", "A", "B", "B", "A"]),
                     Value = 1:6)
-
+    gdf = groupby_checked(df, :Key1)
     # Check that CategoricalArray column is preserved when returning a value...
-    res = combine(d -> DataFrame(x=d[1, :Key2]), groupby_checked(df, :Key1))
+    res = combine(d -> DataFrame(x=d[1, :Key2]), gdf)
     @test typeof(res.x) == typeof(df.Key2)
-    res = combine(d -> (x=d[1, :Key2],), groupby_checked(df, :Key1))
+    res = combine(d -> (x=d[1, :Key2],), gdf)
     @test typeof(res.x) == typeof(df.Key2)
     # ...and when returning an array
-    res = combine(d -> DataFrame(x=d.Key1), groupby_checked(df, :Key1))
+    res = combine(d -> DataFrame(x=d.Key1), gdf)
     @test typeof(res.x) == typeof(df.Key1)
 
     # Check that CategoricalArray and String give a String...
-    res = combine(d -> d.Key1 == ["A", "A"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"),
-                  groupby_checked(df, :Key1))
+    res = combine(d -> d.Key1 == ["A", "A"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"), gdf)
     @test res.x isa Vector{String}
-    res = combine(d -> d.Key1 == ["A", "A"] ? (x=d[1, :Key1],) : (x="C",),
-                  groupby_checked(df, :Key1))
+    res = combine(d -> d.Key1 == ["A", "A"] ? (x=d[1, :Key1],) : (x="C",), gdf)
     @test res.x isa Vector{String}
     # ...even when CategoricalValue comes second
-    res = combine(d -> d.Key1 == ["B", "B"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"),
-                  groupby_checked(df, :Key1))
+    res = combine(d -> d.Key1 == ["B", "B"] ? DataFrame(x=d[1, :Key1]) : DataFrame(x="C"), gdf)
     @test res.x isa Vector{String}
-    res = combine(d -> d.Key1 == ["B", "B"] ? (x=d[1, :Key1],) : (x="C",),
-                  groupby_checked(df, :Key1))
+    res = combine(d -> d.Key1 == ["B", "B"] ? (x=d[1, :Key1],) : (x="C",), gdf)
     @test res.x isa Vector{String}
 
     df = DataFrame(x = [1, 2, 3], y = [2, 3, 1])
-
+    gdf = groupby_checked(df, :x)
     # Test function returning DataFrameRow
-    res = combine(d -> DataFrameRow(d, 1, :), groupby_checked(df, :x))
+    res = combine(d -> DataFrameRow(d, 1, :), gdf)
     @test res == DataFrame(x=df.x, y=df.y)
 
     # Test function returning Tuple
-    res = by(d -> (sum(d.y),), df, :x)
+    res = combine(d -> (sum(d.y),), gdf)
     @test res == DataFrame(x=df.x, x1=tuple.([2, 3, 1]))
 
     # Test with some groups returning empty data frames
-    @test by(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), df, :x) ==
+    @test combine(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), gdf) ==
         DataFrame(x=[2, 3], z=[1, 1])
-    v = combine(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), groupby_checked(df, :x), regroup=true)
+    v = validate_gdf(combine(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1),
+                             groupby_checked(df, :x), regroup=true))
     @test length(v) == 2
     @test vcat(v[1], v[2]) == DataFrame(x=[2, 3], z=[1, 1])
 
     # Test that returning values of different types works with NamedTuple
-    res = by(d -> d.x == [1] ? 1 : 2.0, df, :x)
+    res = combine(d -> d.x == [1] ? 1 : 2.0, gdf)
     @test res.x1 isa Vector{Float64}
     @test res.x1 == [1, 2, 2]
     # Two columns need to be widened at different times
-    res = by(d -> (a=d.x == [1] ? 1 : 2.0, b=d.x == [3] ? missing : "a"), df, :x)
+    res = combine(d -> (a=d.x == [1] ? 1 : 2.0, b=d.x == [3] ? missing : "a"), gdf)
     @test res.a isa Vector{Float64}
     @test res.a == [1, 2, 2]
     @test res.b isa Vector{Union{String,Missing}}
     @test res.b ≅ ["a", "a", missing]
     # Corner case: two columns need to be widened at the same time
-    res = by(d -> (a=d.x == [1] ? 1 : 2.0, b=d.x == [1] ? missing : "a"), df, :x)
+    res = combine(d -> (a=d.x == [1] ? 1 : 2.0, b=d.x == [1] ? missing : "a"), gdf)
     @test res.a isa Vector{Float64}
     @test res.a == [1, 2, 2]
     @test res.b isa Vector{Union{String,Missing}}
     @test res.b ≅ [missing, "a", "a"]
 
     # Test that returning values of different types works with DataFrame
-    res = by(d -> DataFrame(x1 = d.x == [1] ? 1 : 2.0), df, :x)
+    res = combine(d -> DataFrame(x1 = d.x == [1] ? 1 : 2.0), gdf)
     @test res.x1 isa Vector{Float64}
     @test res.x1 == [1, 2, 2]
     # Two columns need to be widened at different times
-    res = by(d -> DataFrame(a=d.x == [1] ? 1 : 2.0, b=d.x == [3] ? missing : "a"), df, :x)
+    res = combine(d -> DataFrame(a=d.x == [1] ? 1 : 2.0, b=d.x == [3] ? missing : "a"), gdf)
     @test res.a isa Vector{Float64}
     @test res.a == [1, 2, 2]
     @test res.b isa Vector{Union{String,Missing}}
     @test res.b ≅ ["a", "a", missing]
     # Corner case: two columns need to be widened at the same time
-    res = by(d -> DataFrame(a=d.x == [1] ? 1 : 2.0, b=d.x == [1] ? missing : "a"), df, :x)
+    res = combine(d -> DataFrame(a=d.x == [1] ? 1 : 2.0, b=d.x == [1] ? missing : "a"), gdf)
     @test res.a isa Vector{Float64}
     @test res.a == [1, 2, 2]
     @test res.b isa Vector{Union{String,Missing}}
     @test res.b ≅ [missing, "a", "a"]
 
     # Test return values with columns in different orders
-    @test by(d -> d.x == [1] ? (x1=1, x2=3) : (x2=2, x1=4), df, :x) ==
+    @test combine(d -> d.x == [1] ? (x1=1, x2=3) : (x2=2, x1=4), gdf) ==
         DataFrame(x=1:3, x1=[1, 4, 4], x2=[3, 2, 2])
-    @test by(d -> d.x == [1] ? DataFrame(x1=1, x2=3) : DataFrame(x2=2, x1=4), df, :x) ==
+    @test combine(d -> d.x == [1] ? DataFrame(x1=1, x2=3) : DataFrame(x2=2, x1=4), gdf) ==
         DataFrame(x=1:3, x1=[1, 4, 4], x2=[3, 2, 2])
 
     # Test with NamedTuple with columns of incompatible lengths
-    @test_throws DimensionMismatch by(d -> (x1=[1], x2=[3, 4]), df, :x)
-    @test_throws DimensionMismatch by(d -> d.x == [1] ? (x1=[1], x2=[3]) :
-                                                        (x1=[1], x2=[3, 4]), df, :x)
+    @test_throws DimensionMismatch combine(d -> (x1=[1], x2=[3, 4]), gdf)
+    @test_throws DimensionMismatch combine(d -> d.x == [1] ? (x1=[1], x2=[3]) :
+                                                        (x1=[1], x2=[3, 4]), gdf)
 
     # Test with incompatible return values
-    @test_throws ArgumentError by(d -> d.x == [1] ? (x1=1,) : DataFrame(x1=1), df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? DataFrame(x1=1) : (x1=1,), df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? NamedTuple() : (x1=1), df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? (x1=1) : NamedTuple(), df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? 1 : DataFrame(x1=1), df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? DataFrame(x1=1) : 1, df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? (x1=1) : (x1=[1]), df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? (x1=[1]) : (x1=1), df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? 1 : [1], df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? [1] : 1, df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? (x1=1, x2=1) : (x1=[1], x2=1), df, :x)
-    @test_throws ArgumentError by(d -> d.x == [1] ? (x1=[1], x2=1) : (x1=1, x2=1), df, :x)
+    @test_throws ArgumentError combine(d -> d.x == [1] ? (x1=1,) : DataFrame(x1=1), gdf)
+    @test_throws ArgumentError combine(d -> d.x == [1] ? DataFrame(x1=1) : (x1=1,), gdf)
+    @test_throws ArgumentError combine(d -> d.x == [1] ? NamedTuple() : (x1=1), gdf)
+    @test_throws ArgumentError combine(d -> d.x == [1] ? (x1=1) : NamedTuple(), gdf)
+    @test_throws ArgumentError combine(d -> d.x == [1] ? 1 : DataFrame(x1=1), gdf)
+    @test_throws ArgumentError combine(d -> d.x == [1] ? DataFrame(x1=1) : 1, gdf)
+    @test_throws ArgumentError combine(d -> d.x == [1] ? (x1=1) : (x1=[1]), gdf)
+    @test_throws ArgumentError combine(d -> d.x == [1] ? (x1=[1]) : (x1=1), gdf)
+    @test_throws ArgumentError combine(d -> d.x == [1] ? 1 : [1], gdf)
+    @test_throws ArgumentError combine(d -> d.x == [1] ? [1] : 1, gdf)
+    @test_throws ArgumentError combine(d -> d.x == [1] ? (x1=1, x2=1) : (x1=[1], x2=1), gdf)
+    @test_throws ArgumentError combine(d -> d.x == [1] ? (x1=[1], x2=1) : (x1=1, x2=1), gdf)
     # Special case allowed due to how implementation works
-    @test by(d -> d.x == [1] ? 1 : (x1=1), df, :x) == by(d -> 1, df, :x)
+    @test combine(d -> d.x == [1] ? 1 : (x1=1), gdf) == combine(d -> 1, gdf)
 
     # Test that columns names and types are respected for empty input
     df = DataFrame(x=Int[], y=String[])
-    res = by(d -> 1, df, :x)
+    res = combine(d -> 1, groupby_checked(df, :x))
     @test size(res) == (0, 1)
     @test res.x isa Vector{Int}
 
@@ -435,14 +438,14 @@ end
     df = DataFrame(x=[], y=[])
     gd = groupby_checked(df, :x)
     @test combine(df -> sum(df.x), gd) == DataFrame(x=[])
-    res = map(df -> sum(df.x), gd)
+    res = validate_gdf(combine(df -> sum(df.x), gd, regroup=true))
     @test length(res) == 0
     @test res.parent == DataFrame(x=[])
 
     # Test with zero groups in output
     df = DataFrame(A = [1, 2])
     gd = groupby_checked(df, :A)
-    gd2 = map(d -> DataFrame(), gd)
+    gd2 = validate_gdf(combine(d -> DataFrame(), gd, regroup=true))
     @test length(gd2) == 0
     @test gd.cols == [1]
     @test isempty(gd2.groups)
@@ -452,7 +455,7 @@ end
     @test parent(gd2) == DataFrame(A=[])
     @test eltype.(eachcol(parent(gd2))) == [Int]
 
-    gd2 = map(d -> DataFrame(X=Int[]), gd)
+    gd2 = validate_gdf(combine(d -> DataFrame(X=Int[]), gd, regroup=true))
     @test length(gd2) == 0
     @test gd.cols == [1]
     @test isempty(gd2.groups)
@@ -622,77 +625,13 @@ end
     @test DataFrame(df) == df
 end
 
-@testset "by, combine and map with pair interface" begin
+@testset "combine with pair interface" begin
     vexp = x -> exp.(x)
     Random.seed!(1)
     df = DataFrame(a = repeat([1, 3, 2, 4], outer=[2]),
                    b = repeat([2, 1], outer=[4]),
                    c = rand(Int, 8))
 
-    # Only test that different by syntaxes work,
-    # and rely on tests below for deeper checks
-    @test by(df, :a, :c => sum) ==
-        by(:c => sum, df, :a) ==
-        by(df, :a, :c => sum => :c_sum) ==
-        by(:c => sum => :c_sum, df, :a) ==
-        by(df, :a, [:c => sum]) ==
-        by(df, :a, [:c => sum => :c_sum]) ==
-        by(d -> (c_sum=sum(d.c),), df, :a) ==
-        by(df, :a, d -> (c_sum=sum(d.c),))
-
-    @test by(df, :a, :c => vexp) ==
-        by(:c => vexp, df, :a) ==
-        by(df, :a, :c => vexp => :c_function) ==
-        by(:c => vexp => :c_function, df, :a) ==
-        by(:c => c -> (c_function = vexp(c),), df, :a) ==
-        by(df, :a, :c => c -> (c_function = vexp(c),)) ==
-        by(df, :a, [:c => vexp]) ==
-        by(df, :a, [:c => vexp => :c_function]) ==
-        by(d -> (c_function=vexp(d.c),), df, :a) ==
-        by(df, :a, d -> (c_function=vexp(d.c),))
-
-    @test by(df, :a, :b => sum, :c => sum) ==
-        by(df, :a, :b => sum => :b_sum, :c => sum => :c_sum) ==
-        by(df, :a, [:b => sum, :c => sum]) ==
-        by(df, :a, [:b => sum => :b_sum, :c => sum => :c_sum]) ==
-        by(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), df, :a) ==
-        by(df, :a, d -> (b_sum=sum(d.b), c_sum=sum(d.c)))
-
-    @test by(df, :a, :b => vexp, :c => identity) ==
-        by(df, :a, :b => vexp => :b_function, :c => identity => :c_identity) ==
-        by(df, :a, [:b => vexp, :c => identity]) ==
-        by(df, :a, [:b => vexp => :b_function, :c => identity => :c_identity]) ==
-        by(d -> (b_function=vexp(d.b), c_identity=identity(d.c)), df, :a) ==
-        by(df, :a, d -> (b_function=vexp(d.b), c_identity=identity(d.c))) ==
-        by(df, :a, [:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=identity(c))) ==
-        by([:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=identity(c)), df, :a)
-
-    @test by(x -> extrema(x.c), df, :a) == by(:c => (x -> extrema(x)) => :x1, df, :a)
-    @test by(x -> x.b+x.c, df, :a) == by([:b,:c] => (+) => :x1, df, :a)
-    @test by(x -> (p=x.b, q=x.c), df, :a) ==
-          by([:b,:c] => (b,c) -> (p=b,q=c), df, :a) ==
-          by(df, :a, x -> (p=x.b, q=x.c)) ==
-          by(df, :a, [:b,:c] => (b,c) -> (p=b,q=c))
-    @test by(x -> DataFrame(p=x.b, q=x.c), df, :a) ==
-          by([:b,:c] => (b,c) -> DataFrame(p=b,q=c), df, :a) ==
-          by(df, :a, x -> DataFrame(p=x.b, q=x.c)) ==
-          by(df, :a, [:b,:c] => (b,c) -> DataFrame(p=b,q=c))
-    @test by(x -> [1 2; 3 4], df, :a) ==
-          by([:b,:c] => (b,c) -> [1 2; 3 4], df, :a) ==
-          by(df, :a, x -> [1 2; 3 4]) ==
-          by(df, :a, [:b,:c] => (b,c) -> [1 2; 3 4])
-    @test by(nrow, df, :a) == by(df, :a, nrow) == by(df, :a, [nrow => :nrow]) ==
-          by(df, :a, 1 => length => :nrow)
-    @test by(nrow => :res, df, :a) == by(df, :a, nrow => :res) ==
-          by(df, :a, [nrow => :res]) == by(df, :a, 1 => length => :res)
-    @test by(df, :a, nrow => :res, nrow, [nrow => :res2]) ==
-          by(df, :a, 1 => length => :res, 1 => length => :nrow, 1 => length => :res2)
-
-    @test_throws ArgumentError by([:b,:c] => ((b,c) -> [1 2; 3 4]) => :xxx, df, :a)
-    @test_throws ArgumentError by(df, :a, [:b,:c] => ((b,c) -> [1 2; 3 4]) => :xxx)
-    @test_throws ArgumentError by(df, :a, nrow, nrow)
-    @test_throws ArgumentError by(df, :a, [nrow])
-
     gd = groupby_checked(df, :a)
 
     # Only test that different combine syntaxes work,
@@ -703,50 +642,53 @@ end
         combine(:c => sum => :c_sum, gd) ==
         combine(gd, [:c => sum]) ==
         combine(gd, [:c => sum => :c_sum]) ==
-        combine(d -> (c_sum=sum(d.c),), gd) ==
-        combine(gd, d -> (c_sum=sum(d.c),))
+        combine(d -> (c_sum=sum(d.c),), gd)
+    @test_throws MethodError combine(gd, d -> (c_sum=sum(d.c),))
 
     @test combine(gd, :c => vexp) ==
         combine(:c => vexp, gd) ==
         combine(gd, :c => vexp => :c_function) ==
         combine(:c => vexp => :c_function, gd) ==
         combine(:c => c -> (c_function = vexp(c),), gd) ==
-        combine(gd, :c => c -> (c_function = vexp(c),)) ==
         combine(gd, [:c => vexp]) ==
         combine(gd, [:c => vexp => :c_function]) ==
-        combine(d -> (c_function=exp.(d.c),), gd) ==
-        combine(gd, d -> (c_function=exp.(d.c),))
+        combine(d -> (c_function=exp.(d.c),), gd)
+    @test_throws ArgumentError combine(gd, :c => c -> (c_function = vexp(c),))
+    @test_throws MethodError combine(gd, d -> (c_function=exp.(d.c),))
 
     @test combine(gd, :b => sum, :c => sum) ==
         combine(gd, :b => sum => :b_sum, :c => sum => :c_sum) ==
         combine(gd, [:b => sum, :c => sum]) ==
         combine(gd, [:b => sum => :b_sum, :c => sum => :c_sum]) ==
-        combine(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), gd) ==
-        combine(gd, d -> (b_sum=sum(d.b), c_sum=sum(d.c)))
+        combine(d -> (b_sum=sum(d.b), c_sum=sum(d.c)), gd)
+    @test_throws MethodError combine(gd, d -> (b_sum=sum(d.b), c_sum=sum(d.c)))
 
     @test combine(gd, :b => vexp, :c => identity) ==
         combine(gd, :b => vexp => :b_function, :c => identity => :c_identity) ==
         combine(gd, [:b => vexp, :c => identity]) ==
         combine(gd, [:b => vexp => :b_function, :c => identity => :c_identity]) ==
         combine(d -> (b_function=vexp(d.b), c_identity=d.c), gd) ==
-        combine(gd, d -> (b_function=vexp(d.b), c_identity=d.c)) ==
-        combine([:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=c), gd) ==
-        combine(gd, [:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=c))
+        combine([:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=c), gd)
+    @test_throws MethodError combine(gd, d -> (b_function=vexp(d.b), c_identity=d.c))
+    @test_throws ArgumentError combine(gd, [:b, :c] => (b, c) -> (b_function=vexp(b), c_identity=c))
 
     @test combine(x -> extrema(x.c), gd) == combine(:c => (x -> extrema(x)) => :x1, gd)
     @test combine(x -> x.b+x.c, gd) == combine([:b,:c] => (+) => :x1, gd)
     @test combine(x -> (p=x.b, q=x.c), gd) ==
-          combine([:b,:c] => (b,c) -> (p=b,q=c), gd) ==
-          combine(gd, x -> (p=x.b, q=x.c)) ==
-          combine(gd, [:b,:c] => (b,c) -> (p=b,q=c))
+          combine([:b,:c] => (b,c) -> (p=b,q=c), gd)
+    @test_throws MethodError combine(gd, x -> (p=x.b, q=x.c))
+    @test_throws ArgumentError combine(gd, [:b,:c] => (b,c) -> (p=b,q=c))
+
     @test combine(x -> DataFrame(p=x.b, q=x.c), gd) ==
-          combine([:b,:c] => (b,c) -> DataFrame(p=b,q=c), gd) ==
-          combine(gd, x -> DataFrame(p=x.b, q=x.c)) ==
-          combine(gd, [:b,:c] => (b,c) -> DataFrame(p=b,q=c))
+          combine([:b,:c] => (b,c) -> DataFrame(p=b,q=c), gd)
+    @test_throws MethodError combine(gd, x -> DataFrame(p=x.b, q=x.c))
+    @test_throws ArgumentError combine(gd, [:b,:c] => (b,c) -> DataFrame(p=b,q=c))
+
     @test combine(x -> [1 2; 3 4], gd) ==
-          combine([:b,:c] => (b,c) -> [1 2; 3 4], gd) ==
-          combine(gd, x -> [1 2; 3 4]) ==
-          combine(gd, [:b,:c] => (b,c) -> [1 2; 3 4])
+          combine([:b,:c] => (b,c) -> [1 2; 3 4], gd)
+    @test_throws MethodError combine(gd, x -> [1 2; 3 4])
+    @test_throws ArgumentError combine(gd, [:b,:c] => (b,c) -> [1 2; 3 4])
+
     @test combine(nrow, gd) == combine(gd, nrow) == combine(gd, [nrow => :nrow]) ==
           combine(gd, 1 => length => :nrow)
     @test combine(nrow => :res, gd) == combine(gd, nrow => :res) ==
@@ -758,62 +700,65 @@ end
     @test_throws ArgumentError combine(gd, nrow, nrow)
     @test_throws ArgumentError combine(gd, [nrow])
 
-    for f in (map, combine)
-        for col in (:c, 3)
-            @test f(col => sum, gd) == f(d -> (c_sum=sum(d.c),), gd)
-            @test f(col => x -> sum(x), gd) == f(d -> (c_function=sum(d.c),), gd)
-            @test f(col => x -> (z=sum(x),), gd) == f(d -> (z=sum(d.c),), gd)
-            @test f(col => x -> DataFrame(z=sum(x),), gd) == f(d -> (z=sum(d.c),), gd)
-            @test f(col => identity, gd) == f(d -> (c_identity=d.c,), gd)
-            @test f(col => x -> (z=x,), gd) == f(d -> (z=d.c,), gd)
-
-            @test f(col => sum => :xyz, gd) ==
-                f(d -> (xyz=sum(d.c),), gd)
-            @test f(col => (x -> sum(x)) => :xyz, gd) ==
-                f(d -> (xyz=sum(d.c),), gd)
-            @test f(col => (x -> (sum(x),)) => :xyz, gd) ==
-                f(d -> (xyz=(sum(d.c),),), gd)
-            @test f(nrow, gd) == f(d -> (nrow=length(d.c),), gd)
-            @test f(nrow => :res, gd) == f(d -> (res=length(d.c),), gd)
-            @test f(col => sum => :res, gd) == f(d -> (res=sum(d.c),), gd)
-            @test f(col => (x -> sum(x)) => :res, gd) == f(d -> (res=sum(d.c),), gd)
-            @test_throws ArgumentError f(col => (x -> (z=sum(x),)) => :xyz, gd)
-            @test_throws ArgumentError f(col => (x -> DataFrame(z=sum(x),)) => :xyz, gd)
-            @test_throws ArgumentError f(col => (x -> (z=x,)) => :xyz, gd)
-            @test_throws ArgumentError f(col => x -> (z=1, xzz=[1]), gd)
+    for col in (:c, 3)
+        @test combine(col => sum, gd) == combine(d -> (c_sum=sum(d.c),), gd)
+        @test combine(col => x -> sum(x), gd) == combine(d -> (c_function=sum(d.c),), gd)
+        @test combine(col => x -> (z=sum(x),), gd) == combine(d -> (z=sum(d.c),), gd)
+        @test combine(col => x -> DataFrame(z=sum(x),), gd) == combine(d -> (z=sum(d.c),), gd)
+        @test combine(col => identity, gd) == combine(d -> (c_identity=d.c,), gd)
+        @test combine(col => x -> (z=x,), gd) == combine(d -> (z=d.c,), gd)
+
+        @test combine(col => sum => :xyz, gd) ==
+            combine(d -> (xyz=sum(d.c),), gd)
+        @test combine(col => (x -> sum(x)) => :xyz, gd) ==
+            combine(d -> (xyz=sum(d.c),), gd)
+        @test combine(col => (x -> (sum(x),)) => :xyz, gd) ==
+            combine(d -> (xyz=(sum(d.c),),), gd)
+        @test combine(nrow, gd) == combine(d -> (nrow=length(d.c),), gd)
+        @test combine(nrow => :res, gd) == combine(d -> (res=length(d.c),), gd)
+        @test combine(col => sum => :res, gd) == combine(d -> (res=sum(d.c),), gd)
+        @test combine(col => (x -> sum(x)) => :res, gd) == combine(d -> (res=sum(d.c),), gd)
+        @test_throws ArgumentError combine(col => (x -> (z=sum(x),)) => :xyz, gd)
+        @test_throws ArgumentError combine(col => (x -> DataFrame(z=sum(x),)) => :xyz, gd)
+        @test_throws ArgumentError combine(col => (x -> (z=x,)) => :xyz, gd)
+        @test_throws ArgumentError combine(col => x -> (z=1, xzz=[1]), gd)
+    end
+    for cols in ([:b, :c], 2:3, [2, 3], [false, true, true]), regroup in (true, false)
+        @test combine(cols => (b,c) -> (y=exp.(b), z=c), gd, regroup=regroup) ==
+            combine(d -> (y=exp.(d.b), z=d.c), gd, regroup=regroup)
+        @test combine(cols => (b,c) -> [exp.(b) c], gd, regroup=regroup) ==
+            combine(d -> [exp.(d.b) d.c], gd, regroup=regroup)
+        @test combine(cols => ((b,c) -> sum(b) + sum(c)) => :xyz, gd, regroup=regroup) ==
+            combine(d -> (xyz=sum(d.b) + sum(d.c),), gd, regroup=regroup)
+        if eltype(cols) === Bool
+            cols2 = [[false, true, false], [false, false, true]]
+            @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[2] => sum),
+                                             gd, regroup=regroup)
+            @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[1] => sum),
+                                             gd, regroup=regroup)
+            @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[2] => x -> first(x)),
+                                             gd, regroup=regroup)
+        else
+            cols2 = cols
+            @test combine(gd, cols2[1] => sum => :xyz, cols2[2] => sum => :xzz, regroup=regroup) ==
+                combine(d -> (xyz=sum(d.b), xzz=sum(d.c)), gd, regroup=regroup)
+            @test combine(gd, cols2[1] => sum => :xyz, cols2[1] => sum => :xzz, regroup=regroup) ==
+                combine(d -> (xyz=sum(d.b), xzz=sum(d.b)), gd, regroup=regroup)
+            @test combine(gd, cols2[1] => sum => :xyz,
+                    cols2[2] => (x -> first(x)) => :xzz, regroup=regroup) ==
+                combine(d -> (xyz=sum(d.b), xzz=first(d.c)), gd, regroup=regroup)
+            @test combine(gd, cols2[1] => vexp => :xyz,
+                    cols2[2] => sum => :xzz, regroup=regroup) ==
+                combine(d -> (xyz=vexp(d.b), xzz=fill(sum(d.c), length(vexp(d.b)))),
+                        gd, regroup=regroup)
         end
-        for cols in ([:b, :c], 2:3, [2, 3], [false, true, true])
-            @test f(cols => (b,c) -> (y=exp.(b), z=c), gd) ==
-                f(d -> (y=exp.(d.b), z=d.c), gd)
-            @test f(cols => (b,c) -> [exp.(b) c], gd) ==
-                f(d -> [exp.(d.b) d.c], gd)
-            @test f(cols => ((b,c) -> sum(b) + sum(c)) => :xyz, gd) ==
-                f(d -> (xyz=sum(d.b) + sum(d.c),), gd)
-            if eltype(cols) === Bool
-                cols2 = [[false, true, false], [false, false, true]]
-                @test_throws MethodError f((xyz = cols[1] => sum, xzz = cols2[2] => sum), gd)
-                @test_throws MethodError f((xyz = cols[1] => sum, xzz = cols2[1] => sum), gd)
-                @test_throws MethodError f((xyz = cols[1] => sum, xzz = cols2[2] => x -> first(x)), gd)
-            else
-                cols2 = cols
-                if f === combine
-                    @test f(gd, cols2[1] => sum => :xyz, cols2[2] => sum => :xzz) ==
-                        f(d -> (xyz=sum(d.b), xzz=sum(d.c)), gd)
-                    @test f(gd, cols2[1] => sum => :xyz, cols2[1] => sum => :xzz) ==
-                        f(d -> (xyz=sum(d.b), xzz=sum(d.b)), gd)
-                    @test f(gd, cols2[1] => sum => :xyz,
-                            cols2[2] => (x -> first(x)) => :xzz) ==
-                        f(d -> (xyz=sum(d.b), xzz=first(d.c)), gd)
-                    @test f(gd, cols2[1] => vexp => :xyz,
-                            cols2[2] => sum => :xzz) ==
-                        f(d -> (xyz=vexp(d.b), xzz=fill(sum(d.c), length(vexp(d.b)))), gd)
-                end
-            end
 
-            @test_throws ArgumentError f(cols => (b,c) -> (y=exp.(b), z=sum(c)), gd)
-            @test_throws ArgumentError f(cols2 => ((b,c) -> DataFrame(y=exp.(b), z=sum(c))) => :xyz, gd)
-            @test_throws ArgumentError f(cols2 => ((b,c) -> [exp.(b) c]) => :xyz, gd)
-        end
+        @test_throws ArgumentError combine(cols => (b,c) -> (y=exp.(b), z=sum(c)),
+                                           gd, regroup=regroup)
+        @test_throws ArgumentError combine(cols2 => ((b,c) -> DataFrame(y=exp.(b),
+                                           z=sum(c))) => :xyz, gd, regroup=regroup)
+        @test_throws ArgumentError combine(cols2 => ((b,c) -> [exp.(b) c]) => :xyz,
+                                           gd, regroup=regroup)
     end
 end
 
@@ -942,9 +887,9 @@ Base.isless(::TestType, ::TestType) = false
     end
 
     df = DataFrame(x = [1, 1, 2, 2], y = Any[1, 2.0, 3.0, 4.0])
-    res = by(df, :x, :y => maximum => :z)
+    res = combine(groupby_checked(df, :x), :y => maximum => :z)
     @test res.z isa Vector{Float64}
-    @test res.z == by(df, :x, :y => (x -> maximum(x)) => :z).z
+    @test res.z == combine(groupby_checked(df, :x), :y => (x -> maximum(x)) => :z).z
 
     # Test maximum when no promotion rule exists
     df = DataFrame(x = [1, 1, 2, 2], y = [1, TestType(), TestType(), TestType()])
@@ -953,27 +898,29 @@ Base.isless(::TestType, ::TestType) = false
     for f in (maximum, minimum)
         res = combine(gd, :y => maximum => :z)
         @test res.z isa Vector{Any}
-        @test res.z == by(df, :x, :y => (x -> maximum(x)) => :z).z
+        @test res.z == combine(gd, :y => (x -> maximum(x)) => :z).z
     end
 end
 
-@testset "combine and map with columns named like grouping keys" begin
+@testset "combine with columns named like grouping keys" begin
     df = DataFrame(x=["a", "a", "b", missing], y=1:4)
     gd = groupby_checked(df, :x)
     @test combine(identity, gd) ≅ df
     @test combine(d -> d[:, [2, 1]], gd) ≅ df
     @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
-    @test map(identity, gd) ≅ gd
-    @test map(d -> d[:, [2, 1]], gd) ≅ gd
-    @test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
+    @test validate_gdf(combine(identity, gd, regroup=true)) ≅ gd
+    @test validate_gdf(combine(d -> d[:, [2, 1]], gd, regroup=true)) ≅ gd
+    @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd,
+                                       regroup=true)  # same error when regrouping
 
     gd = groupby_checked(df, :x, skipmissing=true)
     @test combine(identity, gd) == df[1:3, :]
     @test combine(d -> d[:, [2, 1]], gd) == df[1:3, :]
     @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
-    @test map(identity, gd) == gd
-    @test map(d -> d[:, [2, 1]], gd) == gd
-    @test_throws ArgumentError map(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
+    @test validate_gdf(combine(identity, gd, regroup=true)) == gd
+    @test validate_gdf(combine(d -> d[:, [2, 1]], gd, regroup=true)) == gd
+    @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd,
+                                       regroup=true)  # same error when regrouping
 end
 
 @testset "iteration protocol" begin
@@ -1328,14 +1275,6 @@ end
     @test valuecols(gd) == [:C]
 end
 
-@testset "by skipmissing and sort" begin
-    df = DataFrame(a=[2, 2, missing, missing, 1, 1, 3, 3], b=1:8)
-    for dosort in (false, true), doskipmissing in (false, true)
-        @test by(df, :a, :b=>sum, sort=dosort, skipmissing=doskipmissing) ≅
-            combine(groupby_checked(df, :a, sort=dosort, skipmissing=doskipmissing), :b=>sum)
-    end
-end
-
 @testset "non standard cols arguments" begin
     df = DataFrame(x1=Int64[1,2,2], x2=Int64[1,1,2], y=Int64[1,2,3])
     gdf = groupby_checked(df, r"x")
@@ -1358,7 +1297,8 @@ end
     @test valuecols(gdf) == [:x1, :x2, :y]
     @test groupindices(gdf) == [1,1,1]
 
-    @test by(df, [], :x1 => sum => :a, :x2=>length => :b) == DataFrame(a=5, b=3)
+    @test combine(groupby_checked(df, []),
+                  :x1 => sum => :a, :x2=>length => :b) == DataFrame(a=5, b=3)
 
     gdf = groupby_checked(df, [])
     @test gdf[1] == df
@@ -1366,8 +1306,9 @@ end
     @test gdf[:] == gdf
     @test gdf[1:1] == gdf
 
-    @test map(nrow => :x1, gdf) == groupby_checked(DataFrame(x1=3), [])
-    @test map(:x2 => identity => :x2_identity, gdf) ==
+    @test validate_gdf(combine(nrow => :x1, gdf, regroup=true)) ==
+          groupby_checked(DataFrame(x1=3), [])
+    @test validate_gdf(combine(:x2 => identity => :x2_identity, gdf, regroup=true)) ==
           groupby_checked(DataFrame(x2_identity=[1,1,2]), [])
     @test DataFrame(gdf) == df
 
@@ -1683,10 +1624,12 @@ end
 @testset "Check aggregation of DataFrameRow" begin
     df = DataFrame(a=1)
     dfr = DataFrame(x=1, y="1")[1, 2:2]
-    @test by(sdf -> dfr, df, :a) == DataFrame(a=1, y="1")
+    gdf = groupby_checked(df, :a)  # explicit grouping replaces by(..., df, :a)
+    @test combine(sdf -> dfr, gdf) == DataFrame(a=1, y="1")  # dfr holds only column y
 
     df = DataFrame(a=[1,1,2,2,3,3], b='a':'f', c=string.(1:6))
-    @test by(sdf -> sdf[1, [3,2,1]], df, :a) == df[1:2:5, [1,3,2]]
+    gdf = groupby_checked(df, :a)  # regroup the newly assigned df
+    @test combine(sdf -> sdf[1, [3,2,1]], gdf) == df[1:2:5, [1,3,2]]  # first row per group
 end
 
 @testset "Allow returning DataFrame() or NamedTuple() to drop group" begin
@@ -1700,12 +1643,14 @@ end
         fr in (DataFrame(x1=[true]), (x1=[true],))
 
         df = DataFrame(a = 1:N, x1 = x1)
-        res = by(sdf -> sdf.x1[1] ? fr : er, df, :a)
-        @test res == DataFrame(map(sdf -> sdf.x1[1] ? fr : er, groupby_checked(df, :a)))
+        gdf = groupby_checked(df, :a)
+        res = combine(sdf -> sdf.x1[1] ? fr : er, gdf)
+        @test res == DataFrame(validate_gdf(combine(sdf -> sdf.x1[1] ? fr : er,
+                                                    groupby_checked(df, :a), regroup=true)))
         if fr isa AbstractVector && df.x1[1]
-            @test res == by(:x1 => (x1 -> x1[1] ? fr : er) => :x1, df, :a)
+            @test res == combine(:x1 => (x1 -> x1[1] ? fr : er) => :x1, gdf)
         else
-            @test res == by(:x1 => x1 -> x1[1] ? fr : er, df, :a)
+            @test res == combine(:x1 => x1 -> x1[1] ? fr : er, gdf)
         end
         if nrow(res) == 0 && length(propertynames(er)) == 0 && er != rand(0, 1)
             @test res == DataFrame(a=[])
@@ -1714,67 +1659,65 @@ end
             @test res == df[df.x1, :]
         end
         if 1 < i < 2^N
-            @test_throws ArgumentError by(sdf -> sdf.x1[1] ? (x1=true,) : er, df, :a)
+            @test_throws ArgumentError combine(sdf -> sdf.x1[1] ? (x1=true,) : er, gdf)
             if df.x1[1] || !(fr isa AbstractVector)
-                @test_throws ArgumentError by(sdf -> sdf.x1[1] ? fr : (x2=[true],), df, :a)
+                @test_throws ArgumentError combine(sdf -> sdf.x1[1] ? fr : (x2=[true],), gdf)
             else
-                res = by(sdf -> sdf.x1[1] ? fr : (x2=[true],), df, :a)
+                res = combine(sdf -> sdf.x1[1] ? fr : (x2=[true],), gdf)
                 @test names(res) == ["a", "x2"]
             end
-            @test_throws ArgumentError by(sdf -> sdf.x1[1] ? true : er, df, :a)
+            @test_throws ArgumentError combine(sdf -> sdf.x1[1] ? true : er, gdf)
         end
     end
 end
 
 @testset "auto-splatting, ByRow, and column renaming" begin
     df = DataFrame(g=[1,1,1,2,2,2], x1=1:6, x2=1:6)
-    @test by(df, :g, r"x" => cor) == DataFrame(g=[1,2], x1_x2_cor = [1.0, 1.0])
-    @test by(df, :g, Not(:g) => ByRow(/)) == DataFrame(:g => [1,1,1,2,2,2], Symbol("x1_x2_/") => 1.0)
-    @test by(df, :g, Between(:x2, :x1) => () -> 1) == DataFrame(:g => 1:2, Symbol("function") => 1)
-    @test by(df, :g, :x1 => :z) ==
-          by(df, :g, [:x1 => :z]) ==
-          by(:x1 => :z, df, :g) ==
-          combine(groupby_checked(df, :g), :x1 => :z) ==
-          combine(groupby_checked(df, :g), [:x1 => :z]) ==
-          combine(:x1 => :z, groupby_checked(df, :g)) ==
+    gdf = groupby_checked(df, :g)  # shared grouped input for every assertion below
+    @test combine(gdf, r"x" => cor) == DataFrame(g=[1,2], x1_x2_cor = [1.0, 1.0])
+    @test combine(gdf, Not(:g) => ByRow(/)) == DataFrame(:g => [1,1,1,2,2,2], Symbol("x1_x2_/") => 1.0)
+    @test combine(gdf, Between(:x2, :x1) => () -> 1) == DataFrame(:g => 1:2, Symbol("function") => 1)
+    @test combine(gdf, :x1 => :z) == combine(gdf, [:x1 => :z]) == combine(:x1 => :z, gdf) ==
           DataFrame(g=[1,1,1,2,2,2], z=1:6)
-    @test map(:x1 => :z, groupby_checked(df, :g)) == groupby_checked(DataFrame(g=[1,1,1,2,2,2], z=1:6), :g)
+    @test validate_gdf(combine(:x1 => :z, gdf, regroup=true)) ==
+          groupby_checked(DataFrame(g=[1,1,1,2,2,2], z=1:6), :g)
 end
 
 @testset "hard tabular return value cases" begin
     Random.seed!(1)
     df = DataFrame(b = repeat([2, 1], outer=[4]), x = randn(8))
-    res = by(sdf -> sdf.x[1:2], df, :b)
+    gdf = groupby_checked(df, :b)
+    res = combine(sdf -> sdf.x[1:2], gdf)
     @test names(res) == ["b", "x1"]
-    res2 = by(:x => x -> x[1:2], df, :b)
+    res2 = combine(:x => x -> x[1:2], gdf)
     @test names(res2) == ["b", "x_function"]
     @test Matrix(res) == Matrix(res2)
-    res2 = by(:x => (x -> x[1:2]) => :z, df, :b)
+    res2 = combine(:x => (x -> x[1:2]) => :z, gdf)
     @test names(res2) == ["b", "z"]
     @test Matrix(res) == Matrix(res2)
 
-    @test_throws ArgumentError by(df, :b) do sdf
+    @test_throws ArgumentError combine(gdf) do sdf
         if sdf.b[1] == 2
             return (c=sdf.x[1:2],)
         else
             return sdf.x[1:2]
         end
     end
-    @test_throws ArgumentError by(df, :b) do sdf
+    @test_throws ArgumentError combine(gdf) do sdf
         if sdf.b[1] == 1
             return (c=sdf.x[1:2],)
         else
             return sdf.x[1:2]
         end
     end
-    @test_throws ArgumentError by(df, :b) do sdf
+    @test_throws ArgumentError combine(gdf) do sdf
         if sdf.b[1] == 2
             return (c=sdf.x[1],)
         else
             return sdf.x[1]
         end
     end
-    @test_throws ArgumentError by(df, :b) do sdf
+    @test_throws ArgumentError combine(gdf) do sdf
         if sdf.b[1] == 1
             return (c=sdf.x[1],)
         else
@@ -1783,26 +1726,25 @@ end
     end
 
     for i in 1:2, v1 in [1, 1:2], v2 in [1, 1:2]
-        @test_throws ArgumentError by([:b, :x] => ((b,x) -> b[1] == i ? x[v1] : (c=x[v2],)) => :v, df, :b)
-        @test_throws ArgumentError by([:b, :x] => ((b,x) -> b[1] == i ? x[v1] : (v=x[v2],)) => :v, df, :b)
+        @test_throws ArgumentError combine([:b, :x] => ((b,x) -> b[1] == i ? x[v1] : (c=x[v2],)) => :v, gdf)
+        @test_throws ArgumentError combine([:b, :x] => ((b,x) -> b[1] == i ? x[v1] : (v=x[v2],)) => :v, gdf)
     end
 end
 
 @testset "last Pair interface with multiple return values" begin
     df = DataFrame(g=[1,1,1,2,2,2], x1=1:6)
-    @test by(df, :g, :x1 => x -> DataFrame()) == by(:x1 => x -> DataFrame(), df, :g)
-    @test by(df, :g, :x1 => x -> (x=1, y=2)) == by(:x1 => x -> (x=1, y=2), df, :g)
-    @test by(df, :g, :x1 => x -> (x=[1], y=[2])) == by(:x1 => x -> (x=[1], y=[2]), df, :g)
-    @test_throws ArgumentError by(df, :g, :x1 => x -> (x=[1],y=2))
-    @test_throws ArgumentError by(:x1 => x -> (x=[1], y=2), df, :g)
-    @test by(df, :g, :x1 => x -> ones(2, 2)) == by(:x1 => x -> ones(2, 2), df, :g)
-    @test by(df, :g, :x1 => x -> df[1, Not(:g)]) == by(:x1 => x -> df[1, Not(:g)], df, :g)
+    gdf = groupby_checked(df, :g)  # shared grouped input for every assertion below
+    @test_throws ArgumentError combine(gdf, :x1 => x -> DataFrame())  # empty data frame
+    @test_throws ArgumentError combine(gdf, :x1 => x -> (x=1, y=2))  # NamedTuple of scalars
+    @test_throws ArgumentError combine(gdf, :x1 => x -> (x=[1], y=[2]))  # NamedTuple of vectors
+    @test_throws ArgumentError combine(gdf, :x1 => x -> (x=[1],y=2))  # vector mixed with scalar
+    @test_throws ArgumentError combine(:x1 => x -> (x=[1], y=2), gdf)  # function-first form
+    @test_throws ArgumentError combine(gdf, :x1 => x -> ones(2, 2))  # matrix
+    @test_throws ArgumentError combine(gdf, :x1 => x -> df[1, Not(:g)])  # single row of df
 end
 
 @testset "keepkeys" begin
     df = DataFrame(g=[1,1,1,2,2,2], x1=1:6)
-    @test by(df, :g, :x1 => identity, keepkeys=false) == DataFrame(x1_identity=1:6)
-    @test by(x -> DataFrame(g=x.x1), df, :g, keepkeys=false) == DataFrame(g=1:6)
     gdf = groupby_checked(df, :g)
     @test combine(gdf, :x1 => identity => :g, keepkeys=false) == DataFrame(g=1:6)
     @test combine(x -> (z=x.x1,), gdf, keepkeys=false) == DataFrame(z=1:6)
@@ -1832,28 +1774,31 @@ end
 end
 
 @testset "mixing of different return lengths and pseudo-broadcasting" begin
-    df = DataFrame(g=[1,1,1,2,2]);
-    f1(i) = i[1] == 1 ? ["a", "b"] : ["c"];
-    f2(i) = i[1] == 1 ? ["d"] : ["e", "f"];
-    @test_throws ArgumentError by(df, :g, :g => f1, :g => f2)
-
-    f1(i) = i[1] == 1 ? ["a"] : ["c"];
-    f2(i) = i[1] == 1 ? "d" : "e";
-    @test by(df, :g, :g => f1, :g => f2) ==
+    df = DataFrame(g=[1,1,1,2,2])
+    gdf = groupby_checked(df, :g)
+
+    f1(i) = i[1] == 1 ? ["a", "b"] : ["c"]
+    f2(i) = i[1] == 1 ? ["d"] : ["e", "f"]
+    @test_throws ArgumentError combine(gdf, :g => f1, :g => f2)
+
+    f1(i) = i[1] == 1 ? ["a"] : ["c"]
+    f2(i) = i[1] == 1 ? "d" : "e"
+    @test combine(gdf, :g => f1, :g => f2) ==
           DataFrame(g=[1,2], g_f1=["a", "c"], g_f2 = ["d", "e"])
 
-    f1(i) = i[1] == 1 ? ["a","c"] : [];
-    f2(i) = i[1] == 1 ? "d" : "e";
-    @test by(df, :g, :g => f1, :g => f2) ==
+    f1(i) = i[1] == 1 ? ["a","c"] : []
+    f2(i) = i[1] == 1 ? "d" : "e"
+    @test combine(gdf, :g => f1, :g => f2) ==
           DataFrame(g = [1,1], g_f1 = ["a", "c"], g_f2 = ["d", "d"])
 
-    @test by(df, :g, :g => Ref) == DataFrame(g=[1,2], g_Ref=[[1,1,1], [2,2]])
-    @test by(df, :g, :g => x -> view([x],1)) == DataFrame(g=[1,2], g_function=[[1,1,1], [2,2]])
+    @test combine(gdf, :g => Ref) == DataFrame(g=[1,2], g_Ref=[[1,1,1], [2,2]])
+    @test combine(gdf, :g => x -> view([x],1)) == DataFrame(g=[1,2], g_function=[[1,1,1], [2,2]])
 
     Random.seed!(1234)
     df = DataFrame(g=1:100)
+    gdf = groupby_checked(df, :g)
     for i in 1:10
-        @test by(df, :g, :g => x -> rand([x[1], Ref(x[1]), view(x, 1)])) ==
+        @test combine(gdf, :g => x -> rand([x[1], Ref(x[1]), view(x, 1)])) ==
               DataFrame(g=1:100, g_function=1:100)
     end
 
@@ -1862,12 +1807,13 @@ end
 
     for i in 0:nrow(df_ref), dosort in [true, false], dokeepkeys in [true, false]
         df = df_ref[1:i, :]
-        @test by(df, :g, :x1 => sum => :x1, :x2 => identity => :x2,
-                 :x3 => (x -> Ref(sum(x))) => :x3, nrow, :x4 => ByRow(sin) => :x4,
-                 sort=dosort, keepkeys=dokeepkeys) ==
-              by(df, :g, sort=dosort, keepkeys=dokeepkeys) do sdf
-                  DataFrame(x1 = sum(sdf.x1), x2 = sdf.x2, x3 = sum(sdf.x3),
-                            nrow = nrow(sdf), x4 = sin.(sdf.x4))
+        gdf = groupby_checked(df, :g, sort=dosort)
+        @test combine(gdf, :x1 => sum => :x1, :x2 => identity => :x2,
+                      :x3 => (x -> Ref(sum(x))) => :x3, nrow, :x4 => ByRow(sin) => :x4,
+                      keepkeys=dokeepkeys) ==
+              combine(gdf, keepkeys=dokeepkeys) do sdf
+                      DataFrame(x1 = sum(sdf.x1), x2 = sdf.x2, x3 = sum(sdf.x3),
+                                nrow = nrow(sdf), x4 = sin.(sdf.x4))
               end
     end
 end
@@ -1875,54 +1821,56 @@ end
 @testset "passing columns" begin
     df = DataFrame(rand(10, 4))
     df.g = shuffle!([1,2,2,3,3,3,4,4,4,4])
+    gdf = groupby_checked(df, :g)
 
     for selector in [All(), :, r"x", Between(:x1, :x4), Not(:g), [:x1, :x2, :x3, :x4],
                      [1, 2, 3, 4], [true, true, true, true, false]]
-        @test by(df, :g, selector, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3) ==
-              by(df, :g) do sdf
+        @test combine(gdf, selector, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3) ==
+              combine(gdf) do sdf
                   DataFrame(x1 = sin.(sdf.x1), x2 = sdf.x2, x3 = sin.(sdf.x2), x4 = sdf.x4)
               end
     end
 
     for selector in [All(), :, r"x", Between(:x1, :x4), Not(:g), [:x1, :x2, :x3, :x4],
                      [1, 2, 3, 4], [true, true, true, true, false]]
-        @test by(df, :g, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3, selector) ==
-              by(df, :g) do sdf
+        @test combine(gdf, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3, selector) ==
+              combine(gdf) do sdf
                   DataFrame(x1 = sin.(sdf.x1), x3 = sin.(sdf.x2), x2 = sdf.x2, x4 = sdf.x4)
               end
     end
 
     for selector in [Between(:x1, :x3), Not(:x4), [:x1, :x2, :x3], [1, 2, 3],
                      [true, true, true, false, false]]
-        @test by(df, :g, :x2 => ByRow(sin) => :x3, selector, :x1 => ByRow(sin) => :x1) ==
-              by(df, :g) do sdf
+        @test combine(gdf, :x2 => ByRow(sin) => :x3, selector, :x1 => ByRow(sin) => :x1) ==
+              combine(gdf) do sdf
                   DataFrame(x3 = sin.(sdf.x2), x1 = sin.(sdf.x1), x2 = sdf.x2)
               end
     end
 
-    @test by(df, :g, 4, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3, :x2) ==
-          by(df, :g) do sdf
+    @test combine(gdf, 4, :x1 => ByRow(sin) => :x1, :x2 => ByRow(sin) => :x3, :x2) ==
+          combine(gdf) do sdf
               DataFrame(x4 = sdf.x4, x1 = sin.(sdf.x1), x3 = sin.(sdf.x2), x2 = sdf.x2)
           end
 
-    @test by(df, :g, 4 => :h, :x1 => ByRow(sin) => :z, :x2 => ByRow(sin) => :x3, :x2) ==
-          by(df, :g) do sdf
+    @test combine(gdf, 4 => :h, :x1 => ByRow(sin) => :z, :x2 => ByRow(sin) => :x3, :x2) ==
+          combine(gdf) do sdf
               DataFrame(h = sdf.x4, z = sin.(sdf.x1), x3 = sin.(sdf.x2), x2 = sdf.x2)
           end
 
-    @test_throws ArgumentError by(df, :g, 4 => :h, :x1 => ByRow(sin) => :h)
-    @test_throws ArgumentError by(df, :g, :x1 => :x1_sin, :x1 => ByRow(sin))
-    @test_throws ArgumentError by(df, :g, 1, :x1 => ByRow(sin) => :x1)
+    @test_throws ArgumentError combine(gdf, 4 => :h, :x1 => ByRow(sin) => :h)
+    @test_throws ArgumentError combine(gdf, :x1 => :x1_sin, :x1 => ByRow(sin))
+    @test_throws ArgumentError combine(gdf, 1, :x1 => ByRow(sin) => :x1)
 end
 
 @testset "correct dropping of groups" begin
     df = DataFrame(g = 10:-1:1)
-
+    gdf = groupby_checked(df, :g)
+    sgdf = groupby_checked(df, :g, sort=true)
     for keep in [[3,2,1], [5,3,1], [9], Int[]]
-        @test by(df, :g, :g => first => :keep, :g => x -> x[1] in keep ? x : Int[]) ==
+        @test combine(gdf, :g => first => :keep, :g => x -> x[1] in keep ? x : Int[]) ==
               DataFrame(g=keep, keep=keep, g_function=keep)
-        @test by(df, :g, :g => first => :keep, :g => x -> x[1] in keep ? x : Int[],
-                 sort=true) == sort(DataFrame(g=keep, keep=keep, g_function=keep))
+        @test combine(sgdf, :g => first => :keep, :g => x -> x[1] in keep ? x : Int[]) ==
+              sort(DataFrame(g=keep, keep=keep, g_function=keep))
     end
 end
 
@@ -1931,46 +1879,39 @@ end
     gdf = groupby_checked(df, :g)
 
     # whole column 4 options of single pair passed
-    @test by(df, :g , AsTable([:x, :y]) => Ref) ==
-          by(AsTable([:x, :y]) => Ref, df, :g) ==
-          combine(gdf , AsTable([:x, :y]) => Ref) ==
+    @test combine(gdf , AsTable([:x, :y]) => Ref) ==
           combine(AsTable([:x, :y]) => Ref, gdf) ==
           DataFrame(g=1:2, x_y_Ref=[(x=[1,2,3], y=[6,7,8]), (x=[4,5], y=[9,10])])
-    @test map(AsTable([:x, :y]) => Ref, gdf) ==
-          groupby_checked(by(df, :g , AsTable([:x, :y]) => Ref), :g)
+    @test validate_gdf(combine(AsTable([:x, :y]) => Ref, gdf, regroup=true)) ==
+          groupby_checked(combine(gdf, AsTable([:x, :y]) => Ref), :g)
 
-    @test by(df, :g, AsTable(1) => Ref) ==
-          combine(gdf, AsTable(1) => Ref) ==
+    @test combine(gdf, AsTable(1) => Ref) ==
           DataFrame(g=1:2, g_Ref=[(g=[1,1,1],),(g=[2,2],)])
 
 
     # ByRow 4 options of single pair passed
-    @test by(df, :g, AsTable([:x, :y]) => ByRow(x -> [x])) ==
-          by(AsTable([:x, :y]) => ByRow(x -> [x]), df, :g) ==
-          combine(gdf, AsTable([:x, :y]) => ByRow(x -> [x])) ==
+    @test combine(gdf, AsTable([:x, :y]) => ByRow(x -> [x])) ==
           combine(AsTable([:x, :y]) => ByRow(x -> [x]), gdf) ==
           DataFrame(g=[1,1,1,2,2],
                     x_y_function=[[(x=1,y=6)], [(x=2,y=7)], [(x=3,y=8)], [(x=4,y=9)], [(x=5,y=10)]])
-    @test map(AsTable([:x, :y]) => ByRow(x -> [x]), gdf) ==
-          groupby_checked(by(df, :g, AsTable([:x, :y]) => ByRow(x -> [x])), :g)
+    @test validate_gdf(combine(AsTable([:x, :y]) => ByRow(x -> [x]), gdf, regroup=true)) ==
+          groupby_checked(combine(gdf, AsTable([:x, :y]) => ByRow(x -> [x])), :g)
 
     # whole column and ByRow test for multiple pairs passed
-    @test by(df, :g, [:x, :y], [AsTable(v) => (x -> -x[1]) for v in [:x, :y]]) ==
-          combine(gdf, [:x, :y], [AsTable(v) => (x -> -x[1]) for v in [:x, :y]]) ==
+    @test combine(gdf, [:x, :y], [AsTable(v) => (x -> -x[1]) for v in [:x, :y]]) ==
           [df DataFrame(x_function=-df.x, y_function=-df.y)]
-    @test by(df, :g, [:x, :y], [AsTable(v) => ByRow(x -> (-x[1],)) for v in [:x, :y]]) ==
-          combine(gdf, [:x, :y], [AsTable(v) => ByRow(x -> (-x[1],)) for v in [:x, :y]]) ==
+    @test combine(gdf, [:x, :y], [AsTable(v) => ByRow(x -> (-x[1],)) for v in [:x, :y]]) ==
           [df DataFrame(x_function=[(-1,), (-2,) ,(-3,) ,(-4,) ,(-5,)],
                         y_function=[(-6,), (-7,) ,(-8,) ,(-9,) ,(-10,)])]
 
-    @test_throws ArgumentError by(df, :g, AsTable([:x, :y]) => ByRow(identity))
-    @test_throws ArgumentError by(df, :g, AsTable([:x, :y]) => ByRow(x -> df[1, :]))
+    @test_throws ArgumentError combine(gdf, AsTable([:x, :y]) => ByRow(identity))
+    @test_throws ArgumentError combine(gdf, AsTable([:x, :y]) => ByRow(x -> df[1, :]))
 end
 
 @testset "test correctness of regrouping" begin
     df = DataFrame(g=[2,2,1,3,1,2,1,2,3])
     gdf = groupby_checked(df, :g)
-    gdf2 = combine(identity, gdf, regroup=true)
+    gdf2 = validate_gdf(combine(identity, gdf, regroup=true))
     @test combine(gdf, :g => sum) == combine(gdf2, :g => sum)
 
     df.id = 1:9
diff --git a/test/select.jl b/test/select.jl
index 005b87e222..0416a5e0b7 100644
--- a/test/select.jl
+++ b/test/select.jl
@@ -705,6 +705,8 @@ end
                df_ref[1:2, []], view(df_ref, 1:2, []),
                df_ref[[], 1:2], view(df_ref, [], 1:2)]
         @test select(df, nrow => :z, nrow, [nrow => :z2]) ==
+              repeat(DataFrame(z=nrow(df), nrow=nrow(df), z2=nrow(df)), nrow(df))
+        @test combine(df, nrow => :z, nrow, [nrow => :z2]) ==
               DataFrame(z=nrow(df), nrow=nrow(df), z2=nrow(df))
         @test_throws ArgumentError select(df, nrow, nrow)
         @test_throws ArgumentError select(df, [nrow])
@@ -743,10 +745,13 @@ end
 
 @testset "select and select! empty selection" begin
     df = DataFrame(rand(10, 4))
-    x = [1,2,3]
+    x = [1:10;]
+    y = [1,2,3]
 
     @test select(df, r"z") == DataFrame()
     @test select(df, r"z" => () -> x) == DataFrame(:function => x)
+    @test_throws ArgumentError select(df, r"z" => () -> y)
+    @test combine(df, r"z" => () -> y) == DataFrame(:function => y)
     @test select(df, r"z" => () -> x)[!, 1] === x # no copy even for copycols=true
     @test_throws MethodError select(df, r"z" => x -> 1)
     @test_throws ArgumentError select(df, r"z" => ByRow(rand))
@@ -890,10 +895,16 @@ end
     end
     @test_throws ArgumentError select(df, [] => (() -> [9]) => :a, :)
     @test_throws ArgumentError select(df, :, [] => (() -> [9]) => :a)
-    @test transform(df, names(df) .=> (x -> 9) .=> names(df)) == DataFrame([9 9 9])
+    @test transform(df, names(df) .=> (x -> 9) .=> names(df)) ==
+          repeat(DataFrame([9 9 9]), nrow(df))
+    @test combine(df, names(df) .=> (x -> 9) .=> names(df)) ==
+          DataFrame([9 9 9])
     @test transform(df, names(df) .=> (x -> 9) .=> names(df), :x1 => :x4) ==
           DataFrame([9 9 9 1; 9 9 9 4])
-    @test transform(df3, names(df3) .=> (x -> 9) .=> names(df3)) == DataFrame([9 9 9])
+    @test transform(df3, names(df3) .=> (x -> 9) .=> names(df3)) ==
+          repeat(DataFrame([9 9 9]), nrow(df3))
+    @test combine(df3, names(df3) .=> (x -> 9) .=> names(df3)) ==
+          DataFrame([9 9 9])
     @test transform(df3, names(df3) .=> (x -> 9) .=> names(df3), :x1 => :x4) ==
           DataFrame(ones(0, 4))
 
@@ -901,6 +912,14 @@ end
                    x3=[missing,2], x4=categorical([missing, 2]))
 
     df2 = select(df, names(df) .=> first)
+    @test df2 ≅ repeat(DataFrame(x1_first=1, x2_first=1, x3_first=missing,
+                                 x4_first=missing), nrow(df2))
+    @test df2.x1_first isa Vector{Int}
+    @test df2.x2_first isa CategoricalVector{Int}
+    @test df2.x3_first isa Vector{Missing}
+    @test df2.x4_first isa Vector{Missing}
+
+    df2 = combine(df, names(df) .=> first)
     @test df2 ≅ DataFrame(x1_first=1, x2_first=1, x3_first=missing,
                           x4_first=missing)
     @test df2.x1_first isa Vector{Int}
@@ -909,6 +928,14 @@ end
     @test df2.x4_first isa Vector{Missing}
 
     df2 = select(df, names(df) .=> last)
+    @test df2 ≅ repeat(DataFrame(x1_last=2, x2_last=2, x3_last=2,
+                                 x4_last=2), nrow(df2))
+    @test df2.x1_last isa Vector{Int}
+    @test df2.x2_last isa CategoricalVector{Int}
+    @test df2.x3_last isa Vector{Int}
+    @test df2.x4_last isa CategoricalVector{Int}
+
+    df2 = combine(df, names(df) .=> last)
     @test df2 ≅ DataFrame(x1_last=2, x2_last=2, x3_last=2,
                           x4_last=2)
     @test df2.x1_last isa Vector{Int}
@@ -953,31 +980,32 @@ end
         @test df2.x4_last isa CategoricalVector{Int}
     end
 
-    df2 = select(df, names(df) .=> first, [] => (() -> Int[]) => :x1)
+    @test_throws ArgumentError select(df, names(df) .=> first, [] => (() -> Int[]) => :x1)
+    df2 = combine(df, names(df) .=> first, [] => (() -> Int[]) => :x1)
     @test size(df2) == (0, 5)
     @test df2.x1_first isa Vector{Int}
     @test df2.x2_first isa CategoricalVector{Int}
     @test df2.x3_first isa Vector{Missing}
     @test df2.x4_first isa Vector{Missing}
 
-
-    df2 = select(df, names(df) .=> last, [] => (() -> Int[]) => :x1)
+    @test_throws ArgumentError select(df, names(df) .=> last, [] => (() -> Int[]) => :x1)
+    df2 = combine(df, names(df) .=> last, [] => (() -> Int[]) => :x1)
     @test size(df2) == (0, 5)
     @test df2.x1_last isa Vector{Int}
     @test df2.x2_last isa CategoricalVector{Int}
     @test df2.x3_last isa Vector{Int}
     @test df2.x4_last isa CategoricalVector{Int}
 
-
-    df2 = select(df, [] => (() -> Int[]) => :x1, names(df) .=> first)
+    @test_throws ArgumentError select(df, [] => (() -> Int[]) => :x1, names(df) .=> first)
+    df2 = combine(df, [] => (() -> Int[]) => :x1, names(df) .=> first)
     @test size(df2) == (0, 5)
     @test df2.x1_first isa Vector{Int}
     @test df2.x2_first isa CategoricalVector{Int}
     @test df2.x3_first isa Vector{Missing}
     @test df2.x4_first isa Vector{Missing}
 
-
-    df2 = select(df, [] => (() -> Int[]) => :x1, names(df) .=> last)
+    @test_throws ArgumentError select(df, [] => (() -> Int[]) => :x1, names(df) .=> last)
+    df2 = combine(df, [] => (() -> Int[]) => :x1, names(df) .=> last)
     @test size(df2) == (0, 5)
     @test df2.x1_last isa Vector{Int}
     @test df2.x2_last isa CategoricalVector{Int}
@@ -988,7 +1016,8 @@ end
 @testset "copycols special cases" begin
     df = DataFrame(a=1:3, b=4:6)
     c = [7, 8]
-    df2 = select(df, :a => (x -> c) => :c1, :b => (x -> c) => :c2)
+    @test_throws ArgumentError select(df, :a => (x -> c) => :c1, :b => (x -> c) => :c2)
+    df2 = combine(df, :a => (x -> c) => :c1, :b => (x -> c) => :c2)
     @test df2.c1 === df2.c2
     df2 = select(df, :a => identity => :c1, :a => :c2)
     @test df2.c1 !== df2.c2
@@ -996,9 +1025,11 @@ end
     @test df2.c1 !== df.a
     df2 = select(df, :a => (x -> df.b) => :c1)
     @test df2.c1 === df.b
-    df2 = select(view(df, 1:2, :), :a => parent => :c1)
+    @test_throws ArgumentError select(view(df, 1:2, :), :a => parent => :c1)
+    df2 = combine(view(df, 1:2, :), :a => parent => :c1)
     @test df2.c1 !== df.a
-    df2 = select(view(df, 1:2, :), :a => (x -> view(x, 1:1)) => :c1)
+    @test_throws ArgumentError select(view(df, 1:2, :), :a => (x -> view(x, 1:1)) => :c1)
+    df2 = combine(view(df, 1:2, :), :a => (x -> view(x, 1:1)) => :c1)
     @test df2.c1 isa Vector
     df2 = select(df, :a, :a => :b, :a => identity => :c, copycols=false)
     @test df2.b === df2.c === df.a
@@ -1059,14 +1090,16 @@ end
     df_ref = DataFrame(a=1:3, b=4:6)
     for df in [df_ref, view(df_ref, :, :)]
         @test select(df, [] .=> sum) == DataFrame()
-        @test select(df, names(df) .=> sum) == DataFrame(a_sum=6, b_sum=15)
+        @test select(df, names(df) .=> sum) == repeat(DataFrame(a_sum=6, b_sum=15), nrow(df))
+        @test combine(df, names(df) .=> sum) == DataFrame(a_sum=6, b_sum=15)
         @test transform(df, names(df) .=> ByRow(-)) ==
               DataFrame(:a => 1:3, :b => 4:6,
                         Symbol("a_-") => -1:-1:-3,
                         Symbol("b_-") => -4:-1:-6)
         @test select(df, :a, [] .=> sum, :b => :x, [:b, :a] .=> identity) ==
               DataFrame(a=1:3, x=4:6, b_identity=4:6, a_identity=1:3)
-        @test select(df, names(df) .=> sum .=> [:A, :B]) == DataFrame(A=6, B=15)
+        @test select(df, names(df) .=> sum .=> [:A, :B]) == repeat(DataFrame(A=6, B=15), nrow(df))
+        @test combine(df, names(df) .=> sum .=> [:A, :B]) == DataFrame(A=6, B=15)
         @test Base.broadcastable(ByRow(+)) isa Base.RefValue{ByRow{typeof(+)}}
         @test identity.(ByRow(+)) == ByRow(+)
     end
@@ -1079,6 +1112,8 @@ end
     @test transform(df, AsTable(:) => sum) ==
           DataFrame(a=1:3, b=4:6, c=7:9, a_b_c_sum=map(sum, eachrow(df)))
     @test select(df, AsTable(:) => sum ∘ sum) ==
+          repeat(DataFrame(a_b_c_function=45), nrow(df))
+    @test combine(df, AsTable(:) => sum ∘ sum) ==
           DataFrame(a_b_c_function=45)
     @test transform(df, AsTable(:) => sum ∘ sum) ==
           DataFrame(a=1:3, b=4:6, c=7:9, a_b_c_function=45)
@@ -1095,7 +1130,8 @@ end
     @test_throws ArgumentError select(df, AsTable(:) => ByRow(x -> df[1, :]))
     @test_throws ArgumentError transform(df, AsTable(Not(:)) => ByRow(identity))
 
-    @test select(df, AsTable(Not(:)) => Ref) == DataFrame(Ref = NamedTuple())
+    @test select(df, AsTable(Not(:)) => Ref) == repeat(DataFrame(Ref = NamedTuple()), nrow(df))
+    @test combine(df, AsTable(Not(:)) => Ref) == DataFrame(Ref = NamedTuple())
     @test transform(df, AsTable(Not(:)) => Ref) ==
           DataFrame(a=1:3, b=4:6, c=7:9, Ref=NamedTuple())
 

From 384c0b1b8275ad077f7b1dd8a3da2072544bf338 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 29 Apr 2020 15:14:38 +0200
Subject: [PATCH 12/29] change error thrown on Julia 1.0

---
 test/select.jl | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/test/select.jl b/test/select.jl
index 0416a5e0b7..584b43fa90 100644
--- a/test/select.jl
+++ b/test/select.jl
@@ -764,12 +764,7 @@ end
 
     @test_throws MethodError select!(df, r"z" => x -> 1)
     @test_throws ArgumentError select!(df, r"z" => ByRow(rand))
-
-    if VERSION >= v"1.4"
-        @test_throws MethodError select!(df, r"z" => () -> x, copycols=false)
-    else
-        @test_throws ErrorException select!(df, r"z" => () -> x, copycols=false)
-    end
+    @test_throws MethodError select!(df, r"z" => () -> x, copycols=false)
 
     select!(df, r"z" => () -> x)
     @test df == DataFrame(:function => x)

From ea574c4f577fc12461cca0ce331eb5ad1c27b44b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 29 Apr 2020 16:19:40 +0200
Subject: [PATCH 13/29] done tests of combine

---
 test/select.jl | 168 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 168 insertions(+)

diff --git a/test/select.jl b/test/select.jl
index 584b43fa90..9a7a7a0784 100644
--- a/test/select.jl
+++ b/test/select.jl
@@ -1158,4 +1158,172 @@ end
     @test propertynames(df) == [:a,]
 end
 
+@testset "combine AbstractDataFrame" begin
+    df = DataFrame(x=1:3, y=4:6)
+
+    @test combine(x -> Matrix(x), df) == rename(df, [:x1, :x2])
+    @test combine(x -> Ref(1:3), df) == DataFrame(x1=[1:3])
+    @test_throws ArgumentError combine(df, x -> Ref(1:3))
+
+    @test combine(AsTable(:) => identity, df) == df
+    @test combine((:) => cor, df) == DataFrame(x_y_cor = 1.0)
+    @test combine(:x => x -> Ref(1:3), df) == DataFrame(x_function=[1:3])
+    @test_throws ArgumentError combine(df, :x => x -> ones(1,1))
+
+    df2 = combine(df, :x => identity)
+    @test df2[:, 1] == df.x
+    @test df2[:, 1] !== df.x
+
+    @test combine(df, :x => sum, :y => collect ∘ extrema) ==
+          DataFrame(x_sum=[6, 6], y_function = [4, 6])
+    @test combine(df, :y => collect ∘ extrema, :x => sum) ==
+          DataFrame(y_function = [4, 6], x_sum=[6, 6])
+    @test combine(df, :x => sum, :y => x -> []) ==
+          DataFrame(x_sum=[], y_function = [])
+    @test combine(df, :y => x -> [], :x => sum) ==
+          DataFrame(y_function = [], x_sum=[])
+
+    dfv = view(df, [2, 1], [2, 1])
+
+    @test combine(x -> Matrix(x), dfv) == rename(dfv, [:x1, :x2])
+
+    @test combine(AsTable(:) => identity, dfv) == dfv
+    @test combine((:) => cor, dfv) == DataFrame(y_x_cor = 1.0)
+
+    df2 = combine(dfv, :x => identity)
+    @test df2[:, 1] == dfv.x
+    @test df2[:, 1] !== dfv.x
+
+    @test combine(dfv, :x => sum, :y => collect ∘ extrema) ==
+          DataFrame(x_sum=[3, 3], y_function = [4, 5])
+    @test combine(dfv, :y => collect ∘ extrema, :x => sum) ==
+          DataFrame(y_function = [4, 5], x_sum=[3, 3])
+end
+
+@testset "combine GroupedDataFrame" begin
+    for df in (DataFrame(g=[3,1,1,missing],x=1:4, y=5:8),
+               DataFrame(g=categorical([3,1,1,missing]),x=1:4, y=5:8))
+        if !(df.g isa CategoricalVector)
+            gdf = groupby(df, :g, sort=false, skipmissing=false)
+            @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
+                  DataFrame(x_sum = [1, 5, 4])
+            @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
+            @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+                  DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
+            gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true)
+            @test gdf2 isa GroupedDataFrame{DataFrame}
+            @test gdf2.groups == 1:3
+            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
+
+            @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+                  DataFrame(x_sum = [1, 5, 5, 4], g = [3, 1, 1, missing])
+            @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+                  DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4])
+            gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)
+            @test gdf2 isa GroupedDataFrame{DataFrame}
+            @test gdf2.groups == [1, 2, 2, 3]
+            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4])
+
+            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
+                  DataFrame(x_sum = [1, 5, 4])
+            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
+                  DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
+            gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)
+            @test gdf2 isa GroupedDataFrame{DataFrame}
+            @test gdf2.groups == 1:3
+            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
+
+            gdf = groupby(df, :g, sort=false, skipmissing=true)
+
+            @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
+                  DataFrame(x_sum = [1, 5])
+            @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
+            @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+                  DataFrame(g = [3, 1], x_sum = [1, 5])
+            gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true)
+            @test gdf2 isa GroupedDataFrame{DataFrame}
+            @test gdf2.groups == 1:2
+            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5])
+
+            @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+                  DataFrame(x_sum = [1, 5, 5], g = [3, 1, 1])
+            @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+                  DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5])
+            gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)
+            @test gdf2 isa GroupedDataFrame{DataFrame}
+            @test gdf2.groups == [1, 2, 2]
+            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5])
+
+            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
+                  DataFrame(x_sum = [1, 5])
+            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
+                  DataFrame(g = [3, 1], x_sum = [1, 5])
+            gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)
+            @test gdf2 isa GroupedDataFrame{DataFrame}
+            @test gdf2.groups == 1:2
+            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5])
+        end
+
+        gdf = groupby(df, :g, sort=true, skipmissing=false)
+
+        @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
+              DataFrame(x_sum = [5, 1, 4])
+        @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
+        @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+              DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
+        gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true)
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == 1:3
+        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
+
+        @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+              DataFrame(x_sum = [5, 5, 1, 4], g = [1, 1, 3, missing])
+        @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+              DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4])
+        gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == [1, 1, 2, 3]
+        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4])
+
+        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
+              DataFrame(x_sum = [5, 1, 4])
+        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
+              DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
+        gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == 1:3
+        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
+
+        gdf = groupby(df, :g, sort=true, skipmissing=true)
+
+        @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
+              DataFrame(x_sum = [5, 1])
+        @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
+        @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+              DataFrame(g = [1, 3], x_sum = [5, 1])
+        gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true)
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == 1:2
+        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1])
+
+        @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+              DataFrame(x_sum = [5, 5, 1], g = [1, 1, 3])
+        @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+              DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1])
+        gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == [1, 1, 2]
+        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1])
+
+        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
+              DataFrame(x_sum = [5, 1])
+        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
+              DataFrame(g = [1, 3], x_sum = [5, 1])
+        gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == 1:2
+        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1])
+    end
+end
+
 end # module

From 8977017c0726c3035a01a35606b1b69fe6aadde9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Thu, 30 Apr 2020 00:28:50 +0200
Subject: [PATCH 14/29] finish tests and documentation

---
 docs/src/man/getting_started.md           |  30 ++-
 docs/src/man/split_apply_combine.md       | 148 ++++++++++--
 src/abstractdataframe/selection.jl        |  45 +++-
 src/dataframe/dataframe.jl                |   5 +-
 src/groupeddataframe/splitapplycombine.jl |  92 ++++++--
 test/grouping.jl                          | 276 ++++++++++++++++++++++
 test/select.jl                            | 201 ++++++----------
 7 files changed, 623 insertions(+), 174 deletions(-)

diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md
index 2ae81786cf..5cb011a4f8 100644
--- a/docs/src/man/getting_started.md
+++ b/docs/src/man/getting_started.md
@@ -792,7 +792,9 @@ julia> mean(df.A)
 2.5
 ```
 
-We can also apply a function to each column of a `DataFrame` using `select`. For example:
+We can also apply a function to each column of a `DataFrame` using `select`.
+`select` always returns the same number of rows in the result as the source
+data frame. For example:
 ```jldoctest dataframe
 julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0)
 4×2 DataFrame
@@ -805,13 +807,37 @@ julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0)
 │ 4   │ 4     │ 1.0     │
 
 julia> select(df, names(df) .=> sum)
-1×2 DataFrame
+4×2 DataFrame
 │ Row │ A_sum │ B_sum   │
 │     │ Int64 │ Float64 │
 ├─────┼───────┼─────────┤
 │ 1   │ 10    │ 10.0    │
+│ 2   │ 10    │ 10.0    │
+│ 3   │ 10    │ 10.0    │
+│ 4   │ 10    │ 10.0    │
 
 julia> select(df, names(df) .=> sum, names(df) .=> prod)
+4×4 DataFrame
+│ Row │ A_sum │ B_sum   │ A_prod │ B_prod  │
+│     │ Int64 │ Float64 │ Int64  │ Float64 │
+├─────┼───────┼─────────┼────────┼─────────┤
+│ 1   │ 10    │ 10.0    │ 24     │ 24.0    │
+│ 2   │ 10    │ 10.0    │ 24     │ 24.0    │
+│ 3   │ 10    │ 10.0    │ 24     │ 24.0    │
+│ 4   │ 10    │ 10.0    │ 24     │ 24.0    │
+```
+
+If instead you prefer to get a result collapsed to the number of rows returned
+by the applied functions, use the `combine` function:
+```
+julia> combine(df, names(df) .=> sum)
+1×2 DataFrame
+│ Row │ A_sum │ B_sum   │
+│     │ Int64 │ Float64 │
+├─────┼───────┼─────────┤
+│ 1   │ 10    │ 10.0    │
+
+julia> combine(df, names(df) .=> sum, names(df) .=> prod)
 1×4 DataFrame
 │ Row │ A_sum │ B_sum   │ A_prod │ B_prod  │
 │     │ Int64 │ Float64 │ Int64  │ Float64 │
diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
index 588f2b7fcc..99af65a2e6 100644
--- a/docs/src/man/split_apply_combine.md
+++ b/docs/src/man/split_apply_combine.md
@@ -6,10 +6,24 @@ framework for handling this sort of computation is described in the paper
 "[The Split-Apply-Combine Strategy for Data Analysis](http://www.jstatsoft.org/v40/i01)",
 written by Hadley Wickham.
 
-The DataFrames package supports the split-apply-combine strategy through the `by`
-function, which is a shorthand for `groupby` followed by `map` and/or `combine`.
-`by` takes in three arguments: (1) a `DataFrame`, (2) one or more columns to split
-the `DataFrame` on, and (3) a specification of one or more functions to apply to
+The DataFrames package supports the split-apply-combine strategy through the
+`combine`, `select`/`select!` and `transform`/`transform!` functions.
+
+In order to perform operations by groups you first need to create a `GroupedDataFrame`
+object from your data frame using the `groupby` function, which takes two arguments:
+(1) a data frame to be grouped, and (2) a set of columns to group by.
+
+The differences between the above functions are the following:
+* `select`: return a data frame with the number and order of rows exactly the same
+  as the source, preserve only columns that have been calculated;
+* `transform`: return a data frame with the number and order of rows exactly the same
+  as the source, preserve all columns from the source and columns that have been calculated;
+* `select!`: is an in-place version of `select`;
+* `transform!`: is an in-place version of `transform`;
+* `combine`: does not put restrictions on number of rows returned, the order of rows
+  is specified by the order of groups in `GroupedDataFrame`.
+
+All these functions take a specification of one or more functions to apply to
 each subset of the `DataFrame`. This specification can be of the following forms:
 1. standard column selectors (integers, symbols, vectors of integers, vectors of symbols,
    `All`, `:`, `Between`, `Not` and regular expressions)
@@ -27,19 +41,20 @@ each subset of the `DataFrame`. This specification can be of the following forms
    number of columns are processed (in which case `SubDataFrame` avoids excessive
    compilation)
 
-All forms except 1 and 6 can be also passed as the first argument to `map`.
-
 As a special rule that applies to `cols => function` syntax, if `cols` is wrapped
 in an `AsTable` object then a `NamedTuple` containing columns selected by `cols` is
 passed to `function`.
 
 In all of these cases, `function` can return either a single row or multiple rows.
 `function` can always generate a single column by returning a single value or a vector.
-Additionally, if `by` is passed exactly one `function` and `target_col` is not specified,
+Additionally, if `combine` is passed exactly one `function` as a first argument
+and `target_col` is not specified,
 `function` can return multiple columns in the form of an `AbstractDataFrame`,
 `AbstractMatrix`, `NamedTuple` or `DataFrameRow`.
 
-Here are the rules specifying the shape of the resulting `DataFrame`:
+Here are the rules specifying the shape of the resulting `DataFrame` in `combine`
+(in `select`/`select!` and `transform`/`transform!` the result has the number
+and order of rows equal to the source):
 - a single value produces a single row and column per group
 - a named tuple or `DataFrameRow` produces a single row and one column per field
 - a vector produces a single column with one row per entry
@@ -87,7 +102,51 @@ julia> iris = DataFrame(CSV.File(joinpath(dirname(pathof(DataFrames)), "../docs/
 │ 149 │ 6.2         │ 3.4        │ 5.4         │ 2.3        │ Iris-virginica │
 │ 150 │ 5.9         │ 3.0        │ 5.1         │ 1.8        │ Iris-virginica │
 
-julia> by(iris, :Species, :PetalLength => mean)
+julia> gdf = groupby(iris, :Species)
+GroupedDataFrame with 3 groups based on key: Species
+First Group (50 rows): Species = "Iris-setosa"
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species     │
+│     │ Float64     │ Float64    │ Float64     │ Float64    │ String      │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼─────────────┤
+│ 1   │ 5.1         │ 3.5        │ 1.4         │ 0.2        │ Iris-setosa │
+│ 2   │ 4.9         │ 3.0        │ 1.4         │ 0.2        │ Iris-setosa │
+│ 3   │ 4.7         │ 3.2        │ 1.3         │ 0.2        │ Iris-setosa │
+│ 4   │ 4.6         │ 3.1        │ 1.5         │ 0.2        │ Iris-setosa │
+│ 5   │ 5.0         │ 3.6        │ 1.4         │ 0.2        │ Iris-setosa │
+│ 6   │ 5.4         │ 3.9        │ 1.7         │ 0.4        │ Iris-setosa │
+│ 7   │ 4.6         │ 3.4        │ 1.4         │ 0.3        │ Iris-setosa │
+⋮
+│ 43  │ 4.4         │ 3.2        │ 1.3         │ 0.2        │ Iris-setosa │
+│ 44  │ 5.0         │ 3.5        │ 1.6         │ 0.6        │ Iris-setosa │
+│ 45  │ 5.1         │ 3.8        │ 1.9         │ 0.4        │ Iris-setosa │
+│ 46  │ 4.8         │ 3.0        │ 1.4         │ 0.3        │ Iris-setosa │
+│ 47  │ 5.1         │ 3.8        │ 1.6         │ 0.2        │ Iris-setosa │
+│ 48  │ 4.6         │ 3.2        │ 1.4         │ 0.2        │ Iris-setosa │
+│ 49  │ 5.3         │ 3.7        │ 1.5         │ 0.2        │ Iris-setosa │
+│ 50  │ 5.0         │ 3.3        │ 1.4         │ 0.2        │ Iris-setosa │
+⋮
+Last Group (50 rows): Species = "Iris-virginica"
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species        │
+│     │ Float64     │ Float64    │ Float64     │ Float64    │ String         │
+├─────┼─────────────┼────────────┼─────────────┼────────────┼────────────────┤
+│ 1   │ 6.3         │ 3.3        │ 6.0         │ 2.5        │ Iris-virginica │
+│ 2   │ 5.8         │ 2.7        │ 5.1         │ 1.9        │ Iris-virginica │
+│ 3   │ 7.1         │ 3.0        │ 5.9         │ 2.1        │ Iris-virginica │
+│ 4   │ 6.3         │ 2.9        │ 5.6         │ 1.8        │ Iris-virginica │
+│ 5   │ 6.5         │ 3.0        │ 5.8         │ 2.2        │ Iris-virginica │
+│ 6   │ 7.6         │ 3.0        │ 6.6         │ 2.1        │ Iris-virginica │
+│ 7   │ 4.9         │ 2.5        │ 4.5         │ 1.7        │ Iris-virginica │
+⋮
+│ 43  │ 5.8         │ 2.7        │ 5.1         │ 1.9        │ Iris-virginica │
+│ 44  │ 6.8         │ 3.2        │ 5.9         │ 2.3        │ Iris-virginica │
+│ 45  │ 6.7         │ 3.3        │ 5.7         │ 2.5        │ Iris-virginica │
+│ 46  │ 6.7         │ 3.0        │ 5.2         │ 2.3        │ Iris-virginica │
+│ 47  │ 6.3         │ 2.5        │ 5.0         │ 1.9        │ Iris-virginica │
+│ 48  │ 6.5         │ 3.0        │ 5.2         │ 2.0        │ Iris-virginica │
+│ 49  │ 6.2         │ 3.4        │ 5.4         │ 2.3        │ Iris-virginica │
+│ 50  │ 5.9         │ 3.0        │ 5.1         │ 1.8        │ Iris-virginica │
+
+julia> combine(gdf, :PetalLength => mean)
 3×2 DataFrame
 │ Row │ Species         │ PetalLength_mean │
 │     │ String          │ Float64          │
@@ -96,7 +155,7 @@ julia> by(iris, :Species, :PetalLength => mean)
 │ 2   │ Iris-versicolor │ 4.26             │
 │ 3   │ Iris-virginica  │ 5.552            │
 
-julia> by(iris, :Species, nrow)
+julia> combine(gdf, nrow)
 3×2 DataFrame
 │ Row │ Species         │ nrow  │
 │     │ String          │ Int64 │
@@ -105,7 +164,7 @@ julia> by(iris, :Species, nrow)
 │ 2   │ Iris-versicolor │ 50    │
 │ 3   │ Iris-virginica  │ 50    │
 
-julia> by(iris, :Species, nrow, :PetalLength => mean => :mean)
+julia> combine(gdf, nrow, :PetalLength => mean => :mean)
 3×3 DataFrame
 │ Row │ Species         │ nrow  │ mean    │
 │     │ String          │ Int64 │ Float64 │
@@ -114,9 +173,8 @@ julia> by(iris, :Species, nrow, :PetalLength => mean => :mean)
 │ 2   │ Iris-versicolor │ 50    │ 4.26    │
 │ 3   │ Iris-virginica  │ 50    │ 5.552   │
 
-julia> by(iris, :Species,
-          [:PetalLength, :SepalLength] =>
-          (p, s) -> (a=mean(p)/mean(s), b=sum(p))) # multiple columns are passed as arguments
+julia> combine([:PetalLength, :SepalLength] => (p, s) -> (a=mean(p)/mean(s), b=sum(p)),
+               gdf) # multiple columns are passed as arguments
 3×3 DataFrame
 │ Row │ Species         │ a        │ b       │
 │     │ String          │ Float64  │ Float64 │
@@ -125,9 +183,9 @@ julia> by(iris, :Species,
 │ 2   │ Iris-versicolor │ 0.717655 │ 213.0   │
 │ 3   │ Iris-virginica  │ 0.842744 │ 277.6   │
 
-julia> by(iris, :Species,
-          AsTable([:PetalLength, :SepalLength]) =>
-          x -> std(x.PetalLength) / std(x.SepalLength)) # passing a NamedTuple
+julia> combine(gdf,
+               AsTable([:PetalLength, :SepalLength]) =>
+               x -> std(x.PetalLength) / std(x.SepalLength)) # passing a NamedTuple
 3×2 DataFrame
 │ Row │ Species         │ PetalLength_SepalLength_function │
 │     │ String          │ Float64                          │
@@ -136,7 +194,7 @@ julia> by(iris, :Species,
 │ 2   │ Iris-versicolor │ 0.910378                         │
 │ 3   │ Iris-virginica  │ 0.867923                         │
 
-julia> by(iris, :Species, 1:2 => cor, nrow)
+julia> combine(gdf, 1:2 => cor, nrow)
 3×3 DataFrame
 │ Row │ Species         │ SepalLength_SepalWidth_cor │ nrow  │
 │     │ String          │ Float64                    │ Int64 │
@@ -147,11 +205,61 @@ julia> by(iris, :Species, 1:2 => cor, nrow)
 
 ```
 
-The `by` function also supports the `do` block form. However, as noted above,
+If we use `select` or `transform` instead of `combine` we always obtain the number
+and order of rows in the result equal to the source. In the example below
+the return values in columns `:SepalLength_SepalWidth_cor` and `:nrow` are
+broadcasted to match the number of elements in each group:
+```
+julia> select(gdf, 1:2 => cor, nrow)
+150×3 DataFrame
+│ Row │ Species        │ SepalLength_SepalWidth_cor │ nrow  │
+│     │ String         │ Float64                    │ Int64 │
+├─────┼────────────────┼────────────────────────────┼───────┤
+│ 1   │ Iris-setosa    │ 0.74678                    │ 50    │
+│ 2   │ Iris-setosa    │ 0.74678                    │ 50    │
+│ 3   │ Iris-setosa    │ 0.74678                    │ 50    │
+│ 4   │ Iris-setosa    │ 0.74678                    │ 50    │
+│ 5   │ Iris-setosa    │ 0.74678                    │ 50    │
+│ 6   │ Iris-setosa    │ 0.74678                    │ 50    │
+│ 7   │ Iris-setosa    │ 0.74678                    │ 50    │
+⋮
+│ 143 │ Iris-virginica │ 0.457228                   │ 50    │
+│ 144 │ Iris-virginica │ 0.457228                   │ 50    │
+│ 145 │ Iris-virginica │ 0.457228                   │ 50    │
+│ 146 │ Iris-virginica │ 0.457228                   │ 50    │
+│ 147 │ Iris-virginica │ 0.457228                   │ 50    │
+│ 148 │ Iris-virginica │ 0.457228                   │ 50    │
+│ 149 │ Iris-virginica │ 0.457228                   │ 50    │
+│ 150 │ Iris-virginica │ 0.457228                   │ 50    │
+
+julia> transform(gdf, nrow)
+150×6 DataFrame
+│ Row │ Species        │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ nrow  │
+│     │ String         │ Float64     │ Float64    │ Float64     │ Float64    │ Int64 │
+├─────┼────────────────┼─────────────┼────────────┼─────────────┼────────────┼───────┤
+│ 1   │ Iris-setosa    │ 5.1         │ 3.5        │ 1.4         │ 0.2        │ 50    │
+│ 2   │ Iris-setosa    │ 4.9         │ 3.0        │ 1.4         │ 0.2        │ 50    │
+│ 3   │ Iris-setosa    │ 4.7         │ 3.2        │ 1.3         │ 0.2        │ 50    │
+│ 4   │ Iris-setosa    │ 4.6         │ 3.1        │ 1.5         │ 0.2        │ 50    │
+│ 5   │ Iris-setosa    │ 5.0         │ 3.6        │ 1.4         │ 0.2        │ 50    │
+│ 6   │ Iris-setosa    │ 5.4         │ 3.9        │ 1.7         │ 0.4        │ 50    │
+│ 7   │ Iris-setosa    │ 4.6         │ 3.4        │ 1.4         │ 0.3        │ 50    │
+⋮
+│ 143 │ Iris-virginica │ 5.8         │ 2.7        │ 5.1         │ 1.9        │ 50    │
+│ 144 │ Iris-virginica │ 6.8         │ 3.2        │ 5.9         │ 2.3        │ 50    │
+│ 145 │ Iris-virginica │ 6.7         │ 3.3        │ 5.7         │ 2.5        │ 50    │
+│ 146 │ Iris-virginica │ 6.7         │ 3.0        │ 5.2         │ 2.3        │ 50    │
+│ 147 │ Iris-virginica │ 6.3         │ 2.5        │ 5.0         │ 1.9        │ 50    │
+│ 148 │ Iris-virginica │ 6.5         │ 3.0        │ 5.2         │ 2.0        │ 50    │
+│ 149 │ Iris-virginica │ 6.2         │ 3.4        │ 5.4         │ 2.3        │ 50    │
+│ 150 │ Iris-virginica │ 5.9         │ 3.0        │ 5.1         │ 1.8        │ 50    │
+```
+
+The `combine` function also supports the `do` block form. However, as noted above,
 this form is slow and should therefore be avoided when performance matters.
 
 ```jldoctest sac
-julia> by(iris, :Species) do df
+julia> combine(gdf) do df
            (m = mean(df.PetalLength), s² = var(df.PetalLength))
        end
 3×3 DataFrame
diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index 52ea1b71cd..dec34a49c5 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -277,6 +277,7 @@ SELECT_ARG_RULES =
     select!(df::DataFrame, args...)
 
 Mutate `df` in place to retain only columns specified by `args...` and return it.
+The result is guaranteed to have the same number of rows as `df`.
 
 $SELECT_ARG_RULES
 
@@ -330,11 +331,13 @@ julia> df = DataFrame(a=1:3, b=4:6);
 julia> select!(df, names(df) .=> sum);
 
 julia> df
-1×2 DataFrame
+3×2 DataFrame
 │ Row │ a_sum │ b_sum │
 │     │ Int64 │ Int64 │
 ├─────┼───────┼───────┤
 │ 1   │ 6     │ 15    │
+│ 2   │ 6     │ 15    │
+│ 3   │ 6     │ 15    │
 
 julia> df = DataFrame(a=1:3, b=4:6);
 
@@ -403,6 +406,7 @@ end
     transform!(df::DataFrame, args...)
 
 Mutate `df` in place to add columns specified by `args...` and return it.
+The result is guaranteed to have the same number of rows as `df`.
 Equivalent to `select!(df, :, args...)`.
 
 See [`select!`](@ref) for detailed rules regarding accepted values for `args`.
@@ -413,7 +417,7 @@ transform!(df::DataFrame, args...) = select!(df, :, args...)
     select(df::AbstractDataFrame, args...; copycols::Bool=true)
 
 Create a new data frame that contains columns from `df` specified by `args` and
-return it.
+return it. The result is guaranteed to have the same number of rows as `df`.
 
 If `df` is a `DataFrame` or `copycols=true` then column renaming and transformations
 are supported.
@@ -500,18 +504,22 @@ julia> select(df, :, [:a, :b] => (a,b) -> a .+ b .- sum(b)/length(b))
 │ 3   │ 3     │ 6     │ 4.0          │
 
 julia> select(df, names(df) .=> sum)
-1×2 DataFrame
+3×2 DataFrame
 │ Row │ a_sum │ b_sum │
 │     │ Int64 │ Int64 │
 ├─────┼───────┼───────┤
 │ 1   │ 6     │ 15    │
+│ 2   │ 6     │ 15    │
+│ 3   │ 6     │ 15    │
 
 julia> select(df, names(df) .=> sum .=> [:A, :B])
-1×2 DataFrame
+3×2 DataFrame
 │ Row │ A     │ B     │
 │     │ Int64 │ Int64 │
 ├─────┼───────┼───────┤
 │ 1   │ 6     │ 15    │
+│ 2   │ 6     │ 15    │
+│ 3   │ 6     │ 15    │
 
 julia> select(df, AsTable(:) => ByRow(mean))
 3×1 DataFrame
@@ -532,6 +540,7 @@ select(df::AbstractDataFrame, args...; copycols::Bool=true) =
 
 Create a new data frame that contains columns from `df` and adds columns
 specified by `args` and return it.
+The result is guaranteed to have the same number of rows as `df`.
 Equivalent to `select(df, :, args..., copycols=copycols)`.
 
 See [`select`](@ref) for detailed rules regarding accepted values for `args`.
@@ -539,6 +548,34 @@ See [`select`](@ref) for detailed rules regarding accepted values for `args`.
 transform(df::AbstractDataFrame, args...; copycols::Bool=true) =
     select(df, :, args..., copycols=copycols)
 
+"""
+    combine(df::AbstractDataFrame, args...)
+
+Create a new data frame that contains columns from `df` specified by `args` and
+return it. The result can have any number of rows that is determined by the
+passed transformations.
+
+See [`select`](@ref) for detailed rules regarding accepted values for `args`.
+
+# Examples
+```jldoctest
+julia> df = DataFrame(a=1:3, b=4:6)
+3×2 DataFrame
+│ Row │ a     │ b     │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 1     │ 4     │
+│ 2   │ 2     │ 5     │
+│ 3   │ 3     │ 6     │
+
+julia> combine(df, :a => sum, nrow)
+1×2 DataFrame
+│ Row │ a_sum │ nrow  │
+│     │ Int64 │ Int64 │
+├─────┼───────┼───────┤
+│ 1   │ 6     │ 3     │
+```
+"""
 combine(df::AbstractDataFrame, args...) =
     _manipulate(df, args..., copycols=true, keeprows=false)
 
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
index 42864a3bd2..3d1fa9eff6 100644
--- a/src/dataframe/dataframe.jl
+++ b/src/dataframe/dataframe.jl
@@ -31,7 +31,7 @@ DataFrame(column_eltypes::AbstractVector, names::AbstractVector{<:AbstractString
 DataFrame(ds::AbstractDict; copycols::Bool=true)
 DataFrame(table; makeunique::Bool=false, copycols::Bool=true)
 DataFrame(::Union{DataFrame, SubDataFrame}; copycols::Bool=true)
-DataFrame(::GroupedDataFrame)
+DataFrame(::GroupedDataFrame; keepkeys::Bool=true)
 ```
 
 # Arguments
@@ -65,6 +65,9 @@ to fill a new vector of the appropriate length. As a particular rule values
 stored in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and treated
 in the same way.
 
+Additionally, `DataFrame` can be used to collect a [`GroupedDataFrame`](@ref)
+into a `DataFrame`.
+
 # Notes
 The `DataFrame` constructor by default copies all columns vectors passed to it.
 Pass `copycols=false` to reuse vectors without copying them
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 9d0df8791f..91ce4a0e91 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -8,9 +8,10 @@ into row groups.
 - `df` : an `AbstractDataFrame` to split
 - `cols` : data frame columns to group by. Can be any column selector
   ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
-- `sort` : whether to sort rows according to the values of the grouping columns
-  `cols`
-- `skipmissing` : whether to skip rows with `missing` values in one of the
+- `sort` : whether to sort groups according to the values of the grouping columns
+  `cols`; if all `cols` are `CategoricalVector` then groups are always sorted
+  irrespective of the value of `sort`
+- `skipmissing` : whether to skip groups with `missing` values in one of the
   grouping columns `cols`
 
 # Details
@@ -217,7 +218,6 @@ const F_ARGUMENT_RULES =
     * Column transformation operations using the `Pair` notation that is described below
       and vectors of such pairs.
 
-
     Transformations allowed using `Pair`s follow the rules specified
     for [`select`](@ref) and have the form `source_cols => fun`,
     `source_cols => fun => target_col`, or `source_col => target_col`.
@@ -251,31 +251,33 @@ const KWARG_PROCESSING_RULES =
     in addition to those generated. In this case if the returned
     value contains columns with the same names as the grouping columns, they are
     required to be equal.
+
+    If `regroup=false` the returned value is a `DataFrame`; if `regroup=true` it is a
+    `GroupedDataFrame` grouped using `keycols(gdf)`.
     """
 
 """
-    combine(gd::GroupedDataFrame, args...; keepkeys::Bool=true)
-    combine(fun::Union{Function, Type}, gd::GroupedDataFrame; keepkeys::Bool=true)
-    combine(pair::Pair, gd::GroupedDataFrame; keepkeys::Bool=true)
-    combine(gd::GroupedDataFrame, fun::Union{Function, Type}; keepkeys::Bool=true)
-    combine(gd::GroupedDataFrame, pair::Pair; keepkeys::Bool=true)
+    combine(gd::GroupedDataFrame, args...; keepkeys::Bool=true, regroup::Bool=false)
+    combine(fun::Union{Function, Type}, gd::GroupedDataFrame;
+            keepkeys::Bool=true, regroup::Bool=false)
+    combine(pair::Pair, gd::GroupedDataFrame; keepkeys::Bool=true, regroup::Bool=false)
+    combine(fun::Union{Function, Type}, df::AbstractDataFrame)
+    combine(pair::Pair, df::AbstractDataFrame)
 
 Transform a [`GroupedDataFrame`](@ref) into a `DataFrame`.
 
+As a special case if `combine` is passed an `AbstractDataFrame` it applies `fun`
+or `pair` to the passed data frame as a whole.
+
 $F_ARGUMENT_RULES
 
 $F_TYPE_RULES
 
 $KWARG_PROCESSING_RULES
 
-The resulting data frame will be sorted if `sort=true` was passed to the
+The resulting data frame will be sorted by `keycols(gd)` if `sort=true` was passed to the
 [`groupby`](@ref) call from which `gd` was constructed. Otherwise, ordering of rows
-is undefined.
-
-See also:
-- [`by(f, df, cols)`](@ref) is a shorthand for `combine(f, groupby(df, cols))`.
-- [`map`](@ref): `combine(f, groupby(df, cols))` is a more efficient equivalent
-  of `combine(map(f, groupby(df, cols)))`.
+follows the order of groups in `gd`.
 
 # Examples
 ```jldoctest
@@ -295,6 +297,20 @@ julia> combine(gd, :c => sum, nrow)
 │ 3   │ 3     │ 10    │ 2     │
 │ 4   │ 4     │ 12    │ 2     │
 
+julia> combine(gd, :c => sum, nrow, regroup=true)
+GroupedDataFrame with 4 groups based on key: a
+First Group (1 row): a = 1
+│ Row │ a     │ c_sum │ nrow  │
+│     │ Int64 │ Int64 │ Int64 │
+├─────┼───────┼───────┼───────┤
+│ 1   │ 1     │ 6     │ 2     │
+⋮
+Last Group (1 row): a = 4
+│ Row │ a     │ c_sum │ nrow  │
+│     │ Int64 │ Int64 │ Int64 │
+├─────┼───────┼───────┼───────┤
+│ 1   │ 4     │ 12    │ 2     │
+
 julia> combine(sdf -> sum(sdf.c), gd) # Slower variant
 4×2 DataFrame
 │ Row │ a     │ x1    │
@@ -305,7 +321,7 @@ julia> combine(sdf -> sum(sdf.c), gd) # Slower variant
 │ 3   │ 3     │ 10    │
 │ 4   │ 4     │ 12    │
 
-julia> by(df, :a) do d # do syntax for the slower variant
+julia> combine(gd) do d # do syntax for the slower variant
            sum(d.c)
        end
 4×2 DataFrame
@@ -415,8 +431,6 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum)
 │ 7   │ 4     │ 1     │ 4     │ 5       │
 │ 8   │ 4     │ 1     │ 8     │ 9       │
 ```
-
-See [`by`](@ref) for more examples.
 """
 function combine(f::Base.Callable, gd::GroupedDataFrame;
                  keepkeys::Bool=true, regroup::Bool=false)
@@ -575,11 +589,11 @@ function combine_helper(f, gd::GroupedDataFrame,
         if keeprows
             newparent = hcat!(select(parent(gd), gd.cols, copycols=copycols),
                               select(valscat, Not(intersect(keys, _names(valscat))),
-                                     copycols=false))
+                                     copycols=false), copycols=false)
         else
             newparent = hcat!(parent(gd)[idx, gd.cols],
                               select(valscat, Not(intersect(keys, _names(valscat))),
-                                     copycols=false))
+                                     copycols=false), copycols=false)
         end
         regroup || return newparent
 
@@ -1022,7 +1036,7 @@ function _combine(f::AbstractVector{<:Pair},
     if keeprows
         if !_check_cannonical(gd)
             throw(ArgumentError("select or transform functions require that " *
-                                "GroupedDataFrame is not sorted or subsetted"))
+                                "GroupedDataFrame is not subsetted"))
         end
         idx_keeprows = Vector{Int}(undef, nrow(parent(gd)))
         let i = 0
@@ -1153,7 +1167,7 @@ function _combine(f::AbstractVector{<:Pair},
     # this check is redundant given we check idx above
     # but it is safer to double check and it is cheap
     @assert all(x -> length(x) == length(outcols[1]), outcols)
-    return idx, DataFrame(collect(AbstractVector, outcols), nms)
+    return idx, DataFrame!(collect(AbstractVector, outcols), nms)
 end
 
 function _combine(fun::Base.Callable, gd::GroupedDataFrame, ::Nothing,
@@ -1425,16 +1439,39 @@ function _combine_tables_with_first!(first::Union{AbstractDataFrame,
     return outcols, colnames
 end
 
+"""
+    select(gd::GroupedDataFrame, args...;
+           copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false)
+
+Apply `args` to `gd` following the rules described in [`combine`](@ref).
+Ensure that the return value has number of rows equal to `nrow(parent(gd))`.
+
+If `copycols=false` then do not perform copying of columns that are not transformed.
+"""
 select(gd::GroupedDataFrame, args...;
        copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) =
     _combine_executor(gd, args..., copycols=copycols, keepkeys=keepkeys,
                       regroup=regroup, keeprows=true)
 
-DataFrames.transform(gd::GroupedDataFrame, args...;
+"""
+    transform(gd::GroupedDataFrame, args...;
+              copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false)
+
+An equivalent of
+`select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, regroup=regroup)`
+"""
+transform(gd::GroupedDataFrame, args...;
           copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) =
     select(gd, :, args..., copycols=copycols, keepkeys=keepkeys,
            regroup=regroup)
 
+"""
+    select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false)
+
+An equivalent of
+`select(gd, args..., copycols=false, keepkeys=true, regroup=regroup)`
+but updates the `parent(gd)` in place.
+"""
 function select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false)
     newdf = select(gd, args..., copycols=false, regroup=false)
     df = parent(gd)
@@ -1448,5 +1485,12 @@ function select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false)
     return regroup ? gd : df
 end
 
+"""
+    transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false)
+
+An equivalent of
+`transform(gd, args..., copycols=false, keepkeys=true, regroup=regroup)`
+but updates the `parent(gd)` in place.
+"""
 transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) =
     select!(gd, :, args..., regroup=regroup)
diff --git a/test/grouping.jl b/test/grouping.jl
index a0285dac76..2b4a5feba8 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -1920,4 +1920,280 @@ end
     @test select(gdf2, :g => sum) == combine(gdf2, :g => sum, :g)
 end
 
+@testset "combine GroupedDataFrame" begin
+    for df in (DataFrame(g=[3,1,1,missing],x=1:4, y=5:8),
+               DataFrame(g=categorical([3,1,1,missing]),x=1:4, y=5:8))
+        if !(df.g isa CategoricalVector)
+            gdf = groupby_checked(df, :g, sort=false, skipmissing=false)
+
+            @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
+                  DataFrame(x_sum = [1, 5, 4])
+            @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
+            @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+                  DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
+            gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true))
+            @test gdf2 isa GroupedDataFrame{DataFrame}
+            @test gdf2.groups == 1:3
+            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
+            @test DataFrame(gdf2, keepkeys=false) == DataFrame(x_sum = [1, 5, 4])
+
+            @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+                  DataFrame(x_sum = [1, 5, 5, 4], g = [3, 1, 1, missing])
+            @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+                  DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4])
+            gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true))
+            @test gdf2 isa GroupedDataFrame{DataFrame}
+            @test gdf2.groups == [1, 2, 2, 3]
+            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4])
+            @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 5, 4])
+
+            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
+                  DataFrame(x_sum = [1, 5, 4])
+            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
+                  DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
+            gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true))
+            @test gdf2 isa GroupedDataFrame{DataFrame}
+            @test gdf2.groups == 1:3
+            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
+            @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 4])
+
+            gdf = groupby_checked(df, :g, sort=false, skipmissing=true)
+
+            @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
+                  DataFrame(x_sum = [1, 5])
+            @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
+            @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+                  DataFrame(g = [3, 1], x_sum = [1, 5])
+            gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true))
+            @test gdf2 isa GroupedDataFrame{DataFrame}
+            @test gdf2.groups == 1:2
+            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5])
+            @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5])
+
+            @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+                  DataFrame(x_sum = [1, 5, 5], g = [3, 1, 1])
+            @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+                  DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5])
+            gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true))
+            @test gdf2 isa GroupedDataFrame{DataFrame}
+            @test gdf2.groups == [1, 2, 2]
+            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5])
+            @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 5])
+
+            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
+                  DataFrame(x_sum = [1, 5])
+            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
+                  DataFrame(g = [3, 1], x_sum = [1, 5])
+            gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true))
+            @test gdf2 isa GroupedDataFrame{DataFrame}
+            @test gdf2.groups == 1:2
+            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5])
+            @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5])
+        end
+
+        gdf = groupby_checked(df, :g, sort=true, skipmissing=false)
+
+        @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
+              DataFrame(x_sum = [5, 1, 4])
+        @test_throws ArgumentError validate_gdf(combine(gdf, :x => sum, keepkeys=false, regroup=true))
+        @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+              DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
+        gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true))
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == 1:3
+        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
+        @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1, 4])
+
+        @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+              DataFrame(x_sum = [5, 5, 1, 4], g = [1, 1, 3, missing])
+        @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+              DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4])
+        gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true))
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == [1, 1, 2, 3]
+        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4])
+        @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 5, 1, 4])
+
+        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
+              DataFrame(x_sum = [5, 1, 4])
+        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
+              DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
+        gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true))
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == 1:3
+        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
+        @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1, 4])
+
+        gdf = groupby_checked(df, :g, sort=true, skipmissing=true)
+
+        @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
+              DataFrame(x_sum = [5, 1])
+        @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
+        @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+              DataFrame(g = [1, 3], x_sum = [5, 1])
+        gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true))
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == 1:2
+        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1])
+        @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1])
+
+        @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+              DataFrame(x_sum = [5, 5, 1], g = [1, 1, 3])
+        @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+              DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1])
+        gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true))
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == [1, 1, 2]
+        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1])
+        @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 5, 1])
+
+        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
+              DataFrame(x_sum = [5, 1])
+        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
+              DataFrame(g = [1, 3], x_sum = [5, 1])
+        gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true))
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == 1:2
+        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1])
+        @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1])
+    end
+end
+
+@testset "select and transform GroupedDataFrame" begin
+    for df in (DataFrame(g=[3,1,1,missing],x=1:4, y=5:8),
+               DataFrame(g=categorical([3,1,1,missing]),x=1:4, y=5:8)),
+        dosort in (true, false)
+
+        gdf = groupby_checked(df, :g, sort=dosort, skipmissing=false)
+
+        @test select(gdf, :x => sum, keepkeys=false, regroup=false) ==
+              DataFrame(x_sum = [1, 5, 5, 4])
+        @test_throws ArgumentError select(gdf, :x => sum, keepkeys=false, regroup=true)
+        @test select(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+              DataFrame(g = df.g, x_sum = [1, 5, 5, 4])
+        gdf2 = validate_gdf(select(gdf, :x => sum, keepkeys=true, regroup=true))
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == gdf.groups
+        @test parent(gdf2).g ≅ df.g
+        @test parent(gdf2).g !== df.g
+
+        @test select(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+              DataFrame(x_sum = [1, 5, 5, 4], g = df.g)
+        @test select(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+              DataFrame(g = df.g, x_sum = [1, 5, 5, 4])
+        gdf2 = validate_gdf(select(gdf, :x => sum, :g, keepkeys=true, regroup=true))
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == gdf.groups
+        @test parent(gdf2).g ≅ df.g
+        @test parent(gdf2).g !== df.g
+
+        @test transform(gdf, :x => sum, keepkeys=false, regroup=false) ≅
+              [df DataFrame(x_sum = [1, 5, 5, 4])]
+        @test_throws ArgumentError transform(gdf, :x => sum, keepkeys=false, regroup=true)
+        @test transform(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+              DataFrame(g = df.g, x = df.x, y = df.y, x_sum = [1, 5, 5, 4])
+        gdf2 = validate_gdf(transform(gdf, :x => sum, keepkeys=true, regroup=true))
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == gdf.groups
+        @test parent(gdf2).g ≅ df.g
+        @test parent(gdf2).x ≅ df.x
+        @test parent(gdf2).y ≅ df.y
+        @test parent(gdf2).g !== df.g
+
+        @test transform(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+              [df DataFrame(x_sum = [1, 5, 5, 4])]
+        @test transform(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+              [df DataFrame(x_sum = [1, 5, 5, 4])]
+        gdf2 = validate_gdf(transform(gdf, :x => sum, :g, keepkeys=true, regroup=true))
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == gdf.groups
+        @test parent(gdf2).g ≅ df.g
+        @test parent(gdf2).x ≅ df.x
+        @test parent(gdf2).y ≅ df.y
+        @test parent(gdf2).g !== df.g
+
+        df2 = transform(gdf, :x => sum, :g, keepkeys=false, regroup=false, copycols=false)
+        @test df2 ≅ [df DataFrame(x_sum = [1, 5, 5, 4])]
+        @test df2.g === df.g
+        @test df2.x === df.x
+        @test df2.y === df.y
+        df2 = transform(gdf, :x => sum, :g, keepkeys=true, regroup=false, copycols=false)
+        @test df2 ≅ [df DataFrame(x_sum = [1, 5, 5, 4])]
+        @test df2.g === df.g
+        @test df2.x === df.x
+        @test df2.y === df.y
+        gdf2 = validate_gdf(transform(gdf, :x => sum, :g, keepkeys=true, regroup=true, copycols=false))
+        @test gdf2 isa GroupedDataFrame{DataFrame}
+        @test gdf2.groups == gdf.groups
+        @test parent(gdf2).g ≅ df.g
+        @test parent(gdf2).x ≅ df.x
+        @test parent(gdf2).y ≅ df.y
+        @test parent(gdf2).g === df.g
+
+        gdf = groupby_checked(df, :g, sort=dosort, skipmissing=true)
+        @test_throws ArgumentError select(gdf, :x => sum)
+        @test_throws ArgumentError select(gdf, :x => sum, regroup=true)
+        @test_throws ArgumentError transform(gdf, :x => sum)
+        @test_throws ArgumentError transform(gdf, :x => sum, regroup=true)
+    end
+end
+
+@testset "select! and transform! GroupedDataFrame" begin
+    for df in (DataFrame(g=[3,1,1,missing],x=1:4, y=5:8),
+               DataFrame(g=categorical([3,1,1,missing]),x=1:4, y=5:8)),
+        dosort in (true, false)
+
+        @test_throws MethodError select!(groupby_checked(view(df, :, :), :g), :x)
+        @test_throws MethodError transform!(groupby_checked(view(df, :, :), :g), :x)
+
+        dfc = copy(df)
+        g = dfc.g
+        gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false)
+        @test select!(gdf, :x => sum) === dfc
+        @test dfc.g === g
+        @test dfc.x_sum == [1, 5, 5, 4]
+        @test propertynames(dfc) == [:g, :x_sum]
+
+        dfc = copy(df)
+        g = dfc.g
+        x = dfc.x
+        y = dfc.y
+        gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false)
+        @test transform!(gdf, :g => first => :g, :x => first) === dfc
+        @test dfc.g === g
+        @test dfc.x === x
+        @test dfc.y === y
+        @test dfc.x_first == [1, 2, 2, 4]
+        @test propertynames(dfc) == [:g, :x, :y, :x_first]
+
+        dfc = copy(df)
+        g = dfc.g
+        gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false)
+        @test select!(gdf, :x => sum, regroup=true) === gdf
+        @test dfc.g === g
+        @test dfc.x_sum == [1, 5, 5, 4]
+        @test propertynames(dfc) == [:g, :x_sum]
+
+        dfc = copy(df)
+        g = dfc.g
+        x = dfc.x
+        y = dfc.y
+        gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false)
+        @test transform!(gdf, :g => first => :g, :x => first, regroup=true) === gdf
+        @test dfc.g === g
+        @test dfc.x === x
+        @test dfc.y === y
+        @test dfc.x_first == [1, 2, 2, 4]
+        @test propertynames(dfc) == [:g, :x, :y, :x_first]
+
+        dfc = copy(df)
+        gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=true)
+        @test_throws ArgumentError select!(gdf, :x => sum)
+        @test_throws ArgumentError select!(gdf, :x => sum, regroup=true)
+        @test_throws ArgumentError transform!(gdf, :x => sum)
+        @test_throws ArgumentError transform!(gdf, :x => sum, regroup=true)
+        @test dfc ≅ df
+    end
+end
+
 end # module
diff --git a/test/select.jl b/test/select.jl
index 9a7a7a0784..a63a30e329 100644
--- a/test/select.jl
+++ b/test/select.jl
@@ -1200,130 +1200,85 @@ end
           DataFrame(y_function = [4, 5], x_sum=[3, 3])
 end
 
-@testset "combine GroupedDataFrame" begin
-    for df in (DataFrame(g=[3,1,1,missing],x=1:4, y=5:8),
-               DataFrame(g=categorical([3,1,1,missing]),x=1:4, y=5:8))
-        if !(df.g isa CategoricalVector)
-            gdf = groupby(df, :g, sort=false, skipmissing=false)
-            @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
-                  DataFrame(x_sum = [1, 5, 4])
-            @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
-            @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
-                  DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
-            gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true)
-            @test gdf2 isa GroupedDataFrame{DataFrame}
-            @test gdf2.groups == 1:3
-            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
-
-            @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
-                  DataFrame(x_sum = [1, 5, 5, 4], g = [3, 1, 1, missing])
-            @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
-                  DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4])
-            gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)
-            @test gdf2 isa GroupedDataFrame{DataFrame}
-            @test gdf2.groups == [1, 2, 2, 3]
-            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4])
-
-            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
-                  DataFrame(x_sum = [1, 5, 4])
-            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
-                  DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
-            gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)
-            @test gdf2 isa GroupedDataFrame{DataFrame}
-            @test gdf2.groups == 1:3
-            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
-
-            gdf = groupby(df, :g, sort=false, skipmissing=true)
-
-            @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
-                  DataFrame(x_sum = [1, 5])
-            @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
-            @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
-                  DataFrame(g = [3, 1], x_sum = [1, 5])
-            gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true)
-            @test gdf2 isa GroupedDataFrame{DataFrame}
-            @test gdf2.groups == 1:2
-            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5])
-
-            @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
-                  DataFrame(x_sum = [1, 5, 5], g = [3, 1, 1])
-            @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
-                  DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5])
-            gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)
-            @test gdf2 isa GroupedDataFrame{DataFrame}
-            @test gdf2.groups == [1, 2, 2]
-            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5])
-
-            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
-                  DataFrame(x_sum = [1, 5])
-            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
-                  DataFrame(g = [3, 1], x_sum = [1, 5])
-            gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)
-            @test gdf2 isa GroupedDataFrame{DataFrame}
-            @test gdf2.groups == 1:2
-            @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5])
-        end
+@testset "select and transform AbstractDataFrame" begin
+    df = DataFrame(x=1:3, y=4:6)
+    @test select(df, :x => first) == DataFrame(x_first=fill(1,3))
+    df2 = select(df, :x, :x => first, copycols=true)
+    @test df2 == DataFrame(x=df.x, x_first=fill(1,3))
+    @test df2.x !== df.x
+    df2 = select(df, :x, :x => first, copycols=false)
+    @test df2 == DataFrame(x=df.x, x_first=fill(1,3))
+    @test df2.x === df.x
+    @test_throws ArgumentError select(df, :x => x -> [first(x)], copycols=true)
+    @test_throws ArgumentError select(df, :x => x -> [first(x)], copycols=false)
+
+    df2 = transform(df, :x => first, copycols=true)
+    @test df2 == [df DataFrame(x_first=fill(1,3))]
+    @test df2.x !== df.x
+    @test df2.y !== df.y
+    df2 = transform(df, :x => first, copycols=false)
+    @test df2 == [df DataFrame(x_first=fill(1,3))]
+    @test df2.x === df.x
+    @test df2.y === df.y
+    @test transform(df, names(df) .=> first .=> names(df)) ==
+          DataFrame(x=fill(1, 3), y=fill(4, 3))
+    @test_throws ArgumentError transform(df, :x => x -> [first(x)], copycols=true)
+    @test_throws ArgumentError transform(df, :x => x -> [first(x)], copycols=false)
 
-        gdf = groupby(df, :g, sort=true, skipmissing=false)
-
-        @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
-              DataFrame(x_sum = [5, 1, 4])
-        @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
-        @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
-              DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
-        gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true)
-        @test gdf2 isa GroupedDataFrame{DataFrame}
-        @test gdf2.groups == 1:3
-        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
-
-        @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
-              DataFrame(x_sum = [5, 5, 1, 4], g = [1, 1, 3, missing])
-        @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
-              DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4])
-        gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)
-        @test gdf2 isa GroupedDataFrame{DataFrame}
-        @test gdf2.groups == [1, 1, 2, 3]
-        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4])
-
-        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
-              DataFrame(x_sum = [5, 1, 4])
-        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
-              DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
-        gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)
-        @test gdf2 isa GroupedDataFrame{DataFrame}
-        @test gdf2.groups == 1:3
-        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
-
-        gdf = groupby(df, :g, sort=true, skipmissing=true)
-
-        @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
-              DataFrame(x_sum = [5, 1])
-        @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
-        @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
-              DataFrame(g = [1, 3], x_sum = [5, 1])
-        gdf2 = combine(gdf, :x => sum, keepkeys=true, regroup=true)
-        @test gdf2 isa GroupedDataFrame{DataFrame}
-        @test gdf2.groups == 1:2
-        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1])
-
-        @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
-              DataFrame(x_sum = [5, 5, 1], g = [1, 1, 3])
-        @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
-              DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1])
-        gdf2 = combine(gdf, :x => sum, :g, keepkeys=true, regroup=true)
-        @test gdf2 isa GroupedDataFrame{DataFrame}
-        @test gdf2.groups == [1, 1, 2]
-        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1])
-
-        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
-              DataFrame(x_sum = [5, 1])
-        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
-              DataFrame(g = [1, 3], x_sum = [5, 1])
-        gdf2 = combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true)
-        @test gdf2 isa GroupedDataFrame{DataFrame}
-        @test gdf2.groups == 1:2
-        @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1])
-    end
+    dfv = view(df, [2, 1], [2, 1])
+    @test select(dfv, :x => first) == DataFrame(x_first=fill(2,2))
+    df2 = select(dfv, :x, :x => first, copycols=true)
+    @test df2 == DataFrame(x=dfv.x, x_first=fill(2,2))
+    @test df2.x !== dfv.x
+    @test_throws ArgumentError select(dfv, :x, :x => first, copycols=false)
+    @test_throws ArgumentError select(dfv, :x => x -> [first(x)], copycols=true)
+    @test_throws ArgumentError select(dfv, :x => x -> [first(x)], copycols=false)
+
+    df2 = transform(dfv, :x => first, copycols=true)
+    @test df2 == [dfv DataFrame(x_first=fill(2,2))]
+    @test df2.x !== dfv.x
+    @test df2.y !== dfv.y
+    @test_throws ArgumentError transform(dfv, :x => first, copycols=false)
+    @test transform(dfv, names(dfv) .=> first .=> names(dfv)) ==
+          DataFrame(y=fill(5, 2), x=fill(2, 2))
+    @test_throws ArgumentError transform(dfv, :x => x -> [first(x)], copycols=true)
+    @test_throws ArgumentError transform(dfv, :x => x -> [first(x)], copycols=false)
+end
+
+@testset "select! and transform! AbstractDataFrame" begin
+    df = DataFrame(x=1:3, y=4:6)
+    select!(df, :x => first)
+    @test df == DataFrame(x_first = fill(1,3))
+
+    # if we select! we do copycols=false, so we can get aliases
+    df = DataFrame(x=1:3, y=4:6)
+    x = df.x
+    select!(df, :x => (x->x), :x)
+    @test x === df.x_function === df.x
+
+    df = DataFrame(x=1:3, y=4:6)
+    @test_throws ArgumentError select!(df, :x => x -> [1])
+    @test df == DataFrame(x=1:3, y=4:6)
+
+    df = DataFrame(x=1:3, y=4:6)
+    x = df.x
+    y = df.y
+    transform!(df, :x => first)
+    @test df == DataFrame(x=x, y=y, x_first=fill(1,3))
+    @test df.x === x
+    @test df.y === y
+
+    df = DataFrame(x=1:3, y=4:6)
+    transform!(df, names(df) .=> first .=> names(df))
+    @test df == DataFrame(x=fill(1,3), y=fill(4,3))
+
+    df = DataFrame(x=1:3, y=4:6)
+    @test_throws ArgumentError transform!(df, :x => x -> [1])
+    @test df == DataFrame(x=1:3, y=4:6)
+
+    dfv = view(df, [2, 1], [2, 1])
+    @test_throws MethodError select!(dfv, 1)
+    @test_throws MethodError transform!(dfv, 1)
 end
 
 end # module

From d51f3f8767c8862a7c12d7e4f112408517c66ce5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Thu, 30 Apr 2020 09:29:29 +0200
Subject: [PATCH 15/29] updates after review comments

---
 docs/src/man/split_apply_combine.md |  4 ++--
 test/grouping.jl                    | 25 +++++++++++++++++++++++--
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
index 99af65a2e6..a9287084fc 100644
--- a/docs/src/man/split_apply_combine.md
+++ b/docs/src/man/split_apply_combine.md
@@ -16,10 +16,10 @@ object from your data frame using `groupby` function that takes two arguments:
 The differences between the above functions are the following:
 * `select`: return a data frame with the number and order of rows exactly the same
   as the source, preserve only columns that have been calculated;
+  `select!`: is an in-place version of `select`;
 * `transform`: return a data frame with the number and order of rows exactly the same
   as the source, preserve all columns from the source and columns that have been calculated;
-* `select!`: is an in-place version of `select`;
-* `transform!`: is an in-place version of `transform`;
+  `transform!`: is an in-place version of `transform`;
 * `combine`: does not put restrictions on number of rows returned, the order of rows
   is specified by the order of groups in `GroupedDataFrame`.
 
diff --git a/test/grouping.jl b/test/grouping.jl
index 2b4a5feba8..7e0144e9e9 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -2136,6 +2136,27 @@ end
         @test_throws ArgumentError transform(gdf, :x => sum)
         @test_throws ArgumentError transform(gdf, :x => sum, regroup=true)
     end
+
+    # show the difference between the ordering of rows in select and combine
+    Random.seed!(1)
+    for df in (DataFrame(g=rand(1:20, 1000), x=rand(1000), id=1:1000),
+               DataFrame(g=categorical(rand(1:20, 1000)), x=rand(1000), id=1:1000)),
+        dosort in (true, false)
+
+        gdf = groupby(df, :g, sort=dosort)
+
+        res1 = select(gdf, :x => mean, :x => x -> x .- mean(x), :id)
+        @test res1.g == df.g
+        @test res1.id == df.id
+        @test res1.x_mean + res1.x_function ≈ df.x
+
+        res2 = combine(gdf, :x => mean, :x => x -> x .- mean(x), :id)
+        @test unique(res2.g) ==
+              (dosort || df.g isa CategoricalVector ? sort! : identity)(unique(df.g))
+        for i in unique(res2.g)
+            @test issorted(filter(:g => x -> x == i, res2).id)
+        end
+    end
 end
 
 @testset "select! and transform! GroupedDataFrame" begin
@@ -2169,7 +2190,7 @@ end
         dfc = copy(df)
         g = dfc.g
         gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false)
-        @test select!(gdf, :x => sum, regroup=true) === gdf
+        @test validate_gdf(select!(gdf, :x => sum, regroup=true)) === gdf
         @test dfc.g === g
         @test dfc.x_sum == [1, 5, 5, 4]
         @test propertynames(dfc) == [:g, :x_sum]
@@ -2179,7 +2200,7 @@ end
         x = dfc.x
         y = dfc.y
         gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false)
-        @test transform!(gdf, :g => first => :g, :x => first, regroup=true) === gdf
+        @test validate_gdf(transform!(gdf, :g => first => :g, :x => first, regroup=true)) === gdf
         @test dfc.g === g
         @test dfc.x === x
         @test dfc.y === y

From ef461e6767ca4ea21b072f2f057c29751df49d8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 May 2020 10:20:45 +0200
Subject: [PATCH 16/29] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 docs/src/man/getting_started.md           |  6 +--
 docs/src/man/split_apply_combine.md       | 26 ++++++-----
 src/abstractdataframe/selection.jl        |  8 ++--
 src/dataframe/dataframe.jl                |  2 +-
 src/groupeddataframe/splitapplycombine.jl | 57 +++++++++--------------
 5 files changed, 45 insertions(+), 54 deletions(-)

diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md
index 5cb011a4f8..bd6b7890fa 100644
--- a/docs/src/man/getting_started.md
+++ b/docs/src/man/getting_started.md
@@ -792,9 +792,9 @@ julia> mean(df.A)
 2.5
 ```
 
-We can also apply a function to each column of a `DataFrame` using `select`.
-`select` always returns the same number of rows in the result as the source
-data frame. For example:
+We can also apply a function to each column of a `DataFrame` using `select`,
+which always returns the same number of rows in the result as the source
+data frame (repeating values as necessary). For example:
 ```jldoctest dataframe
 julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0)
 4×2 DataFrame
diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
index a9287084fc..ed2252009c 100644
--- a/docs/src/man/split_apply_combine.md
+++ b/docs/src/man/split_apply_combine.md
@@ -7,19 +7,19 @@ framework for handling this sort of computation is described in the paper
 written by Hadley Wickham.
 
 The DataFrames package supports the split-apply-combine strategy through the
-`combine`, `select`/`select!` and `transform`/`transform!` functions.
+`groupby` function followed by `combine`, `select`/`select!` or `transform`/`transform!`.
 
 In order to perform operations by groups you first need to create a `GroupedDataFrame`
-object from your data frame using `groupby` function that takes two arguments:
+object from your data frame using the `groupby` function that takes two arguments:
 (1) a data frame to be grouped, and (2) a set of columns to group by.
 
-The differences between the above functions are the following:
+Operations can then be applied on each group using one of the following functions:
 * `select`: return a data frame with the number and order of rows exactly the same
-  as the source, preserve only columns that have been calculated;
-  `select!`: is an in-place version of `select`;
+  as the source data frame, including only new calculated columns;
+  `select!` is an in-place version of `select`;
 * `transform`: return a data frame with the number and order of rows exactly the same
-  as the source, preserve all columns from the source and columns that have been calculated;
-  `transform!`: is an in-place version of `transform`;
+  as the source data frame, including all columns from the source and new calculated columns;
+  `transform!` is an in-place version of `transform`;
 * `combine`: does not put restrictions on number of rows returned, the order of rows
   is specified by the order of groups in `GroupedDataFrame`.
 
@@ -52,9 +52,10 @@ and `target_col` is not specified,
 `function` can return multiple columns in the form of an `AbstractDataFrame`,
 `AbstractMatrix`, `NamedTuple` or `DataFrameRow`.
 
-Here are the rules specifying the shape of the resulting `DataFrame` in `combine`
-(in `select`/`select!` and `transform`/`transform!` the result has the number
-and order of rows equal to the source):
+`select`/`select!` and `transform`/`transform!` always return a `DataFrame`
+with the same number of rows as the source.
+For `combine`, the shape of the resulting `DataFrame` is determined
+according to the following rules:
 - a single value produces a single row and column per group
 - a named tuple or `DataFrameRow` produces a single row and one column per field
 - a vector produces a single column with one row per entry
@@ -205,8 +206,9 @@ julia> combine(gdf, 1:2 => cor, nrow)
 
 ```
 
-If we use `select` or `transform` instead of `combine` we always obtain the number
-and of order of rows in the result equal to the source. In the example below
+Contrary to `combine`, the `select` and `transform` functions always return
+a data frame with the same number and order of rows as the source.
+In the example below
 the return values in columns `:SepalLength_SepalWidth_cor` and `:nrow` are
 broadcasted to match the number of elements in each group:
 ```
diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index dec34a49c5..e86d10ed70 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -183,8 +183,8 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable},
         # this means that we use `select` or `transform` not `combine`
         if !allow_resizing_newdf[] && ncol(newdf) == 0 && length(res) != nrow(df)
             throw(ArgumentError("length $(length(res)) of vector returned from " *
-                                "function $fun is different than number of rows" *
-                                " $(nrow(df)) of the source data frame."))
+                                "function $fun is different from number of rows " *
+                                "$(nrow(df)) of the source data frame."))
         end
         allow_resizing_newdf[] = false
         respar = parent(res)
@@ -554,7 +554,7 @@ transform(df::AbstractDataFrame, args...; copycols::Bool=true) =
 
 Create a new data frame that contains columns from `df` specified by `args` and
 return it. The result can have any number of rows that is determined by the
-passed transformations.
+values returned by passed transformations.
 
 See [`select`](@ref) for detailed rules regarding accepted values for `args`.
 
@@ -657,7 +657,7 @@ function _process(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows
     end
     # we allow resizing newdf only if up to some point only scalars were put
     # in it. The moment we put any vector into newdf its number of rows becomes fixed
-    # Also if keeprows is true then we make sure to rpoduce nrow(df) rows so resizing
+    # Also if keeprows is true then we make sure to produce nrow(df) rows so resizing
     # is not allowed
     allow_resizing_newdf = Ref(!keeprows)
     for nc in normalized_cs
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
index 3d1fa9eff6..71fffd6923 100644
--- a/src/dataframe/dataframe.jl
+++ b/src/dataframe/dataframe.jl
@@ -65,7 +65,7 @@ to fill a new vector of the appropriate length. As a particular rule values
 stored in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and treated
 in the same way.
 
-Additionally `DataFrame` can be used to collect [`GroupedDataFrame`](@ref)
+Additionally `DataFrame` can be used to collect a [`GroupedDataFrame`](@ref)
 into a `DataFrame`.
 
 # Notes
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 91ce4a0e91..7e6955d4f6 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -9,7 +9,7 @@ into row groups.
 - `cols` : data frame columns to group by. Can be any column selector
   ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
 - `sort` : whether to sort groups according to the values of the grouping columns
-  `cols`; if all `cols` are `CategoricalVector` then groups are always sorted
+  `cols`; if all `cols` are `CategoricalVector`s then groups are always sorted
   irrespective of the value of `sort`
 - `skipmissing` : whether to skip groups with `missing` values in one of the
   grouping columns `cols`
@@ -252,7 +252,7 @@ const KWARG_PROCESSING_RULES =
     value contains columns with the same names as the grouping columns, they are
     required to be equal.
 
-    If `regroup=false`, if the returned value should be a `DataFrame` or a
+    If `regroup=true`, the returned value must be a `DataFrame` or a
     `GroupedDataFrame` grouped using `keycols(gdf)`.
     """
 
@@ -264,10 +264,9 @@ const KWARG_PROCESSING_RULES =
     combine(fun::Union{Function, Type}, df::AbstractDataFrame)
     combine(pair::Pair, df::AbstractDataFrame)
 
-Transform a [`GroupedDataFrame`](@ref) into a `DataFrame`.
-
-As a special case if `combine` is passed an `AbstractDataFrame` it applies `fun`
-or `pair` to the passed data frame as a whole.
+Apply operations to each group in a [`GroupedDataFrame`](@ref) and return
+the combined result as a `DataFrame`.
+If an `AbstractDataFrame` is passed, apply operations to the data frame as a whole.
 
 $F_ARGUMENT_RULES
 
@@ -574,27 +573,20 @@ function combine_helper(f, gd::GroupedDataFrame,
         keys = groupcols(gd)
         for key in keys
             if hasproperty(valscat, key)
-                if keeprows
-                    isequal(valscat[!, key], parent(gd)[!, key]) ||
-                    throw(ArgumentError("column :$key in returned data frame " *
-                                        "is not equal to grouping key :$key"))
-
-                else
-                    isequal(valscat[!, key], view(parent(gd)[!, key], idx)) ||
+                if (keeprows && !isequal(valscat[!, key], parent(gd)[!, key])) ||
+                    (!keeprows && !isequal(valscat[!, key], view(parent(gd)[!, key], idx)))
                     throw(ArgumentError("column :$key in returned data frame " *
                                         "is not equal to grouping key :$key"))
                 end
             end
         end
         if keeprows
-            newparent = hcat!(select(parent(gd), gd.cols, copycols=copycols),
-                              select(valscat, Not(intersect(keys, _names(valscat))),
-                                     copycols=false), copycols=false)
+            newparent = select(parent(gd), gd.cols, copycols=copycols)
         else
-            newparent = hcat!(parent(gd)[idx, gd.cols],
-                              select(valscat, Not(intersect(keys, _names(valscat))),
-                                     copycols=false), copycols=false)
+            newparent = parent(gd)[idx, gd.cols]
         end
+        hcat!(newparent, select(valscat, Not(intersect(keys, _names(valscat))), copycols=false),
+              copycols=false)
         regroup || return newparent
 
         if length(idx) == 0
@@ -1035,8 +1027,9 @@ function _combine(f::AbstractVector{<:Pair},
 
     if keeprows
         if !_check_cannonical(gd)
-            throw(ArgumentError("select or transform functions require that " *
-                                "GroupedDataFrame is not subsetted"))
+            throw(ArgumentError("select and transform do not support " *
+                                "GroupedDataFrames from which some groups have been dropped "*
+                                "(including skipmissing=true)")))
         end
         idx_keeprows = Vector{Int}(undef, nrow(parent(gd)))
         let i = 0
@@ -1121,11 +1114,7 @@ function _combine(f::AbstractVector{<:Pair},
         @assert !isnothing(idx_agg)
         idx = idx_agg
     else
-        if keeprows
-            idx = idx_keeprows
-        else
-            idx = res[idx_loc][1]
-        end
+        idx = keeprows ? idx_keeprows : res[idx_loc][1]
         agg2idx_map = nothing
         for i in 1:length(res)
             if res[i][1] !== idx && res[i][1] != idx
@@ -1138,9 +1127,8 @@ function _combine(f::AbstractVector{<:Pair},
                     res[i] = idx_agg, res[i][2][agg2idx_map]
                 elseif idx != res[i][1]
                     if keeprows
-                        throw(ArgumentError("all functions must return vectors of " *
-                                            "the length equal to the group rows count " *
-                                            "in the source GroupedDataFrame"))
+                        throw(ArgumentError("all functions must return vectors with " *
+                                            "as many values as rows in each group"))
                     else
                         throw(ArgumentError("all functions must return vectors of the same length"))
                     end
@@ -1157,8 +1145,9 @@ function _combine(f::AbstractVector{<:Pair},
         if keeprows && res[i][1] !== idx_keeprows # we need to reorder the column
             newcol = similar(col)
             # we can probably make it more efficient, but I leave it as an optimization for the future
-            for i in axes(col, 1)
-                newcol[gd.idx[i]] = col[i]
+            gd_idx = gd.idx
+            for j in eachindex(gd.idx, col)
+                newcol[gd_idx[j]] = col[j]
             end
             res[i] = (col_idx, newcol)
         end
@@ -1470,10 +1459,10 @@ transform(gd::GroupedDataFrame, args...;
 
 An equivalent of
 `select(gd, args..., copycols=false, keepkeys=true, regroup=regroup)`
-but updates the `parent(gd)` in place.
+but updates `parent(gd)` in place.
 """
 function select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false)
-    newdf = select(gd, args..., copycols=false, regroup=false)
+    newdf = select(gd, args..., copycols=false)
     df = parent(gd)
     copy!(_columns(df), _columns(newdf))
     x = index(df)
@@ -1490,7 +1479,7 @@ end
 
 An equivalent of
 `transform(gd, args..., copycols=false, keepkeys=true, regroup=regroup)`
-but updates the `parent(gd)` in place.
+but updates `parent(gd)` in place.
 """
 transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) =
     select!(gd, :, args..., regroup=regroup)

From 245714d8bfa6269a16bd3e070e9baa1f064466f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 May 2020 11:41:57 +0200
Subject: [PATCH 17/29] fixes after code review

---
 docs/src/man/getting_started.md           |  35 +---
 docs/src/man/split_apply_combine.md       |  89 +++++-----
 src/abstractdataframe/selection.jl        |  89 +++-------
 src/dataframe/dataframe.jl                |  11 +-
 src/deprecated.jl                         |  20 +--
 src/groupeddataframe/splitapplycombine.jl | 205 ++++++++++++++++++----
 6 files changed, 260 insertions(+), 189 deletions(-)

diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md
index bd6b7890fa..d8d22f0bce 100644
--- a/docs/src/man/getting_started.md
+++ b/docs/src/man/getting_started.md
@@ -773,8 +773,8 @@ julia> describe(df)
 
 ```
 
-If you are interested in describing only a subset of columns then the easiest way to do it is to
-pass a subset of an original data frame to `describe` like this:
+If you are interested in describing only a subset of columns then the easiest way
+to do it is to pass a subset of an original data frame to `describe` like this:
 ```jldoctest dataframe
 julia> describe(df[!, [:A]))
 1×8 DataFrame
@@ -792,9 +792,7 @@ julia> mean(df.A)
 2.5
 ```
 
-We can also apply a function to each column of a `DataFrame` using `select`,
-which always returns the same number of rows in the result as the source
-data frame (repeating values as necessary). For example:
+We can also apply a function to each column of a `DataFrame` using `combine`. For example:
 ```jldoctest dataframe
 julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0)
 4×2 DataFrame
@@ -806,30 +804,6 @@ julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0)
 │ 3   │ 3     │ 2.0     │
 │ 4   │ 4     │ 1.0     │
 
-julia> select(df, names(df) .=> sum)
-4×2 DataFrame
-│ Row │ A_sum │ B_sum   │
-│     │ Int64 │ Float64 │
-├─────┼───────┼─────────┤
-│ 1   │ 10    │ 10.0    │
-│ 2   │ 10    │ 10.0    │
-│ 3   │ 10    │ 10.0    │
-│ 4   │ 10    │ 10.0    │
-
-julia> select(df, names(df) .=> sum, names(df) .=> prod)
-4×4 DataFrame
-│ Row │ A_sum │ B_sum   │ A_prod │ B_prod  │
-│     │ Int64 │ Float64 │ Int64  │ Float64 │
-├─────┼───────┼─────────┼────────┼─────────┤
-│ 1   │ 10    │ 10.0    │ 24     │ 24.0    │
-│ 2   │ 10    │ 10.0    │ 24     │ 24.0    │
-│ 3   │ 10    │ 10.0    │ 24     │ 24.0    │
-│ 4   │ 10    │ 10.0    │ 24     │ 24.0    │
-```
-
-If instead you prefer to get a result collapsed to the number of rows returned
-by the applied functions use the `combine` function:
-```
 julia> combine(df, names(df) .=> sum)
 1×2 DataFrame
 │ Row │ A_sum │ B_sum   │
@@ -845,6 +819,9 @@ julia> combine(df, names(df) .=> sum, names(df) .=> prod)
 │ 1   │ 10    │ 10.0    │ 24     │ 24.0    │
 ```
 
+If you would prefer the result to have the same number of rows as the source data
+frame use `select` instead of `combine`.
+
 ### Handling of Columns Stored in a `DataFrame`
 
 Functions that transform a `DataFrame` to produce a
diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
index ed2252009c..96375c9f77 100644
--- a/docs/src/man/split_apply_combine.md
+++ b/docs/src/man/split_apply_combine.md
@@ -7,21 +7,22 @@ framework for handling this sort of computation is described in the paper
 written by Hadley Wickham.
 
 The DataFrames package supports the split-apply-combine strategy through the
-`groupby` function followed by `combine`, `select`/`select!` or transform`/`transform!`.
+`groupby` function followed by `combine`, `select`/`select!` or `transform`/`transform!`.
 
 In order to perform operations by groups you first need to create a `GroupedDataFrame`
 object from your data frame using the `groupby` function that takes two arguments:
 (1) a data frame to be grouped, and (2) a set of columns to group by.
 
 Operations can then be applied on each group using one of the following functions:
+* `combine`: does not put restrictions on number of rows returned, the order of rows
+  is specified by the order of groups in `GroupedDataFrame`; it is typically used
+  to compute summary statistics by group;
 * `select`: return a data frame with the number and order of rows exactly the same
   as the source data frame, including only new calculated columns;
   `select!` is an in-place version of `select`;
 * `transform`: return a data frame with the number and order of rows exactly the same
   as the source data frame, including all columns from the source and new calculated columns;
-  `transform!` is an in-place version of `transform`;
-* `combine`: does not put restrictions on number of rows returned, the order of rows
-  is specified by the order of groups in `GroupedDataFrame`.
+  `transform!` is an in-place version of `transform`.
 
 All these functions take a specification of one or more functions to apply to
 each subset of the `DataFrame`. This specification can be of the following forms:
@@ -212,49 +213,49 @@ In the example below
 the return values in columns `:SepalLength_SepalWidth_cor` and `:nrow` are
 broadcasted to match the number of elements in each group:
 ```
-julia> select(gdf, 1:2 => cor, nrow)
-150×3 DataFrame
-│ Row │ Species        │ SepalLength_SepalWidth_cor │ nrow  │
-│     │ String         │ Float64                    │ Int64 │
-├─────┼────────────────┼────────────────────────────┼───────┤
-│ 1   │ Iris-setosa    │ 0.74678                    │ 50    │
-│ 2   │ Iris-setosa    │ 0.74678                    │ 50    │
-│ 3   │ Iris-setosa    │ 0.74678                    │ 50    │
-│ 4   │ Iris-setosa    │ 0.74678                    │ 50    │
-│ 5   │ Iris-setosa    │ 0.74678                    │ 50    │
-│ 6   │ Iris-setosa    │ 0.74678                    │ 50    │
-│ 7   │ Iris-setosa    │ 0.74678                    │ 50    │
+julia> select(gdf, 1:2 => cor)
+150×2 DataFrame
+│ Row │ Species        │ SepalLength_SepalWidth_cor │
+│     │ String         │ Float64                    │
+├─────┼────────────────┼────────────────────────────┤
+│ 1   │ Iris-setosa    │ 0.74678                    │
+│ 2   │ Iris-setosa    │ 0.74678                    │
+│ 3   │ Iris-setosa    │ 0.74678                    │
+│ 4   │ Iris-setosa    │ 0.74678                    │
+│ 5   │ Iris-setosa    │ 0.74678                    │
+│ 6   │ Iris-setosa    │ 0.74678                    │
+│ 7   │ Iris-setosa    │ 0.74678                    │
 ⋮
-│ 143 │ Iris-virginica │ 0.457228                   │ 50    │
-│ 144 │ Iris-virginica │ 0.457228                   │ 50    │
-│ 145 │ Iris-virginica │ 0.457228                   │ 50    │
-│ 146 │ Iris-virginica │ 0.457228                   │ 50    │
-│ 147 │ Iris-virginica │ 0.457228                   │ 50    │
-│ 148 │ Iris-virginica │ 0.457228                   │ 50    │
-│ 149 │ Iris-virginica │ 0.457228                   │ 50    │
-│ 150 │ Iris-virginica │ 0.457228                   │ 50    │
-
-julia> transform(gdf, nrow)
+│ 143 │ Iris-virginica │ 0.457228                   │
+│ 144 │ Iris-virginica │ 0.457228                   │
+│ 145 │ Iris-virginica │ 0.457228                   │
+│ 146 │ Iris-virginica │ 0.457228                   │
+│ 147 │ Iris-virginica │ 0.457228                   │
+│ 148 │ Iris-virginica │ 0.457228                   │
+│ 149 │ Iris-virginica │ 0.457228                   │
+│ 150 │ Iris-virginica │ 0.457228                   │
+
+julia> transform(gdf, :Species => x -> chop.(x, head=5, tail=0))
 150×6 DataFrame
-│ Row │ Species        │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ nrow  │
-│     │ String         │ Float64     │ Float64    │ Float64     │ Float64    │ Int64 │
-├─────┼────────────────┼─────────────┼────────────┼─────────────┼────────────┼───────┤
-│ 1   │ Iris-setosa    │ 5.1         │ 3.5        │ 1.4         │ 0.2        │ 50    │
-│ 2   │ Iris-setosa    │ 4.9         │ 3.0        │ 1.4         │ 0.2        │ 50    │
-│ 3   │ Iris-setosa    │ 4.7         │ 3.2        │ 1.3         │ 0.2        │ 50    │
-│ 4   │ Iris-setosa    │ 4.6         │ 3.1        │ 1.5         │ 0.2        │ 50    │
-│ 5   │ Iris-setosa    │ 5.0         │ 3.6        │ 1.4         │ 0.2        │ 50    │
-│ 6   │ Iris-setosa    │ 5.4         │ 3.9        │ 1.7         │ 0.4        │ 50    │
-│ 7   │ Iris-setosa    │ 4.6         │ 3.4        │ 1.4         │ 0.3        │ 50    │
+│ Row │ Species        │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species_function │
+│     │ String         │ Float64     │ Float64    │ Float64     │ Float64    │ SubString…       │
+├─────┼────────────────┼─────────────┼────────────┼─────────────┼────────────┼──────────────────┤
+│ 1   │ Iris-setosa    │ 5.1         │ 3.5        │ 1.4         │ 0.2        │ setosa           │
+│ 2   │ Iris-setosa    │ 4.9         │ 3.0        │ 1.4         │ 0.2        │ setosa           │
+│ 3   │ Iris-setosa    │ 4.7         │ 3.2        │ 1.3         │ 0.2        │ setosa           │
+│ 4   │ Iris-setosa    │ 4.6         │ 3.1        │ 1.5         │ 0.2        │ setosa           │
+│ 5   │ Iris-setosa    │ 5.0         │ 3.6        │ 1.4         │ 0.2        │ setosa           │
+│ 6   │ Iris-setosa    │ 5.4         │ 3.9        │ 1.7         │ 0.4        │ setosa           │
+│ 7   │ Iris-setosa    │ 4.6         │ 3.4        │ 1.4         │ 0.3        │ setosa           │
 ⋮
-│ 143 │ Iris-virginica │ 5.8         │ 2.7        │ 5.1         │ 1.9        │ 50    │
-│ 144 │ Iris-virginica │ 6.8         │ 3.2        │ 5.9         │ 2.3        │ 50    │
-│ 145 │ Iris-virginica │ 6.7         │ 3.3        │ 5.7         │ 2.5        │ 50    │
-│ 146 │ Iris-virginica │ 6.7         │ 3.0        │ 5.2         │ 2.3        │ 50    │
-│ 147 │ Iris-virginica │ 6.3         │ 2.5        │ 5.0         │ 1.9        │ 50    │
-│ 148 │ Iris-virginica │ 6.5         │ 3.0        │ 5.2         │ 2.0        │ 50    │
-│ 149 │ Iris-virginica │ 6.2         │ 3.4        │ 5.4         │ 2.3        │ 50    │
-│ 150 │ Iris-virginica │ 5.9         │ 3.0        │ 5.1         │ 1.8        │ 50    │
+│ 143 │ Iris-virginica │ 5.8         │ 2.7        │ 5.1         │ 1.9        │ virginica        │
+│ 144 │ Iris-virginica │ 6.8         │ 3.2        │ 5.9         │ 2.3        │ virginica        │
+│ 145 │ Iris-virginica │ 6.7         │ 3.3        │ 5.7         │ 2.5        │ virginica        │
+│ 146 │ Iris-virginica │ 6.7         │ 3.0        │ 5.2         │ 2.3        │ virginica        │
+│ 147 │ Iris-virginica │ 6.3         │ 2.5        │ 5.0         │ 1.9        │ virginica        │
+│ 148 │ Iris-virginica │ 6.5         │ 3.0        │ 5.2         │ 2.0        │ virginica        │
+│ 149 │ Iris-virginica │ 6.2         │ 3.4        │ 5.4         │ 2.3        │ virginica        │
+│ 150 │ Iris-virginica │ 5.9         │ 3.0        │ 5.1         │ 1.8        │ virginica        │
 ```
 
 The `combine` function also supports the `do` block form. However, as noted above,
diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index e86d10ed70..df252a2b97 100644
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -155,7 +155,7 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable},
     col_idx, (fun, newname) = nc
     # It is allowed to request a tranformation operation into a newname column
     # only once. This is ensured by the logic related to transformed_cols dictionaly
-    # in _process, therefore in select_transform! such a duplicate should not happen
+    # in _manipulate, therefore in select_transform! such a duplicate should not happen
     @assert !hasproperty(newdf, newname)
     cdf = eachcol(df)
     if col_idx isa Int
@@ -180,7 +180,8 @@ function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}, AsTable},
             end
         end
 
-        # this means that we use `select` or `transform` not `combine`
+        # !allow_resizing_newdf[] && ncol(newdf) == 0
+        # means that we use `select` or `transform` not `combine`
         if !allow_resizing_newdf[] && ncol(newdf) == 0 && length(res) != nrow(df)
             throw(ArgumentError("length $(length(res)) of vector returned from " *
                                 "function $fun is different from number of rows " *
@@ -354,53 +355,8 @@ julia> select!(df, AsTable(:) => ByRow(mean))
 ```
 
 """
-function select!(df::DataFrame, args::AbstractVector{Int})
-    if isempty(args)
-        empty!(_columns(df))
-        empty!(index(df))
-        return df
-    end
-    indmin, indmax = extrema(args)
-    if indmin < 1
-        throw(ArgumentError("indices must be positive"))
-    end
-    if indmax > ncol(df)
-        throw(ArgumentError("indices must not be greater than number of columns"))
-    end
-    if !allunique(args)
-        throw(ArgumentError("indices must not contain duplicates"))
-    end
-    copy!(_columns(df), _columns(df)[args])
-    x = index(df)
-    copy!(_names(x), _names(df)[args])
-    empty!(x.lookup)
-    for (i, n) in enumerate(x.names)
-        x.lookup[n] = i
-    end
-    return df
-end
-
-select!(df::DataFrame, c::Int) = select!(df, [c])
-
-function select!(df::DataFrame, c::MultiColumnIndex)
-    if c isa AbstractVector{<:Pair}
-        return select!(df, c...)
-    else
-        return select!(df, index(df)[c])
-    end
-end
-
-function select!(df::DataFrame, cs...)
-    newdf = select(df, cs..., copycols=false)
-    copy!(_columns(df), _columns(newdf))
-    x = index(df)
-    copy!(_names(x), _names(newdf))
-    empty!(x.lookup)
-    for (i, n) in enumerate(x.names)
-        x.lookup[n] = i
-    end
-    return df
-end
+select!(df::DataFrame, args...) =
+    _replace_columns!(df, select(df, args..., copycols=false))
 
 """
     transform!(df::DataFrame, args...)
@@ -533,7 +489,7 @@ julia> select(df, AsTable(:) => ByRow(mean))
 
 """
 select(df::AbstractDataFrame, args...; copycols::Bool=true) =
-    _manipulate(df, args..., copycols=copycols, keeprows=true)
+    manipulate(df, args..., copycols=copycols, keeprows=true)
 
 """
     transform(df::AbstractDataFrame, args...; copycols::Bool=true)
@@ -548,7 +504,6 @@ See [`select`](@ref) for detailed rules regarding accepted values for `args`.
 transform(df::AbstractDataFrame, args...; copycols::Bool=true) =
     select(df, :, args..., copycols=copycols)
 
-
 """
     combine(df::AbstractDataFrame, args...)
 
@@ -577,26 +532,26 @@ julia> combine(df, :a => sum, nrow)
 │ 1   │ 6     │ 3     │
 """
 combine(df::AbstractDataFrame, args...) =
-    _manipulate(df, args..., copycols=true, keeprows=false)
+    manipulate(df, args..., copycols=true, keeprows=false)
 
 combine(arg, df::AbstractDataFrame) = combine(arg, groupby(df, []))
 
-_manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) =
+manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) =
     DataFrame(_columns(df)[args], Index(_names(df)[args]),
               copycols=copycols)
 
-function _manipulate(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool)
+function manipulate(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool)
     if c isa AbstractVector{<:Pair}
-        return _manipulate(df, c..., copycols=copycols, keeprows=keeprows)
+        return manipulate(df, c..., copycols=copycols, keeprows=keeprows)
     else
-        return _manipulate(df, index(df)[c], copycols=copycols, keeprows=keeprows)
+        return manipulate(df, index(df)[c], copycols=copycols, keeprows=keeprows)
     end
 end
 
-_manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) =
-    _manipulate(df, [c], copycols=copycols, keeprows=keeprows)
+manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) =
+    manipulate(df, [c], copycols=copycols, keeprows=keeprows)
 
-function _manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool)
+function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool)
     cs_vec = []
     for v in cs
         if v isa AbstractVector{<:Pair}
@@ -605,11 +560,11 @@ function _manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool)
             push!(cs_vec, v)
         end
     end
-    return _process(df, [normalize_selection(index(df), c) for c in cs_vec],
+    return _manipulate(df, [normalize_selection(index(df), c) for c in cs_vec],
                     copycols, keeprows)
 end
 
-function _process(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows::Bool)
+function _manipulate(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows::Bool)
     @assert !(df isa SubDataFrame && copycols==false)
     newdf = DataFrame()
     # the role of transformed_cols is the following
@@ -710,19 +665,19 @@ function _process(df::AbstractDataFrame, normalized_cs, copycols::Bool, keeprows
     return newdf
 end
 
-_manipulate(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) =
-    _manipulate(dfv, [ind], copycols=copycols, keeprows=keeprows)
+manipulate(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) =
+    manipulate(dfv, [ind], copycols=copycols, keeprows=keeprows)
 
-function _manipulate(dfv::SubDataFrame, args::MultiColumnIndex;
+function manipulate(dfv::SubDataFrame, args::MultiColumnIndex;
                  copycols::Bool, keeprows::Bool)
     if args isa AbstractVector{<:Pair}
-        return _manipulate(dfv, args..., copycols=copycols, keeprows=keeprows)
+        return manipulate(dfv, args..., copycols=copycols, keeprows=keeprows)
     else
         return copycols ? dfv[:, args] : view(dfv, :, args)
     end
 end
 
-function _manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
+function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
     if copycols
         cs_vec = []
         for v in args
@@ -732,7 +687,7 @@ function _manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
                 push!(cs_vec, v)
             end
         end
-        return _process(dfv, [normalize_selection(index(dfv), c) for c in cs_vec],
+        return _manipulate(dfv, [normalize_selection(index(dfv), c) for c in cs_vec],
                         true, keeprows)
     else
         # we do not support transformations here
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
index 71fffd6923..06ebd27d53 100644
--- a/src/dataframe/dataframe.jl
+++ b/src/dataframe/dataframe.jl
@@ -66,7 +66,8 @@ stored in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and treated
 in the same way.
 
 Additionally `DataFrame` can be used to collect a [`GroupedDataFrame`](@ref)
-into a `DataFrame`.
+into a `DataFrame`. In this case the row order of the result follows the order
+of groups in the `GroupedDataFrame` passed.
 
 # Notes
 The `DataFrame` constructor by default copies all columns vectors passed to it.
@@ -1671,3 +1672,11 @@ function repeat!(df::DataFrame, count::Integer)
     count < 0 && throw(ArgumentError("count must be non-negative"))
     return mapcols!(x -> repeat(x, count), df)
 end
+
+# it is not exactly copy! as in general we allow axes to be different
+function _replace_columns!(df::DataFrame, newdf::DataFrame)
+    copy!(_columns(df), _columns(newdf))
+    copy!(_names(index(df)), _names(newdf))
+    copy!(index(df).lookup, index(newdf).lookup)
+    return df
+end
diff --git a/src/deprecated.jl b/src/deprecated.jl
index b7d713e917..59fe83f214 100644
--- a/src/deprecated.jl
+++ b/src/deprecated.jl
@@ -448,23 +448,19 @@ end
 @deprecate deleterows!(df::DataFrame, inds) delete!(df, inds)
 
 @deprecate by(f::Union{Base.Callable, Pair}, d::AbstractDataFrame, cols::Any;
-   sort::Bool=false, skipmissing::Bool=false,
-   keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing),
-                                f, keepkeys=keepkeys)
+   sort::Bool=false, skipmissing::Bool=false) combine(groupby(d, cols, sort=sort,
+                                                              skipmissing=skipmissing), f)
 @deprecate by(d::AbstractDataFrame, cols::Any, f::Base.Callable;
-   sort::Bool=false, skipmissing::Bool=false,
-   keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing),
-                                f, keepkeys=keepkeys)
+   sort::Bool=false, skipmissing::Bool=false) combine(groupby(d, cols, sort=sort,
+                                                              skipmissing=skipmissing), f)
 @deprecate by(d::AbstractDataFrame, cols::Any, f::Pair;
-   sort::Bool=false, skipmissing::Bool=false,
-   keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing),
-                                f, keepkeys=keepkeys)
+   sort::Bool=false, skipmissing::Bool=false) combine(groupby(d, cols, sort=sort,
+                                                              skipmissing=skipmissing), f)
 
 @deprecate by(d::AbstractDataFrame, cols::Any, f::Union{Pair, typeof(nrow),
                                              ColumnIndex, MultiColumnIndex}...;
-   sort::Bool=false, skipmissing::Bool=false,
-   keepkeys::Bool=true) combine(groupby(d, cols, sort=sort, skipmissing=skipmissing),
-                                f..., keepkeys=keepkeys)
+   sort::Bool=false, skipmissing::Bool=false) combine(groupby(d, cols, sort=sort,
+                                                              skipmissing=skipmissing), f...)
 
 import Base: map
 @deprecate map(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame) combine(f, gd, regroup=true)
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 7e6955d4f6..e37bc9bf03 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -28,14 +28,6 @@ indexing by groups, `map` (which applies a function to each group)
 and `combine` (which applies a function to each group
 and combines the result into a data frame).
 
-See the following for additional split-apply-combine operations:
-
-* [`by`](@ref) : split-apply-combine using functions
-* [`map`](@ref) : apply a function to each group of a `GroupedDataFrame`
-  (without combining)
-* [`combine`](@ref) : combine a `GroupedDataFrame`, optionally applying
-  a function to each group
-
 `GroupedDataFrame` also supports the dictionary interface. The keys are
 [`GroupKey`](@ref) objects returned by [`keys(::GroupedDataFrame)`](@ref),
 which can also be used to get the values of the grouping columns for each group.
@@ -43,6 +35,10 @@ which can also be used to get the values of the grouping columns for each group.
 same order as the `cols` argument) are also accepted as indices, but this will
 be slower than using the equivalent `GroupKey`.
 
+# See also
+
+[`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref), [`transform!`](@ref)
+
 # Examples
 ```julia
 julia> df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]),
@@ -170,8 +166,6 @@ function groupby(df::AbstractDataFrame, cols;
     return gd
 end
 
-_check_cannonical(gd::GroupedDataFrame) = !any(==(0), gd.groups)
-
 const F_TYPE_RULES =
     """
     `fun` can return a single value, a row, a vector, or multiple rows.
@@ -252,8 +246,8 @@ const KWARG_PROCESSING_RULES =
     value contains columns with the same names as the grouping columns, they are
     required to be equal.
 
-    If `regroup=true`, the returned value must be a `DataFrame` or a
-    `GroupedDataFrame` grouped using `keycols(gdf)`.
+    If `regroup=false` (the default) a `DataFrame` is returned.
+    If `regroup=true` a `GroupedDataFrame` grouped using `keycols(gdf)` is returned.
     """
 
 """
@@ -274,9 +268,11 @@ $F_TYPE_RULES
 
 $KWARG_PROCESSING_RULES
 
-The resulting data frame will be sorted by `keycols(gdf)` if `sort=true` was passed to the
-[`groupby`](@ref) call from which `gd` was constructed. Otherwise, ordering of rows
-follows the order of groups in `gdf`.
+Ordering of rows follows the order of groups in `gdf`.
+
+# See also
+
+[`groupby`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref), [`transform!`](@ref)
 
 # Examples
 ```jldoctest
@@ -466,12 +462,12 @@ end
 combine(gd::GroupedDataFrame,
         cs::Union{Pair, typeof(nrow), ColumnIndex, MultiColumnIndex}...;
         keepkeys::Bool=true, regroup::Bool=false) =
-    _combine_executor(gd, cs..., keepkeys=keepkeys, regroup=regroup,
-                      copycols=true, keeprows=false)
+    _combine_prepare(gd, cs..., keepkeys=keepkeys, regroup=regroup,
+                     copycols=true, keeprows=false)
 
-function _combine_executor(gd::GroupedDataFrame,
-                           @nospecialize(cs::Union{Pair, typeof(nrow),
-                                                   ColumnIndex, MultiColumnIndex}...);
+function _combine_prepare(gd::GroupedDataFrame,
+                          @nospecialize(cs::Union{Pair, typeof(nrow),
+                                                  ColumnIndex, MultiColumnIndex}...);
                  keepkeys::Bool, regroup::Bool, copycols::Bool, keeprows::Bool)
     @assert !isempty(cs)
     cs_vec = []
@@ -1020,16 +1016,16 @@ end
 
 function _combine(f::AbstractVector{<:Pair},
                   gd::GroupedDataFrame, nms::AbstractVector{Symbol},
-                  copycols::Bool, keeprows::Bool) # TODO: remove these defaults
+                  copycols::Bool, keeprows::Bool)
     # here f should be normalized and in a form of source_cols => fun
     @assert all(x -> first(x) isa Union{Int, AbstractVector{Int}, AsTable}, f)
     @assert all(x -> last(x) isa Base.Callable, f)
 
     if keeprows
-        if !_check_cannonical(gd)
+        if minimum(gd.groups) == 0
             throw(ArgumentError("select and transform do not support " *
-                                "GroupedDataFrames from which some groups have been dropped "*
-                                "(including skipmissing=true)")))
+                                "`GroupedDataFrame`s from which some groups have "*
+                                "been dropped (including skipmissing=true)"))
         end
         idx_keeprows = Vector{Int}(undef, nrow(parent(gd)))
         let i = 0
@@ -1137,8 +1133,7 @@ function _combine(f::AbstractVector{<:Pair},
         end
     end
 
-    # remember that here first field in res[i] is not useful - it is just needed
-    # to keep track how the column was generated
+    # here first field in res[i] is used to keep track how the column was generated
     # a correct index is stored in idx variable
 
     for (i, (col_idx, col)) in enumerate(res)
@@ -1433,14 +1428,145 @@ end
            copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false)
 
 Apply `args` to `gd` following the rules described in [`combine`](@ref).
-Ensure that the return value has number of rows equal to `nrow(parent(gd))`.
+The return value has number of rows equal to `nrow(parent(gd))`
+(if single value is returned it is always broadcasted to have this number of rows).
 
 If `copycols=false` then do not perform copying of columns that are not transformed.
+
+# See also
+
+[`groupby`](@ref), [`combine`](@ref), [`select!`](@ref), [`transform`](@ref), [`transform!`](@ref)
+
+# Examples
+```jldoctest
+julia> df = DataFrame(a = [1, 1, 1, 2, 2, 1, 1, 2],
+                      b = repeat([2, 1], outer=[4]),
+                      c = 1:8)
+8×3 DataFrame
+│ Row │ a     │ b     │ c     │
+│     │ Int64 │ Int64 │ Int64 │
+├─────┼───────┼───────┼───────┤
+│ 1   │ 1     │ 2     │ 1     │
+│ 2   │ 1     │ 1     │ 2     │
+│ 3   │ 1     │ 2     │ 3     │
+│ 4   │ 2     │ 1     │ 4     │
+│ 5   │ 2     │ 2     │ 5     │
+│ 6   │ 1     │ 1     │ 6     │
+│ 7   │ 1     │ 2     │ 7     │
+│ 8   │ 2     │ 1     │ 8     │
+
+julia> gd = groupby(df, :a);
+
+julia> select(gd, :c => sum, nrow)
+8×3 DataFrame
+│ Row │ a     │ c_sum │ nrow  │
+│     │ Int64 │ Int64 │ Int64 │
+├─────┼───────┼───────┼───────┤
+│ 1   │ 1     │ 19    │ 5     │
+│ 2   │ 1     │ 19    │ 5     │
+│ 3   │ 1     │ 19    │ 5     │
+│ 4   │ 2     │ 17    │ 3     │
+│ 5   │ 2     │ 17    │ 3     │
+│ 6   │ 1     │ 19    │ 5     │
+│ 7   │ 1     │ 19    │ 5     │
+│ 8   │ 2     │ 17    │ 3     │
+
+julia> select(gd, :c => sum, nrow, regroup=true)
+GroupedDataFrame with 2 groups based on key: a
+First Group (5 rows): a = 1
+│ Row │ a     │ c_sum │ nrow  │
+│     │ Int64 │ Int64 │ Int64 │
+├─────┼───────┼───────┼───────┤
+│ 1   │ 1     │ 19    │ 5     │
+│ 2   │ 1     │ 19    │ 5     │
+│ 3   │ 1     │ 19    │ 5     │
+│ 4   │ 1     │ 19    │ 5     │
+│ 5   │ 1     │ 19    │ 5     │
+⋮
+Last Group (3 rows): a = 2
+│ Row │ a     │ c_sum │ nrow  │
+│     │ Int64 │ Int64 │ Int64 │
+├─────┼───────┼───────┼───────┤
+│ 1   │ 2     │ 17    │ 3     │
+│ 2   │ 2     │ 17    │ 3     │
+│ 3   │ 2     │ 17    │ 3     │
+
+julia> select(gd, :c => (x -> sum(log, x)) => :sum_log_c) # specifying a name for target column
+8×2 DataFrame
+│ Row │ a     │ sum_log_c │
+│     │ Int64 │ Float64   │
+├─────┼───────┼───────────┤
+│ 1   │ 1     │ 5.52943   │
+│ 2   │ 1     │ 5.52943   │
+│ 3   │ 1     │ 5.52943   │
+│ 4   │ 2     │ 5.07517   │
+│ 5   │ 2     │ 5.07517   │
+│ 6   │ 1     │ 5.52943   │
+│ 7   │ 1     │ 5.52943   │
+│ 8   │ 2     │ 5.07517   │
+
+julia> select(gd, [:b, :c] .=> sum) # passing a vector of pairs
+8×3 DataFrame
+│ Row │ a     │ b_sum │ c_sum │
+│     │ Int64 │ Int64 │ Int64 │
+├─────┼───────┼───────┼───────┤
+│ 1   │ 1     │ 8     │ 19    │
+│ 2   │ 1     │ 8     │ 19    │
+│ 3   │ 1     │ 8     │ 19    │
+│ 4   │ 2     │ 4     │ 17    │
+│ 5   │ 2     │ 4     │ 17    │
+│ 6   │ 1     │ 8     │ 19    │
+│ 7   │ 1     │ 8     │ 19    │
+│ 8   │ 2     │ 4     │ 17    │
+
+julia> select(gd, :b => :b1, :c => :c1,
+              [:b, :c] => +, keepkeys=false) # auto-splatting, renaming and keepkeys
+8×3 DataFrame
+│ Row │ b1    │ c1    │ b_c_+ │
+│     │ Int64 │ Int64 │ Int64 │
+├─────┼───────┼───────┼───────┤
+│ 1   │ 2     │ 1     │ 3     │
+│ 2   │ 1     │ 2     │ 3     │
+│ 3   │ 2     │ 3     │ 5     │
+│ 4   │ 1     │ 4     │ 5     │
+│ 5   │ 2     │ 5     │ 7     │
+│ 6   │ 1     │ 6     │ 7     │
+│ 7   │ 2     │ 7     │ 9     │
+│ 8   │ 1     │ 8     │ 9     │
+
+julia> select(gd, :b, :c => sum) # passing columns and broadcasting
+8×3 DataFrame
+│ Row │ a     │ b     │ c_sum │
+│     │ Int64 │ Int64 │ Int64 │
+├─────┼───────┼───────┼───────┤
+│ 1   │ 1     │ 2     │ 19    │
+│ 2   │ 1     │ 1     │ 19    │
+│ 3   │ 1     │ 2     │ 19    │
+│ 4   │ 2     │ 1     │ 17    │
+│ 5   │ 2     │ 2     │ 17    │
+│ 6   │ 1     │ 1     │ 19    │
+│ 7   │ 1     │ 2     │ 19    │
+│ 8   │ 2     │ 1     │ 17    │
+
+julia> select(gd, :, AsTable(Not(:a)) => sum)
+8×4 DataFrame
+│ Row │ a     │ b     │ c     │ b_c_sum │
+│     │ Int64 │ Int64 │ Int64 │ Int64   │
+├─────┼───────┼───────┼───────┼─────────┤
+│ 1   │ 1     │ 2     │ 1     │ 3       │
+│ 2   │ 1     │ 1     │ 2     │ 3       │
+│ 3   │ 1     │ 2     │ 3     │ 5       │
+│ 4   │ 2     │ 1     │ 4     │ 5       │
+│ 5   │ 2     │ 2     │ 5     │ 7       │
+│ 6   │ 1     │ 1     │ 6     │ 7       │
+│ 7   │ 1     │ 2     │ 7     │ 9       │
+│ 8   │ 2     │ 1     │ 8     │ 9       │
+```
 """
 select(gd::GroupedDataFrame, args...;
        copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) =
-    _combine_executor(gd, args..., copycols=copycols, keepkeys=keepkeys,
-                      regroup=regroup, keeprows=true)
+    _combine_prepare(gd, args..., copycols=copycols, keepkeys=keepkeys,
+                     regroup=regroup, keeprows=true)
 
 """
     transform(gd::GroupedDataFrame, args...;
@@ -1448,6 +1574,10 @@ select(gd::GroupedDataFrame, args...;
 
 An equivalent of
 `select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, regroup=regroup)`
+
+# See also
+
+[`groupby](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform!`](@ref)
 """
 transform(gd::GroupedDataFrame, args...;
           copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) =
@@ -1460,17 +1590,15 @@ transform(gd::GroupedDataFrame, args...;
 An equivalent of
 `select(gd, args..., copycols=false, keepkeys=true, regroup=regroup)`
 but updates `parent(gd)` in place.
+
+# See also
+
+[`groupby](@ref), [`combine`](@ref), [`select`](@ref), [`transform`](@ref), [`transform!`](@ref)
 """
 function select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false)
     newdf = select(gd, args..., copycols=false)
     df = parent(gd)
-    copy!(_columns(df), _columns(newdf))
-    x = index(df)
-    copy!(_names(x), _names(newdf))
-    empty!(x.lookup)
-    for (i, n) in enumerate(x.names)
-        x.lookup[n] = i
-    end
+    _replace_columns!(df, newdf)
     return regroup ? gd : df
 end
 
@@ -1480,6 +1608,11 @@ end
 An equivalent of
 `transform(gd, args..., copycols=false, keepkeys=true, regroup=regroup)`
 but updates `parent(gd)` in place.
+
+
+# See also
+
+[`groupby](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref)
 """
 transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) =
     select!(gd, :, args..., regroup=regroup)

From 2bd31ff611a86ed65c587b948c9acd470d5a1cf7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 May 2020 11:44:37 +0200
Subject: [PATCH 18/29] add deprecated map tests

---
 test/deprecated.jl | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/deprecated.jl b/test/deprecated.jl
index 11550a2b2e..d9532f63e1 100644
--- a/test/deprecated.jl
+++ b/test/deprecated.jl
@@ -704,6 +704,15 @@ end
     end
 end
 
+@testset "map skipmissing and sort" begin
+    df = DataFrame(a=[2, 2, missing, missing, 1, 1, 3, 3], b=1:8)
+    for dosort in (false, true), doskipmissing in (false, true)
+        gdf = groupby(df, :a, sort=dosort, skipmissing=doskipmissing)
+        @test map(identity, gdf) ≅ combine(identity, gdf, regroup=true)
+        @test map(:b => sum, gdf) ≅ combine(:b => sum, gdf, regroup=true)
+    end
+end
+
 global_logger(old_logger)
 
 end # module

From 9d1b20d69a245f4fa70b381e6fd1d0189bd7d63e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 May 2020 11:49:44 +0200
Subject: [PATCH 19/29] fix error types in select

---
 test/select.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/select.jl b/test/select.jl
index a63a30e329..b76c91f3d2 100644
--- a/test/select.jl
+++ b/test/select.jl
@@ -239,8 +239,8 @@ end
 
 @testset "select!" begin
     df = DataFrame(a=1, b=2, c=3, d=4, e=5)
-    @test_throws ArgumentError select!(df, 0)
-    @test_throws ArgumentError select!(df, 6)
+    @test_throws BoundsError select!(df, 0)
+    @test_throws BoundsError select!(df, 6)
     @test_throws ArgumentError select!(df, [1, 1])
     @test_throws ArgumentError select!(df, :f)
     @test_throws BoundsError select!(df, [true, false])
@@ -552,7 +552,7 @@ end
     @test df == expected
 
     df = DataFrame(a=a, b=b, c=c)
-    @test_throws ArgumentError select!(df, 1:4)
+    @test_throws BoundsError select!(df, 1:4)
     @test_throws ArgumentError select!(df, [:a, :b, :c, :d])
     @test_throws ArgumentError select!(df, [1, 2, 3, 1])
     @test_throws ArgumentError select!(df, [:a, :b, :c, :a])

From 0f3d30900023b480b51b46b8f6c678191f652607 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 May 2020 15:53:12 +0200
Subject: [PATCH 20/29] avoid computing idx, starts and ends in combine if
 regroup=true

---
 src/groupeddataframe/splitapplycombine.jl | 34 ++++++-----------------
 1 file changed, 9 insertions(+), 25 deletions(-)

diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index e37bc9bf03..ef9eb053ff 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -597,35 +597,19 @@ function combine_helper(f, gd::GroupedDataFrame,
             return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx,
                                     gd.starts, gd.ends, gd.ngroups, getfield(gd, :keymap))
         else
-            starts = Vector{Int}(undef, length(gd))
-            ends = Vector{Int}(undef, length(gd))
-            starts[1] = 1
+            groups = zeros(Int, length(idx))
+            groups[1] = 1
             j = 1
-            for i in 2:length(idx)
-                if idx[i] != idx[i-1]
-                    j += 1
-                    starts[j] = i
-                    ends[j-1] = i - 1
-                end
+            last_idx = idx[1]
+            @inbounds for i in 2:length(idx)
+                cur_idx = idx[i]
+                j += cur_idx != last_idx
+                last_idx = cur_idx
+                groups[i] = j
             end
-            # it is impossible to get more groups in the output than we had initially
             @assert j <= length(gd)
-            # In case some groups have to be dropped
-            resize!(starts, j)
-            resize!(ends, j)
-            ends[end] = length(idx)
-
-            groups = zeros(Int, length(idx))
-            for i in 1:j
-                @inbounds for k in starts[i]:ends[i]
-                    groups[k] = i
-                end
-            end
-            # all groups must be filled
-            @assert minimum(groups) == 1
-
             return GroupedDataFrame(newparent, collect(1:length(gd.cols)), groups,
-                                    collect(1:length(idx)), starts, ends, j, nothing)
+                                    nothing, nothing, nothing, j, nothing)
         end
     else
         if regroup

From 1d69fa3cacc6526e3f9160b2cbb4866485c4fa0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 May 2020 16:51:30 +0200
Subject: [PATCH 21/29] performance improvements

---
 src/groupeddataframe/splitapplycombine.jl | 55 +++++++++++++----------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index ef9eb053ff..d10e7e15fc 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -556,6 +556,20 @@ function combine(gd::GroupedDataFrame; f...)
     return combine(gd, [source_cols => fun => out_col for (out_col, (source_cols, fun)) in f])
 end
 
+function gen_groups(idx::Vector{Int})
+    groups = zeros(Int, length(idx))
+    groups[1] = 1
+    j = 1
+    last_idx = idx[1]
+    @inbounds @simd for i in 2:length(idx)
+        cur_idx = idx[i]
+        j += cur_idx != last_idx
+        last_idx = cur_idx
+        groups[i] = j
+    end
+    return groups
+end
+
 function combine_helper(f, gd::GroupedDataFrame,
                         nms::Union{AbstractVector{Symbol},Nothing}=nothing;
                         keepkeys::Bool, regroup::Bool,
@@ -597,19 +611,10 @@ function combine_helper(f, gd::GroupedDataFrame,
             return GroupedDataFrame(newparent, gd.cols, gd.groups, gd.idx,
                                     gd.starts, gd.ends, gd.ngroups, getfield(gd, :keymap))
         else
-            groups = zeros(Int, length(idx))
-            groups[1] = 1
-            j = 1
-            last_idx = idx[1]
-            @inbounds for i in 2:length(idx)
-                cur_idx = idx[i]
-                j += cur_idx != last_idx
-                last_idx = cur_idx
-                groups[i] = j
-            end
-            @assert j <= length(gd)
+            groups = gen_groups(idx)
+            @assert groups[end] <= length(gd)
             return GroupedDataFrame(newparent, collect(1:length(gd.cols)), groups,
-                                    nothing, nothing, nothing, j, nothing)
+                                    nothing, nothing, nothing, groups[end], nothing)
         end
     else
         if regroup
@@ -998,6 +1003,20 @@ function _agg2idx_map_helper(idx, idx_agg)
     return agg2idx_map
 end
 
+function prepare_idx_keeprows(idx, starts, ends, nrowparent)
+    idx_keeprows = Vector{Int}(undef, nrowparent)
+    i = 0
+    for (s, e) in zip(starts, ends)
+        v = idx[s]
+        for k in s:e
+            i += 1
+            idx_keeprows[i] = v
+        end
+    end
+    @assert i == nrowparent
+    return idx_keeprows
+end
+
 function _combine(f::AbstractVector{<:Pair},
                   gd::GroupedDataFrame, nms::AbstractVector{Symbol},
                   copycols::Bool, keeprows::Bool)
@@ -1011,17 +1030,7 @@ function _combine(f::AbstractVector{<:Pair},
                                 "`GroupedDataFrame`s from which some groups have "*
                                 "been dropped (including skipmissing=true)"))
         end
-        idx_keeprows = Vector{Int}(undef, nrow(parent(gd)))
-        let i = 0
-            for (s, e) in zip(gd.starts, gd.ends)
-                v = gd.idx[s]
-                for k in s:e
-                    i += 1
-                    idx_keeprows[i] = v
-                end
-            end
-            @assert i == nrow(parent(gd))
-        end
+        idx_keeprows = prepare_idx_keeprows(gd.idx, gd.starts, gd.ends, nrow(parent(gd)))
     else
         idx_keeprows = nothing
     end

From 5713194acdec6d314b95235f74c2d8f3bcd17b64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 May 2020 16:59:00 +0200
Subject: [PATCH 22/29] @simd did not improve the performance here

---
 src/groupeddataframe/splitapplycombine.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index d10e7e15fc..f58a21f5bb 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -561,7 +561,7 @@ function gen_groups(idx::Vector{Int})
     groups[1] = 1
     j = 1
     last_idx = idx[1]
-    @inbounds @simd for i in 2:length(idx)
+    @inbounds for i in 2:length(idx)
         cur_idx = idx[i]
         j += cur_idx != last_idx
         last_idx = cur_idx

From 1f34d55ee969c55d11da7bdbab3b4c38328c99e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 May 2020 17:36:27 +0200
Subject: [PATCH 23/29] Update docs/src/man/split_apply_combine.md

Co-authored-by: pdeffebach <23196228+pdeffebach@users.noreply.github.com>
---
 docs/src/man/split_apply_combine.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
index 96375c9f77..5c42a1de52 100644
--- a/docs/src/man/split_apply_combine.md
+++ b/docs/src/man/split_apply_combine.md
@@ -48,7 +48,7 @@ passed to `function`.
 
 In all of these cases, `function` can return either a single row or multiple rows.
 `function` can always generate a single column by returning a single value or a vector.
-Additionally, if `combine` is passed exactly one `function` as a first argument
+Additionally, if `combine` is passed exactly one `function`, `cols => function`, or `cols => function => outcol` as a first argument
 and `target_col` is not specified,
 `function` can return multiple columns in the form of an `AbstractDataFrame`,
 `AbstractMatrix`, `NamedTuple` or `DataFrameRow`.

From 2201789c32b60b6bce3662fec310a25c96ec5c2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Fri, 1 May 2020 17:39:08 +0200
Subject: [PATCH 24/29] add an example of passing function as a first argument
 to combine

---
 docs/src/man/split_apply_combine.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
index 5c42a1de52..b743d4250d 100644
--- a/docs/src/man/split_apply_combine.md
+++ b/docs/src/man/split_apply_combine.md
@@ -196,6 +196,15 @@ julia> combine(gdf,
 │ 2   │ Iris-versicolor │ 0.910378                         │
 │ 3   │ Iris-virginica  │ 0.867923                         │
 
+julia> combine(x -> std(x.PetalLength) / std(x.SepalLength), gdf) # passing a SubDataFrame
+3×2 DataFrame
+│ Row │ Species         │ PetalLength_SepalLength_function │
+│     │ String          │ Float64                          │
+├─────┼─────────────────┼──────────────────────────────────┤
+│ 1   │ Iris-setosa     │ 0.492245                         │
+│ 2   │ Iris-versicolor │ 0.910378                         │
+│ 3   │ Iris-virginica  │ 0.867923                         │
+
 julia> combine(gdf, 1:2 => cor, nrow)
 3×3 DataFrame
 │ Row │ Species         │ SepalLength_SepalWidth_cor │ nrow  │

From 2aa9170b2b713188741b006b7ba826a105babe20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sat, 2 May 2020 07:08:52 +0200
Subject: [PATCH 25/29] change regroup to ungroup

---
 src/deprecated.jl                         |   2 +-
 src/groupeddataframe/splitapplycombine.jl |  86 ++++----
 test/deprecated.jl                        |   4 +-
 test/grouping.jl                          | 226 +++++++++++-----------
 test/string.jl                            |  12 +-
 5 files changed, 165 insertions(+), 165 deletions(-)

diff --git a/src/deprecated.jl b/src/deprecated.jl
index 59fe83f214..c38150ad2e 100644
--- a/src/deprecated.jl
+++ b/src/deprecated.jl
@@ -463,4 +463,4 @@ end
                                                               skipmissing=skipmissing), f...)
 
 import Base: map
-@deprecate map(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame) combine(f, gd, regroup=true)
+@deprecate map(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame) combine(f, gd, ungroup=false)
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index f58a21f5bb..b6d2ddb4ac 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -246,17 +246,17 @@ const KWARG_PROCESSING_RULES =
     value contains columns with the same names as the grouping columns, they are
     required to be equal.
 
-    If `regroup=false` (the default) a `DataFrame` is returned.
-    If `regroup=true` a `GroupedDataFrame` grouped using `keycols(gdf)` is returned.
+    If `ungroup=true` (the default) a `DataFrame` is returned.
+    If `ungroup=false` a `GroupedDataFrame` grouped using `keycols(gdf)` is returned.
     """
 
 """
-    combine(gd::GroupedDataFrame, args...; keepkeys::Bool=true, regroup::Bool=false)
+    combine(gd::GroupedDataFrame, args...; keepkeys::Bool=true, ungroup::Bool=true)
     combine(fun::Union{Function, Type}, gd::GroupedDataFrame;
-            keepkeys::Bool=true, regroup::Bool=false)
-    combine(pair::Pair, gd::GroupedDataFrame; keepkeys::Bool=true, regroup::Bool=false)
-    combine(fun::Union{Function, Type}, df::AbstractDataFrame)
-    combine(pair::Pair, df::AbstractDataFrame)
+            keepkeys::Bool=true, ungroup::Bool=true)
+    combine(pair::Pair, gd::GroupedDataFrame; keepkeys::Bool=true, ungroup::Bool=true)
+    combine(fun::Union{Function, Type}, df::AbstractDataFrame, ungroup::Bool=true)
+    combine(pair::Pair, df::AbstractDataFrame, ungroup::Bool=true)
 
 Apply operations to each group in a [`GroupedDataFrame`](@ref) and return
 the combined result as a `DataFrame`.
@@ -292,7 +292,7 @@ julia> combine(gd, :c => sum, nrow)
 │ 3   │ 3     │ 10    │ 2     │
 │ 4   │ 4     │ 12    │ 2     │
 
-julia> combine(gd, :c => sum, nrow, regroup=true)
+julia> combine(gd, :c => sum, nrow, ungroup=false)
 GroupedDataFrame with 4 groups based on key: a
 First Group (1 row): a = 1
 │ Row │ a     │ c_sum │ nrow  │
@@ -428,24 +428,24 @@ julia> combine(gd, :, AsTable(Not(:a)) => sum)
 ```
 """
 function combine(f::Base.Callable, gd::GroupedDataFrame;
-                 keepkeys::Bool=true, regroup::Bool=false)
-    return combine_helper(f, gd, keepkeys=keepkeys, regroup=regroup,
+                 keepkeys::Bool=true, ungroup::Bool=true)
+    return combine_helper(f, gd, keepkeys=keepkeys, ungroup=ungroup,
                           copycols=true, keeprows=false)
 end
 
 combine(f::typeof(nrow), gd::GroupedDataFrame;
-        keepkeys::Bool=true, regroup::Bool=false) =
-    combine(gd, [nrow => :nrow], keepkeys=keepkeys, regroup=regroup)
+        keepkeys::Bool=true, ungroup::Bool=true) =
+    combine(gd, [nrow => :nrow], keepkeys=keepkeys, ungroup=ungroup)
 
 function combine(p::Pair, gd::GroupedDataFrame;
-                 keepkeys::Bool=true, regroup::Bool=false)
+                 keepkeys::Bool=true, ungroup::Bool=true)
     # move handling of aggregate to specialized combine
     p_from, p_to = p
 
     # verify if it is not better to use a fast path, which we achieve
     # by moving to combine(::GroupedDataFrame, ::AbstractVector) method
     if isagg(p_from => (p_to isa Pair ? first(p_to) : p_to)) || p_from === nrow
-        return combine(gd, [p], keepkeys=keepkeys, regroup=regroup)
+        return combine(gd, [p], keepkeys=keepkeys, ungroup=ungroup)
     end
 
     if p_from isa Tuple
@@ -455,20 +455,20 @@ function combine(p::Pair, gd::GroupedDataFrame;
     else
         cs = p_from
     end
-    return combine_helper(cs => p_to, gd, keepkeys=keepkeys, regroup=regroup,
+    return combine_helper(cs => p_to, gd, keepkeys=keepkeys, ungroup=ungroup,
                           copycols=true, keeprows=false)
 end
 
 combine(gd::GroupedDataFrame,
         cs::Union{Pair, typeof(nrow), ColumnIndex, MultiColumnIndex}...;
-        keepkeys::Bool=true, regroup::Bool=false) =
-    _combine_prepare(gd, cs..., keepkeys=keepkeys, regroup=regroup,
+        keepkeys::Bool=true, ungroup::Bool=true) =
+    _combine_prepare(gd, cs..., keepkeys=keepkeys, ungroup=ungroup,
                      copycols=true, keeprows=false)
 
 function _combine_prepare(gd::GroupedDataFrame,
                           @nospecialize(cs::Union{Pair, typeof(nrow),
                                                   ColumnIndex, MultiColumnIndex}...);
-                 keepkeys::Bool, regroup::Bool, copycols::Bool, keeprows::Bool)
+                 keepkeys::Bool, ungroup::Bool, copycols::Bool, keeprows::Bool)
     @assert !isempty(cs)
     cs_vec = []
     for p in cs
@@ -541,7 +541,7 @@ function _combine_prepare(gd::GroupedDataFrame,
     end
     f = Pair[first(x) => first(last(x)) for x in cs_norm]
     nms = Symbol[last(last(x)) for x in cs_norm]
-    return combine_helper(f, gd, nms, keepkeys=keepkeys, regroup=regroup,
+    return combine_helper(f, gd, nms, keepkeys=keepkeys, ungroup=ungroup,
                           copycols=copycols, keeprows=keeprows)
 end
 
@@ -572,14 +572,14 @@ end
 
 function combine_helper(f, gd::GroupedDataFrame,
                         nms::Union{AbstractVector{Symbol},Nothing}=nothing;
-                        keepkeys::Bool, regroup::Bool,
+                        keepkeys::Bool, ungroup::Bool,
                         copycols::Bool, keeprows::Bool)
-    if regroup && !keepkeys
-        throw(ArgumentError("keepkeys=false when regroup=true is not allowed"))
+    if !ungroup && !keepkeys
+        throw(ArgumentError("keepkeys=false when ungroup=false is not allowed"))
     end
     if length(gd) > 0
         idx, valscat = _combine(f, gd, nms, copycols, keeprows)
-        keepkeys || regroup || return valscat
+        !keepkeys && ungroup && return valscat
         keys = groupcols(gd)
         for key in keys
             if hasproperty(valscat, key)
@@ -597,7 +597,7 @@ function combine_helper(f, gd::GroupedDataFrame,
         end
         hcat!(newparent, select(valscat, Not(intersect(keys, _names(valscat))), copycols=false),
               copycols=false)
-        regroup || return newparent
+        ungroup && return newparent
 
         if length(idx) == 0
             @assert nrow(newparent) == 0
@@ -617,11 +617,11 @@ function combine_helper(f, gd::GroupedDataFrame,
                                     nothing, nothing, nothing, groups[end], nothing)
         end
     else
-        if regroup
+        if ungroup
+            return keepkeys ? parent(gd)[1:0, gd.cols] : DataFrame()
+        else
             return GroupedDataFrame(parent(gd)[1:0, gd.cols], collect(1:length(gd.cols)),
                                     Int[], Int[], Int[], Int[], 0, Dict{Any,Int}())
-        else
-            return keepkeys ? parent(gd)[1:0, gd.cols] : DataFrame()
         end
     end
 end
@@ -1418,7 +1418,7 @@ end
 
 """
     select(gd::GroupedDataFrame, args...;
-           copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false)
+           copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true)
 
 Apply `args` to `gd` following the rules described in [`combine`](@ref).
 The return value has number of rows equal to `nrow(parent(gd))`
@@ -1464,7 +1464,7 @@ julia> select(gd, :c => sum, nrow)
 │ 7   │ 1     │ 19    │ 5     │
 │ 8   │ 2     │ 17    │ 3     │
 
-julia> select(gd, :c => sum, nrow, regroup=true)
+julia> select(gd, :c => sum, nrow, ungroup=false)
 GroupedDataFrame with 2 groups based on key: a
 First Group (5 rows): a = 1
 │ Row │ a     │ c_sum │ nrow  │
@@ -1557,49 +1557,49 @@ julia> select(gd, :, AsTable(Not(:a)) => sum)
 ```
 """
 select(gd::GroupedDataFrame, args...;
-       copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) =
+       copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true) =
     _combine_prepare(gd, args..., copycols=copycols, keepkeys=keepkeys,
-                     regroup=regroup, keeprows=true)
+                     ungroup=ungroup, keeprows=true)
 
 """
     transform(gd::GroupedDataFrame, args...;
-              copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false)
+              copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true)
 
 An equivalent of
-`select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, regroup=regroup)`
+`select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, ungroup=ungroup)`
 
 # See also
 
-[`groupby](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform!`](@ref)
+[`groupby`](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform!`](@ref)
 """
 transform(gd::GroupedDataFrame, args...;
-          copycols::Bool=true, keepkeys::Bool=true, regroup::Bool=false) =
+          copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true) =
     select(gd, :, args..., copycols=copycols, keepkeys=keepkeys,
-           regroup=regroup)
+           ungroup=ungroup)
 
 """
-    select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false)
+    select!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true)
 
 An equivalent of
-`select(gd, args..., copycols=false, keepkeys=true, regroup=regroup)`
+`select(gd, args..., copycols=false, keepkeys=true, ungroup=ungroup)`
 but updates `parent(gd)` in place.
 
 # See also
 
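-[`groupby](@ref), [`combine`](@ref), [`select`](@ref), [`transform`](@ref), [`transform!`](@ref)
+[`groupby`](@ref), [`combine`](@ref), [`select`](@ref), [`transform`](@ref), [`transform!`](@ref)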
 """
-function select!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false)
+function select!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true)
     newdf = select(gd, args..., copycols=false)
     df = parent(gd)
     _replace_columns!(df, newdf)
-    return regroup ? gd : df
+    return ungroup ? df : gd
 end
 
 """
-    transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false)
+    transform!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true)
 
 An equivalent of
-`transform(gd, args..., copycols=false, keepkeys=true, regroup=regroup)`
+`transform(gd, args..., copycols=false, keepkeys=true, ungroup=ungroup)`
 but updates `parent(gd)` in place.
 
 
@@ -1607,5 +1607,5 @@ but updates `parent(gd)` in place.
 
-[`groupby](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref)
+[`groupby`](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref)
 """
-transform!(gd::GroupedDataFrame{DataFrame}, args...; regroup::Bool=false) =
-    select!(gd, :, args..., regroup=regroup)
+transform!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true) =
+    select!(gd, :, args..., ungroup=ungroup)
diff --git a/test/deprecated.jl b/test/deprecated.jl
index d9532f63e1..ccd8f10c69 100644
--- a/test/deprecated.jl
+++ b/test/deprecated.jl
@@ -708,8 +708,8 @@ end
     df = DataFrame(a=[2, 2, missing, missing, 1, 1, 3, 3], b=1:8)
     for dosort in (false, true), doskipmissing in (false, true)
         gdf = groupby(df, :a, sort=dosort, skipmissing=doskipmissing)
-        @test map(identity, gdf) ≅ combine(identity, gdf, regroup=true)
-        @test map(:b => sum, gdf) ≅ combine(:b => sum, gdf, regroup=true)
+        @test map(identity, gdf) ≅ combine(identity, gdf, ungroup=false)
+        @test map(:b => sum, gdf) ≅ combine(:b => sum, gdf, ungroup=false)
     end
 end
 
diff --git a/test/grouping.jl b/test/grouping.jl
index 7e0144e9e9..85a572d76f 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -201,41 +201,41 @@ end
         @test combine(f7, gd) == sres4
         @test combine(f8, gd) == sres4
 
-        # combine() with regroup without and with groups sorting
+        # combine() with ungroup=false without and with groups sorting
         for dosort in (false, true)
             gd = groupby_checked(df, cols, sort=dosort)
-            v = validate_gdf(combine(d -> d[:, [:x]], gd, regroup=true))
+            v = validate_gdf(combine(d -> d[:, [:x]], gd, ungroup=false))
             @test length(gd) == length(v)
             nms = [colssym; :x]
             @test v[1] == gd[1][:, nms]
             @test v[1] == gd[1][:, nms] && v[2] == gd[2][:, nms] &&
                 v[3] == gd[3][:, nms] && v[4] == gd[4][:, nms]
             @test names(parent(v))[v.cols] == string.(colssym)
-            v = validate_gdf(combine(f1, gd, regroup=true))
+            v = validate_gdf(combine(f1, gd, ungroup=false))
             @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f1, gd)
-            v = validate_gdf(combine(f2, gd, regroup=true))
+            v = validate_gdf(combine(f2, gd, ungroup=false))
             @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f2, gd)
-            v = validate_gdf(combine(f3, gd, regroup=true))
+            v = validate_gdf(combine(f3, gd, ungroup=false))
             @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f3, gd)
-            v = validate_gdf(combine(f4, gd, regroup=true))
+            v = validate_gdf(combine(f4, gd, ungroup=false))
             @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f4, gd)
-            v = validate_gdf(combine(f5, gd, regroup=true))
+            v = validate_gdf(combine(f5, gd, ungroup=false))
             @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f5, gd)
-            v = validate_gdf(combine(f5, gd, regroup=true))
+            v = validate_gdf(combine(f5, gd, ungroup=false))
             @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f5, gd)
-            v = validate_gdf(combine(f6, gd, regroup=true))
+            v = validate_gdf(combine(f6, gd, ungroup=false))
             @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f6, gd)
-            v = validate_gdf(combine(f7, gd, regroup=true))
+            v = validate_gdf(combine(f7, gd, ungroup=false))
             @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f7, gd)
-            v = validate_gdf(combine(f8, gd, regroup=true))
+            v = validate_gdf(combine(f8, gd, ungroup=false))
             @test extrema(v.groups) == extrema(gd.groups)
             @test vcat(v[1], v[2], v[3], v[4]) == combine(f8, gd)
         end
@@ -363,7 +363,7 @@ end
     @test combine(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1), gdf) ==
         DataFrame(x=[2, 3], z=[1, 1])
     v = validate_gdf(combine(d -> d.x == [1] ? DataFrame(z=[]) : DataFrame(z=1),
-                             groupby_checked(df, :x), regroup=true))
+                             groupby_checked(df, :x), ungroup=false))
     @test length(v) == 2
     @test vcat(v[1], v[2]) == DataFrame(x=[2, 3], z=[1, 1])
 
@@ -438,14 +438,14 @@ end
     df = DataFrame(x=[], y=[])
     gd = groupby_checked(df, :x)
     @test combine(df -> sum(df.x), gd) == DataFrame(x=[])
-    res = validate_gdf(combine(df -> sum(df.x), gd, regroup=true))
+    res = validate_gdf(combine(df -> sum(df.x), gd, ungroup=false))
     @test length(res) == 0
     @test res.parent == DataFrame(x=[])
 
     # Test with zero groups in output
     df = DataFrame(A = [1, 2])
     gd = groupby_checked(df, :A)
-    gd2 = validate_gdf(combine(d -> DataFrame(), gd, regroup=true))
+    gd2 = validate_gdf(combine(d -> DataFrame(), gd, ungroup=false))
     @test length(gd2) == 0
     @test gd.cols == [1]
     @test isempty(gd2.groups)
@@ -455,7 +455,7 @@ end
     @test parent(gd2) == DataFrame(A=[])
     @test eltype.(eachcol(parent(gd2))) == [Int]
 
-    gd2 = validate_gdf(combine(d -> DataFrame(X=Int[]), gd, regroup=true))
+    gd2 = validate_gdf(combine(d -> DataFrame(X=Int[]), gd, ungroup=false))
     @test length(gd2) == 0
     @test gd.cols == [1]
     @test isempty(gd2.groups)
@@ -723,42 +723,42 @@ end
         @test_throws ArgumentError combine(col => (x -> (z=x,)) => :xyz, gd)
         @test_throws ArgumentError combine(col => x -> (z=1, xzz=[1]), gd)
     end
-    for cols in ([:b, :c], 2:3, [2, 3], [false, true, true]), regroup in (true, false)
-        @test combine(cols => (b,c) -> (y=exp.(b), z=c), gd, regroup=regroup) ==
-            combine(d -> (y=exp.(d.b), z=d.c), gd, regroup=regroup)
-        @test combine(cols => (b,c) -> [exp.(b) c], gd, regroup=regroup) ==
-            combine(d -> [exp.(d.b) d.c], gd, regroup=regroup)
-        @test combine(cols => ((b,c) -> sum(b) + sum(c)) => :xyz, gd, regroup=regroup) ==
-            combine(d -> (xyz=sum(d.b) + sum(d.c),), gd, regroup=regroup)
+    for cols in ([:b, :c], 2:3, [2, 3], [false, true, true]), ungroup in (true, false)
+        @test combine(cols => (b,c) -> (y=exp.(b), z=c), gd, ungroup=ungroup) ==
+            combine(d -> (y=exp.(d.b), z=d.c), gd, ungroup=ungroup)
+        @test combine(cols => (b,c) -> [exp.(b) c], gd, ungroup=ungroup) ==
+            combine(d -> [exp.(d.b) d.c], gd, ungroup=ungroup)
+        @test combine(cols => ((b,c) -> sum(b) + sum(c)) => :xyz, gd, ungroup=ungroup) ==
+            combine(d -> (xyz=sum(d.b) + sum(d.c),), gd, ungroup=ungroup)
         if eltype(cols) === Bool
             cols2 = [[false, true, false], [false, false, true]]
             @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[2] => sum),
-                                             gd, regroup=regroup)
+                                             gd, ungroup=ungroup)
             @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[1] => sum),
-                                             gd, regroup=regroup)
+                                             gd, ungroup=ungroup)
             @test_throws MethodError combine((xyz = cols[1] => sum, xzz = cols2[2] => x -> first(x)),
-                                             gd, regroup=regroup)
+                                             gd, ungroup=ungroup)
         else
             cols2 = cols
-            @test combine(gd, cols2[1] => sum => :xyz, cols2[2] => sum => :xzz, regroup=regroup) ==
-                combine(d -> (xyz=sum(d.b), xzz=sum(d.c)), gd, regroup=regroup)
-            @test combine(gd, cols2[1] => sum => :xyz, cols2[1] => sum => :xzz, regroup=regroup) ==
-                combine(d -> (xyz=sum(d.b), xzz=sum(d.b)), gd, regroup=regroup)
+            @test combine(gd, cols2[1] => sum => :xyz, cols2[2] => sum => :xzz, ungroup=ungroup) ==
+                combine(d -> (xyz=sum(d.b), xzz=sum(d.c)), gd, ungroup=ungroup)
+            @test combine(gd, cols2[1] => sum => :xyz, cols2[1] => sum => :xzz, ungroup=ungroup) ==
+                combine(d -> (xyz=sum(d.b), xzz=sum(d.b)), gd, ungroup=ungroup)
             @test combine(gd, cols2[1] => sum => :xyz,
-                    cols2[2] => (x -> first(x)) => :xzz, regroup=regroup) ==
-                combine(d -> (xyz=sum(d.b), xzz=first(d.c)), gd, regroup=regroup)
+                    cols2[2] => (x -> first(x)) => :xzz, ungroup=ungroup) ==
+                combine(d -> (xyz=sum(d.b), xzz=first(d.c)), gd, ungroup=ungroup)
             @test combine(gd, cols2[1] => vexp => :xyz,
-                    cols2[2] => sum => :xzz, regroup=regroup) ==
+                    cols2[2] => sum => :xzz, ungroup=ungroup) ==
                 combine(d -> (xyz=vexp(d.b), xzz=fill(sum(d.c), length(vexp(d.b)))),
-                        gd, regroup=regroup)
+                        gd, ungroup=ungroup)
         end
 
         @test_throws ArgumentError combine(cols => (b,c) -> (y=exp.(b), z=sum(c)),
-                                           gd, regroup=regroup)
+                                           gd, ungroup=ungroup)
         @test_throws ArgumentError combine(cols2 => ((b,c) -> DataFrame(y=exp.(b),
-                                           z=sum(c))) => :xyz, gd, regroup=regroup)
+                                           z=sum(c))) => :xyz, gd, ungroup=ungroup)
         @test_throws ArgumentError combine(cols2 => ((b,c) -> [exp.(b) c]) => :xyz,
-                                           gd, regroup=regroup)
+                                           gd, ungroup=ungroup)
     end
 end
 
@@ -908,19 +908,19 @@ end
     @test combine(identity, gd) ≅ df
     @test combine(d -> d[:, [2, 1]], gd) ≅ df
     @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
-    @test validate_gdf(combine(identity, gd, regroup=true)) ≅ gd
-    @test combine(d -> d[:, [2, 1]], gd, regroup=true) ≅ gd
+    @test validate_gdf(combine(identity, gd, ungroup=false)) ≅ gd
+    @test combine(d -> d[:, [2, 1]], gd, ungroup=false) ≅ gd
     @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd,
-                                       regroup=true)
+                                       ungroup=false)
 
     gd = groupby_checked(df, :x, skipmissing=true)
     @test combine(identity, gd) == df[1:3, :]
     @test combine(d -> d[:, [2, 1]], gd) == df[1:3, :]
     @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd)
-    @test validate_gdf(combine(identity, gd, regroup=true)) == gd
-    @test validate_gdf(combine(d -> d[:, [2, 1]], gd, regroup=true)) == gd
+    @test validate_gdf(combine(identity, gd, ungroup=false)) == gd
+    @test validate_gdf(combine(d -> d[:, [2, 1]], gd, ungroup=false)) == gd
     @test_throws ArgumentError combine(f -> DataFrame(x=["a", "b"], z=[1, 1]), gd,
-                                       regroup=true)
+                                       ungroup=false)
 end
 
 @testset "iteration protocol" begin
@@ -1306,9 +1306,9 @@ end
     @test gdf[:] == gdf
     @test gdf[1:1] == gdf
 
-    @test validate_gdf(combine(nrow => :x1, gdf, regroup=true)) ==
+    @test validate_gdf(combine(nrow => :x1, gdf, ungroup=false)) ==
           groupby_checked(DataFrame(x1=3), [])
-    @test validate_gdf(combine(:x2 => identity => :x2_identity, gdf, regroup=true)) ==
+    @test validate_gdf(combine(:x2 => identity => :x2_identity, gdf, ungroup=false)) ==
           groupby_checked(DataFrame(x2_identity=[1,1,2]), [])
     @test DataFrame(gdf) == df
 
@@ -1646,7 +1646,7 @@ end
         gdf = groupby_checked(df, :a)
         res = combine(sdf -> sdf.x1[1] ? fr : er, gdf)
         @test res == DataFrame(validate_gdf(combine(sdf -> sdf.x1[1] ? fr : er,
-                                                    groupby_checked(df, :a), regroup=true)))
+                                                    groupby_checked(df, :a), ungroup=false)))
         if fr isa AbstractVector && df.x1[1]
             @test res == combine(:x1 => (x1 -> x1[1] ? fr : er) => :x1, gdf)
         else
@@ -1679,7 +1679,7 @@ end
     @test combine(gdf, Between(:x2, :x1) => () -> 1) == DataFrame(:g => 1:2, Symbol("function") => 1)
     @test combine(gdf, :x1 => :z) == combine(gdf, [:x1 => :z]) == combine(:x1 => :z, gdf) ==
           DataFrame(g=[1,1,1,2,2,2], z=1:6)
-    @test validate_gdf(combine(:x1 => :z, groupby_checked(df, :g), regroup=true)) ==
+    @test validate_gdf(combine(:x1 => :z, groupby_checked(df, :g), ungroup=false)) ==
           groupby_checked(DataFrame(g=[1,1,1,2,2,2], z=1:6), :g)
 end
 
@@ -1882,7 +1882,7 @@ end
     @test combine(gdf , AsTable([:x, :y]) => Ref) ==
           combine(AsTable([:x, :y]) => Ref, gdf) ==
           DataFrame(g=1:2, x_y_Ref=[(x=[1,2,3], y=[6,7,8]), (x=[4,5], y=[9,10])])
-    @test validate_gdf(combine(AsTable([:x, :y]) => Ref, gdf, regroup=true)) ==
+    @test validate_gdf(combine(AsTable([:x, :y]) => Ref, gdf, ungroup=false)) ==
           groupby_checked(combine(gdf, AsTable([:x, :y]) => Ref), :g)
 
     @test combine(gdf, AsTable(1) => Ref) ==
@@ -1894,7 +1894,7 @@ end
           combine(AsTable([:x, :y]) => ByRow(x -> [x]), gdf) ==
           DataFrame(g=[1,1,1,2,2],
                     x_y_function=[[(x=1,y=6)], [(x=2,y=7)], [(x=3,y=8)], [(x=4,y=9)], [(x=5,y=10)]])
-    @test validate_gdf(combine(AsTable([:x, :y]) => ByRow(x -> [x]), gdf, regroup=true)) ==
+    @test validate_gdf(combine(AsTable([:x, :y]) => ByRow(x -> [x]), gdf, ungroup=false)) ==
           groupby_checked(combine(gdf, AsTable([:x, :y]) => ByRow(x -> [x])), :g)
 
     # whole column and ByRow test for multiple pairs passed
@@ -1908,10 +1908,10 @@ end
     @test_throws ArgumentError combine(gdf, AsTable([:x, :y]) => ByRow(x -> df[1, :]))
 end
 
-@testset "test correctness of regrouping" begin
+@testset "test correctness of ungrouping" begin
     df = DataFrame(g=[2,2,1,3,1,2,1,2,3])
     gdf = groupby_checked(df, :g)
-    gdf2 = validate_gdf(combine(identity, gdf, regroup=true))
+    gdf2 = validate_gdf(combine(identity, gdf, ungroup=false))
     @test combine(gdf, :g => sum) == combine(gdf2, :g => sum)
 
     df.id = 1:9
@@ -1926,32 +1926,32 @@ end
         if !(df.g isa CategoricalVector)
             gdf = groupby_checked(df, :g, sort=false, skipmissing=false)
 
-            @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
+            @test combine(gdf, :x => sum, keepkeys=false, ungroup=true) ==
                   DataFrame(x_sum = [1, 5, 4])
-            @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
-            @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+            @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, ungroup=false)
+            @test combine(gdf, :x => sum, keepkeys=true, ungroup=true) ≅
                   DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
-            gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true))
+            gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, ungroup=false))
             @test gdf2 isa GroupedDataFrame{DataFrame}
             @test gdf2.groups == 1:3
             @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
             @test DataFrame(gdf2, keepkeys=false) == DataFrame(x_sum = [1, 5, 4])
 
-            @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+            @test combine(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅
                   DataFrame(x_sum = [1, 5, 5, 4], g = [3, 1, 1, missing])
-            @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+            @test combine(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅
                   DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4])
-            gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true))
+            gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=false))
             @test gdf2 isa GroupedDataFrame{DataFrame}
             @test gdf2.groups == [1, 2, 2, 3]
             @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1, missing], x_sum = [1, 5, 5, 4])
             @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 5, 4])
 
-            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
+            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, ungroup=true) ==
                   DataFrame(x_sum = [1, 5, 4])
-            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
+            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true) ≅
                   DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
-            gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true))
+            gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=false))
             @test gdf2 isa GroupedDataFrame{DataFrame}
             @test gdf2.groups == 1:3
             @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, missing], x_sum = [1, 5, 4])
@@ -1959,32 +1959,32 @@ end
 
             gdf = groupby_checked(df, :g, sort=false, skipmissing=true)
 
-            @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
+            @test combine(gdf, :x => sum, keepkeys=false, ungroup=true) ==
                   DataFrame(x_sum = [1, 5])
-            @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
-            @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+            @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, ungroup=false)
+            @test combine(gdf, :x => sum, keepkeys=true, ungroup=true) ≅
                   DataFrame(g = [3, 1], x_sum = [1, 5])
-            gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true))
+            gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, ungroup=false))
             @test gdf2 isa GroupedDataFrame{DataFrame}
             @test gdf2.groups == 1:2
             @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5])
             @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5])
 
-            @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+            @test combine(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅
                   DataFrame(x_sum = [1, 5, 5], g = [3, 1, 1])
-            @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+            @test combine(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅
                   DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5])
-            gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true))
+            gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=false))
             @test gdf2 isa GroupedDataFrame{DataFrame}
             @test gdf2.groups == [1, 2, 2]
             @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1, 1], x_sum = [1, 5, 5])
             @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [1, 5, 5])
 
-            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
+            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, ungroup=true) ==
                   DataFrame(x_sum = [1, 5])
-            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
+            @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true) ≅
                   DataFrame(g = [3, 1], x_sum = [1, 5])
-            gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true))
+            gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=false))
             @test gdf2 isa GroupedDataFrame{DataFrame}
             @test gdf2.groups == 1:2
             @test DataFrame(gdf2) ≅ DataFrame(g = [3, 1], x_sum = [1, 5])
@@ -1993,32 +1993,32 @@ end
 
         gdf = groupby_checked(df, :g, sort=true, skipmissing=false)
 
-        @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
+        @test combine(gdf, :x => sum, keepkeys=false, ungroup=true) ==
               DataFrame(x_sum = [5, 1, 4])
-        @test_throws ArgumentError validate_gdf(combine(gdf, :x => sum, keepkeys=false, regroup=true))
-        @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+        @test_throws ArgumentError validate_gdf(combine(gdf, :x => sum, keepkeys=false, ungroup=false))
+        @test combine(gdf, :x => sum, keepkeys=true, ungroup=true) ≅
               DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
-        gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true))
+        gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, ungroup=false))
         @test gdf2 isa GroupedDataFrame{DataFrame}
         @test gdf2.groups == 1:3
         @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
         @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1, 4])
 
-        @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+        @test combine(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅
               DataFrame(x_sum = [5, 5, 1, 4], g = [1, 1, 3, missing])
-        @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+        @test combine(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅
               DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4])
-        gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true))
+        gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=false))
         @test gdf2 isa GroupedDataFrame{DataFrame}
         @test gdf2.groups == [1, 1, 2, 3]
         @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3, missing], x_sum = [5, 5, 1, 4])
         @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 5, 1, 4])
 
-        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
+        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, ungroup=true) ==
               DataFrame(x_sum = [5, 1, 4])
-        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
+        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true) ≅
               DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
-        gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true))
+        gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=false))
         @test gdf2 isa GroupedDataFrame{DataFrame}
         @test gdf2.groups == 1:3
         @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3, missing], x_sum = [5, 1, 4])
@@ -2026,32 +2026,32 @@ end
 
         gdf = groupby_checked(df, :g, sort=true, skipmissing=true)
 
-        @test combine(gdf, :x => sum, keepkeys=false, regroup=false) ==
+        @test combine(gdf, :x => sum, keepkeys=false, ungroup=true) ==
               DataFrame(x_sum = [5, 1])
-        @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, regroup=true)
-        @test combine(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+        @test_throws ArgumentError combine(gdf, :x => sum, keepkeys=false, ungroup=false)
+        @test combine(gdf, :x => sum, keepkeys=true, ungroup=true) ≅
               DataFrame(g = [1, 3], x_sum = [5, 1])
-        gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, regroup=true))
+        gdf2 = validate_gdf(combine(gdf, :x => sum, keepkeys=true, ungroup=false))
         @test gdf2 isa GroupedDataFrame{DataFrame}
         @test gdf2.groups == 1:2
         @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1])
         @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 1])
 
-        @test combine(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+        @test combine(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅
               DataFrame(x_sum = [5, 5, 1], g = [1, 1, 3])
-        @test combine(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+        @test combine(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅
               DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1])
-        gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, regroup=true))
+        gdf2 = validate_gdf(combine(gdf, :x => sum, :g, keepkeys=true, ungroup=false))
         @test gdf2 isa GroupedDataFrame{DataFrame}
         @test gdf2.groups == [1, 1, 2]
         @test DataFrame(gdf2) ≅ DataFrame(g = [1, 1, 3], x_sum = [5, 5, 1])
         @test DataFrame(gdf2, keepkeys=false) ≅ DataFrame(x_sum = [5, 5, 1])
 
-        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, regroup=false) ==
+        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=false, ungroup=true) ==
               DataFrame(x_sum = [5, 1])
-        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=false) ≅
+        @test combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=true) ≅
               DataFrame(g = [1, 3], x_sum = [5, 1])
-        gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, regroup=true))
+        gdf2 = validate_gdf(combine(x -> (x_sum = sum(x.x),), gdf, keepkeys=true, ungroup=false))
         @test gdf2 isa GroupedDataFrame{DataFrame}
         @test gdf2.groups == 1:2
         @test DataFrame(gdf2) ≅ DataFrame(g = [1, 3], x_sum = [5, 1])
@@ -2066,33 +2066,33 @@ end
 
         gdf = groupby_checked(df, :g, sort=dosort, skipmissing=false)
 
-        @test select(gdf, :x => sum, keepkeys=false, regroup=false) ==
+        @test select(gdf, :x => sum, keepkeys=false, ungroup=true) ==
               DataFrame(x_sum = [1, 5, 5, 4])
-        @test_throws ArgumentError select(gdf, :x => sum, keepkeys=false, regroup=true)
-        @test select(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+        @test_throws ArgumentError select(gdf, :x => sum, keepkeys=false, ungroup=false)
+        @test select(gdf, :x => sum, keepkeys=true, ungroup=true) ≅
               DataFrame(g = df.g, x_sum = [1, 5, 5, 4])
-        gdf2 = validate_gdf(select(gdf, :x => sum, keepkeys=true, regroup=true))
+        gdf2 = validate_gdf(select(gdf, :x => sum, keepkeys=true, ungroup=false))
         @test gdf2 isa GroupedDataFrame{DataFrame}
         @test gdf2.groups == gdf.groups
         @test parent(gdf2).g ≅ df.g
         @test parent(gdf2).g !== df.g
 
-        @test select(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+        @test select(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅
               DataFrame(x_sum = [1, 5, 5, 4], g = df.g)
-        @test select(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+        @test select(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅
               DataFrame(g = df.g, x_sum = [1, 5, 5, 4])
-        gdf2 = validate_gdf(select(gdf, :x => sum, :g, keepkeys=true, regroup=true))
+        gdf2 = validate_gdf(select(gdf, :x => sum, :g, keepkeys=true, ungroup=false))
         @test gdf2 isa GroupedDataFrame{DataFrame}
         @test gdf2.groups == gdf.groups
         @test parent(gdf2).g ≅ df.g
         @test parent(gdf2).g !== df.g
 
-        @test transform(gdf, :x => sum, keepkeys=false, regroup=false) ≅
+        @test transform(gdf, :x => sum, keepkeys=false, ungroup=true) ≅
               [df DataFrame(x_sum = [1, 5, 5, 4])]
-        @test_throws ArgumentError transform(gdf, :x => sum, keepkeys=false, regroup=true)
-        @test transform(gdf, :x => sum, keepkeys=true, regroup=false) ≅
+        @test_throws ArgumentError transform(gdf, :x => sum, keepkeys=false, ungroup=false)
+        @test transform(gdf, :x => sum, keepkeys=true, ungroup=true) ≅
               DataFrame(g = df.g, x = df.x, y = df.y, x_sum = [1, 5, 5, 4])
-        gdf2 = validate_gdf(transform(gdf, :x => sum, keepkeys=true, regroup=true))
+        gdf2 = validate_gdf(transform(gdf, :x => sum, keepkeys=true, ungroup=false))
         @test gdf2 isa GroupedDataFrame{DataFrame}
         @test gdf2.groups == gdf.groups
         @test parent(gdf2).g ≅ df.g
@@ -2100,11 +2100,11 @@ end
         @test parent(gdf2).y ≅ df.y
         @test parent(gdf2).g !== df.g
 
-        @test transform(gdf, :x => sum, :g, keepkeys=false, regroup=false) ≅
+        @test transform(gdf, :x => sum, :g, keepkeys=false, ungroup=true) ≅
               [df DataFrame(x_sum = [1, 5, 5, 4])]
-        @test transform(gdf, :x => sum, :g, keepkeys=true, regroup=false) ≅
+        @test transform(gdf, :x => sum, :g, keepkeys=true, ungroup=true) ≅
               [df DataFrame(x_sum = [1, 5, 5, 4])]
-        gdf2 = validate_gdf(transform(gdf, :x => sum, :g, keepkeys=true, regroup=true))
+        gdf2 = validate_gdf(transform(gdf, :x => sum, :g, keepkeys=true, ungroup=false))
         @test gdf2 isa GroupedDataFrame{DataFrame}
         @test gdf2.groups == gdf.groups
         @test parent(gdf2).g ≅ df.g
@@ -2112,17 +2112,17 @@ end
         @test parent(gdf2).y ≅ df.y
         @test parent(gdf2).g !== df.g
 
-        df2 = transform(gdf, :x => sum, :g, keepkeys=false, regroup=false, copycols=false)
+        df2 = transform(gdf, :x => sum, :g, keepkeys=false, ungroup=true, copycols=false)
         @test df2 ≅ [df DataFrame(x_sum = [1, 5, 5, 4])]
         @test df2.g === df.g
         @test df2.x === df.x
         @test df2.y === df.y
-        df2 = transform(gdf, :x => sum, :g, keepkeys=true, regroup=false, copycols=false)
+        df2 = transform(gdf, :x => sum, :g, keepkeys=true, ungroup=true, copycols=false)
         @test df2 ≅ [df DataFrame(x_sum = [1, 5, 5, 4])]
         @test df2.g === df.g
         @test df2.x === df.x
         @test df2.y === df.y
-        gdf2 = validate_gdf(transform(gdf, :x => sum, :g, keepkeys=true, regroup=true, copycols=false))
+        gdf2 = validate_gdf(transform(gdf, :x => sum, :g, keepkeys=true, ungroup=false, copycols=false))
         @test gdf2 isa GroupedDataFrame{DataFrame}
         @test gdf2.groups == gdf.groups
         @test parent(gdf2).g ≅ df.g
@@ -2132,9 +2132,9 @@ end
 
         gdf = groupby_checked(df, :g, sort=dosort, skipmissing=true)
         @test_throws ArgumentError select(gdf, :x => sum)
-        @test_throws ArgumentError select(gdf, :x => sum, regroup=true)
+        @test_throws ArgumentError select(gdf, :x => sum, ungroup=false)
         @test_throws ArgumentError transform(gdf, :x => sum)
-        @test_throws ArgumentError transform(gdf, :x => sum, regroup=true)
+        @test_throws ArgumentError transform(gdf, :x => sum, ungroup=false)
     end
 
     # show the difference between the ordering of rows in select and combine
@@ -2190,7 +2190,7 @@ end
         dfc = copy(df)
         g = dfc.g
         gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false)
-        @test validate_gdf(select!(gdf, :x => sum, regroup=true)) === gdf
+        @test validate_gdf(select!(gdf, :x => sum, ungroup=false)) === gdf
         @test dfc.g === g
         @test dfc.x_sum == [1, 5, 5, 4]
         @test propertynames(dfc) == [:g, :x_sum]
@@ -2200,7 +2200,7 @@ end
         x = dfc.x
         y = dfc.y
         gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=false)
-        @test validate_gdf(transform!(gdf, :g => first => :g, :x => first, regroup=true)) === gdf
+        @test validate_gdf(transform!(gdf, :g => first => :g, :x => first, ungroup=false)) === gdf
         @test dfc.g === g
         @test dfc.x === x
         @test dfc.y === y
@@ -2210,9 +2210,9 @@ end
         dfc = copy(df)
         gdf = groupby_checked(dfc, :g, sort=dosort, skipmissing=true)
         @test_throws ArgumentError select!(gdf, :x => sum)
-        @test_throws ArgumentError select!(gdf, :x => sum, regroup=true)
+        @test_throws ArgumentError select!(gdf, :x => sum, ungroup=false)
         @test_throws ArgumentError transform!(gdf, :x => sum)
-        @test_throws ArgumentError transform!(gdf, :x => sum, regroup=true)
+        @test_throws ArgumentError transform!(gdf, :x => sum, ungroup=false)
         @test dfc ≅ df
     end
 end
diff --git a/test/string.jl b/test/string.jl
index 233b9cd8b8..589d4ca825 100644
--- a/test/string.jl
+++ b/test/string.jl
@@ -169,12 +169,12 @@ end
     @test combine(gdf, :a) == combine(gdf, "a") ==
           combine(gdf, [:a]) == combine(gdf, ["a"])
 
-    @test combine("a" => identity, gdf, regroup=true) ==
-          combine(:a => identity, gdf, regroup=true)
-    @test combine(["a"] => identity, gdf, regroup=true) ==
-          combine([:a] => identity, gdf, regroup=true)
-    @test combine(nrow => :n, gdf, regroup=true) ==
-          combine(nrow => "n", gdf, regroup=true)
+    @test combine("a" => identity, gdf, ungroup=false) ==
+          combine(:a => identity, gdf, ungroup=false)
+    @test combine(["a"] => identity, gdf, ungroup=false) ==
+          combine([:a] => identity, gdf, ungroup=false)
+    @test combine(nrow => :n, gdf, ungroup=false) ==
+          combine(nrow => "n", gdf, ungroup=false)
 
     @test combine("a" => identity, gdf) == combine(:a => identity, gdf) ==
           combine(gdf, "a" => identity) == combine(gdf, :a => identity)

From 333cca222319ef1fa9fe39fd0d8f28041c9302f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 5 May 2020 10:50:31 +0200
Subject: [PATCH 26/29] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 docs/src/man/split_apply_combine.md       |  3 ++-
 src/dataframe/dataframe.jl                |  4 ++--
 src/groupeddataframe/splitapplycombine.jl | 11 +++++++----
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
index b743d4250d..1eb02f4889 100644
--- a/docs/src/man/split_apply_combine.md
+++ b/docs/src/man/split_apply_combine.md
@@ -48,7 +48,8 @@ passed to `function`.
 
 In all of these cases, `function` can return either a single row or multiple rows.
 `function` can always generate a single column by returning a single value or a vector.
-Additionally, if `combine` is passed exactly one `function`, `cols => function`, or `cols => function => outcol` as a first argument
+Additionally, if `combine` is passed exactly one `function`, `cols => function`,
+or `cols => function => outcol` as a first argument
 and `target_col` is not specified,
 `function` can return multiple columns in the form of an `AbstractDataFrame`,
 `AbstractMatrix`, `NamedTuple` or `DataFrameRow`.
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
index 06ebd27d53..07d45426dd 100644
--- a/src/dataframe/dataframe.jl
+++ b/src/dataframe/dataframe.jl
@@ -66,7 +66,7 @@ stored in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and treated
 in the same way.
 
 Additionally `DataFrame` can be used to collect a [`GroupedDataFrame`](@ref)
-into a `DataFrame`. In this case the row ofder of the result follows the order
+into a `DataFrame`. In this case the order of rows in the result follows the order
 of groups in the `GroupedDataFrame` passed.
 
 # Notes
@@ -1673,7 +1673,7 @@ function repeat!(df::DataFrame, count::Integer)
     return mapcols!(x -> repeat(x, count), df)
 end
 
-# it is not exactly copy! as in general we alow axes to be different
+# This is not exactly copy! as in general we allow axes to be different
 function _replace_columns!(df::DataFrame, newdf::DataFrame)
     copy!(_columns(df), _columns(newdf))
     copy!(_names(index(df)), _names(newdf))
diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index b6d2ddb4ac..86df05b953 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -1003,7 +1003,10 @@ function _agg2idx_map_helper(idx, idx_agg)
     return agg2idx_map
 end
 
-function prepare_idx_keeprows(idx, starts, ends, nrowparent)
+function prepare_idx_keeprows(idx::AbstractVector{<:Integer},
+                              starts::AbstractVector{<:Integer},
+                              ends::AbstractVector{<:Integer},
+                              nrowparent::Integer)
     idx_keeprows = Vector{Int}(undef, nrowparent)
     i = 0
     for (s, e) in zip(starts, ends)
@@ -1421,8 +1424,8 @@ end
            copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true)
 
 Apply `args` to `gd` following the rules described in [`combine`](@ref).
-The return value has number of rows equal to `nrow(parent(gd))`
-(if single value is returned it is always broadcasted to have this number of rows).
+The returned object has as many rows as `parent(gd)`.
+If an operation returns a single value it is always broadcasted to have this number of rows.
 
 If `copycols=false` then do not perform copying of columns that are not transformed.
 
@@ -1513,7 +1516,7 @@ julia> select(gd, [:b, :c] .=> sum) # passing a vector of pairs
 │ 8   │ 2     │ 4     │ 17    │
 
 julia> select(gd, :b => :b1, :c => :c1,
-              [:b, :c] => +, keepkeys=false) # auto-splatting, renaming and keepkeys
+              [:b, :c] => +, keepkeys=false) # multiple arguments, renaming and keepkeys
 8×3 DataFrame
 │ Row │ b1    │ c1    │ b_c_+ │
 │     │ Int64 │ Int64 │ Int64 │

From 10b947467d4b86c0bfa2d4b48c45eca11759d949 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 5 May 2020 11:17:07 +0200
Subject: [PATCH 27/29] update docs

---
 src/groupeddataframe/splitapplycombine.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 86df05b953..3671b618bb 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -1429,6 +1429,8 @@ If an operation returns a single value it is always broadcasted to have this num
 
 If `copycols=false` then do not perform copying of columns that are not transformed.
 
+$KWARG_PROCESSING_RULES
+
 # See also
 
 [`groupby](@ref), [`combine`](@ref), [`select!`](@ref), [`transform`](@ref), [`transform!`](@ref)

From 792b57d9221b4169f27459e5eecaaa17460acd90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 5 May 2020 11:45:16 +0200
Subject: [PATCH 28/29] improve description of what gets returned in combine
 and select

---
 src/groupeddataframe/splitapplycombine.jl | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index 3671b618bb..ef2f81be15 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -258,9 +258,11 @@ const KWARG_PROCESSING_RULES =
     combine(fun::Union{Function, Type}, df::AbstractDataFrame, ungroup::Bool=true)
     combine(pair::Pair, df::AbstractDataFrame, ungroup::Bool=true)
 
-Apply operations to each group in a [`GroupedDataFrame`](@ref) and return
-the combined result as a `DataFrame`.
-If an `AbstractDataFrame` is passed, apply operations to the data frame as a whole.
+Apply operations to each group in a [`GroupedDataFrame`](@ref) and return the combined
+result as a `DataFrame` if `ungroup=true` or `GroupedDataFrame` if `ungroup=false`.
+
+If an `AbstractDataFrame` is passed, apply operations to the data frame as a whole
+and a `DataFrame` is always returned.
 
 $F_ARGUMENT_RULES
 
@@ -1423,9 +1425,15 @@ end
     select(gd::GroupedDataFrame, args...;
            copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true)
 
-Apply `args` to `gd` following the rules described in [`combine`](@ref).
-The returned object has as many rows as `parent(gd)`.
-If an operation returns a single value it is always broadcasted to have this number of rows.
+Apply `args` to `gd` following the rules described in [`combine`](@ref) and return the
+result as a `DataFrame` if `ungroup=true` or `GroupedDataFrame` if `ungroup=false`.
+
+The `parent` of the returned value has as many rows as `parent(gd)`. If an operation
+in `args` returns a single value it is always broadcasted to have this number of rows.
+
+Apply operations to each group in a [`GroupedDataFrame`](@ref) and return the combined
+result as a `DataFrame` if `ungroup=true` or `GroupedDataFrame` if `ungroup=false`.
+
 
 If `copycols=false` then do not perform copying of columns that are not transformed.
 

From f34873cbf773c06986d4f17b939a49ec9542d40e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Tue, 5 May 2020 12:24:10 +0200
Subject: [PATCH 29/29] fix repeated code

---
 src/groupeddataframe/splitapplycombine.jl | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl
index ef2f81be15..28b8be5c23 100644
--- a/src/groupeddataframe/splitapplycombine.jl
+++ b/src/groupeddataframe/splitapplycombine.jl
@@ -1431,10 +1431,6 @@ result as a `DataFrame` if `ungroup=true` or `GroupedDataFrame` if `ungroup=fals
 The `parent` of the returned value has as many rows as `parent(gd)`. If an operation
 in `args` returns a single value it is always broadcasted to have this number of rows.
 
-Apply operations to each group in a [`GroupedDataFrame`](@ref) and return the combined
-result as a `DataFrame` if `ungroup=true` or `GroupedDataFrame` if `ungroup=false`.
-
-
 If `copycols=false` then do not perform copying of columns that are not transformed.
 
 $KWARG_PROCESSING_RULES