diff --git a/Project.toml b/Project.toml index dccf26834c..ced552141a 100644 --- a/Project.toml +++ b/Project.toml @@ -35,7 +35,7 @@ test = ["DataStructures", "DataValues", "Dates", "Logging", "Random", "Test"] julia = "1" CategoricalArrays = "0.8" Compat = "2.2, 3" -DataAPI = "1.0.1" +DataAPI = "1.2" InvertedIndices = "1" IteratorInterfaceExtensions = "0.1.1, 1" Missings = "0.4.2" diff --git a/docs/src/lib/indexing.md b/docs/src/lib/indexing.md index b77cc232cc..3cc614024b 100644 --- a/docs/src/lib/indexing.md +++ b/docs/src/lib/indexing.md @@ -17,9 +17,11 @@ and broadcasting are intended to work with `DataFrame`, `SubDataFrame` and `Data The rules for a valid type of index into a column are the following: * a value, later denoted as `col`: * a `Symbol`; + * an `AbstractString`; * an `Integer` that is not `Bool`; * a vector, later denoted as `cols`: * a vector of `Symbol` (does not have to be a subtype of `AbstractVector{Symbol}`); + * a vector of `AbstractString` (does not have to be a subtype of `AbstractVector{<:AbstractString}`); * a vector of `Integer` other than `Bool` (does not have to be a subtype of `AbstractVector{<:Integer}`); * a vector of `Bool` that has to be a subtype of `AbstractVector{Bool}`; * a regular expression, which gets expanded to a vector of matching column names; @@ -122,13 +124,14 @@ so it is unsafe to use it afterwards (the column length correctness will be pres * `df[CartesianIndex(row, col)] = v` -> the same as `df[row, col] = v`; * `df[row, cols] = v` -> set row `row` of columns `cols` in-place; the same as `dfr = df[row, cols]; dfr[:] = v`; * `df[rows, col] = v` -> set rows `rows` of column `col` in-place; `v` must be an `AbstractVector`; - if `rows` is `:` and `col` is a `Symbol` that is not present in `df` then a new column - in `df` is created and holds a `copy` of `v`; equivalent to `df.col = copy(v)` if `col` is a valid identifier; + if `rows` is `:` and `col` is a `Symbol` or `AbstractString` + that is not present in `df` 
then a new column in `df` is created and holds a `copy` of `v`; equivalent to `df.col = copy(v)` if `col` is a valid identifier; * `df[rows, cols] = v` -> set rows `rows` of columns `cols` in-place; `v` must be an `AbstractMatrix` or an `AbstractDataFrame` (in this case column names must match); * `df[!, col] = v` -> replaces `col` with `v` without copying (with the exception that if `v` is an `AbstractRange` it gets converted to a `Vector`); - also if `col` is a `Symbol` that is not present in `df` then a new column in `df` is created and holds `v`; + also if `col` is a `Symbol` or `AbstractString` that is not present in `df` then + a new column in `df` is created and holds `v`; equivalent to `df.col = v` if `col` is a valid identifier; this is allowed if `ncol(df) == 0 || length(v) == nrow(df)`; * `df[!, cols] = v` -> replaces existing columns `cols` in data frame `df` with copying; @@ -183,10 +186,10 @@ Additional rules: * in the `df[CartesianIndex(row, col)] .= v`, `df[row, col] .= v` syntaxes `v` is broadcasted into the contents of `df[row, col]` (this is consistent with Julia Base); * in the `df[row, cols] .= v` syntaxes the assignment to `df` is performed in-place; * in the `df[rows, col] .= v` and `df[rows, cols] .= v` syntaxes the assignment to `df` is performed in-place; - if `rows` is `:` and `col` is `Symbol` and it is missing from `df` then a new column is allocated and added; + if `rows` is `:` and `col` is `Symbol` or `AbstractString` and it is missing from `df` then a new column is allocated and added; the length of the column is always the value of `nrow(df)` before the assignment takes place; * in the `df[!, col] .= v` syntax column `col` is replaced by a freshly allocated vector; - if `col` is `Symbol` and it is missing from `df` then a new column is allocated added; + if `col` is `Symbol` or `AbstractString` and it is missing from `df` then a new column is allocated and added; the length of the column is always the value of `nrow(df)` before the 
assignment takes place; * the `df[!, cols] .= v` syntax replaces existing columns `cols` in data frame `df` with freshly allocated vectors; * `df.col .= v` syntax is allowed and performs in-place assignment to an existing vector `df.col`. @@ -197,9 +200,8 @@ Additional rules: Note that `sdf[!, col] .= v` and `sdf[!, cols] .= v` syntaxes are not allowed as `sdf` can be only modified in-place. -If column indexing using `Symbol` names in `cols` is performed, the order of columns in the operation is specified -by the order of names. - +If column indexing using `Symbol` or `AbstractString` names in `cols` is performed, the order +of columns in the operation is specified by the order of names. ## Indexing `GroupedDataFrame`s @@ -230,3 +232,18 @@ The elements of a `GroupedDataFrame` are [`SubDataFrame`](@ref)s of its parent. * `gd[n::Not]` -> Any of the above types wrapped in `Not`. The result will be a new `GroupedDataFrame` containing all groups in `gd` *not* selected by the wrapped index. + +# Common API for types defined in DataFrames.jl + +This table presents return value types of calling `names`, `propertynames` and `keys` +on types exposed to the user by DataFrames.jl: + +| Type | `names` | `propertynames` | `keys` | +|---------------------|------------------|------------------|------------------| +| `AbstractDataFrame` | `Vector{String}` | `Vector{Symbol}` | undefined | +| `DataFrameRow` | `Vector{String}` | `Vector{Symbol}` | `Vector{Symbol}` | +| `DataFrameRows` | `Vector{String}` | `Vector{Symbol}` | vector of `Int` | +| `DataFrameColumns` | `Vector{String}` | `Vector{Symbol}` | `Vector{Symbol}` | +| `GroupedDataFrame` | `Vector{String}` | tuple of fields | `GroupKeys` | +| `GroupKeys` | undefined | tuple of fields | vector of `Int` | +| `GroupKey` | `Vector{String}` | `Vector{Symbol}` | `Vector{Symbol}` | diff --git a/docs/src/lib/types.md b/docs/src/lib/types.md index 0b5d8c4908..51f2571405 100644 --- a/docs/src/lib/types.md +++ b/docs/src/lib/types.md @@ 
-109,6 +109,7 @@ without caution because: ```@docs AbstractDataFrame +AsTable ByRow DataFrame DataFrameRow diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md index b160d56d03..6f9bdea429 100644 --- a/docs/src/man/getting_started.md +++ b/docs/src/man/getting_started.md @@ -45,7 +45,8 @@ julia> df = DataFrame(A = 1:4, B = ["M", "F", "F", "M"]) ``` -Columns can be directly (i.e. without copying) accessed via `df.col` or `df[!, :col]`. The latter syntax is more flexible as it allows passing a variable holding the name of the column, and not only a literal name. Note that column names are symbols (`:col` or `Symbol("col")`) rather than strings (`"col"`). Columns can also be accessed using an integer index specifying their position. +Columns can be directly (i.e. without copying) accessed via `df.col`, `df."col"`, `df[!, :col]` or `df[!, "col"]`. The two latter syntaxes are more flexible as they allow passing a variable holding the name of the column, and not only a literal name. Note that column names can be either symbols (written as `:col`, `:var"col"` or `Symbol("col")`) or strings (written as `"col"`). +Columns can also be accessed using an integer index specifying their position. Since `df[!, :col]` does not make a copy, changing the elements of the column vector returned by this syntax will affect the values stored in the original `df`. To get a copy of the column use `df[:, :col]`: changing the vector returned by this syntax does not change `df`. 
@@ -58,6 +59,13 @@ julia> df.A 3 4 +julia> df."A" +4-element Array{Int64,1}: + 1 + 2 + 3 + 4 + julia> df.A === df[!, :A] true @@ -67,6 +75,15 @@ false julia> df.A == df[:, :A] true +julia> df.A === df[!, "A"] +true + +julia> df.A === df[:, "A"] +false + +julia> df.A == df[:, "A"] +true + julia> df.A === df[!, 1] true @@ -89,15 +106,30 @@ julia> df[:, firstcolumn] == df.A true ``` -Column names can be obtained using the `names` function: +Column names can be obtained as strings using the `names` function: ```jldoctest dataframe julia> names(df) +2-element Array{String,1}: + "A" + "B" + ``` + +To get column names as `Symbol`s use the `propertynames` function: +``` +julia> propertynames(df) 2-element Array{Symbol,1}: :A :B ``` +!!! note + + DataFrames.jl allows using `Symbol`s (like `:A`) and strings (like `"A"`) + for all column indexing operations for convenience. + However, using `Symbol`s is slightly faster and should generally be preferred. + + ### Constructing Column by Column It is also possible to start with an empty `DataFrame` and add columns to it one by one: diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 5015df8c36..8cd4e89eab 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -6,9 +6,8 @@ for working with tabular data. # Common methods -An AbstractDataFrame is a two-dimensional table with Symbols for -column names. An AbstractDataFrame is also similar to an Associative -type in that it allows indexing by a key (the columns). +An `AbstractDataFrame` is a two-dimensional table with `Symbol`s or strings +for column names. The following are normally implemented for AbstractDataFrames: @@ -46,11 +45,13 @@ The following are normally implemented for AbstractDataFrames: row and column selectors. The allowed indices are a superset of indices that can be used for standard arrays. 
You can also access a single column of an `AbstractDataFrame` using `getproperty` and `setproperty!` functions. +Columns can be selected using integers, `Symbol`s, or strings. In broadcasting `AbstractDataFrame` behavior is similar to a `Matrix`. A detailed description of `getindex`, `setindex!`, `getproperty`, `setproperty!`, broadcasting and broadcasting assignment for data frames is given in -the ["Indexing" section](https://juliadata.github.io/DataFrames.jl/stable/lib/indexing/) of the manual. +the ["Indexing" section](https://juliadata.github.io/DataFrames.jl/stable/lib/indexing/) +of the manual. """ abstract type AbstractDataFrame end @@ -65,26 +66,35 @@ abstract type AbstractDataFrame end names(df::AbstractDataFrame) names(df::AbstractDataFrame, cols) - Return a `Vector{Symbol}` of names of columns contained in `df`. +Return a freshly allocated `Vector{String}` of names of columns contained in `df`. - If a `cols` column selector is passed then restrict returned - column names to those matching the selector - (this is useful in particular with regular expressions, `Not`, and `Between`). +If `cols` is passed then restrict returned column names to those matching the +selector (this is useful in particular with regular expressions, `Not`, and `Between`). +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). + +See also [`propertynames`](@ref) which returns a `Vector{Symbol}`. """ Base.names(df::AbstractDataFrame) = names(index(df)) function Base.names(df::AbstractDataFrame, cols) - sel = index(df)[cols] - return _names(index(df))[sel isa Int ? (sel:sel) : sel] + nms = _names(index(df)) + idx = index(df)[cols] + idxs = idx isa Int ? 
(idx:idx) : idx + return [string(nms[i]) for i in idxs] end +# _names returns Vector{Symbol} without copying _names(df::AbstractDataFrame) = _names(index(df)) +# separate methods are needed due to dispatch ambiguity Compat.hasproperty(df::AbstractDataFrame, s::Symbol) = haskey(index(df), s) +Compat.hasproperty(df::AbstractDataFrame, s::AbstractString) = haskey(index(df), s) """ - rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol}; makeunique::Bool=false) - rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; makeunique::Bool=false) + rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol}; + makeunique::Bool=false) + rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; + makeunique::Bool=false) rename!(df::AbstractDataFrame, (from => to)::Pair...) rename!(df::AbstractDataFrame, d::AbstractDict) rename!(df::AbstractDataFrame, d::AbstractVector{<:Pair}) @@ -97,7 +107,7 @@ Each name is changed at most once. Permutation of names is allowed. - `df` : the `AbstractDataFrame` - `d` : an `AbstractDict` or an `AbstractVector` of `Pair`s that maps the original names or column numbers to new names -- `f` : a function which for each column takes the old name (a `Symbol`) +- `f` : a function which for each column takes the old name as a `String` and returns the new name that gets converted to a `Symbol` - `vals` : new column names as a vector of `Symbol`s or `AbstractString`s of the same length as the number of columns in `df` @@ -105,8 +115,8 @@ Each name is changed at most once. Permutation of names is allowed. if duplicate names are found; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). 
-If pairs are passed to `rename!` (as positional arguments or in a dictionary or a vector) -then: +If pairs are passed to `rename!` (as positional arguments or in a dictionary or +a vector) then: * `from` value can be a `Symbol`, an `AbstractString` or an `Integer`; * `to` value can be a `Symbol` or an `AbstractString`. @@ -138,7 +148,8 @@ julia> rename!(df, [:a, :b, :c]) │ 1 │ 1 │ 2 │ 3 │ julia> rename!(df, [:a, :b, :a]) -ERROR: ArgumentError: Duplicate variable names: :a. Pass makeunique=true to make them unique using a suffix automatically. +ERROR: ArgumentError: Duplicate variable names: :a. Pass makeunique=true to make +them unique using a suffix automatically. julia> rename!(df, [:a, :b, :a], makeunique=true) 1×3 DataFrame @@ -147,9 +158,7 @@ julia> rename!(df, [:a, :b, :a], makeunique=true) ├─────┼───────┼───────┼───────┤ │ 1 │ 1 │ 2 │ 3 │ -julia> rename!(df) do x - uppercase(string(x)) - end +julia> rename!(uppercase, df) 1×3 DataFrame │ Row │ A │ B │ A_1 │ │ │ Int64 │ Int64 │ Int64 │ @@ -203,8 +212,10 @@ function rename!(f::Function, df::AbstractDataFrame) end """ - rename(df::AbstractDataFrame, vals::AbstractVector{Symbol}; makeunique::Bool=false) - rename(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; makeunique::Bool=false) + rename(df::AbstractDataFrame, vals::AbstractVector{Symbol}; + makeunique::Bool=false) + rename(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; + makeunique::Bool=false) rename(df::AbstractDataFrame, (from => to)::Pair...) rename(df::AbstractDataFrame, d::AbstractDict) rename(df::AbstractDataFrame, d::AbstractVector{<:Pair}) @@ -217,7 +228,7 @@ Each name is changed at most once. Permutation of names is allowed. 
- `df` : the `AbstractDataFrame` - `d` : an `AbstractDict` or an `AbstractVector` of `Pair`s that maps the original names or column numbers to new names -- `f` : a function which for each column takes the old name (a `Symbol`) +- `f` : a function which for each column takes the old name as a `String` and returns the new name that gets converted to a `Symbol` - `vals` : new column names as a vector of `Symbol`s or `AbstractString`s of the same length as the number of columns in `df` @@ -225,8 +236,8 @@ Each name is changed at most once. Permutation of names is allowed. if duplicate names are found; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). -If pairs are passed to `rename` (as positional arguments or in a dictionary or a vector) -then: +If pairs are passed to `rename` (as positional arguments or in a dictionary or +a vector) then: * `from` value can be a `Symbol`, an `AbstractString` or an `Integer`; * `to` value can be a `Symbol` or an `AbstractString`. @@ -271,9 +282,7 @@ julia> rename(df, Dict("i" => "A", "x" => "X")) ├─────┼───────┼───────┼───────┤ │ 1 │ 1 │ 2 │ 3 │ -julia> rename(df) do x - uppercase(string(x)) - end +julia> rename(uppercase, df) 1×3 DataFrame │ Row │ I │ X │ Y │ │ │ Int64 │ Int64 │ Int64 │ @@ -333,9 +342,17 @@ Return the number of dimensions of a data frame, which is always `2`. Base.ndims(::AbstractDataFrame) = 2 Base.ndims(::Type{<:AbstractDataFrame}) = 2 +# separate methods are needed due to dispatch ambiguity Base.getproperty(df::AbstractDataFrame, col_ind::Symbol) = df[!, col_ind] +Base.getproperty(df::AbstractDataFrame, col_ind::AbstractString) = df[!, col_ind] + # Private fields are never exposed since they can conflict with column names -Base.propertynames(df::AbstractDataFrame, private::Bool=false) = Tuple(_names(df)) +""" + propertynames(df::AbstractDataFrame) + +Return a freshly allocated `Vector{Symbol}` of names of columns contained in `df`. 
+""" +Base.propertynames(df::AbstractDataFrame, private::Bool=false) = copy(_names(df)) ############################################################################## ## @@ -421,44 +438,46 @@ Base.last(df::AbstractDataFrame, n::Integer) = df[max(1,nrow(df)-n+1):nrow(df), """ describe(df::AbstractDataFrame; cols=:) - describe(df::AbstractDataFrame, stats::Union{Symbol, Pair{<:Symbol}}...; cols=:) + describe(df::AbstractDataFrame, stats::Union{Symbol, Pair}...; cols=:) Return descriptive statistics for a data frame as a new `DataFrame` where each row represents a variable and each column a summary statistic. # Arguments - `df` : the `AbstractDataFrame` -- `stats::Union{Symbol, Pair{<:Symbol}}...` : the summary statistics to report. +- `stats::Union{Symbol, Pair}...` : the summary statistics to report. Arguments can be: - - A symbol from the list `:mean`, `:std`, `:min`, `:q25`, + - A symbol from the list `:mean`, `:std`, `:min`, `:q25`, `:median`, `:q75`, `:max`, `:eltype`, `:nunique`, `:first`, `:last`, and - `:nmissing`. The default statistics used - are `:mean`, `:min`, `:median`, `:max`, `:nunique`, `:nmissing`, and `:eltype`. + `:nmissing`. The default statistics used are `:mean`, `:min`, `:median`, + `:max`, `:nunique`, `:nmissing`, and `:eltype`. - `:all` as the only `Symbol` argument to return all statistics. - - A `name => function` pair where `name` is a `Symbol`. This will create - a column of summary statistics with the provided name. + - A `name => function` pair where `name` is a `Symbol` or string. This will + create a column of summary statistics with the provided name. - `cols` : a keyword argument allowing to select only a subset of columns from `df` - to describe; all standard column selection methods are allowed. + to describe. Can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). # Details -For `Real` columns, compute the mean, standard deviation, minimum, first quantile, median, -third quantile, and maximum. 
If a column does not derive from `Real`, `describe` will -attempt to calculate all statistics, using `nothing` as a fall-back in the case of an error. +For `Real` columns, compute the mean, standard deviation, minimum, first +quantile, median, third quantile, and maximum. If a column does not derive from +`Real`, `describe` will attempt to calculate all statistics, using `nothing` as +a fall-back in the case of an error. When `stats` contains `:nunique`, `describe` will report the number of unique values in a column. If a column's base type derives from `Real`, `:nunique` will return `nothing`s. -Missing values are filtered in the calculation of all statistics, however the column -`:nmissing` will report the number of missing values of that variable. -If the column does not allow missing values, `nothing` is returned. -Consequently, `nmissing = 0` indicates that the column allows -missing values, but does not currently contain any. +Missing values are filtered in the calculation of all statistics, however the +column `:nmissing` will report the number of missing values of that variable. If +the column does not allow missing values, `nothing` is returned. Consequently, +`nmissing = 0` indicates that the column allows missing values, but does not +currently contain any. -If custom functions are provided, they are called repeatedly with the vector corresponding -to each column as the only argument. For columns allowing for missing values, -the vector is wrapped in a call to [`skipmissing`](@ref): custom functions must therefore -support such objects (and not only vectors), and cannot access missing values. +If custom functions are provided, they are called repeatedly with the vector +corresponding to each column as the only argument. For columns allowing for +missing values, the vector is wrapped in a call to [`skipmissing`](@ref): custom +functions must therefore support such objects (and not only vectors), and cannot +access missing values. 
# Examples ```julia @@ -513,7 +532,9 @@ julia> describe(df, :min, :sum => sum, cols=:x) │ 1 │ x │ 0.1 │ 5.5 │ ``` """ -DataAPI.describe(df::AbstractDataFrame, stats::Union{Symbol, Pair{Symbol}}...; cols=:) = +DataAPI.describe(df::AbstractDataFrame, + stats::Union{Symbol, Pair{<:SymbolOrString}}...; + cols=:) = _describe(select(df, cols, copycols=false), collect(stats)) DataAPI.describe(df::AbstractDataFrame; cols=:) = @@ -538,22 +559,23 @@ function _describe(df::AbstractDataFrame, stats::AbstractVector) throw(ArgumentError(":$not_allowed not allowed." * allowed_msg)) end - custom_funs = Pair[s for s in stats if s isa Pair] + custom_funs = Pair[Symbol(s[1]) => s[2] for s in stats if s isa Pair] - ordered_names = [s isa Symbol ? s : s[1] for s in stats] + ordered_names = [s isa Symbol ? s : Symbol(first(s)) for s in stats] if !allunique(ordered_names) - duplicate_names = unique(ordered_names[nonunique(DataFrame(ordered_names = ordered_names))]) + df_ord_names = DataFrame(ordered_names = ordered_names) + duplicate_names = unique(ordered_names[nonunique(df_ord_names)]) throw(ArgumentError("Duplicate names not allowed. 
Duplicated value(s) are: " * ":$(join(duplicate_names, ", "))")) end # Put the summary stats into the return data frame data = DataFrame() - data.variable = names(df) + data.variable = copy(_names(df)) # An array of Dicts for summary statistics - column_stats_dicts = map(eachcol(df)) do col + col_stats_dicts = map(eachcol(df)) do col if eltype(col) >: Missing t = collect(skipmissing(col)) d = get_stats(t, predefined_funs) @@ -585,7 +607,7 @@ function _describe(df::AbstractDataFrame, stats::AbstractVector) for stat in ordered_names # for each statistic, loop through the columns array to find values # letting the comprehension choose the appropriate type - data[!, stat] = [column_stats_dict[stat] for column_stats_dict in column_stats_dicts] + data[!, stat] = [d[stat] for d in col_stats_dicts] end return data @@ -646,13 +668,13 @@ end ############################################################################## """ - completecases(df::AbstractDataFrame, cols::Colon=:) - completecases(df::AbstractDataFrame, cols::Union{AbstractVector, Regex, Not, Between, All}) - completecases(df::AbstractDataFrame, cols::Union{Integer, Symbol}) + completecases(df::AbstractDataFrame, cols=:) Return a Boolean vector with `true` entries indicating rows without missing values -(complete cases) in data frame `df`. If `cols` is provided, only missing values in -the corresponding columns are considered. +(complete cases) in data frame `df`. + +If `cols` is provided, only missing values in the corresponding columns are considered. +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). See also: [`dropmissing`](@ref) and [`dropmissing!`](@ref). Use `findall(completecases(df))` to get the indices of the rows. 
@@ -700,7 +722,8 @@ julia> completecases(df, [:x, :y]) """ function completecases(df::AbstractDataFrame, col::Colon=:) if ncol(df) == 0 - throw(ArgumentError("Unable to compute complete cases of a data frame with no columns")) + throw(ArgumentError("Unable to compute complete cases of a " * + "data frame with no columns")) end res = trues(size(df, 1)) for i in 1:size(df, 2) @@ -712,18 +735,16 @@ end completecases(df::AbstractDataFrame, col::ColumnIndex) = .!ismissing.(df[!, col]) -completecases(df::AbstractDataFrame, cols::Union{AbstractVector, Regex, Not, Between, All}) = +completecases(df::AbstractDataFrame, cols::MultiColumnIndex) = completecases(df[!, cols]) """ - dropmissing(df::AbstractDataFrame, cols::Colon=:; disallowmissing::Bool=true) - dropmissing(df::AbstractDataFrame, cols::Union{AbstractVector, Regex, Not, Between, All}; - disallowmissing::Bool=true) - dropmissing(df::AbstractDataFrame, cols::Union{Integer, Symbol}; - disallowmissing::Bool=true) + dropmissing(df::AbstractDataFrame, cols=:; disallowmissing::Bool=true) Return a copy of data frame `df` excluding rows with missing values. + If `cols` is provided, only missing values in the corresponding columns are considered. +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). If `disallowmissing` is `true` (the default) then columns specified in `cols` will be converted so as not to allow for missing values using [`disallowmissing!`](@ref). 
@@ -781,7 +802,7 @@ julia> dropmissing(df, [:x, :y]) ``` """ function dropmissing(df::AbstractDataFrame, - cols::Union{ColumnIndex, AbstractVector, Regex, Not, Between, All, Colon}=:; + cols::Union{ColumnIndex, MultiColumnIndex}=:; disallowmissing::Bool=true) newdf = df[completecases(df, cols), :] disallowmissing && disallowmissing!(newdf, cols) @@ -789,14 +810,12 @@ function dropmissing(df::AbstractDataFrame, end """ - dropmissing!(df::AbstractDataFrame, cols::Colon=:; disallowmissing::Bool=true) - dropmissing!(df::AbstractDataFrame, cols::Union{AbstractVector, Regex, Not, Between, All}; - disallowmissing::Bool=true) - dropmissing!(df::AbstractDataFrame, cols::Union{Integer, Symbol}; - disallowmissing::Bool=true) + dropmissing!(df::AbstractDataFrame, cols=:; disallowmissing::Bool=true) Remove rows with missing values from data frame `df` and return it. + If `cols` is provided, only missing values in the corresponding columns are considered. +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). If `disallowmissing` is `true` (the default) then the `cols` columns will get converted using [`disallowmissing!`](@ref). @@ -852,7 +871,7 @@ julia> dropmissing!(df3, [:x, :y]) ``` """ function dropmissing!(df::AbstractDataFrame, - cols::Union{ColumnIndex, AbstractVector, Regex, Not, Between, All, Colon}=:; + cols::Union{ColumnIndex, MultiColumnIndex}=:; disallowmissing::Bool=true) delete!(df, (!).(completecases(df, cols))) disallowmissing && disallowmissing!(df, cols) @@ -867,10 +886,13 @@ Return a copy of data frame `df` containing only rows for which `function` returns `true`. If `cols` is not specified then the function is passed `DataFrameRow`s. 
-If `cols` is specified then it should be a valid column selector -(column duplicates are allowed if a vector of `Int` or `Symbol` is passed), -the function is passed elements of the selected columns as separate positional arguments, -unless it is an `AsTable` selector, in which case a `NamedTuple` of these arguments is passed. + +If `cols` is specified then the function is passed elements of the corresponding +columns as separate positional arguments, unless `cols` is an `AsTable` selector, +in which case a `NamedTuple` of these arguments is passed. +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR), +and column duplicates are allowed if a vector of `Symbol`s, strings, or integers +is passed. Passing `cols` leads to a more efficient execution of the operation for large data frames. @@ -926,13 +948,19 @@ julia> filter(AsTable(:) => nt -> nt.x == 1 || nt.y == "b", df) Base.filter(f, df::AbstractDataFrame) = _filter_helper(df, f, eachrow(df)) Base.filter((col, f)::Pair{<:ColumnIndex}, df::AbstractDataFrame) = _filter_helper(df, f, df[!, col]) -Base.filter((cols, f)::Pair{<:AbstractVector{Int}}, df::AbstractDataFrame) = - (cdf = _columns(df); _filter_helper(df, f, (cdf[i] for i in cols)...)) Base.filter((cols, f)::Pair{<:AbstractVector{Symbol}}, df::AbstractDataFrame) = filter([index(df)[col] for col in cols] => f, df) +Base.filter((cols, f)::Pair{<:AbstractVector{<:AbstractString}}, df::AbstractDataFrame) = + filter([index(df)[col] for col in cols] => f, df) + Base.filter((cols, f)::Pair, df::AbstractDataFrame) = filter(index(df)[cols] => f, df) +function Base.filter((cols, f)::Pair{<:AbstractVector{Int}}, df::AbstractDataFrame) + cdf = _columns(df) + return _filter_helper(df, f, (cdf[i] for i in cols)...) +end + function _filter_helper(df::AbstractDataFrame, f, cols...) 
if length(cols) == 0 throw(ArgumentError("At least one column must be passed to filter on")) @@ -953,14 +981,17 @@ _filter_helper_astable(df::AbstractDataFrame, nti::Tables.NamedTupleIterator, f) """ filter!(function, df::AbstractDataFrame) + filter!(cols => function, df::AbstractDataFrame) Remove rows from data frame `df` for which `function` returns `false`. If `cols` is not specified then the function is passed `DataFrameRow`s. -If `cols` is specified then it should be a valid column selector -(column duplicates are allowed if a vector of `Int` or `Symbol` is passed), -the function is passed elements of the selected columns as separate positional arguments, -unless it is `AsTable` selector in which case `NamedTuple`s of these arguments are passed. +If `cols` is specified then the function is passed elements of the corresponding +columns as separate positional arguments, unless `cols` is an `AsTable` selector, +in which case a `NamedTuple` of these arguments is passed. +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR), +and column duplicates are allowed if a vector of `Symbol`s, strings, or integers +is passed. Passing `cols` leads to a more efficient execution of the operation for large data frames. 
@@ -1025,13 +1056,18 @@ julia> filter!(AsTable(:) => nt -> nt.x == 1 || nt.y == "b", df) Base.filter!(f, df::AbstractDataFrame) = _filter!_helper(df, f, eachrow(df)) Base.filter!((col, f)::Pair{<:ColumnIndex}, df::AbstractDataFrame) = _filter!_helper(df, f, df[!, col]) -Base.filter!((cols, f)::Pair{<:AbstractVector{Int}}, df::AbstractDataFrame) = - (cdf = _columns(df); _filter!_helper(df, f, (cdf[i] for i in cols)...)) Base.filter!((cols, f)::Pair{<:AbstractVector{Symbol}}, df::AbstractDataFrame) = filter!([index(df)[col] for col in cols] => f, df) +Base.filter!((cols, f)::Pair{<:AbstractVector{<:AbstractString}}, df::AbstractDataFrame) = + filter!([index(df)[col] for col in cols] => f, df) Base.filter!((cols, f)::Pair, df::AbstractDataFrame) = filter!(index(df)[cols] => f, df) +function Base.filter!((cols, f)::Pair{<:AbstractVector{Int}}, df::AbstractDataFrame) + cdf = _columns(df) + return _filter!_helper(df, f, (cdf[i] for i in cols)...) +end + function _filter!_helper(df::AbstractDataFrame, f, cols...) 
if length(cols) == 0 throw(ArgumentError("At least one column must be passed to filter on")) @@ -1052,8 +1088,9 @@ _filter!_helper_astable(df::AbstractDataFrame, nti::Tables.NamedTupleIterator, f function Base.convert(::Type{Matrix}, df::AbstractDataFrame) T = reduce(promote_type, (eltype(v) for v in eachcol(df))) - convert(Matrix{T}, df) + return convert(Matrix{T}, df) end + function Base.convert(::Type{Matrix{T}}, df::AbstractDataFrame) where T n, p = size(df) res = Matrix{T}(undef, n, p) @@ -1064,8 +1101,8 @@ function Base.convert(::Type{Matrix{T}}, df::AbstractDataFrame) where T catch err if err isa MethodError && err.f == convert && !(T >: Missing) && any(ismissing, col) - throw(ArgumentError("cannot convert a DataFrame containing missing values to Matrix{$T} " * - "(found for column $name)")) + throw(ArgumentError("cannot convert a DataFrame containing missing " * + "values to Matrix{$T} (found for column $name)")) else rethrow(err) end @@ -1074,6 +1111,7 @@ function Base.convert(::Type{Matrix{T}}, df::AbstractDataFrame) where T end return res end + Base.Matrix(df::AbstractDataFrame) = Base.convert(Matrix, df) Base.Matrix{T}(df::AbstractDataFrame) where {T} = Base.convert(Matrix{T}, df) @@ -1093,9 +1131,9 @@ equal values (according to `isequal`). See also [`unique`](@ref) and [`unique!`](@ref). # Arguments -- `df` : the AbstractDataFrame -- `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.) - specifying the column(s) to compare +- `df` : `AbstractDataFrame` +- `cols` : a selector specifying the column(s) to compare. Can be any column + selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). 
# Examples ```julia @@ -1107,7 +1145,8 @@ nonunique(df, 1) """ function nonunique(df::AbstractDataFrame) if ncol(df) == 0 - throw(ArgumentError("finding duplicate rows in data frame with no columns is not allowed")) + throw(ArgumentError("finding duplicate rows in data frame with no" * + " columns is not allowed")) end gslots = row_group_slots(ntuple(i -> df[!, i], ncol(df)), Val(true))[3] # unique rows are the first encountered group representatives, @@ -1139,8 +1178,9 @@ Base.unique(df::AbstractDataFrame, cols) = unique!(df::AbstractDataFrame, cols) Delete duplicate rows of data frame `df`, keeping only the first occurrence of unique rows. -When `cols` is specified, the return DataFrame contains complete rows, +When `cols` is specified, the returned `DataFrame` contains complete rows, retaining in each case the first instance for which `df[cols]` is unique. +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). When `unique` is called a new data frame is returned; `unique!` updates `df` in-place. @@ -1162,13 +1202,6 @@ unique!(df) # modifies df """ (unique, unique!) -function without(df::AbstractDataFrame, icols::Vector{<:Integer}) - newcols = setdiff(1:ncol(df), icols) - view(df, :, newcols) -end -without(df::AbstractDataFrame, i::Int) = without(df, [i]) -without(df::AbstractDataFrame, c::Any) = without(df, index(df)[c]) - """ hcat(df::AbstractDataFrame...; makeunique::Bool=false, copycols::Bool=true) @@ -1247,24 +1280,29 @@ Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame makeunique=makeunique, copycols=copycols) """ - vcat(dfs::AbstractDataFrame...; cols::Union{Symbol, AbstractVector{Symbol}}=:setequal) + vcat(dfs::AbstractDataFrame...; + cols::Union{Symbol, AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal) Vertically concatenate `AbstractDataFrame`s. 
The `cols` keyword argument determines the columns of the returned data frame: -* `:setequal`: require all data frames to have the same column names disregarding order. - If they appear in different orders, the order of the first provided data frame is used. -* `:orderequal`: require all data frames to have the same column names and in the same order. +* `:setequal`: require all data frames to have the same column names disregarding + order. If they appear in different orders, the order of the first provided data + frame is used. +* `:orderequal`: require all data frames to have the same column names and in the + same order. * `:intersect`: only the columns present in *all* provided data frames are kept. If the intersection is empty, an empty data frame is returned. * `:union`: columns present in *at least one* of the provided data frames are kept. Columns not present in some data frames are filled with `missing` where necessary. -* A vector of `Symbol`s: only listed columns are kept. +* A vector of `Symbol`s or strings: only listed columns are kept. Columns not present in some data frames are filled with `missing` where necessary. -The order of columns is determined by the order they appear in the included data frames, -searching through the header of the first data frame, then the second, etc. +The order of columns is determined by the order they appear in the included data +frames, searching through the header of the first data frame, then the second, +etc. The element types of columns are determined using `promote_type`, as with `vcat` for `AbstractVector`s. 
@@ -1330,16 +1368,20 @@ julia> vcat(d4, df1) """ Base.vcat(dfs::AbstractDataFrame...; - cols::Union{Symbol, AbstractVector{Symbol}}=:setequal) = + cols::Union{Symbol, AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal) = reduce(vcat, dfs; cols=cols) Base.reduce(::typeof(vcat), - dfs::Union{AbstractVector{<:AbstractDataFrame}, Tuple{Vararg{AbstractDataFrame}}}; - cols::Union{Symbol, AbstractVector{Symbol}}=:setequal) = + dfs::Union{AbstractVector{<:AbstractDataFrame}, + Tuple{Vararg{AbstractDataFrame}}}; + cols::Union{Symbol, AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal) = _vcat([df for df in dfs if ncol(df) != 0]; cols=cols) function _vcat(dfs::AbstractVector{<:AbstractDataFrame}; - cols::Union{Symbol, AbstractVector{Symbol}}=:setequal) + cols::Union{Symbol, AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal) isempty(dfs) && return DataFrame() # Array of all headers @@ -1354,8 +1396,8 @@ function _vcat(dfs::AbstractVector{<:AbstractDataFrame}; if cols === :orderequal header = unionunique if length(uniqueheaders) > 1 - throw(ArgumentError("when `cols=:orderequal` all data frames need to have the same column names " * - "and be in the same order")) + throw(ArgumentError("when `cols=:orderequal` all data frames need to " * + "have the same column names and be in the same order")) end elseif cols === :setequal || cols === :equal if cols === :equal @@ -1386,8 +1428,11 @@ function _vcat(dfs::AbstractVector{<:AbstractDataFrame}; throw(ArgumentError("Invalid `cols` value :$cols. 
" * "Only `:orderequal`, `:setequal`, `:intersect`, " * "`:union`, or a vector of column names is allowed.")) - else + elseif cols isa AbstractVector{Symbol} header = cols + else + @assert cols isa AbstractVector{<:AbstractString} + header = Symbol.(cols) end length(header) == 0 && return DataFrame() @@ -1534,16 +1579,17 @@ julia> ncol(df) (nrow, ncol) """ - disallowmissing(df::AbstractDataFrame, - cols::Union{ColumnIndex, AbstractVector, Regex, Not, Between, All, Colon}=:; - error::Bool=true) + disallowmissing(df::AbstractDataFrame, cols=:; error::Bool=true) Return a copy of data frame `df` with columns `cols` converted from element type `Union{T, Missing}` to `T` to drop support for missing values. +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). + If `cols` is omitted all columns in the data frame are converted. -If `error=false` then columns containing a `missing` value will be skipped instead of throwing an error. +If `error=false` then columns containing a `missing` value will be skipped instead +of throwing an error. **Examples** @@ -1582,7 +1628,7 @@ julia> disallowmissing(df, error=false) │ 2 │ missing │ 2 │ """ function Missings.disallowmissing(df::AbstractDataFrame, - cols::Union{ColumnIndex, AbstractVector, Regex, Not, Between, All, Colon}=:; + cols::Union{ColumnIndex, MultiColumnIndex}=:; error::Bool=true) idxcols = Set(index(df)[cols]) newcols = AbstractVector[] @@ -1599,16 +1645,17 @@ function Missings.disallowmissing(df::AbstractDataFrame, push!(newcols, copy(x)) end end - DataFrame(newcols, _names(df), copycols=false) + return DataFrame(newcols, _names(df), copycols=false) end """ - allowmissing(df::AbstractDataFrame, - cols::Union{ColumnIndex, AbstractVector, Regex, Not, Between, All, Colon}=:) + allowmissing(df::AbstractDataFrame, cols=:) Return a copy of data frame `df` with columns `cols` converted to element type `Union{T, Missing}` from `T` to allow support for missing values. 
+`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). + If `cols` is omitted all columns in the data frame are converted. **Examples** @@ -1632,7 +1679,7 @@ julia> allowmissing(df) ``` """ function Missings.allowmissing(df::AbstractDataFrame, - cols::Union{ColumnIndex, AbstractVector, Regex, Not, Between, All, Colon}=:) + cols::Union{ColumnIndex, MultiColumnIndex}=:) idxcols = Set(index(df)[cols]) newcols = AbstractVector[] for i in axes(df, 2) @@ -1644,23 +1691,24 @@ function Missings.allowmissing(df::AbstractDataFrame, push!(newcols, copy(x)) end end - DataFrame(newcols, _names(df), copycols=false) + return DataFrame(newcols, _names(df), copycols=false) end """ - categorical(df::AbstractDataFrame, cols::Type=Union{AbstractString, Missing}; - compress::Bool=false) - categorical(df::AbstractDataFrame, - cols::Union{ColumnIndex, AbstractVector, Regex, Not, Between, All, Colon}; + categorical(df::AbstractDataFrame, cols=Union{AbstractString, Missing}; compress::Bool=false) Return a copy of data frame `df` with columns `cols` converted to `CategoricalVector`. + +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR) +or a `Type`. + If `categorical` is called with the `cols` argument being a `Type`, then all columns whose element type is a subtype of this type (by default `Union{AbstractString, Missing}`) will be converted to categorical. -If the `compress` keyword argument is set to `true` then the created `CategoricalVector`s -will be compressed. +If the `compress` keyword argument is set to `true` then the created +`CategoricalVector`s will be compressed. All created `CategoricalVector`s are unordered. 
@@ -1693,7 +1741,7 @@ julia> categorical(df, :) ``` """ function CategoricalArrays.categorical(df::AbstractDataFrame, - cols::Union{ColumnIndex, AbstractVector, Regex, Not, Between, All, Colon}; + cols::Union{ColumnIndex, MultiColumnIndex}; compress::Bool=false) idxcols = Set(index(df)[cols]) newcols = AbstractVector[] @@ -1726,17 +1774,19 @@ function CategoricalArrays.categorical(df::AbstractDataFrame, end """ - flatten(df::AbstractDataFrame, - cols::Union{ColumnIndex, AbstractVector, Regex, Not, Between, All, Colon}) - -When columns `cols` of data frame `df` have iterable elements that define `length` (for -example a `Vector` of `Vector`s), return a `DataFrame` where each element of each `col` in -`cols` is flattened, meaning the column corresponding to `col` becomes a longer vector -where the original entries are concatenated. Elements of row `i` of `df` in columns other -than `cols` will be repeated according to the length of `df[i, col]`. These lengths must -therefore be the same for each `col` in `cols`, or else an error is raised. Note that these -elements are not copied, and thus if they are mutable changing them in the returned -`DataFrame` will affect `df`. + flatten(df::AbstractDataFrame, cols) + +When columns `cols` of data frame `df` have iterable elements that define +`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each +element of each `col` in `cols` is flattened, meaning the column corresponding +to `col` becomes a longer vector where the original entries are concatenated. +Elements of row `i` of `df` in columns other than `cols` will be repeated +according to the length of `df[i, col]`. These lengths must therefore be the +same for each `col` in `cols`, or else an error is raised. Note that these +elements are not copied, and thus if they are mutable changing them in the +returned `DataFrame` will affect `df`. + +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). 
# Examples @@ -1801,7 +1851,7 @@ are not the the same in row 2 ``` """ function flatten(df::AbstractDataFrame, - cols::Union{ColumnIndex, AbstractVector, Regex, Not, Between, All, Colon}) + cols::Union{ColumnIndex, MultiColumnIndex}) _check_consistency(df) idxcols = index(df)[cols] @@ -1812,7 +1862,7 @@ function flatten(df::AbstractDataFrame, v = df[!, col] if any(x -> length(x[1]) != x[2], zip(v, lengths)) r = findfirst(x -> x != 0, length.(v) .- lengths) - colnames = names(df) + colnames = _names(df) throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1])" * " and :$(colnames[col]) are not the the same in row $r")) end @@ -1829,13 +1879,14 @@ function flatten(df::AbstractDataFrame, reduce(vcat, col_to_flatten) : collect(Iterators.flatten(col_to_flatten)) - insertcols!(new_df, col, names(df)[col] => flattened_col) + insertcols!(new_df, col, _names(df)[col] => flattened_col) end return new_df end -function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector, lengths::AbstractVector{Int}) +function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector, + lengths::AbstractVector{Int}) counter = 1 @inbounds for i in eachindex(shortold) l = lengths[i] diff --git a/src/abstractdataframe/io.jl b/src/abstractdataframe/io.jl index a4b5e4cea6..fb0cb135cb 100644 --- a/src/abstractdataframe/io.jl +++ b/src/abstractdataframe/io.jl @@ -123,7 +123,7 @@ function _show(io::IO, ::MIME"text/html", df::AbstractDataFrame; else "" end - write(io, "
$(digitsep(size(df, 1))) rows × $(digitsep(ncol(df))) columns$omitmsg
") + write(io, "$(digitsep(nrow(df))) rows × $(digitsep(ncol(df))) columns$omitmsg
") end for row in 1:mxrow write(io, "$(typeof(gd).name) with $N $groupstr based on $keystr: $keys
") @@ -189,7 +188,8 @@ function Base.show(io::IO, mime::MIME"text/html", gd::GroupedDataFrame) nrows = size(gd[1], 1) rows = nrows > 1 ? "rows" : "row" - identified_groups = [html_escape(string(parent_names[col], " = ", repr(first(gd[1][!, col])))) + identified_groups = [html_escape(string(parent_names[col], " = ", + repr(first(gd[1][!, col])))) for col in gd.cols] write(io, "First Group ($nrows $rows): ") @@ -201,7 +201,8 @@ function Base.show(io::IO, mime::MIME"text/html", gd::GroupedDataFrame) nrows = size(gd[N], 1) rows = nrows > 1 ? "rows" : "row" - identified_groups = [html_escape(string(parent_names[col], " = ", repr(first(gd[N][!, col])))) + identified_groups = [html_escape(string(parent_names[col], " = ", + repr(first(gd[N][!, col])))) for col in gd.cols] write(io, "
⋮
") @@ -265,7 +266,8 @@ function _show(io::IO, ::MIME"text/latex", df::AbstractDataFrame; write(io, "\t\\hline\n") if eltypes write(io, "\t& ") - header = join(map(c -> latex_escape(string(compacttype(c))), eltype.(eachcol(df)[1:mxcol])), " & ") + header = join(map(c -> latex_escape(string(compacttype(c))), + eltype.(eachcol(df)[1:mxcol])), " & ") write(io, header) mxcol < size(df, 2) && write(io, " & ") write(io, "\\\\\n") @@ -311,9 +313,8 @@ Base.show(io::IO, mime::MIME"text/latex", dfcs::DataFrameColumns; eltypes::Bool= function Base.show(io::IO, mime::MIME"text/latex", gd::GroupedDataFrame) N = length(gd) - keynames = names(gd.parent)[gd.cols] - parent_names = names(gd.parent) - keys = join(latex_escape.(string.(keynames)), ", ") + parent_names = _names(gd) + keys = join(latex_escape.(string.(groupcols(gd))), ", ") keystr = length(gd.cols) > 1 ? "keys" : "key" groupstr = N > 1 ? "groups" : "group" write(io, "$(typeof(gd).name) with $N $groupstr based on $keystr: $keys\n\n") @@ -353,7 +354,8 @@ end ############################################################################## escapedprint(io::IO, x::Any, escapes::AbstractString) = ourshow(io, x) -escapedprint(io::IO, x::AbstractString, escapes::AbstractString) = escape_string(io, x, escapes) +escapedprint(io::IO, x::AbstractString, escapes::AbstractString) = + escape_string(io, x, escapes) function printtable(io::IO, df::AbstractDataFrame; diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl index 5ffa7f3313..567be68f4d 100644 --- a/src/abstractdataframe/iteration.jl +++ b/src/abstractdataframe/iteration.jl @@ -81,15 +81,22 @@ Base.size(itr::DataFrameRows) = (size(parent(itr), 1), ) Base.@propagate_inbounds function Base.getindex(itr::DataFrameRows, i::Int) df = parent(itr) - DataFrameRow(df, index(df), i) + return DataFrameRow(df, index(df), i) end Base.@propagate_inbounds function Base.getindex(itr::DataFrameRows{<:SubDataFrame}, i::Int) sdf = parent(itr) - 
DataFrameRow(parent(sdf), index(sdf), rows(sdf)[i]) + return DataFrameRow(parent(sdf), index(sdf), rows(sdf)[i]) end -Base.getproperty(itr::DataFrameRows, col_ind::Symbol) = getproperty(parent(itr), col_ind) +# separate methods are needed due to dispatch ambiguity +Base.getproperty(itr::DataFrameRows, col_ind::Symbol) = + getproperty(parent(itr), col_ind) +Base.getproperty(itr::DataFrameRows, col_ind::AbstractString) = + getproperty(parent(itr), col_ind) +Compat.hasproperty(itr::DataFrameRows, s::Symbol) = haskey(index(parent(itr)), s) +Compat.hasproperty(itr::DataFrameRows, s::AbstractString) = haskey(index(parent(itr)), s) + # Private fields are never exposed since they can conflict with column names Base.propertynames(itr::DataFrameRows, private::Bool=false) = propertynames(parent(itr)) @@ -158,16 +165,26 @@ end Base.getindex(itr::DataFrameColumns, j::Symbol) = parent(itr)[!, j] -Base.getproperty(itr::DataFrameColumns, col_ind::Symbol) = getproperty(parent(itr), col_ind) +# separate methods are needed due to dispatch ambiguity +Base.getproperty(itr::DataFrameColumns, col_ind::Symbol) = + getproperty(parent(itr), col_ind) +Base.getproperty(itr::DataFrameColumns, col_ind::AbstractString) = + getproperty(parent(itr), col_ind) +Compat.hasproperty(itr::DataFrameColumns, s::Symbol) = + haskey(index(parent(itr)), s) +Compat.hasproperty(itr::DataFrameColumns, s::AbstractString) = + haskey(index(parent(itr)), s) + # Private fields are never exposed since they can conflict with column names -Base.propertynames(itr::DataFrameColumns, private::Bool=false) = propertynames(parent(itr)) +Base.propertynames(itr::DataFrameColumns, private::Bool=false) = + propertynames(parent(itr)) """ keys(dfc::DataFrameColumns) -Get a vector of column names of `dfc`. +Get a vector of column names of `dfc` as `Symbol`s. 
""" -Base.keys(itr::DataFrameColumns) = names(parent(itr)) +Base.keys(itr::DataFrameColumns) = propertynames(itr) """ pairs(dfc::DataFrameColumns) @@ -236,6 +253,7 @@ end Base.parent(itr::Union{DataFrameRows, DataFrameColumns}) = getfield(itr, :df) Base.names(itr::Union{DataFrameRows, DataFrameColumns}) = names(parent(itr)) +Base.names(itr::Union{DataFrameRows, DataFrameColumns}, cols) = names(parent(itr), cols) function Base.show(io::IO, dfrs::DataFrameRows; allrows::Bool = !get(io, :limit, false), diff --git a/src/abstractdataframe/join.jl b/src/abstractdataframe/join.jl index a5bcc84ed1..5150f25452 100644 --- a/src/abstractdataframe/join.jl +++ b/src/abstractdataframe/join.jl @@ -6,7 +6,8 @@ similar_missing(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) where {T} = fill!(similar(dv, Union{T, Missing}, dims), missing) -const OnType = Union{Symbol, NTuple{2,Symbol}, Pair{Symbol,Symbol}} +const OnType = Union{SymbolOrString, NTuple{2,Symbol}, Pair{Symbol,Symbol}, + Pair{<:AbstractString, <:AbstractString}} # helper structure for DataFrames joining struct DataFrameJoiner{DF1<:AbstractDataFrame, DF2<:AbstractDataFrame} @@ -23,16 +24,18 @@ struct DataFrameJoiner{DF1<:AbstractDataFrame, DF2<:AbstractDataFrame} left_on = Symbol[] right_on = Symbol[] for v in on_cols - if v isa Symbol - push!(left_on, v) - push!(right_on, v) - elseif v isa Pair{Symbol,Symbol} || v isa NTuple{2,Symbol} - push!(left_on, first(v)) - push!(right_on, last(v)) + if v isa SymbolOrString + push!(left_on, Symbol(v)) + push!(right_on, Symbol(v)) + elseif v isa Union{Pair{Symbol,Symbol}, + Pair{<:AbstractString, <:AbstractString}, + NTuple{2,Symbol}} + push!(left_on, Symbol(first(v))) + push!(right_on, Symbol(last(v))) if v isa NTuple{2,Symbol} - Base.depwarn("Using a `Tuple{Symbol, Symbol}` or a vector containing such tuples " * - "as a value of `on` keyword argument is deprecated: use " * - "`Pair{Symbol,Symbol}` instead.", :join) + Base.depwarn("Using a `Tuple{Symbol, Symbol}` or a 
vector containing " * + "such tuples as a value of `on` keyword argument is " * + "deprecated: use `Pair{Symbol,Symbol}` instead.", :join) end else @@ -75,14 +78,15 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, roil = length(rightonly_ixs) if loil > 0 - # combine the matched (left_ixs.orig) and non-matched (leftonly_ixs.orig) indices of the left table rows - # preserving the original rows order + # combine the matched (left_ixs.orig) and non-matched (leftonly_ixs.orig) + # indices of the left table rows, preserving the original rows order all_orig_left_ixs = similar(left_ixs.orig, lil + loil) @inbounds all_orig_left_ixs[left_ixs.join] = left_ixs.orig @inbounds all_orig_left_ixs[leftonly_ixs.join] = leftonly_ixs.orig else # the result contains only the left rows that are matched to right rows (left_ixs) - all_orig_left_ixs = left_ixs.orig # no need to copy left_ixs.orig as it's not used elsewhere + # no need to copy left_ixs.orig as it's not used elsewhere + all_orig_left_ixs = left_ixs.orig end # permutation to swap rightonly and leftonly rows right_perm = vcat(1:ril, ril+roil+1:ril+roil+loil, ril+1:ril+roil) @@ -93,7 +97,7 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig) # compose right half of the result taking all right columns excluding on - dfr_noon = without(joiner.dfr, joiner.right_on) + dfr_noon = select(joiner.dfr, Not(joiner.right_on), copycols=false) nrow = length(all_orig_left_ixs) + roil @assert nrow == length(all_orig_right_ixs) + loil @@ -112,7 +116,7 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, copyto!(cols[i+ncleft], view(col, all_orig_right_ixs)) permute!(cols[i+ncleft], right_perm) end - res = DataFrame(cols, vcat(names(joiner.dfl), names(dfr_noon)), + res = DataFrame(cols, vcat(_names(joiner.dfl), _names(dfr_noon)), makeunique=makeunique, copycols=false) if length(rightonly_ixs.join) > 0 @@ -121,7 +125,8 @@ 
function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, for (on_col_ix, on_col) in enumerate(joiner.left_on) # fix the result of the rightjoin by taking the nonmissing values from the right table offset = nrow - length(rightonly_ixs.orig) + 1 - copyto!(res[!, on_col], offset, view(joiner.dfr_on[!, on_col_ix], rightonly_ixs.orig)) + copyto!(res[!, on_col], offset, + view(joiner.dfr_on[!, on_col_ix], rightonly_ixs.orig)) end end if kind ∈ (:right, :outer) && !isempty(rightonly_ixs.join) @@ -166,7 +171,8 @@ function update_row_maps!(left_table::AbstractDataFrame, ixs end @inline update!(ixs::Nothing, orig_ixs::AbstractArray) = nothing - @inline update!(mask::Vector{Bool}, orig_ixs::AbstractArray) = (mask[orig_ixs] .= false) + @inline update!(mask::Vector{Bool}, orig_ixs::AbstractArray) = + (mask[orig_ixs] .= false) # iterate over left rows and compose the left<->right index map right_dict_cols = ntuple(i -> right_dict.df[!, i], ncol(right_dict.df)) @@ -211,18 +217,21 @@ function update_row_maps!(left_table::AbstractDataFrame, leftonly_ixs = init_map(left_table, map_leftonly) right_ixs = init_map(right_table, map_right) rightonly_mask = map_rightonly ? fill(true, nrow(right_table)) : nothing - update_row_maps!(left_table, right_table, right_dict, left_ixs, leftonly_ixs, right_ixs, rightonly_mask) + update_row_maps!(left_table, right_table, right_dict, left_ixs, leftonly_ixs, + right_ixs, rightonly_mask) if map_rightonly rightonly_orig_ixs = findall(rightonly_mask) + leftonly_ixs_len = leftonly_ixs === nothing ? 0 : length(leftonly_ixs) rightonly_ixs = RowIndexMap(rightonly_orig_ixs, - collect(length(right_ixs.orig) + - (leftonly_ixs === nothing ? 
0 : length(leftonly_ixs)) .+ + collect(length(right_ixs.orig) .+ + leftonly_ixs_len .+ (1:length(rightonly_orig_ixs)))) else rightonly_ixs = nothing end - return to_bimap(left_ixs), to_bimap(leftonly_ixs), to_bimap(right_ixs), to_bimap(rightonly_ixs) + return to_bimap(left_ixs), to_bimap(leftonly_ixs), + to_bimap(right_ixs), to_bimap(rightonly_ixs) end function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; @@ -276,24 +285,28 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; end if kind == :inner - joined = compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on, - group_rows(joiner.dfr_on), - true, false, true, false)..., + joined = compose_joined_table(joiner, kind, + update_row_maps!(joiner.dfl_on, joiner.dfr_on, + group_rows(joiner.dfr_on), + true, false, true, false)..., makeunique=makeunique) elseif kind == :left - joined = compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on, - group_rows(joiner.dfr_on), - true, true, true, false)..., + joined = compose_joined_table(joiner, kind, + update_row_maps!(joiner.dfl_on, joiner.dfr_on, + group_rows(joiner.dfr_on), + true, true, true, false)..., makeunique=makeunique) elseif kind == :right - joined = compose_joined_table(joiner, kind, update_row_maps!(joiner.dfr_on, joiner.dfl_on, - group_rows(joiner.dfl_on), - true, true, true, false)[[3, 4, 1, 2]]..., + joined = compose_joined_table(joiner, kind, + update_row_maps!(joiner.dfr_on, joiner.dfl_on, + group_rows(joiner.dfl_on), + true, true, true, false)[[3, 4, 1, 2]]..., makeunique=makeunique) elseif kind == :outer - joined = compose_joined_table(joiner, kind, update_row_maps!(joiner.dfl_on, joiner.dfr_on, - group_rows(joiner.dfr_on), - true, true, true, true)..., + joined = compose_joined_table(joiner, kind, + update_row_maps!(joiner.dfl_on, joiner.dfr_on, + group_rows(joiner.dfr_on), + true, true, true, true)..., makeunique=makeunique) elseif kind == :semi # hash the right rows @@ -346,13 
+359,14 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; end """ - innerjoin(df1, df2; on = Symbol[], makeunique = false, + innerjoin(df1, df2; on, makeunique = false, validate = (false, false)) - innerjoin(df1, df2, dfs...; on = Symbol[], makeunique = false, + innerjoin(df1, df2, dfs...; on, makeunique = false, validate = (false, false)) -Perform an inner join of two or more data frame objects and return a `DataFrame` containing -the result. An inner join includes rows with keys that match in all passed data frames. +Perform an inner join of two or more data frame objects and return a `DataFrame` +containing the result. An inner join includes rows with keys that match in all +passed data frames. # Arguments - `df1`, `df2`, `dfs...`: the `AbstractDataFrames` to be joined @@ -375,12 +389,13 @@ the result. An inner join includes rows with keys that match in all passed data run check for `df1` and the second element for `df2`. By default no check is performed. -When merging `on` categorical columns that differ in the ordering of their levels, the -ordering of the left data frame takes precedence over the ordering of the right data frame. +When merging `on` categorical columns that differ in the ordering of their +levels, the ordering of the left data frame takes precedence over the ordering +of the right data frame. -If more than two data frames are passed, the join is performed +If more than two data frames are passed, the join is performed recursively with +left associativity. In this case the `validate` keyword argument is applied recursively with left associativity. -In this case the `validate` keyword argument is applied recursively with left associativity. See also: [`leftjoin`](@ref), [`rightjoin`](@ref), [`outerjoin`](@ref), [`semijoin`](@ref), [`antijoin`](@ref), [`crossjoin`](@ref). 
@@ -440,18 +455,21 @@ julia> innerjoin(name, job2, on = [:ID => :identifier]) ``` """ innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique::Bool=false, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false)) = - _join(df1, df2, on=on, kind=:inner, makeunique=makeunique, indicator=nothing, - validate=validate) + _join(df1, df2, on=on, kind=:inner, makeunique=makeunique, + indicator=nothing, validate=validate) + innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique::Bool=false, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false)) = innerjoin(innerjoin(df1, df2, on=on, makeunique=makeunique, validate=validate), dfs..., on=on, makeunique=makeunique, validate=validate) """ - leftjoin(df1, df2; on = Symbol[], makeunique = false, + leftjoin(df1, df2; on, makeunique = false, indicator = nothing, validate = (false, false)) Perform a left join of twodata frame objects and return a `DataFrame` containing @@ -482,8 +500,9 @@ the result. A left join includes all rows from `df1`. All columns of the returned data table will support missing values. -When merging `on` categorical columns that differ in the ordering of their levels, the -ordering of the left data frame takes precedence over the ordering of the right data frame. +When merging `on` categorical columns that differ in the ordering of their +levels, the ordering of the left data frame takes precedence over the ordering +of the right data frame. See also: [`innerjoin`](@ref), [`rightjoin`](@ref), [`outerjoin`](@ref), [`semijoin`](@ref), [`antijoin`](@ref), [`crossjoin`](@ref). 
@@ -553,7 +572,7 @@ leftjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; validate=validate) """ - rightjoin(df1, df2; on = Symbol[], makeunique = false, + rightjoin(df1, df2; on, makeunique = false, indicator = nothing, validate = (false, false)) Perform a right join on two data frame objects and return a `DataFrame` containing @@ -584,8 +603,9 @@ the result. A right join includes all rows from `df2`. All columns of the returned data table will support missing values. -When merging `on` categorical columns that differ in the ordering of their levels, the -ordering of the left data frame takes precedence over the ordering of the right data frame. +When merging `on` categorical columns that differ in the ordering of their +levels, the ordering of the left data frame takes precedence over the ordering +of the right data frame. See also: [`innerjoin`](@ref), [`leftjoin`](@ref), [`outerjoin`](@ref), [`semijoin`](@ref), [`antijoin`](@ref), [`crossjoin`](@ref). @@ -652,16 +672,17 @@ rightjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; makeunique::Bool=false, indicator::Union{Nothing, Symbol} = nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false)) = _join(df1, df2, on=on, kind=:right, makeunique=makeunique, indicator=indicator, - validate=validate) + validate=validate) """ - outerjoin(df1, df2; on = Symbol[], kind = :inner, makeunique = false, + outerjoin(df1, df2; on, makeunique = false, indicator = nothing, validate = (false, false)) - outerjoin(df1, df2, dfs...; on = Symbol[], kind = :inner, makeunique = false, + outerjoin(df1, df2, dfs...; on, makeunique = false, validate = (false, false)) -Perform an outer join of two or more data frame objects and return a `DataFrame` containing -the result. An outer join includes rows with keys that appear in any of the passed data frames. +Perform an outer join of two or more data frame objects and return a `DataFrame` +containing the result.
An outer join includes rows with keys that appear in any +of the passed data frames. # Arguments - `df1`, `df2`, `dfs...` : the `AbstractDataFrames` to be joined @@ -691,8 +712,9 @@ the result. An outer join includes rows with keys that appear in any of the pass All columns of the returned data table will support missing values. -When merging `on` categorical columns that differ in the ordering of their levels, the -ordering of the left data frame takes precedence over the ordering of the right data frame. +When merging `on` categorical columns that differ in the ordering of their +levels, the ordering of the left data frame takes precedence over the ordering +of the right data frame. If more than two data frames are passed, the join is performed recursively with left associativity. @@ -768,6 +790,7 @@ outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false)) = _join(df1, df2, on=on, kind=:outer, makeunique=makeunique, indicator=indicator, validate=validate) + outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false)) = @@ -775,10 +798,11 @@ outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame dfs..., on=on, makeunique=makeunique, validate=validate) """ - semijoin(df1, df2; on = Symbol[], makeunique = false, validate = (false, false)) + semijoin(df1, df2; on, makeunique = false, validate = (false, false)) -Perform a semi join of two data frame objects and return a `DataFrame` containing the result. -A semi join returns the subset of rows of `df1` that match with the keys in `df2`. +Perform a semi join of two data frame objects and return a `DataFrame` +containing the result. A semi join returns the subset of rows of `df1` that +match with the keys in `df2`. 
# Arguments - `df1`, `df2`: the `AbstractDataFrames` to be joined @@ -803,8 +827,9 @@ A semi join returns the subset of rows of `df1` that match with the keys in `df2 run check for `df1` and the second element for `df2`. By default no check is performed. -When merging `on` categorical columns that differ in the ordering of their levels, the -ordering of the left data frame takes precedence over the ordering of the right data frame. +When merging `on` categorical columns that differ in the ordering of their +levels, the ordering of the left data frame takes precedence over the ordering +of the right data frame. See also: [`innerjoin`](@ref), [`leftjoin`](@ref), [`rightjoin`](@ref), [`outerjoin`](@ref), [`antijoin`](@ref), [`crossjoin`](@ref). @@ -866,14 +891,15 @@ julia> semijoin(name, job2, on = [:ID => :identifier]) semijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false)) = - _join(df1, df2, on=on, kind=:semi, makeunique=makeunique, indicator=nothing, - validate=validate) + _join(df1, df2, on=on, kind=:semi, makeunique=makeunique, + indicator=nothing, validate=validate) """ - antijoin(df1, df2; on = Symbol[], makeunique = false, validate = (false, false)) + antijoin(df1, df2; on, makeunique = false, validate = (false, false)) -Perform an anti join of two data frame objects and return a `DataFrame` containing the result. -An anti join returns the subset of rows of `df1` that do not match with the keys in `df2`. +Perform an anti join of two data frame objects and return a `DataFrame` +containing the result. An anti join returns the subset of rows of `df1` that do +not match with the keys in `df2`. # Arguments - `df1`, `df2`: the `AbstractDataFrames` to be joined @@ -894,8 +920,9 @@ An anti join returns the subset of rows of `df1` that do not match with the keys run check for `df1` and the second element for `df2`. 
By default no check is performed. -When merging `on` categorical columns that differ in the ordering of their levels, the -ordering of the left data frame takes precedence over the ordering of the right data frame. +When merging `on` categorical columns that differ in the ordering of their +levels, the ordering of the left data frame takes precedence over the ordering +of the right data frame. See also: [`innerjoin`](@ref), [`leftjoin`](@ref), [`rightjoin`](@ref), [`outerjoin`](@ref), [`semijoin`](@ref), [`crossjoin`](@ref). @@ -954,14 +981,15 @@ julia> antijoin(name, job2, on = [:ID => :identifier]) antijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false)) = - _join(df1, df2, on=on, kind=:anti, makeunique=makeunique, indicator=nothing, - validate=validate) + _join(df1, df2, on=on, kind=:anti, makeunique=makeunique, + indicator=nothing, validate=validate) """ crossjoin(df1, df2, dfs...; makeunique = false) -Perform a cross join of two or more data frame objects and return a `DataFrame` containing -the result. A cross join returns the cartesian product of rows from all passed data frames. +Perform a cross join of two or more data frame objects and return a `DataFrame` +containing the result. A cross join returns the cartesian product of rows from +all passed data frames. 
# Arguments - `df1`, `df2`, `dfs...` : the `AbstractDataFrames` to be joined @@ -1017,7 +1045,7 @@ function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; makeunique::B colindex = merge(index(df1), index(df2), makeunique=makeunique) cols = Any[[repeat(c, inner=r2) for c in eachcol(df1)]; [repeat(c, outer=r1) for c in eachcol(df2)]] - DataFrame(cols, colindex, copycols=false) + return DataFrame(cols, colindex, copycols=false) end crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index ce93a7b805..5f4adf5c2a 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -1,6 +1,6 @@ """ stack(df::AbstractDataFrame, [measure_vars], [id_vars]; - variable_name::Symbol=:variable, value_name::Symbol=:value, + variable_name=:variable, value_name=:value, view::Bool=false, variable_eltype::Type=CategoricalValue{String}) Stack a data frame `df`, i.e. convert it from wide to long format. @@ -18,24 +18,24 @@ that return views into the original data frame. # Arguments - `df` : the AbstractDataFrame to be stacked -- `measure_vars` : the columns to be stacked (the measurement - variables), a normal column indexing type, like a `Symbol`, - `Vector{Symbol}`, Int, etc.; If neither `measure_vars` - or `id_vars` are given, `measure_vars` defaults to all - floating point columns. -- `id_vars` : the identifier columns that are repeated during - stacking, a normal column indexing type; defaults to all - variables that are not `measure_vars` -- `variable_name` : the name of the new stacked column that shall hold the names - of each of `measure_vars` -- `value_name` : the name of the new stacked column containing the values from - each of `measure_vars` +- `measure_vars` : the columns to be stacked (the measurement variables), + as a column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). 
+ If neither `measure_vars` or `id_vars` are given, `measure_vars` + defaults to all floating point columns. +- `id_vars` : the identifier columns that are repeated during stacking, + as a column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). + Defaults to all variables that are not `measure_vars` +- `variable_name` : the name (`Symbol` or string) of the new stacked column that + shall hold the names of each of `measure_vars` +- `value_name` : the name (`Symbol` or string) of the new stacked column containing + the values from each of `measure_vars` - `view` : whether the stacked data frame should be a view rather than contain - freshly allocated vectors. -- `variable_eltype` : determines the element type of column `variable_name`. By default - a categorical vector of strings is created. - If `variable_eltype=Symbol` it is a vector of `Symbol`, - and if `variable_eltype=String` a vector of `String` is produced. + freshly allocated vectors. +- `variable_eltype` : determines the element type of column `variable_name`. + By default a categorical vector of strings is created. + If `variable_eltype=Symbol` it is a vector of `Symbol`, + and if `variable_eltype=String` a vector of `String` is produced. + # Examples ```julia @@ -55,33 +55,37 @@ function stack(df::AbstractDataFrame, measure_vars = findall(col -> eltype(col) <: Union{AbstractFloat, Missing}, eachcol(df)), id_vars = Not(measure_vars); - variable_name::Symbol=:variable, - value_name::Symbol=:value, view::Bool=false, + variable_name::SymbolOrString=:variable, + value_name::SymbolOrString=:value, view::Bool=false, variable_eltype::Type=CategoricalValue{String}) + variable_name_s = Symbol(variable_name) + value_name_s = Symbol(value_name) # getindex from index returns either Int or AbstractVector{Int} mv_tmp = index(df)[measure_vars] ints_measure_vars = mv_tmp isa Int ? [mv_tmp] : mv_tmp idv_tmp = index(df)[id_vars] ints_id_vars = idv_tmp isa Int ? 
[idv_tmp] : idv_tmp if view - return _stackview(df, ints_measure_vars, ints_id_vars, variable_name=variable_name, - value_name=value_name, variable_eltype=variable_eltype) + return _stackview(df, ints_measure_vars, ints_id_vars, + variable_name=variable_name_s, + value_name=value_name_s, + variable_eltype=variable_eltype) end N = length(ints_measure_vars) cnames = _names(df)[ints_id_vars] - push!(cnames, variable_name) - push!(cnames, value_name) + push!(cnames, variable_name_s) + push!(cnames, value_name_s) if variable_eltype <: CategoricalValue{String} - nms = String.(_names(df)[ints_measure_vars]) + nms = names(df, ints_measure_vars) catnms = categorical(nms) levels!(catnms, nms) elseif variable_eltype === Symbol catnms = _names(df)[ints_measure_vars] elseif variable_eltype === String - catnms = PooledArray(String.(_names(df)[ints_measure_vars])) + catnms = PooledArray(names(df, ints_measure_vars)) else - throw(ArgumentError("`variable_eltype` keyword argument accepts only `CategoricalValue{String}`, " * - "`String` or `Symbol` as a value.")) + throw(ArgumentError("`variable_eltype` keyword argument accepts only " * + "`CategoricalValue{String}`, `String` or `Symbol` as a value.")) end return DataFrame(AbstractVector[[repeat(df[!, c], outer=N) for c in ints_id_vars]..., # id_var columns repeat(catnms, inner=nrow(df)), # variable @@ -97,16 +101,16 @@ function _stackview(df::AbstractDataFrame, measure_vars::AbstractVector{Int}, push!(cnames, variable_name) push!(cnames, value_name) if variable_eltype <: CategoricalValue{String} - nms = String.(_names(df)[measure_vars]) + nms = names(df, measure_vars) catnms = categorical(nms) levels!(catnms, nms) elseif variable_eltype <: Symbol catnms = _names(df)[measure_vars] elseif variable_eltype <: String - catnms = String.(_names(df)[measure_vars]) + catnms = names(df, measure_vars) else - throw(ArgumentError("`variable_eltype` keyword argument accepts only `CategoricalValue{String}`, " * - "`String` or `Symbol` as a 
value.")) + throw(ArgumentError("`variable_eltype` keyword argument accepts only " * + "`CategoricalValue{String}`, `String` or `Symbol` as a value.")) end return DataFrame(AbstractVector[[RepeatedVector(df[!, c], 1, N) for c in id_vars]..., # id_var columns RepeatedVector(catnms, nrow(df), 1), # variable @@ -115,34 +119,31 @@ function _stackview(df::AbstractDataFrame, measure_vars::AbstractVector{Int}, end """ - unstack(df::AbstractDataFrame, rowkeys::Union{Integer, Symbol}, - colkey::Union{Integer, Symbol}, value::Union{Integer, Symbol}; - renamecols::Function=identity) - unstack(df::AbstractDataFrame, rowkeys::AbstractVector{<:Union{Integer, Symbol}}, - colkey::Union{Integer, Symbol}, value::Union{Integer, Symbol}; - renamecols::Function=identity) - unstack(df::AbstractDataFrame, colkey::Union{Integer, Symbol}, - value::Union{Integer, Symbol}; renamecols::Function=identity) + unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity) + unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity) unstack(df::AbstractDataFrame; renamecols::Function=identity) Unstack data frame `df`, i.e. convert it from long to wide format. -If `colkey` contains `missing` values then they will be skipped and a warning will be printed. +If `colkey` contains `missing` values then they will be skipped and a warning +will be printed. -If combination of `rowkeys` and `colkey` contains duplicate entries then last `value` will -be retained and a warning will be printed. +If combination of `rowkeys` and `colkey` contains duplicate entries then last +`value` will be retained and a warning will be printed. 
# Arguments - `df` : the AbstractDataFrame to be unstacked -- `rowkeys` : the column(s) with a unique key for each row, if not given, - find a key by grouping on anything not a `colkey` or `value` -- `colkey` : the column holding the column names in wide format, +- `rowkeys` : the columns with a unique key for each row, if not given, + find a key by grouping on anything not a `colkey` or `value`. + Can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). +- `colkey` : the column ($COLUMNINDEX_STR) holding the column names in wide format, defaults to `:variable` -- `value` : the value column, defaults to `:value` +- `value` : the value column ($COLUMNINDEX_STR), defaults to `:value` - `renamecols` : a function called on each unique value in `colkey` which must return the name of the column to be created (typically as a string or a `Symbol`). Duplicate names are not allowed. + # Examples ```julia wide = DataFrame(id = 1:12, @@ -185,7 +186,7 @@ function _unstack(df::AbstractDataFrame, rowkey::Int, colkey::Int, kref = keycol.refs[k] if kref <= 0 # we have found missing in colkey if !warned_missing - @warn("Missing value in variable $(_names(df)[colkey]) at row $k. Skipping.") + @warn("Missing value in variable :$(_names(df)[colkey]) at row $k. Skipping.") warned_missing = true end continue # skip processing it @@ -266,7 +267,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int}, kref = keycol.refs[k] if kref <= 0 if !warned_missing - @warn("Missing value in variable $(_names(df)[colkey]) at row $k. Skipping.") + @warn("Missing value in variable :$(_names(df)[colkey]) at row $k. Skipping.") warned_missing = true end continue @@ -333,7 +334,8 @@ Base.eltype(v::Type{StackedVector{T}}) where {T} = T Base.similar(v::StackedVector, T::Type, dims::Union{Integer, AbstractUnitRange}...) = similar(v.components[1], T, dims...) 
-CategoricalArrays.CategoricalArray(v::StackedVector) = CategoricalArray(v[:]) # could be more efficient +CategoricalArrays.CategoricalArray(v::StackedVector) = + CategoricalArray(v[:]) # could be more efficient """ diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index cc953afbb3..5aa917eb14 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -1,6 +1,4 @@ # TODO: -# * add NT (or better name) to column selector passing NamedTuple -# (also in other places: filter, combine) # * add select/select!/transform/transform! for GroupedDataFrame # normalize_selection function makes sure that whatever input format of idx is it @@ -35,7 +33,8 @@ _by_row_helper(x::Union{NamedTuple, DataFrameRow}) = Base.broadcastable(x::ByRow) = Ref(x) (f::ByRow)(cols::AbstractVector...) = _by_row_helper.(f.fun.(cols...)) -(f::ByRow)(table::NamedTuple) = _by_row_helper.(f.fun.(Tables.namedtupleiterator(table))) +(f::ByRow)(table::NamedTuple) = + _by_row_helper.(f.fun.(Tables.namedtupleiterator(table))) # add a method to funname defined in other/utils.jl funname(row::ByRow) = funname(row.fun) @@ -51,10 +50,10 @@ normalize_selection(idx::AbstractIndex, sel) = end end - normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), Symbol}) = length(idx) == 0 ? 
(Int[] => (() -> 0) => last(sel)) : (1 => length => last(sel)) - +normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), <:AbstractString}) = + normalize_selection(idx, first(sel) => Symbol(last(sel))) normalize_selection(idx::AbstractIndex, sel::typeof(nrow)) = normalize_selection(idx, nrow => :nrow) @@ -68,6 +67,9 @@ function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, Symbol return c => identity => last(sel) end +normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractString}) = + normalize_selection(idx, first(sel) => Symbol(last(sel))) + function normalize_selection(idx::AbstractIndex, sel::Pair{<:Any,<:Pair{<:Union{Base.Callable, ByRow}, Symbol}}) if first(sel) isa AsTable @@ -79,7 +81,7 @@ function normalize_selection(idx::AbstractIndex, end if rawc isa AbstractVector{Int} c = rawc - elseif rawc isa AbstractVector{Symbol} + elseif rawc isa Union{AbstractVector{Symbol}, AbstractVector{<:AbstractString}} c = [idx[n] for n in rawc] else c = try @@ -99,6 +101,11 @@ function normalize_selection(idx::AbstractIndex, return (wanttable ? AsTable(c) : c) => last(sel) end +normalize_selection(idx::AbstractIndex, + sel::Pair{<:Any,<:Pair{<:Union{Base.Callable, ByRow}, + <:AbstractString}}) = + normalize_selection(idx, first(sel) => first(last(sel)) => Symbol(last(last(sel)))) + function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex,<:Union{Base.Callable, ByRow}}) c = idx[first(sel)] @@ -118,7 +125,7 @@ function normalize_selection(idx::AbstractIndex, end if rawc isa AbstractVector{Int} c = rawc - elseif rawc isa AbstractVector{Symbol} + elseif rawc isa Union{AbstractVector{Symbol}, AbstractVector{<:AbstractString}} c = [idx[n] for n in rawc] else c = try @@ -203,57 +210,60 @@ SELECT_ARG_RULES = """ Arguments passed as `args...` can be: - * Any index that is allowed for column indexing. 
In particular, symbols, integers, - vectors of symbols, vectors of integers, vectors of bools, regular expressions, - `All`, `Between`, and `Not` selectors are supported. - * Column transformation operations using the `Pair` notation that is described below - and vectors of such pairs. - - Columns can be renamed using the `old_column => new_column_name` syntax, - and transformed using the `old_column => fun => new_column_name` syntax. - `new_column_name` must be a `Symbol`, and `fun` a function or a type. If `old_column` - is a `Symbol` or an integer then `fun` is applied to the corresponding column vector. - Otherwise `old_column` can be any column indexing syntax, in which case `fun` - will be passed the column vectors specified by `old_column` as separate arguments. - The only exception is when `old_column` is an `AsTable` type wrapping a selector, - in which case `fun` is passed a `NamedTuple` containing the selected columns. - - If `fun` returns a value of type other than `AbstractVector` then it will be broadcasted - into a vector matching the target number of rows in the data frame, - unless its type is one of `AbstractDataFrame`, `NamedTuple`, `DataFrameRow`, - `AbstractMatrix`, in which case an error is thrown as currently these - return types are not allowed. - As a particular rule, values wrapped in a `Ref` or a `0`-dimensional `AbstractArray` - are unwrapped and then broadcasted. - - To apply `fun` to each row instead of whole columns, it can be wrapped in a `ByRow` - struct. In this case if `old_column` is a `Symbol` or an integer then `fun` is applied - to each element (row) of `old_column` using broadcasting. Otherwise `old_column` can be - any column indexing syntax, in which case `fun` will be passed one argument for each of - the columns specified by `old_column`. If `ByRow` is used it is not allowed for - `old_column` to select an empty set of columns nor for `fun` to return - a `NamedTuple` or a `DataFrameRow`. 
- - Column transformation can also be specified using the short `old_column => fun` form. - In this case, `new_column_name` is automatically generated as `\$(old_column)_\$(fun)`. - Up to three column names are used for multiple input columns and they are joined - using `_`; if more than three columns are passed then the name consists of the - first two names and `etc` suffix then, e.g. `[:a,:b,:c,:d] => fun` produces - the new column name `:a_b_etc_fun`. - - Column renaming and transformation operations can be passed wrapped in vectors - (this is useful when combined with broadcasting). - - As a special rule passing `nrow` without specifying `old_column` creates a column named `:nrow` - containing a number of rows in a source data frame, and passing `nrow => new_column_name` - stores the number of rows in source data frame in `new_column_name` column. - - If a collection of column names is passed to `select!` or `select` then requesting - duplicate column names in target data frame are accepted (e.g. `select!(df, [:a], :, r"a")` - is allowed) and only the first occurrence is used. In particular a syntax to move - column `:col` to the first position in the data frame is `select!(df, :col, :)`. - On the contrary, output column names of renaming, transformation and single column - selection operations must be unique, so e.g. `select!(df, :a, :a => :a)` or + * Any index that is allowed for column indexing + ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). + * Column transformation operations using the `Pair` notation that is + described below and vectors of such pairs. + + Columns can be renamed using the `old_column => new_column_name` syntax, and + transformed using the `old_column => fun => new_column_name` syntax. + `new_column_name` must be a `Symbol` or a string, and `fun` a function or a + type. If `old_column` is a `Symbol`, a string, or an integer then `fun` is + applied to the corresponding column vector. 
Otherwise `old_column` can be + any column indexing syntax, in which case `fun` will be passed the column + vectors specified by `old_column` as separate arguments. The only exception + is when `old_column` is an `AsTable` type wrapping a selector, in which case + `fun` is passed a `NamedTuple` containing the selected columns. + + If `fun` returns a value of type other than `AbstractVector` then it will be + broadcasted into a vector matching the target number of rows in the data + frame, unless its type is one of `AbstractDataFrame`, `NamedTuple`, + `DataFrameRow`, `AbstractMatrix`, in which case an error is thrown as + currently these return types are not allowed. As a particular rule, values + wrapped in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and + then broadcasted. + + To apply `fun` to each row instead of whole columns, it can be wrapped in a + `ByRow` struct. In this case if `old_column` is a `Symbol`, a string, or an + integer then `fun` is applied to each element (row) of `old_column` using + broadcasting. Otherwise `old_column` can be any column indexing syntax, in + which case `fun` will be passed one argument for each of the columns + specified by `old_column`. If `ByRow` is used it is not allowed for + `old_column` to select an empty set of columns nor for `fun` to return a + `NamedTuple` or a `DataFrameRow`. + + Column transformation can also be specified using the short `old_column => + fun` form. In this case, `new_column_name` is automatically generated as + `\$(old_column)_\$(fun)`. Up to three column names are used for multiple + input columns and they are joined using `_`; if more than three columns are + passed then the name consists of the first two names and `etc` suffix then, + e.g. `[:a,:b,:c,:d] => fun` produces the new column name `:a_b_etc_fun`. + + Column renaming and transformation operations can be passed wrapped in + vectors (this is useful when combined with broadcasting). 
+ + As a special rule passing `nrow` without specifying `old_column` creates a + column named `:nrow` containing a number of rows in a source data frame, and + passing `nrow => new_column_name` stores the number of rows in source data + frame in `new_column_name` column. + + If a collection of column names is passed to `select!` or `select` then + requesting duplicate column names in target data frame are accepted (e.g. + `select!(df, [:a], :, r"a")` is allowed) and only the first occurrence is + used. In particular a syntax to move column `:col` to the first position in + the data frame is `select!(df, :col, :)`. On the contrary, output column + names of renaming, transformation and single column selection operations + must be unique, so e.g. `select!(df, :a, :a => :a)` or `select!(df, :a, :a => ByRow(sin) => :a)` are not allowed. """ @@ -265,8 +275,9 @@ Mutate `df` in place to retain only columns specified by `args...` and return it $SELECT_ARG_RULES Note that including the same column several times in the data frame via renaming -or transformations that return the same object without copying will create column aliases. -An example of such a situation is `select!(df, :a, :a => :b, :a => identity => :c)`. +or transformations that return the same object without copying will create +column aliases. An example of such a situation is +`select!(df, :a, :a => :b, :a => identity => :c)`. # Examples ```jldoctest @@ -361,9 +372,14 @@ function select!(df::DataFrame, args::AbstractVector{Int}) end select!(df::DataFrame, c::Int) = select!(df, [c]) -select!(df::DataFrame, c::Union{AbstractVector{<:Integer}, AbstractVector{Symbol}, - Colon, All, Not, Between, Regex}) = - select!(df, index(df)[c]) + +function select!(df::DataFrame, c::MultiColumnIndex) + if c isa AbstractVector{<:Pair} + return select!(df, c...) + else + return select!(df, index(df)[c]) + end +end function select!(df::DataFrame, cs...) 
newdf = select(df, cs..., copycols=false) @@ -390,7 +406,8 @@ transform!(df::DataFrame, args...) = select!(df, :, args...) """ select(df::AbstractDataFrame, args...; copycols::Bool=true) -Create a new data frame that contains columns from `df` specified by `args` and return it. +Create a new data frame that contains columns from `df` specified by `args` and +return it. If `df` is a `DataFrame` or `copycols=true` then column renaming and transformations are supported. @@ -398,24 +415,26 @@ are supported. $SELECT_ARG_RULES If `df` is a `DataFrame` a new `DataFrame` is returned. -If `copycols=false`, then the returned `DataFrame` shares column vectors with `df` where possible. -If `copycols=true` (the default), then the returned `DataFrame` will not share columns with `df`. -The only exception for this rule is the `old_column => fun => new_column` transformation -when `fun` returns a vector that is not allocated by `fun` but is neither a `SubArray` nor one -of the input vectors. -In such a case a new `DataFrame` might contain aliases. Such a situation can only happen -with transformations which returns vectors other than their inputs, e.g. with -`select(df, :a => (x -> c) => :c1, :b => (x -> c) => :c2)` when `c` is a vector object -or with `select(df, :a => (x -> df.c) => :c2)`. - -If `df` is a `SubDataFrame` and `copycols=true` then a `DataFrame` is returned and -the same copying rules apply as for a `DataFrame` input: +If `copycols=false`, then the returned `DataFrame` shares column vectors with `df` +where possible. +If `copycols=true` (the default), then the returned `DataFrame` will not share +columns with `df`. +The only exception for this rule is the `old_column => fun => new_column` +transformation when `fun` returns a vector that is not allocated by `fun` but is +neither a `SubArray` nor one of the input vectors. +In such a case a new `DataFrame` might contain aliases. 
Such a situation can +only happen with transformations which return vectors other than their inputs, +e.g. with `select(df, :a => (x -> c) => :c1, :b => (x -> c) => :c2)` when `c` +is a vector object or with `select(df, :a => (x -> df.c) => :c2)`. + +If `df` is a `SubDataFrame` and `copycols=true` then a `DataFrame` is returned +and the same copying rules apply as for a `DataFrame` input: this means in particular that selected columns will be copied. If `copycols=false`, a `SubDataFrame` is returned without copying columns. -Note that including the same column several times in the data frame via renaming or -transformations that return the same object when `copycols=false` will create column -aliases. An example of such a situation is +Note that including the same column several times in the data frame via renaming +or transformations that return the same object when `copycols=false` will create +column aliases. An example of such a situation is `select(df, :a, :a => :b, :a => identity => :c, copycols=false)`.
# Examples @@ -502,9 +521,15 @@ julia> select(df, AsTable(:) => ByRow(mean)) select(df::DataFrame, args::AbstractVector{Int}; copycols::Bool=true) = DataFrame(_columns(df)[args], Index(_names(df)[args]), copycols=copycols) -select(df::DataFrame, c::Union{AbstractVector{<:Integer}, AbstractVector{Symbol}, - Colon, All, Not, Between, Regex}; copycols::Bool=true) = - select(df, index(df)[c], copycols=copycols) + +function select(df::DataFrame, c::MultiColumnIndex; copycols::Bool=true) + if c isa AbstractVector{<:Pair} + return select(df, c..., copycols=copycols) + else + return select(df, index(df)[c], copycols=copycols) + end +end + select(df::DataFrame, c::ColumnIndex; copycols::Bool=true) = select(df, [c], copycols=copycols) @@ -620,9 +645,14 @@ end select(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool=true) = select(dfv, [ind], copycols=copycols) -select(dfv::SubDataFrame, args::Union{AbstractVector{<:Integer}, AbstractVector{Symbol}, - Colon, All, Not, Between, Regex}; copycols::Bool=true) = - copycols ? dfv[:, args] : view(dfv, :, args) + +function select(dfv::SubDataFrame, args::MultiColumnIndex; copycols::Bool=true) + if args isa AbstractVector{<:Pair} + return select(dfv, args..., copycols=copycols) + else + return copycols ? 
dfv[:, args] : view(dfv, :, args) + end +end function select(dfv::SubDataFrame, args...; copycols::Bool=true) if copycols @@ -644,9 +674,9 @@ function select(dfv::SubDataFrame, args...; copycols::Bool=true) if ind isa ColumnIndex ind_idx = index(dfv)[ind] if ind_idx in seen_single_column - throw(ArgumentError("selecting the same column multiple times using" * - " Symbol or integer is not allowed ($ind was " * - "passed more than once")) + throw(ArgumentError("selecting the same column multiple times " * + "using Symbol, string or integer is not allowed " * + "($ind was passed more than once")) else push!(seen_single_column, ind_idx) end diff --git a/src/abstractdataframe/show.jl b/src/abstractdataframe/show.jl index 78c78d08f1..1344c0e176 100644 --- a/src/abstractdataframe/show.jl +++ b/src/abstractdataframe/show.jl @@ -14,7 +14,7 @@ let function ourstrwidth(io::IO, x::Any) truncate(buffer, 0) ourshow(IOContext(buffer, :compact=>get(io, :compact, true)), x) - textwidth(String(take!(buffer))) + return textwidth(String(take!(buffer))) end end @@ -427,8 +427,8 @@ NOTE: The value of `maxwidths[end]` must be the string width of required to render each column. - `allcols::Bool = false`: Whether to print all columns, rather than a subset that fits the device width. -- `splitcols::Bool`: Whether to split printing in chunks of columns fitting the screen width - rather than printing all columns in the same block. +- `splitcols::Bool`: Whether to split printing in chunks of columns fitting the + screen width rather than printing all columns in the same block. - `rowlabel::Symbol`: What label should be printed when rendering the numeric ID's of each row? Defaults to `:Row`. - `displaysummary::Bool`: Should a brief string summary of the @@ -653,8 +653,9 @@ while `splitcols` defaults to `true`. the first and last, when `df` is a `GroupedDataFrame`. By default this is the case only if `io` does not have the `IOContext` property `limit` set. 
-- `splitcols::Bool`: Whether to split printing in chunks of columns fitting the screen width - rather than printing all columns in the same block. Only applies if `allcols` is `true`. +- `splitcols::Bool`: Whether to split printing in chunks of columns fitting the + screen width rather than printing all columns in the same block. Only applies + if `allcols` is `true`. By default this is the case only if `io` has the `IOContext` property `limit` set. - `rowlabel::Symbol = :Row`: The label to use for the column containing row numbers. - `summary::Bool = true`: Whether to print a brief string summary of the data frame. diff --git a/src/abstractdataframe/sort.jl b/src/abstractdataframe/sort.jl index f0c1238e5c..cebbde7c19 100644 --- a/src/abstractdataframe/sort.jl +++ b/src/abstractdataframe/sort.jl @@ -73,7 +73,8 @@ _getcol(x) = x ### # Get an Ordering for a single column ### -function ordering(col_ord::UserColOrdering, lt::Function, by::Function, rev::Bool, order::Ordering) +function ordering(col_ord::UserColOrdering, lt::Function, by::Function, + rev::Bool, order::Ordering) for (k,v) in pairs(col_ord.kwargs) if k == :lt; lt = v elseif k == :by; by = v @@ -147,11 +148,13 @@ ordering(df::AbstractDataFrame, lt::Function, by::Function, rev::Bool, order::Or ###### function ordering(df::AbstractDataFrame, lt::AbstractVector{S}, by::AbstractVector{T}, - rev::AbstractVector{Bool}, order::AbstractVector) where {S<:Function, T<:Function} + rev::AbstractVector{Bool}, + order::AbstractVector) where {S<:Function, T<:Function} if !(length(lt) == length(by) == length(rev) == length(order) == size(df,2)) throw(ArgumentError("Orderings must be specified for all DataFrame columns")) end - DFPerm([Order.ord(_lt, _by, _rev, _order) for (_lt, _by, _rev, _order) in zip(lt, by, rev, order)], df) + DFPerm([Order.ord(_lt, _by, _rev, _order) for + (_lt, _by, _rev, _order) in zip(lt, by, rev, order)], df) end ################ @@ -159,13 +162,15 @@ end ################ ## Case 2a: The column 
is given directly ###### -ordering(df::AbstractDataFrame, col::ColumnIndex, lt::Function, by::Function, rev::Bool, order::Ordering) = +ordering(df::AbstractDataFrame, col::ColumnIndex, lt::Function, by::Function, + rev::Bool, order::Ordering) = Perm(Order.ord(lt, by, rev, order), df[!, col]) ###### ## Case 2b: The column is given as a UserColOrdering ###### -ordering(df::AbstractDataFrame, col_ord::UserColOrdering, lt::Function, by::Function, rev::Bool, order::Ordering) = +ordering(df::AbstractDataFrame, col_ord::UserColOrdering, lt::Function, by::Function, + rev::Bool, order::Ordering) = Perm(ordering(col_ord, lt, by, rev, order), df[!, col_ord.col]) ################ @@ -173,7 +178,8 @@ ordering(df::AbstractDataFrame, col_ord::UserColOrdering, lt::Function, by::Func ################ ## Case 3a: None of lt, by, rev, or order is an Array ###### -function ordering(df::AbstractDataFrame, cols::AbstractVector, lt::Function, by::Function, rev::Bool, order::Ordering) +function ordering(df::AbstractDataFrame, cols::AbstractVector, lt::Function, + by::Function, rev::Bool, order::Ordering) if length(cols) == 0 return ordering(df, lt, by, rev, order) @@ -204,9 +210,9 @@ end ###### # Case 3b: cols, lt, by, rev, and order are all arrays ###### -function ordering(df::AbstractDataFrame, cols::AbstractVector, - lt::AbstractVector{S}, by::AbstractVector{T}, - rev::AbstractVector{Bool}, order::AbstractVector) where {S<:Function, T<:Function} +function ordering(df::AbstractDataFrame, cols::AbstractVector, lt::AbstractVector{S}, + by::AbstractVector{T}, rev::AbstractVector{Bool}, + order::AbstractVector) where {S<:Function, T<:Function} if !(length(lt) == length(by) == length(rev) == length(order)) throw(ArgumentError("All ordering arguments must be 1 or the same length.")) @@ -274,7 +280,8 @@ end # TimSort is fast for data with structure, but only if the DataFrame is large enough # TODO: 8192 is informed but somewhat arbitrary -Sort.defalg(df::AbstractDataFrame) = size(df, 1) < 8192 
? Sort.MergeSort : SortingAlgorithms.TimSort +Sort.defalg(df::AbstractDataFrame) = + size(df, 1) < 8192 ? Sort.MergeSort : SortingAlgorithms.TimSort # For DataFrames, we can choose the algorithm based on the column type and requested ordering function Sort.defalg(df::AbstractDataFrame, ::Type{T}, o::Ordering) where T<:Real @@ -286,15 +293,15 @@ function Sort.defalg(df::AbstractDataFrame, ::Type{T}, o::Ordering) where T<:Rea Sort.defalg(df) end end -Sort.defalg(df::AbstractDataFrame, ::Type, o::Ordering) = Sort.defalg(df) -Sort.defalg(df::AbstractDataFrame, col ::ColumnIndex, o::Ordering) = Sort.defalg(df, eltype(df[!, col]), o) -Sort.defalg(df::AbstractDataFrame, col_ord::UserColOrdering, o::Ordering) = Sort.defalg(df, col_ord.col, o) -Sort.defalg(df::AbstractDataFrame, cols, o::Ordering) = Sort.defalg(df) - -function Sort.defalg(df::AbstractDataFrame, o::Ordering; alg=nothing, cols=[]) - alg != nothing && return alg - Sort.defalg(df, cols, o) -end + +Sort.defalg(df::AbstractDataFrame, ::Type, o::Ordering) = Sort.defalg(df) +Sort.defalg(df::AbstractDataFrame, col::ColumnIndex, o::Ordering) = + Sort.defalg(df, eltype(df[!, col]), o) +Sort.defalg(df::AbstractDataFrame, col_ord::UserColOrdering, o::Ordering) = + Sort.defalg(df, col_ord.col, o) +Sort.defalg(df::AbstractDataFrame, cols, o::Ordering) = Sort.defalg(df) +Sort.defalg(df::AbstractDataFrame, o::Ordering; alg=nothing, cols=[]) = + alg != nothing ? alg : Sort.defalg(df, cols, o) ######################## ## Actual sort functions @@ -305,8 +312,8 @@ end lt=isless, by=identity, rev::Bool=false, order::Ordering=Forward) Test whether data frame `df` sorted by column(s) `cols`. -`cols` can be either a `Symbol` or `Integer` column index, -a vector of such indices, `:`, `All`, `Not`, `Between`, or `Regex`. + +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). If `rev` is `true`, reverse sorting is performed. 
To enable reverse sorting only for some columns, pass `order(c, rev=true)` in `cols`, with `c` the @@ -315,7 +322,8 @@ See other methods for a description of other keyword arguments. """ function Base.issorted(df::AbstractDataFrame, cols=[]; lt=isless, by=identity, rev=false, order=Forward) - if cols isa Union{Colon, All, Not, Between, Regex} + # exclude AbstractVector as in that case cols can contain order(...) clauses + if cols isa MultiColumnIndex && !(cols isa AbstractVector) cols = index(df)[cols] end if cols isa ColumnIndex @@ -334,10 +342,12 @@ for s in [:(Base.sort), :(Base.sortperm)] function $s(df::AbstractDataFrame, cols=[]; alg=nothing, lt=isless, by=identity, rev=false, order=Forward) if !(isa(by, Function) || eltype(by) <: Function) - msg = "'by' must be a Function or a vector of Functions. Perhaps you wanted 'cols'." + msg = "'by' must be a Function or a vector of Functions. " * + " Perhaps you wanted 'cols'." throw(ArgumentError(msg)) end - if cols isa Union{All, Not, Between, Colon, Regex} + # exclude AbstractVector as in that case cols can contain order(...) clauses + if cols isa MultiColumnIndex && !(cols isa AbstractVector) cols = index(df)[cols] end ord = ordering(df, cols, lt, by, rev, order) @@ -353,8 +363,8 @@ end rev::Bool=false, order::Ordering=Forward) Return a copy of data frame `df` sorted by column(s) `cols`. -`cols` can be either a `Symbol` or `Integer` column index, or -a vector of such indices, `:`, `All`, `Not`, `Between`, or `Regex`. + +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). If `alg` is `nothing` (the default), the most appropriate algorithm is chosen automatically among `TimSort`, `MergeSort` and `RadixSort` depending @@ -427,6 +437,8 @@ sort(::AbstractDataFrame, ::Any) Return a permutation vector of row indices of data frame `df` that puts them in sorted order according to column(s) `cols`. +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). 
+ If `alg` is `nothing` (the default), the most appropriate algorithm is chosen automatically among `TimSort`, `MergeSort` and `RadixSort` depending on the type of the sorting columns and on the number of rows in `df`. @@ -478,6 +490,9 @@ julia> sortperm(df, (:x, :y), rev=true) """ sortperm(::AbstractDataFrame, ::Any) -Base.sort(df::AbstractDataFrame, a::Algorithm, o::Ordering) = df[sortperm(df, a, o),:] -Base.sortperm(df::AbstractDataFrame, a::Algorithm, o::Union{Perm,DFPerm}) = sort!([1:size(df, 1);], a, o) -Base.sortperm(df::AbstractDataFrame, a::Algorithm, o::Ordering) = sortperm(df, a, DFPerm(o,df)) +Base.sort(df::AbstractDataFrame, a::Algorithm, o::Ordering) = + df[sortperm(df, a, o),:] +Base.sortperm(df::AbstractDataFrame, a::Algorithm, o::Union{Perm,DFPerm}) = + sort!([1:size(df, 1);], a, o) +Base.sortperm(df::AbstractDataFrame, a::Algorithm, o::Ordering) = + sortperm(df, a, DFPerm(o,df)) diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 1356b5a7af..fe274daa86 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -8,16 +8,26 @@ particularly a Vector or CategoricalVector. # Constructors ```julia -DataFrame(columns::Vector, names::Vector{Symbol}; +DataFrame(columns::AbstractVector, names::AbstractVector{Symbol}; + makeunique::Bool=false, copycols::Bool=true) +DataFrame(columns::AbstractVector, names::AbstractVector{<:AbstractString}; makeunique::Bool=false, copycols::Bool=true) DataFrame(columns::NTuple{N,AbstractVector}, names::NTuple{N,Symbol}; makeunique::Bool=false, copycols::Bool=true) -DataFrame(columns::Matrix, names::Vector{Symbol}; makeunique::Bool=false) +DataFrame(columns::NTuple{N,AbstractVector}, names::NTuple{N,<:AbstractString}; + makeunique::Bool=false, copycols::Bool=true) +DataFrame(columns::Matrix, names::AbstractVector{Symbol}; makeunique::Bool=false) +DataFrame(columns::Matrix, names::AbstractVector{<:AbstractString}; + makeunique::Bool=false) DataFrame(kwargs...) 
DataFrame(pairs::Pair{Symbol,<:Any}...; makeunique::Bool=false, copycols::Bool=true) +DataFrame(pairs::Pair{<:AbstractString,<:Any}...; makeunique::Bool=false, + copycols::Bool=true) DataFrame() # an empty DataFrame -DataFrame(column_eltypes::Vector, names::AbstractVector{Symbol}, nrows::Integer=0; - makeunique::Bool=false) +DataFrame(column_eltypes::AbstractVector, names::AbstractVector{Symbol}, + nrows::Integer=0; makeunique::Bool=false) +DataFrame(column_eltypes::AbstractVector, names::AbstractVector{<:AbstractString}, + nrows::Integer=0; makeunique::Bool=false) DataFrame(ds::AbstractDict; copycols::Bool=true) DataFrame(table; makeunique::Bool=false, copycols::Bool=true) DataFrame(::Union{DataFrame, SubDataFrame}; copycols::Bool=true) @@ -37,8 +47,8 @@ DataFrame(::GroupedDataFrame) - `t` : elemental type of all columns - `nrows`, `ncols` : number of rows and columns - `column_eltypes` : element type of each column -- `categorical` : a vector of `Bool` indicating which columns should be converted to - `CategoricalVector` +- `categorical` : a vector of `Bool` indicating which columns should be converted + to `CategoricalVector` - `ds` : `AbstractDict` of columns - `table` : any type that implements the [Tables.jl](https://github.com/JuliaData/Tables.jl) interface; in particular @@ -47,12 +57,13 @@ DataFrame(::GroupedDataFrame) to `false` then the constructor will still copy the passed columns if it is not possible to construct a `DataFrame` without materializing new columns. -All columns in `columns` must be `AbstractVector`s and have the same length. -An exception are `DataFrame(kwargs...)` and `DataFrame(pairs::Pair{Symbol,<:Any}...)` -form constructors which additionally allow a column to be of any other type that is not -an `AbstractArray`, in which case the passed value is automatically repeated to fill -a new vector of the appropriate length. 
As a particular rule values stored in -a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and treated in the same way. +All columns in `columns` must be `AbstractVector`s and have the same length. An +exception are `DataFrame(kwargs...)` and `DataFrame(pairs::Pair...)` form +constructors which additionally allow a column to be of any other type that is +not an `AbstractArray`, in which case the passed value is automatically repeated +to fill a new vector of the appropriate length. As a particular rule values +stored in a `Ref` or a `0`-dimensional `AbstractArray` are unwrapped and treated +in the same way. # Notes The `DataFrame` constructor by default copies all columns vectors passed to it. @@ -153,9 +164,27 @@ function DataFrame(pairs::Pair{Symbol,<:Any}...; makeunique::Bool=false, copycols::Bool=true)::DataFrame colnames = [Symbol(k) for (k,v) in pairs] columns = Any[v for (k,v) in pairs] - DataFrame(columns, Index(colnames, makeunique=makeunique), copycols=copycols) + return DataFrame(columns, Index(colnames, makeunique=makeunique), + copycols=copycols) +end + +function DataFrame(pairs::Pair{<:AbstractString,<:Any}...; makeunique::Bool=false, + copycols::Bool=true)::DataFrame + colnames = [Symbol(k) for (k,v) in pairs] + columns = Any[v for (k,v) in pairs] + return DataFrame(columns, Index(colnames, makeunique=makeunique), + copycols=copycols) end +# these two are needed as a workaround Tables.jl dispatch +DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, + copycols::Bool=true) = + DataFrame(pairs..., makeunique=makeunique, copycols=copycols) + +DataFrame(pairs::NTuple{N, Pair}; makeunique::Bool=false, + copycols::Bool=true) where {N} = + DataFrame(pairs..., makeunique=makeunique, copycols=copycols) + function DataFrame(d::AbstractDict; copycols::Bool=true) if isa(d, Dict) colnames = sort!(collect(keys(d))) @@ -200,6 +229,10 @@ function DataFrame(columns::AbstractVector, cnames::AbstractVector{Symbol}; copycols=copycols) end 
+DataFrame(columns::AbstractVector, cnames::AbstractVector{<:AbstractString}; + makeunique::Bool=false, copycols::Bool=true) = + DataFrame(columns, Symbol.(cnames), makeunique=makeunique, copycols=copycols) + DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector{Symbol}=gennames(length(columns)); makeunique::Bool=false, copycols::Bool=true)::DataFrame = @@ -207,30 +240,49 @@ DataFrame(columns::AbstractVector{<:AbstractVector}, Index(convert(Vector{Symbol}, cnames), makeunique=makeunique), copycols=copycols) +DataFrame(columns::AbstractVector{<:AbstractVector}, + cnames::AbstractVector{<:AbstractString}; + makeunique::Bool=false, copycols::Bool=true) = + DataFrame(columns, Symbol.(cnames); makeunique=makeunique, copycols=copycols) + DataFrame(columns::NTuple{N, AbstractVector}, cnames::NTuple{N, Symbol}; makeunique::Bool=false, copycols::Bool=true) where {N} = DataFrame(collect(AbstractVector, columns), collect(Symbol, cnames), makeunique=makeunique, copycols=copycols) +DataFrame(columns::NTuple{N, AbstractVector}, cnames::NTuple{N, AbstractString}; + makeunique::Bool=false, copycols::Bool=true) where {N} = + DataFrame(columns, Symbol.(cnames); makeunique=makeunique, copycols=copycols) + DataFrame(columns::NTuple{N, AbstractVector}; copycols::Bool=true) where {N} = DataFrame(collect(AbstractVector, columns), gennames(length(columns)), copycols=copycols) -DataFrame(columns::AbstractMatrix, cnames::AbstractVector{Symbol} = gennames(size(columns, 2)); +DataFrame(columns::AbstractMatrix, + cnames::AbstractVector{Symbol} = gennames(size(columns, 2)); makeunique::Bool=false) = DataFrame(AbstractVector[columns[:, i] for i in 1:size(columns, 2)], cnames, makeunique=makeunique, copycols=false) +DataFrame(columns::AbstractMatrix, cnames::AbstractVector{<:AbstractString}; + makeunique::Bool=false) = + DataFrame(columns, Symbol.(cnames); makeunique=makeunique) + function DataFrame(column_eltypes::AbstractVector{T}, cnames::AbstractVector{Symbol}, 
nrows::Integer=0; makeunique::Bool=false)::DataFrame where T<:Type columns = AbstractVector[elty >: Missing ? fill!(Tables.allocatecolumn(elty, nrows), missing) : Tables.allocatecolumn(elty, nrows) for elty in column_eltypes] - return DataFrame(columns, Index(convert(Vector{Symbol}, cnames), makeunique=makeunique), - copycols=false) + return DataFrame(columns, Index(convert(Vector{Symbol}, cnames), + makeunique=makeunique), copycols=false) end +DataFrame(column_eltypes::AbstractVector{<:Type}, + cnames::AbstractVector{<:AbstractString}, + nrows::Integer=0; makeunique::Bool=false) = + DataFrame(column_eltypes, Symbol.(cnames), nrows; makeunique=makeunique) + """ DataFrame!(args...; kwargs...) @@ -263,14 +315,16 @@ function DataFrame!(args...; kwargs...) end DataFrame!(columns::AbstractMatrix, - cnames::AbstractVector{Symbol} = gennames(size(columns, 2)); + cnames::Union{AbstractVector{Symbol}, + AbstractVector{<:AbstractString}} = gennames(size(columns, 2)); makeunique::Bool=false) = throw(ArgumentError("It is not possible to construct a `DataFrame` from " * "`$(typeof(columns))` without allocating new columns: " * "use `DataFrame(...)` instead")) -DataFrame!(column_eltypes::AbstractVector{<:Type}, cnames::AbstractVector{Symbol}, +DataFrame!(column_eltypes::AbstractVector{<:Type}, + cnames::Union{AbstractVector{Symbol}, AbstractVector{<:AbstractString}}, nrows::Integer=0; makeunique::Bool=false)::DataFrame = throw(ArgumentError("It is not possible to construct an uninitialized `DataFrame`" * "without allocating new columns: use `DataFrame(...)` instead")) @@ -294,6 +348,13 @@ ncol(df::DataFrame) = length(index(df)) ## ############################################################################## +corrupt_msg(df::DataFrame, i::Integer) = + "Data frame is corrupt: length of column " * + ":$(_names(df)[i]) ($(length(df[!, i]))) " * + "does not match length of column 1 ($(length(df[!, 1]))). 
" * + "The column vector has likely been resized unintentionally " * + "(either directly or because it is shared with another data frame)." + function _check_consistency(df::DataFrame) cols, idx = _columns(df), index(df) ncols = length(cols) @@ -301,10 +362,7 @@ function _check_consistency(df::DataFrame) ncols == 0 && return nothing nrows = length(cols[1]) for i in 2:length(cols) - @assert length(cols[i]) == nrows "Data frame is corrupt: length of column :$(names(df)[i]) ($(length(df[!, i])))" * - " does not match length of column 1 ($(length(df[!, 1]))). " * - "The column vector has likely been resized unintentionally " * - "(either directly or because it is shared with another data frame)." + @assert length(cols[i]) == nrows corrupt_msg(df, i) end nothing end @@ -333,7 +391,8 @@ _check_consistency(df::AbstractDataFrame) = _check_consistency(parent(df)) @inbounds cols[col_ind][row_ind] end -@inline function Base.getindex(df::DataFrame, row_ind::Integer, col_ind::Symbol) +@inline function Base.getindex(df::DataFrame, row_ind::Integer, + col_ind::SymbolOrString) selected_column = index(df)[col_ind] @boundscheck if !checkindex(Bool, axes(df, 1), row_ind) throw(BoundsError(df, (row_ind, col_ind))) @@ -368,14 +427,14 @@ end @inbounds cols[col_ind] end -function Base.getindex(df::DataFrame, ::typeof(!), col_ind::Symbol) +function Base.getindex(df::DataFrame, ::typeof(!), col_ind::SymbolOrString) selected_column = index(df)[col_ind] return _columns(df)[selected_column] end # df[MultiRowIndex, MultiColumnIndex] => DataFrame @inline function Base.getindex(df::DataFrame, row_inds::AbstractVector{T}, - col_inds::Union{AbstractVector, Regex, Not, Between, All}) where T + col_inds::MultiColumnIndex) where T @boundscheck if !checkindex(Bool, axes(df, 1), row_inds) throw(BoundsError(df, (row_inds, col_inds))) end @@ -397,15 +456,17 @@ end end @inline Base.getindex(df::DataFrame, row_inds::Not, - col_inds::Union{AbstractVector, Regex, Not, Between, All, Colon}) = + 
col_inds::MultiColumnIndex) = df[axes(df, 1)[row_inds], col_inds] # df[:, MultiColumnIndex] => DataFrame -Base.getindex(df::DataFrame, row_ind::Colon, col_inds::Union{AbstractVector, Regex, Not, Between, All, Colon}) = +Base.getindex(df::DataFrame, row_ind::Colon, + col_inds::MultiColumnIndex) = select(df, col_inds, copycols=true) # df[!, MultiColumnIndex] => DataFrame -Base.getindex(df::DataFrame, row_ind::typeof(!), col_inds::Union{AbstractVector, Regex, Not, Between, All, Colon}) = +Base.getindex(df::DataFrame, row_ind::typeof(!), + col_inds::MultiColumnIndex) = select(df, col_inds, copycols=false) ############################################################################## @@ -426,10 +487,7 @@ function nextcolname(df::DataFrame) end # Will automatically add a new column if needed -function insert_single_column!(df::DataFrame, - v::AbstractVector, - col_ind::ColumnIndex) - +function insert_single_column!(df::DataFrame, v::AbstractVector, col_ind::ColumnIndex) if ncol(df) != 0 && nrow(df) != length(v) throw(ArgumentError("New columns must have the same length as old columns")) end @@ -438,8 +496,8 @@ function insert_single_column!(df::DataFrame, j = index(df)[col_ind] _columns(df)[j] = dv else - if col_ind isa Symbol - push!(index(df), col_ind) + if col_ind isa SymbolOrString + push!(index(df), Symbol(col_ind)) push!(_columns(df), dv) else if ncol(df) + 1 == Int(col_ind) @@ -484,7 +542,11 @@ function Base.setindex!(df::DataFrame, v::AbstractVector, ::typeof(!), col_ind:: end # df.col = AbstractVector -Base.setproperty!(df::DataFrame, col_ind::Symbol, v::AbstractVector) = (df[!, col_ind] = v) +# separate methods are needed due to dispatch ambiguity +Base.setproperty!(df::DataFrame, col_ind::Symbol, v::AbstractVector) = + (df[!, col_ind] = v) +Base.setproperty!(df::DataFrame, col_ind::AbstractString, v::AbstractVector) = + (df[!, col_ind] = v) # df[SingleRowIndex, SingleColumnIndex] = Single Item function Base.setindex!(df::DataFrame, v::Any, row_ind::Integer, 
col_ind::ColumnIndex) @@ -496,7 +558,7 @@ end # the method for value of type DataFrameRow, AbstractDict and NamedTuple # is defined in dataframerow.jl -for T in (:AbstractVector, :Regex, :Not, :Between, :All, :Colon) +for T in MULTICOLUMNINDEX_TUPLE @eval function Base.setindex!(df::DataFrame, v::Union{Tuple, AbstractArray}, row_ind::Integer, @@ -537,7 +599,7 @@ end # df[MultiRowIndex, MultiColumnIndex] = AbstractDataFrame for T1 in (:AbstractVector, :Not, :Colon), - T2 in (:AbstractVector, :Regex, :Not, :Between, :All, :Colon) + T2 in MULTICOLUMNINDEX_TUPLE @eval function Base.setindex!(df::DataFrame, new_df::AbstractDataFrame, row_inds::$T1, @@ -547,13 +609,14 @@ for T1 in (:AbstractVector, :Not, :Colon), df[row_inds, col] = new_df[!, j] end if view(_names(df), idxs) != _names(new_df) - Base.depwarn("in the future column names in source and target will have to match", :setindex!) + Base.depwarn("in the future column names in source and target will " * + "have to match", :setindex!) end return df end end -for T in (:AbstractVector, :Regex, :Not, :Between, :All, :Colon) +for T in MULTICOLUMNINDEX_TUPLE @eval function Base.setindex!(df::DataFrame, new_df::AbstractDataFrame, row_inds::typeof(!), @@ -572,14 +635,15 @@ end # df[MultiRowIndex, MultiColumnIndex] = AbstractMatrix for T1 in (:AbstractVector, :Not, :Colon, :(typeof(!))), - T2 in (:AbstractVector, :Regex, :Not, :Between, :All, :Colon) + T2 in MULTICOLUMNINDEX_TUPLE @eval function Base.setindex!(df::DataFrame, mx::AbstractMatrix, row_inds::$T1, col_inds::$T2) idxs = index(df)[col_inds] if size(mx, 2) != length(idxs) - throw(DimensionMismatch("number of selected columns ($(length(idxs))) and number of columns in" * + throw(DimensionMismatch("number of selected columns ($(length(idxs)))" * + " and number of columns in" * " matrix ($(size(mx, 2))) do not match")) end for (j, col) in enumerate(idxs) @@ -596,7 +660,7 @@ end ############################################################################## """ - 
insertcols!(df::DataFrame, [ind::Int], (name=>col)::Pair{Symbol}...; + insertcols!(df::DataFrame, [ind::Int], (name=>col)::Pair...; makeunique::Bool=false, copycols::Bool=true) Insert a column into a data frame in place. Return the updated `DataFrame`. @@ -657,8 +721,8 @@ function insertcols!(df::DataFrame, col_ind::Int, name_cols::Pair{Symbol,<:Any}. if !makeunique if !allunique(first.(name_cols)) - throw(ArgumentError("Names of columns to be inserted into a data frame must be " * - "unique when `makeunique=true`")) + throw(ArgumentError("Names of columns to be inserted into a data frame " * + "must be unique when `makeunique=true`")) end for (n, _) in name_cols if hasproperty(df, n) @@ -737,10 +801,20 @@ function insertcols!(df::DataFrame, col_ind::Int, name_cols::Pair{Symbol,<:Any}. return df end +insertcols!(df::DataFrame, col_ind::Int, name_cols::Pair{<:AbstractString,<:Any}...; + makeunique::Bool=false, copycols::Bool=true) = + insertcols!(df, col_ind, (Symbol(n) => v for (n,v) in name_cols)..., + makeunique=makeunique, copycols=copycols) + insertcols!(df::DataFrame, name_cols::Pair{Symbol,<:Any}...; makeunique::Bool=false, copycols::Bool=true) = insertcols!(df, ncol(df)+1, name_cols..., makeunique=makeunique, copycols=copycols) +insertcols!(df::DataFrame, name_cols::Pair{<:AbstractString,<:Any}...; + makeunique::Bool=false, copycols::Bool=true) = + insertcols!(df, (Symbol(n) => v for (n,v) in name_cols)..., + makeunique=makeunique, copycols=copycols) + """ copy(df::DataFrame; copycols::Bool=true) @@ -753,7 +827,7 @@ function Base.copy(df::DataFrame; copycols::Bool=true) if copycols df[:, :] else - DataFrame(eachcol(df), names(df), copycols=false) + DataFrame(eachcol(df), _names(df), copycols=false) end end @@ -821,11 +895,10 @@ function hcat!(df1::DataFrame, df2::AbstractDataFrame; end # definition required to avoid hcat! 
ambiguity -function hcat!(df1::DataFrame, df2::DataFrame; - makeunique::Bool=false, copycols::Bool=true) +hcat!(df1::DataFrame, df2::DataFrame; + makeunique::Bool=false, copycols::Bool=true) = invoke(hcat!, Tuple{DataFrame, AbstractDataFrame}, df1, df2, makeunique=makeunique, copycols=copycols)::DataFrame -end hcat!(df::DataFrame, x::AbstractVector; makeunique::Bool=false, copycols::Bool=true) = hcat!(df, DataFrame(AbstractVector[x], copycols=copycols), @@ -863,27 +936,27 @@ Base.hcat(df1::DataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...; ## ############################################################################## """ - allowmissing!(df::DataFrame, cols::Colon=:) - allowmissing!(df::DataFrame, cols::Union{Integer, Symbol}) - allowmissing!(df::DataFrame, cols::Union{AbstractVector, Regex, Not, Between, All}) + allowmissing!(df::DataFrame, cols=:) Convert columns `cols` of data frame `df` from element type `T` to `Union{T, Missing}` to support missing values. +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). + If `cols` is omitted all columns in the data frame are converted. """ function allowmissing! 
end function allowmissing!(df::DataFrame, col::ColumnIndex) df[!, col] = allowmissing(df[!, col]) - df + return df end function allowmissing!(df::DataFrame, cols::AbstractVector{<:ColumnIndex}) for col in cols allowmissing!(df, col) end - df + return df end function allowmissing!(df::DataFrame, cols::AbstractVector{Bool}) @@ -891,27 +964,27 @@ function allowmissing!(df::DataFrame, cols::AbstractVector{Bool}) for (col, cond) in enumerate(cols) cond && allowmissing!(df, col) end - df + return df end -allowmissing!(df::DataFrame, cols::Union{Regex, Not, Between, All}) = +allowmissing!(df::DataFrame, cols::MultiColumnIndex) = allowmissing!(df, index(df)[cols]) allowmissing!(df::DataFrame, cols::Colon=:) = allowmissing!(df, axes(df, 2)) """ - disallowmissing!(df::DataFrame, cols::Colon=:; error::Bool=true) - disallowmissing!(df::DataFrame, cols::Union{Integer, Symbol}; error::Bool=true) - disallowmissing!(df::DataFrame, cols::Union{AbstractVector, Regex, Not, Between, All}; - error::Bool=true) + disallowmissing!(df::DataFrame, cols=:; error::Bool=true) Convert columns `cols` of data frame `df` from element type `Union{T, Missing}` to `T` to drop support for missing values. +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). + If `cols` is omitted all columns in the data frame are converted. -If `error=false` then columns containing a `missing` value will be skipped instead of throwing an error. +If `error=false` then columns containing a `missing` value will be skipped instead +of throwing an error. """ function disallowmissing! 
end @@ -920,7 +993,7 @@ function disallowmissing!(df::DataFrame, col::ColumnIndex; error::Bool=true) if !(!error && Missing <: eltype(x) && any(ismissing, x)) df[!, col] = disallowmissing(x) end - df + return df end function disallowmissing!(df::DataFrame, cols::AbstractVector{<:ColumnIndex}; @@ -928,7 +1001,7 @@ function disallowmissing!(df::DataFrame, cols::AbstractVector{<:ColumnIndex}; for col in cols disallowmissing!(df, col, error=error) end - df + return df end function disallowmissing!(df::DataFrame, cols::AbstractVector{Bool}; error::Bool=true) @@ -936,10 +1009,10 @@ function disallowmissing!(df::DataFrame, cols::AbstractVector{Bool}; error::Bool for (col, cond) in enumerate(cols) cond && disallowmissing!(df, col, error=error) end - df + return df end -disallowmissing!(df::DataFrame, cols::Union{Regex, Not, Between, All}; error::Bool=true) = +disallowmissing!(df::DataFrame, cols::MultiColumnIndex; error::Bool=true) = disallowmissing!(df, index(df)[cols], error=error) disallowmissing!(df::DataFrame, cols::Colon=:; error::Bool=true) = @@ -952,24 +1025,19 @@ disallowmissing!(df::DataFrame, cols::Colon=:; error::Bool=true) = ############################################################################## """ - categorical!(df::DataFrame, cols::Type=Union{AbstractString, Missing}; - compress::Bool=false) - categorical!(df::DataFrame, cname::Union{Integer, Symbol}; - compress::Bool=false) - categorical!(df::DataFrame, cnames::Vector{<:Union{Integer, Symbol}}; - compress::Bool=false) - categorical!(df::DataFrame, cnames::Union{Regex, Not, Between, All}; + categorical!(df::DataFrame, cols=Union{AbstractString, Missing}; compress::Bool=false) -Change columns selected by `cname` or `cnames` in data frame `df` -to `CategoricalVector`. +Change columns selected by `cols` in data frame `df` to `CategoricalVector`. + +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR) or a `Type`. 
If `categorical!` is called with the `cols` argument being a `Type`, then all columns whose element type is a subtype of this type (by default `Union{AbstractString, Missing}`) will be converted to categorical. -If the `compress` keyword argument is set to `true` then the created `CategoricalVector`s -will be compressed. +If the `compress` keyword argument is set to `true` then the created +`CategoricalVector`s will be compressed. All created `CategoricalVector`s are unordered. @@ -1022,32 +1090,32 @@ julia> eltype.(eachcol(df)) """ function categorical! end -function categorical!(df::DataFrame, cname::ColumnIndex; +function categorical!(df::DataFrame, cols::ColumnIndex; compress::Bool=false) - df[!, cname] = categorical(df[!, cname], compress=compress) - df + df[!, cols] = categorical(df[!, cols], compress=compress) + return df end -function categorical!(df::DataFrame, cnames::AbstractVector{<:ColumnIndex}; +function categorical!(df::DataFrame, cols::AbstractVector{<:ColumnIndex}; compress::Bool=false) - for cname in cnames + for cname in cols df[!, cname] = categorical(df[!, cname], compress=compress) end - df + return df end -categorical!(df::DataFrame, cnames::Union{Regex, Not, Between, All, Colon}; compress::Bool=false) = - categorical!(df, index(df)[cnames], compress=compress) +categorical!(df::DataFrame, cols::MultiColumnIndex; + compress::Bool=false) = + categorical!(df, index(df)[cols], compress=compress) -function categorical!(df::DataFrame, - cols::Type=Union{AbstractString, Missing}; +function categorical!(df::DataFrame, cols::Type=Union{AbstractString, Missing}; compress::Bool=false) for i in 1:size(df, 2) if eltype(df[!, i]) <: cols df[!, i] = categorical(df[!, i], compress=compress) end end - df + return df end """ @@ -1056,35 +1124,38 @@ end append!(df::DataFrame, table; cols::Symbol=:setequal, promote::Bool=(cols in [:union, :subset])) -Add the rows of `df2` to the end of `df`. 
If the second argument `table` is -not an `AbstractDataFrame` then it is converted using `DataFrame(table, copycols=false)` +Add the rows of `df2` to the end of `df`. If the second argument `table` is not an +`AbstractDataFrame` then it is converted using `DataFrame(table, copycols=false)` before being appended. The exact behavior of `append!` depends on the `cols` argument: * If `cols == :setequal` (this is the default) - then `df2` must contain exactly the same columns as `df` (but possibly in a different order). -* If `cols == :orderequal` then `df2` must contain the same columns in the same order - (for `AbstractDict` this option requires that `keys(row)` matches `names(df)` - to allow for support of ordered dicts; however, if `df2` is a `Dict` an error is thrown - as it is an unordered collection). -* If `cols == :intersect` then `df2` may contain more columns than `df`, - but all column names that are present in `df` must be present in `df2` and only these + then `df2` must contain exactly the same columns as `df` (but possibly in a + different order). +* If `cols == :orderequal` then `df2` must contain the same columns in the same + order (for `AbstractDict` this option requires that `keys(row)` matches + `propertynames(df)` to allow for support of ordered dicts; however, if `df2` + is a `Dict` an error is thrown as it is an unordered collection). +* If `cols == :intersect` then `df2` may contain more columns than `df`, but all + column names that are present in `df` must be present in `df2` and only these are used. -* If `cols == :subset` then `append!` behaves like for `:intersect` but if some column - is missing in `df2` then a `missing` value is pushed to `df`. -* If `cols == :union` then `append!` adds columns missing in `df` that are present in `row`, - for columns present in `df` but missing in `row` a `missing` value is pushed. 
+* If `cols == :subset` then `append!` behaves like for `:intersect` but if some + column is missing in `df2` then a `missing` value is pushed to `df`. +* If `cols == :union` then `append!` adds columns missing in `df` that are present + in `df2`, for columns present in `df` but missing in `df2` a `missing` value + is pushed. -If `promote=true` and element type of a column present in `df` does not allow the type -of a pushed argument then a new column with a promoted element type allowing it is freshly -allocated and stored in `df`. If `promote=false` an error is thrown. +If `promote=true` and element type of a column present in `df` does not allow +the type of a pushed argument then a new column with a promoted element type +allowing it is freshly allocated and stored in `df`. If `promote=false` an error +is thrown. The above rule has the following exceptions: * If `df` has no columns then copies of columns from `df2` are added to it. * If `df2` has no columns then calling `append!` leaves `df` unchanged. -Please note that `append!` must not be used on a `DataFrame` that contains columns -that are aliases (equal when compared with `===`). +Please note that `append!` must not be used on a `DataFrame` that contains +columns that are aliases (equal when compared with `===`).
# See also @@ -1134,8 +1205,8 @@ function Base.append!(df1::DataFrame, df2::AbstractDataFrame; cols::Symbol=:sete @assert !isempty(mismatches) throw(ArgumentError("Columns number " * join(mismatches, ", ", " and ") * - " do not have the same names in both passed data frames" * - "and `cols == :orderequal`")) + " do not have the same names in both passed " * + "data frames and `cols == :orderequal`")) else mismatchmsg = " Column names :" * throw(ArgumentError("Column names :" * @@ -1189,7 +1260,8 @@ function Base.append!(df1::DataFrame, df2::AbstractDataFrame; cols::Symbol=:sete if Missing <: eltype(df1[!, j]) || !promote append!(df1[!, j], df2[!, n]) else - newcol = similar(df1[!, j], Union{Missing, eltype(df1[!, j])}, targetrows) + newcol = similar(df1[!, j], Union{Missing, eltype(df1[!, j])}, + targetrows) copyto!(newcol, 1, df1[!, j], 1, nrows) newcol[nrows+1:targetrows] .= missing _columns(df1)[j] = newcol @@ -1203,7 +1275,8 @@ function Base.append!(df1::DataFrame, df2::AbstractDataFrame; cols::Symbol=:sete end if cols == :union for n in setdiff(_names(df2), _names(df1)) - newcol = similar(df2[!, n], Union{Missing, eltype(df2[!, n])}, targetrows) + newcol = similar(df2[!, n], Union{Missing, eltype(df2[!, n])}, + targetrows) @inbounds newcol[1:nrows] .= missing copyto!(newcol, nrows+1, df2[!, n], 1, targetrows - nrows) df1[!, n] = newcol @@ -1214,7 +1287,7 @@ function Base.append!(df1::DataFrame, df2::AbstractDataFrame; cols::Symbol=:sete for col in _columns(df1) resize!(col, nrows) end - @error "Error adding value to column $(names(df1)[current_col])." + @error "Error adding value to column :$(_names(df1)[current_col])." 
rethrow(err) end return df1 @@ -1224,16 +1297,19 @@ Base.convert(::Type{DataFrame}, A::AbstractMatrix) = DataFrame(A) Base.convert(::Type{DataFrame}, d::AbstractDict) = DataFrame(d, copycols=false) -function Base.push!(df::DataFrame, row::Union{AbstractDict, NamedTuple}; cols::Symbol=:setequal, +function Base.push!(df::DataFrame, row::Union{AbstractDict, NamedTuple}; + cols::Symbol=:setequal, columns::Union{Nothing,Symbol}=nothing, promote::Bool=(cols in [:union, :subset])) if columns !== nothing cols = columns - Base.depwarn("`columns` keyword argument is deprecated. Use `cols` instead. ", :push!) + Base.depwarn("`columns` keyword argument is deprecated. " * + "Use `cols` instead. ", :push!) end possible_cols = (:orderequal, :setequal, :intersect, :subset, :union) if !(cols in possible_cols) - throw(ArgumentError("`cols` keyword argument must be any of :" * join(possible_cols, ", :"))) + throw(ArgumentError("`cols` keyword argument must be any of :" * + join(possible_cols, ", :"))) end nrows, ncols = size(df) @@ -1246,6 +1322,11 @@ function Base.push!(df::DataFrame, row::Union{AbstractDict, NamedTuple}; cols::S return df end + old_row_type = typeof(row) + if row isa AbstractDict && all(x -> x isa AbstractString, keys(row)) + row = (;(Symbol.(keys(row)) .=> values(row))...) + end + # in the code below we use a direct access to _columns because # we resize the columns so temporarily the `DataFrame` is internally # inconsistent and normal data frame indexing would error. 
@@ -1296,12 +1377,13 @@ function Base.push!(df::DataFrame, row::Union{AbstractDict, NamedTuple}; cols::S end if cols == :orderequal - if row isa Dict + if old_row_type <: Dict throw(ArgumentError("passing `Dict` as `row` when `cols == :orderequal` " * "is not allowed as it is unordered")) elseif length(row) != ncol(df) || any(x -> x[1] != x[2], zip(keys(row), _names(df))) - throw(ArgumentError("when `cols == :orderequal` pushed row must have the same column " * - "names and in the same order as the target data frame")) + throw(ArgumentError("when `cols == :orderequal` pushed row must " * + "have the same column names and in the" * + " same order as the target data frame")) end elseif cols == :setequal || cols === :equal if cols == :equal @@ -1312,8 +1394,8 @@ function Base.push!(df::DataFrame, row::Union{AbstractDict, NamedTuple}; cols::S # as an error will be thrown below if some names don't match if length(row) != ncols Base.depwarn("In the future `push!` with `cols` equal to `:setequal`" * - "will require `row` to have the same number of elements as is the " * - "number of columns in `df`.", :push!) + "will require `row` to have the same number of elements " * + "as is the number of columns in `df`.", :push!) end end current_col = 0 @@ -1346,7 +1428,7 @@ function Base.push!(df::DataFrame, row::Union{AbstractDict, NamedTuple}; cols::S for col in _columns(df) resize!(col, nrows) end - @error "Error adding value to column :$(names(df)[current_col])." + @error "Error adding value to column :$(_names(df)[current_col])." rethrow(err) end return df @@ -1371,26 +1453,29 @@ If `row` is a `DataFrameRow`, `NamedTuple` or `AbstractDict` then values in `row` are matched to columns in `df` based on names. The exact behavior depends on the `cols` argument value in the following way: * If `cols == :setequal` (this is the default) - then `row` must contain exactly the same columns as `df` (but possibly in a different order). 
-* If `cols == :orderequal` then `row` must contain the same columns in the same order - (for `AbstractDict` this option requires that `keys(row)` matches `names(df)` - to allow for support of ordered dicts; however, if `row` is a `Dict` an error is thrown - as it is an unordered collection). + then `row` must contain exactly the same columns as `df` (but possibly in a + different order). +* If `cols == :orderequal` then `row` must contain the same columns in the same + order (for `AbstractDict` this option requires that `keys(row)` matches + `propertynames(df)` to allow for support of ordered dicts; however, if `row` + is a `Dict` an error is thrown as it is an unordered collection). * If `cols == :intersect` then `row` may contain more columns than `df`, - but all column names that are present in `df` must be present in `row` and only they - are used to populate a new row in `df`. -* If `cols == :subset` then `push!` behaves like for `:intersect` but if some column - is missing in `row` then a `missing` value is pushed to `df`. -* If `cols == :union` then columns missing in `df` that are present in `row` are added to `df` - (using `missing` for existing rows) and a `missing` value is pushed to columns - missing in `row` that are present in `df`. - -If `promote=true` and element type of a column present in `df` does not allow the type -of a pushed argument then a new column with a promoted element type allowing it is freshly -allocated and stored in `df`. If `promote=false` an error is thrown. - -As a special case, if `df` has no columns and `row` is a `NamedTuple` or `DataFrameRow`, -columns are created for all values in `row`, using their names and order. + but all column names that are present in `df` must be present in `row` and only + they are used to populate a new row in `df`. +* If `cols == :subset` then `push!` behaves like for `:intersect` but if some + column is missing in `row` then a `missing` value is pushed to `df`. 
+* If `cols == :union` then columns missing in `df` that are present in `row` are + added to `df` (using `missing` for existing rows) and a `missing` value is + pushed to columns missing in `row` that are present in `df`. + +If `promote=true` and element type of a column present in `df` does not allow +the type of a pushed argument then a new column with a promoted element type +allowing it is freshly allocated and stored in `df`. If `promote=false` an error +is thrown. + +As a special case, if `df` has no columns and `row` is a `NamedTuple` or +`DataFrameRow`, columns are created for all values in `row`, using their names +and order. Please note that `push!` must not be used on a `DataFrame` that contains columns that are aliases (equal when compared with `===`). @@ -1462,10 +1547,10 @@ julia> push!(df, NamedTuple(), cols=:subset) """ function Base.push!(df::DataFrame, row::Any; promote::Bool=false) if !(row isa Union{Tuple, AbstractArray}) - Base.depwarn("In the future `push!` will not allow passing collections of type" * - " $(typeof(row)) to be pushed into a DataFrame. " * - "Only `Tuple`, `AbstractArray`, `AbstractDict`, `DataFrameRow` and " * - "`NamedTuple` will be allowed.", :push!) + Base.depwarn("In the future `push!` will not allow passing collections " * + "of type $(typeof(row)) to be pushed into a DataFrame. " * + "Only `Tuple`, `AbstractArray`, `AbstractDict`, `DataFrameRow`" * + " and `NamedTuple` will be allowed.", :push!) end nrows, ncols = size(df) targetrows = nrows + 1 @@ -1499,7 +1584,7 @@ function Base.push!(df::DataFrame, row::Any; promote::Bool=false) for col in _columns(df) resize!(col, nrows) end - @error "Error adding value to column :$(names(df)[current_col])." + @error "Error adding value to column :$(_names(df)[current_col])." 
rethrow(err) end df diff --git a/src/dataframe/sort.jl b/src/dataframe/sort.jl index bc39be6307..a1107d7130 100644 --- a/src/dataframe/sort.jl +++ b/src/dataframe/sort.jl @@ -5,8 +5,8 @@ rev::Bool=false, order::Ordering=Forward) Sort data frame `df` by column(s) `cols`. -`cols` can be either a `Symbol` or `Integer` column index, or -a vector of such indices, `:`, `All`, `Not`, `Between`, or `Regex`. + +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). If `alg` is `nothing` (the default), the most appropriate algorithm is chosen automatically among `TimSort`, `MergeSort` and `RadixSort` depending @@ -72,10 +72,12 @@ julia> sort!(df, (:x, order(:y, rev=true))) function Base.sort!(df::DataFrame, cols=[]; alg=nothing, lt=isless, by=identity, rev=false, order=Forward) if !(isa(by, Function) || eltype(by) <: Function) - msg = "'by' must be a Function or a vector of Functions. Perhaps you wanted 'cols'." + msg = "'by' must be a Function or a vector of Functions. " * + "Perhaps you wanted 'cols'." throw(ArgumentError(msg)) end - if cols isa Union{Colon, All, Not, Between, Regex} + # exclude AbstractVector as in that case cols can contain order(...) 
clauses + if cols isa MultiColumnIndex && !(cols isa AbstractVector) cols = index(df)[cols] end ord = ordering(df, cols, lt, by, rev, order) diff --git a/src/dataframerow/dataframerow.jl b/src/dataframerow/dataframerow.jl index a8f5354e10..254776e3e5 100644 --- a/src/dataframerow/dataframerow.jl +++ b/src/dataframerow/dataframerow.jl @@ -85,17 +85,17 @@ Base.summary(dfr::DataFrameRow) = # -> String Base.summary(io::IO, dfr::DataFrameRow) = print(io, summary(dfr)) Base.@propagate_inbounds Base.view(adf::AbstractDataFrame, rowind::Integer, - colinds::Union{Colon, AbstractVector, Regex, Not, Between, All}) = + colinds::MultiColumnIndex) = DataFrameRow(adf, rowind, colinds) Base.@propagate_inbounds Base.getindex(df::AbstractDataFrame, rowind::Integer, - colinds::Union{AbstractVector, Regex, Not, Between, All}) = + colinds::MultiColumnIndex) = DataFrameRow(df, rowind, colinds) Base.@propagate_inbounds Base.getindex(df::AbstractDataFrame, rowind::Integer, ::Colon) = DataFrameRow(df, rowind, :) Base.@propagate_inbounds Base.getindex(r::DataFrameRow, idx::ColumnIndex) = parent(r)[row(r), parentcols(index(r), idx)] -Base.@propagate_inbounds Base.getindex(r::DataFrameRow, idxs::Union{AbstractVector, Regex, Not, Between, All}) = +Base.@propagate_inbounds Base.getindex(r::DataFrameRow, idxs::MultiColumnIndex) = DataFrameRow(parent(r), row(r), parentcols(index(r), idxs)) Base.@propagate_inbounds Base.getindex(r::DataFrameRow, ::Colon) = r @@ -111,6 +111,9 @@ for T in (:AbstractVector, :Regex, :Not, :Between, :All, :Colon) end if v isa AbstractDict + if all(x -> x isa AbstractString, keys(v)) + v = (;(Symbol.(keys(v)) .=> values(v))...) 
+ end for n in view(_names(df), idxs) if !haskey(v, n) throw(ArgumentError("Column :$n not found in source dictionary")) @@ -118,8 +121,8 @@ for T in (:AbstractVector, :Regex, :Not, :Between, :All, :Colon) end elseif !all(((a, b),) -> a == b, zip(view(_names(df), idxs), keys(v))) mismatched = findall(view(_names(df), idxs) .!= collect(keys(v))) - throw(ArgumentError("Selected column names do not match the names in assigned value in" * - " positions $(join(mismatched, ", ", " and "))")) + throw(ArgumentError("Selected column names do not match the names in assigned " * + "value in positions $(join(mismatched, ", ", " and "))")) end for (col, val) in pairs(v) @@ -137,8 +140,10 @@ index(r::DataFrameRow) = getfield(r, :colindex) Base.names(r::DataFrameRow) = names(index(r)) function Base.names(r::DataFrameRow, cols) - sel = index(r)[cols] - return _names(index(r))[sel isa Int ? (sel:sel) : sel] + nms = _names(index(r)) + idx = index(r)[cols] + idxs = idx isa Int ? (idx:idx) : idx + return [string(nms[i]) for i in idxs] end _names(r::DataFrameRow) = view(_names(parent(r)), parentcols(index(r), :)) @@ -146,6 +151,7 @@ _names(r::DataFrameRow) = view(_names(parent(r)), parentcols(index(r), :)) Base.haskey(r::DataFrameRow, key::Bool) = throw(ArgumentError("invalid key: $key of type Bool")) Base.haskey(r::DataFrameRow, key::Integer) = 1 ≤ key ≤ size(r, 1) + function Base.haskey(r::DataFrameRow, key::Symbol) hasproperty(parent(r), key) || return false index(r) isa Index && return true @@ -154,17 +160,25 @@ function Base.haskey(r::DataFrameRow, key::Symbol) remap = index(r).remap length(remap) == 0 && lazyremap!(index(r)) checkbounds(Bool, remap, pos) || return false - remap[pos] > 0 + return remap[pos] > 0 end -Base.getproperty(r::DataFrameRow, idx::Symbol) = getindex(r, idx) -Base.setproperty!(r::DataFrameRow, idx::Symbol, x::Any) = setindex!(r, x, idx) +Base.haskey(r::DataFrameRow, key::AbstractString) = haskey(r, Symbol(key)) + +# separate methods are needed due to 
dispatch ambiguity +Base.getproperty(r::DataFrameRow, idx::Symbol) = r[idx] +Base.getproperty(r::DataFrameRow, idx::AbstractString) = r[idx] +Base.setproperty!(r::DataFrameRow, idx::Symbol, x::Any) = (r[idx] = x) +Base.setproperty!(r::DataFrameRow, idx::AbstractString, x::Any) = (r[idx] = x) +Compat.hasproperty(r::DataFrameRow, s::Symbol) = haskey(index(r), s) +Compat.hasproperty(r::DataFrameRow, s::AbstractString) = haskey(index(r), s) + # Private fields are never exposed since they can conflict with column names -Base.propertynames(r::DataFrameRow, private::Bool=false) = Tuple(_names(r)) +Base.propertynames(r::DataFrameRow, private::Bool=false) = copy(_names(r)) Base.view(r::DataFrameRow, col::ColumnIndex) = view(parent(r)[!, parentcols(index(r), col)], row(r)) -Base.view(r::DataFrameRow, cols::Union{AbstractVector, Regex, Not, Between, All}) = +Base.view(r::DataFrameRow, cols::MultiColumnIndex) = DataFrameRow(parent(r), row(r), parentcols(index(r), cols)) Base.view(r::DataFrameRow, ::Colon) = r @@ -172,8 +186,8 @@ Base.view(r::DataFrameRow, ::Colon) = r size(dfr::DataFrameRow, [dim]) Return a 1-tuple containing the number of elements of `dfr`. -If an optional dimension `dim` is specified, it must be `1`, and the number of elements -is returned directly as a number. +If an optional dimension `dim` is specified, it must be `1`, and the number of +elements is returned directly as a number. See also: [`length`](@ref) @@ -245,7 +259,7 @@ Base.convert(::Type{Array{T}}, dfr::DataFrameRow) where {T} = Vector{T}(dfr) Base.Array(dfr::DataFrameRow) = Vector(dfr) Base.Array{T}(dfr::DataFrameRow) where {T} = Vector{T}(dfr) -Base.keys(r::DataFrameRow) = Tuple(_names(r)) +Base.keys(r::DataFrameRow) = propertynames(r) Base.values(r::DataFrameRow) = ntuple(col -> parent(r)[row(r), parentcols(index(r), col)], length(r)) Base.map(f, r::DataFrameRow, rs::DataFrameRow...) = map(f, copy(r), copy.(rs)...) 
@@ -334,12 +348,14 @@ function Base.push!(df::DataFrame, dfr::DataFrameRow; cols::Symbol=:setequal, promote::Bool=(cols in [:union, :subset])) if columns !== nothing cols = columns - Base.depwarn("`columns` keyword argument is deprecated. Use `cols` instead.", :push!) + Base.depwarn("`columns` keyword argument is deprecated. " * + "Use `cols` instead.", :push!) end possible_cols = (:orderequal, :setequal, :intersect, :subset, :union) if !(cols in possible_cols) - throw(ArgumentError("`cols` keyword argument must be any of :" * join(possible_cols, ", :"))) + throw(ArgumentError("`cols` keyword argument must be any of :" * + join(possible_cols, ", :"))) end nrows, ncols = size(df) @@ -398,7 +414,7 @@ function Base.push!(df::DataFrame, dfr::DataFrameRow; cols::Symbol=:setequal, throw(AssertionError("Error adding value to column :$colname")) end end - for colname in setdiff(keys(dfr), _names(df)) + for colname in setdiff(_names(dfr), _names(df)) val = dfr[colname] S = typeof(val) if nrows == 0 @@ -417,8 +433,8 @@ function Base.push!(df::DataFrame, dfr::DataFrameRow; cols::Symbol=:setequal, try if cols === :orderequal if _names(df) != _names(dfr) - msg = "when `cols == :orderequal` pushed row must have the same column " * - "names and in the same order as the target data frame" + msg = "when `cols == :orderequal` pushed row must have the same " * + "column names and in the same order as the target data frame" throw(ArgumentError(msg)) end elseif cols === :setequal || cols === :equal @@ -457,7 +473,7 @@ function Base.push!(df::DataFrame, dfr::DataFrameRow; cols::Symbol=:setequal, resize!(col, nrows) end if current_col > 0 - @error "Error adding value to column :$(names(df)[current_col])." + @error "Error adding value to column :$(_names(df)[current_col])." 
end rethrow(err) end diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index 1059ac6db7..6de06309d2 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -109,21 +109,22 @@ Return a vector of group indices for each row of `parent(gd)`. Rows appearing in group `gd[i]` are attributed index `i`. Rows not present in any group are attributed `missing` (this can happen if `skipmissing=true` was -passed when creating `gd`, or if `gd` is a subset from a larger [`GroupedDataFrame`](@ref)). +passed when creating `gd`, or if `gd` is a subset from +a larger [`GroupedDataFrame`](@ref)). """ groupindices(gd::GroupedDataFrame) = replace(gd.groups, 0=>missing) """ groupcols(gd::GroupedDataFrame) -Return a vector of column names in `parent(gd)` used for grouping. +Return a vector of `Symbol` column names in `parent(gd)` used for grouping. """ groupcols(gd::GroupedDataFrame) = _names(gd)[gd.cols] """ valuecols(gd::GroupedDataFrame) -Return a vector of column names in `parent(gd)` not used for grouping. +Return a vector of `Symbol` column names in `parent(gd)` not used for grouping. 
""" valuecols(gd::GroupedDataFrame) = _names(gd)[Not(gd.cols)] @@ -136,7 +137,8 @@ function _groupvar_idx(gd::GroupedDataFrame, name::Symbol, strict::Bool) end # Get values of grouping columns for single group -_groupvalues(gd::GroupedDataFrame, i::Integer) = gd.parent[gd.idx[gd.starts[i]], gd.cols] +_groupvalues(gd::GroupedDataFrame, i::Integer) = + gd.parent[gd.idx[gd.starts[i]], gd.cols] # Get values of single grouping column for single group _groupvalues(gd::GroupedDataFrame, i::Integer, col::Integer) = @@ -227,17 +229,18 @@ end Base.parent(key::GroupKey) = getfield(key, :parent) Base.length(key::GroupKey) = length(parent(key).cols) -Base.keys(key::GroupKey) = Tuple(groupcols(parent(key))) +Base.names(key::GroupKey) = string.(groupcols(parent(key))) +# Private fields are never exposed since they can conflict with column names +Base.propertynames(key::GroupKey, private::Bool=false) = groupcols(parent(key)) +Base.keys(key::GroupKey) = propertynames(key) Base.haskey(key::GroupKey, idx::Symbol) = idx in groupcols(parent(key)) +Base.haskey(key::GroupKey, idx::AbstractString) = haskey(key, Symbol(idx)) Base.haskey(key::GroupKey, idx::Union{Signed,Unsigned}) = 1 <= idx <= length(key) -Base.names(key::GroupKey) = groupcols(parent(key)) -# Private fields are never exposed since they can conflict with column names -Base.propertynames(key::GroupKey, private::Bool=false) = keys(key) Base.values(key::GroupKey) = Tuple(_groupvalues(parent(key), getfield(key, :idx))) - -Base.iterate(key::GroupKey, i::Integer=1) = i <= length(key) ? (key[i], i + 1) : nothing - -Base.getindex(key::GroupKey, i::Integer) = _groupvalues(parent(key), getfield(key, :idx), i) +Base.iterate(key::GroupKey, i::Integer=1) = + i <= length(key) ? 
(key[i], i + 1) : nothing +Base.getindex(key::GroupKey, i::Integer) = + _groupvalues(parent(key), getfield(key, :idx), i) function Base.getindex(key::GroupKey, n::Symbol) try @@ -247,6 +250,8 @@ function Base.getindex(key::GroupKey, n::Symbol) end end +Base.getindex(key::GroupKey, n::AbstractString) = key[Symbol(n)] + function Base.getproperty(key::GroupKey, p::Symbol) try return key[p] @@ -255,6 +260,8 @@ function Base.getproperty(key::GroupKey, p::Symbol) end end +Base.getproperty(key::GroupKey, p::AbstractString) = getproperty(key, Symbol(p)) + function Base.NamedTuple(key::GroupKey) N = NamedTuple{Tuple(groupcols(parent(key)))} N(_groupvalues(parent(key), getfield(key, :idx))) @@ -292,7 +299,8 @@ end # The full version (to_indices) is required rather than to_index even though # GroupedDataFrame behaves as a 1D array due to the behavior of Colon and Not. # Note that this behavior would be the default if it was <:AbstractArray -Base.getindex(gd::GroupedDataFrame, idx...) = getindex(gd, Base.to_indices(gd, idx)...) +Base.getindex(gd::GroupedDataFrame, idx...) = + getindex(gd, Base.to_indices(gd, idx)...) 
# The allowed key types for dictionary-like indexing const GroupKeyTypes = Union{GroupKey, Tuple, NamedTuple} @@ -479,7 +487,8 @@ function Base.haskey(gd::GroupedDataFrame, key::GroupKey) throw(BoundsError(gd, getfield(key, :idx))) end else - throw(ArgumentError("The parent of key does not match the passed GroupedDataFrame")) + msg = "The parent of key does not match the passed GroupedDataFrame" + throw(ArgumentError(msg)) end end @@ -499,7 +508,8 @@ function Base.haskey(gd::GroupedDataFrame, key::NamedTuple{N}) where {N} return haskey(gd, Tuple(key)) end -Base.haskey(gd::GroupedDataFrame, key::Union{Signed,Unsigned}) = 1 <= key <= length(gd) +Base.haskey(gd::GroupedDataFrame, key::Union{Signed,Unsigned}) = + 1 <= key <= length(gd) """ get(gd::GroupedDataFrame, key, default) diff --git a/src/groupeddataframe/show.jl b/src/groupeddataframe/show.jl index 586235dcd4..9012d13829 100644 --- a/src/groupeddataframe/show.jl +++ b/src/groupeddataframe/show.jl @@ -14,7 +14,7 @@ function Base.show(io::IO, gd::GroupedDataFrame; rowlabel::Symbol = :Row, summary::Bool = true) N = length(gd) - parent_names = names(gd.parent) + parent_names = _names(gd.parent) summary && Base.summary(io, gd) @@ -70,6 +70,6 @@ function Base.show(df::GroupedDataFrame; rowlabel::Symbol = :Row, summary::Bool = true) # -> Nothing return show(stdout, df, - allrows=allrows, allcols=allcols, allgroups=allgroups, splitcols=splitcols, - rowlabel=rowlabel, summary=summary) + allrows=allrows, allcols=allcols, allgroups=allgroups, + splitcols=splitcols, rowlabel=rowlabel, summary=summary) end diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 778b0afc9a..088163cc81 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -5,13 +5,17 @@ """ groupby(d::AbstractDataFrame, cols; sort=false, skipmissing=false) -Return a `GroupedDataFrame` representing a view of an `AbstractDataFrame` split into row groups. 
+Return a `GroupedDataFrame` representing a view of an `AbstractDataFrame` split +into row groups. # Arguments - `df` : an `AbstractDataFrame` to split -- `cols` : data frame columns to group by -- `sort` : whether to sort rows according to the values of the grouping columns `cols` -- `skipmissing` : whether to skip rows with `missing` values in one of the grouping columns `cols` +- `cols` : data frame columns to group by. Can be any column selector + ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). +- `sort` : whether to sort rows according to the values of the grouping columns + `cols` +- `skipmissing` : whether to skip rows with `missing` values in one of the + grouping columns `cols` # Details An iterator over a `GroupedDataFrame` returns a `SubDataFrame` view @@ -30,8 +34,10 @@ and combines the result into a data frame). See the following for additional split-apply-combine operations: * [`by`](@ref) : split-apply-combine using functions -* [`map`](@ref) : apply a function to each group of a `GroupedDataFrame` (without combining) -* [`combine`](@ref) : combine a `GroupedDataFrame`, optionally applying a function to each group +* [`map`](@ref) : apply a function to each group of a `GroupedDataFrame` + (without combining) +* [`combine`](@ref) : combine a `GroupedDataFrame`, optionally applying + a function to each group `GroupedDataFrame` also supports the dictionary interface. The keys are [`GroupKey`](@ref) objects returned by [`keys(::GroupedDataFrame)`](@ref), @@ -215,8 +221,8 @@ view for each group and can return any return value defined below. Note that this form is slower than `pair` due to type instability. If `pair` is passed then it must follow the rules specified for transformations in -[`select`](@ref) and have the form `source_cols => fun`, `source_cols => fun => target_col`, -or `source_col => target_col`. +[`select`](@ref) and have the form `source_cols => fun`, +`source_cols => fun => target_col`, or `source_col => target_col`. 
Function defined by `fun` is passed `SubArray` views as positional arguments for each column specified to be selected and can return any return value defined below, or a `NamedTuple` containing these `SubArray`s if `source_cols` is an `AsTable` selector. @@ -330,7 +336,7 @@ function Base.map(f::Union{Base.Callable, Pair}, gd::GroupedDataFrame) end end newparent = hcat!(parent(gd)[idx, gd.cols], - without(valscat, intersect(keys, _names(valscat)))) + select(valscat, Not(intersect(keys, _names(valscat))), copycols=false)) if length(idx) == 0 return GroupedDataFrame(newparent, collect(1:length(gd.cols)), idx, Int[], Int[], Int[], 0, Dict{Any,Int}()) @@ -363,9 +369,7 @@ const F_ARGUMENT_RULES = Arguments passed as `args...` can be: - * Any index that is allowed for column indexing. In particular, symbols, integers, - vectors of symbols, vectors of integers, vectors of bools, regular expressions, - `All`, `Between`, and `Not` selectors are supported. + * Any index that is allowed for column indexing ($COLUMNINDEX_STR, $MULTICOLUMNINDEX_STR). * Column transformation operations using the `Pair` notation that is described below and vectors of such pairs. @@ -598,9 +602,8 @@ function combine(gd::GroupedDataFrame, p::Pair; keepkeys::Bool=true) end function combine(gd::GroupedDataFrame, - @nospecialize(cs::Union{Pair, AbstractVector{<:Pair}, typeof(nrow), - AbstractVector{<:Integer}, AbstractVector{Symbol}, - ColumnIndex, Colon, Regex, Not, All, Between}...); + @nospecialize(cs::Union{Pair, typeof(nrow), + ColumnIndex, MultiColumnIndex}...); keepkeys::Bool=true) @assert !isempty(cs) cs_vec = [] @@ -703,7 +706,7 @@ function combine_helper(f, gd::GroupedDataFrame, end end return hcat!(parent(gd)[idx, gd.cols], - without(valscat, intersect(keys, _names(valscat)))) + select(valscat, Not(intersect(keys, _names(valscat))), copycols=false)) else return keepkeys ? 
parent(gd)[1:0, gd.cols] : DataFrame() end @@ -1068,7 +1071,8 @@ function (agg::Aggregate{typeof(length)})(incol::AbstractVector, gd::GroupedData end end -isagg(p::Pair) = check_aggregate(last(p)) isa AbstractAggregate && first(p) isa ColumnIndex +isagg(p::Pair) = + check_aggregate(last(p)) isa AbstractAggregate && first(p) isa ColumnIndex const MULTI_COLS_TYPE = Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix} @@ -1123,8 +1127,8 @@ function _combine(f::AbstractVector{<:Pair}, firstres = do_call(fun, gd.idx, gd.starts, gd.ends, gd, incols, 1) firstmulticol = firstres isa MULTI_COLS_TYPE if firstmulticol - throw(ArgumentError("a single value or vector result is required when passing " * - "multiple functions (got $(typeof(res)))")) + throw(ArgumentError("a single value or vector result is required when " * + "passing multiple functions (got $(typeof(res)))")) end # if idx_agg was not computed yet it is nothing # in this case if we are not passed a vector compute it. @@ -1622,10 +1626,8 @@ by(d::AbstractDataFrame, cols::Any, f::Pair; combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), f, keepkeys=keepkeys) -by(d::AbstractDataFrame, cols::Any, f::Union{Pair, AbstractVector{<:Pair}, - typeof(nrow), AbstractVector{<:Integer}, - AbstractVector{Symbol}, ColumnIndex, - Colon, Regex, Not, All, Between}...; +by(d::AbstractDataFrame, cols::Any, f::Union{Pair, typeof(nrow), + ColumnIndex, MultiColumnIndex}...; sort::Bool=false, skipmissing::Bool=false, keepkeys::Bool=true) = combine(groupby(d, cols, sort=sort, skipmissing=skipmissing), f..., keepkeys=keepkeys) diff --git a/src/other/broadcasting.jl b/src/other/broadcasting.jl index 010d9dead5..5b38715f8d 100644 --- a/src/other/broadcasting.jl +++ b/src/other/broadcasting.jl @@ -12,8 +12,10 @@ struct DataFrameStyle <: Base.Broadcast.BroadcastStyle end Base.Broadcast.BroadcastStyle(::Type{<:AbstractDataFrame}) = DataFrameStyle() -Base.Broadcast.BroadcastStyle(::DataFrameStyle, 
::Base.Broadcast.BroadcastStyle) = DataFrameStyle() -Base.Broadcast.BroadcastStyle(::Base.Broadcast.BroadcastStyle, ::DataFrameStyle) = DataFrameStyle() +Base.Broadcast.BroadcastStyle(::DataFrameStyle, ::Base.Broadcast.BroadcastStyle) = + DataFrameStyle() +Base.Broadcast.BroadcastStyle(::Base.Broadcast.BroadcastStyle, ::DataFrameStyle) = + DataFrameStyle() Base.Broadcast.BroadcastStyle(::DataFrameStyle, ::DataFrameStyle) = DataFrameStyle() function copyto_widen!(res::AbstractVector{T}, bc::Base.Broadcast.Broadcasted, @@ -38,7 +40,7 @@ function getcolbc(bcf::Base.Broadcast.Broadcasted{Style}, colind) where {Style} newargs = map(bcf.args) do x Base.Broadcast.extrude(x isa AbstractDataFrame ? x[!, colind] : x) end - Base.Broadcast.Broadcasted{Style}(bcf.f, newargs, bcf.axes) + return Base.Broadcast.Broadcasted{Style}(bcf.f, newargs, bcf.axes) end function Base.copy(bc::Base.Broadcast.Broadcasted{DataFrameStyle}) @@ -98,20 +100,20 @@ Base.maybeview(df::AbstractDataFrame, rows, cols) = view(df, rows, cols) function Base.dotview(df::DataFrame, ::Colon, cols::ColumnIndex) haskey(index(df), cols) && return view(df, :, cols) - if !(cols isa Symbol) + if !(cols isa SymbolOrString) throw(ArgumentError("creating new columns using an integer index is disallowed")) end - LazyNewColDataFrame(df, cols) + return LazyNewColDataFrame(df, Symbol(cols)) end function Base.dotview(df::DataFrame, ::typeof(!), cols) if !(cols isa ColumnIndex) return ColReplaceDataFrame(df, index(df)[cols]) end - if !(cols isa Symbol) && cols > ncol(df) + if !(cols isa SymbolOrString) && cols > ncol(df) throw(ArgumentError("creating new columns using an integer index is disallowed")) end - LazyNewColDataFrame(df, cols) + return LazyNewColDataFrame(df, cols isa AbstractString ? 
Symbol(cols) : cols) end Base.dotview(df::SubDataFrame, ::typeof(!), idxs) = @@ -132,8 +134,8 @@ end function _copyto_helper!(dfcol::AbstractVector, bc::Base.Broadcast.Broadcasted, col::Int) if axes(dfcol, 1) != axes(bc)[1] # this should never happen unless data frame is corrupted (has unequal column lengths) - throw(DimensionMismatch("Dimension mismatch in broadcasting. " * - "The updated data frame is invalid and should not be used")) + throw(DimensionMismatch("Dimension mismatch in broadcasting. The updated" * + " data frame is invalid and should not be used")) end @inbounds for row in eachindex(dfcol) dfcol[row] = bc[CartesianIndex(row, col)] @@ -144,7 +146,7 @@ function Base.Broadcast.broadcast_unalias(dest::AbstractDataFrame, src) for col in eachcol(dest) src = Base.Broadcast.unalias(col, src) end - src + return src end function Base.Broadcast.broadcast_unalias(dest, src::AbstractDataFrame) @@ -167,7 +169,7 @@ function Base.Broadcast.broadcast_unalias(dest, src::AbstractDataFrame) wascopied = true end end - src + return src end function _broadcast_unalias_helper(dest::AbstractDataFrame, scol::AbstractVector, @@ -206,7 +208,7 @@ function Base.Broadcast.broadcast_unalias(dest::AbstractDataFrame, src::Abstract scol = src[!, col2] src, wascopied = _broadcast_unalias_helper(dest, scol, src, col2, wascopied) end - src + return src end function Base.copyto!(df::AbstractDataFrame, bc::Base.Broadcast.Broadcasted) @@ -229,18 +231,19 @@ function Base.copyto!(df::AbstractDataFrame, bc::Base.Broadcast.Broadcasted) for i in axes(df, 2) _copyto_helper!(df[!, i], getcolbc(bcf′, i), i) end - df + return df end -function Base.copyto!(df::AbstractDataFrame, bc::Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}}) +function Base.copyto!(df::AbstractDataFrame, + bc::Base.Broadcast.Broadcasted{<:Base.Broadcast.AbstractArrayStyle{0}}) # special case of fast approach when bc is providing an untransformed scalar if bc.f === identity && bc.args isa Tuple{Any} && 
Base.Broadcast.isflat(bc) for col in axes(df, 2) fill!(df[!, col], bc.args[1][]) end - df + return df else - copyto!(df, convert(Base.Broadcast.Broadcasted{Nothing}, bc)) + return copyto!(df, convert(Base.Broadcast.Broadcasted{Nothing}, bc)) end end @@ -250,7 +253,8 @@ create_bc_tmp(bcf′_col::Base.Broadcast.Broadcasted{T}) where {T} = function Base.copyto!(crdf::ColReplaceDataFrame, bc::Base.Broadcast.Broadcasted) bcf = Base.Broadcast.flatten(bc) colnames = unique!([_names(x) for x in bcf.args if x isa AbstractDataFrame]) - if length(colnames) > 1 || (length(colnames) == 1 && view(_names(crdf.df), crdf.cols) != colnames[1]) + if length(colnames) > 1 || + (length(colnames) == 1 && view(_names(crdf.df), crdf.cols) != colnames[1]) push!(colnames, view(_names(crdf.df), crdf.cols)) wrongnames = setdiff(union(colnames...), intersect(colnames...)) if isempty(wrongnames) @@ -284,7 +288,7 @@ function Base.copyto!(crdf::ColReplaceDataFrame, bc::Base.Broadcast.Broadcasted) end crdf.df[!, col_idx] = newcol end - crdf.df + return crdf.df end Base.Broadcast.broadcast_unalias(dest::DataFrameRow, src) = @@ -295,5 +299,5 @@ function Base.copyto!(dfr::DataFrameRow, bc::Base.Broadcast.Broadcasted) for I in eachindex(bc′) dfr[I] = bc′[I] end - dfr + return dfr end diff --git a/src/other/index.jl b/src/other/index.jl index bd96d84fef..ff19fdf3d3 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -5,11 +5,18 @@ abstract type AbstractIndex end function Base.summary(idx::AbstractIndex) l = length(idx) - "data frame with $l column$(l == 1 ? "" : "s")" + return "data frame with $l column$(l == 1 ? 
"" : "s")" end Base.summary(io::IO, idx::AbstractIndex) = print(io, summary(idx)) -const ColumnIndex = Union{Signed, Unsigned, Symbol} +const SymbolOrString = Union{Symbol, AbstractString} +const ColumnIndex = Union{Signed, Unsigned, SymbolOrString} +const MultiColumnIndex = Union{AbstractVector, Regex, Not, Between, All, Colon} +const MULTICOLUMNINDEX_TUPLE = (:AbstractVector, :Regex, :Not, :Between, :All, :Colon) + +const COLUMNINDEX_STR = "`Symbol`, string or integer" +const MULTICOLUMNINDEX_STR = "`:`, `All`, `Between`, `Not`, a regular expression," * + " or a vector of `Symbol`s, strings or integers" struct Index <: AbstractIndex # an OrderedDict would be nice here... lookup::Dict{Symbol, Int} # name => names array position @@ -19,12 +26,16 @@ end function Index(names::AbstractVector{Symbol}; makeunique::Bool=false) u = make_unique(names, makeunique=makeunique) lookup = Dict{Symbol, Int}(zip(u, 1:length(u))) - Index(lookup, u) + return Index(lookup, u) end + Index() = Index(Dict{Symbol, Int}(), Symbol[]) Base.length(x::Index) = length(x.names) -Base.names(x::Index) = copy(x.names) +Base.names(x::Index) = string.(x.names) + +# _names returns Vector{Symbol} _names(x::Index) = x.names + Base.copy(x::Index) = Index(copy(x.lookup), copy(x.names)) Base.isequal(x::AbstractIndex, y::AbstractIndex) = _names(x) == _names(y) # it is enough to check names Base.:(==)(x::AbstractIndex, y::AbstractIndex) = isequal(x, y) @@ -92,13 +103,16 @@ function rename!(x::Index, nms::AbstractVector{Pair{Symbol, Symbol}}) return x end -rename!(f::Function, x::Index) = rename!(x, [(n=>Symbol(f(n))) for n in x.names]) +rename!(f::Function, x::Index) = rename!(x, [(n=>Symbol(f(string(n)))) for n in x.names]) +# we do not define keys on purpose; +# use names to get keys as strings with copying +# or _names to get keys as Symbols without copying Base.haskey(x::Index, key::Symbol) = haskey(x.lookup, key) +Base.haskey(x::Index, key::AbstractString) = haskey(x.lookup, Symbol(key)) 
Base.haskey(x::Index, key::Integer) = 1 <= key <= length(x.names) Base.haskey(x::Index, key::Bool) = throw(ArgumentError("invalid key: $key of type Bool")) -Base.keys(x::Index) = names(x) # TODO: If this should stay 'unsafe', perhaps make unexported function Base.push!(x::Index, nm::Symbol) @@ -139,10 +153,12 @@ function Base.delete!(x::Index, nm::Symbol) return delete!(x, idx) end +Base.delete!(x::Index, nm::AbstractString) = delete!(x, Symbol(nm)) + function Base.empty!(x::Index) empty!(x.lookup) empty!(x.names) - x + return x end function Base.insert!(x::Index, idx::Integer, nm::Symbol) @@ -154,16 +170,19 @@ function Base.insert!(x::Index, idx::Integer, nm::Symbol) end x.lookup[nm] = idx insert!(x.names, idx, nm) - x + return x end -@inline Base.getindex(x::AbstractIndex, idx::Bool) = throw(ArgumentError("invalid index: $idx of type Bool")) +Base.insert!(x::Index, idx::Integer, nm::AbstractString) = insert!(x, idx, Symbol(nm)) + +@inline Base.getindex(x::AbstractIndex, idx::Bool) = + throw(ArgumentError("invalid index: $idx of type Bool")) @inline function Base.getindex(x::AbstractIndex, idx::Integer) if !(1 <= idx <= length(x)) throw(BoundsError(x, idx)) end - Int(idx) + return Int(idx) end @inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{Int}) @@ -176,7 +195,7 @@ end throw(BoundsError(x, idx)) end allunique(idx) || throw(ArgumentError("Elements of $idx must be unique")) - idx + return idx end @inline function Base.getindex(x::AbstractIndex, idx::AbstractRange{Int}) @@ -189,7 +208,7 @@ end throw(BoundsError(x, idx)) end allunique(idx) || throw(ArgumentError("Elements of $idx must be unique")) - idx + return idx end @inline Base.getindex(x::AbstractIndex, idx::AbstractRange{<:Integer}) = @@ -203,16 +222,17 @@ end @inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{<:Integer}) if any(v -> v isa Bool, idx) - throw(ArgumentError("Bool values except for AbstractVector{Bool} are not allowed for column indexing")) + 
throw(ArgumentError("Bool values except for AbstractVector{Bool} are not" * + " allowed for column indexing")) end - getindex(x, Vector{Int}(idx)) + return getindex(x, Vector{Int}(idx)) end @inline Base.getindex(x::AbstractIndex, idx::AbstractRange{Bool}) = getindex(x, collect(idx)) @inline function Base.getindex(x::AbstractIndex, idx::AbstractVector{Bool}) length(x) == length(idx) || throw(BoundsError(x, idx)) - findall(idx) + return findall(idx) end # catch all method handling cases when type of idx is not narrowest possible, Any in particular @@ -220,18 +240,28 @@ end isempty(idxs) && return Int[] # special case of empty idxs if idxs[1] isa Real if !all(v -> v isa Integer && !(v isa Bool), idxs) - throw(ArgumentError("Only Integer values allowed when indexing by vector of numbers")) + throw(ArgumentError("Only `Integer` values allowed when indexing by vector of numbers")) end return getindex(x, convert(Vector{Int}, idxs)) + elseif idxs[1] isa Symbol + if all(x -> x isa Symbol, idxs) + return getindex(x, convert(Vector{Symbol}, idxs)) + else + throw(ArgumentError("mixing `Symbol`s with other selectors is not allowed")) + end + elseif idxs[1] isa AbstractString + if all(x -> x isa AbstractString, idxs) + return getindex(x, Symbol.(idxs)) + else + throw(ArgumentError("mixing strings with other selectors is not allowed")) + end end - idxs[1] isa Symbol && return getindex(x, convert(Vector{Symbol}, idxs)) - throw(ArgumentError("idxs[1] has type $(typeof(idxs[1])); "* - "Only Integer or Symbol values allowed when indexing by vector")) + throw(ArgumentError("idxs[1] has type $(typeof(idxs[1])); only Integer, Symbol, "* + "or string values allowed when indexing by vector")) end -@inline function Base.getindex(x::AbstractIndex, rx::Regex) +@inline Base.getindex(x::AbstractIndex, rx::Regex) = getindex(x, filter(name -> occursin(rx, String(name)), _names(x))) -end # Fuzzy matching rules: # 1. 
ignore case @@ -245,7 +275,7 @@ function fuzzymatch(l::Dict{Symbol, Int}, idx::Symbol) sort!(dist) c = [count(x -> x[1] <= i, dist) for i in 0:2] maxd = max(0, searchsortedlast(c, 8) - 1) - [s for (d, s) in dist if d <= maxd] + return [s for (d, s) in dist if d <= maxd] end @inline function lookupname(l::Dict{Symbol, Int}, idx::Symbol) @@ -259,19 +289,22 @@ end throw(ArgumentError("column name :$idx not found in the data frame; " * "existing most similar names are: $candidatesstr")) end - i + return i end @inline Base.getindex(x::Index, idx::Symbol) = lookupname(x.lookup, idx) -@inline function Base.getindex(x::Index, idx::AbstractVector{Symbol}) +@inline Base.getindex(x::Index, idx::AbstractString) = x[Symbol(idx)] + +@inline function Base.getindex(x::Index, idx::Union{AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}) allunique(idx) || throw(ArgumentError("Elements of $idx must be unique")) - [lookupname(x.lookup, i) for i in idx] + return [x[i] for i in idx] end # Helpers function add_names(ind::Index, add_ind::AbstractIndex; makeunique::Bool=false) - u = names(add_ind) + u = copy(_names(add_ind)) seen = Set(_names(ind)) dups = Int[] @@ -336,9 +369,15 @@ Base.@propagate_inbounds function parentcols(ind::SubIndex, idx::Symbol) return parentcol end +Base.@propagate_inbounds parentcols(ind::SubIndex, idx::AbstractString) = + parentcols(ind, Symbol(idx)) + Base.@propagate_inbounds parentcols(ind::SubIndex, idx::AbstractVector{Symbol}) = [parentcols(ind, i) for i in idx] +Base.@propagate_inbounds parentcols(ind::SubIndex, idx::AbstractVector{<:AbstractString}) = + [parentcols(ind, i) for i in idx] + Base.@propagate_inbounds parentcols(ind::SubIndex, idx::Regex) = [parentcols(ind, i) for i in _names(ind) if occursin(idx, String(i))] @@ -354,7 +393,7 @@ function SubIndex(parent::AbstractIndex, cols::AbstractUnitRange{Int}) throw(BoundsError(parent, cols)) end remap = (1:l) .- f .+ 1 - SubIndex(parent, cols, remap) + return SubIndex(parent, cols, remap) end 
function SubIndex(parent::AbstractIndex, cols::AbstractVector{Int}) @@ -369,7 +408,7 @@ function SubIndex(parent::AbstractIndex, cols::AbstractVector{Int}) end remap[col] = i end - SubIndex(parent, cols, remap) + return SubIndex(parent, cols, remap) end @inline SubIndex(parent::AbstractIndex, cols::ColumnIndex) = @@ -379,7 +418,7 @@ Base.@propagate_inbounds SubIndex(parent::AbstractIndex, cols) = SubIndex(parent, parent[cols]) Base.length(x::SubIndex) = length(x.cols) -Base.names(x::SubIndex) = copy(_names(x)) +Base.names(x::SubIndex) = string.(_names(x)) _names(x::SubIndex) = view(_names(x.parent), x.cols) function Base.haskey(x::SubIndex, key::Symbol) @@ -387,20 +426,19 @@ function Base.haskey(x::SubIndex, key::Symbol) pos = x.parent[key] remap = x.remap checkbounds(Bool, remap, pos) || return false - remap[pos] > 0 + return remap[pos] > 0 end +Base.haskey(x::SubIndex, key::AbstractString) = haskey(x, Symbol(key)) Base.haskey(x::SubIndex, key::Integer) = 1 <= key <= length(x) Base.haskey(x::SubIndex, key::Bool) = throw(ArgumentError("invalid key: $key of type Bool")) -Base.keys(x::SubIndex) = names(x) -function Base.getindex(x::SubIndex, idx::Symbol) - remap = x.remap - remap[x.parent[idx]] -end +Base.getindex(x::SubIndex, idx::Symbol) = x.remap[x.parent[idx]] +Base.getindex(x::SubIndex, idx::AbstractString) = x[Symbol(idx)] -function Base.getindex(x::SubIndex, idx::AbstractVector{Symbol}) +function Base.getindex(x::SubIndex, idx::Union{AbstractVector{Symbol}, + AbstractVector{AbstractString}}) allunique(idx) || throw(ArgumentError("Elements of $idx must be unique")) - [x[i] for i in idx] + return [x[i] for i in idx] end diff --git a/src/other/tables.jl b/src/other/tables.jl index dcabe88ab5..bb40c1470e 100644 --- a/src/other/tables.jl +++ b/src/other/tables.jl @@ -6,17 +6,17 @@ Tables.rows(df::AbstractDataFrame) = eachrow(df) Tables.rowtable(df::AbstractDataFrame) = Tables.rowtable(Tables.columntable(df)) Tables.namedtupleiterator(df::AbstractDataFrame) = 
Tables.namedtupleiterator(Tables.columntable(df)) +Tables.columnindex(df::AbstractDataFrame, idx::AbstractString) = + columnindex(df, Symbol(idx)) -Tables.schema(df::AbstractDataFrame) = Tables.Schema(names(df), eltype.(eachcol(df))) +Tables.schema(df::AbstractDataFrame) = Tables.Schema(propertynames(df), eltype.(eachcol(df))) Tables.materializer(df::AbstractDataFrame) = DataFrame Tables.getcolumn(df::AbstractDataFrame, i::Int) = df[!, i] Tables.getcolumn(df::AbstractDataFrame, nm::Symbol) = df[!, nm] -Tables.columnnames(df::AbstractDataFrame) = names(df) Tables.getcolumn(dfr::DataFrameRow, i::Int) = dfr[i] Tables.getcolumn(dfr::DataFrameRow, nm::Symbol) = dfr[nm] -Tables.columnnames(dfr::DataFrameRow) = names(dfr) getvector(x::AbstractVector) = x getvector(x) = [x[i] for i = 1:length(x)] @@ -78,8 +78,6 @@ Tables.getcolumn(itr::Union{DataFrameRows,DataFrameColumns}, i::Int) = Tables.getcolumn(parent(itr), i) Tables.getcolumn(itr::Union{DataFrameRows,DataFrameColumns}, nm::Symbol) = Tables.getcolumn(parent(itr), nm) -Tables.columnnames(itr::Union{DataFrameRows,DataFrameColumns}) = - Tables.columnnames(parent(itr)) IteratorInterfaceExtensions.getiterator(df::AbstractDataFrame) = Tables.datavaluerows(Tables.columntable(df)) diff --git a/src/other/utils.jl b/src/other/utils.jl index 061ca6c174..3cda5346f7 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -1,8 +1,8 @@ """ AsTable(cols) -A type used for selection operations to signal that the columns selected by the wrapped -selector should be passed as a `NamedTuple` to the function. +A type used for selection operations to signal that the columns selected by the +wrapped selector should be passed as a `NamedTuple` to the function. 
""" struct AsTable cols @@ -10,7 +10,8 @@ end Base.broadcastable(x::AsTable) = Ref(x) -function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; makeunique::Bool=false) +function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; + makeunique::Bool=false) if length(names) != length(src) throw(DimensionMismatch("Length of src doesn't match length of names.")) end @@ -59,11 +60,11 @@ end """ gennames(n::Integer) -Generate standardized names for columns of a DataFrame. The first name will be `:x1`, the -second `:x2`, etc. +Generate standardized names for columns of a DataFrame. +The first name will be `:x1`, the second `:x2`, etc. """ function gennames(n::Integer) - res = Array{Symbol}(undef, n) + res = Vector{Symbol}(undef, n) for i in 1:n res[i] = Symbol(@sprintf "x%d" i) end diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index b11ac814ac..44349b6cb1 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -92,13 +92,13 @@ Base.@propagate_inbounds Base.view(adf::AbstractDataFrame, ::typeof(!), colind:: @inline Base.view(adf::AbstractDataFrame, rowinds, colind::Bool) = throw(ArgumentError("invalid column index $colind of type `Bool`")) Base.@propagate_inbounds Base.view(adf::AbstractDataFrame, rowinds, - colinds::Union{AbstractVector, Regex, Not, Between, All, Colon}) = + colinds::MultiColumnIndex) = SubDataFrame(adf, rowinds, colinds) Base.@propagate_inbounds Base.view(adf::AbstractDataFrame, rowinds::typeof(!), - colinds::Union{AbstractVector, Regex, Not, Between, All, Colon}) = + colinds::MultiColumnIndex) = SubDataFrame(adf, :, colinds) Base.@propagate_inbounds Base.view(adf::AbstractDataFrame, rowinds::Not, - colinds::Union{AbstractVector, Regex, Not, Between, All, Colon}) = + colinds::MultiColumnIndex) = SubDataFrame(adf, axes(adf, 1)[rowinds], colinds) ############################################################################## @@ -126,13 +126,13 @@ 
Base.@propagate_inbounds Base.getindex(sdf::SubDataFrame, ::typeof(!), colind::C view(parent(sdf), rows(sdf), parentcols(index(sdf), colind)) Base.@propagate_inbounds Base.getindex(sdf::SubDataFrame, rowinds::Union{AbstractVector, Not}, - colinds::Union{AbstractVector, Regex, Not, Between, All, Colon}) = + colinds::MultiColumnIndex) = parent(sdf)[rows(sdf)[rowinds], parentcols(index(sdf), colinds)] Base.@propagate_inbounds Base.getindex(sdf::SubDataFrame, ::Colon, - colinds::Union{AbstractVector, Regex, Not, Between, All, Colon}) = + colinds::MultiColumnIndex) = parent(sdf)[rows(sdf), parentcols(index(sdf), colinds)] Base.@propagate_inbounds Base.getindex(df::SubDataFrame, row_ind::typeof(!), - col_inds::Union{AbstractVector, Regex, Not, Between, All, Colon}) = + col_inds::MultiColumnIndex) = select(df, col_inds, copycols=false) @@ -168,7 +168,7 @@ function DataFrame(sdf::SubDataFrame; copycols::Bool=true) if copycols sdf[:, :] else - DataFrame(eachcol(sdf), names(sdf), copycols=false) + DataFrame(eachcol(sdf), _names(sdf), copycols=false) end end diff --git a/test/broadcasting.jl b/test/broadcasting.jl index 8f9e3f92b0..2668ff98b8 100644 --- a/test/broadcasting.jl +++ b/test/broadcasting.jl @@ -205,9 +205,17 @@ end @test df.x1 == [2.5, 4.5, 6.5] @test df[:, 2:end] == refdf[:, 2:end] + df = copy(refdf) + df[!, "x1"] .+= [0, 1, 2] .+ 1 + @test df."x1" == [2.5, 4.5, 6.5] + @test df[:, Not("x1")] == refdf[:, 2:end] + dfv = @view df[1:2, 2:end] @test_throws ArgumentError dfv[!, 1] .+= [0, 1] .+ 1 + dfv = @view df[1:2, 2:end] + @test_throws ArgumentError dfv[!, "x1"] .+= [0, 1] .+ 1 + df = copy(refdf) df.x1 .+= [0, 1, 2] .+ 1 @test df.x1 == [2.5, 4.5, 6.5] @@ -221,6 +229,19 @@ end 4.5 7.5 8.5 11.5 14.5 6.5 6.5 9.5 12.5 15.5] + df = copy(refdf) + df."x1" .+= [0, 1, 2] .+ 1 + @test df."x1" == [2.5, 4.5, 6.5] + @test df[:, 2:end] == refdf[:, 2:end] + + dfv = @view df[1:2, 2:end] + dfv."x2" .+= [0, 1] .+ 1 + @test dfv."x2" == [5.5, 7.5] + @test dfv[:, 2:end] == refdf[1:2, 
3:end] + @test Matrix(df) == [2.5 5.5 7.5 10.5 13.5 + 4.5 7.5 8.5 11.5 14.5 + 6.5 6.5 9.5 12.5 15.5] + dfr = df[1, 3:end] dfr[end-1:end] .= [9, 10] .+ 1 @test Vector(dfr) == [7.5, 10.0, 11.0] @@ -241,6 +262,19 @@ end 4.5 7.5 8.5 11.5 14.5 6.5 6.5 9.5 12.5 15.5] + df = copy(refdf) + df[:, "x1"] .+= [0, 1, 2] .+ 1 + @test df."x1" == [2.5, 4.5, 6.5] + @test df[:, 2:end] == refdf[:, 2:end] + + dfv = @view df[1:2, 2:end] + dfv[:, "x2"] .+= [0, 1] .+ 1 + @test dfv."x2" == [5.5, 7.5] + @test dfv[:, 2:end] == refdf[1:2, 3:end] + @test Matrix(df) == [2.5 5.5 7.5 10.5 13.5 + 4.5 7.5 8.5 11.5 14.5 + 6.5 6.5 9.5 12.5 15.5] + df = copy(refdf) dfv = @view df[1:2, 2:end] dfr = df[1, 3:end] @@ -270,6 +304,21 @@ end 3.5 5.5 8.5 11.5 14.5 4.5 6.5 9.5 12.5 15.5] + df = copy(refdf) + df[!, "x1"] .+= 1 + @test df."x1" == [2.5, 3.5, 4.5] + @test df[:, 2:end] == refdf[:, 2:end] + + dfv = @view df[1:2, 2:end] + @test_throws ArgumentError dfv[!, "x2"] .+= 1 + + dfr = df[1, 3:end] + dfr[["x4", "x5"]] .= 10 + @test Vector(dfr) == [7.5, 10.0, 10.0] + @test Matrix(df) == [2.5 4.5 7.5 10.0 10.0 + 3.5 5.5 8.5 11.5 14.5 + 4.5 6.5 9.5 12.5 15.5] + df = copy(refdf) df[:, :x1] .+= 1 @test df.x1 == [2.5, 3.5, 4.5] @@ -283,6 +332,19 @@ end 3.5 6.5 8.5 11.5 14.5 4.5 6.5 9.5 12.5 15.5] + df = copy(refdf) + df[:, "x1"] .+= 1 + @test df."x1" == [2.5, 3.5, 4.5] + @test df[:, 2:end] == refdf[:, 2:end] + + dfv = @view df[1:2, 2:end] + dfv[:, "x2"] .+= 1 + @test dfv."x2" == [5.5, 6.5] + @test dfv[:, 2:end] == refdf[1:2, 3:end] + @test Matrix(df) == [2.5 5.5 7.5 10.5 13.5 + 3.5 6.5 8.5 11.5 14.5 + 4.5 6.5 9.5 12.5 15.5] + df = copy(refdf) df[!, :x1] .+= [1, 2, 3] @test df.x1 == [2.5, 4.5, 6.5] @@ -298,6 +360,21 @@ end 4.5 5.5 8.5 11.5 14.5 6.5 6.5 9.5 12.5 15.5] + df = copy(refdf) + df[!, "x1"] .+= [1, 2, 3] + @test df."x1" == [2.5, 4.5, 6.5] + @test df[:, 2:end] == refdf[:, 2:end] + + dfv = @view df[1:2, 2:end] + @test_throws ArgumentError dfv[!, :x2] .+= [1, 2] + + dfr = df[1, 3:end] + dfr[["x4", "x5"]] .= 
[10, 11] + @test Vector(dfr) == [7.5, 10.0, 11.0] + @test Matrix(df) == [2.5 4.5 7.5 10.0 11.0 + 4.5 5.5 8.5 11.5 14.5 + 6.5 6.5 9.5 12.5 15.5] + df = copy(refdf) df[:, :x1] .+= [1, 2, 3] @test df.x1 == [2.5, 4.5, 6.5] @@ -311,6 +388,19 @@ end 4.5 7.5 8.5 11.5 14.5 6.5 6.5 9.5 12.5 15.5] + df = copy(refdf) + df[:, "x1"] .+= [1, 2, 3] + @test df."x1" == [2.5, 4.5, 6.5] + @test df[:, 2:end] == refdf[:, 2:end] + + dfv = @view df[1:2, 2:end] + dfv[:, "x2"] .+= [1, 2] + @test dfv."x2" == [5.5, 7.5] + @test dfv[:, 2:end] == refdf[1:2, 3:end] + @test Matrix(df) == [2.5 5.5 7.5 10.5 13.5 + 4.5 7.5 8.5 11.5 14.5 + 6.5 6.5 9.5 12.5 15.5] + df = copy(refdf) dfv = @view df[1:2, 2:end] dfr = df[1, 3:end] @@ -324,6 +414,11 @@ end @test_throws DimensionMismatch dfr[end-1:end] .= reshape(rand(3), :, 1) @test_throws DimensionMismatch df[:, 1] .= reshape(rand(3), :, 1) @test_throws DimensionMismatch dfv[:, 1] .= reshape(rand(2), :, 1) + @test_throws DimensionMismatch df[!, "x1"] .= rand(3, 1) + @test_throws ArgumentError dfv[!, "x2"] .= rand(2, 1) + @test_throws DimensionMismatch dfr[["x4", "x5"]] .= rand(3, 1) + @test_throws DimensionMismatch df[:, "x1"] .= rand(3, 1) + @test_throws DimensionMismatch dfv[:, "x2"] .= rand(2, 1) end @testset "normal data frame and data frame view in broadcasted assignment - two columns" begin @@ -415,6 +510,21 @@ end 3.5 6.5 9.5 12.5 14.5 4.5 7.5 9.5 12.5 15.5] + df = copy(refdf) + df[:, ["x1","x2"]] .= Matrix(df[:, [:x1,:x2]]) .+ 1 + @test df.x1 == [2.5, 3.5, 4.5] + @test df.x2 == [5.5, 6.5, 7.5] + @test df[:, 3:end] == refdf[:, 3:end] + + dfv = @view df[1:2, 3:end] + dfv[:, ["x3","x4"]] .= Matrix(dfv[:, [:x3,:x4]]) .+ 1 + @test dfv.x3 == [8.5, 9.5] + @test dfv.x4 == [11.5, 12.5] + @test dfv[:, 3:end] == refdf[1:2, 5:end] + @test Matrix(df) == [2.5 5.5 8.5 11.5 13.5 + 3.5 6.5 9.5 12.5 14.5 + 4.5 7.5 9.5 12.5 15.5] + df = copy(refdf) df[:, [:x1,:x2]] .= Matrix(df[:, [:x1,:x2]]) .+ 1 @test df.x1 == [2.5, 3.5, 4.5] @@ -430,6 +540,21 @@ end 3.5 6.5 9.5 
12.5 14.5 4.5 7.5 9.5 12.5 15.5] + df = copy(refdf) + df[:, ["x1","x2"]] .= Matrix(df[:, [:x1,:x2]]) .+ 1 + @test df.x1 == [2.5, 3.5, 4.5] + @test df.x2 == [5.5, 6.5, 7.5] + @test df[:, 3:end] == refdf[:, 3:end] + + dfv = @view df[1:2, 3:end] + dfv[:, ["x3","x4"]] .= Matrix(dfv[:, [:x3,:x4]]) .+ 1 + @test dfv.x3 == [8.5, 9.5] + @test dfv.x4 == [11.5, 12.5] + @test dfv[:, 3:end] == refdf[1:2, 5:end] + @test Matrix(df) == [2.5 5.5 8.5 11.5 13.5 + 3.5 6.5 9.5 12.5 14.5 + 4.5 7.5 9.5 12.5 15.5] + df = copy(refdf) df[:, [:x1,:x2]] .= Matrix(df[:, [:x1,:x2]]) .+ [1 4 2 5 @@ -448,6 +573,24 @@ end 4.5 10.5 10.5 15.5 14.5 6.5 12.5 9.5 12.5 15.5] + df = copy(refdf) + df[:, ["x1","x2"]] .= Matrix(df[:, ["x1","x2"]]) .+ [1 4 + 2 5 + 3 6] + @test df.x1 == [2.5, 4.5, 6.5] + @test df.x2 == [8.5, 10.5, 12.5] + @test df[:, 3:end] == refdf[:, 3:end] + + dfv = @view df[1:2, 3:end] + dfv[:, ["x3","x4"]] .= Matrix(dfv[:, ["x3","x4"]]) .+ [1 3 + 2 4] + @test dfv.x3 == [8.5, 10.5] + @test dfv.x4 == [13.5, 15.5] + @test dfv[:, 3:end] == refdf[1:2, 5:end] + @test Matrix(df) == [2.5 8.5 8.5 13.5 13.5 + 4.5 10.5 10.5 15.5 14.5 + 6.5 12.5 9.5 12.5 15.5] + df = copy(refdf) df[:, [:x1,:x2]] .= Matrix(df[:, [:x1,:x2]]) .+ [1 4 2 5 @@ -466,12 +609,34 @@ end 4.5 10.5 10.5 15.5 14.5 6.5 12.5 9.5 12.5 15.5] + df = copy(refdf) + df[:, ["x1","x2"]] .= Matrix(df[:, ["x1","x2"]]) .+ [1 4 + 2 5 + 3 6] + @test df.x1 == [2.5, 4.5, 6.5] + @test df.x2 == [8.5, 10.5, 12.5] + @test df[:, 3:end] == refdf[:, 3:end] + + dfv = @view df[1:2, 3:end] + dfv[:, ["x3","x4"]] .= Matrix(dfv[:, ["x3","x4"]]) .+ [1 3 + 2 4] + @test dfv.x3 == [8.5, 10.5] + @test dfv.x4 == [13.5, 15.5] + @test dfv[:, 3:end] == refdf[1:2, 5:end] + @test Matrix(df) == [2.5 8.5 8.5 13.5 13.5 + 4.5 10.5 10.5 15.5 14.5 + 6.5 12.5 9.5 12.5 15.5] + df = copy(refdf) dfv = @view df[1:2, 2:end] @test_throws DimensionMismatch df[:, [:x1,:x2]] .= rand(3, 10) @test_throws DimensionMismatch dfv[:, [:x3,:x4]] .= rand(2, 10) @test_throws DimensionMismatch 
df[:, [:x1,:x2]] .= rand(3, 10) @test_throws DimensionMismatch dfv[:, [:x3,:x4]] .= rand(2, 10) + @test_throws DimensionMismatch df[:, ["x1","x2"]] .= rand(3, 10) + @test_throws DimensionMismatch dfv[:, ["x3","x4"]] .= rand(2, 10) + @test_throws DimensionMismatch df[:, ["x1","x2"]] .= rand(3, 10) + @test_throws DimensionMismatch dfv[:, ["x3","x4"]] .= rand(2, 10) df = copy(refdf) df[:, [1,2]] .= [1 2 @@ -574,13 +739,13 @@ end @test Matrix(df) == [1.5 4.5 7.5 10.5 13.5 1.0 2.5 5.5 8.5 11.5 14.5 1.0 3.5 6.5 9.5 12.5 15.5 1.0] - @test names(df)[end] == :a + @test names(df)[end] == "a" @test df[:, 1:end-1] == refdf df[!, :b] .= [1, 2, 3] @test Matrix(df) == [1.5 4.5 7.5 10.5 13.5 1.0 1.0 2.5 5.5 8.5 11.5 14.5 1.0 2.0 3.5 6.5 9.5 12.5 15.5 1.0 3.0] - @test names(df)[end] == :b + @test names(df)[end] == "b" @test df[:, 1:end-2] == refdf cdf = copy(df) @test_throws DimensionMismatch df[!, :c] .= ones(3, 1) @@ -604,6 +769,42 @@ end df[!, :b] .= sin.(1) df[!, :c] .= sin(1) .+ 1 @test df == DataFrame(b=Float64[], c=Float64[]) + + df = copy(refdf) + df[!, "a"] .= 1 + @test Matrix(df) == [1.5 4.5 7.5 10.5 13.5 1.0 + 2.5 5.5 8.5 11.5 14.5 1.0 + 3.5 6.5 9.5 12.5 15.5 1.0] + @test names(df)[end] == "a" + @test df[:, 1:end-1] == refdf + df[!, "b"] .= [1, 2, 3] + @test Matrix(df) == [1.5 4.5 7.5 10.5 13.5 1.0 1.0 + 2.5 5.5 8.5 11.5 14.5 1.0 2.0 + 3.5 6.5 9.5 12.5 15.5 1.0 3.0] + @test names(df)[end] == "b" + @test df[:, 1:end-2] == refdf + cdf = copy(df) + @test_throws DimensionMismatch df[!, "c"] .= ones(3, 1) + @test df == cdf + @test_throws DimensionMismatch df[!, "x"] .= ones(4) + @test df == cdf + @test_throws ArgumentError df[!, 10] .= ones(3) + @test df == cdf + + dfv = @view df[1:2, 2:end] + @test_throws ArgumentError dfv[!, 10] .= ones(3) + @test_throws ArgumentError dfv[!, "z"] .= ones(3) + @test df == cdf + dfr = df[1, 3:end] + @test_throws BoundsError dfr[10] .= ones(3) + @test_throws ArgumentError dfr["z"] .= ones(3) + @test df == cdf + + df = DataFrame() + 
@test_throws DimensionMismatch df[!, "a"] .= sin.(1:3) + df[!, "b"] .= sin.(1) + df[!, "c"] .= sin(1) .+ 1 + @test df == DataFrame(b=Float64[], c=Float64[]) end @testset "empty data frame corner case" begin @@ -615,6 +816,11 @@ end @test_throws DimensionMismatch df[!, :a] .= [1 2] @test_throws DimensionMismatch df[!, :a] .= [1, 2] @test_throws DimensionMismatch df[!, :a] .= sin.(1) .+ [1, 2] + @test_throws ArgumentError df[!, ["a", "b"]] .= [1] + @test_throws ArgumentError df[!, ["a", "b"]] .= 1 + @test_throws DimensionMismatch df[!, "a"] .= [1 2] + @test_throws DimensionMismatch df[!, "a"] .= [1, 2] + @test_throws DimensionMismatch df[!, "a"] .= sin.(1) .+ [1, 2] for rhs in [1, [1], Int[], "abc", ["abc"]] df = DataFrame() @@ -656,6 +862,46 @@ end @. df[!, :a] = length(rhs) + 1 @test size(df) == (0, 2) @test eltype(df[!, 2]) == Int + + df = DataFrame() + df[!, "a"] .= rhs + @test size(df) == (0, 1) + @test eltype(df[!, 1]) == (rhs isa AbstractVector ? eltype(rhs) : typeof(rhs)) + + df = DataFrame() + df[!, "a"] .= length.(rhs) + @test size(df) == (0, 1) + @test eltype(df[!, 1]) == Int + + df = DataFrame() + df[!, "a"] .= length.(rhs) .+ 1 + @test size(df) == (0, 1) + @test eltype(df[!, 1]) == Int + + df = DataFrame() + @. df[!, "a"] = length(rhs) + 1 + @test size(df) == (0, 1) + @test eltype(df[!, 1]) == Int + + df = DataFrame(x=Int[]) + df[!, "a"] .= rhs + @test size(df) == (0, 2) + @test eltype(df[!, 2]) == (rhs isa AbstractVector ? eltype(rhs) : typeof(rhs)) + + df = DataFrame(x=Int[]) + df[!, "a"] .= length.(rhs) + @test size(df) == (0, 2) + @test eltype(df[!, 2]) == Int + + df = DataFrame(x=Int[]) + df[!, "a"] .= length.(rhs) .+ 1 + @test size(df) == (0, 2) + @test eltype(df[!, 2]) == Int + + df = DataFrame(x=Int[]) + @. 
df[!, "a"] = length(rhs) + 1 + @test size(df) == (0, 2) + @test eltype(df[!, 2]) == Int end df = DataFrame() @@ -675,7 +921,7 @@ end @test eltype(df.b) == Int df[!, :b] .= 'a' @test eltype(df.b) == Char - @test names(df) == [:a, :b] + @test names(df) == ["a", "b"] c = categorical(["a", "b", "c"]) df = DataFrame() @@ -684,6 +930,23 @@ end df[!, :b] .= c[1] @test nrow(df) == 0 @test df.b isa CategoricalVector{String} + + df = DataFrame(a=[]) + df[!, "b"] .= sin.(1) + @test eltype(df."b") == Float64 + df[!, "b"] .= [1] + @test eltype(df."b") == Int + df[!, "b"] .= 'a' + @test eltype(df."b") == Char + @test names(df) == ["a", "b"] + + c = categorical(["a", "b", "c"]) + df = DataFrame() + @test_throws DimensionMismatch df[!, "a"] .= c + + df[!, "b"] .= c[1] + @test nrow(df) == 0 + @test df."b" isa CategoricalVector{String} end @testset "test categorical values" begin @@ -1022,6 +1285,12 @@ end @test_throws MethodError df[1, :x1] .= "d" @test_throws DimensionMismatch df[1, :x1] .= [1, 2] + df = copy(refdf) + v1 = df[!, 1] + @test_throws MethodError df[1, "x1"] .= 1 + @test_throws MethodError df[1, "x1"] .= "d" + @test_throws DimensionMismatch df[1, "x1"] .= [1, 2] + df = copy(refdf) v1 = df[!, 1] v2 = df[!, 2] @@ -1134,15 +1403,29 @@ end @test df.newcol == [100.0, 100.0, 100.0] @test df[:, 1:end-1] == refdf + df = copy(refdf) + df[!, "newcol"] .= 100.0 + @test df.newcol == [100.0, 100.0, 100.0] + @test df[:, 1:end-1] == refdf + df = copy(refdf) df[!, :newcol] .= 'd' @test df.newcol == ['d', 'd', 'd'] @test df[:, 1:end-1] == refdf + df = copy(refdf) + df[!, "newcol"] .= 'd' + @test df.newcol == ['d', 'd', 'd'] + @test df[:, 1:end-1] == refdf + df = copy(refdf) @test_throws DimensionMismatch df[!, :newcol] .= [1 2 3] @test df == refdf + df = copy(refdf) + @test_throws DimensionMismatch df[!, "newcol"] .= [1 2 3] + @test df == refdf + df = copy(refdf) @test_throws ArgumentError df[!, 10] .= 'a' @test df == refdf @@ -1319,6 +1602,18 @@ end dfr.a .= 10 @test df == 
DataFrame(a=[[10,10],[3,4]], b=[[5,6],[7,8]]) @test_throws MethodError dfr.a .= ["a", "b"] + + df = DataFrame(a=[[1,2],[3,4]], b=[[5,6],[7,8]]) + dfr = df[1, :] + dfr."a" .= 10 + @test df == DataFrame(a=[[10,10],[3,4]], b=[[5,6],[7,8]]) + @test_throws MethodError dfr."a" .= ["a", "b"] + + df = DataFrame(a=[[1,2],[3,4]], b=[[5,6],[7,8]]) + dfr = df[1, 1:1] + dfr."a" .= 10 + @test df == DataFrame(a=[[10,10],[3,4]], b=[[5,6],[7,8]]) + @test_throws MethodError dfr."a" .= ["a", "b"] end @testset "make sure that : is in place and ! allocates" begin @@ -1331,24 +1626,37 @@ end @test a == [2, 3, 4] @test df.a == [3, 4, 5] @test df.a !== a + + df = DataFrame(a = [1, 2, 3]) + a = df.a + df[:, "a"] .+= 1 + @test a == [2, 3, 4] + @test df.a === a + df[!, "a"] .+= 1 + @test a == [2, 3, 4] + @test df.a == [3, 4, 5] + @test df.a !== a end @testset "add new correct rules for df[row, col] .= v broadcasting" begin - df = DataFrame(a=1) - @test_throws MethodError df[1,1] .= 10 - @test_throws MethodError df[1,:a] .= 10 - @test_throws MethodError df[CartesianIndex(1,1)] .= 10 - df = DataFrame(a=[[1,2,3]]) - df[1,1] .= 10 - @test df == DataFrame(a=[[10,10,10]]) - df[1,:a] .= 100 - @test df == DataFrame(a=[[100,100,100]]) - df[CartesianIndex(1,1)] .= 1000 - @test df == DataFrame(a=[[1000,1000,1000]]) + for v in [:a, "a"] + df = DataFrame(a=1) + @test_throws MethodError df[1,1] .= 10 + @test_throws MethodError df[1, v] .= 10 + @test_throws MethodError df[CartesianIndex(1,1)] .= 10 + df = DataFrame(a=[[1,2,3]]) + df[1,1] .= 10 + @test df == DataFrame(a=[[10,10,10]]) + df[1, v] .= 100 + @test df == DataFrame(a=[[100,100,100]]) + df[CartesianIndex(1,1)] .= 1000 + @test df == DataFrame(a=[[1000,1000,1000]]) + end end @testset "broadcasting into df[!, cols]" begin - for selector in [1:2, Between(:x1, :x2), Not(r"x3"), [:x1, :x2]] + for selector in [1:2, Between(:x1, :x2), Not(r"x3"), [:x1, :x2], + ["x1", "x2"], Between("x1", "x2")] df = DataFrame(x1=1:3, x2=4:6) df[!, selector] .= "a" @test df 
== DataFrame(fill("a", 3, 2)) @@ -1509,6 +1817,21 @@ end df = DataFrame(ones(3, 4)) z = fill("abc", 1, 1, 1) @test_throws DimensionMismatch df[:, :z] .= z + + df = DataFrame(ones(3, 4)) + z = ["a", "b", "c"] + df[:, "z"] .= z + @test df.z == z + @test df.z !== z + + df = DataFrame(ones(3, 4)) + z = "abc" + df[:, "z"] .= z + @test df.z == fill("abc", 3) + + df = DataFrame(ones(3, 4)) + z = fill("abc", 1, 1, 1) + @test_throws DimensionMismatch df[:, "z"] .= z end end # module diff --git a/test/cat.jl b/test/cat.jl index 6cee48c0ae..d2b80c1ebc 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -19,7 +19,7 @@ const ≅ = isequal dfh = hcat(df3, df4, makeunique=true) @test ref_df ≅ df3 # make sure that df3 is not mutated by hcat @test size(dfh, 2) == 3 - @test names(dfh) ≅ [:x1, :x1_1, :x2] + @test names(dfh) ≅ ["x1", "x1_1", "x2"] @test dfh[!, :x1] ≅ df3[!, :x1] @test dfh ≅ DataFrames.hcat!(DataFrame(), df3, df4, makeunique=true) @@ -28,7 +28,7 @@ const ≅ = isequal @test hcat(dfa, dfb) ≅ [dfa dfb] dfh3 = hcat(df3, df4, df5, makeunique=true) - @test names(dfh3) == [:x1, :x1_1, :x2, :x1_2, :x2_1] + @test names(dfh3) == ["x1", "x1_1", "x2", "x1_2", "x2_1"] @test dfh3 ≅ hcat(dfh, df5, makeunique=true) @test dfh3 ≅ DataFrames.hcat!(DataFrame(), df3, df4, df5, makeunique=true) @@ -101,12 +101,12 @@ end df2 = hcat(CategoricalVector{Union{Int,Missing}}(1:10), df, makeunique=true) @test isempty(df) @test df2[!, 1] == collect(1:10) - @test names(df2) == [:x1] + @test names(df2) == ["x1"] ref_df = copy(df2) df3 = hcat(11:20, df2, makeunique=true) @test df2 == ref_df @test df3[!, 1] == collect(11:20) - @test names(df3) == [:x1, :x1_1] + @test names(df3) == ["x1", "x1_1"] @test_throws ArgumentError hcat("a", df, makeunique=true) @test_throws ArgumentError hcat(df, "a", makeunique=true) @@ -129,75 +129,75 @@ end @test df3.a === df1.a df3 = hcat(df1, df2) - @test names(df3) == [:a, :b] + @test propertynames(df3) == [:a, :b] @test df3.a == df1.a @test df3.b == df2.b @test df3.a !== df1.a 
@test df3.b !== df2.b df3 = hcat(df1, df2, copycols=true) - @test names(df3) == [:a, :b] + @test propertynames(df3) == [:a, :b] @test df3.a == df1.a @test df3.b == df2.b @test df3.a !== df1.a @test df3.b !== df2.b df3 = hcat(df1, df2, copycols=false) - @test names(df3) == [:a, :b] + @test propertynames(df3) == [:a, :b] @test df3.a === df1.a @test df3.b === df2.b df3 = hcat(df1, dfv) - @test names(df3) == [:a, :b] + @test propertynames(df3) == [:a, :b] @test df3.a == df1.a @test df3.b == df2.b @test df3.a !== df1.a @test df3.b !== df2.b df3 = hcat(df1, dfv, copycols=true) - @test names(df3) == [:a, :b] + @test propertynames(df3) == [:a, :b] @test df3.a == df1.a @test df3.b == df2.b @test df3.a !== df1.a @test df3.b !== df2.b df3 = hcat(df1, dfv, copycols=false) - @test names(df3) == [:a, :b] + @test propertynames(df3) == [:a, :b] @test df3.a === df1.a @test df3.b === dfv.b df3 = hcat(df1, x) - @test names(df3) == [:a, :x1] + @test propertynames(df3) == [:a, :x1] @test df3.a == df1.a @test df3.x1 == x @test df3.a !== df1.a @test df3.x1 !== x df3 = hcat(df1, x, copycols=true) - @test names(df3) == [:a, :x1] + @test propertynames(df3) == [:a, :x1] @test df3.a == df1.a @test df3.x1 == x @test df3.a !== df1.a @test df3.x1 !== x df3 = hcat(df1, x, copycols=false) - @test names(df3) == [:a, :x1] + @test propertynames(df3) == [:a, :x1] @test df3.a === df1.a @test df3.x1 === x df3 = hcat(x, df1) - @test names(df3) == [:x1, :a] + @test propertynames(df3) == [:x1, :a] @test df3.a == df1.a @test df3.x1 == x @test df3.a !== df1.a @test df3.x1 !== x df3 = hcat(x, df1, copycols=true) - @test names(df3) == [:x1, :a] + @test propertynames(df3) == [:x1, :a] @test df3.a == df1.a @test df3.x1 == x @test df3.a !== df1.a @test df3.x1 !== x df3 = hcat(x, df1, copycols=false) - @test names(df3) == [:x1, :a] + @test propertynames(df3) == [:x1, :a] @test df3.a === df1.a @test df3.x1 === x df3 = hcat(dfv, x, df1) - @test names(df3) == [:b, :x1, :a] + @test propertynames(df3) == [:b, :x1, :a] 
@test df3.a == df1.a @test df3.b == dfv.b @test df3.x1 == x @@ -205,7 +205,7 @@ end @test df3.b !== dfv.b @test df3.x1 !== x df3 = hcat(dfv, x, df1, copycols=true) - @test names(df3) == [:b, :x1, :a] + @test propertynames(df3) == [:b, :x1, :a] @test df3.a == df1.a @test df3.b == dfv.b @test df3.x1 == x @@ -213,7 +213,7 @@ end @test df3.b !== dfv.b @test df3.x1 !== x df3 = hcat(dfv, x, df1, copycols=false) - @test names(df3) == [:b, :x1, :a] + @test propertynames(df3) == [:b, :x1, :a] @test df3.a === df1.a @test df3.b === dfv.b @test df3.x1 === x @@ -400,27 +400,69 @@ end DataFrame(A = [1, 2, 3, 7, 8, 9], B = [4, 5, 6, missing, missing, missing], C = [missing, missing, missing, missing, missing, missing]) - @test vcat(df1, df2; cols = [:A, :B, :C]) ≅ reduce(vcat, [df1, df2]; cols = [:A, :B, :C]) - @test vcat(df1, df2; cols = [:A, :B, :C]) ≅ reduce(vcat, (df1, df2); cols = [:A, :B, :C]) + @test vcat(df1, df2; cols = [:A, :B, :C]) ≅ + reduce(vcat, [df1, df2]; cols = [:A, :B, :C]) + @test vcat(df1, df2; cols = [:A, :B, :C]) ≅ + reduce(vcat, (df1, df2); cols = [:A, :B, :C]) @test vcat(df1, df2, df3; cols = [:A, :B, :C]) ≅ DataFrame(A = [1, 2, 3, 7, 8, 9, 10, 11, 12], B = [4, 5, 6, missing, missing, missing, missing, missing, missing], C = [missing, missing, missing, missing, missing, missing, 13, 14, 15]) - @test vcat(df1, df2, df3; cols = [:A, :B, :C]) ≅ reduce(vcat, [df1, df2, df3]; cols = [:A, :B, :C]) - @test vcat(df1, df2, df3; cols = [:A, :B, :C]) ≅ reduce(vcat, (df1, df2, df3); cols = [:A, :B, :C]) + @test vcat(df1, df2, df3; cols = [:A, :B, :C]) ≅ + reduce(vcat, [df1, df2, df3]; cols = [:A, :B, :C]) + @test vcat(df1, df2, df3; cols = [:A, :B, :C]) ≅ + reduce(vcat, (df1, df2, df3); cols = [:A, :B, :C]) df1 = DataFrame(A=Int[], B=Float64[]) df2 = DataFrame(B=1.0, A=1) @test vcat(df1, df2, df1, cols=[:A, :C, :B]) ≅ DataFrame(A=1, C=missing, B=1.0) - @test vcat(df1, df2, df1, cols=[:A, :C, :B]) ≅ reduce(vcat, [df1, df2, df1], cols=[:A, :C, :B]) - @test vcat(df1, 
df2, df1, cols=[:A, :C, :B]) ≅ reduce(vcat, (df1, df2, df1), cols=[:A, :C, :B]) + @test vcat(df1, df2, df1, cols=[:A, :C, :B]) ≅ + reduce(vcat, [df1, df2, df1], cols=[:A, :C, :B]) + @test vcat(df1, df2, df1, cols=[:A, :C, :B]) ≅ + reduce(vcat, (df1, df2, df1), cols=[:A, :C, :B]) @test vcat(df1, df2, df2, cols=[:C]) ≅ DataFrame(C=[missing, missing]) @test vcat(df1, df2, df2, cols=[:C]) ≅ reduce(vcat, [df1, df2, df2], cols=[:C]) @test vcat(df1, df2, df2, cols=[:C]) ≅ reduce(vcat, (df1, df2, df2), cols=[:C]) @test_throws ArgumentError vcat(df1, df2, df2, cols=[:C, :C]) @test_throws ArgumentError reduce(vcat, [df1, df2, df2], cols=[:C, :C]) @test_throws ArgumentError reduce(vcat, (df1, df2, df2), cols=[:C, :C]) + + df1 = DataFrame(A = 1:3, B = 4:6) + df2 = DataFrame(A = 7:9) + df3 = DataFrame(A = 10:12, C = 13:15) + + @test vcat(df1, df2; cols = ["A", "B", "C"]) ≅ + DataFrame(A = [1, 2, 3, 7, 8, 9], + B = [4, 5, 6, missing, missing, missing], + C = [missing, missing, missing, missing, missing, missing]) + @test vcat(df1, df2; cols = ["A", "B", "C"]) ≅ + reduce(vcat, [df1, df2]; cols = ["A", "B", "C"]) + @test vcat(df1, df2; cols = ["A", "B", "C"]) ≅ + reduce(vcat, (df1, df2); cols = ["A", "B", "C"]) + + @test vcat(df1, df2, df3; cols = ["A", "B", "C"]) ≅ + DataFrame(A = [1, 2, 3, 7, 8, 9, 10, 11, 12], + B = [4, 5, 6, missing, missing, missing, missing, missing, missing], + C = [missing, missing, missing, missing, missing, missing, 13, 14, 15]) + @test vcat(df1, df2, df3; cols = ["A", "B", "C"]) ≅ + reduce(vcat, [df1, df2, df3]; cols = ["A", "B", "C"]) + @test vcat(df1, df2, df3; cols = ["A", "B", "C"]) ≅ + reduce(vcat, (df1, df2, df3); cols = ["A", "B", "C"]) + + df1 = DataFrame(A=Int[], B=Float64[]) + df2 = DataFrame(B=1.0, A=1) + @test vcat(df1, df2, df1, cols=["A", "C", "B"]) ≅ DataFrame(A=1, C=missing, B=1.0) + @test vcat(df1, df2, df1, cols=["A", "C", "B"]) ≅ + reduce(vcat, [df1, df2, df1], cols=["A", "C", "B"]) + @test vcat(df1, df2, df1, cols=["A", "C", "B"]) ≅ 
+ reduce(vcat, (df1, df2, df1), cols=["A", "C", "B"]) + @test vcat(df1, df2, df2, cols=["C"]) ≅ DataFrame(C=[missing, missing]) + @test vcat(df1, df2, df2, cols=["C"]) ≅ reduce(vcat, [df1, df2, df2], cols=["C"]) + @test vcat(df1, df2, df2, cols=["C"]) ≅ reduce(vcat, (df1, df2, df2), cols=["C"]) + @test_throws ArgumentError vcat(df1, df2, df2, cols=["C", "C"]) + @test_throws ArgumentError reduce(vcat, [df1, df2, df2], cols=["C", "C"]) + @test_throws ArgumentError reduce(vcat, (df1, df2, df2), cols=["C", "C"]) end @testset "vcat thrown exceptions" begin @@ -436,7 +478,8 @@ end err1 = @test_throws ArgumentError vcat(df1, df2, df2, df2, df2, df2) err2 = @test_throws ArgumentError reduce(vcat, [df1, df2, df2, df2, df2, df2]) @test_throws ArgumentError reduce(vcat, (df1, df2, df2, df2, df2, df2)) - @test err1.value.msg == err2.value.msg == "column(s) B are missing from argument(s) 2, 3, 4, 5 and 6" + @test err1.value.msg == err2.value.msg == + "column(s) B are missing from argument(s) 2, 3, 4, 5 and 6" # argument missing >1 columns df1 = DataFrame(A = 1:3, B = 1:3, C = 1:3, D = 1:3, E = 1:3) err = @test_throws ArgumentError vcat(df1, df2) @@ -454,32 +497,51 @@ end df1 = DataFrame(A = 1, B = 1, C = 1, D = 1) df2 = DataFrame(A = 1, C = 1, D = 1, E = 1, F = 1) err = @test_throws ArgumentError vcat(df1, df2) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, and" * + " column(s) B are missing from argument(s) 2" err = @test_throws ArgumentError vcat(df1, df1, df2, df2) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2," * + " and column(s) B are missing from argument(s) 3 and 4" df3 = DataFrame(A = 1, B = 1, C = 1, D = 1, E = 1) err = @test_throws ArgumentError 
vcat(df1, df2, df3) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, " * + "column(s) B are missing from argument(s) 2, and " * + "column(s) F are missing from argument(s) 3" err = @test_throws ArgumentError vcat(df1, df1, df2, df2, df3, df3) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2," * + " column(s) B are missing from argument(s) 3 and 4, " * + "and column(s) F are missing from argument(s) 5 and 6" err = @test_throws ArgumentError vcat(df1, df1, df1, df2, df2, df2, df3, df3, df3) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3," * + " column(s) B are missing from argument(s) 4, 5 and 6, and" * + " column(s) F are missing from argument(s) 7, 8 and 9" # df4 is a superset of names found in all other DataFrames and won't be shown in error df4 = DataFrame(A = 1, B = 1, C = 1, D = 1, E = 1, F = 1) err = @test_throws ArgumentError vcat(df1, df2, df3, df4) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, " * + "column(s) B are missing from argument(s) 2, and " * + "column(s) F are missing from argument(s) 3" err = @test_throws ArgumentError vcat(df1, df1, df2, df2, df3, df3, df4, df4) - @test err.value.msg == "column(s) E and F are missing 
from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6" - err = @test_throws ArgumentError vcat(df1, df1, df1, df2, df2, df2, df3, df3, df3, df4, df4, df4) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9" - err = @test_throws ArgumentError vcat(df1, df2, df3, df4, df1, df2, df3, df4, df1, df2, df3, df4) - @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11" + @test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2," * + " column(s) B are missing from argument(s) 3 and 4, and" * + " column(s) F are missing from argument(s) 5 and 6" + err = @test_throws ArgumentError vcat(df1, df1, df1, df2, df2, df2, + df3, df3, df3, df4, df4, df4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 " * + "and 3, column(s) B are missing from argument(s) 4, 5 " * + "and 6, and column(s) F are missing from argument(s) 7, 8 and 9" + err = @test_throws ArgumentError vcat(df1, df2, df3, df4, df1, df2, + df3, df4, df1, df2, df3, df4) + @test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9" * + ", column(s) B are missing from argument(s) 2, 6 and 10, " * + "and column(s) F are missing from argument(s) 3, 7 and 11" end @testset "vcat with view" begin x = view(DataFrame(A = Vector{Union{Missing, Int}}(1:3)), 2:2, :) y = DataFrame(A = 4:5) - @test vcat(x, y) == DataFrame(A = [2, 4, 5]) == reduce(vcat, [x, y]) == reduce(vcat, (x, y)) + @test vcat(x, y) == DataFrame(A = [2, 4, 5]) == + reduce(vcat, [x, y]) == reduce(vcat, (x, y)) end end # module diff --git a/test/constructors.jl b/test/constructors.jl index ccebfe9750..62559e1038 100644 --- a/test/constructors.jl 
+++ b/test/constructors.jl @@ -16,57 +16,54 @@ const ≅ = isequal @test index(df) == Index() @test size(DataFrame!()) == (0,0) - df = DataFrame(Any[CategoricalVector{Union{Float64, Missing}}(zeros(3)), - CategoricalVector{Union{Float64, Missing}}(ones(3))], - Index([:x1, :x2])) + vecvec = [CategoricalVector{Union{Float64, Missing}}(zeros(3)), + CategoricalVector{Union{Float64, Missing}}(ones(3))] + + df = DataFrame(collect(Any, vecvec), Index([:x1, :x2])) @test size(df, 1) == 3 @test size(df, 2) == 2 - df2 = DataFrame!(Any[CategoricalVector{Union{Float64, Missing}}(zeros(3)), - CategoricalVector{Union{Float64, Missing}}(ones(3))], - Index([:x1, :x2])) + df2 = DataFrame!(collect(Any, vecvec), Index([:x1, :x2])) @test size(df2, 1) == 3 @test size(df2, 2) == 2 - - @test df == DataFrame([CategoricalVector{Union{Float64, Missing}}(zeros(3)), - CategoricalVector{Union{Float64, Missing}}(ones(3))]) - @test df == DataFrame!([CategoricalVector{Union{Float64, Missing}}(zeros(3)), - CategoricalVector{Union{Float64, Missing}}(ones(3))]) - @test df == DataFrame([CategoricalVector{Union{Float64, Missing}}(zeros(3)), - CategoricalVector{Union{Float64, Missing}}(ones(3))], [:x1, :x2]) - @test df == DataFrame(Any[CategoricalVector{Union{Float64, Missing}}(zeros(3)), - CategoricalVector{Union{Float64, Missing}}(ones(3))]) - @test df == DataFrame(Any[CategoricalVector{Union{Float64, Missing}}(zeros(3)), - CategoricalVector{Union{Float64, Missing}}(ones(3))], [:x1, :x2]) - @test df == DataFrame(AbstractVector[CategoricalVector{Union{Float64, Missing}}(zeros(3)), - CategoricalVector{Union{Float64, Missing}}(ones(3))], [:x1, :x2]) - @test df == DataFrame((CategoricalVector{Union{Float64, Missing}}(zeros(3)), - CategoricalVector{Union{Float64, Missing}}(ones(3)))) - @test df == DataFrame((CategoricalVector{Union{Float64, Missing}}(zeros(3)), - CategoricalVector{Union{Float64, Missing}}(ones(3))), (:x1, :x2)) - @test df == DataFrame(x1 = Union{Int, Missing}[0.0, 0.0, 0.0], - x2 = 
Union{Int, Missing}[1.0, 1.0, 1.0]) - @test df == DataFrame!(x1 = Union{Int, Missing}[0.0, 0.0, 0.0], - x2 = Union{Int, Missing}[1.0, 1.0, 1.0]) - @test df == DataFrame([:x1=>Union{Int, Missing}[0.0, 0.0, 0.0], - :x2=>Union{Int, Missing}[1.0, 1.0, 1.0]]) - @test df == DataFrame!((:x1=>Union{Int, Missing}[0.0, 0.0, 0.0], - :x2=>Union{Int, Missing}[1.0, 1.0, 1.0])) + @test df2.x1 === vecvec[1] + @test df2.x2 === vecvec[2] + + for fun in (DataFrame, DataFrame!) + @test df == fun(vecvec) + @test df == fun(collect(Any, vecvec)) + @test df == fun(collect(AbstractVector, vecvec)) + @test df == fun(Tuple(vecvec)) + @test df == fun(x1 = vecvec[1], x2 = vecvec[2]) + + for cols in ([:x1, :x2], ["x1", "x2"]) + @test df == fun(vecvec, cols) + @test df == fun(collect(Any, vecvec), cols) + @test df == fun(collect(AbstractVector, vecvec), cols) + @test df == fun(Tuple(vecvec), Tuple(cols)) + @test df == fun([col=>vect for (col, vect) in zip(cols, vecvec)]) + end + end @test DataFrame([1:3, 1:3]) == DataFrame(Any[1:3, 1:3]) == DataFrame(UnitRange[1:3, 1:3]) == DataFrame(AbstractVector[1:3, 1:3]) == DataFrame([[1,2,3], [1,2,3]]) == DataFrame(Any[[1,2,3], [1,2,3]]) == DataFrame(([1,2,3], [1,2,3])) == DataFrame((1:3, 1:3)) == DataFrame((1:3, [1,2,3])) == DataFrame([1:3, [1,2,3]]) - DataFrame((:x1=>1:3, :x2=>[1,2,3])) == DataFrame([:x1=>1:3, :x2=>[1,2,3]]) + DataFrame((:x1=>1:3, :x2=>[1,2,3])) == DataFrame([:x1=>1:3, :x2=>[1,2,3]]) == + DataFrame(("x1"=>1:3, "x2"=>[1,2,3])) == DataFrame(["x1"=>1:3, "x2"=>[1,2,3]]) @inferred DataFrame([1:3, 1:3]) @inferred DataFrame((1:3, 1:3)) @inferred DataFrame([1:3, 1:3], [:a, :b]) @inferred DataFrame((1:3, 1:3), (:a, :b)) + @inferred DataFrame([1:3, 1:3], ["a", "b"]) + @inferred DataFrame((1:3, 1:3), ("a", "b")) @inferred DataFrame((:x1=>1:3, :x2=>[1,2,3])) @inferred DataFrame([:x1=>1:3, :x2=>[1,2,3]]) + @inferred DataFrame(("x1"=>1:3, "x2"=>[1,2,3])) + @inferred DataFrame(["x1"=>1:3, "x2"=>[1,2,3]]) @test df !== DataFrame(df) @test df == 
DataFrame(df) @@ -78,6 +75,14 @@ const ≅ = isequal @test df[!, :x1] == df2[!, :x1] @test df[!, :x2] == df2[!, :x2] + + df2 = convert(DataFrame, Union{Float64, Missing}[0.0 1.0; + 0.0 1.0; + 0.0 1.0]) + rename!(df2, ["x1", "x2"]) + @test df[!, "x1"] == df2[!, "x1"] + @test df[!, "x2"] == df2[!, "x2"] + df2 = DataFrame([0.0 1.0; 0.0 1.0; 0.0 1.0]) @@ -85,10 +90,24 @@ const ≅ = isequal @test df[!, :x1] == df2[!, :x1] @test df[!, :x2] == df2[!, :x2] + df2 = DataFrame([0.0 1.0; + 0.0 1.0; + 0.0 1.0]) + rename!(df2, ["x1", "x2"]) + @test df[!, "x1"] == df2[!, "x1"] + @test df[!, "x2"] == df2[!, "x2"] + @test_throws ArgumentError DataFrame!([0.0 1.0; 0.0 1.0; 0.0 1.0]) + df2 = DataFrame([0.0 1.0; + 0.0 1.0; + 0.0 1.0], ["a", "b"]) + rename!(df2, ["a", "b"]) + @test df[!, "x1"] == df2[!, "a"] + @test df[!, "x2"] == df2[!, "b"] + df2 = DataFrame([0.0 1.0; 0.0 1.0; 0.0 1.0], [:a, :b]) @@ -100,11 +119,25 @@ const ≅ = isequal 0.0 1.0; 0.0 1.0], [:a, :b]) + df2 = DataFrame([0.0 1.0; + 0.0 1.0; + 0.0 1.0], ["a", "b"]) + rename!(df2, ["a", "b"]) + @test df[!, "x1"] == df2[!, "a"] + @test df[!, "x2"] == df2[!, "b"] + + @test_throws ArgumentError DataFrame!([0.0 1.0; + 0.0 1.0; + 0.0 1.0], ["a", "b"]) + @test df == DataFrame(x1 = Union{Float64, Missing}[0.0, 0.0, 0.0], x2 = Union{Float64, Missing}[1.0, 1.0, 1.0]) @test df == DataFrame(x1 = Union{Float64, Missing}[0.0, 0.0, 0.0], x2 = Union{Float64, Missing}[1.0, 1.0, 1.0], x3 = Union{Float64, Missing}[2.0, 2.0, 2.0])[:, [:x1, :x2]] + @test df == DataFrame(x1 = Union{Float64, Missing}[0.0, 0.0, 0.0], + x2 = Union{Float64, Missing}[1.0, 1.0, 1.0], + x3 = Union{Float64, Missing}[2.0, 2.0, 2.0])[:, ["x1", "x2"]] @test_throws BoundsError SubDataFrame(DataFrame(A=1), 0:0, :) @test_throws ArgumentError SubDataFrame(DataFrame(A=1), 0, :) @@ -124,28 +157,28 @@ end df = DataFrame(x=x, y=y) @test size(df) == (3, 2) - @test names(df) == [:x, :y] + @test propertynames(df) == [:x, :y] @test df.x == x @test df.y == y @test df.x !== x @test df.y 
!== y df = DataFrame(x=x, y=y, copycols=true) @test size(df) == (3, 2) - @test names(df) == [:x, :y] + @test propertynames(df) == [:x, :y] @test df.x == x @test df.y == y @test df.x !== x @test df.y !== y df = DataFrame(x=x, y=y, copycols=false) @test size(df) == (3, 2) - @test names(df) == [:x, :y] + @test propertynames(df) == [:x, :y] @test df.x === x @test df.y === y @test_throws ArgumentError DataFrame(x=x, y=y, copycols=1) df = DataFrame!(x=x, y=y) @test size(df) == (3, 2) - @test names(df) == [:x, :y] + @test propertynames(df) == [:x, :y] @test df.x === x @test df.y === y @test_throws ArgumentError DataFrame!(x=x, y=y, copycols=true) @@ -198,20 +231,42 @@ end @test isequal(df, DataFrame(x1 = [0.0, 0.0, 0.0], x2 = [1.0, 1.0, 1.0])) df = DataFrame(:type => [], :begin => []) - @test names(df) == [:type, :begin] + @test propertynames(df) == [:type, :begin] a=[1,2,3] df = DataFrame(:a=>a, :b=>1, :c=>1:3) - @test names(df) == [:a, :b, :c] + @test propertynames(df) == [:a, :b, :c] @test df.a == a @test df.a !== a df = DataFrame(:a=>a, :b=>1, :c=>1:3, copycols=false) - @test names(df) == [:a, :b, :c] + @test propertynames(df) == [:a, :b, :c] @test df.a === a df = DataFrame!(:a=>a, :b=>1, :c=>1:3) - @test names(df) == [:a, :b, :c] + @test propertynames(df) == [:a, :b, :c] @test df.a === a + + df = DataFrame("x1" => zeros(3), "x2" => ones(3)) + @inferred DataFrame("x1" => zeros(3), "x2" => ones(3)) + @test size(df, 1) == 3 + @test size(df, 2) == 2 + @test isequal(df, DataFrame(x1 = [0.0, 0.0, 0.0], x2 = [1.0, 1.0, 1.0])) + + df = DataFrame("type" => [], "begin" => []) + @test propertynames(df) == [:type, :begin] + + a=[1,2,3] + df = DataFrame("a"=>a, "b"=>1, "c"=>1:3) + @test propertynames(df) == [:a, :b, :c] + @test df."a" == a + @test df."a" !== a + df = DataFrame("a"=>a, "b"=>1, "c"=>1:3, copycols=false) + @test propertynames(df) == [:a, :b, :c] + @test df."a" === a + + df = DataFrame!("a"=>a, "b"=>1, "c"=>1:3) + @test propertynames(df) == [:a, :b, :c] + @test 
df."a" === a end @testset "associative" begin @@ -222,16 +277,34 @@ end a=[1,2,3] df = DataFrame(Dict(:a=>a, :b=>1, :c=>1:3)) - @test names(df) == [:a, :b, :c] + @test propertynames(df) == [:a, :b, :c] @test df.a == a @test df.a !== a df = DataFrame(Dict(:a=>a, :b=>1, :c=>1:3), copycols=false) - @test names(df) == [:a, :b, :c] + @test propertynames(df) == [:a, :b, :c] @test df.a === a df = DataFrame!(Dict(:a=>a, :b=>1, :c=>1:3)) - @test names(df) == [:a, :b, :c] + @test propertynames(df) == [:a, :b, :c] @test df.a === a + + df = DataFrame(Dict("A" => 1:3, "B" => 4:6)) + @inferred DataFrame(Dict("A" => 1:3, "B" => 4:6)) + @test df == DataFrame(A = 1:3, B = 4:6) + @test eltype.(eachcol(df)) == [Int, Int] + + a=[1,2,3] + df = DataFrame(Dict("a"=>a, "b"=>1, "c"=>1:3)) + @test propertynames(df) == [:a, :b, :c] + @test df."a" == a + @test df."a" !== a + df = DataFrame(Dict("a"=>a, "b"=>1, "c"=>1:3), copycols=false) + @test propertynames(df) == [:a, :b, :c] + @test df."a" === a + + df = DataFrame!(Dict("a"=>a, "b"=>1, "c"=>1:3)) + @test propertynames(df) == [:a, :b, :c] + @test df."a" === a end @testset "vector constructors" begin @@ -239,80 +312,114 @@ end y = [1,2,3] df = DataFrame([x, y]) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 == x @test df.x2 == y @test df.x1 !== x @test df.x2 !== y df = DataFrame([x, y], copycols=true) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 == x @test df.x2 == y @test df.x1 !== x @test df.x2 !== y df = DataFrame([x, y], copycols=false) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 === x @test df.x2 === y df = DataFrame([x, y], [:x1, :x2]) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 == x @test df.x2 == y @test df.x1 !== x @test df.x2 !== y df = DataFrame([x, y], [:x1, :x2], copycols=true) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 == x @test 
df.x2 == y @test df.x1 !== x @test df.x2 !== y df = DataFrame([x, y], [:x1, :x2], copycols=false) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 === x @test df.x2 === y + df = DataFrame([x, y], ["x1", "x2"]) + @test names(df) == ["x1", "x2"] + @test df."x1" == x + @test df."x2" == y + @test df."x1" !== x + @test df."x2" !== y + df = DataFrame([x, y], ["x1", "x2"], copycols=true) + @test names(df) == ["x1", "x2"] + @test df."x1" == x + @test df."x2" == y + @test df."x1" !== x + @test df."x2" !== y + df = DataFrame([x, y], ["x1", "x2"], copycols=false) + @test names(df) == ["x1", "x2"] + @test df."x1" === x + @test df."x2" === y + df = DataFrame((x, y)) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 == x @test df.x2 == y @test df.x1 !== x @test df.x2 !== y df = DataFrame((x, y), copycols=true) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 == x @test df.x2 == y @test df.x1 !== x @test df.x2 !== y df = DataFrame((x, y), copycols=false) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 === x @test df.x2 === y df = DataFrame!((x, y)) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 === x @test df.x2 === y df = DataFrame((x, y), (:x1, :x2)) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 == x @test df.x2 == y @test df.x1 !== x @test df.x2 !== y df = DataFrame((x, y), (:x1, :x2), copycols=true) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 == x @test df.x2 == y @test df.x1 !== x @test df.x2 !== y df = DataFrame((x, y), (:x1, :x2), copycols=false) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 === x @test df.x2 === y + df = DataFrame((x, y), ("x1", "x2")) + @test names(df) == ["x1", "x2"] + @test df."x1" == x + @test df."x2" == y + @test df."x1" !== x + @test df."x2" !== y + df = 
DataFrame((x, y), ("x1", "x2"), copycols=true) + @test names(df) == ["x1", "x2"] + @test df."x1" == x + @test df."x2" == y + @test df."x1" !== x + @test df."x2" !== y + df = DataFrame((x, y), ("x1", "x2"), copycols=false) + @test names(df) == ["x1", "x2"] + @test df."x1" === x + @test df."x2" === y + df = DataFrame!((x, y), (:x1, :x2)) - @test names(df) == [:x1, :x2] + @test propertynames(df) == [:x1, :x2] @test df.x1 === x @test df.x2 === y @@ -320,6 +427,16 @@ end v = AbstractVector[1:3, [1,2,3]] @test DataFrame(v, n).x1 isa Vector{Int} @test v[1] isa AbstractRange + + df = DataFrame!((x, y), ("x1", "x2")) + @test names(df) == ["x1", "x2"] + @test df."x1" === x + @test df."x2" === y + + n = ["x1", "x2"] + v = AbstractVector[1:3, [1,2,3]] + @test DataFrame(v, n)."x1" isa Vector{Int} + @test v[1] isa AbstractRange end @testset "recyclers" begin @@ -337,6 +454,7 @@ end @test_throws ArgumentError f([1, 2, 3]) @test_throws DimensionMismatch f(AbstractVector[1:3, [1,2]]) @test_throws ArgumentError f([1:3, 1], [:x1, :x2]) + @test_throws ArgumentError f([1:3, 1], ["x1", "x2"]) @test_throws ErrorException f([1:3, 1]) end @@ -376,6 +494,17 @@ end @test all(ismissing, df[!, 2]) @test all(ismissing, df[!, 3]) + df = DataFrame([Union{Int, Missing}, Union{Float64, Missing}, Union{String, Missing}], + ["A", "B", "C"], 100) + @test size(df, 1) == 100 + @test size(df, 2) == 3 + @test typeof(df[!, "A"]) == Vector{Union{Int, Missing}} + @test typeof(df[!, "B"]) == Vector{Union{Float64, Missing}} + @test typeof(df[!, "C"]) == Vector{Union{String, Missing}} + @test all(ismissing, df[!, "A"]) + @test all(ismissing, df[!, "B"]) + @test all(ismissing, df[!, "C"]) + df = DataFrame([Union{Int, Missing}, Union{Float64, Missing}], [:x1, :x2], 2) @test size(df) == (2, 2) @test eltype.(eachcol(df)) == [Union{Int, Missing}, Union{Float64, Missing}] @@ -392,7 +521,16 @@ end @test typeof(df[!, 1]) == Vector{Union{Int, Missing}} @test typeof(df[!, 2]) == Vector{Union{Float64, Missing}} @test 
typeof(df[!, 3]) == Vector{Union{String, Missing}} - @test names(df) == [:A, :B, :C] + @test propertynames(df) == [:A, :B, :C] + + df = DataFrame([Union{Int, Missing}, Union{Float64, Missing}, Union{String, Missing}], + ["A", "B", "C"]) + @test size(df, 1) == 0 + @test size(df, 2) == 3 + @test typeof(df[!, "A"]) == Vector{Union{Int, Missing}} + @test typeof(df[!, "B"]) == Vector{Union{Float64, Missing}} + @test typeof(df[!, "C"]) == Vector{Union{String, Missing}} + @test names(df) == ["A", "B", "C"] end @testset "expansion of Ref and 0-dimensional arrays" begin diff --git a/test/conversions.jl b/test/conversions.jl index f4ee02d7d6..42defbef3b 100644 --- a/test/conversions.jl +++ b/test/conversions.jl @@ -100,7 +100,7 @@ const ≅ = isequal df = convert(DataFrame, di) @test isa(df, DataFrame) - @test names(df) == [Symbol(x) for x in sort(collect(keys(di)))] + @test names(df) == [x for x in sort(collect(keys(di)))] @test df[!, :a] == a @test df[!, :b] == b @test df[!, :c] == c @@ -108,7 +108,7 @@ const ≅ = isequal od = OrderedDict("c"=>c, "a"=>a, "b"=>b) df = convert(DataFrame,od) @test isa(df, DataFrame) - @test names(df) == [Symbol(x) for x in keys(od)] + @test names(df) == [x for x in keys(od)] @test df[!, :a] == a @test df[!, :b] == b @test df[!, :c] == c @@ -116,7 +116,7 @@ const ≅ = isequal sd = SortedDict("c"=>c, "a"=>a, "b"=>b) df = convert(DataFrame,sd) @test isa(df, DataFrame) - @test names(df) == [Symbol(x) for x in keys(sd)] + @test names(df) == [x for x in keys(sd)] @test df[!, :a] == a @test df[!, :b] == b @test df[!, :c] == c diff --git a/test/data.jl b/test/data.jl index 7dd6580cad..7de60ccacf 100644 --- a/test/data.jl +++ b/test/data.jl @@ -16,9 +16,9 @@ const ≅ = isequal #test_group("description functions") @test size(df6, 1) == 4 @test size(df6, 2) == 3 - @test names(df6) == [:A, :B, :C] - @test names(df2) == [:x1, :x2] - @test names(df7) == [:x, :y] + @test propertynames(df6) == [:A, :B, :C] + @test propertynames(df2) == [:x1, :x2] + @test 
propertynames(df7) == [:x, :y] #test_group("ref") @test df6[2, 3] == "two" @@ -41,7 +41,7 @@ const ≅ = isequal df6[!, :D] = [true, false, true, false] @test df6[1,4] select!(df6, Not(:D)) - @test names(df6) == [:A, :B, :C] + @test propertynames(df6) == [:A, :B, :C] @test size(df6, 2) == 3 #test_context("SubDataFrames") @@ -127,7 +127,8 @@ end @test_throws ArgumentError completecases(DataFrame()) @test_throws MethodError completecases(DataFrame(x=1), true) - for cols in (:x2, [:x2], [:x1, :x2], 2, [2], 1:2, [true, true], [false, true], :, + for cols in (:x2, "x2", [:x2], ["x2"], [:x1, :x2], ["x1", "x2"], 2, [2], 1:2, + [true, true], [false, true], :, r"x2", r"x", Not(1), Not([1]), Not(Int[]), Not([]), Not(Symbol[]), Not(1:0), Not([true, false]), Not(:x1), Not([:x1])) @test df2[completecases(df2, cols), :] == df2[[1, 2, 4], :] @@ -278,7 +279,9 @@ end @test findall(nonunique(df, :)) == collect(7:12) @test findall(nonunique(df, Colon())) == collect(7:12) @test findall(nonunique(df, :a)) == collect(3:12) + @test findall(nonunique(df, "a")) == collect(3:12) @test findall(nonunique(df, [:a, :c])) == collect(7:12) + @test findall(nonunique(df, ["a", "c"])) == collect(7:12) @test findall(nonunique(df, r"[ac]")) == collect(7:12) @test findall(nonunique(df, Not(2))) == collect(7:12) @test findall(nonunique(df, Not([2]))) == collect(7:12) @@ -295,6 +298,7 @@ end @test unique(df, 3) == df1[1:3,:] @test unique(df, [1, 3]) == df1 @test unique(df, [:a, :c]) == df1 + @test unique(df, ["a", "c"]) == df1 @test unique(df, r"[ac]") == df1 @test unique(df, Not(2)) == df1 @test unique(df, Not([2])) == df1 @@ -302,9 +306,13 @@ end @test unique(df, Not([:b])) == df1 @test unique(df, Not([false, true, false])) == df1 @test unique(df, :a) == df1[1:2,:] + @test unique(df, "a") == df1[1:2,:] @test_throws ArgumentError unique(DataFrame()) @test_throws ArgumentError nonunique(DataFrame()) + @test unique(copy(df1), "a") == unique(copy(df1), :a) == unique(copy(df1), 1) == + df1[1:2, :] + 
unique!(df, [1, 3]) @test df == df1 for cols in (r"[ac]", Not(:b), Not(2), Not([:b]), Not([2]), Not([false, true, false])) @@ -323,6 +331,10 @@ end @test filter(:x => x -> x > 1, df) == DataFrame(x = [3, 2], y = ["b", "a"]) @test filter!(:x => x -> x > 1, df) === df == DataFrame(x = [3, 2], y = ["b", "a"]) + df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"]) + @test filter("x" => x -> x > 1, df) == DataFrame(x = [3, 2], y = ["b", "a"]) + @test filter!("x" => x -> x > 1, df) === df == DataFrame(x = [3, 2], y = ["b", "a"]) + df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"]) @test filter(1 => x -> x > 1, df) == DataFrame(x = [3, 2], y = ["b", "a"]) @test filter!(1 => x -> x > 1, df) === df == DataFrame(x = [3, 2], y = ["b", "a"]) @@ -331,6 +343,10 @@ end @test filter([:x] => x -> x > 1, df) == DataFrame(x = [3, 2], y = ["b", "a"]) @test filter!([:x] => x -> x > 1, df) === df == DataFrame(x = [3, 2], y = ["b", "a"]) + df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"]) + @test filter(["x"] => x -> x > 1, df) == DataFrame(x = [3, 2], y = ["b", "a"]) + @test filter!(["x"] => x -> x > 1, df) === df == DataFrame(x = [3, 2], y = ["b", "a"]) + df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"]) @test filter((:) => (r...) -> r[1] > 1, df) == DataFrame(x = [3, 2], y = ["b", "a"]) @test filter!((:) => (r...) 
-> r[1] > 1, df) === df == DataFrame(x = [3, 2], y = ["b", "a"]) @@ -343,21 +359,27 @@ end @test filter([2, 2] => !=, df) == DataFrame(x=Int[], y=String[]) @test filter!([2, 2] => !=, df) === df == DataFrame(x=Int[], y=String[]) - for sel in [r"x", [1,2], [:x1, :x2], :, Not(r"y")] + for sel in [r"x", [1,2], [:x1, :x2], ["x1", "x2"], :, Not(r"y")] df = DataFrame(x1 = [3, 1, 2, 1], x2 = ["b", "c", "aa", "bbb"]) - @test filter(sel => (a, b) -> a == length(b), df) == DataFrame(x1=[1, 2], x2=["c", "aa"]) - @test filter!(sel => (a, b) -> a == length(b), df) === df == DataFrame(x1=[1, 2], x2=["c", "aa"]) + @test filter(sel => (a, b) -> a == length(b), df) == + DataFrame(x1=[1, 2], x2=["c", "aa"]) + @test filter!(sel => (a, b) -> a == length(b), df) === df == + DataFrame(x1=[1, 2], x2=["c", "aa"]) end df = DataFrame(x = [3, 1, 2, 1, missing], y = ["b", "c", "a", "b", "c"]) @test_throws TypeError filter(r -> r[:x] > 1, df) @test_throws TypeError filter!(r -> r[:x] > 1, df) @test_throws TypeError filter(:x => x -> x > 1, df) + @test_throws TypeError filter("x" => x -> x > 1, df) @test_throws TypeError filter!(:x => x -> x > 1, df) + @test_throws TypeError filter!("x" => x -> x > 1, df) @test_throws TypeError filter(1 => x -> x > 1, df) @test_throws TypeError filter!(1 => x -> x > 1, df) @test_throws TypeError filter([:x] => x -> x > 1, df) + @test_throws TypeError filter(["x"] => x -> x > 1, df) @test_throws TypeError filter!([:x] => x -> x > 1, df) + @test_throws TypeError filter!(["x"] => x -> x > 1, df) @test_throws TypeError filter((:) => (r...) -> r[1] > 1, df) @test_throws TypeError filter!((:) => (r...) 
-> r[1] > 1, df) end @@ -375,6 +397,12 @@ end filter!(AsTable(:x) => testfun, df) @test df == DataFrame(x=[3, 2], y=["b", "a"]) + df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"]) + + @test filter(AsTable("x") => testfun, df) == DataFrame(x=[3, 2], y=["b", "a"]) + filter!(AsTable("x") => testfun, df) + @test df == DataFrame(x=[3, 2], y=["b", "a"]) + @test_throws ArgumentError filter([] => () -> true, df) @test_throws ArgumentError filter(AsTable(r"z") => () -> true, df) @test_throws ArgumentError filter!([] => () -> true, df) @@ -385,17 +413,17 @@ end df = DataFrame(a = 1, x1 = 2, x2 = 3, x3 = 4, x4 = 5) for v in [df, groupby(df, :a)] - @test names(v, All()) == names(v, :) == names(v) == [:a, :x1, :x2, :x3, :x4] - @test names(v, Between(:x1, :x3)) == [:x1, :x2, :x3] - @test names(v, Not(:a)) == names(v, r"x") == [:x1, :x2, :x3, :x4] - @test names(v, :x1) == names(v, 2) == [:x1] + @test names(v, All()) == names(v, :) == names(v) == ["a", "x1", "x2", "x3", "x4"] + @test names(v, Between(:x1, :x3)) == ["x1", "x2", "x3"] + @test names(v, Not(:a)) == names(v, r"x") == ["x1", "x2", "x3", "x4"] + @test names(v, :x1) == names(v, 2) == ["x1"] end for v in [view(df, :, [4,3,2,1]), groupby(view(df, :, [4,3,2,1]), 1), view(df, 1, [4,3,2,1])] - @test names(v, All()) == names(v, :) == names(v) == [:x3, :x2, :x1, :a] - @test names(v, Between(:x2, :x1)) == [:x2, :x1] - @test names(v, Not(:a)) == names(v, r"x") == [:x3, :x2, :x1] - @test names(v, :x1) == names(v, 3) == [:x1] + @test names(v, All()) == names(v, :) == names(v) == ["x3", "x2", "x1", "a"] + @test names(v, Between(:x2, :x1)) == ["x2", "x1"] + @test names(v, Not(:a)) == names(v, r"x") == ["x3", "x2", "x1"] + @test names(v, :x1) == names(v, 3) == ["x1"] end end diff --git a/test/dataframe.jl b/test/dataframe.jl index ecee728460..10dbe889f8 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -27,12 +27,13 @@ const ≇ = !isequal i = lcm(cyclelength) while true rename!(df, p) - @test sort(names(df)) == n + 
@test sort(propertynames(df)) == n + @test sort(names(df)) == string.(n) @test sort(collect(keys(index(df).lookup))) == n @test sort(collect(values(index(df).lookup))) == 1:26 - @test all(index(df).lookup[x] == i for (i,x) in enumerate(names(df))) + @test all(index(df).lookup[x] == i for (i,x) in enumerate(propertynames(df))) i -= 1 - names(df) == n && break + propertynames(df) == n && break end @test i == 0 end @@ -52,23 +53,23 @@ end newnames = [Symbol.(rand('a':'z', 4)); oldnames[5:end]] df = DataFrame([[] for i in 1:8], oldnames) if allunique(newnames) - @test names(rename(df, Pair.(oldnames[1:4], newnames[1:4])...)) == newnames - @test names(df) == oldnames + @test names(rename(df, Pair.(oldnames[1:4], newnames[1:4])...)) == string.(newnames) + @test propertynames(df) == oldnames rename!(df, Pair.(oldnames[1:4], newnames[1:4])...) - @test names(df) == newnames + @test propertynames(df) == newnames else @test_throws ArgumentError rename(df, Pair.(oldnames[1:4], newnames[1:4])...) - @test names(df) == oldnames + @test propertynames(df) == oldnames @test_throws ArgumentError rename!(df, Pair.(oldnames[1:4], newnames[1:4])...) - @test names(df) == oldnames + @test propertynames(df) == oldnames end newnames = [oldnames[1:2]; reverse(oldnames[3:6]); oldnames[7:end]] df = DataFrame([[] for i in 1:8], oldnames) - @test names(rename(df, Pair.(oldnames[3:6], newnames[3:6])...)) == newnames - @test names(df) == oldnames + @test names(rename(df, Pair.(oldnames[3:6], newnames[3:6])...)) == string.(newnames) + @test propertynames(df) == oldnames rename!(df, Pair.(oldnames[3:6], newnames[3:6])...) 
- @test names(df) == newnames + @test propertynames(df) == newnames end end @@ -127,20 +128,20 @@ end df[1, :a] = 4 df[1, :b][!, :e] .= 5 - @test names(rename(df, [:f, :g])) == [:f, :g] - @test names(rename(df, [:f, :f], makeunique=true)) == [:f, :f_1] - @test names(df) == [:a, :b] + @test names(rename(df, [:f, :g])) == ["f", "g"] + @test names(rename(df, [:f, :f], makeunique=true)) == ["f", "f_1"] + @test names(df) == ["a", "b"] rename!(df, [:f, :g]) - @test names(dfc) == [:a, :b] - @test names(dfdc) == [:a, :b] + @test names(dfc) == ["a", "b"] + @test names(dfdc) == ["a", "b"] @test dfc[1, :a] === 2 @test dfdc[1, :a] === 2 - @test names(dfc[1, :b]) == [:c, :e] - @test names(dfdc[1, :b]) == [:c] + @test names(dfc[1, :b]) == ["c", "e"] + @test names(dfdc[1, :b]) == ["c"] end @testset "similar / missings" begin @@ -171,14 +172,14 @@ end @test_throws ArgumentError insertcols!(df, 0, :newcol => ["a", "b"]) @test_throws DimensionMismatch insertcols!(df, 1, :newcol => ["a"]) @test insertcols!(df, 1, :newcol => ["a", "b"]) == df - @test names(df) == [:newcol, :a, :b] + @test names(df) == ["newcol", "a", "b"] @test df.a == [1, 2] @test df.b == [3.0, 4.0] @test df.newcol == ["a", "b"] @test_throws ArgumentError insertcols!(df, 1, :newcol => ["a1", "b1"]) @test insertcols!(df, 1, :newcol => ["a1", "b1"], makeunique=true) == df - @test names(df) == [:newcol_1, :newcol, :a, :b] + @test propertynames(df) == [:newcol_1, :newcol, :a, :b] @test df.a == [1, 2] @test df.b == [3.0, 4.0] @test df.newcol == ["a", "b"] @@ -193,13 +194,40 @@ end @test insertcols!(df, 1, :c3 => x, copycols=false) === df @test df.c3 === x + df = DataFrame(a=Union{Int, Missing}[1, 2], b=Union{Float64, Missing}[3.0, 4.0]) + @test_throws ArgumentError insertcols!(df, 5, "newcol" => ["a", "b"]) + @test_throws ArgumentError insertcols!(df, 0, "newcol" => ["a", "b"]) + @test_throws DimensionMismatch insertcols!(df, 1, "newcol" => ["a"]) + @test insertcols!(df, 1, "newcol" => ["a", "b"]) == df + @test names(df) 
== ["newcol", "a", "b"] + @test df.a == [1, 2] + @test df.b == [3.0, 4.0] + @test df.newcol == ["a", "b"] + + @test_throws ArgumentError insertcols!(df, 1, "newcol" => ["a1", "b1"]) + @test insertcols!(df, 1, "newcol" => ["a1", "b1"], makeunique=true) == df + @test propertynames(df) == [:newcol_1, :newcol, :a, :b] + @test df.a == [1, 2] + @test df.b == [3.0, 4.0] + @test df.newcol == ["a", "b"] + @test df.newcol_1 == ["a1", "b1"] + + @test insertcols!(df, 1, "c1" => 1:2) === df + @test df.c1 isa Vector{Int} + x = [1, 2] + @test insertcols!(df, 1, "c2" => x, copycols=true) === df + @test df.c2 == x + @test df.c2 !== x + @test insertcols!(df, 1, "c3" => x, copycols=false) === df + @test df.c3 === x + df = DataFrame(a=[1,2], a_1=[3,4]) @test_throws ArgumentError insertcols!(df, 1, :a => [11,12]) @test df == DataFrame(a=[1,2], a_1=[3,4]) insertcols!(df, 1, :a => [11,12], makeunique=true) - @test names(df) == [:a_2, :a, :a_1] + @test propertynames(df) == [:a_2, :a, :a_1] insertcols!(df, 4, :a => [11,12], makeunique=true) - @test names(df) == [:a_2, :a, :a_1, :a_3] + @test propertynames(df) == [:a_2, :a, :a_1, :a_3] @test_throws ArgumentError insertcols!(df, 10, :a => [11,12], makeunique=true) # TODO: re-enable this test after the deprecation; this should be no-op @@ -249,6 +277,10 @@ end @test insertcols!(df, 2, :a=>v1, :b=>v2, :c=>v3) == DataFrame(p='a':'b', a=v1, b=v2, c=v3, q='r':'s') + df = DataFrame(p='a':'b',q='r':'s') + @test insertcols!(df, 2, "a"=>v1, "b"=>v2, "c"=>v3) == + DataFrame(p='a':'b', a=v1, b=v2, c=v3, q='r':'s') + df = DataFrame(p='a':'b',q='r':'s') @test_throws ArgumentError insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3) @test insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3, makeunique=true, copycols=true) == @@ -264,10 +296,10 @@ end df = DataFrame(a=[1,2], a_1=[3,4]) insertcols!(df, 1, :a => 11, makeunique=true) - @test names(df) == [:a_2, :a, :a_1] + @test propertynames(df) == [:a_2, :a, :a_1] @test df[!, 1] == [11, 11] insertcols!(df, 4, :a => 12, 
makeunique=true) - @test names(df) == [:a_2, :a, :a_1, :a_3] + @test propertynames(df) == [:a_2, :a, :a_1, :a_3] @test df[!, 4] == [12, 12] df = DataFrame() @test insertcols!(df, :a => "a", :b => 1:2) == DataFrame(a=["a", "a"], b=1:2) @@ -638,7 +670,10 @@ end # Test that it works on a custom function describe_output.test_std = describe_output.std # Test that describe works with a Pair and a symbol - @test describe_output[:, [:variable, :mean, :test_std]] ≅ describe(df, :mean, :test_std => std) + @test describe_output[:, [:variable, :mean, :test_std]] ≅ + describe(df, :mean, :test_std => std) + @test describe_output[:, [:variable, :mean, :test_std]] ≅ + describe(df, :mean, "test_std" => std) # Test that describe works with a dataframe with no observations df = DataFrame(a = Int[], b = String[], c = []) @@ -647,6 +682,7 @@ end @test describe(df, :all, cols=Not(1)) ≅ describe(select(df, Not(1)), :all) @test describe(df, cols=Not(1)) ≅ describe(select(df, Not(1))) + @test describe(df, cols=Not("a")) ≅ describe(select(df, Not(1))) @test_throws ArgumentError describe(df, :mean, :all) end @@ -661,12 +697,12 @@ end @test_throws InexactError append!(df, DataFrame(A = 3:4, B = [3.5, 4.5])) end @test df == df2 - @test occursin("Error adding value to column B", String(take!(buf))) + @test occursin("Error adding value to column :B", String(take!(buf))) with_logger(sl) do @test_throws MethodError append!(df, DataFrame(A = 3:4, B = ["a", "b"])) end @test df == df2 - @test occursin("Error adding value to column B", String(take!(buf))) + @test occursin("Error adding value to column :B", String(take!(buf))) @test_throws ArgumentError append!(df, DataFrame(A = 1:4, C = 1:4)) @test df == df2 @@ -689,7 +725,7 @@ end @test_throws AssertionError append!(df, dfc) end @test df == dfc - @test occursin("Error adding value to column a", String(take!(buf))) + @test occursin("Error adding value to column :a", String(take!(buf))) df = DataFrame() df.a = [1,2,3,4] @@ -700,7 +736,7 @@ end 
@test_throws AssertionError append!(df, dfc) end @test df == dfc - @test occursin("Error adding value to column a", String(take!(buf))) + @test occursin("Error adding value to column :a", String(take!(buf))) rename!(df, [:a, :b, :z]) @test_throws ArgumentError append!(df, dfc) @@ -946,41 +982,39 @@ end @testset "rename" begin df = DataFrame(A = 1:3, B = 'A':'C') - @test names(rename(df, :A => :A_1)) == [:A_1, :B] - @test names(df) == [:A, :B] - @test names(rename(df, :A => :A_1, :B => :B_1)) == [:A_1, :B_1] - @test names(df) == [:A, :B] - @test names(rename(df, [:A => :A_1, :B => :B_1])) == [:A_1, :B_1] - @test names(df) == [:A, :B] - @test names(rename(df, Dict(:A => :A_1, :B => :B_1))) == [:A_1, :B_1] - @test names(df) == [:A, :B] - @test names(rename(x->Symbol(lowercase(string(x))), df)) == [:a, :b] - @test names(rename(x->lowercase(string(x)), df)) == [:a, :b] - @test names(df) == [:A, :B] + @test names(rename(df, :A => :A_1)) == ["A_1", "B"] + @test names(df) == ["A", "B"] + @test names(rename(df, :A => :A_1, :B => :B_1)) == ["A_1", "B_1"] + @test names(df) == ["A", "B"] + @test names(rename(df, [:A => :A_1, :B => :B_1])) == ["A_1", "B_1"] + @test names(df) == ["A", "B"] + @test names(rename(df, Dict(:A => :A_1, :B => :B_1))) == ["A_1", "B_1"] + @test names(df) == ["A", "B"] + @test names(rename(lowercase, df)) == ["a", "b"] + @test names(df) == ["A", "B"] @test rename!(df, :A => :A_1) === df - @test names(df) == [:A_1, :B] + @test propertynames(df) == [:A_1, :B] @test rename!(df, :A_1 => :A_2, :B => :B_2) === df - @test names(df) == [:A_2, :B_2] + @test propertynames(df) == [:A_2, :B_2] @test rename!(df, [:A_2 => :A_3, :B_2 => :B_3]) === df - @test names(df) == [:A_3, :B_3] + @test propertynames(df) == [:A_3, :B_3] @test rename!(df, Dict(:A_3 => :A_4, :B_3 => :B_4)) === df - @test names(df) == [:A_4, :B_4] - @test rename!(x->Symbol(lowercase(string(x))), df) === df - @test rename!(x->lowercase(string(x)), df) === df - @test names(df) == [:a_4, :b_4] + @test 
propertynames(df) == [:A_4, :B_4] + @test rename!(lowercase, df) === df + @test propertynames(df) == [:a_4, :b_4] df = DataFrame(A = 1:3, B = 'A':'C', C = [:x, :y, :z]) @test rename!(df, :A => :B, :B => :A) === df - @test names(df) == [:B, :A, :C] + @test propertynames(df) == [:B, :A, :C] @test rename!(df, :A => :B, :B => :A, :C => :D) === df - @test names(df) == [:A, :B, :D] + @test propertynames(df) == [:A, :B, :D] @test rename!(df, :A => :B, :B => :C, :D => :A) === df - @test names(df) == [:B, :C, :A] + @test propertynames(df) == [:B, :C, :A] @test rename!(df, :A => :C, :B => :A, :C => :B) === df - @test names(df) == [:A, :B, :C] + @test propertynames(df) == [:A, :B, :C] @test rename!(df, :A => :A, :B => :B, :C => :C) === df - @test names(df) == [:A, :B, :C] + @test propertynames(df) == [:A, :B, :C] cdf = copy(df) @test_throws ArgumentError rename!(df, :X => :Y) @@ -999,6 +1033,9 @@ end @test df == cdf @test_throws ArgumentError rename!(df, :A => :B, :B => :A, :A => :X) @test df == cdf + + df = DataFrame(A=1) + @test rename(x -> 1, df) == DataFrame(Symbol("1") => 1) end @testset "flexible rename arguments" begin @@ -1156,6 +1193,14 @@ end @test eltype(df.b) == Union{Int, Missing} @test eltype(df.c) == Int @test eltype(df.d) == Int + @test allowmissing!(df, [:d]) === df + @test eltype(df.b) == Union{Int, Missing} + @test eltype(df.c) == Int + @test eltype(df.d) == Union{Int, Missing} + @test disallowmissing!(df, [:c, :d], error=em) === df + @test eltype(df.b) == Union{Int, Missing} + @test eltype(df.c) == Int + @test eltype(df.d) == Int @test allowmissing!(df, [false, false, true]) === df @test eltype(df.b) == Union{Int, Missing} @test eltype(df.c) == Int @@ -1166,6 +1211,26 @@ end @test eltype(df.d) == Union{Int, Missing} end + for em in [true, false] + df = DataFrame(b=[1,2], c=[1,2], d=[1,2]) + @test allowmissing!(df, ["b", "c"]) === df + @test eltype(df.b) == Union{Int, Missing} + @test eltype(df.c) == Union{Int, Missing} + @test eltype(df.d) == Int + @test 
disallowmissing!(df, "c", error=em) === df + @test eltype(df.b) == Union{Int, Missing} + @test eltype(df.c) == Int + @test eltype(df.d) == Int + @test allowmissing!(df, ["d"]) === df + @test eltype(df.b) == Union{Int, Missing} + @test eltype(df.c) == Int + @test eltype(df.d) == Union{Int, Missing} + @test disallowmissing!(df, ["c", "d"], error=em) === df + @test eltype(df.b) == Union{Int, Missing} + @test eltype(df.c) == Int + @test eltype(df.d) == Int + end + df = DataFrame(x=[1], y = Union{Int,Missing}[1], z=[missing]) disallowmissing!(df, error=false) @test eltype(df.x) == Int @@ -1202,7 +1267,7 @@ end @test eltype.(eachcol(y)) == [Int, Int, Int] end - for colsel in [:x, 1, [:x], [1], [true, false, false], r"x", Not(2:3)] + for colsel in [:x, "x", 1, [:x], ["x"], [1], [true, false, false], r"x", Not(2:3)] y = disallowmissing(x, colsel, error=em) @test y isa DataFrame @test x == y @@ -1212,7 +1277,7 @@ end @test eltype.(eachcol(y)) == [Int, Union{Missing, Int}, Int] end - for colsel in [:z, 3, [:z], [3], [false, false, true], r"z", Not(1:2)] + for colsel in [:z, "z", 3, [:z], ["z"], [3], [false, false, true], r"z", Not(1:2)] y = disallowmissing(x, colsel, error=em) @test y isa DataFrame @test x == y @@ -1273,7 +1338,7 @@ end @test eltype.(eachcol(y)) == fill(Union{Missing, Int}, 3) end - for colsel in [:x, 1, [:x], [1], [true, false, false], r"x", Not(2:3)] + for colsel in [:x, "x", 1, [:x], ["x"], [1], [true, false, false], r"x", Not(2:3)] y = allowmissing(x, colsel) @test y isa DataFrame @test x == y @@ -1283,7 +1348,7 @@ end @test eltype.(eachcol(y)) == [Union{Missing, Int}, Int, Int] end - for colsel in [:z, 3, [:z], [3], [false, false, true], r"z", Not(1:2)] + for colsel in [:z, "z", 3, [:z], ["z"], [3], [false, false, true], r"z", Not(1:2)] y = allowmissing(x, colsel) @test y isa DataFrame @test x == y @@ -1342,7 +1407,7 @@ end @test y.z isa CategoricalVector{Int} end - for colsel in [:x, 1, [:x], [1], [true, false, false], r"x", Not(2:3)] + for colsel in 
[:x, "x", 1, [:x], ["x"], [1], [true, false, false], r"x", Not(2:3)] y = categorical(x, colsel) @test y isa DataFrame @test x ≅ y @@ -1354,7 +1419,7 @@ end @test y.z isa Vector{Int} end - for colsel in [:z, 3, [:z], [3], [false, false, true], r"z", Not(1:2)] + for colsel in [:z, "z", 3, [:z], ["z"], [3], [false, false, true], r"z", Not(1:2)] y = categorical(x, colsel) @test y isa DataFrame @test x ≅ y @@ -1527,7 +1592,7 @@ end z = collect(10:-1:1) df = DataFrame(x=x, y=y, copycols=false) - @test Base.propertynames(df) == Tuple(names(df)) + @test propertynames(df) == Symbol.(names(df)) @test df.x === x @test df.y === y diff --git a/test/dataframerow.jl b/test/dataframerow.jl index 6a6ffe6928..7db9477671 100644 --- a/test/dataframerow.jl +++ b/test/dataframerow.jl @@ -13,10 +13,10 @@ ref_df = DataFrame(a=Union{Int, Missing}[1, 2, 3, 1, 2, 2], df = deepcopy(ref_df) sdf = view(df, [5, 3], [3, 1, 2]) - @test names(DataFrameRow(df, 1, :)) == [:a, :b, :c, :d] + @test names(DataFrameRow(df, 1, :)) == ["a", "b", "c", "d"] @test DataFrameRow(df, 1) == DataFrameRow(df, 1, :) @test DataFrameRow(df, 1) == DataFrameRow(df, 1, r"") - @test names(DataFrameRow(df, 3, [3, 2])) == [:c, :b] + @test names(DataFrameRow(df, 3, [3, 2])) == ["c", "b"] @test copy(DataFrameRow(df, 3, [3, 2])) == (c = "C", b = 1.2) @test copy(DataFrameRow(df, 3, r"[bc]")) == (b = 1.2, c = "C") @test copy(DataFrameRow(sdf, 2, [3, 2])) == (b = 1.2, a = 3) @@ -108,7 +108,7 @@ end df = DataFrame([1 2 3 4 5 6 7 8]) r = df[1, r"[1-3]"] - @test names(r) == [:x1, :x2, :x3] + @test names(r) == ["x1", "x2", "x3"] r[:] .= 10 @test df == DataFrame([10 10 10 4 5 6 7 8]) @@ -206,7 +206,7 @@ end df = deepcopy(ref_df) r = DataFrameRow(df, 1, :) - @test Base.propertynames(r) == Tuple(names(df)) + @test propertynames(r) == keys(r) == Symbol.(names(df)) @test r.a === 1 @test r.b === 2.0 @test copy(r[[:a,:b]]) === (a=1, b=2.0) @@ -221,7 +221,7 @@ end df = deepcopy(ref_df) r = DataFrameRow(df, 1, :) - @test keys(r) == 
Tuple(names(df)) + @test keys(r) == propertynames(df) @test values(r) == (df[1, 1], df[1, 2], df[1, 3], df[1, 4]) @test collect(pairs(r)) == [:a=>df[1, 1], :b=>df[1, 2], :c=>df[1, 3], :d=>df[1, 4]] @@ -247,7 +247,7 @@ end @test size(r) == (4,) @test size(r, 1) == 4 @test_throws BoundsError size(r, 2) - @test keys(r) == (:x8, :x5, :x1, :x3) + @test keys(r) == [:x8, :x5, :x1, :x3] r[:] .= 0.0 r[1:2] .= 2.0 @test values(r) == (2.0, 2.0, 0.0, 0.0) diff --git a/test/deprecated.jl b/test/deprecated.jl index ac1c925c3b..39236be972 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -407,7 +407,7 @@ end @test_throws ArgumentError df[4] = [1, 2, 3] df[3] = [1,2,3] df[4] = [1,2,3] - @test names(df) == [:x3, :x3_1, :x3_2, :x4] + @test propertynames(df) == [:x3, :x3_1, :x3_2, :x4] df = DataFrame() @test_throws MethodError df[true] = 1 @test_throws MethodError df[true] = [1,2,3] @@ -499,24 +499,24 @@ end @test d1m == melt(d1, r"[cde]") @test d1s == d1m d1m = melt(d1[:, [1,3,4]], :a) - @test names(d1m) == [:a, :variable, :value] + @test propertynames(d1m) == [:a, :variable, :value] d1m_named = melt(d1[:, [1,3,4]], :a, variable_name=:letter, value_name=:someval) - @test names(d1m_named) == [:a, :letter, :someval] + @test propertynames(d1m_named) == [:a, :letter, :someval] dx = melt(d1, [], [:a]) @test dx == melt(d1, r"xxx", r"a") @test size(dx) == (12, 2) - @test names(dx) == [:variable, :value] + @test propertynames(dx) == [:variable, :value] dx = melt(d1, :a, []) @test dx == stack(d1, r"xxx", r"a") @test size(dx) == (0, 3) - @test names(dx) == [:a, :variable, :value] + @test propertynames(dx) == [:a, :variable, :value] d1m = melt(d1, [:c, :d, :e], view=true) @test d1m == melt(d1, r"[cde]", view=true) d1m = melt(d1[:, [1,3,4]], :a, view=true) - @test names(d1m) == [:a, :variable, :value] + @test propertynames(d1m) == [:a, :variable, :value] d1m_named = melt(d1, [:c, :d, :e], variable_name=:letter, value_name=:someval, view=true) @test d1m_named == melt(d1, r"[cde]", 
variable_name=:letter, value_name=:someval, view=true) - @test names(d1m_named) == [:c, :d, :e, :letter, :someval] + @test propertynames(d1m_named) == [:c, :d, :e, :letter, :someval] df1 = melt(DataFrame(rand(10,10))) df1[!, :id] = 1:100 @test size(unstack(df1, :variable, :value)) == (100, 11) @@ -678,11 +678,11 @@ end # Check column names anonf = x -> sum(x) adf = aggregate(df7, :d2, [mean, anonf]) - @test names(adf) == [:d2, :d1_mean, :d3_mean, - :d1_function, :d3_function] + @test propertynames(adf) == [:d2, :d1_mean, :d3_mean, + :d1_function, :d3_function] adf = aggregate(df7, :d2, [mean, mean, anonf, anonf]) - @test names(adf) == [:d2, :d1_mean, :d3_mean, :d1_mean_1, :d3_mean_1, - :d1_function, :d3_function, :d1_function_1, :d3_function_1] + @test propertynames(adf) == [:d2, :d1_mean, :d3_mean, :d1_mean_1, :d3_mean_1, + :d1_function, :d3_function, :d1_function_1, :d3_function_1] df9 = aggregate(df7, :d2, [sum, length], sort=true) @test df9 ≅ df8 @@ -690,12 +690,12 @@ end @test df9′ ≅ df8 end -global_logger(old_logger) - @testset "deprecated deleterows!" 
begin @test deleterows!(DataFrame(x=[1, 2]), 1) == deleterows!(DataFrame(x=[1, 2]), [1]) == deleterows!(DataFrame(x=[1, 2]), [true, false]) == DataFrame(x=[2]) end +global_logger(old_logger) + end # module diff --git a/test/grouping.jl b/test/grouping.jl index 4993fc15e9..c9587302c6 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -118,9 +118,9 @@ end for cols in ([:a, :b], [:b, :a], [:a, :c], [:c, :a], [1, 2], [2, 1], [1, 3], [3, 1], [true, true, false, false], [true, false, true, false]) - colssym = names(df[!, cols]) + colssym = propertynames(df[!, cols]) hcatdf = hcat(df[!, cols], df[!, Not(cols)]) - nms = names(hcatdf) + nms = propertynames(hcatdf) res = unique(df[:, cols]) res.xmax = [maximum(df[(df[!, colssym[1]] .== a) .& (df[!, colssym[2]] .== b), :x]) for (a, b) in zip(res[!, colssym[1]], res[!, colssym[2]])] @@ -173,7 +173,7 @@ end # groupby() without groups sorting gd = groupby_checked(df, cols) - @test names(parent(gd))[gd.cols] == colssym + @test names(parent(gd))[gd.cols] == string.(colssym) df_comb = combine(identity, gd) @test sort(df_comb, colssym) == shcatdf df_ref = DataFrame(gd) @@ -190,7 +190,7 @@ end # groupby() with groups sorting gd = groupby_checked(df, cols, sort=true) - @test names(parent(gd))[gd.cols] == colssym + @test names(parent(gd))[gd.cols] == string.(colssym) for i in 1:length(gd) @test all(gd[i][!, colssym[1]] .== sres[i, colssym[1]]) @test all(gd[i][!, colssym[2]] .== sres[i, colssym[2]]) @@ -218,7 +218,7 @@ end v[2] == gd[2][:, nms] && v[3] == gd[3][:, nms] && v[4] == gd[4][:, nms] - @test names(parent(v))[v.cols] == colssym + @test names(parent(v))[v.cols] == string.(colssym) v = map(f1, gd) @test vcat(v[1], v[2], v[3], v[4]) == by(f1, df, cols, sort=sort) v = map(f2, gd) @@ -696,7 +696,7 @@ end @test_throws ArgumentError by([:b,:c] => ((b,c) -> [1 2; 3 4]) => :xxx, df, :a) @test_throws ArgumentError by(df, :a, [:b,:c] => ((b,c) -> [1 2; 3 4]) => :xxx) @test_throws ArgumentError by(df, :a, nrow, nrow) - @test_throws 
MethodError by(df, :a, [nrow]) + @test_throws ArgumentError by(df, :a, [nrow]) gd = groupby(df, :a) @@ -761,7 +761,7 @@ end @test_throws ArgumentError combine([:b,:c] => ((b,c) -> [1 2; 3 4]) => :xxx, gd) @test_throws ArgumentError combine(gd, [:b,:c] => ((b,c) -> [1 2; 3 4]) => :xxx) @test_throws ArgumentError combine(gd, nrow, nrow) - @test_throws MethodError combine(gd, [nrow]) + @test_throws ArgumentError combine(gd, [nrow]) for f in (map, combine) for col in (:c, 3) @@ -1430,7 +1430,6 @@ end @testset "GroupKey and GroupKeys" begin df = DataFrame(a = repeat([:A, :B, missing], outer=4), b = repeat([:X, :Y], inner=6), c = 1:12) cols = [:a, :b] - colstup = Tuple(cols) gd = groupby_checked(df, cols) gdkeys = keys(gd) @@ -1457,10 +1456,10 @@ end # Basic methods @test parent(key) === gd @test length(key) == length(cols) - @test names(key) == cols - @test keys(key) == colstup - @test propertynames(key) == colstup - @test propertynames(key, true) == colstup + @test propertynames(key) == cols + @test keys(key) == cols + @test propertynames(key) == cols + @test propertynames(key, true) == cols @test values(key) ≅ values(nt) # (Named)Tuple conversion @@ -1725,7 +1724,7 @@ end @test_throws ArgumentError by(sdf -> sdf.x1[1] ? fr : (x2=[true],), df, :a) else res = by(sdf -> sdf.x1[1] ? fr : (x2=[true],), df, :a) - @test names(res) == [:a, :x2] + @test names(res) == ["a", "x2"] end @test_throws ArgumentError by(sdf -> sdf.x1[1] ? 
true : er, df, :a) end @@ -1751,12 +1750,12 @@ end Random.seed!(1) df = DataFrame(b = repeat([2, 1], outer=[4]), x = randn(8)) res = by(sdf -> sdf.x[1:2], df, :b) - @test names(res) == [:b, :x1] + @test names(res) == ["b", "x1"] res2 = by(:x => x -> x[1:2], df, :b) - @test names(res2) == [:b, :x_function] + @test names(res2) == ["b", "x_function"] @test Matrix(res) == Matrix(res2) res2 = by(:x => (x -> x[1:2]) => :z, df, :b) - @test names(res2) == [:b, :z] + @test names(res2) == ["b", "z"] @test Matrix(res) == Matrix(res2) @test_throws ArgumentError by(df, :b) do sdf diff --git a/test/index.jl b/test/index.jl index 8063a234f6..dd3887f3d5 100644 --- a/test/index.jl +++ b/test/index.jl @@ -8,22 +8,15 @@ using DataFrames: Index, SubIndex, fuzzymatch push!(i, :A) push!(i, :B) - inds = Any[1, - big(1), - :A, + inds = Any[1, big(1), :A, "A", [true, false], - [1], - [big(1)], - big(1):big(1), - [:A], - Union{Int, Missing}[1], - Union{BigInt, Missing}[big(1)], - Union{Symbol, Missing}[:A], - Any[1], - Any[:A]] + [1], [big(1)], big(1):big(1), [:A], ["A"], + Union{Int, Missing}[1], Union{BigInt, Missing}[big(1)], + Union{Symbol, Missing}[:A], Union{String, Missing}["A"], + Any[1], Any[:A], Any["A"]] for ind in inds - if ind == :A || ndims(ind) == 0 + if ind == :A || ind == "A" || ndims(ind) == 0 @test i[ind] == 1 else @test (i[ind] == [1]) @@ -38,23 +31,27 @@ using DataFrames: Index, SubIndex, fuzzymatch @test_throws ArgumentError i[[1, missing]] @test_throws ArgumentError i[[true, missing]] @test_throws ArgumentError i[Any[true, missing]] - @test_throws MethodError i[[:A, missing]] - @test_throws MethodError i[Any[:A, missing]] + @test_throws ArgumentError i[[:A, missing]] + @test_throws ArgumentError i[Any[:A, missing]] @test_throws ArgumentError i[1.0:1.0] @test_throws ArgumentError i[[1.0]] @test_throws ArgumentError i[Any[1.0]] @test_throws BoundsError i[0] @test_throws BoundsError i[10] @test_throws ArgumentError i[:x] + @test_throws ArgumentError i["x"] @test_throws 
BoundsError i[1:3] @test_throws ArgumentError i[[1,1]] @test_throws ArgumentError i[[:A,:A]] + @test_throws ArgumentError i[["A","A"]] @test_throws BoundsError i[Not(0)] @test_throws BoundsError i[Not(10)] @test_throws ArgumentError i[Not(:x)] + @test_throws ArgumentError i[Not("x")] @test_throws BoundsError i[Not(1:3)] @test_throws ArgumentError i[Not([1,1])] @test_throws ArgumentError i[Not([:A,:A])] + @test_throws ArgumentError i[Not(["A","A"])] @test i[1:1] == 1:1 @@ -62,9 +59,6 @@ using DataFrames: Index, SubIndex, fuzzymatch @test_throws BoundsError i[true:true] @test_throws BoundsError i[[true, false, true]] - @test_throws ArgumentError i[["a"]] - @test_throws ArgumentError i[Any["a"]] - @test i[[]] == Int[] @test i[Int[]] == Int[] @test i[Symbol[]] == Int[] @@ -75,27 +69,34 @@ end @testset "rename!" begin i = Index([:A,:B]) - @test names(i) == [:A,:B] + @test names(i) == ["A", "B"] @test rename!(i, [:a,:a], makeunique=true) == Index([:a,:a_1]) @test_throws ArgumentError rename!(i, [:a,:a]) @test_throws DimensionMismatch rename!(i, [:a,:b,:c]) @test rename!(copy(i), [:a,:b]) == Index([:a,:b]) - @test names(i) == [:a,:a_1] + @test names(i) == ["a", "a_1"] @test rename!(i, [:a,:b]) == Index([:a,:b]) @test rename!(copy(i), [:a => :A]) == Index([:A,:b]) @test rename!(copy(i), [:a => :a]) == Index([:a,:b]) @test rename!(copy(i), [:a => :b, :b => :a]) == Index([:b,:a]) @test rename!(x -> Symbol(uppercase(string(x))), copy(i)) == Index([:A,:B]) @test rename!(x -> Symbol(lowercase(string(x))), copy(i)) == Index([:a,:b]) + @test rename!(uppercase, copy(i)) == Index([:A,:B]) + @test rename!(lowercase, copy(i)) == Index([:a,:b]) @test delete!(i, :a) == Index([:b]) push!(i, :C) @test delete!(i, 1) == Index([:C]) + push!(i, :D) + @test delete!(i, "C") == Index([:D]) + insert!(i, 1, :x2) + insert!(i, 1, "x1") + @test i == Index([:x1, :x2, :D]) i = Index([:A, :B, :C, :D, :E]) i2 = copy(i) - rename!(i2, reverse(names(i2))) - rename!(i2, reverse(names(i2))) + rename!(i2, 
reverse(DataFrames._names(i2))) + rename!(i2, reverse(DataFrames._names(i2))) @test names(i2) == names(i) for name in names(i) i2[name] # Issue #715 @@ -111,6 +112,8 @@ end si5 = SubIndex(i, [:C, :D, :E]) si6 = SubIndex(i, Not(Not([:C, :D, :E]))) si7 = SubIndex(i, Not(1:2)) + si8 = SubIndex(i, ["C", "D", "E"]) + si9 = SubIndex(i, Not(Not(["C", "D", "E"]))) @test copy(si1) == i @test copy(si2) == Index([:C, :D, :E]) @@ -119,9 +122,12 @@ end @test copy(si5) == Index([:C, :D, :E]) @test copy(si6) == Index([:C, :D, :E]) @test copy(si7) == Index([:C, :D, :E]) + @test copy(si8) == Index([:C, :D, :E]) + @test copy(si9) == Index([:C, :D, :E]) @test_throws ArgumentError SubIndex(i, 1) @test_throws ArgumentError SubIndex(i, :A) + @test_throws ArgumentError SubIndex(i, "A") @test_throws ArgumentError SubIndex(i, true) @test si1 isa Index @test si2.cols == 3:5 @@ -129,22 +135,27 @@ end @test si3.cols == 3:5 @test si3.remap == [0, 0, 1, 2, 3] @test !haskey(si3, :A) + @test !haskey(si3, "A") @test si3.remap == [0, 0, 1, 2, 3] @test si4.cols == 3:5 @test si4.remap == [0, 0, 1, 2, 3] @test !haskey(si4, :A) + @test !haskey(si4, "A") @test si4.remap == [0, 0, 1, 2, 3] @test si5.cols == 3:5 @test si5.remap == [0, 0, 1, 2, 3] @test !haskey(si5, :A) + @test !haskey(si5, "A") @test si5.remap == [0, 0, 1, 2, 3] @test si6.cols == 3:5 @test si6.remap == [0, 0, 1, 2, 3] @test !haskey(si6, :A) + @test !haskey(si6, "A") @test si6.remap == [0, 0, 1, 2, 3] @test si7.cols == 3:5 @test si7.remap == [0, 0, 1, 2, 3] @test !haskey(si7, :A) + @test !haskey(si7, "A") @test si7.remap == [0, 0, 1, 2, 3] @test length(si1) == 5 @@ -155,13 +166,21 @@ end @test length(si6) == 3 @test length(si7) == 3 - @test names(si1) == keys(si1) == [:A, :B, :C, :D, :E] - @test names(si2) == keys(si2) == [:C, :D, :E] - @test names(si3) == keys(si3) == [:C, :D, :E] - @test names(si4) == keys(si4) == [:C, :D, :E] - @test names(si5) == keys(si5) == [:C, :D, :E] - @test names(si6) == keys(si5) == [:C, :D, :E] - @test 
names(si7) == keys(si5) == [:C, :D, :E] + @test DataFrames._names(si1) == [:A, :B, :C, :D, :E] + @test DataFrames._names(si2) == [:C, :D, :E] + @test DataFrames._names(si3) == [:C, :D, :E] + @test DataFrames._names(si4) == [:C, :D, :E] + @test DataFrames._names(si5) == [:C, :D, :E] + @test DataFrames._names(si6) == [:C, :D, :E] + @test DataFrames._names(si7) == [:C, :D, :E] + + @test names(si1) == ["A", "B", "C", "D", "E"] + @test names(si2) == ["C", "D", "E"] + @test names(si3) == ["C", "D", "E"] + @test names(si4) == ["C", "D", "E"] + @test names(si5) == ["C", "D", "E"] + @test names(si6) == ["C", "D", "E"] + @test names(si7) == ["C", "D", "E"] @test_throws ArgumentError haskey(si3, true) @test haskey(si3, 1) @@ -170,6 +189,10 @@ end @test haskey(si3, :D) @test !haskey(si3, :A) @test si3[:C] == 1 + @test haskey(si3, "D") + @test !haskey(si3, "A") + @test si3["C"] == 1 + @test si3[DataFrames._names(i)] == [0, 0, 1, 2, 3] @test si3[names(i)] == [0, 0, 1, 2, 3] end @@ -181,23 +204,31 @@ end selector2 = [1] dfv2 = view(dfv1, :, selector2) dfr2 = view(dfr1, selector2) - @test names(dfv1) == [:c, :b] - @test names(dfv2) == [:c] - @test names(dfr1) == [:c, :b] - @test names(dfr2) == [:c] + @test names(dfv1) == ["c", "b"] + @test names(dfv2) == ["c"] + @test names(dfr1) == ["c", "b"] + @test names(dfr2) == ["c"] selector1[1] = 1 - @test names(dfv1) == [:a, :b] - @test names(dfv2) == [:c] - @test names(dfr1) == [:a, :b] - @test names(dfr2) == [:c] + @test names(dfv1) == ["a", "b"] + @test names(dfv2) == ["c"] + @test names(dfr1) == ["a", "b"] + @test names(dfr2) == ["c"] selector3 = [:c, :b] dfv3 = view(df, :, selector3) dfr3 = view(df, 2, selector3) - @test names(dfv3) == [:c, :b] - @test names(dfr3) == [:c, :b] + @test names(dfv3) == ["c", "b"] + @test names(dfr3) == ["c", "b"] selector3[1] = :a - @test names(dfv3) == [:c, :b] - @test names(dfr3) == [:c, :b] + @test names(dfv3) == ["c", "b"] + @test names(dfr3) == ["c", "b"] + selector3 = ["c", "b"] + dfv3 = view(df, :, 
selector3) + dfr3 = view(df, 2, selector3) + @test names(dfv3) == ["c", "b"] + @test names(dfr3) == ["c", "b"] + selector3[1] = "a" + @test names(dfv3) == ["c", "b"] + @test names(dfr3) == ["c", "b"] end @testset "fuzzy matching" begin @@ -211,6 +242,10 @@ end @test_throws ArgumentError i[:xx13] @test_throws ArgumentError i[:yy14] @test_throws ArgumentError i[:abcd] + @test_throws ArgumentError i["x13"] + @test_throws ArgumentError i["xx13"] + @test_throws ArgumentError i["yy14"] + @test_throws ArgumentError i["abcd"] @test fuzzymatch(i.lookup, :x13) == [:x1, :x12, :x131, :y13, :yy13] @test fuzzymatch(i.lookup, :xx1314) == [:x131] @test fuzzymatch(i.lookup, :yy14) == [:yy13, :y13] @@ -227,12 +262,12 @@ end @test i[r"x1."] == [2, 3] @test isempty(i[r"xx"]) @test i[r""] == 1:5 - @test names(SubIndex(i, r"x1.")) == [:x12, :x131] + @test DataFrames._names(SubIndex(i, r"x1.")) == [:x12, :x131] @test isempty(names(SubIndex(i, r"xx"))) @test names(SubIndex(i, r"")) == names(i) @test DataFrames._names(SubIndex(i, r"x1.")) == [:x12, :x131] @test isempty(DataFrames._names(SubIndex(i, r"xx"))) - @test DataFrames._names(SubIndex(i, r"")) == names(i) + @test DataFrames._names(SubIndex(i, r"")) == DataFrames._names(i) @test DataFrames.parentcols(SubIndex(i, r"x1.")) == [2, 3] @test isempty(DataFrames.parentcols(SubIndex(i, r"xx"))) @test DataFrames.parentcols(SubIndex(i, r"")) == 1:5 @@ -244,12 +279,12 @@ end @test i2[r"x1."] == [2, 3] @test isempty(i2[r"xx"]) @test i2[r""] == 1:5 - @test names(SubIndex(i2, r"x1.")) == [:x12, :x131] + @test DataFrames._names(SubIndex(i2, r"x1.")) == [:x12, :x131] @test isempty(names(SubIndex(i2, r"xx"))) @test names(SubIndex(i2, r"")) == names(i) @test DataFrames._names(SubIndex(i2, r"x1.")) == [:x12, :x131] @test isempty(DataFrames._names(SubIndex(i2, r"xx"))) - @test DataFrames._names(SubIndex(i2, r"")) == names(i2) + @test DataFrames._names(SubIndex(i2, r"")) == DataFrames._names(i2) @test DataFrames.parentcols(SubIndex(i2, r"x1.")) == [2, 3] 
@test isempty(DataFrames.parentcols(SubIndex(i2, r"xx"))) @test DataFrames.parentcols(SubIndex(i2, r"")) == 1:5 @@ -261,12 +296,12 @@ end @test i3[r"x1.$"] == [1] @test isempty(i3[r"xx"]) @test i3[r""] == 1:2 - @test names(SubIndex(i3, r"x1.$")) == [:x12] + @test DataFrames._names(SubIndex(i3, r"x1.$")) == [:x12] @test isempty(names(SubIndex(i3, r"xx"))) @test names(SubIndex(i3, r"")) == names(i3) @test DataFrames._names(SubIndex(i3, r"x1.$")) == [:x12] @test isempty(DataFrames._names(SubIndex(i3, r"xx"))) - @test DataFrames._names(SubIndex(i3, r"")) == names(i3) + @test DataFrames._names(SubIndex(i3, r"")) == DataFrames._names(i3) @test DataFrames.parentcols(SubIndex(i3, r"x1.$")) == [1] @test isempty(DataFrames.parentcols(SubIndex(i3, r"xx"))) @test DataFrames.parentcols(SubIndex(i3, r"")) == 1:2 @@ -286,12 +321,12 @@ end @test i[Not(Not(r"x1."))] == [2, 3] @test isempty(i[Not(Not(r"xx"))]) @test i[Not(Not(r""))] == 1:5 - @test names(SubIndex(i, Not(Not(r"x1.")))) == [:x12, :x131] + @test DataFrames._names(SubIndex(i, Not(Not(r"x1.")))) == [:x12, :x131] @test isempty(names(SubIndex(i, Not(Not(r"xx"))))) @test names(SubIndex(i, Not(Not(r"")))) == names(i) @test DataFrames._names(SubIndex(i, Not(Not(r"x1.")))) == [:x12, :x131] @test isempty(DataFrames._names(SubIndex(i, Not(Not(r"xx"))))) - @test DataFrames._names(SubIndex(i, Not(Not(r"")))) == names(i) + @test DataFrames._names(SubIndex(i, Not(Not(r"")))) == DataFrames._names(i) @test DataFrames.parentcols(SubIndex(i, Not(Not(r"x1.")))) == [2, 3] @test isempty(DataFrames.parentcols(SubIndex(i, Not(Not(r"xx"))))) @test DataFrames.parentcols(SubIndex(i, Not(Not(r"")))) == 1:5 @@ -300,12 +335,12 @@ end @test i2[Not(Not(r"x1."))] == [2, 3] @test isempty(i2[Not(Not(r"xx"))]) @test i2[Not(Not(r""))] == 1:5 - @test names(SubIndex(i2, Not(Not(r"x1.")))) == [:x12, :x131] + @test DataFrames._names(SubIndex(i2, Not(Not(r"x1.")))) == [:x12, :x131] @test isempty(names(SubIndex(i2, Not(Not(r"xx"))))) @test names(SubIndex(i2, 
Not(Not(r"")))) == names(i) @test DataFrames._names(SubIndex(i2, Not(Not(r"x1.")))) == [:x12, :x131] @test isempty(DataFrames._names(SubIndex(i2, Not(Not(r"xx"))))) - @test DataFrames._names(SubIndex(i2, Not(Not(r"")))) == names(i2) + @test DataFrames._names(SubIndex(i2, Not(Not(r"")))) == DataFrames._names(i2) @test DataFrames.parentcols(SubIndex(i2, Not(Not(r"x1.")))) == [2, 3] @test isempty(DataFrames.parentcols(SubIndex(i2, Not(Not(r"xx"))))) @test DataFrames.parentcols(SubIndex(i2, Not(Not(r"")))) == 1:5 @@ -314,12 +349,12 @@ end @test i3[Not(Not(r"x1.$"))] == [1] @test isempty(i3[Not(Not(r"xx"))]) @test i3[Not(Not(r""))] == 1:2 - @test names(SubIndex(i3, Not(Not(r"x1.$")))) == [:x12] + @test DataFrames._names(SubIndex(i3, Not(Not(r"x1.$")))) == [:x12] @test isempty(names(SubIndex(i3, Not(Not(r"xx"))))) @test names(SubIndex(i3, Not(Not(r"")))) == names(i3) @test DataFrames._names(SubIndex(i3, Not(Not(r"x1.$")))) == [:x12] @test isempty(DataFrames._names(SubIndex(i3, Not(Not(r"xx"))))) - @test DataFrames._names(SubIndex(i3, Not(Not(r"")))) == names(i3) + @test DataFrames._names(SubIndex(i3, Not(Not(r"")))) == DataFrames._names(i3) @test DataFrames.parentcols(SubIndex(i3, Not(Not(r"x1.$")))) == [1] @test isempty(DataFrames.parentcols(SubIndex(i3, Not(Not(r"xx"))))) @test DataFrames.parentcols(SubIndex(i3, Not(Not(r"")))) == 1:2 @@ -329,26 +364,42 @@ end df = DataFrame(a=1, b=2, c=3) @test select(df, Between(1,2)) == df[:, 1:2] @test select(df, Between(1,:b)) == df[:, 1:2] + @test select(df, Between(1,"b")) == df[:, 1:2] @test select(df, Between(:a,2)) == df[:, 1:2] + @test select(df, Between("a",2)) == df[:, 1:2] @test select(df, Between(:a,:b)) == df[:, 1:2] + @test select(df, Between("a","b")) == df[:, 1:2] @test select(df, Between(2,1)) == df[:, 2:1] @test select(df, Between(:b,1)) == df[:, 2:1] + @test select(df, Between("b",1)) == df[:, 2:1] @test select(df, Between(2,:a)) == df[:, 2:1] - @test select(df, Between(:b,:a)) == df[:, 2:1] + @test select(df, 
Between(2,"a")) == df[:, 2:1] + @test select(df, Between("b","a")) == df[:, 2:1] + @test select(df, Between("b","a")) == df[:, 2:1] @test df[:, Between(1,2)] == df[:, 1:2] @test df[:, Between(1,:b)] == df[:, 1:2] + @test df[:, Between(1,"b")] == df[:, 1:2] @test df[:, Between(:a,2)] == df[:, 1:2] + @test df[:, Between("a",2)] == df[:, 1:2] @test df[:, Between(:a,:b)] == df[:, 1:2] + @test df[:, Between("a","b")] == df[:, 1:2] @test df[:, Between(2,1)] == df[:, 2:1] @test df[:, Between(:b,1)] == df[:, 2:1] + @test df[:, Between("b",1)] == df[:, 2:1] @test df[:, Between(2,:a)] == df[:, 2:1] + @test df[:, Between(2,"a")] == df[:, 2:1] @test df[:, Between(:b,:a)] == df[:, 2:1] + @test df[:, Between("b","a")] == df[:, 2:1] @test_throws BoundsError df[:, Between(:b,0)] @test_throws BoundsError df[:, Between(0,:b)] @test_throws ArgumentError df[:, Between(:b,:z)] @test_throws ArgumentError df[:, Between(:z,:b)] + @test_throws BoundsError df[:, Between("b",0)] + @test_throws BoundsError df[:, Between(0,"b")] + @test_throws ArgumentError df[:, Between("b","z")] + @test_throws ArgumentError df[:, Between("z","b")] end @testset "All indexing" begin @@ -380,9 +431,44 @@ end @test df[:, All(:a,:b,2)] == df[:, 1:2] @test df[:, All(2,1,:a)] == df[:, [2,1]] + @test select(df, All(1,"b")) == df[:, 1:2] + @test select(df, All("a",2)) == df[:, 1:2] + @test select(df, All("a","b")) == df[:, 1:2] + @test select(df, All("b",1)) == df[:, [2,1]] + @test select(df, All(2,"a")) == df[:, [2,1]] + @test select(df, All("b","a")) == df[:, [2,1]] + + @test df[:, All(1,"b")] == df[:, 1:2] + @test df[:, All("a",2)] == df[:, 1:2] + @test df[:, All("a","b")] == df[:, 1:2] + @test df[:, All("b",1)] == df[:, [2,1]] + @test df[:, All(2,"a")] == df[:, [2,1]] + @test df[:, All("b","a")] == df[:, [2,1]] + + @test df[:, All("a",1,"b")] == df[:, 1:2] + @test df[:, All("a",2,"b")] == df[:, 1:2] + @test df[:, All("a","b",2)] == df[:, 1:2] + @test df[:, All(2,1,"a")] == df[:, [2,1]] + df = DataFrame(a1=1, 
a2=2, b1=3, b2=4) @test df[:, All(r"a", Not(r"1"))] == df[:, [1,2,4]] @test df[:, All(Not(r"1"), r"a")] == df[:, [2,4,1]] end +@testset "views" begin + df = DataFrame(a=1,b=2,c=3) + dfv = view(df, 1:1, [:a, :c]) + @test DataFrames.parentcols(DataFrames.index(dfv)) == [1,3] + @test DataFrames.parentcols(DataFrames.index(dfv), :c) == 3 + @test DataFrames.parentcols(DataFrames.index(dfv), "c") == 3 + @test DataFrames.parentcols(DataFrames.index(dfv), 2) == 3 + @test DataFrames.parentcols(DataFrames.index(dfv), [:c, :c]) == [3, 3] + @test DataFrames.parentcols(DataFrames.index(dfv), ["c", "c"]) == [3, 3] + @test DataFrames.parentcols(DataFrames.index(dfv), [2, 2]) == [3, 3] + @test DataFrames.index(dfv)["c"] == 2 + @test DataFrames.index(dfv)[:c] == 2 + @test DataFrames.index(dfv)[["a","c"]] == [1, 2] + @test DataFrames.index(dfv)[[:a,:c]] == [1, 2] +end + end # module diff --git a/test/indexing.jl b/test/indexing.jl index 9a6c4433ef..f3f92077cb 100644 --- a/test/indexing.jl +++ b/test/indexing.jl @@ -8,9 +8,9 @@ using Test, DataFrames @test df[!, 1] == [1, 2, 3] @test df[!, 1] === eachcol(df)[1] @test df[!, :a] == [1, 2, 3] - @test df[!, :a] === eachcol(df)[1] + @test df[!, :a] === df[!, "a"] === eachcol(df)[1] @test df.a == [1, 2, 3] - @test df.a === eachcol(df)[1] + @test df.a === df."a" === eachcol(df)[1] for selector in [1:2, r"[ab]", Not(Not(r"[ab]")), Not(r"ab"), Not(3), Not(1:0), Not(1:2), :] dfx = df[!, selector] @@ -85,8 +85,8 @@ end @testset "getindex df[!, col]" begin x = [1, 2, 3] df = DataFrame(x=x, copycols=false) - @test df.x === x - @test df[!, :x] === x + @test df.x === df."x" === x + @test df[!, :x] === df[!, "x"] === x @test df[!, 1] === x @test df[:, [:x]].x !== x @test df[:, :].x !== x @@ -107,6 +107,8 @@ end @test view(df, !, 1) isa SubArray @test view(df, !, :a) == [1, 2, 3] @test view(df, !, :a) isa SubArray + @test view(df, !, "a") == [1, 2, 3] + @test view(df, !, "a") isa SubArray for selector in [1:2, r"[ab]", Not(Not(r"[ab]")), Not(r"ab"), 
Not(3), Not(1:0), Not(1:2), :] dfx = @view df[!, selector] @@ -232,8 +234,11 @@ end @test sdf[!, 1] isa SubArray @test sdf[!, :a] == [1, 2, 3] @test sdf[!, :a] isa SubArray + @test sdf[!, "a"] == [1, 2, 3] + @test sdf[!, "a"] isa SubArray @test sdf.a == [1, 2, 3] @test sdf.a isa SubArray + @test sdf.a === sdf."a" for selector in [1:2, r"[ab]", Not(Not(r"[ab]")), Not(r"ab"), Not(3), Not(1:0), Not(1:2), :] dfx = @view sdf[!, selector] @@ -322,6 +327,10 @@ end @test sdf[:, 1] == [1, 2, 3] @test sdf[:, 1] isa Vector + @test sdf[:, :a] == [1, 2, 3] + @test sdf[:, :a] isa Vector + @test sdf[:, "a"] == [1, 2, 3] + @test sdf[:, "a"] isa Vector @test sdf[:, 1] !== df[!, 1] @test sdf[:, 1:2] == DataFrame(a=1:3, b=4:6) @test sdf[:, 1:2] isa DataFrame @@ -361,6 +370,8 @@ end @test view(sdf, !, 1) isa SubArray @test view(sdf, !, :a) == [1, 2, 3] @test view(sdf, !, :a) isa SubArray + @test view(sdf, !, "a") == [1, 2, 3] + @test view(sdf, !, "a") isa SubArray for selector in [1:2, r"[ab]", Not(Not(r"[ab]")), Not(r"ab"), Not(3), Not(1:0), Not(1:2), :] dfx = @view sdf[!, selector] @@ -388,6 +399,10 @@ end @test view(sdf, 1, 1) isa SubArray @test view(sdf, 1, 1)[] == 1 + @test view(sdf, 1, :a) isa SubArray + @test view(sdf, 1, :a)[] == 1 + @test view(sdf, 1, "a") isa SubArray + @test view(sdf, 1, "a")[] == 1 @test view(sdf, 1, 1:2) isa DataFrameRow @test copy(view(sdf, 1, 1:2)) == (a=1, b=4) @test view(sdf, 1, r"[ab]") isa DataFrameRow @@ -424,6 +439,10 @@ end @test view(sdf, Not(Not(1:2)), 1) == [1, 2] @test view(sdf, Not(Not(1:2)), 1) isa SubArray + @test view(sdf, Not(Not(1:2)), :a) == [1, 2] + @test view(sdf, Not(Not(1:2)), :a) isa SubArray + @test view(sdf, Not(Not(1:2)), "a") == [1, 2] + @test view(sdf, Not(Not(1:2)), "a") isa SubArray @test view(sdf, Not(Not(1:2)), 1:2) isa SubDataFrame @test view(sdf, Not(Not(1:2)), 1:2) == DataFrame(a=1:2, b=4:5) @test view(sdf, Not(Not(1:2)), r"[ab]") isa SubDataFrame @@ -442,6 +461,10 @@ end @test view(sdf, :, 1) == [1, 2, 3] @test 
view(sdf, :, 1) isa SubArray + @test view(sdf, :, :a) == [1, 2, 3] + @test view(sdf, :, :a) isa SubArray + @test view(sdf, :, "a") == [1, 2, 3] + @test view(sdf, :, "a") isa SubArray @test view(sdf, :, 1:2) isa SubDataFrame @test view(sdf, :, 1:2) == DataFrame(a=1:3, b=4:6) @test view(sdf, :, r"[ab]") isa SubDataFrame @@ -484,8 +507,14 @@ end dfr = df[1, :] @test dfr[1] == 1 + @test dfr[:a] == 1 + @test dfr["a"] == 1 @test dfr[1:2] isa DataFrameRow @test copy(dfr[1:2]) == (a=1, b=4) + @test dfr[[:a, :b]] isa DataFrameRow + @test copy(dfr[[:a, :b]]) == (a=1, b=4) + @test dfr[["a", "b"]] isa DataFrameRow + @test copy(dfr[["a", "b"]]) == (a=1, b=4) @test dfr[r"[ab]"] isa DataFrameRow @test copy(dfr[r"[ab]"]) == (a=1, b=4) @test dfr[Not(Not(r"[ab]"))] isa DataFrameRow @@ -507,18 +536,26 @@ end @test view(dfr, 1)[] == 1 @test view(dfr, 1) isa SubArray + @test view(dfr, :a)[] == 1 + @test view(dfr, :a) isa SubArray + @test view(dfr, "a")[] == 1 + @test view(dfr, "a") isa SubArray @test view(dfr, 1:2) isa DataFrameRow - @test copy(dfr[1:2]) == (a=1, b=4) + @test copy(view(dfr, 1:2)) == (a=1, b=4) + @test view(dfr, [:a, :b]) isa DataFrameRow + @test copy(view(dfr, [:a, :b])) == (a=1, b=4) + @test view(dfr, ["a", "b"]) isa DataFrameRow + @test copy(view(dfr, ["a", "b"])) == (a=1, b=4) @test view(dfr, r"[ab]") isa DataFrameRow - @test copy(dfr[r"[ab]"]) == (a=1, b=4) + @test copy(view(dfr,r"[ab]")) == (a=1, b=4) @test view(dfr, Not(Not(r"[ab]"))) isa DataFrameRow - @test copy(dfr[Not(Not(r"[ab]"))]) == (a=1, b=4) + @test copy(view(dfr,Not(Not(r"[ab]")))) == (a=1, b=4) @test dfr[:] isa DataFrameRow - @test copy(dfr[:]) == (a=1, b=4, c=7) + @test copy(view(dfr,:)) == (a=1, b=4, c=7) @test dfr[r""] isa DataFrameRow - @test copy(dfr[r""]) == (a=1, b=4, c=7) + @test copy(view(dfr,r"")) == (a=1, b=4, c=7) @test dfr[Not(Not(:))] isa DataFrameRow - @test copy(dfr[Not(Not(:))]) == (a=1, b=4, c=7) + @test copy(view(dfr,Not(Not(:)))) == (a=1, b=4, c=7) @test parent(dfr[:]) === df @test 
parent(dfr[r""]) === df @test parent(dfr[Not([])]) === df @@ -546,6 +583,8 @@ end @test parent(dfr) === df2 df2[!, :y] .= 100 @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5, 100] + df2[!, "y"] .= 1000 + @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5, 1000] df2 = copy(df) dfr = df2[2, 1:4] @@ -554,14 +593,22 @@ end @test parent(dfr) === df2 df2[!, :y] .= 100 @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5] + df2[!, "y"] .= 1000 + @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5] @test df[2:3, :x2] == df[!, :x2][2:3] == [6.5, 7.5] + @test df[2:3, "x2"] == df[!, "x2"][2:3] == [6.5, 7.5] @test_throws ArgumentError df[2:3, :x] @test_throws BoundsError df[0:3, :x2] @test_throws BoundsError df[1:5, :x2] + @test_throws ArgumentError df[2:3, "x"] + @test_throws BoundsError df[0:3, "x2"] + @test_throws BoundsError df[1:5, "x2"] @test df[:, :x2] == df[!, :x2] @test df[:, :x2] !== df[!, :x2] + @test df[:, "x2"] == df[!, "x2"] + @test df[:, "x2"] !== df[!, "x2"] @test df[1:2, 1:2] == df[Not(3:4), Not(3:4)] == select(df, r"[12]")[1:2, :] @test df[1:2, 1:2] isa DataFrame @@ -575,6 +622,8 @@ end @test df[!, :x2] === df.x2 === DataFrames._columns(df)[2] @test_throws ArgumentError df[!, :x] + @test df[!, "x2"] === df.x2 === DataFrames._columns(df)[2] + @test_throws ArgumentError df[!, "x"] v = @view df[2,2] @test v isa SubArray @@ -601,6 +650,8 @@ end @test parent(dfr) === df2 df2[!, :y] .= 100 @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5, 100] + df2[!, "y"] .= 1000 + @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5, 1000] df2 = copy(df) dfr = @view df2[2, 1:4] @@ -609,6 +660,8 @@ end @test parent(dfr) === df2 df2[!, :y] .= 100 @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5] + df2[!, "y"] .= 1000 + @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5] v = @view df[2:3, :x2] @test v == [6.5, 7.5] @@ -618,9 +671,20 @@ end @test_throws BoundsError @view df[0:3, :x2] @test_throws BoundsError @view df[1:5, :x2] + v = @view df[2:3, "x2"] + @test v == [6.5, 7.5] + @test v isa SubArray + @test parent(v) === df.x2 + @test_throws 
ArgumentError @view df[2:3, "x"] + @test_throws BoundsError @view df[0:3, "x2"] + @test_throws BoundsError @view df[1:5, "x2"] + @test @view(df[:, :x2]) == df[!, :x2] @test parent(@view(df[:, :x2])) === df[!, :x2] + @test @view(df[:, "x2"]) == df[!, "x2"] + @test parent(@view(df[:, "x2"])) === df[!, "x2"] + sdf = @view df[1:2, 1:2] @test sdf == df[1:2, 1:2] @test sdf isa SubDataFrame @@ -639,6 +703,11 @@ end @test parent(@view(df[!, :x2])) === df.x2 @test_throws ArgumentError @view df[!, :x] + @test @view(df[!, "x2"]) === @view(df[:, "x2"]) + @test @view(df[!, "x2"]) isa SubArray + @test parent(@view(df[!, "x2"])) === df.x2 + @test_throws ArgumentError @view df[!, "x"] + sdf = @view df[Not(1:0), Not(r"zzz")] @test sdf[2,2] == sdf[!, 2][2] == 6.5 @@ -660,6 +729,8 @@ end @test parent(dfr) === df2 df2[!, :y] .= 100 @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5, 100] + df2[!, "y"] .= 1000 + @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5, 1000] df2 = copy(df) dfr = view(df2, 1:4, :)[2, 1:4] @@ -668,6 +739,8 @@ end @test parent(dfr) === df2 df2[!, :y] .= 100 @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5] + df2[!, "y"] .= 1000 + @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5] @test sdf[2:3, :x2] == sdf[!, :x2][2:3] == [6.5, 7.5] @test sdf[2:3, :x2] isa Vector @@ -679,10 +752,21 @@ end @test sdf[:, :x2] !== sdf[!, :x2] @test sdf[:, :x2] isa Vector + @test sdf[2:3, "x2"] == sdf[!, "x2"][2:3] == [6.5, 7.5] + @test sdf[2:3, "x2"] isa Vector + @test_throws ArgumentError sdf[2:3, "x"] + @test_throws BoundsError sdf[0:3, "x2"] + @test_throws BoundsError sdf[1:5, "x2"] + + @test sdf[:, "x2"] == sdf[!, "x2"] + @test sdf[:, "x2"] !== sdf[!, "x2"] + @test sdf[:, "x2"] isa Vector + @test sdf[1:2, 1:2] == sdf[Not(3:4), Not(3:4)] == select(sdf, r"[12]")[1:2, :] @test sdf[1:2, 1:2] isa DataFrame @test sdf[:, 1:2] == sdf[Not(1:0), Not(3:4)] == select(sdf, r"[12]") @test sdf[:, 1:2][!, :x1] !== sdf.x1 + @test sdf[:, 1:2][!, "x1"] !== sdf.x1 @test sdf[:, 1:2] isa DataFrame @test sdf[:, :] == sdf @test 
sdf[:, :] isa DataFrame @@ -696,6 +780,13 @@ end @test_throws ArgumentError sdf[!, :x] + @test sdf[!, "x2"] === sdf."x2" + @test sdf."x2" == DataFrames._columns(df)[2] + @test sdf."x2" isa SubArray + @test sdf[!, "x2"] isa SubArray + + @test_throws ArgumentError sdf[!, "x"] + v = @view sdf[2,2] @test v isa SubArray @test size(v) == () @@ -721,6 +812,8 @@ end @test parent(dfr) === df2 df2[!, :y] .= 100 @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5, 100] + df2[!, "y"] .= 1000 + @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5, 1000] df2 = copy(df) dfr = @view view(df2, 1:4, :)[2, 1:4] @@ -729,6 +822,8 @@ end @test parent(dfr) === df2 df2[!, :y] .= 100 @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5] + df2[!, "y"] .= 1000 + @test Vector(dfr) == [2.5, 6.5, 10.5, 14.5] v = @view sdf[2:3, :x2] @test v == [6.5, 7.5] @@ -741,6 +836,9 @@ end @test @view(sdf[:, :x2]) == sdf[!, :x2] @test parent(@view(sdf[:, :x2])) === df[!, :x2] + @test @view(sdf[:, "x2"]) == sdf[!, "x2"] + @test parent(@view(sdf[:, "x2"])) === df[!, "x2"] + sdf2 = @view sdf[1:2, 1:2] @test sdf2 == sdf[1:2, 1:2] @test sdf2 isa SubDataFrame @@ -760,12 +858,19 @@ end @test_throws ArgumentError @view sdf[!, :x] @test select(sdf, 1:2, copycols=false) == @view sdf[!, 1:2] + @test @view(sdf[!, "x2"]) == df."x2" + @test sdf[!, "x2"] isa SubArray + @test parent(sdf[!, "x2"]) === df."x2" + @test_throws ArgumentError @view sdf[!, "x"] + dfr = df[2, :] - @test dfr[2] == dfr.x2 == 6.5 + @test dfr[2] == dfr.x2 == dfr."x2" == 6.5 @test_throws BoundsError dfr[0] @test_throws BoundsError dfr[5] @test_throws ArgumentError dfr[:z] @test_throws ArgumentError dfr.z + @test_throws ArgumentError dfr["z"] + @test_throws ArgumentError dfr."z" @test Vector(dfr[2:3]) == [6.5, 10.5] @test dfr[2:3] isa DataFrameRow @@ -778,6 +883,7 @@ end @test_throws BoundsError @view dfr[0] @test_throws BoundsError @view dfr[5] @test_throws ArgumentError @view dfr[:z] + @test_throws ArgumentError @view dfr["z"] @test Vector(@view(dfr[2:3])) == [6.5, 10.5] @test 
@view(dfr[2:3]) isa DataFrameRow @@ -804,9 +910,12 @@ end df[BigInt(1), :a] = 'a' @test df == DataFrame(a=[97, 2, 3], b=4:6, c=7:9) @test_throws ArgumentError df[BigInt(1), :z] = 'z' - @test df == DataFrame(a=[97, 2, 3], b=4:6, c=7:9) + df[BigInt(1), "a"] = 'b' + @test df == DataFrame(a=[98, 2, 3], b=4:6, c=7:9) + @test_throws ArgumentError df[BigInt(1), "z"] = 'z' + @test df == DataFrame(a=[98, 2, 3], b=4:6, c=7:9) @test_throws MethodError df[1, 1] = "a" - @test df == DataFrame(a=[97, 2, 3], b=4:6, c=7:9) + @test df == DataFrame(a=[98, 2, 3], b=4:6, c=7:9) # `df[CartesianIndex(row, col)] = v` -> the same as `df[row, col] = v` df = DataFrame(a=1:3, b=4:6, c=7:9) @@ -827,7 +936,7 @@ end # the same as `dfr = df[row, cols]; dfr[:] = v` df = DataFrame(a=[[1,2]],b=[[1,2]]) - dfr = df[1, :]; + dfr = df[1, :] @test_throws MethodError dfr[:] = [10, 11] @test df == DataFrame(a=[[1,2]],b=[[1,2]]) @test_throws MethodError df[1, :] = [10, 11] @@ -841,6 +950,14 @@ end dfr[:] = [10, 11] @test df == DataFrame(a=10,b=11) + df = DataFrame(a=1,b=2) + df[1, ["a", "b"]] = [10, 11] + @test df == DataFrame(a=10,b=11) + df = DataFrame(a=1,b=2) + dfr = df[1, ["a", "b"]] + dfr[["a", "b"]] = [10, 11] + @test df == DataFrame(a=10,b=11) + df = DataFrame(a=1,b=2) df[1, :] = (10, 11) @test df == DataFrame(a=10,b=11) @@ -862,6 +979,16 @@ end @test_throws DimensionMismatch df[1, :] = Dict(:a=>10, :b=>11, :c=>12) @test df == DataFrame(a=1,b=2) + df = DataFrame(a=1,b=2) + df[1, ["a", "b"]] = Dict("a"=>10, "b"=>11) + @test df == DataFrame(a=10,b=11) + df = DataFrame(a=1,b=2) + @test_throws ArgumentError df[1, ["a", "b"]] = Dict("a"=>10, "c"=>11) + @test df == DataFrame(a=1,b=2) + df = DataFrame(a=1,b=2) + @test_throws DimensionMismatch df[1, ["a", "b"]] = Dict("a"=>10, "b"=>11, "c"=>12) + @test df == DataFrame(a=1,b=2) + df = DataFrame(a=1,b=2) df[1, :] = (a=10, b=11) @test df == DataFrame(a=10,b=11) @@ -902,8 +1029,15 @@ end # @test_throws ArgumentError df[1:3, 1] = [1] # @test_throws ArgumentError 
df[1:3, 1] = 1 @test_throws ArgumentError df[1:3, :z] = ["a", "b", "c"] + @test_throws ArgumentError df[1:3, "z"] = ["a", "b", "c"] @test_throws BoundsError df[1:3, 4] = ["a", "b", "c"] + df = DataFrame(a=1:3, b=4:6, c=7:9) + x = df.a + df[1:3, "a"] = 10:12 + @test df == DataFrame(a=10:12, b=4:6, c=7:9) + @test df.a === x + df = DataFrame(a=1:3, b=4:6, c=7:9) x = df.a df[:, 1] = 10:12 @@ -915,6 +1049,17 @@ end @test df.y == y @test df.y !== y + df = DataFrame(a=1:3, b=4:6, c=7:9) + x = df.a + df[:, "a"] = 10:12 + @test df == DataFrame(a=10:12, b=4:6, c=7:9) + @test df."a" === x + + y = ["a", "b", "c"] + df[:, "y"] = y + @test df."y" == y + @test df."y" !== y + @test_throws MethodError df[:, 1] = ["a", "b", "c"] # TODO: enable these tests after deprecation period # @test_throws ArgumentError df[:, 1] = [1] @@ -969,6 +1114,18 @@ end # TODO: add the following tests after deprecation # 1. if `df[:, col] = v` an error is thrown if such operation is attempted). # 2. it is not allowed to add a column with column index `ncol(df)+1` + + df = DataFrame(a=1:3, b=4:6, c=7:9) + df[!, "a"] = ["a", "b", "c"] + @test df == DataFrame(a=["a", "b", "c"], b=4:6, c=7:9) + @test_throws ArgumentError df[!, "a"] = ["a", "b"] + @test_throws ArgumentError df[!, "a"] = ["a"] + df[!, "a"] = 'a':'c' + @test df == DataFrame(a='a':'c', b=4:6, c=7:9) + df."a" = ["aaa", "bbb", 1] + @test df == DataFrame(a=["aaa", "bbb", 1], b=4:6, c=7:9) + df."z" = 11:13 + @test df == DataFrame(a=["aaa", "bbb", 1], b=4:6, c=7:9, z=11:13) end @testset "setindex! 
on SubDataFrame" begin @@ -989,6 +1146,19 @@ end @test df == DataFrame(a=[10, 2, 3], b=4:6, c=7:9) end + df = DataFrame(a=1:3, b=4:6, c=7:9) + for sdf in [view(df, :, :), view(df, :, 1:2), view(df, 1:2, :), view(df, 1:2, 1:2)] + df.a = [1,2,3] # make sure we have a fresh first column in each iteration + x = df.a + sdf[1, names(sdf)[1]] = 10 + @test df == DataFrame(a=[10, 2, 3], b=4:6, c=7:9) + @test x === df.a + @test_throws BoundsError sdf[0, names(sdf)[1]] = 100 + @test_throws ArgumentError sdf[true, names(sdf)[1]] = 100 + @test_throws MethodError sdf[1, names(sdf)[1]] = "a" + @test df == DataFrame(a=[10, 2, 3], b=4:6, c=7:9) + end + # `sdf[CartesianIndex(row, col)] = v` -> the same as `sdf[row, col] = v`; df = DataFrame(a=1:3, b=4:6, c=7:9) @@ -1048,6 +1218,16 @@ end @test_throws DimensionMismatch df[1, :] = Dict(:a=>10, :b=>11, :c=>12) @test df == DataFrame(a=1,b=2) + df = view(DataFrame(a=1,b=2), :, :) + df[1, :] = Dict("a"=>101, "b"=>111) + @test df == DataFrame(a=101,b=111) + df = view(DataFrame(a=1,b=2), :, :) + @test_throws ArgumentError df[1, :] = Dict("a"=>10, "c"=>11) + @test df == DataFrame(a=1,b=2) + df = view(DataFrame(a=1,b=2), :, :) + @test_throws DimensionMismatch df[1, :] = Dict("a"=>10, "b"=>11, "c"=>12) + @test df == DataFrame(a=1,b=2) + df = view(DataFrame(a=1,b=2), :, :) df[1, :] = (a=10, b=11) @test df == DataFrame(a=10,b=11) @@ -1092,7 +1272,20 @@ end end df = DataFrame(a=1:3, b=4:6, c=7:9) - for sdf in [view(df, :, :), view(df, :, 1:3), view(df, 1:3, :), view(df, 1:3, 1:3)] + for sdf in [view(df, :, :), view(df, :, 1:3), view(df, 1:3, :), + view(df, 1:3, 1:3), view(df, 1:3, ["a", "b", "c"])] + df."a" = [1,2,3] + x = df."a" + sdf[1:3, names(sdf)[1]] = 10:12 + @test sdf == DataFrame(a=10:12, b=4:6, c=7:9) + @test df.a === x + @test_throws MethodError sdf[1:3, names(sdf)[1]] = ["a", "b", "c"] + @test_throws ArgumentError sdf[1:3, "z"] = ["a", "b", "c"] + end + + df = DataFrame(a=1:3, b=4:6, c=7:9) + for sdf in [view(df, :, :), view(df, :, 
1:3), view(df, 1:3, :), + view(df, 1:3, 1:3), view(df, 1:3, ["a", "b", "c"])] df.a = [1,2,3] x = df.a sdf[:, 1] = 10:12 @@ -1105,10 +1298,22 @@ end # @test_throws ArgumentError sdf[:, 1] = 1 end + df = DataFrame(a=1:3, b=4:6, c=7:9) + for sdf in [view(df, :, :), view(df, :, 1:3), view(df, 1:3, :), + view(df, 1:3, 1:3), view(df, 1:3, ["a", "b", "c"])] + df.a = [1,2,3] + x = df.a + sdf[:, names(sdf)[1]] = 10:12 + @test df == DataFrame(a=10:12, b=4:6, c=7:9) + @test_throws MethodError sdf[:, names(sdf)[1]] = ["a", "b", "c"] + @test_throws ArgumentError sdf[:, "z"] = ["a", "b", "c"] + end + # `sdf[rows, cols] = v` -> set rows `rows` of columns `cols` in-place; # `v` can be an `AbstractMatrix` or `v` can be `AbstractDataFrame` when column names must match; - for (row_sel, col_sel) in [(:, :), (:, 1:3), (1:3, :), (1:3, 1:3)] + for (row_sel, col_sel) in [(:, :), (:, 1:3), (1:3, :), + (1:3, 1:3), (1:3, ["a", "b", "c"])] df = DataFrame(a=1:3, b=4:6, c=7:9) sdf = view(df, row_sel, col_sel) df2 = DataFrame(a=11:13, b=14:16, c=17:19) @@ -1135,10 +1340,11 @@ end end # Note that `sdf[!, col] = v` and `sdf.col = v` are not allowed as `sdf` can be only modified in-place. 
- for (row_sel, col_sel) in [(:, :), (:, 1:3), (1:3, :), (1:3, 1:3)] + for (row_sel, col_sel) in [(:, :), (:, 1:3), (1:3, :), (1:3, 1:3), (1:3, ["a", "b", "c"])] df = DataFrame(a=1:3, b=4:6, c=7:9) sdf = view(df, row_sel, col_sel) @test_throws ArgumentError sdf[!, 1] = [1,2,3] + @test_throws ArgumentError sdf[!, "a"] = [1,2,3] @test_throws ArgumentError sdf[!, 1:3] = ones(Int, 3, 3) # TODO: add this test after deprecation period # @test_throw ArgumentError sdf[!, 1] = [1,2,3] @@ -1164,14 +1370,23 @@ end @test df == DataFrame(a=[100, 2, 3], b=4:6, c=7:9) dfr[:a] = 'a' @test df == DataFrame(a=[97, 2, 3], b=4:6, c=7:9) + dfr["a"] = 'b' + @test df == DataFrame(a=[98, 2, 3], b=4:6, c=7:9) @test_throws ArgumentError dfr[:z] = 'z' - @test df == DataFrame(a=[97, 2, 3], b=4:6, c=7:9) - dfr.a = 'b' + @test_throws ArgumentError dfr["z"] = 'z' @test df == DataFrame(a=[98, 2, 3], b=4:6, c=7:9) + dfr.a = 'c' + @test df == DataFrame(a=[99, 2, 3], b=4:6, c=7:9) @test_throws ArgumentError dfr.z = 'z' - @test df == DataFrame(a=[98, 2, 3], b=4:6, c=7:9) + @test df == DataFrame(a=[99, 2, 3], b=4:6, c=7:9) @test_throws MethodError dfr.a = "a" - @test df == DataFrame(a=[98, 2, 3], b=4:6, c=7:9) + @test df == DataFrame(a=[99, 2, 3], b=4:6, c=7:9) + dfr."a" = 'd' + @test df == DataFrame(a=[100, 2, 3], b=4:6, c=7:9) + @test_throws ArgumentError dfr."z" = 'z' + @test df == DataFrame(a=[100, 2, 3], b=4:6, c=7:9) + @test_throws MethodError dfr."a" = "a" + @test df == DataFrame(a=[100, 2, 3], b=4:6, c=7:9) end # * `dfr[cols] = v` -> set values of entries in columns `cols` in `dfr` by elements of `v` in place; @@ -1411,7 +1626,75 @@ end @test_throws BoundsError view(df, r, [1,4]) @test_throws ArgumentError view(df, r, [1,2,1]) @test_throws ArgumentError view(df, r, [:x1,:x2,:x1]) + @test_throws ArgumentError view(df, r, ["x1","x2","x1"]) + end + end +end + +# just to check that dispatch works correctly +@testset "string indexing" begin + df_ref = DataFrame(a=1:3, b=4:6, c=7:9) + for df in
(df_ref[1:2, [2,1]], df_ref[1:2, ["b","a"]], + view(df_ref, 1:2, [2,1]), view(df_ref, 1:2, ["b","a"])) + @test df[1, "a"] == df[1, 2] + @test df[1:2, "a"] == df[1:2, 2] + @test df[1, ["a", "b"]] == df[1, [2,1]] + @test df[1:2, ["a", "b"]] == df[1:2, [2,1]] + @test df[:, ["a", "b"]] == df[:, [2,1]] + @test df[!, ["a", "b"]] == df[!, [2,1]] + + @test view(df, 1, "a") == view(df, 1, 2) + @test view(df, 1:2, "a") == view(df, 1:2, 2) + @test view(df, 1, ["a", "b"]) == view(df, 1, [2,1]) + @test view(df, 1:2, ["a", "b"]) == view(df, 1:2, [2,1]) + @test view(df, :, ["a", "b"]) == view(df, :, [2,1]) + @test view(df, !, ["a", "b"]) == view(df, !, [2,1]) + + df[1, "a"] = 100 + @test df[1, "a"] == 100 + df[1:2, "a"] = [20, 30] + @test df[1:2, "a"] == [20, 30] + df[:, "a"] = [30, 40] + @test df[:, "a"] == [30, 40] + if df isa DataFrame + df[!, "a"] = [1, 2] + @test df[!, "a"] == [1, 2] + else + @test_throws ArgumentError df[!, "a"] = [1, 2] end + + df[1, ["a", "b"]] = (a=1000, b=2000) + @test copy(df[1, ["a", "b"]]) == (a=1000, b=2000) + df[1:1, ["a"]] = ones(1,1) + @test df[1, "a"] == 1 + df[1, ["a", "b"]] .= 50 + @test copy(df[1, ["a", "b"]]) == (a=50, b=50) + df[1:1, ["a"]] .= 1 + @test df[1, "a"] == 1 + end + + df_ref."g1" = 11:13 + @test df_ref."g1" == 11:13 + df_ref[!, "g2"] = 11:13 + @test df_ref."g2" == 11:13 + df_ref[:, "g3"] = 11:13 + @test df_ref."g3" == 11:13 + + for dfr in (df_ref[1, [2,1]], df_ref[1, ["b","a"]], + view(df_ref, 1, [2,1]), view(df_ref, 1, ["b","a"])) + @test dfr["a"] == dfr[2] + @test dfr[["a", "b"]] == dfr[[2,1]] + @test view(dfr, "a") == view(dfr, 2) + @test view(dfr, ["a", "b"]) == view(dfr, [2,1]) + + dfr["a"] = 100 + @test dfr."a" == 100 + dfr[["a", "b"]] = (a=1000, b=2000) + @test copy(dfr) == (b=2000, a=1000) + + @test_throws MethodError dfr["a"] .= 100 + dfr[["a", "b"]] .= 50 + @test copy(dfr) == (b=50, a=50) end end diff --git a/test/iteration.jl b/test/iteration.jl index 7bbf6a7c2e..4c29a0bd02 100644 --- a/test/iteration.jl +++ 
b/test/iteration.jl @@ -108,7 +108,7 @@ end cols = eachcol(df) - @test keys(cols) == names(df) + @test keys(cols) == propertynames(df) for (a, b, c) in zip(keys(cols), cols, pairs(cols)) @test (a => b) == c end diff --git a/test/reshape.jl b/test/reshape.jl index a4b7dea7da..c18d4f0ccc 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -167,21 +167,21 @@ end df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2], variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"], value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0]) - @test_logs (:warn, "Missing value in variable variable at row 3. Skipping.") unstack(df, :variable, :value) + @test_logs (:warn, "Missing value in variable :variable at row 3. Skipping.") unstack(df, :variable, :value) udf = with_logger(NullLogger()) do unstack(df, :variable, :value) end - @test names(udf) == [:id, :a, :b, :missing] + @test propertynames(udf) == [:id, :a, :b, :missing] @test udf[!, :missing] ≅ [missing, 9.0, missing] df = DataFrame(id=[1, 1, 1, missing, missing, missing, 2, 2, 2], id2=[1, 1, 1, missing, missing, missing, 2, 2, 2], variable=["a", "b", missing, "a", "b", "missing", "a", "b", "missing"], value=[missing, 2.0, 3.0, 4.0, 5.0, missing, 7.0, missing, 9.0]) - @test_logs (:warn, "Missing value in variable variable at row 3. Skipping.") unstack(df, 3, 4) + @test_logs (:warn, "Missing value in variable :variable at row 3. 
Skipping.") unstack(df, 3, 4) udf = with_logger(NullLogger()) do unstack(df, 3, 4) end - @test names(udf) == [:id, :id2, :a, :b, :missing] + @test propertynames(udf) == [:id, :id2, :a, :b, :missing] @test udf[!, :missing] ≅ [missing, 9.0, missing] end @@ -209,7 +209,7 @@ end d = Array{Union{Float64, Missing}}(randn(12)), e = Array{Union{String, Missing}}(map(string, 'a':'l'))) - @test names(stack(d1, :a)) == [:b, :c, :d, :e, :variable, :value] + @test propertynames(stack(d1, :a)) == [:b, :c, :d, :e, :variable, :value] d1s = stack(d1, [:a, :b]) @test d1s == stack(d1, r"[ab]") @test d1s == stack(d1, Not(r"[cde]")) @@ -223,35 +223,35 @@ end @test d1s[1:12, :c] == d1[!, :c] @test d1s[13:24, :c] == d1[!, :c] @test d1s2 == d1s3 - @test names(d1s) == [:c, :d, :e, :variable, :value] + @test propertynames(d1s) == [:c, :d, :e, :variable, :value] @test d1s == d1m d1m = stack(d1[:, [1,3,4]], Not(:a)) - @test names(d1m) == [:a, :variable, :value] + @test propertynames(d1m) == [:a, :variable, :value] # Test naming of measure/value columns d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval) @test d1s_named == stack(d1, r"[ab]", variable_name=:letter, value_name=:someval) - @test names(d1s_named) == [:c, :d, :e, :letter, :someval] + @test propertynames(d1s_named) == [:c, :d, :e, :letter, :someval] d1m_named = stack(d1[:, [1,3,4]], Not(:a), variable_name=:letter, value_name=:someval) - @test names(d1m_named) == [:a, :letter, :someval] + @test propertynames(d1m_named) == [:a, :letter, :someval] # test empty measures or ids dx = stack(d1, [], [:a]) @test dx == stack(d1, r"xxx", r"a") @test size(dx) == (0, 3) - @test names(dx) == [:a, :variable, :value] + @test propertynames(dx) == [:a, :variable, :value] dx = stack(d1, :a, []) @test dx == stack(d1, r"a", r"xxx") @test size(dx) == (12, 2) - @test names(dx) == [:variable, :value] + @test propertynames(dx) == [:variable, :value] dx = stack(d1, [:a], []) @test dx == stack(d1, r"a", r"xxx") @test size(dx) == (12, 
2) - @test names(dx) == [:variable, :value] + @test propertynames(dx) == [:variable, :value] dx = stack(d1, [], :a) @test dx == stack(d1, r"xxx", r"a") @test size(dx) == (0, 3) - @test names(dx) == [:a, :variable, :value] + @test propertynames(dx) == [:a, :variable, :value] @test stack(d1, :a, view=true) == stack(d1, [:a], view=true) @test all(isa.(eachcol(stack(d1, :a, view=true)), @@ -305,17 +305,17 @@ end @test d1s[1:12, :c] == d1[!, :c] @test d1s[13:24, :c] == d1[!, :c] @test d1s2 == d1s3 - @test names(d1s) == [:c, :d, :e, :variable, :value] + @test propertynames(d1s) == [:c, :d, :e, :variable, :value] @test d1s == d1m d1m = stack(d1[:, [1,3,4]], Not(:a), view=true) - @test names(d1m) == [:a, :variable, :value] + @test propertynames(d1m) == [:a, :variable, :value] d1s_named = stack(d1, [:a, :b], variable_name=:letter, value_name=:someval, view=true) @test d1s_named == stack(d1, r"[ab]", variable_name=:letter, value_name=:someval, view=true) - @test names(d1s_named) == [:c, :d, :e, :letter, :someval] + @test propertynames(d1s_named) == [:c, :d, :e, :letter, :someval] d1m_named = stack(d1, Not([:c, :d, :e]), variable_name=:letter, value_name=:someval, view=true) @test d1m_named == stack(d1, Not(r"[cde]"), variable_name=:letter, value_name=:someval, view=true) - @test names(d1m_named) == [:c, :d, :e, :letter, :someval] + @test propertynames(d1m_named) == [:c, :d, :e, :letter, :someval] d1s[!, :id] = Union{Int, Missing}[1:12; 1:12] d1s2[!, :id] = Union{Int, Missing}[1:12; 1:12] @@ -369,24 +369,29 @@ end df_tup = DataFrame(a = [1, 2], b = [(1, 2), (3, 4)]) ref = DataFrame(a = [1, 1, 2, 2], b = [1, 2, 3, 4]) @test flatten(df_vec, :b) == flatten(df_tup, :b) == ref + @test flatten(df_vec, "b") == flatten(df_tup, "b") == ref df_mixed_types = DataFrame(a = [1, 2], b = [[1, 2], ["x", "y"]]) ref_mixed_types = DataFrame(a = [1, 1, 2, 2], b = [1, 2, "x", "y"]) @test flatten(df_mixed_types, :b) == ref_mixed_types df_three = DataFrame(a = [1, 2, 3], b = [[1, 2], [10, 20], 
[100, 200, 300]]) ref_three = DataFrame(a = [1, 1, 2, 2, 3, 3, 3], b = [1, 2, 10, 20, 100, 200, 300]) @test flatten(df_three, :b) == ref_three + @test flatten(df_three, "b") == ref_three df_gen = DataFrame(a = [1, 2], b = [(i for i in 1:5), (i for i in 6:10)]) ref_gen = DataFrame(a = [fill(1, 5); fill(2, 5)], b = collect(1:10)) @test flatten(df_gen, :b) == ref_gen + @test flatten(df_gen, "b") == ref_gen df_miss = DataFrame(a = [1, 2], b = [Union{Missing, Int}[1, 2], Union{Missing, Int}[3, 4]]) ref = DataFrame(a = [1, 1, 2, 2], b = [1, 2, 3, 4]) @test flatten(df_miss, :b) == ref + @test flatten(df_miss, "b") == ref v1 = [[1, 2], [3, 4]] v2 = [[5, 6], [7, 8]] v = [v1, v2] df_vec_vec = DataFrame(a = [1, 2], b = v) ref_vec_vec = DataFrame(a = [1, 1, 2, 2], b = [v1 ; v2]) @test flatten(df_vec_vec, :b) == ref_vec_vec + @test flatten(df_vec_vec, "b") == ref_vec_vec df_cat = DataFrame(a = [1, 2], b = [CategoricalArray([1, 2]), CategoricalArray([1, 2])]) df_flat_cat = flatten(df_cat, :b) ref_cat = DataFrame(a = [1, 1, 2, 2], b = [1, 2, 1, 2]) @@ -400,6 +405,8 @@ end ref = DataFrame(a = [1, 1, 2, 2], b = [1, 2, 3, 4], c = [5, 6, 7, 8]) @test flatten(df, [:b, :c]) == ref @test flatten(df, [:c, :b]) == ref + @test flatten(df, ["b", "c"]) == ref + @test flatten(df, ["c", "b"]) == ref @test flatten(df, 2:3) == ref @test flatten(df, r"[bc]") == ref @test flatten(df, Not(:a)) == ref diff --git a/test/runtests.jl b/test/runtests.jl index 26446fe6d8..b48de847c9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -30,6 +30,7 @@ my_tests = ["utils.jl", "tabletraits.jl", "indexing.jl", "broadcasting.jl", + "string.jl", "deprecated.jl"] println("Running tests:") diff --git a/test/select.jl b/test/select.jl index 54dbdcce78..005b87e222 100644 --- a/test/select.jl +++ b/test/select.jl @@ -253,48 +253,48 @@ end d = copy(df, copycols=false) select!(d, [:a, :e, :c]) - @test names(d) == [:a, :e, :c] + @test propertynames(d) == [:a, :e, :c] @test d.a === df.a @test d.e === df.e @test d.c 
=== df.c d = copy(df, copycols=false) select!(d, r"[aec]") - @test names(d) == [:a, :c, :e] + @test propertynames(d) == [:a, :c, :e] @test d.a === df.a @test d.e === df.e @test d.c === df.c d = copy(df, copycols=false) select!(d, [true, false, true, false, true]) - @test names(d) == [:a, :c, :e] + @test propertynames(d) == [:a, :c, :e] @test d.a === df.a @test d.c === df.c @test d.e === df.e d = copy(df, copycols=false) select!(d, [:d, :e, :a, :c, :b]) - @test names(d) == [:d, :e, :a, :c, :b] + @test propertynames(d) == [:d, :e, :a, :c, :b] for i in [:d, :e, :a, :c, :b] @test d[!, i] === df[!, i] end d = copy(df, copycols=false) select!(d, [2, 5, 3]) - @test names(d) == [:b, :e, :c] + @test propertynames(d) == [:b, :e, :c] @test d.b === df.b @test d.e === df.e @test d.c === df.c d = copy(df, copycols=false) select!(d, 2:3) - @test names(d) == [:b, :c] + @test propertynames(d) == [:b, :c] @test d.b === df.b @test d.c === df.c d = copy(df, copycols=false) select!(d, 2) - @test names(d) == [:b] + @test propertynames(d) == [:b] @test d.b === df.b end @@ -312,7 +312,7 @@ end @test select(df, Not(r""), copycols=false) == DataFrame() d = select(df, [:a, :e, :c]) - @test names(d) == [:a, :e, :c] + @test propertynames(d) == [:a, :e, :c] @test d.a !== df.a @test d.e !== df.e @test d.c !== df.c @@ -321,7 +321,7 @@ end @test d.c == df.c d = select(df, r"[aec]") - @test names(d) == [:a, :c, :e] + @test propertynames(d) == [:a, :c, :e] @test d.a !== df.a @test d.e !== df.e @test d.c !== df.c @@ -330,7 +330,7 @@ end @test d.c == df.c d = select(df, [true, false, true, false, true]) - @test names(d) == [:a, :c, :e] + @test propertynames(d) == [:a, :c, :e] @test d.a !== df.a @test d.c !== df.c @test d.e !== df.e @@ -339,7 +339,7 @@ end @test d.e == df.e d = select(df, [2, 5, 3]) - @test names(d) == [:b, :e, :c] + @test propertynames(d) == [:b, :e, :c] @test d.b !== df.b @test d.e !== df.e @test d.c !== df.c @@ -348,48 +348,48 @@ end @test d.c == df.c d = select(df, 2:3) - @test 
names(d) == [:b, :c] + @test propertynames(d) == [:b, :c] @test d.b !== df.b @test d.c !== df.c @test d.b == df.b @test d.c == df.c d = select(df, 2) - @test names(d) == [:b] + @test propertynames(d) == [:b] @test d.b !== df.b @test d.b == df.b d = select(df, [:a, :e, :c], copycols=false) - @test names(d) == [:a, :e, :c] + @test propertynames(d) == [:a, :e, :c] @test d.a === df.a @test d.e === df.e @test d.c === df.c d = select(df, r"[aec]", copycols=false) - @test names(d) == [:a, :c, :e] + @test propertynames(d) == [:a, :c, :e] @test d.a === df.a @test d.e === df.e @test d.c === df.c d = select(df, [true, false, true, false, true], copycols=false) - @test names(d) == [:a, :c, :e] + @test propertynames(d) == [:a, :c, :e] @test d.a === df.a @test d.c === df.c @test d.e === df.e d = select(df, [2, 5, 3], copycols=false) - @test names(d) == [:b, :e, :c] + @test propertynames(d) == [:b, :e, :c] @test d.b === df.b @test d.e === df.e @test d.c === df.c d = select(df, 2:3, copycols=false) - @test names(d) == [:b, :c] + @test propertynames(d) == [:b, :c] @test d.b === df.b @test d.c === df.c d = select(df, 2, copycols=false) - @test names(d) == [:b] + @test propertynames(d) == [:b] @test d.b === df.b end @@ -408,7 +408,7 @@ end d = select(df, [:a, :e, :c]) @test d isa DataFrame - @test names(d) == [:a, :e, :c] + @test propertynames(d) == [:a, :e, :c] @test d.a !== df.a @test d.e !== df.e @test d.c !== df.c @@ -418,7 +418,7 @@ end d = select(df, r"[aec]") @test d isa DataFrame - @test names(d) == [:a, :c, :e] + @test propertynames(d) == [:a, :c, :e] @test d.a !== df.a @test d.e !== df.e @test d.c !== df.c @@ -428,7 +428,7 @@ end d = select(df, [true, false, true, false, true]) @test d isa DataFrame - @test names(d) == [:a, :c, :e] + @test propertynames(d) == [:a, :c, :e] @test d.a !== df.a @test d.c !== df.c @test d.e !== df.e @@ -438,7 +438,7 @@ end d = select(df, [2, 5, 3]) @test d isa DataFrame - @test names(d) == [:b, :e, :c] + @test propertynames(d) == [:b, :e, :c] 
@test d.b !== df.b @test d.e !== df.e @test d.c !== df.c @@ -448,7 +448,7 @@ end d = select(df, 2:3) @test d isa DataFrame - @test names(d) == [:b, :c] + @test propertynames(d) == [:b, :c] @test d.b !== df.b @test d.c !== df.c @test d.b == df.b @@ -456,47 +456,47 @@ end d = select(df, 2) @test d isa DataFrame - @test names(d) == [:b] + @test propertynames(d) == [:b] @test d.b !== df.b @test d.b == df.b d = select(df, [:a, :e, :c], copycols=false) @test d isa SubDataFrame - @test names(d) == [:a, :e, :c] + @test propertynames(d) == [:a, :e, :c] @test d.a === df.a @test d.e === df.e @test d.c === df.c d = select(df, r"[aec]", copycols=false) @test d isa SubDataFrame - @test names(d) == [:a, :c, :e] + @test propertynames(d) == [:a, :c, :e] @test d.a === df.a @test d.e === df.e @test d.c === df.c d = select(df, [true, false, true, false, true], copycols=false) @test d isa SubDataFrame - @test names(d) == [:a, :c, :e] + @test propertynames(d) == [:a, :c, :e] @test d.a === df.a @test d.c === df.c @test d.e === df.e d = select(df, [2, 5, 3], copycols=false) @test d isa SubDataFrame - @test names(d) == [:b, :e, :c] + @test propertynames(d) == [:b, :e, :c] @test d.b === df.b @test d.e === df.e @test d.c === df.c d = select(df, 2:3, copycols=false) @test d isa SubDataFrame - @test names(d) == [:b, :c] + @test propertynames(d) == [:b, :c] @test d.b === df.b @test d.c === df.c d = select(df, 2, copycols=false) @test d isa SubDataFrame - @test names(d) == [:b] + @test propertynames(d) == [:b] @test d.b === df.b end @@ -611,7 +611,7 @@ end select!(df, :x1 => :x2, :x2 => :x1) @test x1 === df.x2 @test x2 === df.x1 - @test names(df) == [:x2, :x1] + @test names(df) == ["x2", "x1"] df = DataFrame(rand(10, 4)) select!(df, :x1, :x1 => :x2) @@ -664,7 +664,7 @@ end df2 = select(df, :x2, :, :x1 => ByRow(x -> x^2) => :r1, :x1 => (x -> x .^ 2) => :r2, [:x1, :x2] => (+) => :x1, 1:2 => ByRow(/) => :x3, :x1 => :x4) - @test names(df2) == [:x2, :x1, :x3, :x4, :r1, :r2] + @test propertynames(df2) 
== [:x2, :x1, :x3, :x4, :r1, :r2] @test df.x2 == df2.x2 @test df.x2 !== df2.x2 @test df.x1 == df2.x4 @@ -679,7 +679,7 @@ end df2 = select(df, :x2, :, :x1 => ByRow(x -> x^2) => :r1, :x1 => (x -> x .^ 2) => :r2, [:x1, :x2] => (+) => :x1, 1:2 => ByRow(/) => :x3, :x1 => :x4, copycols=false) - @test names(df2) == [:x2, :x1, :x3, :x4, :r1, :r2] + @test propertynames(df2) == [:x2, :x1, :x3, :x4, :r1, :r2] @test df.x2 === df2.x2 @test df.x1 === df2.x4 @test df2.r1 == df.x1 .^ 2 @@ -690,7 +690,7 @@ end x1, x2, x3, x4 = df.x1, df.x2, df.x3, df.x4 select!(df, :x2, :, :x1 => ByRow(x -> x^2) => :r1, :x1 => (x -> x .^ 2) => :r2, [:x1, :x2] => (+) => :x1, 1:2 => ByRow(/) => :x3, :x1 => :x4) - @test names(df2) == [:x2, :x1, :x3, :x4, :r1, :r2] + @test propertynames(df) == [:x2, :x1, :x3, :x4, :r1, :r2] @test x2 === df.x2 @test x1 === df.x4 @test df.r1 == x1 .^ 2 @@ -876,16 +876,16 @@ end DataFrame([6 1 9], [:d, :b, :a]) res = select(df3, [] => (() -> v) => :a, :x1 => x -> []) - @test names(res) == [:a, :x1_function] && nrow(res) == 0 + @test propertynames(res) == [:a, :x1_function] && nrow(res) == 0 @test eltype.(eachcol(res)) == [Int, Any] res = select(df3, :x1 => x -> [], [] => (() -> v) => :a) - @test names(res) == [:x1_function, :a] && nrow(res) == 0 + @test propertynames(res) == [:x1_function, :a] && nrow(res) == 0 @test eltype.(eachcol(res)) == [Any, Int] res = select(df3, [] => (() -> v) => :a, :x1) - @test names(res) == [:a, :x1] && nrow(res) == 0 + @test propertynames(res) == [:a, :x1] && nrow(res) == 0 @test eltype.(eachcol(res)) == [Int, Char] res = select(df3, :x1, [] => (() -> v) => :a) - @test names(res) == [:x1, :a] && nrow(res) == 0 + @test propertynames(res) == [:x1, :a] && nrow(res) == 0 @test eltype.(eachcol(res)) == [Char, Int] end @test_throws ArgumentError select(df, [] => (() -> [9]) => :a, :) @@ -1124,7 +1124,7 @@ end df.a = a @test_throws DomainError select!(df, :a => x -> sqrt(-1)) @test df.a === a - @test propertynames(df) == (:a,) + @test
propertynames(df) == [:a,] end end # module diff --git a/test/sort.jl b/test/sort.jl index 5187caf816..f0bb6d10d7 100644 --- a/test/sort.jl +++ b/test/sort.jl @@ -12,7 +12,7 @@ using DataFrames, Random, Test @test sortperm(d) == sortperm(dv1) @test sortperm(d[:, [:dv3, :dv1]]) == sortperm(dv3) - @test sort(d, :dv1)[!, :dv3] == sortperm(dv1) + @test sort(d, :dv1)[!, :dv3] == sort(d, "dv1")[!, "dv3"] == sortperm(dv1) @test sort(d, :dv2)[!, :dv3] == sortperm(dv1) @test sort(d, :cv1)[!, :dv3] == sortperm(dv1) @test sort(d, [:dv1, :cv1])[!, :dv3] == sortperm(dv1) @@ -25,27 +25,48 @@ using DataFrames, Random, Test @test issorted(sort(df)) @test issorted(sort(df, rev=true), rev=true) @test issorted(sort(df, [:chrom,:pos])[:, [:chrom,:pos]]) + @test issorted(sort(df, ["chrom", "pos"])[:, ["chrom", "pos"]]) ds = sort(df, [order(:rank, rev=true),:chrom,:pos]) @test issorted(ds, [order(:rank, rev=true),:chrom,:pos]) @test issorted(ds, rev=(true, false, false)) + ds = sort(df, [order("rank", rev=true), "chrom", "pos"]) + @test issorted(ds, [order("rank", rev=true), "chrom", "pos"]) + @test issorted(ds, rev=(true, false, false)) + ds2 = sort(df, [:rank, :chrom, :pos], rev=(true, false, false)) @test issorted(ds2, [order(:rank, rev=true), :chrom, :pos]) @test issorted(ds2, rev=(true, false, false)) @test ds2 == ds + ds2 = sort(df, ["rank", "chrom", "pos"], rev=(true, false, false)) + @test issorted(ds2, [order("rank", rev=true), "chrom", "pos"]) + @test issorted(ds2, rev=(true, false, false)) + + @test ds2 == ds + sort!(df, [:rank, :chrom, :pos], rev=(true, false, false)) @test issorted(df, [order(:rank, rev=true), :chrom, :pos]) @test issorted(df, rev=(true, false, false)) @test df == ds + sort!(df, ["rank", "chrom", "pos"], rev=(true, false, false)) + @test issorted(df, [order("rank", rev=true), "chrom", "pos"]) + @test issorted(df, rev=(true, false, false)) + + @test df == ds + df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"]) @test !issorted(df, :x) @test 
issorted(sort(df, :x), :x) + df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"]) + @test !issorted(df, "x") + @test issorted(sort(df, "x"), "x") + x = DataFrame(a=1:3,b=3:-1:1,c=3:-1:1) @test issorted(x) @test !issorted(x, [:b,:c]) @@ -53,6 +74,13 @@ using DataFrames, Random, Test @test issorted(sort(x,[2,3]), [:b,:c]) @test issorted(sort(x[:, 2:3]), [:b,:c]) + x = DataFrame(a=1:3,b=3:-1:1,c=3:-1:1) + @test issorted(x) + @test !issorted(x, ["b","c"]) + @test !issorted(x[:, 2:3], ["b","c"]) + @test issorted(sort(x,[2,3]), ["b","c"]) + @test issorted(sort(x[:, 2:3]), ["b","c"]) + # Check that columns that shares the same underlying array are only permuted once PR#1072 df = DataFrame(a=[2,1]) df.b = df.a diff --git a/test/string.jl b/test/string.jl new file mode 100644 index 0000000000..817dc830aa --- /dev/null +++ b/test/string.jl @@ -0,0 +1,266 @@ +module TestStringIndexing + +using Test, DataFrames + +@testset "iteration" begin + df = DataFrame(a=1:2, b=3:4) + er = eachrow(df) + ec = eachcol(df) + @test er.a == er."a" == ec.a == ec."a" == df.a == df."a" + @test_throws ArgumentError er.c + @test_throws ArgumentError er."c" + @test_throws ArgumentError ec.c + @test_throws ArgumentError ec."c" + @test_throws ArgumentError df.c + @test_throws ArgumentError df."c" + @test hasproperty(er, :a) == hasproperty(er, "a") == + hasproperty(ec, :a) == hasproperty(ec, "a") == + hasproperty(df, :a) == hasproperty(df, "a") == true + @test hasproperty(er, :c) == hasproperty(er, "c") == + hasproperty(ec, :c) == hasproperty(ec, "c") == + hasproperty(df, :c) == hasproperty(df, "c") == false + + @test keys(er) == 1:2 + @test propertynames(er) == propertynames(ec) == propertynames(df) == + keys(ec) == [:a, :b] + @test_throws MethodError keys(df) +end + +@testset "joins" begin + df1 = DataFrame(a = 1, b = 2) + df2 = DataFrame(a = 1, c = 2) + df3 = DataFrame(a = 1, d = 2) + + # only check if the output is the same in all cases + for f in (innerjoin, leftjoin, rightjoin, outerjoin, 
antijoin, semijoin) + @test f(df1, df2, on=:a) == f(df1, df2, on="a") == + f(df1, df2, on=[:a]) == f(df1, df2, on=["a"]) == + f(df1, df2, on=:a => :a) == f(df1, df2, on="a" => "a") == + f(df1, df2, on=[:a => :a]) == f(df1, df2, on=["a" => "a"]) + @test_throws TypeError f(df1, df2, on = :a => "a") + @test_throws ArgumentError f(df1, df2, on = [:a => "a"]) + + if f === innerjoin || f === outerjoin + @test f(df1, df2, df3, on=:a) == f(df1, df2, df3, on="a") == + f(df1, df2, df3, on=[:a]) == f(df1, df2, df3, on=["a"]) == + f(df1, df2, df3, on=:a => :a) == f(df1, df2, df3, on="a" => "a") == + f(df1, df2, df3, on=[:a => :a]) == f(df1, df2, df3, on=["a" => "a"]) + @test_throws TypeError f(df1, df2, df3, on = :a => "a") + @test_throws ArgumentError f(df1, df2, df3, on = [:a => "a"]) + end + end +end + +@testset "reshape" begin + df = DataFrame(a = repeat([1:3;], inner = [4]), + b = repeat([1:4;], inner = [3]), + c = 1:12, d = 1.0:12.0, + e = map(string, 'a':'l')) + + # only check if the output is the same in all cases + for v in (true, false) + @test stack(df, [:c, :d], [:a], variable_name=:varn, value_name=:valn, view=v) == + stack(df, ["c", "d"], ["a"], variable_name="varn", value_name="valn", view=v) + end + + wide = DataFrame(id = 1:12, + a = repeat([1:3;], inner = [4]), + b = repeat([1:4;], inner = [3]), + c = randn(12), + d = randn(12)) + + long = stack(wide) + @test unstack(long, :variable, :value) == unstack(long, "variable", "value") + @test unstack(long, :id, :variable, :value) == + unstack(long, "id", "variable", "value") + @test unstack(long, [:id, :a], :variable, :value) == + unstack(long, ["id", "a"], "variable", "value") + @test unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x)) == + unstack(long, "id", "variable", "value", renamecols=x->"_"*x) +end + +@testset "selection" begin + df = DataFrame(a = 1:2, b=3:4) + # only check if the output is the same in all cases + @test select(df, :a, :b => :d, :b => identity => :d2, :b => identity, + [:a, 
:a] => (+), [:a, :a] => (+) => :e, AsTable(:a) => ByRow(first), + nrow => :xxx) == + select(df, "a", "b" => "d", "b" => identity => "d2", "b" => identity, + ["a", "a"] => (+), ["a", "a"] => (+) => "e", AsTable("a") => ByRow(first), + nrow => "xxx") + @test transform(df, :a, :b => :d, :b => identity => :d2, :b => identity, + [:a, :a] => (+), [:a, :a] => (+) => :e, AsTable(:a) => ByRow(first), + nrow => :xxx) == + transform(df, "a", "b" => "d", "b" => identity => "d2", "b" => identity, + ["a", "a"] => (+), ["a", "a"] => (+) => "e", AsTable("a") => ByRow(first), + nrow => "xxx") + @test select(df, [:a]) == select(df, ["a"]) == + select(df, :a) == select(df, "a") + @test transform(df, [:a]) == transform(df, ["a"]) == + transform(df, :a) == transform(df, "a") + + df2 = copy(df) + @test select!(df, :a, :b => :d, :b => identity => :d2, :b => identity, + [:a, :a] => (+), [:a, :a] => (+) => :e, AsTable(:a) => ByRow(first), + nrow => :xxx) == + select!(df2, "a", "b" => "d", "b" => identity => "d2", "b" => identity, + ["a", "a"] => (+), ["a", "a"] => (+) => "e", AsTable("a") => ByRow(first), + nrow => "xxx") + + df = DataFrame(a = 1:2, b=3:4) + df2 = copy(df) + @test select!(df, [:a]) == select!(df2, ["a"]) + + df = DataFrame(a = 1:2, b=3:4) + df2 = copy(df) + @test select!(df, :a) == select!(df2, "a") + + df = DataFrame(a = 1:2, b=3:4) + df2 = copy(df) + @test transform!(df, :a, :b => :d, :b => identity => :d2, :b => identity, + [:a, :a] => (+), [:a, :a] => (+) => :e, AsTable(:a) => ByRow(first), + nrow => :xxx) == + transform!(df2, "a", "b" => "d", "b" => identity => "d2", "b" => identity, + ["a", "a"] => (+), ["a", "a"] => (+) => "e", AsTable("a") => ByRow(first), + nrow => "xxx") + + df = DataFrame(a = 1:2, b=3:4) + df2 = copy(df) + @test transform!(df, [:a]) == transform!(df2, ["a"]) + + df = DataFrame(a = 1:2, b=3:4) + df2 = copy(df) + @test transform!(df, :a) == transform!(df2, "a") + + df = DataFrame(a = 1:2, b=3:4) + @test_throws ArgumentError select(df, [:a, "b"]) + 
@test_throws ArgumentError select(df, ["a", :b])
+    @test_throws ArgumentError select(df, ["a", "b", "a"])
+end
+
+@testset "tables" begin # Symbol and String lookups must agree
+    df = DataFrame(a = 1:2, b=3:4)
+
+    @test columnindex(df, :a) == columnindex(df, "a")
+    @test columnindex(df, :c) == columnindex(df, "c") # :c / "c" is not a column of df
+
+    @test Tables.schema(df) == Tables.schema(Tables.columntable(df))
+    @test [:a, :b] == Tables.columnnames(df) == Tables.columnnames(df[1,:]) ==
+          Tables.columnnames(eachrow(df)) == Tables.columnnames(eachcol(df))
+end
+
+@testset "split-apply-combine" begin
+    df = DataFrame(g=[1,1,1,2,2], a=1:5)
+
+    # only check if the output is the same in all cases
+    gdf = groupby(df, :g)
+    @test gdf == groupby(df, "g")
+    @test groupby(df, [:g, :a]) == groupby(df, ["g", "a"])
+
+    @test names(gdf) == ["g", "a"]
+
+    k = keys(gdf)
+    @test names(k[1]) == ["g"]
+    @test propertynames(k[1]) == keys(k[1]) == [:g]
+    @test haskey(k[1], :g) == haskey(k[1], "g") == true
+    @test haskey(k[1], :a) == haskey(k[1], "a") == false
+    @test k[1].g == k[1]."g" == k[1][:g] == k[1]["g"]
+
+    @test by(df, :g, :a) == by(df, "g", "a") == combine(gdf, :a) == combine(gdf, "a") ==
+          by(df, :g, [:a]) == by(df, "g", ["a"]) == combine(gdf, [:a]) == combine(gdf, ["a"])
+
+    @test map("a" => identity, gdf) == map(:a => identity, gdf)
+    @test map(["a"] => identity, gdf) == map([:a] => identity, gdf)
+    @test map(nrow => :n, gdf) == map(nrow => "n", gdf)
+
+    @test combine("a" => identity, gdf) == combine(:a => identity, gdf) ==
+          combine(gdf, "a" => identity) == combine(gdf, :a => identity) ==
+          by("a" => identity, df, :g) == by(:a => identity, df, :g) ==
+          by(df, :g, "a" => identity) == by(df, :g, :a => identity)
+    @test combine(["a"] => identity, gdf) == combine([:a] => identity, gdf) ==
+          combine(gdf, ["a"] => identity) == combine(gdf, [:a] => identity) ==
+          by(["a"] => identity, df, :g) == by([:a] => identity, df, :g) ==
+          by(df, :g, ["a"] => identity) == by(df, :g, [:a] => identity)
+    @test combine(nrow => :n, gdf) == combine(nrow => "n", gdf) ==
+          combine(gdf, nrow => :n) == combine(gdf, nrow => "n") ==
+          by(nrow => :n, df, :g) == by(nrow => "n", df, :g) ==
+          by(df, :g, nrow => :n) == by(df, :g, nrow => "n")
+end
+
+@testset "DataFrameRow" begin
+    dfr = DataFrame(a=1:2, b=3:4, c=5:6)[2, ["c", "a"]] # row 2, columns reordered to c, a
+    @test names(dfr) == ["c", "a"]
+    @test names(dfr, "a") == names(dfr, :a) == names(dfr, 2) == names(dfr, Not("c")) ==
+          names(dfr, All("a")) == names(dfr, Between("a", "a")) == ["a"]
+    @test keys(dfr) == propertynames(dfr) == [:c, :a]
+    @test haskey(dfr, :a) == haskey(dfr, "a") == true
+    @test haskey(dfr, :z) == haskey(dfr, "z") == false
+    @test hasproperty(dfr, :a) == hasproperty(dfr, "a") == true
+    @test hasproperty(dfr, :z) == hasproperty(dfr, "z") == false
+    @test dfr["a"] == dfr[:a] == dfr[2]
+    dfr["a"] = 1000
+    @test dfr."a" == 1000
+    dfr[1:2] = Dict("a" => 100, "c" => 500)
+    @test dfr.a == 100
+    @test dfr.c == 500
+    @test_throws ArgumentError dfr[1:2] = Dict("a" => 100, "f" => 500) # "f" is not a column
+    @test_throws ArgumentError dfr[1:2] = Dict("a" => 100, :c => 500) # mixed key types
+end
+
+@testset "names, propertynames and hasproperty" begin
+    df = DataFrame(a=1, x1=2, x2=3, x3=4)
+    sdf = view(df, 1:1, 1:3) # range row index -> SubDataFrame
+    dfr = view(df, 1, 1:3) # integer row index -> DataFrameRow (1:1 here just duplicated sdf)
+    er = eachrow(df)
+    ec = eachcol(df)
+    gdf = groupby(df, :a)
+
+    for v in [df, er, ec] # objects exposing all four columns
+        @test names(v) == ["a", "x1", "x2", "x3"]
+        @test propertynames(v) == [:a, :x1, :x2, :x3]
+        @test hasproperty(v, :a)
+        @test !hasproperty(v, :x)
+        @test hasproperty(v, "a")
+        @test !hasproperty(v, "x")
+        @test names(v, 1) == names(v, :a) == names(v, "a") == ["a"]
+        @test names(v, Not(:a)) == names(v, Not("a")) == names(v, Not(1)) ==
+              names(v, r"x") == ["x1", "x2", "x3"]
+    end
+
+    @test names(gdf) == ["a", "x1", "x2", "x3"]
+    @test names(gdf, 1) == names(gdf, :a) == names(gdf, "a") == ["a"]
+    @test names(gdf, Not(:a)) == names(gdf, Not("a")) == names(gdf, Not(1)) ==
+          names(gdf, r"x") == ["x1", "x2", "x3"]
+
+
+    for v in [sdf, dfr] # views restricted to the first three columns
+        @test names(v) == ["a", "x1", "x2"]
+        @test propertynames(v) == [:a, :x1, :x2]
+ @test names(v, 1) == names(v, :a) == names(v, "a") == ["a"] + @test names(v, Not(:a)) == names(v, Not("a")) == names(v, Not(1)) == + names(v, r"x") == ["x1", "x2"] + @test hasproperty(v, :a) + @test !hasproperty(v, :x) + @test hasproperty(v, "a") + @test !hasproperty(v, "x") + end +end + +@testset "append! and push!" begin + df = DataFrame(a=1, b=2) + append!(df, Dict("a" => 2, "b" => 3)) + @test df == DataFrame(a=1:2, b=2:3) + + df = DataFrame() + append!(df, Dict("a" => 2, "b" => 3)) + @test df == DataFrame(a=2, b=3) + + df = DataFrame(a=1, b=2) + push!(df, Dict("a" => 2, "b" => 3)) + @test df == DataFrame(a=1:2, b=2:3) + @test_throws ArgumentError push!(df, Dict("a" => 2, "b" => 3), cols=:orderequal) + push!(df, Dict("b" => 4, "c" => 0), cols=:union) + @test isequal(df, DataFrame(a=[1:2; missing], b=2:4, c=[missing, missing, 0])) +end + +end # module diff --git a/test/subdataframe.jl b/test/subdataframe.jl index 59edc58d4c..a02be46fcf 100644 --- a/test/subdataframe.jl +++ b/test/subdataframe.jl @@ -187,7 +187,7 @@ end y = collect(1.0:10.0) df = view(DataFrame(:x=>x, :y=>y, copycols=false), 2:6, :) - @test Base.propertynames(df) == Tuple(names(df)) + @test propertynames(df) == Symbol.(names(df)) @test df.x == 2:6 @test df.y == 2:6 @@ -212,7 +212,8 @@ end @test !haskey(DataFrames.index(df2), 2) @test !haskey(DataFrames.index(df2), 0) @test_throws ArgumentError haskey(DataFrames.index(df2), true) - @test keys(DataFrames.index(df2)) == [:y] + @test names(DataFrames.index(df2)) == ["y"] + @test DataFrames._names(DataFrames.index(df2)) == [:y] x = DataFrame(ones(5,4)) df = view(x, 2:3, 2:3) diff --git a/test/tables.jl b/test/tables.jl index 2c7b548ea6..6df913b582 100644 --- a/test/tables.jl +++ b/test/tables.jl @@ -92,7 +92,7 @@ Base.propertynames(d::DuplicateNamesColumnTable) = (:a, :a, :b) and_back = DataFrame(bare_rows) @test and_back isa DataFrame - @test names(and_back) == [:a, :b] + @test names(and_back) == ["a", "b"] @test and_back.a == df.a @test and_back.b 
== df.b end @@ -104,7 +104,7 @@ Base.propertynames(d::DuplicateNamesColumnTable) = (:a, :a, :b) and_back = DataFrame(cols) @test and_back isa DataFrame - @test names(and_back) == [:a, :b] + @test names(and_back) == ["a", "b"] @test and_back.a == df.a == Tables.getcolumn(df, :a) == Tables.getcolumn(df, 1) @test and_back.b == df.b end @@ -209,12 +209,12 @@ end @test all(((a,b),) -> a === b, zip(eachcol(df), eachcol(df2))) df2 = DataFrame(eachcol(df)) - @test names(df2) == [:x1, :x2, :x3, :x4] + @test propertynames(df2) == [:x1, :x2, :x3, :x4] @test all(((a,b),) -> a == b, zip(eachcol(df), eachcol(df2))) @test !any(((a,b),) -> a === b, zip(eachcol(df), eachcol(df2))) df2 = DataFrame(eachcol(df)) - @test names(df2) == [:x1, :x2, :x3, :x4] + @test propertynames(df2) == [:x1, :x2, :x3, :x4] @test !any(((a,b),) -> a === b, zip(eachcol(df), eachcol(df2))) @test Tables.rowtable(df) == Tables.rowtable(eachrow(df))