diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index a40627c6a..ea407d495 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -197,18 +197,30 @@ julia> rename!(uppercase, df) ``` """ function rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol}; - makeunique::Bool=false) - rename!(index(df), vals, makeunique=makeunique) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing) + if !makeunique && isa(mergeduplicates, Function) + (new_columns, colindex) = process_updates(UpdateIndex(vals), _columns(df), mergeduplicates) + # Now we must replace the columns and index with the new ones in place... + splice!(_columns(df), 1:length(_columns(df)), new_columns) # Replace the columns with these new ones... + rename!(index(df), colindex) + else + rename!(index(df), vals, makeunique=makeunique) + end # renaming columns of SubDataFrame has to clean non-note metadata in its parent _drop_all_nonnote_metadata!(parent(df)) return df end +function rename!(idx::Index, new_index::Index) + splice!(idx.names, 1:length(idx.names), new_index.names) + empty!(idx.lookup) + merge!(idx.lookup, new_index.lookup) + return idx +end + function rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; - makeunique::Bool=false) - rename!(index(df), Symbol.(vals), makeunique=makeunique) - # renaming columns of SubDataFrame has to clean non-note metadata in its parent - _drop_all_nonnote_metadata!(parent(df)) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing) + rename!(df, Symbol.(vals), makeunique=makeunique, mergeduplicates=mergeduplicates) return df end @@ -353,9 +365,11 @@ julia> rename(uppercase, df) ``` """ rename(df::AbstractDataFrame, vals::AbstractVector{Symbol}; - makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing) = + rename!(copy(df), vals, 
makeunique=makeunique, mergeduplicates=mergeduplicates) rename(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; - makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing) = + rename!(copy(df), vals, makeunique=makeunique, mergeduplicates=mergeduplicates) rename(df::AbstractDataFrame, args...) = rename!(copy(df), args...) rename(f::Function, df::AbstractDataFrame) = rename!(f, copy(df)) @@ -1534,9 +1548,17 @@ function fillcombinations(df::AbstractDataFrame, indexcols; return out_df end +""" +MergeDuplicates = Union{Nothing,Function} + +Wherever the `mergeduplicates` keyword argument is available it is either `nothing` or +a `Function` that will be executed to combine duplicated columns (when `makeunique=false`). +""" +const MergeDuplicates = Union{Nothing,Function} + """ hcat(df::AbstractDataFrame...; - makeunique::Bool=false, copycols::Bool=true) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) Horizontally concatenate data frames. @@ -1544,6 +1566,12 @@ If `makeunique=false` (the default) column names of passed objects must be uniqu If `makeunique=true` then duplicate column names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +If `makeunique=false` and `mergeduplicates` is a `Function` then duplicate columns +will be combined by invoking the function with all values from those columns. +e.g. `mergeduplicates=coalesce` will use the first non-missing value. Since `hcat` and +`hcat!` are performed recursively for more than two frames, this `mergeduplicates` +function will only combine two columns at a time. + If `copycols=true` (the default) then the `DataFrame` returned by `hcat` will contain copied columns from the source data frames. 
If `copycols=false` then it will contain columns as they are stored in the @@ -1593,26 +1621,26 @@ julia> df3.A === df1.A true ``` """ -function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, copycols::Bool=true) +function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) df = DataFrame(df, copycols=copycols) _drop_all_nonnote_metadata!(df) return df end # TODO: after deprecation remove AbstractVector methods -Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, copycols::Bool=true) = - hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, copycols=copycols) -Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, copycols::Bool=true) = - hcat!(x, df, makeunique=makeunique, copycols=copycols) +Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) = + hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) +Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) = + hcat!(x, df, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, copycols::Bool=true) = + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) = hcat!(DataFrame(df1, copycols=copycols), df2, - makeunique=makeunique, copycols=copycols) + makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) Base.hcat(df::AbstractDataFrame, x::Union{AbstractVector, AbstractDataFrame}, y::Union{AbstractVector, AbstractDataFrame}...; - makeunique::Bool=false, copycols::Bool=true) = - hcat!(hcat(df, x, makeunique=makeunique, copycols=copycols), y..., - makeunique=makeunique, copycols=copycols) + 
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) = + hcat!(hcat(df, x, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols), y..., + makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) """ vcat(dfs::AbstractDataFrame...; @@ -2868,8 +2896,11 @@ const INSERTCOLS_ARGUMENTS = are unwrapped and treated in the same way - `after` : if `true` columns are inserted after `col` - `makeunique` : defines what to do if `name` already exists in `df`; - if it is `false` an error will be thrown; if it is `true` a new unique name will - be generated by adding a suffix + if it is `true` a new unique name will be generated by adding a suffix, + if it is `false` an error will be thrown unless a `mergeduplicates` function is provided. + - `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is `false`. It should be given a Function that combines the values of all of the duplicated + columns which will be passed as varargs. The return value is used. - `copycols` : whether vectors passed as columns should be copied If `val` is an `AbstractRange` then the result of `collect(val)` is inserted. @@ -2891,7 +2922,7 @@ const INSERTCOLS_ARGUMENTS = """ insertcols(df::AbstractDataFrame[, col], (name=>val)::Pair...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) + after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) Insert a column into a copy of `df` data frame using the [`insertcols!`](@ref) function and return the newly created data frame. 
@@ -2942,13 +2973,13 @@ julia> insertcols(df, :a, :d => 7:9, after=true) ``` """ insertcols(df::AbstractDataFrame, args...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) = + after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) = insertcols!(copy(df), args...; - after=after, makeunique=makeunique, copycols=copycols) + after=after, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) """ insertcols!(df::AbstractDataFrame[, col], (name=>val)::Pair...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) + after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) Insert a column into a data frame in place. Return the updated data frame. @@ -2999,7 +3030,10 @@ julia> insertcols!(df, :b, :d => 7:9, after=true) ``` """ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol}...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) + after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) + + _check_makeunique_args(mergeduplicates, makeunique) + if !is_column_insertion_allowed(df) throw(ArgumentError("insertcols! 
is only supported for DataFrame, or for " * "SubDataFrame created with `:` as column selector")) @@ -3025,15 +3059,15 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy "$(ncol(df)) columns at index $col_ind")) end - if !makeunique + if !makeunique && isnothing(mergeduplicates) if !allunique(first.(name_cols)) throw(ArgumentError("Names of columns to be inserted into a data frame " * - "must be unique when `makeunique=true`")) + "must be unique when `mergeduplicates=nothing`")) end for (n, _) in name_cols if hasproperty(df, n) throw(ArgumentError("Column $n is already present in the data frame " * - "which is not allowed when `makeunique=true`")) + "which is not allowed when `mergeduplicates=nothing`")) end end end @@ -3067,6 +3101,7 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy target_row_count = 1 end + mergecolumns = Dict{Symbol, Any}() start_col_ind = col_ind for (name, item) in name_cols if !(item isa AbstractVector) @@ -3103,23 +3138,38 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy dfp[!, name] = item_new else if hasproperty(dfp, name) - @assert makeunique - k = 1 - while true - nn = Symbol("$(name)_$k") - if !hasproperty(dfp, nn) - name = nn - break + if makeunique + k = 1 + while true + nn = Symbol("$(name)_$k") + if !hasproperty(dfp, nn) + name = nn + break + end + k += 1 end - k += 1 + insert!(index(dfp), col_ind, name) + insert!(_columns(dfp), col_ind, item_new) + else + # Just update without adding to index + merge = get(mergecolumns, name, (dfp=dfp, cols=[])) + push!(merge.cols, item_new) + mergecolumns[name] = merge + col_ind -= 1 end + else + insert!(index(dfp), col_ind, name) + insert!(_columns(dfp), col_ind, item_new) end - insert!(index(dfp), col_ind, name) - insert!(_columns(dfp), col_ind, item_new) end col_ind += 1 end + # Combine columns using mergeduplicates + for (name, merge) in mergecolumns + merge.dfp[!, name] = 
mergeduplicates.(merge.dfp[!, name], merge.cols...) + end + delta = col_ind - start_col_ind colmetadata_dict = getfield(parent(df), :colmetadata) if !isnothing(colmetadata_dict) && delta > 0 @@ -3134,22 +3184,22 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy end insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString}...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) = + after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) = insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)..., - after=after, makeunique=makeunique, copycols=copycols) + after=after, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol}...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) = + after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) = insertcols!(df, ncol(df)+1, name_cols..., after=after, - makeunique=makeunique, copycols=copycols) + makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString}...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) = + after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) = insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)..., - after=after, makeunique=makeunique, copycols=copycols) + after=after, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false, - makeunique::Bool=false, copycols::Bool=true) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) if col isa SymbolOrString col_ind = Int(columnindex(df, col)) if col_ind == 0 @@ -3173,7 +3223,7 @@ function 
insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false, end function insertcols!(df::AbstractDataFrame; after::Bool=false, - makeunique::Bool=false, copycols::Bool=true) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) _drop_all_nonnote_metadata!(parent(df)) return df end diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 2effb6f2f..8df8e7588 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -823,7 +823,8 @@ julia> permutedims(df2, 1, "different_name") """ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, dest_namescol::Union{Symbol, AbstractString}; - makeunique::Bool=false, strict::Bool=true) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, + strict::Bool=true) if src_namescol isa Integer 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) @@ -854,18 +855,20 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, if ncol(df_notsrc) == 0 df_tmp = DataFrame(AbstractVector[[] for _ in 1:nrow(df)], new_col_names, - makeunique=makeunique, copycols=false) + makeunique=makeunique, mergeduplicates=mergeduplicates, + copycols=false) else m = permutedims(Matrix(df_notsrc)) - df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique) + df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique, mergeduplicates=mergeduplicates) end - out_df = hcat!(df_permuted, df_tmp, makeunique=makeunique, copycols=false) + out_df = hcat!(df_permuted, df_tmp, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=false) _copy_table_note_metadata!(out_df, df) return out_df end function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; - makeunique::Bool=false, strict::Bool=true) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, + strict::Bool=true) if src_namescol isa Integer 1 <= 
src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) dest_namescol = _names(df)[src_namescol] @@ -873,7 +876,7 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; dest_namescol = src_namescol end return permutedims(df, src_namescol, dest_namescol; - makeunique=makeunique, strict=strict) + makeunique=makeunique, mergeduplicates=mergeduplicates, strict=strict) end function Base.permutedims(df::AbstractDataFrame) @@ -883,8 +886,8 @@ function Base.permutedims(df::AbstractDataFrame) end function Base.permutedims(df::AbstractDataFrame, cnames::AbstractVector; - makeunique::Bool=false) - out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing) + out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique, mergeduplicates=mergeduplicates) _copy_table_note_metadata!(out_df, df) return out_df end diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index a90f1203d..be53c41d2 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -1822,7 +1822,7 @@ end function manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool, renamecols::Bool) - new_df = DataFrame(_columns(df)[args], Index(_names(df)[args]), copycols=copycols) + new_df = DataFrame(_columns(df)[args], UpdateIndex(_names(df)[args]), copycols=copycols) _copy_all_note_metadata!(new_df, df) return new_df end diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 3f4afafec..821b8c300 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -8,16 +8,16 @@ particularly a `Vector`, `PooledVector` or `CategoricalVector`. 
# Constructors ```julia -DataFrame(pairs::Pair...; makeunique::Bool=false, copycols::Bool=true) -DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, copycols::Bool=true) +DataFrame(pairs::Pair...; makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) +DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) DataFrame(ds::AbstractDict; copycols::Bool=true) DataFrame(; kwargs..., copycols::Bool=true) DataFrame(table; copycols::Union{Bool, Nothing}=nothing) DataFrame(table, names::AbstractVector; - makeunique::Bool=false, copycols::Union{Bool, Nothing}=nothing) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Union{Bool, Nothing}=nothing) DataFrame(columns::AbstractVecOrMat, names::AbstractVector; - makeunique::Bool=false, copycols::Bool=true) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) DataFrame(::DataFrameRow; copycols::Bool=true) DataFrame(::GroupedDataFrame; copycols::Bool=true, keepkeys::Bool=true) @@ -84,9 +84,13 @@ Pass the `copycols=false` keyword argument (where supported) to reuse vectors wi copying them. By default an error will be raised if duplicates in column names are found. Pass -`makeunique=true` keyword argument (where supported) to accept duplicate names, +`makeunique=true` keyword argument to accept duplicate names, in which case they will be suffixed with `_i` (`i` starting at 1 for the first -duplicate). +duplicate), or provide a `mergeduplicates` function. + +If `makeunique=false` and `mergeduplicates` is a `Function` then duplicate column names +will be combined by this function with the column named overwritten by the results of +the function on all values from the duplicated column(s). If an `AbstractRange` is passed to a `DataFrame` constructor as a column it is always collected to a `Vector` (even if `copycols=false`). 
As a general rule @@ -189,12 +193,24 @@ mutable struct DataFrame <: AbstractDataFrame # non-:note-style metadata is added. allnotemetadata::Bool + """ + UpdateIndex is a temporary struct used *only* in the initialization + of a DataFrame. It holds the eventual Index for the DataFrame as well + as a list of columns to be combined using mergeduplicates. + """ + struct UpdateIndex + index::Index + updates::Vector{Symbol} + end + # the inner constructor should not be used directly function DataFrame(columns::Union{Vector{Any}, Vector{AbstractVector}}, - colindex::Index; copycols::Bool=true) + update_index::UpdateIndex; copycols::Bool=true, + mergeduplicates::MergeDuplicates=nothing) + colindex = update_index.index if length(columns) == length(colindex) == 0 return new(AbstractVector[], Index(), nothing, nothing, true) - elseif length(columns) != length(colindex) + elseif length(columns) != column_length(update_index) throw(DimensionMismatch("Number of columns ($(length(columns))) and number of " * "column names ($(length(colindex))) are not equal")) end @@ -232,8 +248,73 @@ mutable struct DataFrame <: AbstractDataFrame firstindex(col) != 1 && _onebased_check_error(i, col) end + # process updates if they exist + (columns, colindex) = process_updates(update_index, columns, mergeduplicates) + return new(convert(Vector{AbstractVector}, columns), colindex, nothing, nothing, true) end + + function DataFrame(columns::Union{Vector{Any},Vector{AbstractVector}}, + colindex::Index; copycols::Bool=true, + mergeduplicates=nothing) + return DataFrame(columns, UpdateIndex(colindex), + copycols=copycols, mergeduplicates=mergeduplicates) + end +end + +column_length(x::UpdateIndex) = isempty(x.updates) ? 
column_length(x.index) : length(x.updates) +has_updates(x::UpdateIndex) = !isempty(x.updates) + +UpdateIndex(idx::Index) = UpdateIndex(idx, Symbol[]) +UpdateIndex() = UpdateIndex(Index()) + +function UpdateIndex(names::AbstractVector{Symbol}; makeunique::Bool=false) + if !makeunique + idx = Index(unique(names)) + return UpdateIndex(idx, names) + else + idx = Index(names, makeunique=makeunique) + return UpdateIndex(idx, Symbol[]) + end +end + +""" +Processes the updates defined by an UpdateIndex on the given columns + +Returns a tuple of the new set of columns and the new column index +""" +function process_updates(update_index::UpdateIndex, columns::Union{Vector{Any}, Vector{AbstractVector}}, mergeduplicates::MergeDuplicates) + if has_updates(update_index) + colindex = update_index.index + merges = Dict{Symbol,Any}() + updated = Vector{Any}(nothing, length(colindex.names)) + for src in eachindex(update_index.updates) + name = update_index.updates[src] + dst = colindex.lookup[name] + if isnothing(updated[dst]) + updated[dst] = columns[src] + else + if isnothing(mergeduplicates) + msg = "Duplicate variable names: $name. Pass makeunique=true " * + "to make them unique using a suffix automatically." + throw(ArgumentError(msg)) + end + merge = get(merges, name, (dst=dst, columns=[])) + push!(merge.columns, columns[src]) + merges[name] = merge + end + end + + # Handle mergeduplicates updates + for (_, merge) in merges + updated[merge.dst] = mergeduplicates.(updated[merge.dst], merge.columns...) 
+ end + + columns = updated + colindex = update_index.index + end + + return (columns, update_index.index) end function _preprocess_column(col::Any, len::Integer, copycols::Bool) @@ -254,24 +335,27 @@ end DataFrame(df::DataFrame; copycols::Bool=true) = copy(df, copycols=copycols) -function DataFrame(pairs::Pair{Symbol, <:Any}...; makeunique::Bool=false, +function DataFrame(pairs::Pair{Symbol, <:Any}...; + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true)::DataFrame colnames = [Symbol(k) for (k, v) in pairs] columns = Any[v for (k, v) in pairs] - return DataFrame(columns, Index(colnames, makeunique=makeunique), - copycols=copycols) + return DataFrame(columns, UpdateIndex(colnames, makeunique=makeunique), + copycols=copycols, mergeduplicates=mergeduplicates) end -function DataFrame(pairs::Pair{<:AbstractString, <:Any}...; makeunique::Bool=false, +function DataFrame(pairs::Pair{<:AbstractString, <:Any}...; + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true)::DataFrame colnames = [Symbol(k) for (k, v) in pairs] columns = Any[v for (k, v) in pairs] - return DataFrame(columns, Index(colnames, makeunique=makeunique), - copycols=copycols) + return DataFrame(columns, UpdateIndex(colnames, makeunique=makeunique), + copycols=copycols, mergeduplicates=mergeduplicates) end # this is needed as a workaround for Tables.jl dispatch -function DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, +function DataFrame(pairs::AbstractVector{<:Pair}; + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) if isempty(pairs) return DataFrame() @@ -281,8 +365,8 @@ function DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, end colnames = [Symbol(k) for (k, v) in pairs] columns = Any[v for (k, v) in pairs] - return DataFrame(columns, Index(colnames, makeunique=makeunique), - copycols=copycols) + return DataFrame(columns, UpdateIndex(colnames, 
makeunique=makeunique), + copycols=copycols, mergeduplicates=mergeduplicates) end end @@ -295,7 +379,7 @@ function DataFrame(d::AbstractDict; copycols::Bool=true) throw(ArgumentError("All column names must be either Symbols or strings (mixing is not allowed)")) end - colindex = Index(colnames) + colindex = UpdateIndex(colnames) columns = Any[v for v in values(d)] df = DataFrame(columns, colindex, copycols=copycols) if d isa Dict @@ -329,18 +413,19 @@ function DataFrame(; kwargs...) push!(columns, val) end end - DataFrame(columns, Index(cnames), copycols=copycols) + DataFrame(columns, UpdateIndex(cnames), copycols=copycols) end end function DataFrame(columns::AbstractVector, cnames::AbstractVector{Symbol}; - makeunique::Bool=false, copycols::Bool=true)::DataFrame + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true)::DataFrame + _check_makeunique_args(mergeduplicates, makeunique) if !(eltype(columns) <: AbstractVector) && !all(col -> isa(col, AbstractVector), columns) - return rename!(DataFrame(columns, copycols=copycols), cnames, makeunique=makeunique) + return rename!(DataFrame(columns, copycols=copycols), cnames, makeunique=makeunique, mergeduplicates=mergeduplicates) end return DataFrame(collect(AbstractVector, columns), - Index(convert(Vector{Symbol}, cnames), makeunique=makeunique), - copycols=copycols) + UpdateIndex(convert(Vector{Symbol}, cnames), makeunique=makeunique), + copycols=copycols, mergeduplicates=mergeduplicates) end function _name2symbol(str::AbstractVector) @@ -351,18 +436,18 @@ function _name2symbol(str::AbstractVector) end DataFrame(columns::AbstractVector, cnames::AbstractVector; - makeunique::Bool=false, copycols::Bool=true) = - DataFrame(columns, _name2symbol(cnames), makeunique=makeunique, copycols=copycols) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) = + DataFrame(columns, _name2symbol(cnames), mergeduplicates=mergeduplicates, makeunique=makeunique, 
copycols=copycols) DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector{Symbol}; - makeunique::Bool=false, copycols::Bool=true)::DataFrame = + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true)::DataFrame = DataFrame(collect(AbstractVector, columns), - Index(convert(Vector{Symbol}, cnames), makeunique=makeunique), - copycols=copycols) + UpdateIndex(convert(Vector{Symbol}, cnames), makeunique=makeunique), + copycols=copycols, mergeduplicates=mergeduplicates) DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector; - makeunique::Bool=false, copycols::Bool=true) = - DataFrame(columns, _name2symbol(cnames); makeunique=makeunique, copycols=copycols) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) = + DataFrame(columns, _name2symbol(cnames); mergeduplicates=mergeduplicates, makeunique=makeunique, copycols=copycols) function DataFrame(columns::AbstractVector, cnames::Symbol; copycols::Bool=true) if cnames !== :auto @@ -375,15 +460,15 @@ function DataFrame(columns::AbstractVector, cnames::Symbol; copycols::Bool=true) end function DataFrame(columns::AbstractMatrix, cnames::AbstractVector{Symbol}; - makeunique::Bool=false, copycols::Bool=true) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) getter = copycols ? 
getindex : view return DataFrame(AbstractVector[getter(columns, :, i) for i in 1:size(columns, 2)], - cnames, makeunique=makeunique, copycols=false) + cnames, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=false) end DataFrame(columns::AbstractMatrix, cnames::AbstractVector; - makeunique::Bool=false, copycols::Bool=true) = - DataFrame(columns, _name2symbol(cnames); makeunique=makeunique, copycols=copycols) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) = + DataFrame(columns, _name2symbol(cnames); makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) function DataFrame(columns::AbstractMatrix, cnames::Symbol; copycols::Bool=true) if cnames !== :auto @@ -392,7 +477,7 @@ function DataFrame(columns::AbstractMatrix, cnames::Symbol; copycols::Bool=true) "positional argument is passed then the second " * "argument must be a vector of column names or :auto")) end - return DataFrame(columns, gennames(size(columns, 2)), makeunique=false, copycols=copycols) + return DataFrame(columns, gennames(size(columns, 2)), mergeduplicates=nothing, copycols=copycols) end # Discontinued constructors @@ -572,7 +657,7 @@ function _threaded_getindex(selected_rows::AbstractVector, return DataFrame(new_columns, idx, copycols=false) else return DataFrame(AbstractVector[df_columns[i][selected_rows] for i in selected_columns], - idx, copycols=false) + idx, copycols=false) end end @@ -1202,14 +1287,30 @@ end # hcat! 
for 2 arguments, only a vector or a data frame is allowed function hcat!(df1::DataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, copycols::Bool=true) - u = add_names(index(df1), index(df2), makeunique=makeunique) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) + if makeunique || isnothing(mergeduplicates) + u = add_names(index(df1), index(df2), makeunique=makeunique) + else + u = _names(index(df2)) + end _drop_all_nonnote_metadata!(df1) _keep_matching_table_note_metadata!(df1, df2) - for i in 1:length(u) - df1[!, u[i]] = copycols ? df2[:, i] : df2[!, i] - _copy_col_note_metadata!(df1, u[i], df2, i) + if !makeunique && isa(mergeduplicates,Function) + df1_names = _names(index(df1)) + for nm in u + if nm ∈ df1_names + df1[!, nm] = mergeduplicates.(df1[!, nm], df2[!, nm]) + else + df1[!, nm] = copycols ? df2[:, nm] : df2[!, nm] + end + _copy_col_note_metadata!(df1, nm, df2, nm) + end + else + for i in 1:length(u) + df1[!, u[i]] = copycols ? df2[:, i] : df2[!, i] + _copy_col_note_metadata!(df1, u[i], df2, i) + end end return df1 @@ -1217,31 +1318,31 @@ end # TODO: after deprecation remove AbstractVector methods -function hcat!(df::DataFrame, x::AbstractVector; makeunique::Bool=false, copycols::Bool=true) +function hcat!(df::DataFrame, x::AbstractVector; makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) Base.depwarn("horizontal concatenation of data frame with a vector is deprecated. " * "Pass DataFrame(x1=x) instead.", :hcat!) 
return hcat!(df, DataFrame(AbstractVector[x], [:x1], copycols=false), - makeunique=makeunique, copycols=copycols) + makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) end -function hcat!(x::AbstractVector, df::DataFrame; makeunique::Bool=false, copycols::Bool=true) +function hcat!(x::AbstractVector, df::DataFrame; makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing,copycols::Bool=true) Base.depwarn("horizontal concatenation of data frame with a vector is deprecated. " * "Pass DataFrame(x1=x) instead.", :hcat!) return hcat!(DataFrame(AbstractVector[x], [:x1], copycols=copycols), df, - makeunique=makeunique, copycols=copycols) + makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) end # hcat! for 1-n arguments -function hcat!(df::DataFrame; makeunique::Bool=false, copycols::Bool=true) +function hcat!(df::DataFrame; makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) _drop_all_nonnote_metadata!(df) return df end hcat!(a::DataFrame, b::Union{AbstractDataFrame, AbstractVector}, c::Union{AbstractDataFrame, AbstractVector}...; - makeunique::Bool=false, copycols::Bool=true) = - hcat!(hcat!(a, b, makeunique=makeunique, copycols=copycols), - c..., makeunique=makeunique, copycols=copycols) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) = + hcat!(hcat!(a, b, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols), + c..., makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols) ############################################################################## ## diff --git a/src/join/composer.jl b/src/join/composer.jl index 3cd4e90b3..5dace8773 100644 --- a/src/join/composer.jl +++ b/src/join/composer.jl @@ -118,7 +118,8 @@ _rename_cols(old_names::AbstractVector{Symbol}, for n in old_names] function _propagate_join_metadata!(joiner::DataFrameJoiner, dfr_noon::AbstractDataFrame, - res::DataFrame, kind::Symbol) + 
res::DataFrame, kind::Symbol; + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, names=nothing) @assert kind == :left || kind == :right || kind == :outer || kind == :inner # The steps taken in this function are (all applies only to :note-style metadata): @@ -174,8 +175,17 @@ function _propagate_join_metadata!(joiner::DataFrameJoiner, dfr_noon::AbstractDa end end - for i in 1:ncol(dfr_noon) - _copy_col_note_metadata!(res, ncol(joiner.dfl) + i, dfr_noon, i) + if isnothing(mergeduplicates) + for i in 1:ncol(dfr_noon) + _copy_col_note_metadata!(res, ncol(joiner.dfl) + i, dfr_noon, i) + end + else + map = UpdateIndex(names, makeunique=makeunique) + for i in 1:ncol(dfr_noon) + name = map.updates[ncol(joiner.dfl) + i] + dst = map.index.lookup[name] + _merge_col_note_metadata!(res, dst, dfr_noon, i) + end end if kind == :outer || kind == :inner @@ -235,7 +245,7 @@ function _count_sortperm!(input::Vector{Int}, count::Vector, end function compose_inner_table(joiner::DataFrameJoiner, - makeunique::Bool, + makeunique, mergeduplicates, left_rename::Union{Function, AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}, order::Symbol) @@ -278,9 +288,9 @@ function compose_inner_table(joiner::DataFrameJoiner, new_names = vcat(_rename_cols(_names(joiner.dfl), left_rename, joiner.left_on), _rename_cols(_names(dfr_noon), right_rename)) - res = DataFrame(cols, new_names, makeunique=makeunique, copycols=false) + res = DataFrame(cols, new_names, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=false) - _propagate_join_metadata!(joiner, dfr_noon, res, :inner) + _propagate_join_metadata!(joiner, dfr_noon, res, :inner, makeunique=makeunique, mergeduplicates=mergeduplicates, names=new_names) return res end @@ -292,7 +302,7 @@ function find_missing_idxs(present::Vector{Int}, target_len::Int) return _findall(not_seen) end -function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique::Bool, +function 
compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique, mergeduplicates, left_rename::Union{Function, AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}, indicator::Union{Nothing, Symbol, AbstractString}, @@ -314,12 +324,12 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique: else rightonly_ixs = 1:0 end - return _compose_joined_table(joiner, kind, makeunique, left_rename, right_rename, + return _compose_joined_table(joiner, kind, makeunique, mergeduplicates, left_rename, right_rename, indicator, left_ixs, right_ixs, leftonly_ixs, rightonly_ixs, order) end -function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique::Bool, +function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique, mergeduplicates, left_rename::Union{Function, AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}, indicator::Union{Nothing, Symbol, AbstractString}, @@ -440,14 +450,14 @@ function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique new_names = vcat(_rename_cols(_names(joiner.dfl), left_rename, joiner.left_on), _rename_cols(_names(dfr_noon), right_rename)) - res = DataFrame(cols, new_names, makeunique=makeunique, copycols=false) + res = DataFrame(cols, new_names, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=false) if new_order !== nothing isnothing(src_indicator) || permute!(src_indicator, new_order) permute!(res, new_order) end - _propagate_join_metadata!(joiner, dfr_noon, res, kind) + _propagate_join_metadata!(joiner, dfr_noon, res, kind, makeunique=makeunique, mergeduplicates=mergeduplicates, names=new_names) return res, src_indicator end @@ -484,7 +494,8 @@ function _sort_compose_helper(fillval::Int, # value to use to fill unused indice end function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector}, kind::Symbol, makeunique::Bool, + on::Union{<:OnType, 
AbstractVector}, kind::Symbol, + makeunique::Bool, mergeduplicates, indicator::Union{Nothing, Symbol, AbstractString}, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}, left_rename::Union{Function, AbstractString, Symbol}, @@ -579,16 +590,16 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; src_indicator = nothing if kind == :inner - joined = compose_inner_table(joiner, makeunique, left_rename, right_rename, order) + joined = compose_inner_table(joiner, makeunique, mergeduplicates, left_rename, right_rename, order) elseif kind == :left joined, src_indicator = - compose_joined_table(joiner, kind, makeunique, left_rename, right_rename, indicator, order) + compose_joined_table(joiner, kind, makeunique, mergeduplicates, left_rename, right_rename, indicator, order) elseif kind == :right joined, src_indicator = - compose_joined_table(joiner, kind, makeunique, left_rename, right_rename, indicator, order) + compose_joined_table(joiner, kind, makeunique, mergeduplicates, left_rename, right_rename, indicator, order) elseif kind == :outer joined, src_indicator = - compose_joined_table(joiner, kind, makeunique, left_rename, right_rename, indicator, order) + compose_joined_table(joiner, kind, makeunique, mergeduplicates, left_rename, right_rename, indicator, order) elseif kind == :semi joined = joiner.dfl[find_semi_rows(joiner), :] elseif kind == :anti @@ -614,12 +625,16 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; end end - if hasproperty(joined, unique_indicator) - throw(ArgumentError("joined data frame already has column " * - ":$unique_indicator. Pass makeunique=true to " * - "make it unique using a suffix automatically.")) + if unique_indicator == indicator && !isnothing(mergeduplicates) + joined[!, indicator] = mergeduplicates.(joined[!, indicator], indicatorcol) + else + if hasproperty(joined, unique_indicator) + throw(ArgumentError("joined data frame already has column " * + ":$unique_indicator. 
Pass makeunique=true to " * + "make it unique using a suffix automatically.")) + end + joined[!, unique_indicator] = indicatorcol end - joined[!, unique_indicator] = indicatorcol else @assert isnothing(src_indicator) end @@ -628,10 +643,10 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; end """ - innerjoin(df1, df2; on, makeunique=false, validate=(false, false), + innerjoin(df1, df2; on, makeunique=false, mergeduplicates=nothing, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) - innerjoin(df1, df2, dfs...; on, makeunique=false, + innerjoin(df1, df2, dfs...; on, makeunique=false, mergeduplicates=nothing, validate=(false, false), matchmissing=:error, order=:undefined) @@ -653,10 +668,16 @@ change in future releases. can be used instead of a name, for the case where a key has different names in `df1` and `df2` (it is allowed to mix names and name pairs in a vector). Key values are compared using `isequal`. `on` is a required argument. -- `makeunique` : if `false` (the default), an error will be raised +- `makeunique` : if `false` (the default), an error will be raised or the `mergeduplicates` + function will be used to combine columns if provided; if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +- `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is false. It should be given a Function that combines the values of each of the duplicated + columns which will be passed as a vararg, the return value is then used to fill the column. + Because these joins are performed recursively, this `mergeduplicates` function will only + combine two values at a time. - `validate` : whether to check that columns passed as the `on` argument define unique keys in each input data frame (according to `isequal`). 
Can be a tuple or a pair, with the first element indicating whether to @@ -755,7 +776,7 @@ julia> innerjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase = """ function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; on::Union{<:OnType, AbstractVector} = Symbol[], - makeunique::Bool=false, + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), renamecols::Pair=identity => identity, matchmissing::Symbol=:error, @@ -764,7 +785,7 @@ function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; throw(ArgumentError("renamecols keyword argument must be a `Pair` " * "containing functions, strings, or `Symbol`s")) end - return _join(df1, df2, on=on, kind=:inner, makeunique=makeunique, + return _join(df1, df2, on=on, kind=:inner, mergeduplicates=mergeduplicates, makeunique=makeunique, indicator=nothing, validate=validate, left_rename=first(renamecols), right_rename=last(renamecols), matchmissing=matchmissing, order=order) @@ -772,16 +793,17 @@ end function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; on::Union{<:OnType, AbstractVector} = Symbol[], - makeunique::Bool=false, + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error, order::Symbol=:undefined) @assert !isempty(dfs) - res = innerjoin(df1, df2, on=on, makeunique=makeunique, validate=validate, + _check_makeunique_args(mergeduplicates, makeunique) + res = innerjoin(df1, df2, on=on, mergeduplicates=mergeduplicates, validate=validate, matchmissing=matchmissing, order=order === :right ? :undefined : order) for (i, dfn) in enumerate(dfs) - res = innerjoin(res, dfn, on=on, makeunique=makeunique, validate=validate, + res = innerjoin(res, dfn, on=on, mergeduplicates=mergeduplicates, validate=validate, matchmissing=matchmissing, order= order === :right ? 
(i == length(dfs) ? :right : :undefined) : @@ -791,7 +813,7 @@ function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::Abstract end """ - leftjoin(df1, df2; on, makeunique=false, source=nothing, validate=(false, false), + leftjoin(df1, df2; on, makeunique=false, mergeduplicates=nothing, source=nothing, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) Perform a left join of two data frame objects and return a `DataFrame` containing @@ -811,10 +833,14 @@ change in future releases. the case where a key has different names in `df1` and `df2` (it is allowed to mix names and name pairs in a vector). Key values are compared using `isequal`. `on` is a required argument. -- `makeunique` : if `false` (the default), an error will be raised +- `makeunique` : if `false` (the default), an error will be raised or the `mergeduplicates` + function will be used to combine columns if provided; if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +- `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is false. It should be given a Function that combines the values of each of the duplicated + columns which will be passed as a vararg, the return value is then used to fill the column. - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name, for whether a row appeared in only `df1` (`"left_only"`) or in both (`"both"`). 
If the name is already in use, @@ -915,12 +941,14 @@ julia> leftjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase => ``` """ function leftjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, source::Union{Nothing, Symbol, AbstractString}=nothing, indicator::Union{Nothing, Symbol, AbstractString}=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), renamecols::Pair=identity => identity, matchmissing::Symbol=:error, order::Symbol=:undefined) + if !all(x -> x isa Union{Function, AbstractString, Symbol}, renamecols) throw(ArgumentError("renamecols keyword argument must be a `Pair` " * "containing functions, strings, or `Symbol`s")) @@ -937,14 +965,14 @@ function leftjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; "It is not allowed to pass both `indicator` and `source` " * "keyword arguments at the same time.")) end - return _join(df1, df2, on=on, kind=:left, makeunique=makeunique, + return _join(df1, df2, on=on, kind=:left, mergeduplicates=mergeduplicates, makeunique=makeunique, indicator=source, validate=validate, left_rename=first(renamecols), right_rename=last(renamecols), matchmissing=matchmissing, order=order) end """ - rightjoin(df1, df2; on, makeunique=false, source=nothing, + rightjoin(df1, df2; on, makeunique=false, mergeduplicates=nothing, source=nothing, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) @@ -967,10 +995,14 @@ change in future releases. the case where a key has different names in `df1` and `df2` (it is allowed to mix names and name pairs in a vector). Key values are compared using `isequal`. `on` is a required argument. 
-- `makeunique` : if `false` (the default), an error will be raised + - `makeunique` : if `false` (the default), an error will be raised or the `mergeduplicates` + function will be used to combine columns if provided; if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +- `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is false. It should be given a Function that combines the values of each of the duplicated + columns which will be passed as a vararg, the return value is then used to fill the column. - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name for whether a row appeared in only `df2` (`"right_only"`) or in both (`"both"`). If the name is already in use, @@ -1071,7 +1103,8 @@ julia> rightjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase = ``` """ function rightjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, source::Union{Nothing, Symbol, AbstractString}=nothing, indicator::Union{Nothing, Symbol, AbstractString}=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), @@ -1093,16 +1126,16 @@ function rightjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; "It is not allowed to pass both `indicator` and `source` " * "keyword arguments at the same time.")) end - return _join(df1, df2, on=on, kind=:right, makeunique=makeunique, + return _join(df1, df2, on=on, kind=:right, mergeduplicates=mergeduplicates, makeunique=makeunique, indicator=source, validate=validate, left_rename=first(renamecols), right_rename=last(renamecols), matchmissing=matchmissing, order=order) end """ - outerjoin(df1, df2; on, makeunique=false, source=nothing, 
validate=(false, false), + outerjoin(df1, df2; on, makeunique=false, mergeduplicates=nothing, source=nothing, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) - outerjoin(df1, df2, dfs...; on, makeunique = false, + outerjoin(df1, df2, dfs...; on, makeunique=false, mergeduplicates=nothing, validate = (false, false), matchmissing=:error, order=:undefined) Perform an outer join of two or more data frame objects and return a `DataFrame` @@ -1125,10 +1158,16 @@ This behavior may change in future releases. can be used instead of a name, for the case where a key has different names in `df1` and `df2` (it is allowed to mix names and name pairs in a vector). Key values are compared using `isequal`. `on` is a required argument. -- `makeunique` : if `false` (the default), an error will be raised +- `makeunique` : if `false` (the default), an error will be raised or the `mergeduplicates` + function will be used to combine columns if provided; if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +- `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is false. It should be given a Function that combines the values of each of the duplicated + columns which will be passed as a vararg, the return value is then used to fill the column. + Because these joins are performed recursively, this `mergeduplicates` function will only + combine two values at a time. - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name for whether a row appeared in only `df1` (`"left_only"`), only `df2` (`"right_only"`) or in both (`"both"`). 
If the name is already in use, @@ -1240,7 +1279,8 @@ julia> outerjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase = ``` """ function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, source::Union{Nothing, Symbol, AbstractString}=nothing, indicator::Union{Nothing, Symbol, AbstractString}=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), @@ -1262,27 +1302,29 @@ function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; "It is not allowed to pass both `indicator` and `source` " * "keyword arguments at the same time.")) end - return _join(df1, df2, on=on, kind=:outer, makeunique=makeunique, + return _join(df1, df2, on=on, kind=:outer, mergeduplicates=mergeduplicates, makeunique=makeunique, indicator=source, validate=validate, left_rename=first(renamecols), right_rename=last(renamecols), matchmissing=matchmissing, order=order) end function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error, order::Symbol=:undefined) - res = outerjoin(df1, df2, on=on, makeunique=makeunique, validate=validate, + _check_makeunique_args(mergeduplicates, makeunique) + res = outerjoin(df1, df2, on=on, mergeduplicates=mergeduplicates, validate=validate, matchmissing=matchmissing, order=order) for dfn in dfs - res = outerjoin(res, dfn, on=on, makeunique=makeunique, validate=validate, + res = outerjoin(res, dfn, on=on, mergeduplicates=mergeduplicates, validate=validate, matchmissing=matchmissing, order=order) end return res end 
""" - semijoin(df1, df2; on, makeunique=false, validate=(false, false), matchmissing=:error) + semijoin(df1, df2; on, makeunique=false, mergeduplicates=nothing, validate=(false, false), matchmissing=:error) Perform a semi join of two data frame objects and return a `DataFrame` containing the result. A semi join returns the subset of rows of `df1` that @@ -1384,16 +1426,16 @@ julia> semijoin(name, job2, on = [:ID => :identifier]) ``` """ semijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error) = - _join(df1, df2, on=on, kind=:semi, makeunique=makeunique, + _join(df1, df2, on=on, kind=:semi, mergeduplicates=mergeduplicates, makeunique=makeunique, indicator=nothing, validate=validate, left_rename=identity, right_rename=identity, matchmissing=matchmissing, order=:left) """ - antijoin(df1, df2; on, makeunique=false, validate=(false, false), matchmissing=:error) + antijoin(df1, df2; on, makeunique=false, mergeduplicates=nothing, validate=(false, false), matchmissing=:error) Perform an anti join of two data frame objects and return a `DataFrame` containing the result. 
An anti join returns the subset of rows of `df1` that do @@ -1488,10 +1530,11 @@ julia> antijoin(name, job2, on = [:ID => :identifier]) ``` """ antijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error) = - _join(df1, df2, on=on, kind=:anti, makeunique=makeunique, + _join(df1, df2, on=on, kind=:anti, mergeduplicates=mergeduplicates, makeunique=makeunique, indicator=nothing, validate=validate, left_rename=identity, right_rename=identity, matchmissing=matchmissing, @@ -1499,7 +1542,7 @@ antijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; """ crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, renamecols=identity => identity) + makeunique::Bool=false, mergeduplicates=nothing, renamecols=identity => identity) crossjoin(df1, df2, dfs...; makeunique = false) Perform a cross join of two or more data frame objects and return a `DataFrame` @@ -1512,10 +1555,16 @@ dimension that changes the fastest. - `df1`, `df2`, `dfs...` : the `AbstractDataFrames` to be joined # Keyword Arguments -- `makeunique` : if `false` (the default), an error will be raised +- `makeunique` : if `false` (the default), an error will be raised or the `mergeduplicates` + function will be used to combine columns if provided; if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +- `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is false. It should be given a Function that combines the values of each of the duplicated + columns which will be passed as a vararg, the return value is then used to fill the column. 
+ Because these joins are performed recursively, this `mergeduplicates` function will only + combine two values at a time. - `renamecols` : a `Pair` specifying how columns of left and right data frames should be renamed in the resulting data frame. Each element of the pair can be a string or a `Symbol` can be passed in which case it is appended to the original @@ -1565,22 +1614,25 @@ julia> crossjoin(df1, df2) ``` """ function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, renamecols::Pair=identity => identity) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, renamecols::Pair=identity => identity) _check_consistency(df1) _check_consistency(df2) + _check_makeunique_args(mergeduplicates, makeunique) r1, r2 = size(df1, 1), size(df2, 1) new_names = vcat(_rename_cols(_names(df1), first(renamecols)), _rename_cols(_names(df2), last(renamecols))) cols = Any[[repeat(c, inner=r2) for c in eachcol(df1)]; [repeat(c, outer=r1) for c in eachcol(df2)]] - res = DataFrame(cols, new_names, copycols=false, makeunique=makeunique) + res = DataFrame(cols, new_names, copycols=false, makeunique=makeunique, mergeduplicates=mergeduplicates) for i in 1:ncol(df1) _copy_col_note_metadata!(res, i, df1, i) end - for i in 1:ncol(df2) - _copy_col_note_metadata!(res, ncol(df1) + i, df2, i) + if isnothing(mergeduplicates) + for i in 1:ncol(df2) + _copy_col_note_metadata!(res, ncol(df1) + i, df2, i) + end end _merge_matching_table_note_metadata!(res, (df1, df2)) @@ -1589,8 +1641,8 @@ function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; end crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; - makeunique::Bool=false) = - crossjoin(crossjoin(df1, df2, makeunique=makeunique), dfs..., makeunique=makeunique) + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing) = + crossjoin(crossjoin(df1, df2, makeunique=makeunique, mergeduplicates=mergeduplicates), dfs..., makeunique=makeunique, 
mergeduplicates=mergeduplicates) # an explicit error is thrown as join was supported in the past Base.join(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; diff --git a/src/join/inplace.jl b/src/join/inplace.jl index 9f1a9e0c6..42d247915 100644 --- a/src/join/inplace.jl +++ b/src/join/inplace.jl @@ -1,5 +1,5 @@ """ - leftjoin!(df1, df2; on, makeunique=false, source=nothing, + leftjoin!(df1, df2; on, makeunique=false, mergeduplicates=nothing, source=nothing, matchmissing=:error) @@ -25,10 +25,13 @@ added to `df1`. if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +- `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is false. It should be given a Function that combines the values of all of the duplicated + columns which will be passed as a varargs. The return value is used. - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name, for whether a row appeared in only `df1` (`"left_only"`) or in both (`"both"`). If the name is already in use, - the column name will be modified if `makeunique=true`. + the column name will be modified if `makeunique=true`; if `mergeduplicates` is a function the columns are combined. - `matchmissing` : if equal to `:error` throw an error if `missing` is present in `on` columns; if equal to `:equal` then `missing` is allowed and missings are matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns.
@@ -95,12 +98,14 @@ julia> leftjoin!(name, job2, on = :ID => :identifier, makeunique=true, source=:s ``` """ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector}=Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector}=Symbol[], + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, source::Union{Nothing, Symbol, AbstractString}=nothing, matchmissing::Symbol=:error) _check_consistency(df1) _check_consistency(df2) + _check_makeunique_args(mergeduplicates, makeunique) if !is_column_insertion_allowed(df1) throw(ArgumentError("leftjoin! is only supported if `df1` is a `DataFrame`, " * @@ -114,7 +119,7 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; joiner = DataFrameJoiner(df1, df2, on, matchmissing, :left) right_noon_names = names(joiner.dfr, Not(joiner.right_on)) - if !(makeunique || isempty(intersect(right_noon_names, names(df1)))) + if !makeunique && isnothing(mergeduplicates) && !isempty(intersect(right_noon_names, names(df1))) throw(ArgumentError("the following columns are present in both " * "left and right data frames but not listed in `on`: " * join(intersect(right_noon_names, names(df1)), ", ") * @@ -134,7 +139,7 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; rcol_joined = compose_joined_rcol!(rcol, similar_missing(rcol, nrow(df1)), right_ixs) # if df1 isa SubDataFrame we must copy columns - insertcols!(df1, colname => rcol_joined, makeunique=makeunique, + insertcols!(df1, colname => rcol_joined, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=!(df1 isa DataFrame)) # need to call parent as df1 can be a SubDataFrame _copy_col_note_metadata!(parent(df1), ncol(df1), joiner.dfr, colname) @@ -158,11 +163,18 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; end if hasproperty(df1, unique_indicator) - throw(ArgumentError("joined data frame already has column " * - ":$unique_indicator. 
Pass makeunique=true to " * - "make it unique using a suffix automatically.")) + if !isnothing(mergeduplicates) + df1[!, unique_indicator] = mergeduplicates.(df1[!, unique_indicator], indicatorcol) + else + throw(ArgumentError("joined data frame already has column " * + ":$unique_indicator. Pass makeunique=true to " * + "make it unique using a suffix automatically or set mergeduplicates " * + "to a function.")) + end + else + df1[!, unique_indicator] = indicatorcol end - df1[!, unique_indicator] = indicatorcol + end return df1 @@ -192,3 +204,107 @@ function compose_joined_rcol!(rcol::AbstractVector, end return rcol_joined end + +""" + outerjoin!(df1, df2; on, makeunique=false, mergeduplicates=nothing, source=nothing, + matchmissing=:error) + +Perform an outer join of two data frame objects by updating into `df1`. +An outer join includes rows with keys that appear in either +of the passed data frames. + +The order of rows in the result is undefined and may change in future releases. + +In the returned data frame the type of the columns on which the data frames are +joined is determined by the element type of these columns both `df1` and `df2`. +This behavior may change in future releases. + +# Arguments +- `df1`, `df2` : the `AbstractDataFrames` to be joined + +# Keyword Arguments +- `on` : The names of the key columns on which to join the data frames. + This can be a single name, or a vector of names (for joining on multiple + columns). When joining only two data frames, a `left=>right` pair of names + can be used instead of a name, for the case where a key has different names + in `df1` and `df2` (it is allowed to mix names and name pairs in a vector). + Key values are compared using `isequal`. `on` is a required argument. +- `makeunique` : if `false` (the default), an error will be raised + if duplicate names are found in columns not joined on; + if `true`, duplicate names will be suffixed with `_i` + (`i` starting at 1 for the first duplicate). 
(deprecated) +- `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique` + is false. It should be given a Function that combines the values of all of the duplicated + columns which will be passed as a varargs. The return value is used. +- `source` : Default: `nothing`. If a `Symbol` or string, adds indicator + column with the given name for whether a row appeared in only `df1` (`"left_only"`), + only `df2` (`"right_only"`) or in both (`"both"`). If the name is already in use, + the column name will be modified if `makeunique=true`. + This argument is only supported when joining exactly two data frames. +- `matchmissing` : if equal to `:error` throw an error if `missing` is present + in `on` columns; if equal to `:equal` then `missing` is allowed and missings are + matched. + +All columns of the returned data frame will support missing values. + +It is not allowed to join on columns that contain `NaN` or `-0.0` in real or +imaginary part of the number. If you need to perform a join on such values use +CategoricalArrays.jl and transform a column containing such values into a +`CategoricalVector`. + +When merging `on` categorical columns that differ in the ordering of their +levels, the ordering of the left data frame takes precedence over the ordering +of the right data frame. + +Metadata: table-level `:note`-style metadata and column-level `:note`-style metadata +for key columns is preserved only for keys which are defined in all passed tables +and have the same value. +Column-level `:note`-style metadata is preserved for all other columns. + +See also: [`innerjoin`](@ref), [`leftjoin`](@ref), [`rightjoin`](@ref), + [`semijoin`](@ref), [`antijoin`](@ref), [`crossjoin`](@ref). 
+ +# Examples +```jldoctest +julia> name = DataFrame(ID=[1, 2, 3], Name=["John Doe", "Jane Doe", "Joe Blogs"]) +3×2 DataFrame + Row │ ID Name + │ Int64 String +─────┼────────────────── + 1 │ 1 John Doe + 2 │ 2 Jane Doe + 3 │ 3 Joe Blogs + +julia> job = DataFrame(ID=[1, 2, 4], Job=["Lawyer", "Doctor", "Farmer"]) +3×2 DataFrame + Row │ ID Job + │ Int64 String +─────┼─────────────── + 1 │ 1 Lawyer + 2 │ 2 Doctor + 3 │ 4 Farmer + +julia> outerjoin!(name, job, on = :ID) +4×3 DataFrame + Row │ ID Name Job + │ Int64 String? String? +─────┼─────────────────────────── + 1 │ 1 John Doe Lawyer + 2 │ 2 Jane Doe Doctor + 3 │ 3 Joe Blogs missing + 4 │ 4 missing Farmer +``` +""" +function outerjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; + on::Union{<:OnType, AbstractVector}=Symbol[], + makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, + source::Union{Nothing, Symbol, AbstractString}=nothing, + matchmissing::Symbol=:error) + + leftjoin!(df1, df2, on=on, makeunique=makeunique, mergeduplicates=mergeduplicates, source=source, matchmissing=matchmissing) + + aj = antijoin(df2, df1, on=on, makeunique=makeunique, mergeduplicates=mergeduplicates, matchmissing=matchmissing) + append!(df1, aj) + + return df1 +end \ No newline at end of file diff --git a/src/other/index.jl b/src/other/index.jl index 51aa3a31c..d97b8d8e7 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -33,6 +33,8 @@ Index() = Index(Dict{Symbol, Int}(), Symbol[]) Base.length(x::Index) = length(x.names) Base.names(x::Index) = string.(x.names) +column_length(x::Index) = length(x.names) + # _names returns Vector{Symbol} _names(x::Index) = x.names @@ -40,7 +42,6 @@ Base.copy(x::Index) = Index(copy(x.lookup), copy(x.names)) Base.isequal(x::AbstractIndex, y::AbstractIndex) = _names(x) == _names(y) # it is enough to check names Base.:(==)(x::AbstractIndex, y::AbstractIndex) = isequal(x, y) - function rename!(x::Index, nms::AbstractVector{Symbol}; makeunique::Bool=false) if !makeunique if 
length(unique(nms)) != length(nms) @@ -241,7 +242,7 @@ end @inline _getindex_cols(x::AbstractIndex, idx::Any) = x[idx] @inline _getindex_cols(x::AbstractIndex, idx::Function) = findall(idx, names(x)) -# the definition below is needed because `:` is a Function +# the definition below is needed because `:` is a `Function` @inline _getindex_cols(x::AbstractIndex, idx::Colon) = x[idx] @inline function Base.getindex(x::AbstractIndex, idx::Cols) @@ -432,7 +433,7 @@ end # return Vector{Symbol} of names from add_ind that do not clash with `ind`. # if `makeunique=false` error on collision -# if `makeunique=false` generate new names that are deduplicated +# if `makeunique=true` generate new names that are deduplicated function add_names(ind::Index, add_ind::AbstractIndex; makeunique::Bool=false) u = copy(_names(add_ind)) @@ -451,6 +452,7 @@ function add_names(ind::Index, add_ind::AbstractIndex; makeunique::Bool=false) throw(ArgumentError(msg)) end end + for i in dups nm = u[i] k = 1 diff --git a/src/other/metadata.jl b/src/other/metadata.jl index 60a283d5a..38ac84876 100644 --- a/src/other/metadata.jl +++ b/src/other/metadata.jl @@ -705,6 +705,24 @@ function _copy_col_note_metadata!(dst::DataFrame, dst_col, src, src_col) return nothing end +# merge column-level :note-style metadata from Tables.jl table src into dst +# from column src_col to dst_col +# keeping previous metadata contents of dst: keys already present on dst_col are not overwritten +function _merge_col_note_metadata!(dst::DataFrame, dst_col, src, src_col) + #emptycolmetadata!(dst, dst_col) + metadata = colmetadata(dst, dst_col) + if DataAPI.colmetadatasupport(typeof(src)).read + for key in colmetadatakeys(src, src_col) + val, style = colmetadata(src, src_col, key, style=true) + # write only :note-style keys that dst_col did not have before the loop + if style === :note && !haskey(metadata, key) + colmetadata!(dst, dst_col, key, val, style=:note) + end + end + end + return nothing +end + # this is a function used to copy table-level and column-level :note-style metadata # from Tables.jl table
src to dst, discarding previous metadata contents of dst function _copy_all_note_metadata!(dst::DataFrame, src) diff --git a/src/other/tables.jl b/src/other/tables.jl index 4213c9888..3953e50d2 100644 --- a/src/other/tables.jl +++ b/src/other/tables.jl @@ -64,11 +64,11 @@ end # the logic here relies on the fact that Tables.CopiedColumns # is the only exception for default copycols value -DataFrame(x, cnames::AbstractVector; makeunique::Bool=false, +DataFrame(x, cnames::AbstractVector; makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Union{Nothing, Bool}=nothing) = rename!(DataFrame(x, copycols=something(copycols, !(x isa Tables.CopiedColumns))), _name2symbol(cnames), - makeunique=makeunique) + makeunique=makeunique, mergeduplicates=mergeduplicates) function Base.append!(df::DataFrame, table; cols::Symbol=:setequal, promote::Bool=(cols in [:union, :subset])) diff --git a/src/other/utils.jl b/src/other/utils.jl index 455c406f4..29609748e 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -72,6 +72,21 @@ struct AsTable end end +""" +MergeDuplicates = Union{Nothing,Function} + +Wherever the `mergeduplicates` keyword argument is available it is either `nothing` or +a `Function` that will be executed to combine duplicated columns (when `makeunique=false`) +""" +MergeDuplicates = Union{Nothing,Function} + +# validate the `mergeduplicates`/`makeunique` combination: +# throw an ArgumentError when `makeunique=true` and `mergeduplicates` is not `nothing`, +# otherwise return `mergeduplicates` unchanged +function _check_makeunique_args(mergeduplicates::MergeDuplicates, makeunique::Bool=false) + if makeunique && !isnothing(mergeduplicates) + throw(ArgumentError("mergeduplicates should not be set if makeunique=true")) + end + mergeduplicates +end + Base.broadcastable(x::AsTable) = Ref(x) function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; @@ -102,15 +117,19 @@ function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; for i in dups nm = src[i] - k = 1 - while true - newnm = Symbol("$(nm)_$k") - if !in(newnm, seen) - names[i] = newnm - push!(seen, newnm) - break + if makeunique + k = 1 + while true +
newnm = Symbol("$(nm)_$k") + if !in(newnm, seen) + names[i] = newnm + push!(seen, newnm) + break + end + k += 1 end - k += 1 + else + names[i] = nm end end diff --git a/test/cat.jl b/test/cat.jl index b5aa1cfd9..da8ca2a1e 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -3,6 +3,8 @@ module TestCat using Test, Random, DataFrames, CategoricalArrays const ≅ = isequal +update_missing = (x...) -> coalesce(reverse(x)...) + @testset "hcat" begin nvint = [1, 2, missing, 4] nvstr = ["one", "two", missing, "four"] @@ -20,6 +22,12 @@ const ≅ = isequal @test dfh[!, :x1] ≅ df3[!, :x1] @test dfh ≅ DataFrames.hcat!(DataFrame(), df3, df4, makeunique=true) + dfhu = hcat(df3, df4, mergeduplicates=update_missing) + @test ref_df ≅ df3 # make sure that df3 is not mutated by hcat + @test size(dfhu, 2) == 2 + @test names(dfhu) ≅ ["x1", "x2"] + @test ! (dfhu[!, :x1] ≅ df3[!, :x1]) + dfa = DataFrame(a=[1, 2]) dfb = DataFrame(b=[3, missing]) @test hcat(dfa, dfb) ≅ [dfa dfb] @@ -29,7 +37,15 @@ const ≅ = isequal @test dfh3 ≅ hcat(dfh, df5, makeunique=true) @test dfh3 ≅ DataFrames.hcat!(DataFrame(), df3, df4, df5, makeunique=true) + dfh = hcat(df3, df4, mergeduplicates=sum∘tuple) + @test dfh ≅ DataFrames.hcat!(DataFrame(), df3, df4, mergeduplicates=sum∘tuple) + @test df2 ≅ DataFrames.hcat!(df2, makeunique=true) + + dfh3 = hcat(df3, df4, df5, mergeduplicates=update_missing) + @test names(dfh3) == ["x1", "x2"] + @test dfh3 ≅ hcat(dfhu, df5, mergeduplicates=update_missing) + @test dfh3 ≅ DataFrames.hcat!(DataFrame(), df3, df4, df5, mergeduplicates=update_missing) end @testset "hcat: copying" begin @@ -43,6 +59,13 @@ end @test hdf[!, 2] !== df[!, 1] @test hdf[!, 1] == hdf[!, 2] @test hdf[!, 1] !== hdf[!, 2] + hdf = hcat(df, df, makeunique=true) + @test hdf[!, 1] == df[!, 1] + @test hdf[!, 1] !== df[!, 1] + @test hdf[!, 2] == df[!, 1] + @test hdf[!, 2] !== df[!, 1] + @test hdf[!, 1] == hdf[!, 2] + @test hdf[!, 1] !== hdf[!, 2] hdf = hcat(df, df, df, makeunique=true) @test hdf[!, 1] == df[!, 1] @test 
hdf[!, 1] !== df[!, 1] @@ -56,6 +79,8 @@ end @test hdf[!, 1] !== hdf[!, 3] @test hdf[!, 2] == hdf[!, 3] @test hdf[!, 2] !== hdf[!, 3] + hdf = hcat(df, df, mergeduplicates=update_missing) + @test hdf ≅ df end @testset "hcat ::AbstractDataFrame" begin @@ -64,7 +89,6 @@ end answer = DataFrame(A=fill('A', 4), B=1:4, A_1='B', B_1=5:8, A_2='C', B_2=9:12) @test hcat(gd..., makeunique=true) == answer answer = answer[:, 1:4] - @test hcat(gd[1], gd[2], makeunique=true) == answer @test_throws MethodError hcat("a", df, makeunique=true) @test_throws MethodError hcat(df, "a", makeunique=true) diff --git a/test/dataframe.jl b/test/dataframe.jl index 971d7626d..1f4f0e69b 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -124,6 +124,8 @@ end @test DataFrame(a=[1, 2, missing], b=[4, 5, 6]) ≇ DataFrame(a=[1, 2, 3], b=[4, 5, 6]) end +update_missing = (x...) -> coalesce(reverse(x)...) + @testset "copying" begin df = DataFrame(a=Union{Int, Missing}[2, 3], b=Union{DataFrame, Missing}[DataFrame(c=1), DataFrame(d=2)]) @@ -152,6 +154,7 @@ end @test names(rename(df, [:f, :g])) == ["f", "g"] @test names(rename(df, [:f, :f], makeunique=true)) == ["f", "f_1"] + #@test names(rename(df, [:f, :f], mergeduplicates=update_missing)) == ["f", "f"] @test names(df) == ["a", "b"] rename!(df, [:f, :g]) @@ -258,6 +261,19 @@ end @test propertynames(df) == [:a_2, :a, :a_1] insertcols!(df, 4, :a => [11, 12], makeunique=true) @test propertynames(df) == [:a_2, :a, :a_1, :a_3] + + df = DataFrame(a=[1, 2], a_1=[3, 4]) + insertcols!(df, 1, :a => [11, 12], makeunique=true) + @test propertynames(df) == [:a_2, :a, :a_1] + insertcols!(df, 4, :a => [11, 12], makeunique=true) + @test propertynames(df) == [:a_2, :a, :a_1, :a_3] + + df = DataFrame(a=[1, 2, 3], a_1=[3, 4, 5]) + insertcols!(df, 1, :a => [11, 12, missing], mergeduplicates=update_missing) + @test propertynames(df) == [:a, :a_1] + @test df == DataFrame(a=[11, 12, 3], a_1=[3, 4, 5]) + + @test_throws ArgumentError insertcols!(df, 10, :a => [11, 12], 
makeunique=true) @test_throws ArgumentError insertcols!(df, 10, :a => [11, 12], makeunique=true) dfc = copy(df) @@ -303,6 +319,11 @@ end @test df.a_1 === v2 @test df.a_2 === v3 + df = DataFrame() + @test insertcols!(df, 1, :a=>v1, :a=>v2, :a=>v3, mergeduplicates=update_missing, copycols=false) == + DataFrame(a=v3) + @test df.a isa Vector{Int} + df = DataFrame(p='a':'b', q='r':'s') @test insertcols!(df, 2, :a=>v1, :b=>v2, :c=>v3) == DataFrame(p='a':'b', a=v1, b=v2, c=v3, q='r':'s') @@ -319,6 +340,15 @@ end @test df.q_1 !== v2 @test df.p_2 !== v3 + df = DataFrame(p='a':'b', q='r':'s') + @test insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3, makeunique=true, copycols=true) == + DataFrame(p='a':'b', p_1=v1, q_1=v2, p_2=v3, q='r':'s') + + df = DataFrame(p='a':'b', q='r':'s') + @test_throws ArgumentError insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3) + @test insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3, mergeduplicates=update_missing, copycols=true) == + DataFrame(p=v3, q=v2) + df = DataFrame(a=1:3, b=4:6) @test insertcols!(copy(df), :c=>7:9) == insertcols!(copy(df), 3, :c=>7:9) df = DataFrame() diff --git a/test/join.jl b/test/join.jl index 478cca98d..f0397fbd6 100644 --- a/test/join.jl +++ b/test/join.jl @@ -182,12 +182,12 @@ end @test typeof.(eachcol(crossjoin(df1, df2, makeunique=true))) == [Vector{Int}, Vector{Float64}, Vector{Int}, Vector{Float64}] - i(on) = innerjoin(df1, df2, on=on, makeunique=true) - l(on) = leftjoin(df1, df2, on=on, makeunique=true) - r(on) = rightjoin(df1, df2, on=on, makeunique=true) - o(on) = outerjoin(df1, df2, on=on, makeunique=true) - s(on) = semijoin(df1, df2, on=on, makeunique=true) - a(on) = antijoin(df1, df2, on=on, makeunique=true) + i(on,makeunique=true) = innerjoin(df1, df2, on=on, makeunique=makeunique) + l(on,makeunique=true) = leftjoin(df1, df2, on=on, makeunique=makeunique) + r(on,makeunique=true) = rightjoin(df1, df2, on=on, makeunique=makeunique) + o(on,makeunique=true) = outerjoin(df1, df2, on=on, makeunique=makeunique) + 
s(on,makeunique=true) = semijoin(df1, df2, on=on, makeunique=makeunique) + a(on,makeunique=true) = antijoin(df1, df2, on=on, makeunique=makeunique) @test s(:id) == s(:fid) == @@ -251,6 +251,73 @@ end @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] end +update_missing = (x...) -> coalesce(reverse(x)...) + +@testset "update joins" begin + df1 = DataFrame(Any[[1, 3, 5], [1.0, 3.0, 5.0]], [:id, :fid]) + df2 = DataFrame(Any[[0, 1, 2, 3, 4], [0.0, 1.0, 2.0, 3.0, 4.0]], [:id, :fid]) + + @test crossjoin(df1, df2, mergeduplicates=update_missing) == + DataFrame(Any[repeat([0, 1, 2, 3, 4], outer=3), + repeat([0.0, 1.0, 2.0, 3.0, 4.0], outer=3)], + [:id, :fid]) + + i(on,mergeduplicates=update_missing) = innerjoin(df1, df2, on=on, mergeduplicates=mergeduplicates) + l(on,mergeduplicates=update_missing) = leftjoin(df1, df2, on=on, mergeduplicates=mergeduplicates) + r(on,mergeduplicates=update_missing) = rightjoin(df1, df2, on=on, mergeduplicates=mergeduplicates) + o(on,mergeduplicates=update_missing) = outerjoin(df1, df2, on=on, mergeduplicates=mergeduplicates) + s(on,mergeduplicates=update_missing) = semijoin(df1, df2, on=on, mergeduplicates=mergeduplicates) + a(on,mergeduplicates=update_missing) = antijoin(df1, df2, on=on, mergeduplicates=mergeduplicates) + + @test s(:id) == + s(:fid) == + s([:id, :fid]) == DataFrame([[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(eachcol(s(:id))) == + typeof.(eachcol(s(:fid))) == + typeof.(eachcol(s([:id, :fid]))) == [Vector{Int}, Vector{Float64}] + @test a(:id) == + a(:fid) == + a([:id, :fid]) == DataFrame([[5], [5]], [:id, :fid]) + @test typeof.(eachcol(a(:id))) == + typeof.(eachcol(a(:fid))) == + typeof.(eachcol(a([:id, :fid]))) == [Vector{Int}, Vector{Float64}] + + on = :id + @test i(on) == DataFrame([[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(eachcol(i(on))) == [Vector{Int}, Vector{Float64}] + @test l(on) ≅ DataFrame(id=[1, 3, 5], fid=[1, 3, 5]) + @test typeof.(eachcol(l(on))) == [Vector{Int}, Vector{Float64}] + @test 
r(on) ≅ DataFrame(id=[1, 3, 0, 2, 4], fid=[1, 3, 0, 2, 4]) + @test typeof.(eachcol(r(on))) == [Vector{Int}, Vector{Float64}] + @test o(on) ≅ DataFrame(id=[1, 3, 5, 0, 2, 4], + fid=[1, 3, 5, 0, 2, 4]) + @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] + + on = :fid + df1.id = [1, missing, 5] + @test i(on) == DataFrame([[1, 3], [1.0, 3.0]], [:id, :fid]) + @test typeof.(eachcol(i(on))) == [Vector{Int}, Vector{Float64}] + @test l(on) ≅ DataFrame(id=[1, 3, 5], fid=[1, 3, 5]) + @test typeof.(eachcol(l(on))) == [Vector{Int}, Vector{Float64}] + @test r(on) ≅ DataFrame(id=[1, 3, 0, 2, 4], + fid=[1, 3, 0, 2, 4]) + @test typeof.(eachcol(r(on))) == [Vector{Int}, Vector{Float64}] + @test o(on) ≅ DataFrame(id=[1, 3, 5, 0, 2, 4], + fid=[1, 3, 5, 0, 2, 4]) + @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] + + on = [:id, :fid] + df1.id = [1, 3, 5] + @test i(on) == DataFrame([[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(eachcol(i(on))) == [Vector{Int}, Vector{Float64}] + @test l(on) == DataFrame(id=[1, 3, 5], fid=[1, 3, 5]) + @test typeof.(eachcol(l(on))) == [Vector{Int}, Vector{Float64}] + @test r(on) == DataFrame(id=[1, 3, 0, 2, 4], fid=[1, 3, 0, 2, 4]) + @test typeof.(eachcol(r(on))) == [Vector{Int}, Vector{Float64}] + @test o(on) == DataFrame(id=[1, 3, 5, 0, 2, 4], fid=[1, 3, 5, 0, 2, 4]) + @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] +end + @testset "all joins with CategoricalArrays" begin df1 = DataFrame(Any[CategoricalArray([1, 3, 5]), CategoricalArray([1.0, 3.0, 5.0])], [:id, :fid]) diff --git a/test/reshape.jl b/test/reshape.jl index d78e46144..6b4ca01da 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -700,6 +700,7 @@ end @test names(permutedims(df3, 1, makeunique=true)) == d3pd_names @test_throws ArgumentError permutedims(df3[!, [:a]], 1) # single column branch @test names(permutedims(df3[!, [:a]], 1, makeunique=true)) == d3pd_names + @test names(permutedims(df3, 1, mergeduplicates=sum∘tuple)) == ["a", "x"] df4 = 
DataFrame(a=rand(2), b=rand(2), c=[1, 2], d=[1.0, missing], e=["x", "y"], f=[:x, :y], # valid src @@ -753,6 +754,7 @@ end @test permutedims(df, ["p", "p"], makeunique=true) == DataFrame(p=[1, 3], p_1=[2, 4]) @test permutedims(DataFrame()) == permutedims(DataFrame(a=[], b=[])) == permutedims(DataFrame(), []) == permutedims(DataFrame(a=[], b=[]), []) == DataFrame() + @test permutedims(df, ["p", "p"], makeunique=false, mergeduplicates=sum∘tuple) == DataFrame(p=[3, 7]) end @testset "stack view=true additional tests" begin