From 7d3b7cc4b13a57c21c4b973c855bf8b1fcfbc8c3 Mon Sep 17 00:00:00 2001 From: Lee Iverson Date: Mon, 31 Jul 2023 10:30:21 -0700 Subject: [PATCH] Overload makeunique to allow true/false, certain keywords (:update, :ignore) and a combine function to combine columns. --- src/abstractdataframe/abstractdataframe.jl | 106 ++++++++++++--------- src/abstractdataframe/reshape.jl | 8 +- src/dataframe/dataframe.jl | 65 ++++++++----- src/join/composer.jl | 99 ++++++++++++------- src/join/inplace.jl | 41 ++++++-- src/other/index.jl | 47 ++++++--- src/other/metadata.jl | 18 ++++ src/other/tables.jl | 2 +- src/other/utils.jl | 36 ++++--- test/cat.jl | 13 +++ test/dataframe.jl | 20 +++- test/join.jl | 83 ++++++++++++++-- 12 files changed, 396 insertions(+), 142 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index a40627c6a4..0a8d18b6db 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -117,9 +117,9 @@ Compat.hasproperty(df::AbstractDataFrame, s::AbstractString) = haskey(index(df), """ rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol}; - makeunique::Bool=false) + makeunique=false) rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; - makeunique::Bool=false) + makeunique=false) rename!(df::AbstractDataFrame, (from => to)::Pair...) rename!(df::AbstractDataFrame, d::AbstractDict) rename!(df::AbstractDataFrame, d::AbstractVector{<:Pair}) @@ -138,7 +138,8 @@ Each name is changed at most once. Permutation of names is allowed. of the same length as the number of columns in `df` - `makeunique` : if `false` (the default), an error will be raised if duplicate names are found; if `true`, duplicate names will be suffixed - with `_i` (`i` starting at 1 for the first duplicate). + with `_i` (`i` starting at 1 for the first duplicate). 
If a Function of two + inputs is passed, that function is used to combine the duplicate column with the original. If pairs are passed to `rename!` (as positional arguments or in a dictionary or a vector) then: @@ -197,7 +198,7 @@ julia> rename!(uppercase, df) ``` """ function rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol}; - makeunique::Bool=false) + makeunique=false) rename!(index(df), vals, makeunique=makeunique) # renaming columns of SubDataFrame has to clean non-note metadata in its parent _drop_all_nonnote_metadata!(parent(df)) @@ -205,7 +206,7 @@ function rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol}; end function rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; - makeunique::Bool=false) + makeunique=false) rename!(index(df), Symbol.(vals), makeunique=makeunique) # renaming columns of SubDataFrame has to clean non-note metadata in its parent _drop_all_nonnote_metadata!(parent(df)) @@ -261,9 +262,9 @@ end """ rename(df::AbstractDataFrame, vals::AbstractVector{Symbol}; - makeunique::Bool=false) + makeunique=false) rename(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; - makeunique::Bool=false) + makeunique=false) rename(df::AbstractDataFrame, (from => to)::Pair...) rename(df::AbstractDataFrame, d::AbstractDict) rename(df::AbstractDataFrame, d::AbstractVector{<:Pair}) @@ -353,9 +354,9 @@ julia> rename(uppercase, df) ``` """ rename(df::AbstractDataFrame, vals::AbstractVector{Symbol}; - makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique) + makeunique=false) = rename!(copy(df), vals, makeunique=makeunique) rename(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString}; - makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique) + makeunique=false) = rename!(copy(df), vals, makeunique=makeunique) rename(df::AbstractDataFrame, args...) = rename!(copy(df), args...) 
rename(f::Function, df::AbstractDataFrame) = rename!(f, copy(df)) @@ -1536,13 +1537,15 @@ end """ hcat(df::AbstractDataFrame...; - makeunique::Bool=false, copycols::Bool=true) + makeunique=false, copycols::Bool=true) Horizontally concatenate data frames. If `makeunique=false` (the default) column names of passed objects must be unique. If `makeunique=true` then duplicate column names will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +If `makeunique` is a Function of two inputs then will use that function to combine +the left-hand values with the right-hand values in the duplicated columns. If `copycols=true` (the default) then the `DataFrame` returned by `hcat` will contain copied columns from the source data frames. @@ -1593,24 +1596,23 @@ julia> df3.A === df1.A true ``` """ -function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, copycols::Bool=true) +function Base.hcat(df::AbstractDataFrame; makeunique=false, copycols::Bool=true) df = DataFrame(df, copycols=copycols) _drop_all_nonnote_metadata!(df) return df end # TODO: after deprecation remove AbstractVector methods -Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, copycols::Bool=true) = +Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique=false, copycols::Bool=true) = hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, copycols=copycols) -Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, copycols::Bool=true) = +Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique=false, copycols::Bool=true) = hcat!(x, df, makeunique=makeunique, copycols=copycols) Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, copycols::Bool=true) = - hcat!(DataFrame(df1, copycols=copycols), df2, - makeunique=makeunique, copycols=copycols) + makeunique=false, copycols::Bool=true) = + hcat!(DataFrame(df1, copycols=copycols), df2, makeunique=makeunique, copycols=copycols) 
Base.hcat(df::AbstractDataFrame, x::Union{AbstractVector, AbstractDataFrame}, y::Union{AbstractVector, AbstractDataFrame}...; - makeunique::Bool=false, copycols::Bool=true) = + makeunique=false, copycols::Bool=true) = hcat!(hcat(df, x, makeunique=makeunique, copycols=copycols), y..., makeunique=makeunique, copycols=copycols) @@ -2869,7 +2871,8 @@ const INSERTCOLS_ARGUMENTS = - `after` : if `true` columns are inserted after `col` - `makeunique` : defines what to do if `name` already exists in `df`; if it is `false` an error will be thrown; if it is `true` a new unique name will - be generated by adding a suffix + be generated by adding a suffix; if it is a Function then combines the two duplicate + column using that function to combine the left-hand value with the right-hand. - `copycols` : whether vectors passed as columns should be copied If `val` is an `AbstractRange` then the result of `collect(val)` is inserted. @@ -2891,7 +2894,7 @@ const INSERTCOLS_ARGUMENTS = """ insertcols(df::AbstractDataFrame[, col], (name=>val)::Pair...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) + after::Bool=false, makeunique=false, copycols::Bool=true) Insert a column into a copy of `df` data frame using the [`insertcols!`](@ref) function and return the newly created data frame. @@ -2942,13 +2945,13 @@ julia> insertcols(df, :a, :d => 7:9, after=true) ``` """ insertcols(df::AbstractDataFrame, args...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) = + after::Bool=false, makeunique=false, copycols::Bool=true) = insertcols!(copy(df), args...; after=after, makeunique=makeunique, copycols=copycols) """ insertcols!(df::AbstractDataFrame[, col], (name=>val)::Pair...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) + after::Bool=false, makeunique=false, copycols::Bool=true) Insert a column into a data frame in place. Return the updated data frame. 
@@ -2979,7 +2982,7 @@ julia> insertcols!(df, 1, :b => 'a':'c') 2 │ b 2 3 │ c 3 -julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true) +julia> insertcols!(df, 2, :c => 2:4, :c => 3:5, makeunique=true) 3×4 DataFrame Row │ b c c_1 a │ Char Int64 Int64 Int64 @@ -2999,7 +3002,9 @@ julia> insertcols!(df, :b, :d => 7:9, after=true) ``` """ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol}...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) + after::Bool=false, makeunique=false, copycols::Bool=true) + makeunique = _makeunique_normalize(makeunique) + if !is_column_insertion_allowed(df) throw(ArgumentError("insertcols! is only supported for DataFrame, or for " * "SubDataFrame created with `:` as column selector")) @@ -3025,15 +3030,15 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy "$(ncol(df)) columns at index $col_ind")) end - if !makeunique + if makeunique == false if !allunique(first.(name_cols)) throw(ArgumentError("Names of columns to be inserted into a data frame " * - "must be unique when `makeunique=true`")) + "must be unique when `makeunique=false`")) end for (n, _) in name_cols if hasproperty(df, n) throw(ArgumentError("Column $n is already present in the data frame " * - "which is not allowed when `makeunique=true`")) + "which is not allowed when `makeunique=false`")) end end end @@ -3103,19 +3108,11 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy dfp[!, name] = item_new else if hasproperty(dfp, name) - @assert makeunique - k = 1 - while true - nn = Symbol("$(name)_$k") - if !hasproperty(dfp, nn) - name = nn - break - end - k += 1 - end + col_ind = insert_unique(dfp, name, col_ind, item_new, makeunique) + else + insert!(index(dfp), col_ind, name) + insert!(_columns(dfp), col_ind, item_new) end - insert!(index(dfp), col_ind, name) - insert!(_columns(dfp), col_ind, item_new) end col_ind += 1 end @@ -3134,22 +3131,22 @@ function 
insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy end insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString}...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) = + after::Bool=false, makeunique=false, copycols::Bool=true) = insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)..., after=after, makeunique=makeunique, copycols=copycols) insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol}...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) = + after::Bool=false, makeunique=false, copycols::Bool=true) = insertcols!(df, ncol(df)+1, name_cols..., after=after, makeunique=makeunique, copycols=copycols) insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString}...; - after::Bool=false, makeunique::Bool=false, copycols::Bool=true) = + after::Bool=false, makeunique=false, copycols::Bool=true) = insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)..., after=after, makeunique=makeunique, copycols=copycols) function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false, - makeunique::Bool=false, copycols::Bool=true) + makeunique=false, copycols::Bool=true) if col isa SymbolOrString col_ind = Int(columnindex(df, col)) if col_ind == 0 @@ -3173,11 +3170,34 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false, end function insertcols!(df::AbstractDataFrame; after::Bool=false, - makeunique::Bool=false, copycols::Bool=true) + makeunique=false, copycols::Bool=true) _drop_all_nonnote_metadata!(parent(df)) return df end +function insert_unique(dfp, name, col_ind, item_new, makeunique::Bool=false) + if makeunique + k = 1 + while true + nn = Symbol("$(name)_$k") + if !hasproperty(dfp, nn) + name = nn + break + end + k += 1 + end + insert!(index(dfp), col_ind, name) + insert!(_columns(dfp), col_ind, item_new) + end + col_ind +end + +function insert_unique(dfp, name, col_ind, item_new, makeunique::Function) + # Just update by 
using function without adding to index + dfp[!, name] = makeunique.(dfp[!, name], item_new) + col_ind - 1 +end + """ Iterators.partition(df::AbstractDataFrame, n::Integer) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 2effb6f2fd..88726676e5 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -723,7 +723,7 @@ Base.transpose(::AbstractDataFrame, args...; kwargs...) = permutedims(df::AbstractDataFrame, [src_namescol::Union{Int, Symbol, AbstractString}], [dest_namescol::Union{Symbol, AbstractString}]; - makeunique::Bool=false, strict::Bool=true) + makeunique=false, strict::Bool=true) Turn `df` on its side such that rows become columns and values in the column indexed by `src_namescol` become the names of new columns. @@ -823,7 +823,7 @@ julia> permutedims(df2, 1, "different_name") """ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, dest_namescol::Union{Symbol, AbstractString}; - makeunique::Bool=false, strict::Bool=true) + makeunique=false, strict::Bool=true) if src_namescol isa Integer 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) @@ -865,7 +865,7 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex, end function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex; - makeunique::Bool=false, strict::Bool=true) + makeunique=false, strict::Bool=true) if src_namescol isa Integer 1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol)) dest_namescol = _names(df)[src_namescol] @@ -883,7 +883,7 @@ function Base.permutedims(df::AbstractDataFrame) end function Base.permutedims(df::AbstractDataFrame, cnames::AbstractVector; - makeunique::Bool=false) + makeunique=false) out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique) _copy_table_note_metadata!(out_df, df) return out_df diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index 
3f4afafecf..5c4b110d76 100755 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -8,16 +8,16 @@ particularly a `Vector`, `PooledVector` or `CategoricalVector`. # Constructors ```julia -DataFrame(pairs::Pair...; makeunique::Bool=false, copycols::Bool=true) -DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, copycols::Bool=true) +DataFrame(pairs::Pair...; makeunique=false, copycols::Bool=true) +DataFrame(pairs::AbstractVector{<:Pair}; makeunique=false, copycols::Bool=true) DataFrame(ds::AbstractDict; copycols::Bool=true) DataFrame(; kwargs..., copycols::Bool=true) DataFrame(table; copycols::Union{Bool, Nothing}=nothing) DataFrame(table, names::AbstractVector; - makeunique::Bool=false, copycols::Union{Bool, Nothing}=nothing) + makeunique=false, copycols::Union{Bool, Nothing}=nothing) DataFrame(columns::AbstractVecOrMat, names::AbstractVector; - makeunique::Bool=false, copycols::Bool=true) + makeunique=false, copycols::Bool=true) DataFrame(::DataFrameRow; copycols::Bool=true) DataFrame(::GroupedDataFrame; copycols::Bool=true, keepkeys::Bool=true) @@ -88,6 +88,9 @@ By default an error will be raised if duplicates in column names are found. Pass in which case they will be suffixed with `_i` (`i` starting at 1 for the first duplicate). +If duplicate column names are found and `makeunique` is a Function then the left-hand column is `updated` +with the output of the function applied to the values from the left-hand column and the right-hand column. + If an `AbstractRange` is passed to a `DataFrame` constructor as a column it is always collected to a `Vector` (even if `copycols=false`). 
As a general rule `AbstractRange` values are always materialized to a `Vector` by all functions in @@ -194,7 +197,7 @@ mutable struct DataFrame <: AbstractDataFrame colindex::Index; copycols::Bool=true) if length(columns) == length(colindex) == 0 return new(AbstractVector[], Index(), nothing, nothing, true) - elseif length(columns) != length(colindex) + elseif length(columns) != column_length(colindex) throw(DimensionMismatch("Number of columns ($(length(columns))) and number of " * "column names ($(length(colindex))) are not equal")) end @@ -232,6 +235,22 @@ mutable struct DataFrame <: AbstractDataFrame firstindex(col) != 1 && _onebased_check_error(i, col) end + # process updates if they exist + if !isempty(colindex.updates) + updated = Vector{Any}(nothing, length(colindex.names)) + for src in eachindex(colindex.updates) + name = colindex.updates[src] + dst = colindex.lookup[name] + if isnothing(updated[dst]) + updated[dst] = columns[src] + else + updated[dst] = colindex.updatefun.(updated[dst], columns[src]) + end + end + columns = updated + colindex = Index(colindex.lookup, colindex.names) + end + return new(convert(Vector{AbstractVector}, columns), colindex, nothing, nothing, true) end end @@ -254,7 +273,8 @@ end DataFrame(df::DataFrame; copycols::Bool=true) = copy(df, copycols=copycols) -function DataFrame(pairs::Pair{Symbol, <:Any}...; makeunique::Bool=false, +function DataFrame(pairs::Pair{Symbol, <:Any}...; + makeunique=false, copycols::Bool=true)::DataFrame colnames = [Symbol(k) for (k, v) in pairs] columns = Any[v for (k, v) in pairs] @@ -262,7 +282,8 @@ function DataFrame(pairs::Pair{Symbol, <:Any}...; makeunique::Bool=false, copycols=copycols) end -function DataFrame(pairs::Pair{<:AbstractString, <:Any}...; makeunique::Bool=false, +function DataFrame(pairs::Pair{<:AbstractString, <:Any}...; + makeunique=false, copycols::Bool=true)::DataFrame colnames = [Symbol(k) for (k, v) in pairs] columns = Any[v for (k, v) in pairs] @@ -271,8 +292,8 @@ function 
DataFrame(pairs::Pair{<:AbstractString, <:Any}...; makeunique::Bool=fal end # this is needed as a workaround for Tables.jl dispatch -function DataFrame(pairs::AbstractVector{<:Pair}; makeunique::Bool=false, - copycols::Bool=true) +function DataFrame(pairs::AbstractVector{<:Pair}; + makeunique=false, copycols::Bool=true) if isempty(pairs) return DataFrame() else @@ -334,7 +355,7 @@ function DataFrame(; kwargs...) end function DataFrame(columns::AbstractVector, cnames::AbstractVector{Symbol}; - makeunique::Bool=false, copycols::Bool=true)::DataFrame + makeunique=false, copycols::Bool=true)::DataFrame if !(eltype(columns) <: AbstractVector) && !all(col -> isa(col, AbstractVector), columns) return rename!(DataFrame(columns, copycols=copycols), cnames, makeunique=makeunique) end @@ -351,17 +372,17 @@ function _name2symbol(str::AbstractVector) end DataFrame(columns::AbstractVector, cnames::AbstractVector; - makeunique::Bool=false, copycols::Bool=true) = + makeunique=false, copycols::Bool=true) = DataFrame(columns, _name2symbol(cnames), makeunique=makeunique, copycols=copycols) DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector{Symbol}; - makeunique::Bool=false, copycols::Bool=true)::DataFrame = + makeunique=false, copycols::Bool=true)::DataFrame = DataFrame(collect(AbstractVector, columns), Index(convert(Vector{Symbol}, cnames), makeunique=makeunique), copycols=copycols) DataFrame(columns::AbstractVector{<:AbstractVector}, cnames::AbstractVector; - makeunique::Bool=false, copycols::Bool=true) = + makeunique=false, copycols::Bool=true) = DataFrame(columns, _name2symbol(cnames); makeunique=makeunique, copycols=copycols) function DataFrame(columns::AbstractVector, cnames::Symbol; copycols::Bool=true) @@ -375,14 +396,14 @@ function DataFrame(columns::AbstractVector, cnames::Symbol; copycols::Bool=true) end function DataFrame(columns::AbstractMatrix, cnames::AbstractVector{Symbol}; - makeunique::Bool=false, copycols::Bool=true) + makeunique=false, 
copycols::Bool=true) getter = copycols ? getindex : view return DataFrame(AbstractVector[getter(columns, :, i) for i in 1:size(columns, 2)], cnames, makeunique=makeunique, copycols=false) end DataFrame(columns::AbstractMatrix, cnames::AbstractVector; - makeunique::Bool=false, copycols::Bool=true) = + makeunique=false, copycols::Bool=true) = DataFrame(columns, _name2symbol(cnames); makeunique=makeunique, copycols=copycols) function DataFrame(columns::AbstractMatrix, cnames::Symbol; copycols::Bool=true) @@ -408,13 +429,13 @@ DataFrame(vecs::Vector{<:AbstractVector}) = "generate column names: `DataFrame(vecs, :auto)`")) DataFrame(column_eltypes::AbstractVector{<:Type}, cnames::AbstractVector{Symbol}, - nrows::Integer=0; makeunique::Bool=false) = + nrows::Integer=0; makeunique=false) = throw(ArgumentError("`DataFrame` constructor with passed eltypes is " * "not supported. Pass explicitly created columns to a " * "`DataFrame` constructor instead.")) DataFrame(column_eltypes::AbstractVector{<:Type}, cnames::AbstractVector{<:AbstractString}, - nrows::Integer=0; makeunique::Bool=false) = + nrows::Integer=0; makeunique=false) = throw(ArgumentError("`DataFrame` constructor with passed eltypes is " * "not supported. Pass explicitly created columns to a " * "`DataFrame` constructor instead.")) @@ -1202,7 +1223,7 @@ end # hcat! for 2 arguments, only a vector or a data frame is allowed function hcat!(df1::DataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, copycols::Bool=true) + makeunique=false, copycols::Bool=true) u = add_names(index(df1), index(df2), makeunique=makeunique) _drop_all_nonnote_metadata!(df1) @@ -1217,14 +1238,14 @@ end # TODO: after deprecation remove AbstractVector methods -function hcat!(df::DataFrame, x::AbstractVector; makeunique::Bool=false, copycols::Bool=true) +function hcat!(df::DataFrame, x::AbstractVector; makeunique=false, copycols::Bool=true) Base.depwarn("horizontal concatenation of data frame with a vector is deprecated. 
" * "Pass DataFrame(x1=x) instead.", :hcat!) return hcat!(df, DataFrame(AbstractVector[x], [:x1], copycols=false), makeunique=makeunique, copycols=copycols) end -function hcat!(x::AbstractVector, df::DataFrame; makeunique::Bool=false, copycols::Bool=true) +function hcat!(x::AbstractVector, df::DataFrame; makeunique=false, copycols::Bool=true) Base.depwarn("horizontal concatenation of data frame with a vector is deprecated. " * "Pass DataFrame(x1=x) instead.", :hcat!) return hcat!(DataFrame(AbstractVector[x], [:x1], copycols=copycols), df, @@ -1232,14 +1253,14 @@ function hcat!(x::AbstractVector, df::DataFrame; makeunique::Bool=false, copycol end # hcat! for 1-n arguments -function hcat!(df::DataFrame; makeunique::Bool=false, copycols::Bool=true) +function hcat!(df::DataFrame; makeunique=false, copycols::Bool=true) _drop_all_nonnote_metadata!(df) return df end hcat!(a::DataFrame, b::Union{AbstractDataFrame, AbstractVector}, c::Union{AbstractDataFrame, AbstractVector}...; - makeunique::Bool=false, copycols::Bool=true) = + makeunique=false, copycols::Bool=true) = hcat!(hcat!(a, b, makeunique=makeunique, copycols=copycols), c..., makeunique=makeunique, copycols=copycols) diff --git a/src/join/composer.jl b/src/join/composer.jl index 3cd4e90b3e..c3a50209f5 100644 --- a/src/join/composer.jl +++ b/src/join/composer.jl @@ -118,8 +118,10 @@ _rename_cols(old_names::AbstractVector{Symbol}, for n in old_names] function _propagate_join_metadata!(joiner::DataFrameJoiner, dfr_noon::AbstractDataFrame, - res::DataFrame, kind::Symbol) + res::DataFrame, kind::Symbol; + makeunique=false, names=nothing) @assert kind == :left || kind == :right || kind == :outer || kind == :inner + makeunique = _makeunique_normalize(makeunique) # The steps taken in this function are (all applies only to :note-style metadata): # We initially copy metadata from left table as left table is always used @@ -174,8 +176,17 @@ function _propagate_join_metadata!(joiner::DataFrameJoiner, dfr_noon::AbstractDa end 
end - for i in 1:ncol(dfr_noon) - _copy_col_note_metadata!(res, ncol(joiner.dfl) + i, dfr_noon, i) + if makeunique isa Bool + for i in 1:ncol(dfr_noon) + _copy_col_note_metadata!(res, ncol(joiner.dfl) + i, dfr_noon, i) + end + else + map = Index(names, makeunique=makeunique) + for i in 1:ncol(dfr_noon) + name = map.updates[ncol(joiner.dfl) + i] + dst = map.lookup[name] + _merge_col_note_metadata!(res, dst, dfr_noon, i) + end end if kind == :outer || kind == :inner @@ -234,8 +245,7 @@ function _count_sortperm!(input::Vector{Int}, count::Vector, return output end -function compose_inner_table(joiner::DataFrameJoiner, - makeunique::Bool, +function compose_inner_table(joiner::DataFrameJoiner, makeunique, left_rename::Union{Function, AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}, order::Symbol) @@ -280,7 +290,7 @@ function compose_inner_table(joiner::DataFrameJoiner, _rename_cols(_names(dfr_noon), right_rename)) res = DataFrame(cols, new_names, makeunique=makeunique, copycols=false) - _propagate_join_metadata!(joiner, dfr_noon, res, :inner) + _propagate_join_metadata!(joiner, dfr_noon, res, :inner, makeunique=makeunique, names=new_names) return res end @@ -292,7 +302,7 @@ function find_missing_idxs(present::Vector{Int}, target_len::Int) return _findall(not_seen) end -function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique::Bool, +function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique, left_rename::Union{Function, AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}, indicator::Union{Nothing, Symbol, AbstractString}, @@ -319,7 +329,7 @@ function compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique: leftonly_ixs, rightonly_ixs, order) end -function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique::Bool, +function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique, left_rename::Union{Function, 
AbstractString, Symbol}, right_rename::Union{Function, AbstractString, Symbol}, indicator::Union{Nothing, Symbol, AbstractString}, @@ -447,7 +457,7 @@ function _compose_joined_table(joiner::DataFrameJoiner, kind::Symbol, makeunique permute!(res, new_order) end - _propagate_join_metadata!(joiner, dfr_noon, res, kind) + _propagate_join_metadata!(joiner, dfr_noon, res, kind, makeunique=makeunique, names=new_names) return res, src_indicator end @@ -484,7 +494,7 @@ function _sort_compose_helper(fillval::Int, # value to use to fill unused indice end function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector}, kind::Symbol, makeunique::Bool, + on::Union{<:OnType, AbstractVector}, kind::Symbol, makeunique, indicator::Union{Nothing, Symbol, AbstractString}, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}, left_rename::Union{Function, AbstractString, Symbol}, @@ -604,9 +614,10 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; "both" => 3) indicatorcol = PooledArray(PooledArrays.RefArray(src_indicator), invpool, pool) - + + makeunique = _makeunique_normalize(makeunique) unique_indicator = indicator - if makeunique + if makeunique == true try_idx = 0 while hasproperty(joined, unique_indicator) try_idx += 1 @@ -614,12 +625,16 @@ function _join(df1::AbstractDataFrame, df2::AbstractDataFrame; end end - if hasproperty(joined, unique_indicator) - throw(ArgumentError("joined data frame already has column " * - ":$unique_indicator. Pass makeunique=true to " * - "make it unique using a suffix automatically.")) + if unique_indicator == indicator && !isa(makeunique, Bool) + joined[!, indicator] = makeunique.(joined[!, indicator], indicatorcol) + else + if hasproperty(joined, unique_indicator) + throw(ArgumentError("joined data frame already has column " * + ":$unique_indicator. 
Pass makeunique=true to " * + "make it unique using a suffix automatically.")) + end + joined[!, unique_indicator] = indicatorcol end - joined[!, unique_indicator] = indicatorcol else @assert isnothing(src_indicator) end @@ -755,7 +770,7 @@ julia> innerjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase = """ function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; on::Union{<:OnType, AbstractVector} = Symbol[], - makeunique::Bool=false, + makeunique=false, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), renamecols::Pair=identity => identity, matchmissing::Symbol=:error, @@ -772,7 +787,7 @@ end function innerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; on::Union{<:OnType, AbstractVector} = Symbol[], - makeunique::Bool=false, + makeunique=false, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error, order::Symbol=:undefined) @@ -813,8 +828,10 @@ change in future releases. `isequal`. `on` is a required argument. - `makeunique` : if `false` (the default), an error will be raised if duplicate names are found in columns not joined on; - if `true`, duplicate names will be suffixed with `_i` + if `true`, duplicate names will be suffixed with `_i` (deprecated) (`i` starting at 1 for the first duplicate). + if a Function then combines the duplicated column values by invoking the function + with the left-hand values and right-hand values as inputs. - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name, for whether a row appeared in only `df1` (`"left_only"`) or in both (`"both"`). 
If the name is already in use, @@ -915,12 +932,14 @@ julia> leftjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase => ``` """ function leftjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique=false, source::Union{Nothing, Symbol, AbstractString}=nothing, indicator::Union{Nothing, Symbol, AbstractString}=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), renamecols::Pair=identity => identity, matchmissing::Symbol=:error, order::Symbol=:undefined) + if !all(x -> x isa Union{Function, AbstractString, Symbol}, renamecols) throw(ArgumentError("renamecols keyword argument must be a `Pair` " * "containing functions, strings, or `Symbol`s")) @@ -970,7 +989,9 @@ change in future releases. - `makeunique` : if `false` (the default), an error will be raised if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` - (`i` starting at 1 for the first duplicate). + (`i` starting at 1 for the first duplicate); otherwise + if a Function then invokes that function with the left-hand side value + and the right-hand side value as inputs to produce the output value. - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name for whether a row appeared in only `df2` (`"right_only"`) or in both (`"both"`). 
If the name is already in use, @@ -1071,7 +1092,8 @@ julia> rightjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase = ``` """ function rightjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique=false, source::Union{Nothing, Symbol, AbstractString}=nothing, indicator::Union{Nothing, Symbol, AbstractString}=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), @@ -1102,7 +1124,7 @@ end """ outerjoin(df1, df2; on, makeunique=false, source=nothing, validate=(false, false), renamecols=(identity => identity), matchmissing=:error, order=:undefined) - outerjoin(df1, df2, dfs...; on, makeunique = false, + outerjoin(df1, df2, dfs...; on, makeunique=false, validate = (false, false), matchmissing=:error, order=:undefined) Perform an outer join of two or more data frame objects and return a `DataFrame` @@ -1128,7 +1150,9 @@ This behavior may change in future releases. - `makeunique` : if `false` (the default), an error will be raised if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` - (`i` starting at 1 for the first duplicate). + (`i` starting at 1 for the first duplicate), otherwise + if a Function then this function will be invoked on the values from the left-hand + column and the right-hand side column as inputs to create an output value. - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name for whether a row appeared in only `df1` (`"left_only"`), only `df2` (`"right_only"`) or in both (`"both"`). 
If the name is already in use, @@ -1240,7 +1264,8 @@ julia> outerjoin(name, job2, on = [:ID => :identifier], renamecols = uppercase = ``` """ function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique=false, source::Union{Nothing, Symbol, AbstractString}=nothing, indicator::Union{Nothing, Symbol, AbstractString}=nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), @@ -1269,7 +1294,8 @@ function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; end function outerjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], + makeunique=false, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error, order::Symbol=:undefined) res = outerjoin(df1, df2, on=on, makeunique=makeunique, validate=validate, @@ -1384,7 +1410,7 @@ julia> semijoin(name, job2, on = [:ID => :identifier]) ``` """ semijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], makeunique=false, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error) = _join(df1, df2, on=on, kind=:semi, makeunique=makeunique, @@ -1488,7 +1514,7 @@ julia> antijoin(name, job2, on = [:ID => :identifier]) ``` """ antijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector} = Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector} = Symbol[], makeunique=false, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false), matchmissing::Symbol=:error) = _join(df1, df2, on=on, kind=:anti, makeunique=makeunique, @@ -1499,7 +1525,7 @@ 
antijoin(df1::AbstractDataFrame, df2::AbstractDataFrame; """ crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, renamecols=identity => identity) + makeunique=false, renamecols=identity => identity) crossjoin(df1, df2, dfs...; makeunique = false) Perform a cross join of two or more data frame objects and return a `DataFrame` @@ -1565,10 +1591,11 @@ julia> crossjoin(df1, df2) ``` """ function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; - makeunique::Bool=false, renamecols::Pair=identity => identity) + makeunique=false, renamecols::Pair=identity => identity) _check_consistency(df1) _check_consistency(df2) r1, r2 = size(df1, 1), size(df2, 1) + makeunique = _makeunique_normalize(makeunique) new_names = vcat(_rename_cols(_names(df1), first(renamecols)), _rename_cols(_names(df2), last(renamecols))) @@ -1579,8 +1606,10 @@ function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; for i in 1:ncol(df1) _copy_col_note_metadata!(res, i, df1, i) end - for i in 1:ncol(df2) - _copy_col_note_metadata!(res, ncol(df1) + i, df2, i) + if makeunique isa Bool + for i in 1:ncol(df2) + _copy_col_note_metadata!(res, ncol(df1) + i, df2, i) + end end _merge_matching_table_note_metadata!(res, (df1, df2)) @@ -1589,13 +1618,13 @@ function crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame; end crossjoin(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; - makeunique::Bool=false) = + makeunique=false) = crossjoin(crossjoin(df1, df2, makeunique=makeunique), dfs..., makeunique=makeunique) # an explicit error is thrown as join was supported in the past Base.join(df1::AbstractDataFrame, df2::AbstractDataFrame, dfs::AbstractDataFrame...; on::Union{<:OnType, AbstractVector} = Symbol[], - kind::Symbol = :inner, makeunique::Bool=false, + kind::Symbol = :inner, makeunique=false, indicator::Union{Nothing, Symbol} = nothing, validate::Union{Pair{Bool, Bool}, Tuple{Bool, Bool}}=(false, false)) = throw(ArgumentError("join 
function for data frames is not supported. Use innerjoin, " * diff --git a/src/join/inplace.jl b/src/join/inplace.jl index 9f1a9e0c6c..1cec6b76e7 100644 --- a/src/join/inplace.jl +++ b/src/join/inplace.jl @@ -24,7 +24,10 @@ added to `df1`. - `makeunique` : if `false` (the default), an error will be raised if duplicate names are found in columns not joined on; if `true`, duplicate names will be suffixed with `_i` - (`i` starting at 1 for the first duplicate). + (`i` starting at 1 for the first duplicate); if a `Function`, + that function is applied to the values of the first duplicate column + and of the second duplicate column to produce the values of the output + column. - `source` : Default: `nothing`. If a `Symbol` or string, adds indicator column with the given name, for whether a row appeared in only `df1` (`"left_only"`) or in both (`"both"`). If the name is already in use, @@ -95,12 +98,14 @@ julia> leftjoin!(name, job2, on = :ID => :identifier, makeunique=true, source=:s ``` """ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; - on::Union{<:OnType, AbstractVector}=Symbol[], makeunique::Bool=false, + on::Union{<:OnType, AbstractVector}=Symbol[], + makeunique=false, source::Union{Nothing, Symbol, AbstractString}=nothing, matchmissing::Symbol=:error) _check_consistency(df1) _check_consistency(df2) + makeunique = _makeunique_normalize(makeunique) if !is_column_insertion_allowed(df1) throw(ArgumentError("leftjoin! 
is only supported if `df1` is a `DataFrame`, " * @@ -114,7 +119,7 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; joiner = DataFrameJoiner(df1, df2, on, matchmissing, :left) right_noon_names = names(joiner.dfr, Not(joiner.right_on)) - if !(makeunique || isempty(intersect(right_noon_names, names(df1)))) + if makeunique == false && !isempty(intersect(right_noon_names, names(df1))) throw(ArgumentError("the following columns are present in both " * "left and right data frames but not listed in `on`: " * join(intersect(right_noon_names, names(df1)), ", ") * @@ -149,7 +154,7 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; invpool, pool) unique_indicator = source - if makeunique + if makeunique == true try_idx = 0 while hasproperty(df1, unique_indicator) try_idx += 1 @@ -158,11 +163,18 @@ function leftjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; end if hasproperty(df1, unique_indicator) - throw(ArgumentError("joined data frame already has column " * - ":$unique_indicator. Pass makeunique=true to " * - "make it unique using a suffix automatically.")) + if makeunique isa Bool + throw(ArgumentError("joined data frame already has column " * + ":$unique_indicator. 
Pass makeunique=true to " * + "make it unique using a suffix automatically or a makeunique function " * + "to combine left-hand column and right-hand column values.")) + else + df1[!, unique_indicator] = makeunique.(df1[!, unique_indicator], indicatorcol) + end + else + df1[!, unique_indicator] = indicatorcol end - df1[!, unique_indicator] = indicatorcol + end return df1 @@ -192,3 +204,16 @@ function compose_joined_rcol!(rcol::AbstractVector, end return rcol_joined end + +function outerjoin!(df1::AbstractDataFrame, df2::AbstractDataFrame; + on::Union{<:OnType, AbstractVector}=Symbol[], makeunique=false, + source::Union{Nothing, Symbol, AbstractString}=nothing, + matchmissing::Symbol=:error) + + leftjoin!(df1, df2, on=on, makeunique=makeunique, source=source, matchmissing=matchmissing) + + aj = antijoin(df2, df1, on=on, makeunique=makeunique, matchmissing=matchmissing) + append!(df1, aj) + + return df1 +end diff --git a/src/other/index.jl b/src/other/index.jl index 51aa3a31cc..805e65d7d6 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -21,18 +21,30 @@ const MULTICOLUMNINDEX_STR = "`:`, `Cols`, `All`, `Between`, `Not`, a regular ex struct Index <: AbstractIndex # an OrderedDict would be nice here... 
lookup::Dict{Symbol, Int} # name => names array position names::Vector{Symbol} + updates::Vector{Symbol} + updatefun::Function end -function Index(names::AbstractVector{Symbol}; makeunique::Bool=false) - u = make_unique(names, makeunique=makeunique) - lookup = Dict{Symbol, Int}(zip(u, 1:length(u))) - return Index(lookup, u) +Index(l,u) = Index(l,u,[],() -> nothing) + +function Index(names::AbstractVector{Symbol}; makeunique=false) + makeunique = _makeunique_normalize(makeunique) + if makeunique isa Bool + u = make_unique(names, makeunique=makeunique) + lookup = Dict{Symbol, Int}(zip(u, 1:length(u))) + return Index(lookup, u) + else + lookup = Dict{Symbol, Int}(zip(reverse(names), length(names):-1:1)) + return Index(lookup, unique(names), names, makeunique) + end end Index() = Index(Dict{Symbol, Int}(), Symbol[]) Base.length(x::Index) = length(x.names) Base.names(x::Index) = string.(x.names) +column_length(x::Index) = isempty(x.updates) ? length(x.names) : length(x.updates) + # _names returns Vector{Symbol} _names(x::Index) = x.names @@ -41,8 +53,9 @@ Base.isequal(x::AbstractIndex, y::AbstractIndex) = _names(x) == _names(y) # it i Base.:(==)(x::AbstractIndex, y::AbstractIndex) = isequal(x, y) -function rename!(x::Index, nms::AbstractVector{Symbol}; makeunique::Bool=false) - if !makeunique +function rename!(x::Index, nms::AbstractVector{Symbol}; makeunique=false) + makeunique = _makeunique_normalize(makeunique) + if makeunique == false if length(unique(nms)) != length(nms) dup = unique(nms[nonunique(DataFrame(nms=nms))]) dupstr = join(string.(':', dup), ", ", " and ") @@ -128,7 +141,7 @@ function Base.push!(x::Index, nm::Symbol) return x end -function Base.merge!(x::Index, y::AbstractIndex; makeunique::Bool=false) +function Base.merge!(x::Index, y::AbstractIndex; makeunique=false) adds = add_names(x, y, makeunique=makeunique) i = length(x) for add in adds @@ -139,7 +152,7 @@ function Base.merge!(x::Index, y::AbstractIndex; makeunique::Bool=false) return x end 
-Base.merge(x::AbstractIndex, y::AbstractIndex; makeunique::Bool=false) = +Base.merge(x::AbstractIndex, y::AbstractIndex; makeunique=false) = merge!(copy(x), y, makeunique=makeunique) function Base.delete!(x::Index, idx::Integer) @@ -432,8 +445,9 @@ end # return Vector{Symbol} of names from add_ind that do not clash with `ind`. # if `makeunique=false` error on collision -# if `makeunique=false` generate new names that are deduplicated -function add_names(ind::Index, add_ind::AbstractIndex; makeunique::Bool=false) +# if `makeunique=true` generate new names that are deduplicated +# if `makeunique` is a Function just return the names including duplicates +function add_names(ind::Index, add_ind::AbstractIndex; makeunique=false) u = copy(_names(add_ind)) seen = Set(_names(ind)) @@ -443,6 +457,12 @@ function add_names(ind::Index, add_ind::AbstractIndex; makeunique::Bool=false) name = u[i] in(name, seen) ? push!(dups, i) : push!(seen, name) end + + makeunique = _makeunique_normalize(makeunique) + return nondup_names(u, dups, seen, makeunique) +end + +function nondup_names(u, dups, seen, makeunique::Bool) if length(dups) > 0 if !makeunique dupstr = join(string.(':', unique(u[dups])), ", ", " and ") @@ -451,6 +471,7 @@ function add_names(ind::Index, add_ind::AbstractIndex; makeunique::Bool=false) throw(ArgumentError(msg)) end end + for i in dups nm = u[i] k = 1 @@ -468,6 +489,10 @@ function add_names(ind::Index, add_ind::AbstractIndex; makeunique::Bool=false) return u end +function nondup_names(u, dups, seen, makeunique::Function) + return u +end + @inline parentcols(ind::Index) = Base.OneTo(length(ind)) @inline parentcols(ind::Index, cols) = ind[cols] @@ -578,7 +603,7 @@ function Base.getindex(x::SubIndex, idx::Union{AbstractVector{Symbol}, return [x[i] for i in idx] end -rename!(x::SubIndex, nms::AbstractVector{Symbol}; makeunique::Bool=false) = +rename!(x::SubIndex, nms::AbstractVector{Symbol}; makeunique=false) = throw(ArgumentError("rename! 
is not supported for views other than created " * "with Colon as a column selector")) diff --git a/src/other/metadata.jl b/src/other/metadata.jl index 60a283d5a0..38ac848769 100644 --- a/src/other/metadata.jl +++ b/src/other/metadata.jl @@ -705,6 +705,24 @@ function _copy_col_note_metadata!(dst::DataFrame, dst_col, src, src_col) return nothing end +# merge column-level :note-style metadata from Tables.jl table src into dst +# from column src_col to dst_col +# keeping previous metadata contents of dst (existing keys are not overwritten) +function _merge_col_note_metadata!(dst::DataFrame, dst_col, src, src_col) + #emptycolmetadata!(dst, dst_col) + metadata = colmetadata(dst, dst_col) + if DataAPI.colmetadatasupport(typeof(src)).read + for key in colmetadatakeys(src, src_col) + val, style = colmetadata(src, src_col, key, style=true) + # TODO write only if does not overwrite + if style === :note && !haskey(metadata, key) + colmetadata!(dst, dst_col, key, val, style=:note) + end + end + end + return nothing +end + # this is a function used to copy table-level and column-level :note-style metadata # from Tables.jl table src to dst, discarding previous metadata contents of dst function _copy_all_note_metadata!(dst::DataFrame, src) diff --git a/src/other/tables.jl b/src/other/tables.jl index 4213c98881..2fdb9a559e 100644 --- a/src/other/tables.jl +++ b/src/other/tables.jl @@ -64,7 +64,7 @@ end # the logic here relies on the fact that Tables.CopiedColumns # is the only exception for default copycols value -DataFrame(x, cnames::AbstractVector; makeunique::Bool=false, +DataFrame(x, cnames::AbstractVector; makeunique=false, copycols::Union{Nothing, Bool}=nothing) = rename!(DataFrame(x, copycols=something(copycols, !(x isa Tables.CopiedColumns))), _name2symbol(cnames), diff --git a/src/other/utils.jl b/src/other/utils.jl index 455c406f46..7a4fe8a446 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -72,10 +72,20 @@ struct AsTable end end +makeunique_update(v1, v2) = ismissing(v2) ? 
v1 : v2 +makeunique_ignore(v1, v2) = v1 + +_makeunique_keys = Dict(:update => makeunique_update, + :ignore => makeunique_ignore, + :error => false, + :makeunique => true) + +_makeunique_normalize(makeunique) = get(_makeunique_keys, makeunique, makeunique) + Base.broadcastable(x::AsTable) = Ref(x) function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; - makeunique::Bool=false) + makeunique=false) if length(names) != length(src) throw(DimensionMismatch("Length of src doesn't match length of names.")) end @@ -92,7 +102,7 @@ function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; end if length(dups) > 0 - if !makeunique + if makeunique == false dupstr = join(string.(':', unique(src[dups])), ", ", " and ") msg = "Duplicate variable names: $dupstr. Pass makeunique=true " * "to make them unique using a suffix automatically." @@ -102,22 +112,26 @@ function make_unique!(names::Vector{Symbol}, src::AbstractVector{Symbol}; for i in dups nm = src[i] - k = 1 - while true - newnm = Symbol("$(nm)_$k") - if !in(newnm, seen) - names[i] = newnm - push!(seen, newnm) - break + if makeunique == true + k = 1 + while true + newnm = Symbol("$(nm)_$k") + if !in(newnm, seen) + names[i] = newnm + push!(seen, newnm) + break + end + k += 1 end - k += 1 + else + names[i] = nm end end return names end -function make_unique(names::AbstractVector{Symbol}; makeunique::Bool=false) +function make_unique(names::AbstractVector{Symbol}; makeunique=false) make_unique!(similar(names), names, makeunique=makeunique) end diff --git a/test/cat.jl b/test/cat.jl index b5aa1cfd9b..888ff3a92f 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -20,6 +20,12 @@ const ≅ = isequal @test dfh[!, :x1] ≅ df3[!, :x1] @test dfh ≅ DataFrames.hcat!(DataFrame(), df3, df4, makeunique=true) + dfhu = hcat(df3, df4, makeunique=:update) + @test ref_df ≅ df3 # make sure that df3 is not mutated by hcat + @test size(dfhu, 2) == 2 + @test names(dfhu) ≅ ["x1", "x2"] + @test ! 
(dfhu[!, :x1] ≅ df3[!, :x1]) + dfa = DataFrame(a=[1, 2]) dfb = DataFrame(b=[3, missing]) @test hcat(dfa, dfb) ≅ [dfa dfb] @@ -30,6 +36,11 @@ const ≅ = isequal @test dfh3 ≅ DataFrames.hcat!(DataFrame(), df3, df4, df5, makeunique=true) @test df2 ≅ DataFrames.hcat!(df2, makeunique=true) + + dfh3 = hcat(df3, df4, df5, makeunique=:update) + @test names(dfh3) == ["x1", "x2"] + @test dfh3 ≅ hcat(dfhu, df5, makeunique=:update) + @test dfh3 ≅ DataFrames.hcat!(DataFrame(), df3, df4, df5, makeunique=:update) end @testset "hcat: copying" begin @@ -56,6 +67,8 @@ end @test hdf[!, 1] !== hdf[!, 3] @test hdf[!, 2] == hdf[!, 3] @test hdf[!, 2] !== hdf[!, 3] + hdf = hcat(df, df, makeunique=:update) + @test hdf ≅ df end @testset "hcat ::AbstractDataFrame" begin diff --git a/test/dataframe.jl b/test/dataframe.jl index 971d7626dd..340beff446 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -152,6 +152,7 @@ end @test names(rename(df, [:f, :g])) == ["f", "g"] @test names(rename(df, [:f, :f], makeunique=true)) == ["f", "f_1"] + @test names(rename(df, [:f, :f], makeunique=:update)) == ["f", "f"] @test names(df) == ["a", "b"] rename!(df, [:f, :g]) @@ -253,11 +254,18 @@ end df = DataFrame(a=[1, 2], a_1=[3, 4]) @test_throws ArgumentError insertcols!(df, 1, :a => [11, 12]) - @test df == DataFrame(a=[1, 2], a_1=[3, 4]) + + df = DataFrame(a=[1, 2], a_1=[3, 4]) insertcols!(df, 1, :a => [11, 12], makeunique=true) @test propertynames(df) == [:a_2, :a, :a_1] insertcols!(df, 4, :a => [11, 12], makeunique=true) @test propertynames(df) == [:a_2, :a, :a_1, :a_3] + + df = DataFrame(a=[1, 2], a_1=[3, 4]) + insertcols!(df, 1, :a => [11, 12], makeunique=:update) + @test propertynames(df) == [:a, :a_1] + @test df == DataFrame(a=[11, 12], a_1=[3, 4]) + @test_throws ArgumentError insertcols!(df, 10, :a => [11, 12], makeunique=true) dfc = copy(df) @@ -303,6 +311,11 @@ end @test df.a_1 === v2 @test df.a_2 === v3 + df = DataFrame() + @test insertcols!(df, 1, :a=>v1, :a=>v2, :a=>v3, makeunique=:update, 
copycols=false) == + DataFrame(a=v3) + @test df.a isa Vector{Int} + df = DataFrame(p='a':'b', q='r':'s') @test insertcols!(df, 2, :a=>v1, :b=>v2, :c=>v3) == DataFrame(p='a':'b', a=v1, b=v2, c=v3, q='r':'s') @@ -319,6 +332,11 @@ end @test df.q_1 !== v2 @test df.p_2 !== v3 + df = DataFrame(p='a':'b', q='r':'s') + @test_throws ArgumentError insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3) + @test insertcols!(df, 2, :p=>v1, :q=>v2, :p=>v3, makeunique=:update, copycols=true) == + DataFrame(p=v3, q=v2) + df = DataFrame(a=1:3, b=4:6) @test insertcols!(copy(df), :c=>7:9) == insertcols!(copy(df), 3, :c=>7:9) df = DataFrame() diff --git a/test/join.jl b/test/join.jl index 478cca98d3..81b76d052f 100644 --- a/test/join.jl +++ b/test/join.jl @@ -182,12 +182,12 @@ end @test typeof.(eachcol(crossjoin(df1, df2, makeunique=true))) == [Vector{Int}, Vector{Float64}, Vector{Int}, Vector{Float64}] - i(on) = innerjoin(df1, df2, on=on, makeunique=true) - l(on) = leftjoin(df1, df2, on=on, makeunique=true) - r(on) = rightjoin(df1, df2, on=on, makeunique=true) - o(on) = outerjoin(df1, df2, on=on, makeunique=true) - s(on) = semijoin(df1, df2, on=on, makeunique=true) - a(on) = antijoin(df1, df2, on=on, makeunique=true) + i(on,makeunique=true) = innerjoin(df1, df2, on=on, makeunique=makeunique) + l(on,makeunique=true) = leftjoin(df1, df2, on=on, makeunique=makeunique) + r(on,makeunique=true) = rightjoin(df1, df2, on=on, makeunique=makeunique) + o(on,makeunique=true) = outerjoin(df1, df2, on=on, makeunique=makeunique) + s(on,makeunique=true) = semijoin(df1, df2, on=on, makeunique=makeunique) + a(on,makeunique=true) = antijoin(df1, df2, on=on, makeunique=makeunique) @test s(:id) == s(:fid) == @@ -251,6 +251,77 @@ end @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] end +@testset "update joins" begin + df1 = DataFrame(Any[[1, 3, 5], [1.0, 3.0, 5.0]], [:id, :fid]) + df2 = DataFrame(Any[[0, 1, 2, 3, 4], [0.0, 1.0, 2.0, 3.0, 4.0]], [:id, :fid]) + + update = DataFrames.makeunique_update + + 
@test crossjoin(df1, df2, makeunique=update) == + DataFrame(Any[repeat([0, 1, 2, 3, 4], outer=3), + repeat([0.0, 1.0, 2.0, 3.0, 4.0], outer=3)], + [:id, :fid]) + @test crossjoin(df1, df2, makeunique=:update) == + DataFrame(Any[repeat([0, 1, 2, 3, 4], outer=3), + repeat([0.0, 1.0, 2.0, 3.0, 4.0], outer=3)], + [:id, :fid]) + + i(on,makeunique=:update) = innerjoin(df1, df2, on=on, makeunique=makeunique) + l(on,makeunique=:update) = leftjoin(df1, df2, on=on, makeunique=makeunique) + r(on,makeunique=:update) = rightjoin(df1, df2, on=on, makeunique=makeunique) + o(on,makeunique=:update) = outerjoin(df1, df2, on=on, makeunique=makeunique) + s(on,makeunique=:update) = semijoin(df1, df2, on=on, makeunique=makeunique) + a(on,makeunique=:update) = antijoin(df1, df2, on=on, makeunique=makeunique) + + @test s(:id) == + s(:fid) == + s([:id, :fid]) == DataFrame([[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(eachcol(s(:id))) == + typeof.(eachcol(s(:fid))) == + typeof.(eachcol(s([:id, :fid]))) == [Vector{Int}, Vector{Float64}] + @test a(:id) == + a(:fid) == + a([:id, :fid]) == DataFrame([[5], [5]], [:id, :fid]) + @test typeof.(eachcol(a(:id))) == + typeof.(eachcol(a(:fid))) == + typeof.(eachcol(a([:id, :fid]))) == [Vector{Int}, Vector{Float64}] + + on = :id + @test i(on) == DataFrame([[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(eachcol(i(on))) == [Vector{Int}, Vector{Float64}] + @test l(on) ≅ DataFrame(id=[1, 3, 5], fid=[1, 3, 5]) + @test typeof.(eachcol(l(on))) == [Vector{Int}, Vector{Float64}] + @test r(on) ≅ DataFrame(id=[1, 3, 0, 2, 4], fid=[1, 3, 0, 2, 4]) + @test typeof.(eachcol(r(on))) == [Vector{Int}, Vector{Float64}] + @test o(on) ≅ DataFrame(id=[1, 3, 5, 0, 2, 4], + fid=[1, 3, 5, 0, 2, 4]) + @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] + + on = :fid + df1.id = [1, missing, 5] + @test i(on) == DataFrame([[1, 3], [1.0, 3.0]], [:id, :fid]) + @test typeof.(eachcol(i(on))) == [Vector{Int}, Vector{Float64}] + @test l(on) ≅ DataFrame(id=[1, 3, 5], fid=[1, 3, 
5]) + @test typeof.(eachcol(l(on))) == [Vector{Int}, Vector{Float64}] + @test r(on) ≅ DataFrame(id=[1, 3, 0, 2, 4], + fid=[1, 3, 0, 2, 4]) + @test typeof.(eachcol(r(on))) == [Vector{Int}, Vector{Float64}] + @test o(on) ≅ DataFrame(id=[1, 3, 5, 0, 2, 4], + fid=[1, 3, 5, 0, 2, 4]) + @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] + + on = [:id, :fid] + df1.id = [1, 3, 5] + @test i(on) == DataFrame([[1, 3], [1, 3]], [:id, :fid]) + @test typeof.(eachcol(i(on))) == [Vector{Int}, Vector{Float64}] + @test l(on) == DataFrame(id=[1, 3, 5], fid=[1, 3, 5]) + @test typeof.(eachcol(l(on))) == [Vector{Int}, Vector{Float64}] + @test r(on) == DataFrame(id=[1, 3, 0, 2, 4], fid=[1, 3, 0, 2, 4]) + @test typeof.(eachcol(r(on))) == [Vector{Int}, Vector{Float64}] + @test o(on) == DataFrame(id=[1, 3, 5, 0, 2, 4], fid=[1, 3, 5, 0, 2, 4]) + @test typeof.(eachcol(o(on))) == [Vector{Int}, Vector{Float64}] +end + @testset "all joins with CategoricalArrays" begin df1 = DataFrame(Any[CategoricalArray([1, 3, 5]), CategoricalArray([1.0, 3.0, 5.0])], [:id, :fid])