Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updateindex #3401

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 97 additions & 47 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -197,18 +197,30 @@ julia> rename!(uppercase, df)
```
"""
function rename!(df::AbstractDataFrame, vals::AbstractVector{Symbol};
makeunique::Bool=false)
rename!(index(df), vals, makeunique=makeunique)
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing)
if !makeunique && isa(mergeduplicates, Function)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the docstring seems not to have been updated.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in particular, I am not clear what rename!/rename should do when mergeduplicates is passed.

(new_columns, colindex) = process_updates(UpdateIndex(vals), _columns(df), mergeduplicates)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_columns is not defined for general AbstractDataFrame.

# Now we must replace the columns and index with the new ones in place...
splice!(_columns(df), 1:length(_columns(df)), new_columns) # Replace the columns with these new ones...
rename!(index(df), colindex)
else
rename!(index(df), vals, makeunique=makeunique)
end
# renaming columns of SubDataFrame has to clean non-note metadata in its parent
_drop_all_nonnote_metadata!(parent(df))
return df
end

function rename!(idx::Index, new_index::Index)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

functions for Index should be added in other/index.jl

splice!(idx.names, 1:length(idx.names), new_index.names)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we first check that idx and new_index are independent?

empty!(idx.lookup)
merge!(idx.lookup, new_index.lookup)
return idx
end

function rename!(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString};
makeunique::Bool=false)
rename!(index(df), Symbol.(vals), makeunique=makeunique)
# renaming columns of SubDataFrame has to clean non-note metadata in its parent
_drop_all_nonnote_metadata!(parent(df))
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing)
rename!(df, Symbol.(vals), makeunique=makeunique, mergeduplicates=mergeduplicates)
return df
end

Expand Down Expand Up @@ -353,9 +365,11 @@ julia> rename(uppercase, df)
```
"""
rename(df::AbstractDataFrame, vals::AbstractVector{Symbol};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

docstring update is missing

makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique)
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing) =
rename!(copy(df), vals, makeunique=makeunique, mergeduplicates=mergeduplicates)
rename(df::AbstractDataFrame, vals::AbstractVector{<:AbstractString};
makeunique::Bool=false) = rename!(copy(df), vals, makeunique=makeunique)
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing) =
rename!(copy(df), vals, makeunique=makeunique, mergeduplicates=mergeduplicates)
rename(df::AbstractDataFrame, args...) = rename!(copy(df), args...)
rename(f::Function, df::AbstractDataFrame) = rename!(f, copy(df))

Expand Down Expand Up @@ -1534,16 +1548,30 @@ function fillcombinations(df::AbstractDataFrame, indexcols;
return out_df
end

"""
    MergeDuplicates = Union{Nothing,Function}

Type accepted by the `mergeduplicates` keyword argument.

Wherever the `mergeduplicates` keyword argument is available its value is either:

- `nothing` (the default), in which case no merging of duplicated columns is performed;
- a `Function` that is used to combine columns sharing the same name. It only takes
  effect when `makeunique=false`.

For example, `insertcols!` broadcasts the function over the existing column followed by
the newly passed columns of the same name, so `mergeduplicates=coalesce` keeps the first
non-missing value in each row.
"""
const MergeDuplicates = Union{Nothing,Function}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am OK to add this definition, but then its docstring should be more precise I think.


"""
hcat(df::AbstractDataFrame...;
makeunique::Bool=false, copycols::Bool=true)
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true)

Horizontally concatenate data frames.

If `makeunique=false` (the default) and `mergeduplicates=nothing`, column names of passed objects must be unique.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this statement does not seem to be true after this PR.

If `makeunique=true` then duplicate column names will be suffixed
with `_i` (`i` starting at 1 for the first duplicate).

If `makeunique=false` and `mergeduplicates` is a `Function` then duplicate columns
will be combined by invoking the function with all values from those columns;
for example, `mergeduplicates=coalesce` will use the first non-missing value. Since `hcat` and
`hcat!` are performed recursively for more than two frames, this `mergeduplicates`
function will only combine two columns at a time.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not clear what happens if `makeunique=true` and `mergeduplicates` is a `Function`.


If `copycols=true` (the default) then the `DataFrame` returned by `hcat` will
contain copied columns from the source data frames.
If `copycols=false` then it will contain columns as they are stored in the
Expand Down Expand Up @@ -1593,26 +1621,26 @@ julia> df3.A === df1.A
true
```
"""
function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, copycols::Bool=true)
function Base.hcat(df::AbstractDataFrame; makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true)
df = DataFrame(df, copycols=copycols)
_drop_all_nonnote_metadata!(df)
return df
end

# TODO: after deprecation remove AbstractVector methods
Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, copycols::Bool=true) =
hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, copycols=copycols)
Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, copycols::Bool=true) =
hcat!(x, df, makeunique=makeunique, copycols=copycols)
Base.hcat(df::AbstractDataFrame, x::AbstractVector; makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) =
hcat!(DataFrame(df, copycols=copycols), x, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols)
Base.hcat(x::AbstractVector, df::AbstractDataFrame; makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) =
hcat!(x, df, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols)
Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame;
makeunique::Bool=false, copycols::Bool=true) =
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) =
hcat!(DataFrame(df1, copycols=copycols), df2,
makeunique=makeunique, copycols=copycols)
makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols)
Base.hcat(df::AbstractDataFrame, x::Union{AbstractVector, AbstractDataFrame},
y::Union{AbstractVector, AbstractDataFrame}...;
makeunique::Bool=false, copycols::Bool=true) =
hcat!(hcat(df, x, makeunique=makeunique, copycols=copycols), y...,
makeunique=makeunique, copycols=copycols)
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) =
hcat!(hcat(df, x, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols), y...,
makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols)

"""
vcat(dfs::AbstractDataFrame...;
Expand Down Expand Up @@ -2868,8 +2896,11 @@ const INSERTCOLS_ARGUMENTS =
are unwrapped and treated in the same way
- `after` : if `true` columns are inserted after `col`
- `makeunique` : defines what to do if `name` already exists in `df`;
if it is `false` an error will be thrown; if it is `true` a new unique name will
be generated by adding a suffix
if it is `true` a new unique name will be generated by adding a suffix,
if it is `false` an error will be thrown unless a `mergeduplicates` function is provided.
- `mergeduplicates` : defines what to do if `name` already exists in `df` and `makeunique`
is false. It should be given a Function that combines the values of all of the duplicated
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
is false. It should be given a Function that combines the values of all of the duplicated
is false. It should be given a `Function` that combines the values of all of the duplicated

columns which will be passed as a varargs. The return value is used.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is not clear if the passed function takes elements of the columns iteratively or whole columns.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also it is not clear how things are processed if multiple duplicate columns are provided.

- `copycols` : whether vectors passed as columns should be copied

If `val` is an `AbstractRange` then the result of `collect(val)` is inserted.
Expand All @@ -2891,7 +2922,7 @@ const INSERTCOLS_ARGUMENTS =

"""
insertcols(df::AbstractDataFrame[, col], (name=>val)::Pair...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true)
after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true)

Insert a column into a copy of `df` data frame using the [`insertcols!`](@ref)
function and return the newly created data frame.
Expand Down Expand Up @@ -2942,13 +2973,13 @@ julia> insertcols(df, :a, :d => 7:9, after=true)
```
"""
insertcols(df::AbstractDataFrame, args...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) =
insertcols!(copy(df), args...;
after=after, makeunique=makeunique, copycols=copycols)
after=after, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols)

"""
insertcols!(df::AbstractDataFrame[, col], (name=>val)::Pair...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true)
after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true)

Insert a column into a data frame in place. Return the updated data frame.

Expand Down Expand Up @@ -2999,7 +3030,10 @@ julia> insertcols!(df, :b, :d => 7:9, after=true)
```
"""
function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Symbol}...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true)
after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true)

_check_makeunique_args(mergeduplicates, makeunique)

if !is_column_insertion_allowed(df)
throw(ArgumentError("insertcols! is only supported for DataFrame, or for " *
"SubDataFrame created with `:` as column selector"))
Expand All @@ -3025,15 +3059,15 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
"$(ncol(df)) columns at index $col_ind"))
end

if !makeunique
if !makeunique && isnothing(mergeduplicates)
if !allunique(first.(name_cols))
throw(ArgumentError("Names of columns to be inserted into a data frame " *
"must be unique when `makeunique=true`"))
"must be unique when `mergeduplicates=nothing`"))
end
for (n, _) in name_cols
if hasproperty(df, n)
throw(ArgumentError("Column $n is already present in the data frame " *
"which is not allowed when `makeunique=true`"))
"which is not allowed when `mergeduplicates=nothing`"))
end
end
end
Expand Down Expand Up @@ -3067,6 +3101,7 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
target_row_count = 1
end

mergecolumns = Dict{Symbol, Any}()
start_col_ind = col_ind
for (name, item) in name_cols
if !(item isa AbstractVector)
Expand Down Expand Up @@ -3103,23 +3138,38 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
dfp[!, name] = item_new
else
if hasproperty(dfp, name)
@assert makeunique
k = 1
while true
nn = Symbol("$(name)_$k")
if !hasproperty(dfp, nn)
name = nn
break
if makeunique
k = 1
while true
nn = Symbol("$(name)_$k")
if !hasproperty(dfp, nn)
name = nn
break
end
k += 1
end
k += 1
insert!(index(dfp), col_ind, name)
insert!(_columns(dfp), col_ind, item_new)
else
# Just update without adding to index
merge = get(mergecolumns, name, (dfp=dfp, cols=[]))
push!(merge.cols, item_new)
mergecolumns[name] = merge
col_ind -= 1
end
else
insert!(index(dfp), col_ind, name)
insert!(_columns(dfp), col_ind, item_new)
end
insert!(index(dfp), col_ind, name)
insert!(_columns(dfp), col_ind, item_new)
end
col_ind += 1
end

# Combine columns using mergeduplicates
for (name, merge) in mergecolumns
merge.dfp[!, name] = mergeduplicates.(merge.dfp[!, name], merge.cols...)
end

delta = col_ind - start_col_ind
colmetadata_dict = getfield(parent(df), :colmetadata)
if !isnothing(colmetadata_dict) && delta > 0
Expand All @@ -3134,22 +3184,22 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{Sy
end

insertcols!(df::AbstractDataFrame, col::ColumnIndex, name_cols::Pair{<:AbstractString}...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) =
insertcols!(df, col, (Symbol(n) => v for (n, v) in name_cols)...,
after=after, makeunique=makeunique, copycols=copycols)
after=after, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols)

insertcols!(df::AbstractDataFrame, name_cols::Pair{Symbol}...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) =
insertcols!(df, ncol(df)+1, name_cols..., after=after,
makeunique=makeunique, copycols=copycols)
makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols)

insertcols!(df::AbstractDataFrame, name_cols::Pair{<:AbstractString}...;
after::Bool=false, makeunique::Bool=false, copycols::Bool=true) =
after::Bool=false, makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true) =
insertcols!(df, (Symbol(n) => v for (n, v) in name_cols)...,
after=after, makeunique=makeunique, copycols=copycols)
after=after, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=copycols)

function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false,
makeunique::Bool=false, copycols::Bool=true)
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true)
if col isa SymbolOrString
col_ind = Int(columnindex(df, col))
if col_ind == 0
Expand All @@ -3173,7 +3223,7 @@ function insertcols!(df::AbstractDataFrame, col::ColumnIndex; after::Bool=false,
end

function insertcols!(df::AbstractDataFrame; after::Bool=false,
makeunique::Bool=false, copycols::Bool=true)
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing, copycols::Bool=true)
_drop_all_nonnote_metadata!(parent(df))
return df
end
Expand Down
19 changes: 11 additions & 8 deletions src/abstractdataframe/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -823,7 +823,8 @@ julia> permutedims(df2, 1, "different_name")
"""
function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex,
dest_namescol::Union{Symbol, AbstractString};
makeunique::Bool=false, strict::Bool=true)
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing,
strict::Bool=true)

if src_namescol isa Integer
1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol))
Expand Down Expand Up @@ -854,26 +855,28 @@ function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex,

if ncol(df_notsrc) == 0
df_tmp = DataFrame(AbstractVector[[] for _ in 1:nrow(df)], new_col_names,
makeunique=makeunique, copycols=false)
makeunique=makeunique, mergeduplicates=mergeduplicates,
copycols=false)
else
m = permutedims(Matrix(df_notsrc))
df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique)
df_tmp = rename!(DataFrame(Tables.table(m)), new_col_names, makeunique=makeunique, mergeduplicates=mergeduplicates)
end
out_df = hcat!(df_permuted, df_tmp, makeunique=makeunique, copycols=false)
out_df = hcat!(df_permuted, df_tmp, makeunique=makeunique, mergeduplicates=mergeduplicates, copycols=false)
_copy_table_note_metadata!(out_df, df)
return out_df
end

function Base.permutedims(df::AbstractDataFrame, src_namescol::ColumnIndex;
makeunique::Bool=false, strict::Bool=true)
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing,
strict::Bool=true)
if src_namescol isa Integer
1 <= src_namescol <= ncol(df) || throw(BoundsError(index(df), src_namescol))
dest_namescol = _names(df)[src_namescol]
else
dest_namescol = src_namescol
end
return permutedims(df, src_namescol, dest_namescol;
makeunique=makeunique, strict=strict)
makeunique=makeunique, mergeduplicates=mergeduplicates, strict=strict)
end

function Base.permutedims(df::AbstractDataFrame)
Expand All @@ -883,8 +886,8 @@ function Base.permutedims(df::AbstractDataFrame)
end

function Base.permutedims(df::AbstractDataFrame, cnames::AbstractVector;
makeunique::Bool=false)
out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique)
makeunique::Bool=false, mergeduplicates::MergeDuplicates=nothing)
out_df = DataFrame(permutedims(Matrix(df)), cnames, makeunique=makeunique, mergeduplicates=mergeduplicates)
_copy_table_note_metadata!(out_df, df)
return out_df
end
2 changes: 1 addition & 1 deletion src/abstractdataframe/selection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1822,7 +1822,7 @@ end

function manipulate(df::DataFrame, args::AbstractVector{Int};
copycols::Bool, keeprows::Bool, renamecols::Bool)
new_df = DataFrame(_columns(df)[args], Index(_names(df)[args]), copycols=copycols)
new_df = DataFrame(_columns(df)[args], UpdateIndex(_names(df)[args]), copycols=copycols)
_copy_all_note_metadata!(new_df, df)
return new_df
end
Expand Down
Loading