Skip to content

Commit

Permalink
Add scalar keyword argument to flatten (#3283)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Feb 5, 2023
1 parent 436b686 commit ec7b123
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 15 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
* Add `haskey` and `get` methods to `DataFrameColumns`
to make it support dictionary interface more completely
([#3282](https://github.com/JuliaData/DataFrames.jl/pull/3282))
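* Add the `scalar` keyword argument to `flatten`; values of the given type are
treated as scalars and repeated to match the lengths of other flattened columns
([#3283](https://github.com/JuliaData/DataFrames.jl/pull/3283))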

## Bug fixes

Expand Down
79 changes: 64 additions & 15 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2259,7 +2259,7 @@ function Missings.allowmissing(df::AbstractDataFrame,
end

"""
flatten(df::AbstractDataFrame, cols)
flatten(df::AbstractDataFrame, cols; scalar::Type=Union{})
When columns `cols` of data frame `df` have iterable elements that define
`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
Expand All @@ -2273,6 +2273,11 @@ returned `DataFrame` will affect `df`.
`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
If `scalar` is passed, values of this type found in the flattened columns
are treated as scalars and repeated as many times as needed to match the
lengths of the values stored in the other flattened columns of the same row.
If all flattened values in a row are scalars, a single row is produced.
$METADATA_FIXED
# Examples
Expand Down Expand Up @@ -2334,10 +2339,33 @@ julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])
julia> flatten(df3, [:b, :c])
ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
julia> df4 = DataFrame(a=[1, 2, 3],
b=[[1, 2], missing, missing],
c=[[5, 6], missing, [7, 8]])
3×3 DataFrame
Row │ a b c
│ Int64 Array…? Array…?
─────┼─────────────────────────
1 │ 1 [1, 2] [5, 6]
2 │ 2 missing missing
3 │ 3 missing [7, 8]
julia> flatten(df4, [:b, :c], scalar=Missing)
5×3 DataFrame
Row │ a b c
│ Int64 Int64? Int64?
─────┼─────────────────────────
1 │ 1 1 5
2 │ 1 2 6
3 │ 2 missing missing
4 │ 3 missing 7
5 │ 3 missing 8
```
"""
function flatten(df::AbstractDataFrame,
cols::Union{ColumnIndex, MultiColumnIndex})
cols::Union{ColumnIndex, MultiColumnIndex};
scalar::Type=Union{})
_check_consistency(df)

idxcols = index(df)[cols]
Expand All @@ -2348,15 +2376,16 @@ function flatten(df::AbstractDataFrame,
end

col1 = first(idxcols)
lengths = length.(df[!, col1])
for col in idxcols
v = df[!, col]
if any(x -> length(x[1]) != x[2], zip(v, lengths))
r = findfirst(x -> x != 0, length.(v) .- lengths)
colnames = _names(df)
throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
"and :$(colnames[col]) are not the same in row $r"))
end
lengths = Int[x isa scalar ? -1 : length(x) for x in df[!, col1]]
for (i, coli) in enumerate(idxcols)
i == 1 && continue
update_lengths!(lengths, df[!, coli], scalar, df, col1, coli)
end

# handle case where in all columns we had a scalar
# in this case we keep it one time
for i in 1:length(lengths)
lengths[i] == -1 && (lengths[i] = 1)
end

new_df = similar(df[!, Not(cols)], sum(lengths))
Expand All @@ -2366,18 +2395,38 @@ function flatten(df::AbstractDataFrame,
length(idxcols) > 1 && sort!(idxcols)
for col in idxcols
col_to_flatten = df[!, col]
fast_path = eltype(col_to_flatten) isa AbstractVector &&
fast_path = eltype(col_to_flatten) <: AbstractVector &&
!isempty(col_to_flatten)
flattened_col = fast_path ?
reduce(vcat, col_to_flatten) :
collect(Iterators.flatten(col_to_flatten))
flattened_col = if fast_path
reduce(vcat, col_to_flatten)
elseif scalar === Union{}
collect(Iterators.flatten(col_to_flatten))
else
collect(Iterators.flatten(v isa scalar ? Iterators.repeated(v, l) : v
for (l, v) in zip(lengths, col_to_flatten)))
end
insertcols!(new_df, col, _names(df)[col] => flattened_col)
end

_copy_all_note_metadata!(new_df, df)
return new_df
end

# Reconcile the per-row `lengths` with the values stored in `col`.
#
# Values that are instances of `scalar` are ignored. For every other value its
# `length` is compared with `lengths[row]`:
# * if the stored entry is `-1` (no non-scalar length recorded for this row yet)
#   it is replaced by the length of the value;
# * if the stored entry differs from the length of the value an `ArgumentError`
#   is thrown; `col1` and `coli` are the indices used to look up the two column
#   names reported in the message.
#
# `lengths` is updated in place and nothing is returned.
function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type,
                         df::AbstractDataFrame, col1::Integer, coli::Integer)
    for (row, value) in enumerate(col)
        if !(value isa scalar)
            actual = length(value)
            recorded = lengths[row]
            if recorded == -1
                # first non-scalar value seen in this row fixes its length
                lengths[row] = actual
            elseif recorded != actual
                allnames = _names(df)
                msg = "Lengths of iterables stored in columns :$(allnames[col1]) " *
                      "and :$(allnames[coli]) are not the same in row $row"
                throw(ArgumentError(msg))
            end
        end
    end
    return nothing
end

function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector,
lengths::AbstractVector{Int})
counter = 1
Expand Down
65 changes: 65 additions & 0 deletions test/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,71 @@ end
@test flatten(DataFrame(), All()) == DataFrame()
end

# Tests of the `scalar` keyword argument of `flatten`.
#
# Expected results that contain `missing` are checked with `isequal`, since
# comparisons done with `==` propagate `missing` instead of producing a `Bool`.
@testset "flatten with scalar" begin
    df = DataFrame(a=[1, 2, 3],
                   b=[[1, 2], missing, [3, 4]],
                   c=[[5, 6], missing, missing])
    @test isequal(flatten(df, :a), df)
    # without `scalar` a `missing` entry cannot be flattened
    @test_throws MethodError flatten(df, :b)
    @test isequal(flatten(df, :b, scalar=Missing),
                  DataFrame(a=[1, 1, 2, 3, 3],
                            b=[1, 2, missing, 3, 4],
                            c=[[5, 6], [5, 6], missing, missing, missing]))
    @test isequal(flatten(df, [:b, :c], scalar=Missing),
                  DataFrame(a=[1, 1, 2, 3, 3],
                            b=[1, 2, missing, 3, 4],
                            c=[5, 6, missing, missing, missing]))
    # every value is a scalar, so each row is kept exactly once
    @test isequal(flatten(df, [:b, :c], scalar=Any), df)

    df = DataFrame(a=missing, b=[1], c=missing, d=[[1, 2]])
    # :b has length 1 and :d has length 2 in the same row
    @test_throws ArgumentError flatten(df, All(), scalar=Missing)
    @test isequal(flatten(df, Not(:d), scalar=Missing),
                  DataFrame(a=missing, b=1, c=missing, d=[[1, 2]]))
    @test isequal(flatten(df, Not(:b), scalar=Missing),
                  DataFrame(a=[missing, missing], b=[1, 1],
                            c=[missing, missing], d=[1, 2]))

    df = DataFrame(a="xy", b=[[1, 2]])
    @test flatten(df, [:a, :b]) == DataFrame(a=['x', 'y'], b=[1, 2])
    @test flatten(df, [:a, :b], scalar=String) ==
          DataFrame(a=["xy", "xy"], b=[1, 2])

    df = DataFrame(a=[[1], [], [3, 4], missing], b = missings(4), id=1:4)
    @test isequal(flatten(df, [:a, :b], scalar=Missing),
                  DataFrame(a=[1, 3, 4, missing], b=missings(4), id=[1, 3, 3, 4]))
    df = DataFrame(id=1:10, x=[1:i-1 for i in 1:10])
    df.y = [iseven(last(v)) ? missing : v for v in df.x]
    @test isequal(flatten(df, [:x, :y], scalar=Missing),
                  DataFrame(id=reduce(vcat, [fill(i, i-1) for i in 2:10]),
                            x=reduce(vcat, [1:i for i in 1:9]),
                            y=reduce(vcat, [iseven(i) ? missings(i) : (1:i) for i in 1:9])))

    # Below are tests showing handling of strings
    df = DataFrame(id=1:5,
                   col1=["a", missing, 1:2, 3:4, 5:6],
                   col2=[11:12, 111:112, 1111:1112, missing, "b"])
    @test isequal(flatten(df, [:col1, :col2], scalar=Union{Missing, AbstractString}),
                  DataFrame(id=[1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
                            col1=["a", "a", missing, missing, 1, 2, 3, 4, 5, 6],
                            col2=[11, 12, 111, 112, 1111, 1112, missing, missing, "b", "b"]))
    @test_throws MethodError flatten(df, [:col1, :col2])
    @test_throws ArgumentError flatten(df, [:col1, :col2], scalar=Missing)
    @test_throws MethodError flatten(df, [:col1, :col2], scalar=AbstractString)

    df = DataFrame(id=1:5,
                   col1=["ab", missing, 1:2, 3:4, 5:6],
                   col2=[11:12, 111:112, 1111:1112, missing, "cd"])
    @test isequal(flatten(df, [:col1, :col2], scalar=Union{Missing, AbstractString}),
                  DataFrame(id=[1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
                            col1=["ab", "ab", missing, missing, 1, 2, 3, 4, 5, 6],
                            col2=[11, 12, 111, 112, 1111, 1112, missing, missing, "cd", "cd"]))
    @test_throws MethodError flatten(df, [:col1, :col2])
    # two-character strings are iterated as characters when not declared scalar
    @test isequal(flatten(df, [:col1, :col2], scalar=Missing),
                  DataFrame(id=[1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
                            col1=['a', 'b', missing, missing, 1, 2, 3, 4, 5, 6],
                            col2=[11, 12, 111, 112, 1111, 1112, missing, missing, 'c', 'd']))
    @test_throws MethodError flatten(df, [:col1, :col2], scalar=AbstractString)
end

@testset "stack categorical test" begin
Random.seed!(1234)
d1 = DataFrame(a=repeat([1:3;], inner=[4]),
Expand Down

0 comments on commit ec7b123

Please sign in to comment.