From c459de92707df92e23eded7a0dcf559cd1e99f99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 28 Dec 2022 20:33:11 +0100 Subject: [PATCH 01/12] add nest, unnest, improve flatten --- src/DataFrames.jl | 3 + src/abstractdataframe/abstractdataframe.jl | 130 ------- src/abstractdataframe/nest.jl | 390 +++++++++++++++++++++ test/reshape.jl | 39 +++ 4 files changed, 432 insertions(+), 130 deletions(-) create mode 100644 src/abstractdataframe/nest.jl diff --git a/src/DataFrames.jl b/src/DataFrames.jl index c5d8366214..4e117e72f3 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -76,6 +76,7 @@ export AbstractDataFrame, mapcols, mapcols!, ncol, + nest, nonunique, nrow, order, @@ -95,6 +96,7 @@ export AbstractDataFrame, transform, transform!, unique!, + unnest, unstack, valuecols, metadata, @@ -166,6 +168,7 @@ include("abstractdataframe/show.jl") include("groupeddataframe/show.jl") include("dataframerow/show.jl") include("abstractdataframe/io.jl") +include("abstractdataframe/nest.jl") include("other/tables.jl") include("other/names.jl") diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 9fba690d49..ec85d5d458 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2502,136 +2502,6 @@ function Missings.allowmissing(df::AbstractDataFrame, return new_df end -""" - flatten(df::AbstractDataFrame, cols) - -When columns `cols` of data frame `df` have iterable elements that define -`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each -element of each `col` in `cols` is flattened, meaning the column corresponding -to `col` becomes a longer vector where the original entries are concatenated. -Elements of row `i` of `df` in columns other than `cols` will be repeated -according to the length of `df[i, col]`. These lengths must therefore be the -same for each `col` in `cols`, or else an error is raised. 
Note that these -elements are not copied, and thus if they are mutable changing them in the -returned `DataFrame` will affect `df`. - -`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). - -$METADATA_FIXED - -# Examples - -```jldoctest -julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]]) -2×3 DataFrame - Row │ a b c - │ Int64 Array… Array… -─────┼─────────────────────── - 1 │ 1 [1, 2] [5, 6] - 2 │ 2 [3, 4] [7, 8] - -julia> flatten(df1, :b) -4×3 DataFrame - Row │ a b c - │ Int64 Int64 Array… -─────┼────────────────────── - 1 │ 1 1 [5, 6] - 2 │ 1 2 [5, 6] - 3 │ 2 3 [7, 8] - 4 │ 2 4 [7, 8] - -julia> flatten(df1, [:b, :c]) -4×3 DataFrame - Row │ a b c - │ Int64 Int64 Int64 -─────┼───────────────────── - 1 │ 1 1 5 - 2 │ 1 2 6 - 3 │ 2 3 7 - 4 │ 2 4 8 - -julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")]) -2×2 DataFrame - Row │ a b - │ Int64 Tuple… -─────┼─────────────────── - 1 │ 1 ("p", "q") - 2 │ 2 ("r", "s") - -julia> flatten(df2, :b) -4×2 DataFrame - Row │ a b - │ Int64 String -─────┼─────────────── - 1 │ 1 p - 2 │ 1 q - 3 │ 2 r - 4 │ 2 s - -julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]]) -2×3 DataFrame - Row │ a b c - │ Int64 Array… Array… -─────┼─────────────────────── - 1 │ 1 [1, 2] [5, 6] - 2 │ 2 [3, 4] [7] - -julia> flatten(df3, [:b, :c]) -ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2 -``` -""" -function flatten(df::AbstractDataFrame, - cols::Union{ColumnIndex, MultiColumnIndex}) - _check_consistency(df) - - idxcols = index(df)[cols] - if isempty(idxcols) - cdf = copy(df) - _drop_all_nonnote_metadata!(cdf) - return cdf - end - - col1 = first(idxcols) - lengths = length.(df[!, col1]) - for col in idxcols - v = df[!, col] - if any(x -> length(x[1]) != x[2], zip(v, lengths)) - r = findfirst(x -> x != 0, length.(v) .- lengths) - colnames = _names(df) - throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " * - "and 
:$(colnames[col]) are not the same in row $r")) - end - end - - new_df = similar(df[!, Not(cols)], sum(lengths)) - for name in _names(new_df) - repeat_lengths!(new_df[!, name], df[!, name], lengths) - end - length(idxcols) > 1 && sort!(idxcols) - for col in idxcols - col_to_flatten = df[!, col] - fast_path = eltype(col_to_flatten) isa AbstractVector && - !isempty(col_to_flatten) - flattened_col = fast_path ? - reduce(vcat, col_to_flatten) : - collect(Iterators.flatten(col_to_flatten)) - insertcols!(new_df, col, _names(df)[col] => flattened_col) - end - - _copy_all_note_metadata!(new_df, df) - return new_df -end - -function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector, - lengths::AbstractVector{Int}) - counter = 1 - @inbounds for i in eachindex(shortold) - l = lengths[i] - longnew[counter:(counter + l - 1)] .= Ref(shortold[i]) - counter += l - end -end - # Disallowed getindex and setindex! operations that are a common mistake Base.getindex(::AbstractDataFrame, ::Union{Symbol, Integer, AbstractString}) = diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl new file mode 100644 index 0000000000..966e79ece4 --- /dev/null +++ b/src/abstractdataframe/nest.jl @@ -0,0 +1,390 @@ +""" + nest(gdf::GroupedDataFrame, cols::Pair{<:AbstractString}...) + nest(gdf::GroupedDataFrame, cols::Pair{Symbol}...) + nest(gdf::GroupedDataFrame) + +Nest multiple columns per group of `gdf` into a single column as data frames. + +Every `cols` argument must be a pair `column_selector => column_name`. +If no `cols` are passed, then by default `valuecols(gdf) => :data` +nesting is performed. + +Return a data frame having all grouping columns of `gdf` followed by one +or more columns where `column_name` is a name of the column storing data frames, +where every data frame consists of columns picked by `column_selector` values +computed for each group of `gdf`. 
+ +TODO: metadata + +# Examples + +julia> df = DataFrame(id = ["b", "a", "a", "c", "b", "b"], + x = 1:6, y = 11:16, z='a':'f') +6×4 DataFrame + Row │ id x y z + │ String Int64 Int64 Char +─────┼──────────────────────────── + 1 │ b 1 11 a + 2 │ a 2 12 b + 3 │ a 3 13 c + 4 │ c 4 14 d + 5 │ b 5 15 e + 6 │ b 6 16 f + +julia> n1 = nest(groupby(df, :id)) +3×2 DataFrame + Row │ id data + │ String DataFrame +─────┼─────────────────────── + 1 │ b 3×3 DataFrame + 2 │ a 2×3 DataFrame + 3 │ c 1×3 DataFrame + +julia> n1.data +3-element Vector{DataFrame}: + 3×3 DataFrame + Row │ x y z + │ Int64 Int64 Char +─────┼──────────────────── + 1 │ 1 11 a + 2 │ 5 15 e + 3 │ 6 16 f + 2×3 DataFrame + Row │ x y z + │ Int64 Int64 Char +─────┼──────────────────── + 1 │ 2 12 b + 2 │ 3 13 c + 1×3 DataFrame + Row │ x y z + │ Int64 Int64 Char +─────┼──────────────────── + 1 │ 4 14 d + +julia> n2 = nest(groupby(df, :id), [:z, :x] => :zx) +3×2 DataFrame + Row │ id zx + │ String DataFrame +─────┼─────────────────────── + 1 │ b 3×2 DataFrame + 2 │ a 2×2 DataFrame + 3 │ c 1×2 DataFrame + +julia> n2.zx +3-element Vector{DataFrame}: + 3×2 DataFrame + Row │ z x + │ Char Int64 +─────┼───────────── + 1 │ a 1 + 2 │ e 5 + 3 │ f 6 + 2×2 DataFrame + Row │ z x + │ Char Int64 +─────┼───────────── + 1 │ b 2 + 2 │ c 3 + 1×2 DataFrame + Row │ z x + │ Char Int64 +─────┼───────────── + 1 │ d 4 + +julia> n3 = nest(groupby(df, :id), :x => "x", [:y, :z] => "yz") +3×3 DataFrame + Row │ id x yz + │ String DataFrame DataFrame +─────┼────────────────────────────────────── + 1 │ b 3×1 DataFrame 3×2 DataFrame + 2 │ a 2×1 DataFrame 2×2 DataFrame + 3 │ c 1×1 DataFrame 1×2 DataFrame + +julia> n3.x +3-element Vector{DataFrame}: + 3×1 DataFrame + Row │ x + │ Int64 +─────┼─────── + 1 │ 1 + 2 │ 5 + 3 │ 6 + 2×1 DataFrame + Row │ x + │ Int64 +─────┼─────── + 1 │ 2 + 2 │ 3 + 1×1 DataFrame + Row │ x + │ Int64 +─────┼─────── + 1 │ 4 + +julia> n3.yz +3-element Vector{DataFrame}: + 3×2 DataFrame + Row │ y z + │ Int64 Char 
+─────┼───────────── + 1 │ 11 a + 2 │ 15 e + 3 │ 16 f + 2×2 DataFrame + Row │ y z + │ Int64 Char +─────┼───────────── + 1 │ 12 b + 2 │ 13 c + 1×2 DataFrame + Row │ y z + │ Int64 Char +─────┼───────────── + 1 │ 14 d +""" +nest(gdf::GroupedDataFrame, cols::Pair{<:Any, <:AbstractString}...) = + combine(gdf, (sdf -> (; Symbol(dst) => select(sdf, index(sdf)[src])) + for (src, dst) in cols)...) +nest(gdf::GroupedDataFrame, cols::Pair{<:Any, Symbol}...) = + combine(gdf, (sdf -> (; dst => select(sdf, index(sdf)[src])) + for (src, dst) in cols)...) +nest(gdf::GroupedDataFrame) = nest(gdf, valuecols(gdf) => :data) + +""" + unnest(df::AbstractDataFrame, src::ColumnIndex...; + cols::Union{Symbol, AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal, + promote::Bool=true, + makeunique::Bool=false, flatten::Bool=true) + +Unnest one or more columns `src` into multiple columns. The newly created +columns are stored at the end of the data frame (and the `src` column is +dropped). + +Each `src` column must contain a `NamedTuple`, a `DataFrameRow`, a +`Tables.AbstractRow`, or Tables.jl table. + +`cols` (default `:setequal`) and `promote` (default `true`) keyword arguments +have the same meaning as in [`push!`](@ref). + +If `makeunique=false` (the default) produced column names must be unique. +If `makeunique=true` then duplicate column names will be suffixed with `_i` +(`i` starting at `1` for the first duplicate). + +If `flatten=true` (the default) then newly created columns are flattened +using [`flatten`](@ref) with `scalar=Missing` keyword argument. 
+ +TODO: metadata + +""" +function unnest(df::AbstractDataFrame, src::ColumnIndex...; + cols::Union{Symbol, AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal, + promote::Bool=(cols in [:union, :subset]), + makeunique::Bool=false, flatten::Bool=true) + ref_df = select(df, Not(collect(Any, src))) + col_count = ncol(ref_df) + for idx in src + col = df[!, idx] + tmp_df = DataFrame() + for v in col + if v isa DataFrame # produce DataFrameRow + # if flatten=false make a copy to avoid aliases + v = DataFrame([n => [flatten ? c : copy(c)] + for (n, c) in pairs(eachcol(v))], + copycols=false) |> only + elseif Tables.istable(v) # produce NamedTuple + v = Tables.columntable(v) + end + push!(tmp_df, v, cols=cols, promote=promote) + end + hcat!(ref_df, tmp_df, makeunique=makeunique, copycols=false) + end + return if flatten + DataFrames.flatten(ref_df, col_count+1:ncol(ref_df), scalar=Missing) + else + ref_df + end +end + +""" + flatten(df::AbstractDataFrame, cols; scalar::Type) + +When columns `cols` of data frame `df` have iterable elements that define +`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each +element of each `col` in `cols` is flattened, meaning the column corresponding +to `col` becomes a longer vector where the original entries are concatenated. +Elements of row `i` of `df` in columns other than `cols` will be repeated +according to the length of `df[i, col]`. These lengths must therefore be the +same for each `col` in `cols`, or else an error is raised. Note that these +elements are not copied, and thus if they are mutable changing them in the +returned `DataFrame` will affect `df`. + +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). + +If `scalar` is passed then values that have this type in flattened columns +are treated as scalars and broadcasted as many times as is needed to match +lengths of values stored in other columns. 
One row is produced if all +corresponding values are scalars. + +$METADATA_FIXED + +# Examples + +```jldoctest +julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]]) +2×3 DataFrame + Row │ a b c + │ Int64 Array… Array… +─────┼─────────────────────── + 1 │ 1 [1, 2] [5, 6] + 2 │ 2 [3, 4] [7, 8] + +julia> flatten(df1, :b) +4×3 DataFrame + Row │ a b c + │ Int64 Int64 Array… +─────┼────────────────────── + 1 │ 1 1 [5, 6] + 2 │ 1 2 [5, 6] + 3 │ 2 3 [7, 8] + 4 │ 2 4 [7, 8] + +julia> flatten(df1, [:b, :c]) +4×3 DataFrame + Row │ a b c + │ Int64 Int64 Int64 +─────┼───────────────────── + 1 │ 1 1 5 + 2 │ 1 2 6 + 3 │ 2 3 7 + 4 │ 2 4 8 + +julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")]) +2×2 DataFrame + Row │ a b + │ Int64 Tuple… +─────┼─────────────────── + 1 │ 1 ("p", "q") + 2 │ 2 ("r", "s") + +julia> flatten(df2, :b) +4×2 DataFrame + Row │ a b + │ Int64 String +─────┼─────────────── + 1 │ 1 p + 2 │ 1 q + 3 │ 2 r + 4 │ 2 s + +julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]]) +2×3 DataFrame + Row │ a b c + │ Int64 Array… Array… +─────┼─────────────────────── + 1 │ 1 [1, 2] [5, 6] + 2 │ 2 [3, 4] [7] + +julia> flatten(df3, [:b, :c]) +ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2 + +julia> df4 = DataFrame(a=[1, 2, 3], + b=[[1, 2], missing, missing], + c=[[5, 6], missing, [7, 8]]) +3×3 DataFrame + Row │ a b c + │ Int64 Array…? Array…? +─────┼───────────────────────── + 1 │ 1 [1, 2] [5, 6] + 2 │ 2 missing missing + 3 │ 3 missing [7, 8] + +julia> flatten(df4, [:b, :c], scalar=Missing) +5×3 DataFrame + Row │ a b c + │ Int64 Int64? Int64? 
+─────┼───────────────────────── + 1 │ 1 1 5 + 2 │ 1 2 6 + 3 │ 2 missing missing + 4 │ 3 missing 7 + 5 │ 3 missing 8 +``` +""" +function flatten(df::AbstractDataFrame, + cols::Union{ColumnIndex, MultiColumnIndex}; + scalar::Type=Union{}) + _check_consistency(df) + + idxcols = index(df)[cols] + if isempty(idxcols) + cdf = copy(df) + _drop_all_nonnote_metadata!(cdf) + return cdf + end + + col1 = first(idxcols) + lengths = Int[length_maybe_scalar(x, scalar) for x in df[!, col1]] + for (i, coli) in enumerate(idxcols) + i == 1 && continue + update_lengths!(lengths, df[!, coli], scalar, df, col1, coli) + end + + # handle case where in all columns we had a scalar + # in this case we keep it one time + for i in 1:length(lengths) + lengths[i] == -1 && (lengths[i] = 1) + end + + new_df = similar(df[!, Not(cols)], sum(lengths)) + for name in _names(new_df) + repeat_lengths!(new_df[!, name], df[!, name], lengths) + end + length(idxcols) > 1 && sort!(idxcols) + for col in idxcols + col_to_flatten = df[!, col] + fast_path = eltype(col_to_flatten) isa AbstractVector && + !isempty(col_to_flatten) + flattened_col = if fast_path + reduce(vcat, col_to_flatten) + elseif scalar === Union{} + collect(Iterators.flatten(col_to_flatten)) + else + collect(Iterators.flatten(v isa scalar ? Iterators.repeated(v, l) : v + for (l, v) in zip(lengths, col_to_flatten))) + end + insertcols!(new_df, col, _names(df)[col] => flattened_col) + end + + _copy_all_note_metadata!(new_df, df) + return new_df +end + +length_maybe_scalar(v, scalar::Type) = v isa scalar ? 
-1 : length(v) + +function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type, + df::AbstractDataFrame, col1, coli) + for (i, v) in enumerate(col) + lv = length_maybe_scalar(v, scalar) + lv == -1 && continue + if lengths[i] == -1 + lengths[i] = lv + elseif lengths[i] != lv + colnames = _names(df) + throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " * + "and :$(colnames[coli]) are not the same in row $i")) + end + end +end + +function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector, + lengths::AbstractVector{Int}) + counter = 1 + @inbounds for i in eachindex(shortold) + l = lengths[i] + longnew[counter:(counter + l - 1)] .= Ref(shortold[i]) + counter += l + end +end + diff --git a/test/reshape.jl b/test/reshape.jl index 58cf7bfce0..d00957297e 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -431,6 +431,45 @@ end @test flatten(DataFrame(), All()) == DataFrame() end +@testset "flatten with scalar" begin + df = DataFrame(a=[1, 2, 3], + b=[[1, 2], missing, [3, 4]], + c=[[5, 6], missing, missing]) + @test flatten(df, :a) ≅ df + @test_throws MethodError flatten(df, :b) + @test flatten(df, :b, scalar=Missing) ≅ + DataFrame(a=[1, 1, 2, 3, 3], + b=[1, 2, missing, 3, 4], + c=[[5, 6], [5, 6], missing, missing, missing]) + @test flatten(df, [:b, :c], scalar=Missing) ≅ + DataFrame(a=[1, 1, 2, 3, 3], + b=[1, 2, missing, 3, 4], + c=[5, 6, missing, missing, missing]) + @test flatten(df, [:b, :c], scalar=Any) ≅ df + + df = DataFrame(a=missing, b=[1], c=missing, d=[[1, 2]]) + @test_throws ArgumentError flatten(df, All(), scalar=Missing) + @test flatten(df, Not(:d), scalar=Missing) ≅ + DataFrame(a=missing, b=1, c=missing, d=[[1, 2]]) + @test flatten(df, Not(:b), scalar=Missing) ≅ + DataFrame(a=[missing, missing], b=[1, 1], c=[missing, missing], d=[1, 2]) + + df = DataFrame(a="xy", b=[[1, 2]]) + @test flatten(df, [:a, :b]) == DataFrame(a=['x', 'y'], b=[1, 2]) + @test flatten(df, [:a, :b], scalar=String) == + 
DataFrame(a=["xy", "xy"], b=[1, 2]) + + df = DataFrame(a=[[1], [], [3, 4], missing], b = missings(4), id=1:4) + @test flatten(df, [:a, :b], scalar=Missing) ≅ + DataFrame(a=[1, 3, 4, missing], b=missings(4), id=[1, 3, 3, 4]) + df = DataFrame(id=1:10, x=[1:i-1 for i in 1:10]) + df.y = [iseven(last(v)) ? missing : v for v in df.x] + @test flatten(df, [:x, :y], scalar=Missing) ≅ + DataFrame(id=reduce(vcat, [fill(i, i-1) for i in 2:10]), + x=reduce(vcat, [1:i for i in 1:9]), + y=reduce(vcat, [iseven(i) ? missings(i) : (1:i) for i in 1:9])) +end + @testset "stack categorical test" begin Random.seed!(1234) d1 = DataFrame(a=repeat([1:3;], inner=[4]), From d764467937b368dcf366b800a46395fe8b3c301d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 28 Dec 2022 23:40:24 +0100 Subject: [PATCH 02/12] add to docs --- docs/src/lib/functions.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 9b9de28471..67dbb7262a 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -85,6 +85,7 @@ insertcols! invpermute! mapcols mapcols! +nest permute! prepend! push! @@ -102,6 +103,7 @@ table_transformation transform transform! vcat +unnest ``` ## Reshaping data frames between tall and wide formats From c85a275e2baf65d863436204ad1b45e638fa12db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 5 Jan 2023 09:57:39 +0100 Subject: [PATCH 03/12] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- src/abstractdataframe/nest.jl | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl index 966e79ece4..88f491635b 100644 --- a/src/abstractdataframe/nest.jl +++ b/src/abstractdataframe/nest.jl @@ -3,15 +3,16 @@ nest(gdf::GroupedDataFrame, cols::Pair{Symbol}...) 
nest(gdf::GroupedDataFrame) -Nest multiple columns per group of `gdf` into a single column as data frames. +Return a data frame with one row for each group in `gdf` where +or or more columns contain a data frame of the rows that belong to that group. Every `cols` argument must be a pair `column_selector => column_name`. If no `cols` are passed, then by default `valuecols(gdf) => :data` nesting is performed. -Return a data frame having all grouping columns of `gdf` followed by one -or more columns where `column_name` is a name of the column storing data frames, -where every data frame consists of columns picked by `column_selector` values +The returned data frame has all grouping columns of `gdf`, followed by one +or more columns where `column_name` is the name of the column storing data frames, +and every data frame consists of columns picked by `column_selector` values computed for each group of `gdf`. TODO: metadata @@ -91,7 +92,7 @@ julia> n2.zx ─────┼───────────── 1 │ d 4 -julia> n3 = nest(groupby(df, :id), :x => "x", [:y, :z] => "yz") +julia> n3 = nest(groupby(df, :id), :x => :x, [:y, :z] => :yz) 3×3 DataFrame Row │ id x yz │ String DataFrame DataFrame @@ -157,8 +158,10 @@ nest(gdf::GroupedDataFrame) = nest(gdf, valuecols(gdf) => :data) promote::Bool=true, makeunique::Bool=false, flatten::Bool=true) -Unnest one or more columns `src` into multiple columns. The newly created -columns are stored at the end of the data frame (and the `src` column is +Extract the contents of one or more columns `cols` in `df` that contain data frames, +returning a data frame with as many rows and columns as the nested data frames contain, +in addition to original columns. +The newly created columns are stored at the end of the data frame (and the `src` columns are dropped). 
Each `src` column must contain a `NamedTuple`, a `DataFrameRow`, a From f2db6b184654f49cec37a0207a2d0fd43e7202b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 5 Jan 2023 09:57:59 +0100 Subject: [PATCH 04/12] add extract --- NEWS.md | 2 ++ docs/src/lib/functions.md | 2 ++ src/DataFrames.jl | 2 ++ 3 files changed, 6 insertions(+) diff --git a/NEWS.md b/NEWS.md index da12048624..d92ec60745 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,6 +15,8 @@ * Joining functions now support `order` keyword argument allowing the user to specify the order of the rows in the produced table ([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233)) +* Add `nest`, `unnest`, `extract`, and `extract!` functions; improve `flatten` + ([#3258](https://github.com/JuliaData/DataFrames.jl/pull/3258)) ## Bug fixes diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 67dbb7262a..ef010b45a0 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -76,6 +76,8 @@ rename! ```@docs append! combine +extract +extract! fillcombinations flatten hcat diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 4e117e72f3..cc6f1c5f10 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -63,6 +63,8 @@ export AbstractDataFrame, disallowmissing!, dropmissing!, dropmissing, + extract, + extract!, fillcombinations, flatten, groupby, From 792d35513f43ac8eaec8060f396fdb5562638928 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 5 Jan 2023 10:21:47 +0100 Subject: [PATCH 05/12] initial implementation --- src/abstractdataframe/nest.jl | 142 +++++++++++++++++++++++++--------- 1 file changed, 107 insertions(+), 35 deletions(-) diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl index 88f491635b..f5a60d000d 100644 --- a/src/abstractdataframe/nest.jl +++ b/src/abstractdataframe/nest.jl @@ -1,7 +1,7 @@ """ - nest(gdf::GroupedDataFrame, cols::Pair{<:AbstractString}...) 
- nest(gdf::GroupedDataFrame, cols::Pair{Symbol}...) - nest(gdf::GroupedDataFrame) + nest(gdf::GroupedDataFrame, cols::Pair{<:AbstractString}...; view::Bool=false) + nest(gdf::GroupedDataFrame, cols::Pair{Symbol}...; view::Bool=false) + nest(gdf::GroupedDataFrame; view::Bool=false) Return a data frame with one row for each group in `gdf` where or or more columns contain a data frame of the rows that belong to that group. @@ -15,6 +15,10 @@ or more columns where `column_name` is the name of the column storing data frame and every data frame consists of columns picked by `column_selector` values computed for each group of `gdf`. +If `view=false` (the default) the nested data frames will hold copies of +data from the source data frame. If `view=true` views of the source data frame +will be created. + TODO: metadata # Examples @@ -143,29 +147,28 @@ julia> n3.yz ─────┼───────────── 1 │ 14 d """ -nest(gdf::GroupedDataFrame, cols::Pair{<:Any, <:AbstractString}...) = - combine(gdf, (sdf -> (; Symbol(dst) => select(sdf, index(sdf)[src])) +nest(gdf::GroupedDataFrame, cols::Pair{<:Any, <:AbstractString}...; + view::Bool=false) = + combine(gdf, (sdf -> (; Symbol(dst) => select(sdf, index(sdf)[src], copycols=!view)) for (src, dst) in cols)...) -nest(gdf::GroupedDataFrame, cols::Pair{<:Any, Symbol}...) = - combine(gdf, (sdf -> (; dst => select(sdf, index(sdf)[src])) +nest(gdf::GroupedDataFrame, cols::Pair{<:Any, Symbol}...; view::Bool=false) = + combine(gdf, (sdf -> (; dst => select(sdf, index(sdf)[src], copycols=!view)) for (src, dst) in cols)...) 
-nest(gdf::GroupedDataFrame) = nest(gdf, valuecols(gdf) => :data) +nest(gdf::GroupedDataFrame; view::Bool=false) = + nest(gdf, valuecols(gdf) => :data, view=view) """ unnest(df::AbstractDataFrame, src::ColumnIndex...; cols::Union{Symbol, AbstractVector{Symbol}, AbstractVector{<:AbstractString}}=:setequal, - promote::Bool=true, - makeunique::Bool=false, flatten::Bool=true) - -Extract the contents of one or more columns `cols` in `df` that contain data frames, -returning a data frame with as many rows and columns as the nested data frames contain, -in addition to original columns. -The newly created columns are stored at the end of the data frame (and the `src` columns are -dropped). + promote::Bool=true, makeunique::Bool=false) -Each `src` column must contain a `NamedTuple`, a `DataFrameRow`, a -`Tables.AbstractRow`, or Tables.jl table. +Extract the contents of one or more columns `cols` in `df` that contain +Tables.jl tables, returning a data frame with as many rows and columns as the +nested data frames contain, in addition to original columns, whose contents +gets appropriately repeated to match the number of rows of the unnested tables. +The newly created columns are stored at the end of the data frame (and the +`src` columns are dropped). `cols` (default `:setequal`) and `promote` (default `true`) keyword arguments have the same meaning as in [`push!`](@ref). @@ -174,9 +177,6 @@ If `makeunique=false` (the default) produced column names must be unique. If `makeunique=true` then duplicate column names will be suffixed with `_i` (`i` starting at `1` for the first duplicate). -If `flatten=true` (the default) then newly created columns are flattened -using [`flatten`](@ref) with `scalar=Missing` keyword argument. 
- TODO: metadata """ @@ -184,7 +184,7 @@ function unnest(df::AbstractDataFrame, src::ColumnIndex...; cols::Union{Symbol, AbstractVector{Symbol}, AbstractVector{<:AbstractString}}=:setequal, promote::Bool=(cols in [:union, :subset]), - makeunique::Bool=false, flatten::Bool=true) + makeunique::Bool=false) ref_df = select(df, Not(collect(Any, src))) col_count = ncol(ref_df) for idx in src @@ -192,22 +192,96 @@ function unnest(df::AbstractDataFrame, src::ColumnIndex...; tmp_df = DataFrame() for v in col if v isa DataFrame # produce DataFrameRow - # if flatten=false make a copy to avoid aliases - v = DataFrame([n => [flatten ? c : copy(c)] - for (n, c) in pairs(eachcol(v))], + v = DataFrame([n => [c] for (n, c) in pairs(eachcol(v))], copycols=false) |> only - elseif Tables.istable(v) # produce NamedTuple + else # produce NamedTuple v = Tables.columntable(v) end push!(tmp_df, v, cols=cols, promote=promote) end hcat!(ref_df, tmp_df, makeunique=makeunique, copycols=false) end - return if flatten - DataFrames.flatten(ref_df, col_count+1:ncol(ref_df), scalar=Missing) - else - ref_df + return flatten(ref_df, col_count+1:ncol(ref_df), scalar=Missing) +end + +""" + expand(df::AbstractDataFrame, src::ColumnIndex...; + cols::Union{Symbol, AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal, + promote::Bool=true, makeunique::Bool=false) + +Extract the contents of one or more columns `cols` in `df` that contain +`NamedTuple`, a `DataFrameRow`, a `Tables.AbstractRow` elements +returning a data frame with expanded columns, in addition to original columns. +The newly created columns are stored at the end of the data frame (and the +`src` columns are dropped). + +`cols` (default `:setequal`) and `promote` (default `true`) keyword arguments +have the same meaning as in [`push!`](@ref). + +If `makeunique=false` (the default) produced column names must be unique. 
+If `makeunique=true` then duplicate column names will be suffixed with `_i` +(`i` starting at `1` for the first duplicate). + +TODO: metadata + +""" +function expand(df::AbstractDataFrame, src::ColumnIndex...; + cols::Union{Symbol, AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal, + promote::Bool=(cols in [:union, :subset]), + makeunique::Bool=false) + ref_df = select(df, Not(collect(Any, src))) + for idx in src + col = df[!, idx] + tmp_df = DataFrame() + for v in col + push!(tmp_df, v, cols=cols, promote=promote) + end + hcat!(ref_df, tmp_df, makeunique=makeunique, copycols=false) + end + return ref_df +end + +""" + expand!(df::AbstractDataFrame, src::ColumnIndex...; + cols::Union{Symbol, AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal, + promote::Bool=true, makeunique::Bool=false) + +Extract in-place the contents of one or more columns `cols` in `df` that contain +`NamedTuple`, a `DataFrameRow`, a `Tables.AbstractRow` elements +returning a data frame with expanded columns, in addition to original columns. +The newly created columns are stored at the end of the data frame (and the +`src` columns are dropped). + +`cols` (default `:setequal`) and `promote` (default `true`) keyword arguments +have the same meaning as in [`push!`](@ref). + +If `makeunique=false` (the default) produced column names must be unique. +If `makeunique=true` then duplicate column names will be suffixed with `_i` +(`i` starting at `1` for the first duplicate). 
+ +TODO: metadata + +""" +function expand!(df::AbstractDataFrame, src::ColumnIndex...; + cols::Union{Symbol, AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal, + promote::Bool=(cols in [:union, :subset]), + makeunique::Bool=false) + tmp_dfs = DataFrame[] + for idx in src + col = df[!, idx] + tmp_df = DataFrame() + for v in col + push!(tmp_df, v, cols=cols, promote=promote) + end + push!(tmp_dfs, tmp_df) end + ref_df = select!(df, Not(collect(Any, src))) + hcat!(ref_df, tmp_dfs..., makeunique=makeunique, copycols=false) + return ref_df end """ @@ -328,7 +402,7 @@ function flatten(df::AbstractDataFrame, end col1 = first(idxcols) - lengths = Int[length_maybe_scalar(x, scalar) for x in df[!, col1]] + lengths = Int[x isa scalar ? -1 : length(x) for x in df[!, col1]] for (i, coli) in enumerate(idxcols) i == 1 && continue update_lengths!(lengths, df[!, coli], scalar, df, col1, coli) @@ -364,13 +438,11 @@ function flatten(df::AbstractDataFrame, return new_df end -length_maybe_scalar(v, scalar::Type) = v isa scalar ? 
-1 : length(v) - function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type, df::AbstractDataFrame, col1, coli) for (i, v) in enumerate(col) - lv = length_maybe_scalar(v, scalar) - lv == -1 && continue + v isa scalar && continue + lv = length(v) if lengths[i] == -1 lengths[i] = lv elseif lengths[i] != lv From 7d05ac82dd86498923a5fb19003f60276a010f0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 5 Jan 2023 20:12:52 +0100 Subject: [PATCH 06/12] change default cols to :union --- src/abstractdataframe/nest.jl | 36 +++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl index f5a60d000d..05282d1cd1 100644 --- a/src/abstractdataframe/nest.jl +++ b/src/abstractdataframe/nest.jl @@ -160,7 +160,7 @@ nest(gdf::GroupedDataFrame; view::Bool=false) = """ unnest(df::AbstractDataFrame, src::ColumnIndex...; cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:setequal, + AbstractVector{<:AbstractString}}=:union, promote::Bool=true, makeunique::Bool=false) Extract the contents of one or more columns `cols` in `df` that contain @@ -170,7 +170,7 @@ gets appropriately repeated to match the number of rows of the unnested tables. The newly created columns are stored at the end of the data frame (and the `src` columns are dropped). -`cols` (default `:setequal`) and `promote` (default `true`) keyword arguments +`cols` (default `:union`) and `promote` (default `true`) keyword arguments have the same meaning as in [`push!`](@ref). If `makeunique=false` (the default) produced column names must be unique. 
@@ -182,7 +182,7 @@ TODO: metadata """ function unnest(df::AbstractDataFrame, src::ColumnIndex...; cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:setequal, + AbstractVector{<:AbstractString}}=:union, promote::Bool=(cols in [:union, :subset]), makeunique::Bool=false) ref_df = select(df, Not(collect(Any, src))) @@ -207,16 +207,16 @@ end """ expand(df::AbstractDataFrame, src::ColumnIndex...; cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:setequal, + AbstractVector{<:AbstractString}}=:union, promote::Bool=true, makeunique::Bool=false) Extract the contents of one or more columns `cols` in `df` that contain -`NamedTuple`, a `DataFrameRow`, a `Tables.AbstractRow` elements -returning a data frame with expanded columns, in addition to original columns. -The newly created columns are stored at the end of the data frame (and the -`src` columns are dropped). +`NamedTuple`, a `DataFrameRow`, an `AbstractDict` or a `Tables.AbstractRow` +elements returning a data frame with expanded columns, in addition to original +columns. The newly created columns are stored at the end of the data frame (and +the `src` columns are dropped). -`cols` (default `:setequal`) and `promote` (default `true`) keyword arguments +`cols` (default `:union`) and `promote` (default `true`) keyword arguments have the same meaning as in [`push!`](@ref). If `makeunique=false` (the default) produced column names must be unique. 
@@ -228,7 +228,7 @@ TODO: metadata """ function expand(df::AbstractDataFrame, src::ColumnIndex...; cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:setequal, + AbstractVector{<:AbstractString}}=:union, promote::Bool=(cols in [:union, :subset]), makeunique::Bool=false) ref_df = select(df, Not(collect(Any, src))) @@ -246,16 +246,16 @@ end """ expand!(df::AbstractDataFrame, src::ColumnIndex...; cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:setequal, + AbstractVector{<:AbstractString}}=:union, promote::Bool=true, makeunique::Bool=false) -Extract in-place the contents of one or more columns `cols` in `df` that contain -`NamedTuple`, a `DataFrameRow`, a `Tables.AbstractRow` elements -returning a data frame with expanded columns, in addition to original columns. -The newly created columns are stored at the end of the data frame (and the -`src` columns are dropped). +Extract in-place the contents of one or more columns `cols` in `df` that +contain `NamedTuple`, a `DataFrameRow`, an `AbstractDict`, or a +`Tables.AbstractRow` elements returning a data frame with expanded columns, in +addition to original columns. The newly created columns are stored at the end +of the data frame (and the `src` columns are dropped). -`cols` (default `:setequal`) and `promote` (default `true`) keyword arguments +`cols` (default `:union`) and `promote` (default `true`) keyword arguments have the same meaning as in [`push!`](@ref). If `makeunique=false` (the default) produced column names must be unique. 
@@ -267,7 +267,7 @@ TODO: metadata """ function expand!(df::AbstractDataFrame, src::ColumnIndex...; cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:setequal, + AbstractVector{<:AbstractString}}=:union, promote::Bool=(cols in [:union, :subset]), makeunique::Bool=false) tmp_dfs = DataFrame[] From 0e58244edbd956067579c523abd3ee3f764aa744 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 5 Jan 2023 23:03:08 +0100 Subject: [PATCH 07/12] fix wrong function name --- docs/src/lib/functions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index ef010b45a0..79c8e77aae 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -76,8 +76,8 @@ rename! ```@docs append! combine -extract -extract! +expand +expand! fillcombinations flatten hcat From 40935331a825143192e2a0f6399bcdc66e88761d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 8 Jan 2023 18:30:11 +0100 Subject: [PATCH 08/12] remove cols and promote --- src/abstractdataframe/nest.jl | 64 ++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl index 05282d1cd1..86fe673af5 100644 --- a/src/abstractdataframe/nest.jl +++ b/src/abstractdataframe/nest.jl @@ -157,11 +157,28 @@ nest(gdf::GroupedDataFrame, cols::Pair{<:Any, Symbol}...; view::Bool=false) = nest(gdf::GroupedDataFrame; view::Bool=false) = nest(gdf, valuecols(gdf) => :data, view=view) +const UNNESTING_COMMON = """ +`cols` argument affects the created columns in the following way: +* If `cols == :setequal` then each row must contain exactly the same columns + (but possibly in a different order). 
+* If `cols == :orderequal` then each row must contain the same columns in the same order (for `AbstractDict` this option requires that `keys` of row matches to allow for support of ordered dicts; however, if row is a `Dict` an error is thrown as it is an unordered collection). +* If `cols == :union` (the default) then each row can contain different columns and a `missing` value is pushed to columns missing in a given row that are present in other rows. + +If `promote=true` (the default) and the element type of a column does not allow +the type of a pushed argument then a new column with a promoted element type +allowing it is freshly allocated and stored in `df`. If `promote=false` an error +is thrown. + +""" + """ unnest(df::AbstractDataFrame, src::ColumnIndex...; - cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:union, - promote::Bool=true, makeunique::Bool=false) + makeunique::Bool=false) Extract the contents of one or more columns `cols` in `df` that contain Tables.jl tables, returning a data frame with as many rows and columns as the @@ -170,8 +187,10 @@ gets appropriately repeated to match the number of rows of the unnested tables. The newly created columns are stored at the end of the data frame (and the `src` columns are dropped). -`cols` (default `:union`) and `promote` (default `true`) keyword arguments -have the same meaning as in [`push!`](@ref). +Table stored in each row of `src` can have different columns. `missing` value is +pushed to columns missing in a given row that are present in other rows. +The element type of resulting column is determined by promotion of element types +of columns in individual rows. If `makeunique=false` (the default) produced column names must be unique. 
If `makeunique=true` then duplicate column names will be suffixed with `_i` @@ -181,9 +200,6 @@ TODO: metadata """ function unnest(df::AbstractDataFrame, src::ColumnIndex...; - cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:union, - promote::Bool=(cols in [:union, :subset]), makeunique::Bool=false) ref_df = select(df, Not(collect(Any, src))) col_count = ncol(ref_df) @@ -197,7 +213,7 @@ function unnest(df::AbstractDataFrame, src::ColumnIndex...; else # produce NamedTuple v = Tables.columntable(v) end - push!(tmp_df, v, cols=cols, promote=promote) + push!(tmp_df, v, cols=:union, promote=true) end hcat!(ref_df, tmp_df, makeunique=makeunique, copycols=false) end @@ -206,9 +222,7 @@ end """ expand(df::AbstractDataFrame, src::ColumnIndex...; - cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:union, - promote::Bool=true, makeunique::Bool=false) + makeunique::Bool=false) Extract the contents of one or more columns `cols` in `df` that contain `NamedTuple`, a `DataFrameRow`, an `AbstractDict` or a `Tables.AbstractRow` @@ -216,8 +230,10 @@ elements returning a data frame with expanded columns, in addition to original columns. The newly created columns are stored at the end of the data frame (and the `src` columns are dropped). -`cols` (default `:union`) and `promote` (default `true`) keyword arguments -have the same meaning as in [`push!`](@ref). +Table stored in each row of `src` can have different columns. `missing` value is +pushed to columns missing in a given row that are present in other rows. +The element type of resulting column is determined by promotion of element types +of columns in individual rows. If `makeunique=false` (the default) produced column names must be unique. 
If `makeunique=true` then duplicate column names will be suffixed with `_i` @@ -227,16 +243,13 @@ TODO: metadata """ function expand(df::AbstractDataFrame, src::ColumnIndex...; - cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:union, - promote::Bool=(cols in [:union, :subset]), makeunique::Bool=false) ref_df = select(df, Not(collect(Any, src))) for idx in src col = df[!, idx] tmp_df = DataFrame() for v in col - push!(tmp_df, v, cols=cols, promote=promote) + push!(tmp_df, v, cols=:union, promote=true) end hcat!(ref_df, tmp_df, makeunique=makeunique, copycols=false) end @@ -245,9 +258,7 @@ end """ expand!(df::AbstractDataFrame, src::ColumnIndex...; - cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:union, - promote::Bool=true, makeunique::Bool=false) + makeunique::Bool=false) Extract in-place the contents of one or more columns `cols` in `df` that contain `NamedTuple`, a `DataFrameRow`, an `AbstractDict`, or a @@ -255,8 +266,10 @@ contain `NamedTuple`, a `DataFrameRow`, an `AbstractDict`, or a addition to original columns. The newly created columns are stored at the end of the data frame (and the `src` columns are dropped). -`cols` (default `:union`) and `promote` (default `true`) keyword arguments -have the same meaning as in [`push!`](@ref). +Table stored in each row of `src` can have different columns. `missing` value is +pushed to columns missing in a given row that are present in other rows. +The element type of resulting column is determined by promotion of element types +of columns in individual rows. If `makeunique=false` (the default) produced column names must be unique. 
If `makeunique=true` then duplicate column names will be suffixed with `_i` @@ -266,16 +279,13 @@ TODO: metadata """ function expand!(df::AbstractDataFrame, src::ColumnIndex...; - cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:union, - promote::Bool=(cols in [:union, :subset]), makeunique::Bool=false) tmp_dfs = DataFrame[] for idx in src col = df[!, idx] tmp_df = DataFrame() for v in col - push!(tmp_df, v, cols=cols, promote=promote) + push!(tmp_df, v, cols=:union, promote=true) end push!(tmp_dfs, tmp_df) end From 698330f02bf653e247eba447b565becd89b656b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Mon, 9 Jan 2023 15:23:35 +0100 Subject: [PATCH 09/12] change to extract --- docs/src/lib/functions.md | 4 ++-- src/abstractdataframe/nest.jl | 32 +++++++++++++++----------------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 79c8e77aae..ef010b45a0 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -76,8 +76,8 @@ rename! ```@docs append! combine -expand -expand! +extract +extract! 
fillcombinations flatten hcat diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl index 86fe673af5..d5237a0112 100644 --- a/src/abstractdataframe/nest.jl +++ b/src/abstractdataframe/nest.jl @@ -205,28 +205,26 @@ function unnest(df::AbstractDataFrame, src::ColumnIndex...; col_count = ncol(ref_df) for idx in src col = df[!, idx] - tmp_df = DataFrame() - for v in col - if v isa DataFrame # produce DataFrameRow - v = DataFrame([n => [c] for (n, c) in pairs(eachcol(v))], - copycols=false) |> only - else # produce NamedTuple - v = Tables.columntable(v) + if all(x -> x isa AbstractDataFrame, col) + tmp_df = reduce(vcat, col, cols=:union) + else + tmp_df = DataFrame() + for v in col + append!(tmp_df, v, cols=:union, promote=true) end - push!(tmp_df, v, cols=:union, promote=true) end hcat!(ref_df, tmp_df, makeunique=makeunique, copycols=false) end - return flatten(ref_df, col_count+1:ncol(ref_df), scalar=Missing) + return ref_df end """ - expand(df::AbstractDataFrame, src::ColumnIndex...; - makeunique::Bool=false) + extract(df::AbstractDataFrame, src::ColumnIndex...; + makeunique::Bool=false) Extract the contents of one or more columns `cols` in `df` that contain `NamedTuple`, a `DataFrameRow`, an `AbstractDict` or a `Tables.AbstractRow` -elements returning a data frame with expanded columns, in addition to original +elements returning a data frame with extracted columns, in addition to original columns. The newly created columns are stored at the end of the data frame (and the `src` columns are dropped). 
@@ -242,7 +240,7 @@ If `makeunique=true` then duplicate column names will be suffixed with `_i` TODO: metadata """ -function expand(df::AbstractDataFrame, src::ColumnIndex...; +function extract(df::AbstractDataFrame, src::ColumnIndex...; makeunique::Bool=false) ref_df = select(df, Not(collect(Any, src))) for idx in src @@ -257,12 +255,12 @@ function expand(df::AbstractDataFrame, src::ColumnIndex...; end """ - expand!(df::AbstractDataFrame, src::ColumnIndex...; - makeunique::Bool=false) + extract!(df::AbstractDataFrame, src::ColumnIndex...; + makeunique::Bool=false) Extract in-place the contents of one or more columns `cols` in `df` that contain `NamedTuple`, a `DataFrameRow`, an `AbstractDict`, or a -`Tables.AbstractRow` elements returning a data frame with expanded columns, in +`Tables.AbstractRow` elements returning a data frame with extracted columns, in addition to original columns. The newly created columns are stored at the end of the data frame (and the `src` columns are dropped). 
@@ -278,7 +276,7 @@ If `makeunique=true` then duplicate column names will be suffixed with `_i` TODO: metadata """ -function expand!(df::AbstractDataFrame, src::ColumnIndex...; +function extract!(df::AbstractDataFrame, src::ColumnIndex...; makeunique::Bool=false) tmp_dfs = DataFrame[] for idx in src From 70178676f793cb58de1dc428e205f0ec4e22e803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 5 Feb 2023 09:19:57 +0100 Subject: [PATCH 10/12] remove `flatten` from the PR --- NEWS.md | 2 +- src/abstractdataframe/abstractdataframe.jl | 120 +++++++++- src/abstractdataframe/nest.jl | 254 +++------------------ test/reshape.jl | 39 ---- 4 files changed, 157 insertions(+), 258 deletions(-) diff --git a/NEWS.md b/NEWS.md index d92ec60745..5a38cd4f04 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,7 +15,7 @@ * Joining functions now support `order` keyword argument allowing the user to specify the order of the rows in the produced table ([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233)) -* Add `nest`, `unnest`, `extract`, and `extract!` functions; improve `flatten` +* Add `nest`, `unnest`, `extract`, and `extract!` functions ([#3258](https://github.com/JuliaData/DataFrames.jl/pull/3258)) ## Bug fixes diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index ec85d5d458..7654f5d566 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2502,6 +2502,125 @@ function Missings.allowmissing(df::AbstractDataFrame, return new_df end +""" + flatten(df::AbstractDataFrame, cols) +When columns `cols` of data frame `df` have iterable elements that define +`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each +element of each `col` in `cols` is flattened, meaning the column corresponding +to `col` becomes a longer vector where the original entries are concatenated. 
+Elements of row `i` of `df` in columns other than `cols` will be repeated +according to the length of `df[i, col]`. These lengths must therefore be the +same for each `col` in `cols`, or else an error is raised. Note that these +elements are not copied, and thus if they are mutable changing them in the +returned `DataFrame` will affect `df`. +`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). +$METADATA_FIXED +# Examples +```jldoctest +julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]]) +2×3 DataFrame + Row │ a b c + │ Int64 Array… Array… +─────┼─────────────────────── + 1 │ 1 [1, 2] [5, 6] + 2 │ 2 [3, 4] [7, 8] +julia> flatten(df1, :b) +4×3 DataFrame + Row │ a b c + │ Int64 Int64 Array… +─────┼────────────────────── + 1 │ 1 1 [5, 6] + 2 │ 1 2 [5, 6] + 3 │ 2 3 [7, 8] + 4 │ 2 4 [7, 8] +julia> flatten(df1, [:b, :c]) +4×3 DataFrame + Row │ a b c + │ Int64 Int64 Int64 +─────┼───────────────────── + 1 │ 1 1 5 + 2 │ 1 2 6 + 3 │ 2 3 7 + 4 │ 2 4 8 +julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")]) +2×2 DataFrame + Row │ a b + │ Int64 Tuple… +─────┼─────────────────── + 1 │ 1 ("p", "q") + 2 │ 2 ("r", "s") +julia> flatten(df2, :b) +4×2 DataFrame + Row │ a b + │ Int64 String +─────┼─────────────── + 1 │ 1 p + 2 │ 1 q + 3 │ 2 r + 4 │ 2 s +julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]]) +2×3 DataFrame + Row │ a b c + │ Int64 Array… Array… +─────┼─────────────────────── + 1 │ 1 [1, 2] [5, 6] + 2 │ 2 [3, 4] [7] +julia> flatten(df3, [:b, :c]) +ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2 +``` +""" +function flatten(df::AbstractDataFrame, + cols::Union{ColumnIndex, MultiColumnIndex}) + _check_consistency(df) + + idxcols = index(df)[cols] + if isempty(idxcols) + cdf = copy(df) + _drop_all_nonnote_metadata!(cdf) + return cdf + end + + col1 = first(idxcols) + lengths = length.(df[!, col1]) + for col in idxcols + v = df[!, col] + if any(x -> length(x[1]) 
!= x[2], zip(v, lengths)) + r = findfirst(x -> x != 0, length.(v) .- lengths) + colnames = _names(df) + throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " * + "and :$(colnames[col]) are not the same in row $r")) + end + end + + new_df = similar(df[!, Not(cols)], sum(lengths)) + for name in _names(new_df) + repeat_lengths!(new_df[!, name], df[!, name], lengths) + end + length(idxcols) > 1 && sort!(idxcols) + for col in idxcols + col_to_flatten = df[!, col] + fast_path = eltype(col_to_flatten) isa AbstractVector && + !isempty(col_to_flatten) + flattened_col = fast_path ? + reduce(vcat, col_to_flatten) : + collect(Iterators.flatten(col_to_flatten)) + insertcols!(new_df, col, _names(df)[col] => flattened_col) + end + + _copy_all_note_metadata!(new_df, df) + return new_df +end + +function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector, + lengths::AbstractVector{Int}) + counter = 1 + @inbounds for i in eachindex(shortold) + l = lengths[i] + longnew[counter:(counter + l - 1)] .= Ref(shortold[i]) + counter += l + end +end + # Disallowed getindex and setindex! 
operations that are a common mistake Base.getindex(::AbstractDataFrame, ::Union{Symbol, Integer, AbstractString}) = @@ -3272,4 +3391,3 @@ function Base.iterate(itr::Iterators.PartitionIterator{<:AbstractDataFrame}, sta r = min(state + itr.n - 1, last_idx) return view(itr.c, state:r, :), r + 1 end - diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl index d5237a0112..03d1131295 100644 --- a/src/abstractdataframe/nest.jl +++ b/src/abstractdataframe/nest.jl @@ -26,8 +26,8 @@ TODO: metadata julia> df = DataFrame(id = ["b", "a", "a", "c", "b", "b"], x = 1:6, y = 11:16, z='a':'f') 6×4 DataFrame - Row │ id x y z - │ String Int64 Int64 Char + Row │ id x y z + │ String Int64 Int64 Char ─────┼──────────────────────────── 1 │ b 1 11 a 2 │ a 2 12 b @@ -39,110 +39,110 @@ julia> df = DataFrame(id = ["b", "a", "a", "c", "b", "b"], julia> n1 = nest(groupby(df, :id)) 3×2 DataFrame Row │ id data - │ String DataFrame + │ String DataFrame ─────┼─────────────────────── - 1 │ b 3×3 DataFrame - 2 │ a 2×3 DataFrame - 3 │ c 1×3 DataFrame + 1 │ b 3×3 DataFrame + 2 │ a 2×3 DataFrame + 3 │ c 1×3 DataFrame julia> n1.data 3-element Vector{DataFrame}: 3×3 DataFrame - Row │ x y z - │ Int64 Int64 Char + Row │ x y z + │ Int64 Int64 Char ─────┼──────────────────── 1 │ 1 11 a 2 │ 5 15 e 3 │ 6 16 f 2×3 DataFrame - Row │ x y z - │ Int64 Int64 Char + Row │ x y z + │ Int64 Int64 Char ─────┼──────────────────── 1 │ 2 12 b 2 │ 3 13 c 1×3 DataFrame - Row │ x y z - │ Int64 Int64 Char + Row │ x y z + │ Int64 Int64 Char ─────┼──────────────────── 1 │ 4 14 d julia> n2 = nest(groupby(df, :id), [:z, :x] => :zx) 3×2 DataFrame Row │ id zx - │ String DataFrame + │ String DataFrame ─────┼─────────────────────── - 1 │ b 3×2 DataFrame - 2 │ a 2×2 DataFrame - 3 │ c 1×2 DataFrame + 1 │ b 3×2 DataFrame + 2 │ a 2×2 DataFrame + 3 │ c 1×2 DataFrame julia> n2.zx 3-element Vector{DataFrame}: 3×2 DataFrame - Row │ z x - │ Char Int64 + Row │ z x + │ Char Int64 ─────┼───────────── 1 │ a 1 2 │ e 5 3 │ f 6 
2×2 DataFrame - Row │ z x - │ Char Int64 + Row │ z x + │ Char Int64 ─────┼───────────── 1 │ b 2 2 │ c 3 1×2 DataFrame - Row │ z x - │ Char Int64 + Row │ z x + │ Char Int64 ─────┼───────────── 1 │ d 4 julia> n3 = nest(groupby(df, :id), :x => :x, [:y, :z] => :yz) 3×3 DataFrame Row │ id x yz - │ String DataFrame DataFrame + │ String DataFrame DataFrame ─────┼────────────────────────────────────── - 1 │ b 3×1 DataFrame 3×2 DataFrame - 2 │ a 2×1 DataFrame 2×2 DataFrame - 3 │ c 1×1 DataFrame 1×2 DataFrame + 1 │ b 3×1 DataFrame 3×2 DataFrame + 2 │ a 2×1 DataFrame 2×2 DataFrame + 3 │ c 1×1 DataFrame 1×2 DataFrame julia> n3.x 3-element Vector{DataFrame}: 3×1 DataFrame - Row │ x - │ Int64 + Row │ x + │ Int64 ─────┼─────── 1 │ 1 2 │ 5 3 │ 6 2×1 DataFrame - Row │ x - │ Int64 + Row │ x + │ Int64 ─────┼─────── 1 │ 2 2 │ 3 1×1 DataFrame - Row │ x - │ Int64 + Row │ x + │ Int64 ─────┼─────── 1 │ 4 julia> n3.yz 3-element Vector{DataFrame}: 3×2 DataFrame - Row │ y z - │ Int64 Char + Row │ y z + │ Int64 Char ─────┼───────────── 1 │ 11 a 2 │ 15 e 3 │ 16 f 2×2 DataFrame - Row │ y z - │ Int64 Char + Row │ y z + │ Int64 Char ─────┼───────────── 1 │ 12 b 2 │ 13 c 1×2 DataFrame - Row │ y z + Row │ y z │ Int64 Char ─────┼───────────── 1 │ 14 d @@ -291,183 +291,3 @@ function extract!(df::AbstractDataFrame, src::ColumnIndex...; hcat!(ref_df, tmp_dfs..., makeunique=makeunique, copycols=false) return ref_df end - -""" - flatten(df::AbstractDataFrame, cols; scalar::Type) - -When columns `cols` of data frame `df` have iterable elements that define -`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each -element of each `col` in `cols` is flattened, meaning the column corresponding -to `col` becomes a longer vector where the original entries are concatenated. -Elements of row `i` of `df` in columns other than `cols` will be repeated -according to the length of `df[i, col]`. These lengths must therefore be the -same for each `col` in `cols`, or else an error is raised. 
Note that these -elements are not copied, and thus if they are mutable changing them in the -returned `DataFrame` will affect `df`. - -`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). - -If `scalar` is passed then values that have this type in flattened columns -are treated as scalars and broadcasted as many times as is needed to match -lengths of values stored in other columns. One row is produced if all -corresponding values are scalars. - -$METADATA_FIXED - -# Examples - -```jldoctest -julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]]) -2×3 DataFrame - Row │ a b c - │ Int64 Array… Array… -─────┼─────────────────────── - 1 │ 1 [1, 2] [5, 6] - 2 │ 2 [3, 4] [7, 8] - -julia> flatten(df1, :b) -4×3 DataFrame - Row │ a b c - │ Int64 Int64 Array… -─────┼────────────────────── - 1 │ 1 1 [5, 6] - 2 │ 1 2 [5, 6] - 3 │ 2 3 [7, 8] - 4 │ 2 4 [7, 8] - -julia> flatten(df1, [:b, :c]) -4×3 DataFrame - Row │ a b c - │ Int64 Int64 Int64 -─────┼───────────────────── - 1 │ 1 1 5 - 2 │ 1 2 6 - 3 │ 2 3 7 - 4 │ 2 4 8 - -julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")]) -2×2 DataFrame - Row │ a b - │ Int64 Tuple… -─────┼─────────────────── - 1 │ 1 ("p", "q") - 2 │ 2 ("r", "s") - -julia> flatten(df2, :b) -4×2 DataFrame - Row │ a b - │ Int64 String -─────┼─────────────── - 1 │ 1 p - 2 │ 1 q - 3 │ 2 r - 4 │ 2 s - -julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]]) -2×3 DataFrame - Row │ a b c - │ Int64 Array… Array… -─────┼─────────────────────── - 1 │ 1 [1, 2] [5, 6] - 2 │ 2 [3, 4] [7] - -julia> flatten(df3, [:b, :c]) -ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2 - -julia> df4 = DataFrame(a=[1, 2, 3], - b=[[1, 2], missing, missing], - c=[[5, 6], missing, [7, 8]]) -3×3 DataFrame - Row │ a b c - │ Int64 Array…? Array…? 
-─────┼───────────────────────── - 1 │ 1 [1, 2] [5, 6] - 2 │ 2 missing missing - 3 │ 3 missing [7, 8] - -julia> flatten(df4, [:b, :c], scalar=Missing) -5×3 DataFrame - Row │ a b c - │ Int64 Int64? Int64? -─────┼───────────────────────── - 1 │ 1 1 5 - 2 │ 1 2 6 - 3 │ 2 missing missing - 4 │ 3 missing 7 - 5 │ 3 missing 8 -``` -""" -function flatten(df::AbstractDataFrame, - cols::Union{ColumnIndex, MultiColumnIndex}; - scalar::Type=Union{}) - _check_consistency(df) - - idxcols = index(df)[cols] - if isempty(idxcols) - cdf = copy(df) - _drop_all_nonnote_metadata!(cdf) - return cdf - end - - col1 = first(idxcols) - lengths = Int[x isa scalar ? -1 : length(x) for x in df[!, col1]] - for (i, coli) in enumerate(idxcols) - i == 1 && continue - update_lengths!(lengths, df[!, coli], scalar, df, col1, coli) - end - - # handle case where in all columns we had a scalar - # in this case we keep it one time - for i in 1:length(lengths) - lengths[i] == -1 && (lengths[i] = 1) - end - - new_df = similar(df[!, Not(cols)], sum(lengths)) - for name in _names(new_df) - repeat_lengths!(new_df[!, name], df[!, name], lengths) - end - length(idxcols) > 1 && sort!(idxcols) - for col in idxcols - col_to_flatten = df[!, col] - fast_path = eltype(col_to_flatten) isa AbstractVector && - !isempty(col_to_flatten) - flattened_col = if fast_path - reduce(vcat, col_to_flatten) - elseif scalar === Union{} - collect(Iterators.flatten(col_to_flatten)) - else - collect(Iterators.flatten(v isa scalar ? 
Iterators.repeated(v, l) : v - for (l, v) in zip(lengths, col_to_flatten))) - end - insertcols!(new_df, col, _names(df)[col] => flattened_col) - end - - _copy_all_note_metadata!(new_df, df) - return new_df -end - -function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type, - df::AbstractDataFrame, col1, coli) - for (i, v) in enumerate(col) - v isa scalar && continue - lv = length(v) - if lengths[i] == -1 - lengths[i] = lv - elseif lengths[i] != lv - colnames = _names(df) - throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " * - "and :$(colnames[coli]) are not the same in row $i")) - end - end -end - -function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector, - lengths::AbstractVector{Int}) - counter = 1 - @inbounds for i in eachindex(shortold) - l = lengths[i] - longnew[counter:(counter + l - 1)] .= Ref(shortold[i]) - counter += l - end -end - diff --git a/test/reshape.jl b/test/reshape.jl index d00957297e..58cf7bfce0 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -431,45 +431,6 @@ end @test flatten(DataFrame(), All()) == DataFrame() end -@testset "flatten with scalar" begin - df = DataFrame(a=[1, 2, 3], - b=[[1, 2], missing, [3, 4]], - c=[[5, 6], missing, missing]) - @test flatten(df, :a) ≅ df - @test_throws MethodError flatten(df, :b) - @test flatten(df, :b, scalar=Missing) ≅ - DataFrame(a=[1, 1, 2, 3, 3], - b=[1, 2, missing, 3, 4], - c=[[5, 6], [5, 6], missing, missing, missing]) - @test flatten(df, [:b, :c], scalar=Missing) ≅ - DataFrame(a=[1, 1, 2, 3, 3], - b=[1, 2, missing, 3, 4], - c=[5, 6, missing, missing, missing]) - @test flatten(df, [:b, :c], scalar=Any) ≅ df - - df = DataFrame(a=missing, b=[1], c=missing, d=[[1, 2]]) - @test_throws ArgumentError flatten(df, All(), scalar=Missing) - @test flatten(df, Not(:d), scalar=Missing) ≅ - DataFrame(a=missing, b=1, c=missing, d=[[1, 2]]) - @test flatten(df, Not(:b), scalar=Missing) ≅ - DataFrame(a=[missing, missing], b=[1, 1], c=[missing, 
missing], d=[1, 2]) - - df = DataFrame(a="xy", b=[[1, 2]]) - @test flatten(df, [:a, :b]) == DataFrame(a=['x', 'y'], b=[1, 2]) - @test flatten(df, [:a, :b], scalar=String) == - DataFrame(a=["xy", "xy"], b=[1, 2]) - - df = DataFrame(a=[[1], [], [3, 4], missing], b = missings(4), id=1:4) - @test flatten(df, [:a, :b], scalar=Missing) ≅ - DataFrame(a=[1, 3, 4, missing], b=missings(4), id=[1, 3, 3, 4]) - df = DataFrame(id=1:10, x=[1:i-1 for i in 1:10]) - df.y = [iseven(last(v)) ? missing : v for v in df.x] - @test flatten(df, [:x, :y], scalar=Missing) ≅ - DataFrame(id=reduce(vcat, [fill(i, i-1) for i in 2:10]), - x=reduce(vcat, [1:i for i in 1:9]), - y=reduce(vcat, [iseven(i) ? missings(i) : (1:i) for i in 1:9])) -end - @testset "stack categorical test" begin Random.seed!(1234) d1 = DataFrame(a=repeat([1:3;], inner=[4]), From cca1c8710b015d5c249b0b82488739469c505c2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 5 Feb 2023 09:30:02 +0100 Subject: [PATCH 11/12] fix newlines --- src/abstractdataframe/abstractdataframe.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 7654f5d566..78b3928d6b 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2504,6 +2504,7 @@ end """ flatten(df::AbstractDataFrame, cols) + When columns `cols` of data frame `df` have iterable elements that define `length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each element of each `col` in `cols` is flattened, meaning the column corresponding @@ -2513,8 +2514,11 @@ according to the length of `df[i, col]`. These lengths must therefore be the same for each `col` in `cols`, or else an error is raised. Note that these elements are not copied, and thus if they are mutable changing them in the returned `DataFrame` will affect `df`. 
+ `cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR). + $METADATA_FIXED + # Examples ```jldoctest julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]]) @@ -2524,6 +2528,7 @@ julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]]) ─────┼─────────────────────── 1 │ 1 [1, 2] [5, 6] 2 │ 2 [3, 4] [7, 8] + julia> flatten(df1, :b) 4×3 DataFrame Row │ a b c @@ -2533,6 +2538,7 @@ julia> flatten(df1, :b) 2 │ 1 2 [5, 6] 3 │ 2 3 [7, 8] 4 │ 2 4 [7, 8] + julia> flatten(df1, [:b, :c]) 4×3 DataFrame Row │ a b c @@ -2542,6 +2548,7 @@ julia> flatten(df1, [:b, :c]) 2 │ 1 2 6 3 │ 2 3 7 4 │ 2 4 8 + julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")]) 2×2 DataFrame Row │ a b @@ -2549,6 +2556,7 @@ julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")]) ─────┼─────────────────── 1 │ 1 ("p", "q") 2 │ 2 ("r", "s") + julia> flatten(df2, :b) 4×2 DataFrame Row │ a b @@ -2558,6 +2566,7 @@ julia> flatten(df2, :b) 2 │ 1 q 3 │ 2 r 4 │ 2 s + julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]]) 2×3 DataFrame Row │ a b c @@ -2565,6 +2574,7 @@ julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]]) ─────┼─────────────────────── 1 │ 1 [1, 2] [5, 6] 2 │ 2 [3, 4] [7] + julia> flatten(df3, [:b, :c]) ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2 ``` From 5c7111c4201af139c99762d5a7a34add445f00ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 5 Feb 2023 09:30:49 +0100 Subject: [PATCH 12/12] another newline fix --- src/abstractdataframe/abstractdataframe.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 78b3928d6b..a3910d0f77 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -2520,6 +2520,7 @@ returned `DataFrame` will affect `df`. 
$METADATA_FIXED # Examples + ```jldoctest julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]]) 2×3 DataFrame