From c459de92707df92e23eded7a0dcf559cd1e99f99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 28 Dec 2022 20:33:11 +0100
Subject: [PATCH 01/12] add nest, unnest, improve flatten

---
 src/DataFrames.jl                          |   3 +
 src/abstractdataframe/abstractdataframe.jl | 130 -------
 src/abstractdataframe/nest.jl              | 390 +++++++++++++++++++++
 test/reshape.jl                            |  39 +++
 4 files changed, 432 insertions(+), 130 deletions(-)
 create mode 100644 src/abstractdataframe/nest.jl

diff --git a/src/DataFrames.jl b/src/DataFrames.jl
index c5d8366214..4e117e72f3 100644
--- a/src/DataFrames.jl
+++ b/src/DataFrames.jl
@@ -76,6 +76,7 @@ export AbstractDataFrame,
        mapcols,
        mapcols!,
        ncol,
+       nest,
        nonunique,
        nrow,
        order,
@@ -95,6 +96,7 @@ export AbstractDataFrame,
        transform,
        transform!,
        unique!,
+       unnest,
        unstack,
        valuecols,
        metadata,
@@ -166,6 +168,7 @@ include("abstractdataframe/show.jl")
 include("groupeddataframe/show.jl")
 include("dataframerow/show.jl")
 include("abstractdataframe/io.jl")
+include("abstractdataframe/nest.jl")
 
 include("other/tables.jl")
 include("other/names.jl")
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 9fba690d49..ec85d5d458 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -2502,136 +2502,6 @@ function Missings.allowmissing(df::AbstractDataFrame,
     return new_df
 end
 
-"""
-    flatten(df::AbstractDataFrame, cols)
-
-When columns `cols` of data frame `df` have iterable elements that define
-`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
-element of each `col` in `cols` is flattened, meaning the column corresponding
-to `col` becomes a longer vector where the original entries are concatenated.
-Elements of row `i` of `df` in columns other than `cols` will be repeated
-according to the length of `df[i, col]`. These lengths must therefore be the
-same for each `col` in `cols`, or else an error is raised. Note that these
-elements are not copied, and thus if they are mutable changing them in the
-returned `DataFrame` will affect `df`.
-
-`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
-
-$METADATA_FIXED
-
-# Examples
-
-```jldoctest
-julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
-2×3 DataFrame
- Row │ a      b       c
-     │ Int64  Array…  Array…
-─────┼───────────────────────
-   1 │     1  [1, 2]  [5, 6]
-   2 │     2  [3, 4]  [7, 8]
-
-julia> flatten(df1, :b)
-4×3 DataFrame
- Row │ a      b      c
-     │ Int64  Int64  Array…
-─────┼──────────────────────
-   1 │     1      1  [5, 6]
-   2 │     1      2  [5, 6]
-   3 │     2      3  [7, 8]
-   4 │     2      4  [7, 8]
-
-julia> flatten(df1, [:b, :c])
-4×3 DataFrame
- Row │ a      b      c
-     │ Int64  Int64  Int64
-─────┼─────────────────────
-   1 │     1      1      5
-   2 │     1      2      6
-   3 │     2      3      7
-   4 │     2      4      8
-
-julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")])
-2×2 DataFrame
- Row │ a      b
-     │ Int64  Tuple…
-─────┼───────────────────
-   1 │     1  ("p", "q")
-   2 │     2  ("r", "s")
-
-julia> flatten(df2, :b)
-4×2 DataFrame
- Row │ a      b
-     │ Int64  String
-─────┼───────────────
-   1 │     1  p
-   2 │     1  q
-   3 │     2  r
-   4 │     2  s
-
-julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])
-2×3 DataFrame
- Row │ a      b       c
-     │ Int64  Array…  Array…
-─────┼───────────────────────
-   1 │     1  [1, 2]  [5, 6]
-   2 │     2  [3, 4]  [7]
-
-julia> flatten(df3, [:b, :c])
-ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
-```
-"""
-function flatten(df::AbstractDataFrame,
-                 cols::Union{ColumnIndex, MultiColumnIndex})
-    _check_consistency(df)
-
-    idxcols = index(df)[cols]
-    if isempty(idxcols)
-        cdf = copy(df)
-        _drop_all_nonnote_metadata!(cdf)
-        return cdf
-    end
-
-    col1 = first(idxcols)
-    lengths = length.(df[!, col1])
-    for col in idxcols
-        v = df[!, col]
-        if any(x -> length(x[1]) != x[2], zip(v, lengths))
-            r = findfirst(x -> x != 0, length.(v) .- lengths)
-            colnames = _names(df)
-            throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
-                                "and :$(colnames[col]) are not the same in row $r"))
-        end
-    end
-
-    new_df = similar(df[!, Not(cols)], sum(lengths))
-    for name in _names(new_df)
-        repeat_lengths!(new_df[!, name], df[!, name], lengths)
-    end
-    length(idxcols) > 1 && sort!(idxcols)
-    for col in idxcols
-        col_to_flatten = df[!, col]
-        fast_path = eltype(col_to_flatten) isa AbstractVector &&
-                    !isempty(col_to_flatten)
-        flattened_col = fast_path ?
-            reduce(vcat, col_to_flatten) :
-            collect(Iterators.flatten(col_to_flatten))
-        insertcols!(new_df, col, _names(df)[col] => flattened_col)
-    end
-
-    _copy_all_note_metadata!(new_df, df)
-    return new_df
-end
-
-function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector,
-                         lengths::AbstractVector{Int})
-    counter = 1
-    @inbounds for i in eachindex(shortold)
-        l = lengths[i]
-        longnew[counter:(counter + l - 1)] .= Ref(shortold[i])
-        counter += l
-    end
-end
-
 # Disallowed getindex and setindex! operations that are a common mistake
 
 Base.getindex(::AbstractDataFrame, ::Union{Symbol, Integer, AbstractString}) =
diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl
new file mode 100644
index 0000000000..966e79ece4
--- /dev/null
+++ b/src/abstractdataframe/nest.jl
@@ -0,0 +1,390 @@
+"""
+    nest(gdf::GroupedDataFrame, cols::Pair{<:AbstractString}...)
+    nest(gdf::GroupedDataFrame, cols::Pair{Symbol}...)
+    nest(gdf::GroupedDataFrame)
+
+Nest multiple columns per group of `gdf` into a single column as data frames.
+
+Every `cols` argument must be a pair `column_selector => column_name`.
+If no `cols` are passed, then by default `valuecols(gdf) => :data`
+nesting is performed.
+
+Return a data frame having all grouping columns of `gdf` followed by one
+or more columns where `column_name` is a name of the column storing data frames,
+where every data frame consists of columns picked by `column_selector` values
+computed for each group of `gdf`.
+
+TODO: metadata
+
+# Examples
+
+julia> df = DataFrame(id = ["b", "a", "a", "c", "b", "b"],
+                      x = 1:6, y = 11:16, z='a':'f')
+6×4 DataFrame
+ Row │ id      x      y      z    
+     │ String  Int64  Int64  Char 
+─────┼────────────────────────────
+   1 │ b           1     11  a
+   2 │ a           2     12  b
+   3 │ a           3     13  c
+   4 │ c           4     14  d
+   5 │ b           5     15  e
+   6 │ b           6     16  f
+
+julia> n1 = nest(groupby(df, :id))
+3×2 DataFrame
+ Row │ id      data
+     │ String  DataFrame     
+─────┼───────────────────────
+   1 │ b       3×3 DataFrame 
+   2 │ a       2×3 DataFrame 
+   3 │ c       1×3 DataFrame 
+
+julia> n1.data
+3-element Vector{DataFrame}:
+ 3×3 DataFrame
+ Row │ x      y      z    
+     │ Int64  Int64  Char 
+─────┼────────────────────
+   1 │     1     11  a
+   2 │     5     15  e
+   3 │     6     16  f
+ 2×3 DataFrame
+ Row │ x      y      z    
+     │ Int64  Int64  Char 
+─────┼────────────────────
+   1 │     2     12  b
+   2 │     3     13  c
+ 1×3 DataFrame
+ Row │ x      y      z    
+     │ Int64  Int64  Char 
+─────┼────────────────────
+   1 │     4     14  d
+
+julia> n2 = nest(groupby(df, :id), [:z, :x] => :zx)
+3×2 DataFrame
+ Row │ id      zx
+     │ String  DataFrame     
+─────┼───────────────────────
+   1 │ b       3×2 DataFrame 
+   2 │ a       2×2 DataFrame 
+   3 │ c       1×2 DataFrame 
+
+julia> n2.zx
+3-element Vector{DataFrame}:
+ 3×2 DataFrame
+ Row │ z     x     
+     │ Char  Int64 
+─────┼─────────────
+   1 │ a         1
+   2 │ e         5
+   3 │ f         6
+ 2×2 DataFrame
+ Row │ z     x     
+     │ Char  Int64 
+─────┼─────────────
+   1 │ b         2
+   2 │ c         3
+ 1×2 DataFrame
+ Row │ z     x     
+     │ Char  Int64 
+─────┼─────────────
+   1 │ d         4
+
+julia> n3 = nest(groupby(df, :id), :x => "x", [:y, :z] => "yz")
+3×3 DataFrame
+ Row │ id      x              yz
+     │ String  DataFrame      DataFrame     
+─────┼──────────────────────────────────────
+   1 │ b       3×1 DataFrame  3×2 DataFrame 
+   2 │ a       2×1 DataFrame  2×2 DataFrame 
+   3 │ c       1×1 DataFrame  1×2 DataFrame 
+
+julia> n3.x
+3-element Vector{DataFrame}:
+ 3×1 DataFrame
+ Row │ x     
+     │ Int64 
+─────┼───────
+   1 │     1
+   2 │     5
+   3 │     6
+ 2×1 DataFrame
+ Row │ x     
+     │ Int64 
+─────┼───────
+   1 │     2
+   2 │     3
+ 1×1 DataFrame
+ Row │ x     
+     │ Int64 
+─────┼───────
+   1 │     4
+
+julia> n3.yz
+3-element Vector{DataFrame}:
+ 3×2 DataFrame
+ Row │ y      z    
+     │ Int64  Char 
+─────┼─────────────
+   1 │    11  a
+   2 │    15  e
+   3 │    16  f
+ 2×2 DataFrame
+ Row │ y      z    
+     │ Int64  Char 
+─────┼─────────────
+   1 │    12  b
+   2 │    13  c
+ 1×2 DataFrame
+ Row │ y      z    
+     │ Int64  Char
+─────┼─────────────
+   1 │    14  d
+"""
+nest(gdf::GroupedDataFrame, cols::Pair{<:Any, <:AbstractString}...) =
+    combine(gdf, (sdf -> (; Symbol(dst) => select(sdf, index(sdf)[src]))
+                  for (src, dst) in cols)...)
+nest(gdf::GroupedDataFrame, cols::Pair{<:Any, Symbol}...) =
+    combine(gdf, (sdf -> (; dst => select(sdf, index(sdf)[src]))
+                  for (src, dst) in cols)...)
+nest(gdf::GroupedDataFrame) = nest(gdf, valuecols(gdf) => :data)
+
+"""
+    unnest(df::AbstractDataFrame, src::ColumnIndex...;
+           cols::Union{Symbol, AbstractVector{Symbol},
+                       AbstractVector{<:AbstractString}}=:setequal,
+           promote::Bool=true,
+           makeunique::Bool=false, flatten::Bool=true)
+
+Unnest one or more columns `src` into multiple columns. The newly created
+columns are stored at the end of the data frame (and the `src` column is
+dropped).
+
+Each `src` column must contain a `NamedTuple`, a `DataFrameRow`, a
+`Tables.AbstractRow`, or Tables.jl table.
+
+`cols` (default `:setequal`) and `promote` (default `true`) keyword arguments
+have the same meaning as in [`push!`](@ref).
+
+If `makeunique=false` (the default) produced column names must be unique.
+If `makeunique=true` then duplicate column names will be suffixed with `_i`
+(`i` starting at `1` for the first duplicate).
+
+If `flatten=true` (the default) then newly created columns are flattened
+using [`flatten`](@ref) with `scalar=Missing` keyword argument.
+
+TODO: metadata
+
+"""
+function unnest(df::AbstractDataFrame, src::ColumnIndex...;
+                cols::Union{Symbol, AbstractVector{Symbol},
+                            AbstractVector{<:AbstractString}}=:setequal,
+                promote::Bool=(cols in [:union, :subset]),
+                makeunique::Bool=false, flatten::Bool=true)
+    ref_df = select(df, Not(collect(Any, src)))
+    col_count = ncol(ref_df)
+    for idx in src
+        col = df[!, idx]
+        tmp_df = DataFrame()
+        for v in col
+            if v isa DataFrame # produce DataFrameRow
+                # if flatten=false make a copy to avoid aliases
+                v = DataFrame([n => [flatten ? c : copy(c)]
+                               for (n, c) in pairs(eachcol(v))],
+                              copycols=false) |> only
+            elseif Tables.istable(v) # produce NamedTuple
+                v = Tables.columntable(v)
+            end
+            push!(tmp_df, v, cols=cols, promote=promote)
+        end
+        hcat!(ref_df, tmp_df, makeunique=makeunique, copycols=false)
+    end
+    return if flatten
+        DataFrames.flatten(ref_df, col_count+1:ncol(ref_df), scalar=Missing)
+    else
+        ref_df
+    end
+end
+
+"""
+    flatten(df::AbstractDataFrame, cols; scalar::Type)
+
+When columns `cols` of data frame `df` have iterable elements that define
+`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
+element of each `col` in `cols` is flattened, meaning the column corresponding
+to `col` becomes a longer vector where the original entries are concatenated.
+Elements of row `i` of `df` in columns other than `cols` will be repeated
+according to the length of `df[i, col]`. These lengths must therefore be the
+same for each `col` in `cols`, or else an error is raised. Note that these
+elements are not copied, and thus if they are mutable changing them in the
+returned `DataFrame` will affect `df`.
+
+`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
+
+If `scalar` is passed then values that have this type in flattened columns
+are treated as scalars and broadcasted as many times as is needed to match
+lengths of values stored in other columns. One row is produced if all
+corresponding values are scalars.
+
+$METADATA_FIXED
+
+# Examples
+
+```jldoctest
+julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
+2×3 DataFrame
+ Row │ a      b       c
+     │ Int64  Array…  Array…
+─────┼───────────────────────
+   1 │     1  [1, 2]  [5, 6]
+   2 │     2  [3, 4]  [7, 8]
+
+julia> flatten(df1, :b)
+4×3 DataFrame
+ Row │ a      b      c
+     │ Int64  Int64  Array…
+─────┼──────────────────────
+   1 │     1      1  [5, 6]
+   2 │     1      2  [5, 6]
+   3 │     2      3  [7, 8]
+   4 │     2      4  [7, 8]
+
+julia> flatten(df1, [:b, :c])
+4×3 DataFrame
+ Row │ a      b      c
+     │ Int64  Int64  Int64
+─────┼─────────────────────
+   1 │     1      1      5
+   2 │     1      2      6
+   3 │     2      3      7
+   4 │     2      4      8
+
+julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")])
+2×2 DataFrame
+ Row │ a      b
+     │ Int64  Tuple…
+─────┼───────────────────
+   1 │     1  ("p", "q")
+   2 │     2  ("r", "s")
+
+julia> flatten(df2, :b)
+4×2 DataFrame
+ Row │ a      b
+     │ Int64  String
+─────┼───────────────
+   1 │     1  p
+   2 │     1  q
+   3 │     2  r
+   4 │     2  s
+
+julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])
+2×3 DataFrame
+ Row │ a      b       c
+     │ Int64  Array…  Array…
+─────┼───────────────────────
+   1 │     1  [1, 2]  [5, 6]
+   2 │     2  [3, 4]  [7]
+
+julia> flatten(df3, [:b, :c])
+ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
+
+julia> df4 = DataFrame(a=[1, 2, 3],
+                       b=[[1, 2], missing, missing],
+                       c=[[5, 6], missing, [7, 8]])
+3×3 DataFrame
+ Row │ a      b        c       
+     │ Int64  Array…?  Array…? 
+─────┼─────────────────────────
+   1 │     1  [1, 2]   [5, 6]
+   2 │     2  missing  missing 
+   3 │     3  missing  [7, 8]
+
+julia> flatten(df4, [:b, :c], scalar=Missing)
+5×3 DataFrame
+ Row │ a      b        c       
+     │ Int64  Int64?   Int64?  
+─────┼─────────────────────────
+   1 │     1        1        5
+   2 │     1        2        6
+   3 │     2  missing  missing 
+   4 │     3  missing        7
+   5 │     3  missing        8
+```
+"""
+function flatten(df::AbstractDataFrame,
+                 cols::Union{ColumnIndex, MultiColumnIndex};
+                 scalar::Type=Union{})
+    _check_consistency(df)
+
+    idxcols = index(df)[cols]
+    if isempty(idxcols)
+        cdf = copy(df)
+        _drop_all_nonnote_metadata!(cdf)
+        return cdf
+    end
+
+    col1 = first(idxcols)
+    lengths = Int[length_maybe_scalar(x, scalar) for x in df[!, col1]]
+    for (i, coli) in enumerate(idxcols)
+        i == 1 && continue
+        update_lengths!(lengths, df[!, coli], scalar, df, col1, coli)
+    end
+
+    # handle case where in all columns we had a scalar
+    # in this case we keep it one time
+    for i in 1:length(lengths)
+        lengths[i] == -1 && (lengths[i] = 1)
+    end
+
+    new_df = similar(df[!, Not(cols)], sum(lengths))
+    for name in _names(new_df)
+        repeat_lengths!(new_df[!, name], df[!, name], lengths)
+    end
+    length(idxcols) > 1 && sort!(idxcols)
+    for col in idxcols
+        col_to_flatten = df[!, col]
+        fast_path = eltype(col_to_flatten) isa AbstractVector &&
+                    !isempty(col_to_flatten)
+        flattened_col = if fast_path
+                reduce(vcat, col_to_flatten)
+            elseif scalar === Union{}
+                collect(Iterators.flatten(col_to_flatten))
+            else
+                collect(Iterators.flatten(v isa scalar ? Iterators.repeated(v, l) : v
+                                          for (l, v) in zip(lengths, col_to_flatten)))
+            end
+        insertcols!(new_df, col, _names(df)[col] => flattened_col)
+    end
+
+    _copy_all_note_metadata!(new_df, df)
+    return new_df
+end
+
+length_maybe_scalar(v, scalar::Type) = v isa scalar ? -1 : length(v)
+
+function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type,
+                         df::AbstractDataFrame, col1, coli)
+    for (i, v) in enumerate(col)
+        lv = length_maybe_scalar(v, scalar)
+        lv == -1 && continue
+        if lengths[i] == -1
+            lengths[i] = lv
+        elseif lengths[i] != lv
+            colnames = _names(df)
+            throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
+                                "and :$(colnames[coli]) are not the same in row $i"))
+        end
+    end
+end
+
+function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector,
+                         lengths::AbstractVector{Int})
+    counter = 1
+    @inbounds for i in eachindex(shortold)
+        l = lengths[i]
+        longnew[counter:(counter + l - 1)] .= Ref(shortold[i])
+        counter += l
+    end
+end
+
diff --git a/test/reshape.jl b/test/reshape.jl
index 58cf7bfce0..d00957297e 100644
--- a/test/reshape.jl
+++ b/test/reshape.jl
@@ -431,6 +431,45 @@ end
     @test flatten(DataFrame(), All()) == DataFrame()
 end
 
+@testset "flatten with scalar" begin
+    df = DataFrame(a=[1, 2, 3],
+                   b=[[1, 2], missing, [3, 4]],
+                   c=[[5, 6], missing, missing])
+    @test flatten(df, :a) ≅ df
+    @test_throws MethodError flatten(df, :b)
+    @test flatten(df, :b, scalar=Missing) ≅
+          DataFrame(a=[1, 1, 2, 3, 3],
+                    b=[1, 2, missing, 3, 4],
+                    c=[[5, 6], [5, 6], missing, missing, missing])
+    @test flatten(df, [:b, :c], scalar=Missing) ≅
+          DataFrame(a=[1, 1, 2, 3, 3],
+                    b=[1, 2, missing, 3, 4],
+                    c=[5, 6, missing, missing, missing])
+    @test flatten(df, [:b, :c], scalar=Any) ≅ df
+
+    df = DataFrame(a=missing, b=[1], c=missing, d=[[1, 2]])
+    @test_throws ArgumentError flatten(df, All(), scalar=Missing)
+    @test flatten(df, Not(:d), scalar=Missing) ≅
+        DataFrame(a=missing, b=1, c=missing, d=[[1, 2]])
+    @test flatten(df, Not(:b), scalar=Missing) ≅
+        DataFrame(a=[missing, missing], b=[1, 1], c=[missing, missing], d=[1, 2])
+
+    df = DataFrame(a="xy", b=[[1, 2]])
+    @test flatten(df, [:a, :b]) == DataFrame(a=['x', 'y'], b=[1, 2])
+    @test flatten(df, [:a, :b], scalar=String) ==
+          DataFrame(a=["xy", "xy"], b=[1, 2])
+
+    df = DataFrame(a=[[1], [], [3, 4], missing], b = missings(4), id=1:4)
+    @test flatten(df, [:a, :b], scalar=Missing) ≅
+          DataFrame(a=[1, 3, 4, missing], b=missings(4), id=[1, 3, 3, 4])
+    df = DataFrame(id=1:10, x=[1:i-1 for i in 1:10])
+    df.y = [iseven(last(v)) ? missing : v for v in df.x]
+    @test flatten(df, [:x, :y], scalar=Missing) ≅
+          DataFrame(id=reduce(vcat, [fill(i, i-1) for i in 2:10]),
+                    x=reduce(vcat, [1:i for i in 1:9]),
+                    y=reduce(vcat, [iseven(i) ? missings(i) : (1:i) for i in 1:9]))
+end
+
 @testset "stack categorical test" begin
     Random.seed!(1234)
     d1 = DataFrame(a=repeat([1:3;], inner=[4]),

From d764467937b368dcf366b800a46395fe8b3c301d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Wed, 28 Dec 2022 23:40:24 +0100
Subject: [PATCH 02/12] add to docs

---
 docs/src/lib/functions.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index 9b9de28471..67dbb7262a 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -85,6 +85,7 @@ insertcols!
 invpermute!
 mapcols
 mapcols!
+nest
 permute!
 prepend!
 push!
@@ -102,6 +103,7 @@ table_transformation
 transform
 transform!
 vcat
+unnest
 ```
 
 ## Reshaping data frames between tall and wide formats

From c85a275e2baf65d863436204ad1b45e638fa12db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Thu, 5 Jan 2023 09:57:39 +0100
Subject: [PATCH 03/12] Apply suggestions from code review

Co-authored-by: Milan Bouchet-Valat <nalimilan@club.fr>
---
 src/abstractdataframe/nest.jl | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl
index 966e79ece4..88f491635b 100644
--- a/src/abstractdataframe/nest.jl
+++ b/src/abstractdataframe/nest.jl
@@ -3,15 +3,16 @@
     nest(gdf::GroupedDataFrame, cols::Pair{Symbol}...)
     nest(gdf::GroupedDataFrame)
 
-Nest multiple columns per group of `gdf` into a single column as data frames.
+Return a data frame with one row for each group in `gdf` where
+or or more columns contain a data frame of the rows that belong to that group.
 
 Every `cols` argument must be a pair `column_selector => column_name`.
 If no `cols` are passed, then by default `valuecols(gdf) => :data`
 nesting is performed.
 
-Return a data frame having all grouping columns of `gdf` followed by one
-or more columns where `column_name` is a name of the column storing data frames,
-where every data frame consists of columns picked by `column_selector` values
+The returned data frame has all grouping columns of `gdf`, followed by one
+or more columns where `column_name` is the name of the column storing data frames,
+and every data frame consists of columns picked by `column_selector` values
 computed for each group of `gdf`.
 
 TODO: metadata
@@ -91,7 +92,7 @@ julia> n2.zx
 ─────┼─────────────
    1 │ d         4
 
-julia> n3 = nest(groupby(df, :id), :x => "x", [:y, :z] => "yz")
+julia> n3 = nest(groupby(df, :id), :x => :x, [:y, :z] => :yz)
 3×3 DataFrame
  Row │ id      x              yz
      │ String  DataFrame      DataFrame     
@@ -157,8 +158,10 @@ nest(gdf::GroupedDataFrame) = nest(gdf, valuecols(gdf) => :data)
            promote::Bool=true,
            makeunique::Bool=false, flatten::Bool=true)
 
-Unnest one or more columns `src` into multiple columns. The newly created
-columns are stored at the end of the data frame (and the `src` column is
+Extract the contents of one or more columns `cols` in `df` that contain data frames,
+returning a data frame with as many rows and columns as the nested data frames contain,
+in addition to original columns.
+The newly created columns are stored at the end of the data frame (and the `src` columns are
 dropped).
 
 Each `src` column must contain a `NamedTuple`, a `DataFrameRow`, a

From f2db6b184654f49cec37a0207a2d0fd43e7202b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Thu, 5 Jan 2023 09:57:59 +0100
Subject: [PATCH 04/12] add extract

---
 NEWS.md                   | 2 ++
 docs/src/lib/functions.md | 2 ++
 src/DataFrames.jl         | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index da12048624..d92ec60745 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -15,6 +15,8 @@
 * Joining functions now support `order` keyword argument allowing the user
   to specify the order of the rows in the produced table
   ([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233))
+* Add `nest`, `unnest`, `extract`, and `extract!` functions; improve `flatten`
+  ([#3258](https://github.com/JuliaData/DataFrames.jl/pull/3258))
 
 ## Bug fixes
 
diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index 67dbb7262a..ef010b45a0 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -76,6 +76,8 @@ rename!
 ```@docs
 append!
 combine
+extract
+extract!
 fillcombinations
 flatten
 hcat
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
index 4e117e72f3..cc6f1c5f10 100644
--- a/src/DataFrames.jl
+++ b/src/DataFrames.jl
@@ -63,6 +63,8 @@ export AbstractDataFrame,
        disallowmissing!,
        dropmissing!,
        dropmissing,
+       extract,
+       extract!,
        fillcombinations,
        flatten,
        groupby,

From 792d35513f43ac8eaec8060f396fdb5562638928 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Thu, 5 Jan 2023 10:21:47 +0100
Subject: [PATCH 05/12] initial implementation

---
 src/abstractdataframe/nest.jl | 142 +++++++++++++++++++++++++---------
 1 file changed, 107 insertions(+), 35 deletions(-)

diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl
index 88f491635b..f5a60d000d 100644
--- a/src/abstractdataframe/nest.jl
+++ b/src/abstractdataframe/nest.jl
@@ -1,7 +1,7 @@
 """
-    nest(gdf::GroupedDataFrame, cols::Pair{<:AbstractString}...)
-    nest(gdf::GroupedDataFrame, cols::Pair{Symbol}...)
-    nest(gdf::GroupedDataFrame)
+    nest(gdf::GroupedDataFrame, cols::Pair{<:Any, <:AbstractString}...; view::Bool=false)
+    nest(gdf::GroupedDataFrame, cols::Pair{<:Any, Symbol}...; view::Bool=false)
+    nest(gdf::GroupedDataFrame; view::Bool=false)
 
 Return a data frame with one row for each group in `gdf` where
 or or more columns contain a data frame of the rows that belong to that group.
@@ -15,6 +15,10 @@ or more columns where `column_name` is the name of the column storing data frame
 and every data frame consists of columns picked by `column_selector` values
 computed for each group of `gdf`.
 
+If `view=false` (the default) the nested data frames will hold copies of
+data from the source data frame. If `view=true` views of the source data frame
+will be created.
+
 TODO: metadata
 
 # Examples
@@ -143,29 +147,28 @@ julia> n3.yz
 ─────┼─────────────
    1 │    14  d
 """
-nest(gdf::GroupedDataFrame, cols::Pair{<:Any, <:AbstractString}...) =
-    combine(gdf, (sdf -> (; Symbol(dst) => select(sdf, index(sdf)[src]))
+nest(gdf::GroupedDataFrame, cols::Pair{<:Any, <:AbstractString}...;
+     view::Bool=false) =
+    combine(gdf, (sdf -> (; Symbol(dst) => select(sdf, index(sdf)[src], copycols=!view))
                   for (src, dst) in cols)...)
-nest(gdf::GroupedDataFrame, cols::Pair{<:Any, Symbol}...) =
-    combine(gdf, (sdf -> (; dst => select(sdf, index(sdf)[src]))
+nest(gdf::GroupedDataFrame, cols::Pair{<:Any, Symbol}...; view::Bool=false) =
+    combine(gdf, (sdf -> (; dst => select(sdf, index(sdf)[src], copycols=!view))
                   for (src, dst) in cols)...)
-nest(gdf::GroupedDataFrame) = nest(gdf, valuecols(gdf) => :data)
+nest(gdf::GroupedDataFrame; view::Bool=false) =
+    nest(gdf, valuecols(gdf) => :data, view=view)
 
 """
     unnest(df::AbstractDataFrame, src::ColumnIndex...;
            cols::Union{Symbol, AbstractVector{Symbol},
                        AbstractVector{<:AbstractString}}=:setequal,
-           promote::Bool=true,
-           makeunique::Bool=false, flatten::Bool=true)
-
-Extract the contents of one or more columns `cols` in `df` that contain data frames,
-returning a data frame with as many rows and columns as the nested data frames contain,
-in addition to original columns.
-The newly created columns are stored at the end of the data frame (and the `src` columns are
-dropped).
+           promote::Bool=true, makeunique::Bool=false)
 
-Each `src` column must contain a `NamedTuple`, a `DataFrameRow`, a
-`Tables.AbstractRow`, or Tables.jl table.
+Extract the contents of one or more columns `src` in `df` that contain
+Tables.jl tables, returning a data frame with as many rows and columns as the
+nested tables contain, in addition to the original columns, whose contents
+get appropriately repeated to match the number of rows of the unnested tables.
+The newly created columns are stored at the end of the data frame (and the
+`src` columns are dropped).
 
 `cols` (default `:setequal`) and `promote` (default `true`) keyword arguments
 have the same meaning as in [`push!`](@ref).
@@ -174,9 +177,6 @@ If `makeunique=false` (the default) produced column names must be unique.
 If `makeunique=true` then duplicate column names will be suffixed with `_i`
 (`i` starting at `1` for the first duplicate).
 
-If `flatten=true` (the default) then newly created columns are flattened
-using [`flatten`](@ref) with `scalar=Missing` keyword argument.
-
 TODO: metadata
 
 """
@@ -184,7 +184,7 @@ function unnest(df::AbstractDataFrame, src::ColumnIndex...;
                 cols::Union{Symbol, AbstractVector{Symbol},
                             AbstractVector{<:AbstractString}}=:setequal,
                 promote::Bool=(cols in [:union, :subset]),
-                makeunique::Bool=false, flatten::Bool=true)
+                makeunique::Bool=false)
     ref_df = select(df, Not(collect(Any, src)))
     col_count = ncol(ref_df)
     for idx in src
@@ -192,22 +192,96 @@ function unnest(df::AbstractDataFrame, src::ColumnIndex...;
         tmp_df = DataFrame()
         for v in col
             if v isa DataFrame # produce DataFrameRow
-                # if flatten=false make a copy to avoid aliases
-                v = DataFrame([n => [flatten ? c : copy(c)]
-                               for (n, c) in pairs(eachcol(v))],
+                v = DataFrame([n => [c] for (n, c) in pairs(eachcol(v))],
                               copycols=false) |> only
-            elseif Tables.istable(v) # produce NamedTuple
+            else # produce NamedTuple
                 v = Tables.columntable(v)
             end
             push!(tmp_df, v, cols=cols, promote=promote)
         end
         hcat!(ref_df, tmp_df, makeunique=makeunique, copycols=false)
     end
-    return if flatten
-        DataFrames.flatten(ref_df, col_count+1:ncol(ref_df), scalar=Missing)
-    else
-        ref_df
+    return flatten(ref_df, col_count+1:ncol(ref_df), scalar=Missing)
+end
+
+"""
+    expand(df::AbstractDataFrame, src::ColumnIndex...;
+           cols::Union{Symbol, AbstractVector{Symbol},
+                       AbstractVector{<:AbstractString}}=:setequal,
+           promote::Bool=true, makeunique::Bool=false)
+
+Extract the contents of one or more columns `cols` in `df` that contain
+`NamedTuple`, a `DataFrameRow`, a `Tables.AbstractRow` elements
+returning a data frame with expanded columns, in addition to original columns.
+The newly created columns are stored at the end of the data frame (and the
+`src` columns are dropped).
+
+`cols` (default `:setequal`) and `promote` (default `true`) keyword arguments
+have the same meaning as in [`push!`](@ref).
+
+If `makeunique=false` (the default) produced column names must be unique.
+If `makeunique=true` then duplicate column names will be suffixed with `_i`
+(`i` starting at `1` for the first duplicate).
+
+TODO: metadata
+
+"""
+function expand(df::AbstractDataFrame, src::ColumnIndex...;
+                cols::Union{Symbol, AbstractVector{Symbol},
+                            AbstractVector{<:AbstractString}}=:setequal,
+                promote::Bool=(cols in [:union, :subset]),
+                makeunique::Bool=false)
+    ref_df = select(df, Not(collect(Any, src)))
+    for idx in src
+        col = df[!, idx]
+        tmp_df = DataFrame()
+        for v in col
+            push!(tmp_df, v, cols=cols, promote=promote)
+        end
+        hcat!(ref_df, tmp_df, makeunique=makeunique, copycols=false)
+    end
+    return ref_df
+end
+
+"""
+    expand!(df::AbstractDataFrame, src::ColumnIndex...;
+            cols::Union{Symbol, AbstractVector{Symbol},
+                        AbstractVector{<:AbstractString}}=:setequal,
+            promote::Bool=true, makeunique::Bool=false)
+
+Extract in-place the contents of one or more columns `cols` in `df` that contain
+`NamedTuple`, a `DataFrameRow`, a `Tables.AbstractRow` elements
+returning a data frame with expanded columns, in addition to original columns.
+The newly created columns are stored at the end of the data frame (and the
+`src` columns are dropped).
+
+`cols` (default `:setequal`) and `promote` (default `true`) keyword arguments
+have the same meaning as in [`push!`](@ref).
+
+If `makeunique=false` (the default) produced column names must be unique.
+If `makeunique=true` then duplicate column names will be suffixed with `_i`
+(`i` starting at `1` for the first duplicate).
+
+TODO: metadata
+
+"""
+function expand!(df::AbstractDataFrame, src::ColumnIndex...;
+                 cols::Union{Symbol, AbstractVector{Symbol},
+                             AbstractVector{<:AbstractString}}=:setequal,
+                 promote::Bool=(cols in [:union, :subset]),
+                 makeunique::Bool=false)
+    tmp_dfs = DataFrame[]
+    for idx in src
+        col = df[!, idx]
+        tmp_df = DataFrame()
+        for v in col
+            push!(tmp_df, v, cols=cols, promote=promote)
+        end
+        push!(tmp_dfs, tmp_df)
     end
+    ref_df = select!(df, Not(collect(Any, src)))
+    hcat!(ref_df, tmp_dfs..., makeunique=makeunique, copycols=false)
+    return ref_df
 end
 
 """
@@ -328,7 +402,7 @@ function flatten(df::AbstractDataFrame,
     end
 
     col1 = first(idxcols)
-    lengths = Int[length_maybe_scalar(x, scalar) for x in df[!, col1]]
+    lengths = Int[x isa scalar ? -1 : length(x) for x in df[!, col1]]
     for (i, coli) in enumerate(idxcols)
         i == 1 && continue
         update_lengths!(lengths, df[!, coli], scalar, df, col1, coli)
@@ -364,13 +438,11 @@ function flatten(df::AbstractDataFrame,
     return new_df
 end
 
-length_maybe_scalar(v, scalar::Type) = v isa scalar ? -1 : length(v)
-
 function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type,
                          df::AbstractDataFrame, col1, coli)
     for (i, v) in enumerate(col)
-        lv = length_maybe_scalar(v, scalar)
-        lv == -1 && continue
+        v isa scalar && continue
+        lv = length(v)
         if lengths[i] == -1
             lengths[i] = lv
         elseif lengths[i] != lv

From 7d05ac82dd86498923a5fb19003f60276a010f0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Thu, 5 Jan 2023 20:12:52 +0100
Subject: [PATCH 06/12] change default cols to :union

---
 src/abstractdataframe/nest.jl | 36 +++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl
index f5a60d000d..05282d1cd1 100644
--- a/src/abstractdataframe/nest.jl
+++ b/src/abstractdataframe/nest.jl
@@ -160,7 +160,7 @@ nest(gdf::GroupedDataFrame; view::Bool=false) =
 """
     unnest(df::AbstractDataFrame, src::ColumnIndex...;
            cols::Union{Symbol, AbstractVector{Symbol},
-                       AbstractVector{<:AbstractString}}=:setequal,
+                       AbstractVector{<:AbstractString}}=:union,
            promote::Bool=true, makeunique::Bool=false)
 
 Extract the contents of one or more columns `cols` in `df` that contain
@@ -170,7 +170,7 @@ gets appropriately repeated to match the number of rows of the unnested tables.
 The newly created columns are stored at the end of the data frame (and the
 `src` columns are dropped).
 
-`cols` (default `:setequal`) and `promote` (default `true`) keyword arguments
+`cols` (default `:union`) and `promote` (default `true`) keyword arguments
 have the same meaning as in [`push!`](@ref).
 
 If `makeunique=false` (the default) produced column names must be unique.
@@ -182,7 +182,7 @@ TODO: metadata
 """
 function unnest(df::AbstractDataFrame, src::ColumnIndex...;
                 cols::Union{Symbol, AbstractVector{Symbol},
-                            AbstractVector{<:AbstractString}}=:setequal,
+                            AbstractVector{<:AbstractString}}=:union,
                 promote::Bool=(cols in [:union, :subset]),
                 makeunique::Bool=false)
     ref_df = select(df, Not(collect(Any, src)))
@@ -207,16 +207,16 @@ end
 """
     expand(df::AbstractDataFrame, src::ColumnIndex...;
            cols::Union{Symbol, AbstractVector{Symbol},
-                       AbstractVector{<:AbstractString}}=:setequal,
+                       AbstractVector{<:AbstractString}}=:union,
            promote::Bool=true, makeunique::Bool=false)
 
 Extract the contents of one or more columns `cols` in `df` that contain
-`NamedTuple`, a `DataFrameRow`, a `Tables.AbstractRow` elements
-returning a data frame with expanded columns, in addition to original columns.
-The newly created columns are stored at the end of the data frame (and the
-`src` columns are dropped).
+`NamedTuple`, a `DataFrameRow`, an `AbstractDict` or a `Tables.AbstractRow`
+elements returning a data frame with expanded columns, in addition to original
+columns. The newly created columns are stored at the end of the data frame (and
+the `src` columns are dropped).
 
-`cols` (default `:setequal`) and `promote` (default `true`) keyword arguments
+`cols` (default `:union`) and `promote` (default `true`) keyword arguments
 have the same meaning as in [`push!`](@ref).
 
 If `makeunique=false` (the default) produced column names must be unique.
@@ -228,7 +228,7 @@ TODO: metadata
 """
 function expand(df::AbstractDataFrame, src::ColumnIndex...;
                 cols::Union{Symbol, AbstractVector{Symbol},
-                            AbstractVector{<:AbstractString}}=:setequal,
+                            AbstractVector{<:AbstractString}}=:union,
                 promote::Bool=(cols in [:union, :subset]),
                 makeunique::Bool=false)
     ref_df = select(df, Not(collect(Any, src)))
@@ -246,16 +246,16 @@ end
 """
     expand!(df::AbstractDataFrame, src::ColumnIndex...;
             cols::Union{Symbol, AbstractVector{Symbol},
-                        AbstractVector{<:AbstractString}}=:setequal,
+                        AbstractVector{<:AbstractString}}=:union,
             promote::Bool=true, makeunique::Bool=false)
 
-Extract in-place the contents of one or more columns `cols` in `df` that contain
-`NamedTuple`, a `DataFrameRow`, a `Tables.AbstractRow` elements
-returning a data frame with expanded columns, in addition to original columns.
-The newly created columns are stored at the end of the data frame (and the
-`src` columns are dropped).
+Extract in-place the contents of one or more columns `cols` in `df` that
+contain `NamedTuple`, a `DataFrameRow`, an `AbstractDict`, or a
+`Tables.AbstractRow` elements returning a data frame with expanded columns, in
+addition to original columns. The newly created columns are stored at the end
+of the data frame (and the `src` columns are dropped).
 
-`cols` (default `:setequal`) and `promote` (default `true`) keyword arguments
+`cols` (default `:union`) and `promote` (default `true`) keyword arguments
 have the same meaning as in [`push!`](@ref).
 
 If `makeunique=false` (the default) produced column names must be unique.
@@ -267,7 +267,7 @@ TODO: metadata
 """
 function expand!(df::AbstractDataFrame, src::ColumnIndex...;
                  cols::Union{Symbol, AbstractVector{Symbol},
-                             AbstractVector{<:AbstractString}}=:setequal,
+                             AbstractVector{<:AbstractString}}=:union,
                  promote::Bool=(cols in [:union, :subset]),
                  makeunique::Bool=false)
     tmp_dfs = DataFrame[]

From 0e58244edbd956067579c523abd3ee3f764aa744 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Thu, 5 Jan 2023 23:03:08 +0100
Subject: [PATCH 07/12] fix wrong function name

---
 docs/src/lib/functions.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index ef010b45a0..79c8e77aae 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -76,8 +76,8 @@ rename!
 ```@docs
 append!
 combine
-extract
-extract!
+expand
+expand!
 fillcombinations
 flatten
 hcat

From 40935331a825143192e2a0f6399bcdc66e88761d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 8 Jan 2023 18:30:11 +0100
Subject: [PATCH 08/12] remove cols and promote

---
 src/abstractdataframe/nest.jl | 64 ++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 27 deletions(-)

diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl
index 05282d1cd1..86fe673af5 100644
--- a/src/abstractdataframe/nest.jl
+++ b/src/abstractdataframe/nest.jl
@@ -157,11 +157,28 @@ nest(gdf::GroupedDataFrame, cols::Pair{<:Any, Symbol}...; view::Bool=false) =
 nest(gdf::GroupedDataFrame; view::Bool=false) =
     nest(gdf, valuecols(gdf) => :data, view=view)
 
+const UNNESTING_COMMON = """
+`cols` argument affects the created columns in the following way:
+* If `cols == :setequal` then each row must contain exactly the same columns
+  (but possibly in a different order).
+* If `cols == :orderequal` then each row must contain the same columns in the
+  same order (for `AbstractDict` this option requires that `keys` of row matches
+  to allow for support of ordered dicts; however, if row is a `Dict` an error is
+  thrown as it is an unordered collection).
+* If `cols == :union` (the default) then each row can contain different
+  columns and a `missing` value is pushed to columns missing in a given row
+  that are present in other rows.
+
+If `promote=true` (the default) and the element type of a column does not allow
+the type of a pushed argument then a new column with a promoted element type
+allowing it is freshly allocated and stored in `df`. If `promote=false` an error
+is thrown.
+
+"""
+
 """
     unnest(df::AbstractDataFrame, src::ColumnIndex...;
-           cols::Union{Symbol, AbstractVector{Symbol},
-                       AbstractVector{<:AbstractString}}=:union,
-           promote::Bool=true, makeunique::Bool=false)
+           makeunique::Bool=false)
 
 Extract the contents of one or more columns `cols` in `df` that contain
 Tables.jl tables, returning a data frame with as many rows and columns as the
@@ -170,8 +187,10 @@ gets appropriately repeated to match the number of rows of the unnested tables.
 The newly created columns are stored at the end of the data frame (and the
 `src` columns are dropped).
 
-`cols` (default `:union`) and `promote` (default `true`) keyword arguments
-have the same meaning as in [`push!`](@ref).
+Tables stored in rows of `src` can have different columns. A `missing` value is
+pushed to columns missing in a given row that are present in other rows.
+The element type of each resulting column is determined by promotion of element
+types of columns in individual rows.
 
 If `makeunique=false` (the default) produced column names must be unique.
 If `makeunique=true` then duplicate column names will be suffixed with `_i`
@@ -181,9 +200,6 @@ TODO: metadata
 
 """
 function unnest(df::AbstractDataFrame, src::ColumnIndex...;
-                cols::Union{Symbol, AbstractVector{Symbol},
-                            AbstractVector{<:AbstractString}}=:union,
-                promote::Bool=(cols in [:union, :subset]),
                 makeunique::Bool=false)
     ref_df = select(df, Not(collect(Any, src)))
     col_count = ncol(ref_df)
@@ -197,7 +213,7 @@ function unnest(df::AbstractDataFrame, src::ColumnIndex...;
             else # produce NamedTuple
                 v = Tables.columntable(v)
             end
-            push!(tmp_df, v, cols=cols, promote=promote)
+            push!(tmp_df, v, cols=:union, promote=true)
         end
         hcat!(ref_df, tmp_df, makeunique=makeunique, copycols=false)
     end
@@ -206,9 +222,7 @@ end
 
 """
     expand(df::AbstractDataFrame, src::ColumnIndex...;
-           cols::Union{Symbol, AbstractVector{Symbol},
-                       AbstractVector{<:AbstractString}}=:union,
-           promote::Bool=true, makeunique::Bool=false)
+           makeunique::Bool=false)
 
 Extract the contents of one or more columns `cols` in `df` that contain
 `NamedTuple`, a `DataFrameRow`, an `AbstractDict` or a `Tables.AbstractRow`
@@ -216,8 +230,10 @@ elements returning a data frame with expanded columns, in addition to original
 columns. The newly created columns are stored at the end of the data frame (and
 the `src` columns are dropped).
 
-`cols` (default `:union`) and `promote` (default `true`) keyword arguments
-have the same meaning as in [`push!`](@ref).
+Elements stored in `src` can have different fields. A `missing` value is stored
+in columns for which a given element has no field that is present in other rows.
+The element type of each resulting column is determined by promotion of the types
+of the values stored in individual rows.
 
 If `makeunique=false` (the default) produced column names must be unique.
 If `makeunique=true` then duplicate column names will be suffixed with `_i`
@@ -227,16 +243,13 @@ TODO: metadata
 
 """
 function expand(df::AbstractDataFrame, src::ColumnIndex...;
-                cols::Union{Symbol, AbstractVector{Symbol},
-                            AbstractVector{<:AbstractString}}=:union,
-                promote::Bool=(cols in [:union, :subset]),
                 makeunique::Bool=false)
     ref_df = select(df, Not(collect(Any, src)))
     for idx in src
         col = df[!, idx]
         tmp_df = DataFrame()
         for v in col
-            push!(tmp_df, v, cols=cols, promote=promote)
+            push!(tmp_df, v, cols=:union, promote=true)
         end
         hcat!(ref_df, tmp_df, makeunique=makeunique, copycols=false)
     end
@@ -245,9 +258,7 @@ end
 
 """
     expand!(df::AbstractDataFrame, src::ColumnIndex...;
-            cols::Union{Symbol, AbstractVector{Symbol},
-                        AbstractVector{<:AbstractString}}=:union,
-            promote::Bool=true, makeunique::Bool=false)
+            makeunique::Bool=false)
 
 Extract in-place the contents of one or more columns `cols` in `df` that
 contain `NamedTuple`, a `DataFrameRow`, an `AbstractDict`, or a
@@ -255,8 +266,10 @@ contain `NamedTuple`, a `DataFrameRow`, an `AbstractDict`, or a
 addition to original columns. The newly created columns are stored at the end
 of the data frame (and the `src` columns are dropped).
 
-`cols` (default `:union`) and `promote` (default `true`) keyword arguments
-have the same meaning as in [`push!`](@ref).
+Elements stored in `src` can have different fields. A `missing` value is stored
+in columns for which a given element has no field that is present in other rows.
+The element type of each resulting column is determined by promotion of the types
+of the values stored in individual rows.
 
 If `makeunique=false` (the default) produced column names must be unique.
 If `makeunique=true` then duplicate column names will be suffixed with `_i`
@@ -266,16 +279,13 @@ TODO: metadata
 
 """
 function expand!(df::AbstractDataFrame, src::ColumnIndex...;
-                 cols::Union{Symbol, AbstractVector{Symbol},
-                             AbstractVector{<:AbstractString}}=:union,
-                 promote::Bool=(cols in [:union, :subset]),
                  makeunique::Bool=false)
     tmp_dfs = DataFrame[]
     for idx in src
         col = df[!, idx]
         tmp_df = DataFrame()
         for v in col
-            push!(tmp_df, v, cols=cols, promote=promote)
+            push!(tmp_df, v, cols=:union, promote=true)
         end
         push!(tmp_dfs, tmp_df)
     end

From 698330f02bf653e247eba447b565becd89b656b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Mon, 9 Jan 2023 15:23:35 +0100
Subject: [PATCH 09/12] change to extract

---
 docs/src/lib/functions.md     |  4 ++--
 src/abstractdataframe/nest.jl | 32 +++++++++++++++-----------------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
index 79c8e77aae..ef010b45a0 100644
--- a/docs/src/lib/functions.md
+++ b/docs/src/lib/functions.md
@@ -76,8 +76,8 @@ rename!
 ```@docs
 append!
 combine
-expand
-expand!
+extract
+extract!
 fillcombinations
 flatten
 hcat
diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl
index 86fe673af5..d5237a0112 100644
--- a/src/abstractdataframe/nest.jl
+++ b/src/abstractdataframe/nest.jl
@@ -205,28 +205,26 @@ function unnest(df::AbstractDataFrame, src::ColumnIndex...;
     col_count = ncol(ref_df)
     for idx in src
         col = df[!, idx]
-        tmp_df = DataFrame()
-        for v in col
-            if v isa DataFrame # produce DataFrameRow
-                v = DataFrame([n => [c] for (n, c) in pairs(eachcol(v))],
-                              copycols=false) |> only
-            else # produce NamedTuple
-                v = Tables.columntable(v)
+        if all(x -> x isa AbstractDataFrame, col)
+            tmp_df = reduce(vcat, col, cols=:union)
+        else
+            tmp_df = DataFrame()
+            for v in col
+                append!(tmp_df, v, cols=:union, promote=true)
             end
-            push!(tmp_df, v, cols=:union, promote=true)
         end
         hcat!(ref_df, tmp_df, makeunique=makeunique, copycols=false)
     end
-    return flatten(ref_df, col_count+1:ncol(ref_df), scalar=Missing)
+    return ref_df
 end
 
 """
-    expand(df::AbstractDataFrame, src::ColumnIndex...;
-           makeunique::Bool=false)
+    extract(df::AbstractDataFrame, src::ColumnIndex...;
+            makeunique::Bool=false)
 
 Extract the contents of one or more columns `cols` in `df` that contain
 `NamedTuple`, a `DataFrameRow`, an `AbstractDict` or a `Tables.AbstractRow`
-elements returning a data frame with expanded columns, in addition to original
+elements returning a data frame with extracted columns, in addition to original
 columns. The newly created columns are stored at the end of the data frame (and
 the `src` columns are dropped).
 
@@ -242,7 +240,7 @@ If `makeunique=true` then duplicate column names will be suffixed with `_i`
 TODO: metadata
 
 """
-function expand(df::AbstractDataFrame, src::ColumnIndex...;
+function extract(df::AbstractDataFrame, src::ColumnIndex...;
                 makeunique::Bool=false)
     ref_df = select(df, Not(collect(Any, src)))
     for idx in src
@@ -257,12 +255,12 @@ function expand(df::AbstractDataFrame, src::ColumnIndex...;
 end
 
 """
-    expand!(df::AbstractDataFrame, src::ColumnIndex...;
-            makeunique::Bool=false)
+    extract!(df::AbstractDataFrame, src::ColumnIndex...;
+             makeunique::Bool=false)
 
 Extract in-place the contents of one or more columns `cols` in `df` that
 contain `NamedTuple`, a `DataFrameRow`, an `AbstractDict`, or a
-`Tables.AbstractRow` elements returning a data frame with expanded columns, in
+`Tables.AbstractRow` elements returning a data frame with extracted columns, in
 addition to original columns. The newly created columns are stored at the end
 of the data frame (and the `src` columns are dropped).
 
@@ -278,7 +276,7 @@ If `makeunique=true` then duplicate column names will be suffixed with `_i`
 TODO: metadata
 
 """
-function expand!(df::AbstractDataFrame, src::ColumnIndex...;
+function extract!(df::AbstractDataFrame, src::ColumnIndex...;
                  makeunique::Bool=false)
     tmp_dfs = DataFrame[]
     for idx in src

From 70178676f793cb58de1dc428e205f0ec4e22e803 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 5 Feb 2023 09:19:57 +0100
Subject: [PATCH 10/12] remove `flatten` from the PR

---
 NEWS.md                                    |   2 +-
 src/abstractdataframe/abstractdataframe.jl | 120 +++++++++-
 src/abstractdataframe/nest.jl              | 254 +++------------------
 test/reshape.jl                            |  39 ----
 4 files changed, 157 insertions(+), 258 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index d92ec60745..5a38cd4f04 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -15,7 +15,7 @@
 * Joining functions now support `order` keyword argument allowing the user
   to specify the order of the rows in the produced table
   ([#3233](https://github.com/JuliaData/DataFrames.jl/pull/3233))
-* Add `nest`, `unnest`, `extract`, and `extract!` functions; improve `flatten`
+* Add `nest`, `unnest`, `extract`, and `extract!` functions
   ([#3258](https://github.com/JuliaData/DataFrames.jl/pull/3258))
 
 ## Bug fixes
diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index ec85d5d458..7654f5d566 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -2502,6 +2502,125 @@ function Missings.allowmissing(df::AbstractDataFrame,
     return new_df
 end
 
+"""
+    flatten(df::AbstractDataFrame, cols)
+When columns `cols` of data frame `df` have iterable elements that define
+`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
+element of each `col` in `cols` is flattened, meaning the column corresponding
+to `col` becomes a longer vector where the original entries are concatenated.
+Elements of row `i` of `df` in columns other than `cols` will be repeated
+according to the length of `df[i, col]`. These lengths must therefore be the
+same for each `col` in `cols`, or else an error is raised. Note that these
+elements are not copied, and thus if they are mutable changing them in the
+returned `DataFrame` will affect `df`.
+`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
+$METADATA_FIXED
+# Examples
+```jldoctest
+julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
+2×3 DataFrame
+ Row │ a      b       c
+     │ Int64  Array…  Array…
+─────┼───────────────────────
+   1 │     1  [1, 2]  [5, 6]
+   2 │     2  [3, 4]  [7, 8]
+julia> flatten(df1, :b)
+4×3 DataFrame
+ Row │ a      b      c
+     │ Int64  Int64  Array…
+─────┼──────────────────────
+   1 │     1      1  [5, 6]
+   2 │     1      2  [5, 6]
+   3 │     2      3  [7, 8]
+   4 │     2      4  [7, 8]
+julia> flatten(df1, [:b, :c])
+4×3 DataFrame
+ Row │ a      b      c
+     │ Int64  Int64  Int64
+─────┼─────────────────────
+   1 │     1      1      5
+   2 │     1      2      6
+   3 │     2      3      7
+   4 │     2      4      8
+julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")])
+2×2 DataFrame
+ Row │ a      b
+     │ Int64  Tuple…
+─────┼───────────────────
+   1 │     1  ("p", "q")
+   2 │     2  ("r", "s")
+julia> flatten(df2, :b)
+4×2 DataFrame
+ Row │ a      b
+     │ Int64  String
+─────┼───────────────
+   1 │     1  p
+   2 │     1  q
+   3 │     2  r
+   4 │     2  s
+julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])
+2×3 DataFrame
+ Row │ a      b       c
+     │ Int64  Array…  Array…
+─────┼───────────────────────
+   1 │     1  [1, 2]  [5, 6]
+   2 │     2  [3, 4]  [7]
+julia> flatten(df3, [:b, :c])
+ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
+```
+"""
+function flatten(df::AbstractDataFrame,
+                 cols::Union{ColumnIndex, MultiColumnIndex})
+    _check_consistency(df)
+
+    idxcols = index(df)[cols]
+    if isempty(idxcols)
+        cdf = copy(df)
+        _drop_all_nonnote_metadata!(cdf)
+        return cdf
+    end
+
+    col1 = first(idxcols)
+    lengths = length.(df[!, col1])
+    for col in idxcols
+        v = df[!, col]
+        if any(x -> length(x[1]) != x[2], zip(v, lengths))
+            r = findfirst(x -> x != 0, length.(v) .- lengths)
+            colnames = _names(df)
+            throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
+                                "and :$(colnames[col]) are not the same in row $r"))
+        end
+    end
+
+    new_df = similar(df[!, Not(cols)], sum(lengths))
+    for name in _names(new_df)
+        repeat_lengths!(new_df[!, name], df[!, name], lengths)
+    end
+    length(idxcols) > 1 && sort!(idxcols)
+    for col in idxcols
+        col_to_flatten = df[!, col]
+        fast_path = eltype(col_to_flatten) isa AbstractVector &&
+                    !isempty(col_to_flatten)
+        flattened_col = fast_path ?
+            reduce(vcat, col_to_flatten) :
+            collect(Iterators.flatten(col_to_flatten))
+        insertcols!(new_df, col, _names(df)[col] => flattened_col)
+    end
+
+    _copy_all_note_metadata!(new_df, df)
+    return new_df
+end
+
+function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector,
+                         lengths::AbstractVector{Int})
+    counter = 1
+    @inbounds for i in eachindex(shortold)
+        l = lengths[i]
+        longnew[counter:(counter + l - 1)] .= Ref(shortold[i])
+        counter += l
+    end
+end
+
 # Disallowed getindex and setindex! operations that are a common mistake
 
 Base.getindex(::AbstractDataFrame, ::Union{Symbol, Integer, AbstractString}) =
@@ -3272,4 +3391,3 @@ function Base.iterate(itr::Iterators.PartitionIterator{<:AbstractDataFrame}, sta
     r = min(state + itr.n - 1, last_idx)
     return view(itr.c, state:r, :), r + 1
 end
-
diff --git a/src/abstractdataframe/nest.jl b/src/abstractdataframe/nest.jl
index d5237a0112..03d1131295 100644
--- a/src/abstractdataframe/nest.jl
+++ b/src/abstractdataframe/nest.jl
@@ -26,8 +26,8 @@ TODO: metadata
 julia> df = DataFrame(id = ["b", "a", "a", "c", "b", "b"],
                       x = 1:6, y = 11:16, z='a':'f')
 6×4 DataFrame
- Row │ id      x      y      z    
-     │ String  Int64  Int64  Char 
+ Row │ id      x      y      z
+     │ String  Int64  Int64  Char
 ─────┼────────────────────────────
    1 │ b           1     11  a
    2 │ a           2     12  b
@@ -39,110 +39,110 @@ julia> df = DataFrame(id = ["b", "a", "a", "c", "b", "b"],
 julia> n1 = nest(groupby(df, :id))
 3×2 DataFrame
  Row │ id      data
-     │ String  DataFrame     
+     │ String  DataFrame
 ─────┼───────────────────────
-   1 │ b       3×3 DataFrame 
-   2 │ a       2×3 DataFrame 
-   3 │ c       1×3 DataFrame 
+   1 │ b       3×3 DataFrame
+   2 │ a       2×3 DataFrame
+   3 │ c       1×3 DataFrame
 
 julia> n1.data
 3-element Vector{DataFrame}:
  3×3 DataFrame
- Row │ x      y      z    
-     │ Int64  Int64  Char 
+ Row │ x      y      z
+     │ Int64  Int64  Char
 ─────┼────────────────────
    1 │     1     11  a
    2 │     5     15  e
    3 │     6     16  f
  2×3 DataFrame
- Row │ x      y      z    
-     │ Int64  Int64  Char 
+ Row │ x      y      z
+     │ Int64  Int64  Char
 ─────┼────────────────────
    1 │     2     12  b
    2 │     3     13  c
  1×3 DataFrame
- Row │ x      y      z    
-     │ Int64  Int64  Char 
+ Row │ x      y      z
+     │ Int64  Int64  Char
 ─────┼────────────────────
    1 │     4     14  d
 
 julia> n2 = nest(groupby(df, :id), [:z, :x] => :zx)
 3×2 DataFrame
  Row │ id      zx
-     │ String  DataFrame     
+     │ String  DataFrame
 ─────┼───────────────────────
-   1 │ b       3×2 DataFrame 
-   2 │ a       2×2 DataFrame 
-   3 │ c       1×2 DataFrame 
+   1 │ b       3×2 DataFrame
+   2 │ a       2×2 DataFrame
+   3 │ c       1×2 DataFrame
 
 julia> n2.zx
 3-element Vector{DataFrame}:
  3×2 DataFrame
- Row │ z     x     
-     │ Char  Int64 
+ Row │ z     x
+     │ Char  Int64
 ─────┼─────────────
    1 │ a         1
    2 │ e         5
    3 │ f         6
  2×2 DataFrame
- Row │ z     x     
-     │ Char  Int64 
+ Row │ z     x
+     │ Char  Int64
 ─────┼─────────────
    1 │ b         2
    2 │ c         3
  1×2 DataFrame
- Row │ z     x     
-     │ Char  Int64 
+ Row │ z     x
+     │ Char  Int64
 ─────┼─────────────
    1 │ d         4
 
 julia> n3 = nest(groupby(df, :id), :x => :x, [:y, :z] => :yz)
 3×3 DataFrame
  Row │ id      x              yz
-     │ String  DataFrame      DataFrame     
+     │ String  DataFrame      DataFrame
 ─────┼──────────────────────────────────────
-   1 │ b       3×1 DataFrame  3×2 DataFrame 
-   2 │ a       2×1 DataFrame  2×2 DataFrame 
-   3 │ c       1×1 DataFrame  1×2 DataFrame 
+   1 │ b       3×1 DataFrame  3×2 DataFrame
+   2 │ a       2×1 DataFrame  2×2 DataFrame
+   3 │ c       1×1 DataFrame  1×2 DataFrame
 
 julia> n3.x
 3-element Vector{DataFrame}:
  3×1 DataFrame
- Row │ x     
-     │ Int64 
+ Row │ x
+     │ Int64
 ─────┼───────
    1 │     1
    2 │     5
    3 │     6
  2×1 DataFrame
- Row │ x     
-     │ Int64 
+ Row │ x
+     │ Int64
 ─────┼───────
    1 │     2
    2 │     3
  1×1 DataFrame
- Row │ x     
-     │ Int64 
+ Row │ x
+     │ Int64
 ─────┼───────
    1 │     4
 
 julia> n3.yz
 3-element Vector{DataFrame}:
  3×2 DataFrame
- Row │ y      z    
-     │ Int64  Char 
+ Row │ y      z
+     │ Int64  Char
 ─────┼─────────────
    1 │    11  a
    2 │    15  e
    3 │    16  f
  2×2 DataFrame
- Row │ y      z    
-     │ Int64  Char 
+ Row │ y      z
+     │ Int64  Char
 ─────┼─────────────
    1 │    12  b
    2 │    13  c
  1×2 DataFrame
- Row │ y      z    
+ Row │ y      z
      │ Int64  Char
 ─────┼─────────────
    1 │    14  d
@@ -291,183 +291,3 @@ function extract!(df::AbstractDataFrame, src::ColumnIndex...;
     hcat!(ref_df, tmp_dfs..., makeunique=makeunique, copycols=false)
     return ref_df
 end
-
-"""
-    flatten(df::AbstractDataFrame, cols; scalar::Type)
-
-When columns `cols` of data frame `df` have iterable elements that define
-`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
-element of each `col` in `cols` is flattened, meaning the column corresponding
-to `col` becomes a longer vector where the original entries are concatenated.
-Elements of row `i` of `df` in columns other than `cols` will be repeated
-according to the length of `df[i, col]`. These lengths must therefore be the
-same for each `col` in `cols`, or else an error is raised. Note that these
-elements are not copied, and thus if they are mutable changing them in the
-returned `DataFrame` will affect `df`.
-
-`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
-
-If `scalar` is passed then values that have this type in flattened columns
-are treated as scalars and broadcasted as many times as is needed to match
-lengths of values stored in other columns. One row is produced if all
-corresponding values are scalars.
-
-$METADATA_FIXED
-
-# Examples
-
-```jldoctest
-julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
-2×3 DataFrame
- Row │ a      b       c
-     │ Int64  Array…  Array…
-─────┼───────────────────────
-   1 │     1  [1, 2]  [5, 6]
-   2 │     2  [3, 4]  [7, 8]
-
-julia> flatten(df1, :b)
-4×3 DataFrame
- Row │ a      b      c
-     │ Int64  Int64  Array…
-─────┼──────────────────────
-   1 │     1      1  [5, 6]
-   2 │     1      2  [5, 6]
-   3 │     2      3  [7, 8]
-   4 │     2      4  [7, 8]
-
-julia> flatten(df1, [:b, :c])
-4×3 DataFrame
- Row │ a      b      c
-     │ Int64  Int64  Int64
-─────┼─────────────────────
-   1 │     1      1      5
-   2 │     1      2      6
-   3 │     2      3      7
-   4 │     2      4      8
-
-julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")])
-2×2 DataFrame
- Row │ a      b
-     │ Int64  Tuple…
-─────┼───────────────────
-   1 │     1  ("p", "q")
-   2 │     2  ("r", "s")
-
-julia> flatten(df2, :b)
-4×2 DataFrame
- Row │ a      b
-     │ Int64  String
-─────┼───────────────
-   1 │     1  p
-   2 │     1  q
-   3 │     2  r
-   4 │     2  s
-
-julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])
-2×3 DataFrame
- Row │ a      b       c
-     │ Int64  Array…  Array…
-─────┼───────────────────────
-   1 │     1  [1, 2]  [5, 6]
-   2 │     2  [3, 4]  [7]
-
-julia> flatten(df3, [:b, :c])
-ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
-
-julia> df4 = DataFrame(a=[1, 2, 3],
-                       b=[[1, 2], missing, missing],
-                       c=[[5, 6], missing, [7, 8]])
-3×3 DataFrame
- Row │ a      b        c       
-     │ Int64  Array…?  Array…? 
-─────┼─────────────────────────
-   1 │     1  [1, 2]   [5, 6]
-   2 │     2  missing  missing 
-   3 │     3  missing  [7, 8]
-
-julia> flatten(df4, [:b, :c], scalar=Missing)
-5×3 DataFrame
- Row │ a      b        c       
-     │ Int64  Int64?   Int64?  
-─────┼─────────────────────────
-   1 │     1        1        5
-   2 │     1        2        6
-   3 │     2  missing  missing 
-   4 │     3  missing        7
-   5 │     3  missing        8
-```
-"""
-function flatten(df::AbstractDataFrame,
-                 cols::Union{ColumnIndex, MultiColumnIndex};
-                 scalar::Type=Union{})
-    _check_consistency(df)
-
-    idxcols = index(df)[cols]
-    if isempty(idxcols)
-        cdf = copy(df)
-        _drop_all_nonnote_metadata!(cdf)
-        return cdf
-    end
-
-    col1 = first(idxcols)
-    lengths = Int[x isa scalar ? -1 : length(x) for x in df[!, col1]]
-    for (i, coli) in enumerate(idxcols)
-        i == 1 && continue
-        update_lengths!(lengths, df[!, coli], scalar, df, col1, coli)
-    end
-
-    # handle case where in all columns we had a scalar
-    # in this case we keep it one time
-    for i in 1:length(lengths)
-        lengths[i] == -1 && (lengths[i] = 1)
-    end
-
-    new_df = similar(df[!, Not(cols)], sum(lengths))
-    for name in _names(new_df)
-        repeat_lengths!(new_df[!, name], df[!, name], lengths)
-    end
-    length(idxcols) > 1 && sort!(idxcols)
-    for col in idxcols
-        col_to_flatten = df[!, col]
-        fast_path = eltype(col_to_flatten) isa AbstractVector &&
-                    !isempty(col_to_flatten)
-        flattened_col = if fast_path
-                reduce(vcat, col_to_flatten)
-            elseif scalar === Union{}
-                collect(Iterators.flatten(col_to_flatten))
-            else
-                collect(Iterators.flatten(v isa scalar ? Iterators.repeated(v, l) : v
-                                          for (l, v) in zip(lengths, col_to_flatten)))
-            end
-        insertcols!(new_df, col, _names(df)[col] => flattened_col)
-    end
-
-    _copy_all_note_metadata!(new_df, df)
-    return new_df
-end
-
-function update_lengths!(lengths::Vector{Int}, col::AbstractVector, scalar::Type,
-                         df::AbstractDataFrame, col1, coli)
-    for (i, v) in enumerate(col)
-        v isa scalar && continue
-        lv = length(v)
-        if lengths[i] == -1
-            lengths[i] = lv
-        elseif lengths[i] != lv
-            colnames = _names(df)
-            throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
-                                "and :$(colnames[coli]) are not the same in row $i"))
-        end
-    end
-end
-
-function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector,
-                         lengths::AbstractVector{Int})
-    counter = 1
-    @inbounds for i in eachindex(shortold)
-        l = lengths[i]
-        longnew[counter:(counter + l - 1)] .= Ref(shortold[i])
-        counter += l
-    end
-end
-
diff --git a/test/reshape.jl b/test/reshape.jl
index d00957297e..58cf7bfce0 100644
--- a/test/reshape.jl
+++ b/test/reshape.jl
@@ -431,45 +431,6 @@ end
     @test flatten(DataFrame(), All()) == DataFrame()
 end
 
-@testset "flatten with scalar" begin
-    df = DataFrame(a=[1, 2, 3],
-                   b=[[1, 2], missing, [3, 4]],
-                   c=[[5, 6], missing, missing])
-    @test flatten(df, :a) ≅ df
-    @test_throws MethodError flatten(df, :b)
-    @test flatten(df, :b, scalar=Missing) ≅
-          DataFrame(a=[1, 1, 2, 3, 3],
-                    b=[1, 2, missing, 3, 4],
-                    c=[[5, 6], [5, 6], missing, missing, missing])
-    @test flatten(df, [:b, :c], scalar=Missing) ≅
-          DataFrame(a=[1, 1, 2, 3, 3],
-                    b=[1, 2, missing, 3, 4],
-                    c=[5, 6, missing, missing, missing])
-    @test flatten(df, [:b, :c], scalar=Any) ≅ df
-
-    df = DataFrame(a=missing, b=[1], c=missing, d=[[1, 2]])
-    @test_throws ArgumentError flatten(df, All(), scalar=Missing)
-    @test flatten(df, Not(:d), scalar=Missing) ≅
-        DataFrame(a=missing, b=1, c=missing, d=[[1, 2]])
-    @test flatten(df, Not(:b), scalar=Missing) ≅
-        DataFrame(a=[missing, missing], b=[1, 1], c=[missing, missing], d=[1, 2])
-
-    df = DataFrame(a="xy", b=[[1, 2]])
-    @test flatten(df, [:a, :b]) == DataFrame(a=['x', 'y'], b=[1, 2])
-    @test flatten(df, [:a, :b], scalar=String) ==
-          DataFrame(a=["xy", "xy"], b=[1, 2])
-
-    df = DataFrame(a=[[1], [], [3, 4], missing], b = missings(4), id=1:4)
-    @test flatten(df, [:a, :b], scalar=Missing) ≅
-          DataFrame(a=[1, 3, 4, missing], b=missings(4), id=[1, 3, 3, 4])
-    df = DataFrame(id=1:10, x=[1:i-1 for i in 1:10])
-    df.y = [iseven(last(v)) ? missing : v for v in df.x]
-    @test flatten(df, [:x, :y], scalar=Missing) ≅
-          DataFrame(id=reduce(vcat, [fill(i, i-1) for i in 2:10]),
-                    x=reduce(vcat, [1:i for i in 1:9]),
-                    y=reduce(vcat, [iseven(i) ? missings(i) : (1:i) for i in 1:9]))
-end
-
 @testset "stack categorical test" begin
     Random.seed!(1234)
     d1 = DataFrame(a=repeat([1:3;], inner=[4]),

From cca1c8710b015d5c249b0b82488739469c505c2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 5 Feb 2023 09:30:02 +0100
Subject: [PATCH 11/12] add missing blank lines to flatten docstring

---
 src/abstractdataframe/abstractdataframe.jl | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 7654f5d566..78b3928d6b 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -2504,6 +2504,7 @@ end
 
 """
     flatten(df::AbstractDataFrame, cols)
+
 When columns `cols` of data frame `df` have iterable elements that define
 `length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
 element of each `col` in `cols` is flattened, meaning the column corresponding
@@ -2513,8 +2514,11 @@ according to the length of `df[i, col]`. These lengths must therefore be the
 same for each `col` in `cols`, or else an error is raised. Note that these
 elements are not copied, and thus if they are mutable changing them in the
 returned `DataFrame` will affect `df`.
+
 `cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
+
 $METADATA_FIXED
+
 # Examples
 ```jldoctest
 julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
@@ -2524,6 +2528,7 @@ julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
 ─────┼───────────────────────
    1 │     1  [1, 2]  [5, 6]
    2 │     2  [3, 4]  [7, 8]
+
 julia> flatten(df1, :b)
 4×3 DataFrame
  Row │ a      b      c
@@ -2533,6 +2538,7 @@ julia> flatten(df1, :b)
    2 │     1      2  [5, 6]
    3 │     2      3  [7, 8]
    4 │     2      4  [7, 8]
+
 julia> flatten(df1, [:b, :c])
 4×3 DataFrame
  Row │ a      b      c
@@ -2542,6 +2548,7 @@ julia> flatten(df1, [:b, :c])
    2 │     1      2      6
    3 │     2      3      7
    4 │     2      4      8
+
 julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")])
 2×2 DataFrame
  Row │ a      b
@@ -2549,6 +2556,7 @@ julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")])
 ─────┼───────────────────
    1 │     1  ("p", "q")
    2 │     2  ("r", "s")
+
 julia> flatten(df2, :b)
 4×2 DataFrame
  Row │ a      b
@@ -2558,6 +2566,7 @@ julia> flatten(df2, :b)
    2 │     1  q
    3 │     2  r
    4 │     2  s
+
 julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])
 2×3 DataFrame
  Row │ a      b       c
@@ -2565,6 +2574,7 @@ julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])
 ─────┼───────────────────────
    1 │     1  [1, 2]  [5, 6]
    2 │     2  [3, 4]  [7]
+
 julia> flatten(df3, [:b, :c])
 ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
 ```

From 5c7111c4201af139c99762d5a7a34add445f00ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= <bkamins@sgh.waw.pl>
Date: Sun, 5 Feb 2023 09:30:49 +0100
Subject: [PATCH 12/12] add missing blank line after "# Examples" in flatten docstring

---
 src/abstractdataframe/abstractdataframe.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
index 78b3928d6b..a3910d0f77 100644
--- a/src/abstractdataframe/abstractdataframe.jl
+++ b/src/abstractdataframe/abstractdataframe.jl
@@ -2520,6 +2520,7 @@ returned `DataFrame` will affect `df`.
 $METADATA_FIXED
 
 # Examples
+
 ```jldoctest
 julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
 2×3 DataFrame