From 8ebefa1499ee169adba79010e92a7d9ffbe7b548 Mon Sep 17 00:00:00 2001 From: pdeffebach <23196228+pdeffebach@users.noreply.github.com> Date: Fri, 22 Dec 2023 12:25:54 -0500 Subject: [PATCH] squash commits (#372) --- docs/src/dplyr.md | 10 +++--- docs/src/index.md | 16 +++++---- src/macros.jl | 24 +++++++++++-- src/parsing.jl | 27 ++++++++++---- test/dataframes.jl | 12 +++---- test/grouping.jl | 4 +-- test/multicol.jl | 90 ++++++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 3 +- 8 files changed, 156 insertions(+), 30 deletions(-) create mode 100644 test/multicol.jl diff --git a/docs/src/dplyr.md b/docs/src/dplyr.md index c083520e..b42d6d0d 100644 --- a/docs/src/dplyr.md +++ b/docs/src/dplyr.md @@ -136,22 +136,22 @@ Similarly, to select the first column, use the syntax `$1`. @select msleep $1 ``` -To select all the columns *except* a specific column, use the `Not` function for inverse selection. We also need to wrap `Not` in the `$` sign, because it is not a symbol. +To select all the columns *except* a specific column, use the `Not` function for inverse selection. ```@repl 1 -@select msleep $(Not(:name)) +@select msleep Not(:name) ``` To select a range of columns by name, use the `Between` operator: ```@repl 1 -@select msleep $(Between(:name, :order)) +@select msleep Between(:name, :order) ``` -To select all columns that start with the character string `"sl"` use [regular expressions](https://regexone.com/): +To select all columns that start with the character string `"sl"` use [regular expressions](https://regexone.com/) in conjunction with `Cols`. ```@repl 1 -@select msleep $(r"^sl") +@select msleep Cols(r"^sl") ``` Regular expressions are powerful, but can be difficult for new users to understand. Here are some quick tips. diff --git a/docs/src/index.md b/docs/src/index.md index f5244abc..31e8970f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -49,12 +49,6 @@ but exported by DataFramesMeta for convenience. # Provided macros -!!! 
note - - Newer versions of DataFrames.jl support the operators `Between`, `All`, `Cols`, - and `Not` when selecting and transforming columns. DataFramesMeta does not currently - support this syntax. - ## `@select` and `@select!` Column selections and transformations. Only newly created columns are kept. @@ -79,6 +73,16 @@ gd = groupby(df, :x); @select!(gd, :y = 2 .* :y .* first(:y)) ``` +To select or de-select multiple columns, use `Not`, `Between`, `All`, and `Cols`. +These multi-column selectors are all re-exported from DataFrames.jl. + +```julia +@select df Not(:x) +@select df Between(:x, :y) +@select df All() +@select df Cols(r"x") # Regular expressions. +``` + ## `@transform` and `@transform!` Add additional columns based on keyword-like arguments. Operates on both a diff --git a/src/macros.jl b/src/macros.jl index 71e21e8d..055cb4a2 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -1786,7 +1786,7 @@ end function select_helper(x, args...) x, exprs, outer_flags, kw = get_df_args_kwargs(x, args...; wrap_byrow = false) - t = (fun_to_vec(ex; gensym_names = false, outer_flags = outer_flags) for ex in exprs) + t = (fun_to_vec(ex; gensym_names = false, outer_flags = outer_flags, allow_multicol = true) for ex in exprs) quote $select($x, $(t...); $(kw...)) end @@ -1851,6 +1851,18 @@ transformations by row, `@select` allows `@byrow` at the beginning of a block of selections (i.e. `@byrow begin... end`). All transformations in the block will operate by row. +To select many columns at once use the tools `Not`, `Between`, `All`, and `Cols`. + +* `@select df Not(:a)` keeps all columns except for `:a` +* `@select df Between(:a, :z)` keeps all columns between `:a` and `:z`, inclusive +* `@select df All()` keeps all columns +* `@select df Cols(...)` can be used to combine many different selectors, as well as use + regular expressions. For example `Cols(r"^a")` selects all columns that start with `"a"`. + +Expressions inside `Not(...)`, `Between(...)` etc. 
are untouched by DataFramesMeta's +parsing. To refer to a variable `x` which represents a column inside `Not`, write `Not(x)`, +rather than `Not($(DOLLAR)x)`. + $ASTABLE_MACRO_FLAG_DOCS $ASTABLE_RHS_SELECT_TRANSFORM_DOCS @@ -1869,7 +1881,7 @@ When inputs are given in "block" format, the last lines may be written ``` @select gd begin :a - @select copycols = false + @kwarg copycols = false end ``` @@ -2024,6 +2036,14 @@ transformations by row, `@select!` allows `@byrow` at the beginning of a block of select!ations (i.e. `@byrow begin... end`). All transformations in the block will operate by row. +To select many columns at once use the tools `Not`, `Between`, `All`, and `Cols`. + +* `@select! df Not(:a)` keeps all columns except for `:a` +* `@select! df Between(:a, :z)` keeps all columns between `:a` and `:z`, inclusive +* `@select! df All()` keeps all columns +* `@select! df Cols(...)` can be used to combine many different selectors, as well as use + regular expressions. For example `Cols(r"^a")` selects all columns that start with `"a"`. + $ASTABLE_MACRO_FLAG_DOCS $ASTABLE_RHS_SELECT_TRANSFORM_DOCS diff --git a/src/parsing.jl b/src/parsing.jl index 4f1b9e00..e2119e8e 100644 --- a/src/parsing.jl +++ b/src/parsing.jl @@ -18,17 +18,29 @@ a `QuoteNode` or an expression beginning with If input is not a valid column identifier, returns `nothing`. """ -get_column_expr(x) = nothing -function get_column_expr(e::Expr) +get_column_expr(x; allow_multicol::Bool = false) = nothing +function get_column_expr(e::Expr; allow_multicol::Bool = false) e.head == :$ && return e.args[1] onearg(e, :AsTable) && return :($AsTable($(e.args[2]))) if onearg(e, :cols) Base.depwarn("cols is deprecated use $DOLLAR to escape column names instead", :cols) return e.args[2] end + if e.head === :call + e1 = e.args[1] + if e1 === :All || e1 === :Not || e1 === :Between || e1 == :Cols + if allow_multicol + return e + else + s = "Multi-column references outside of @select, @rselect, @select!" 
* + " and @rselect! must be wrapped in AsTable" + throw(ArgumentError(s)) + end + end + end return nothing end -get_column_expr(x::QuoteNode) = x +get_column_expr(x::QuoteNode; allow_multicol::Bool = false) = x get_column_expr_rename(x) = nothing function get_column_expr_rename(e::Expr) @@ -314,10 +326,12 @@ end function fun_to_vec(ex::Expr; gensym_names::Bool=false, outer_flags::NamedTuple=deepcopy(DEFAULT_FLAGS), - no_dest::Bool=false) + no_dest::Bool=false, + allow_multicol::Bool=false) # classify the type of expression # :x # handled via dispatch # $:x # handled as though above + # All(), Between(...), Cols(...), Not(...), requires allow_multicol (only true in select) # f(:x) # requires no_dest, for `@with` and `@subset` in future # :y = :x # Simple pair # :y = $:x # Extract and return simple pair (no function) @@ -342,7 +356,7 @@ function fun_to_vec(ex::Expr; # :x # handled below via dispatch on ::QuoteNode - ex_col = get_column_expr(ex) + ex_col = get_column_expr(ex; allow_multicol = allow_multicol) if ex_col !== nothing return ex_col end @@ -404,7 +418,8 @@ end fun_to_vec(ex::QuoteNode; no_dest::Bool=false, gensym_names::Bool=false, - outer_flags::Union{NamedTuple, Nothing}=nothing) = ex + outer_flags::Union{NamedTuple, Nothing}=nothing, + allow_multicol::Bool = false) = ex """ diff --git a/test/dataframes.jl b/test/dataframes.jl index 5393b25e..05cffa6e 100644 --- a/test/dataframes.jl +++ b/test/dataframes.jl @@ -275,10 +275,10 @@ s = [:i, :g] # not part of DataFramesMeta. 
@test_throws LoadError @eval @transform(df, [:i, :g]) @test_throws LoadError @eval @transform(df, All()) - @test_throws LoadError @eval @transform(df, Between(:i, :t)).Between == df.i - @test_throws LoadError @eval @transform(df, Not(:i)).Not == df.i + @test_throws LoadError @eval @transform(df, Between(:i, :t)) + @test_throws LoadError @eval @transform(df, Not(:i)) @test_throws LoadError @eval @transform(df, Not([:i, :g])) - @test_throws MethodError @eval @transform(df, :n = sum(Between(:i, :t))) + @test_throws LoadError @eval @transform(df, :n = sum(Between(:i, :t))) end @testset "@select" begin @@ -546,11 +546,7 @@ cr = "c" @testset "limits of @select" begin ## Test for not-implemented or strange behavior @test_throws LoadError @eval @select(df, [:i, :g]) - @test_throws LoadError @eval @select(df, All()) - @test_throws LoadError @eval @select(df, Between(:i, :t)).Between == df.i - @test_throws LoadError @eval @select(df, Not(:i)).Not == df.i - @test_throws LoadError @eval @select(df, Not([:i, :g])) - @test_throws MethodError @eval @select(df, :n = sum(Between(:i, :t))) + @test_throws LoadError @eval @select(df, :n = sum(Between(:i, :t))) end @testset "with" begin diff --git a/test/grouping.jl b/test/grouping.jl index 35ea8df2..0db08b6e 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -148,7 +148,7 @@ gd = groupby(df, :g) newvar = :n @testset "Limits of @combine" begin - @test_throws MethodError @eval @combine(gd, :n = sum(Between(:i, :t))) + @test_throws LoadError @eval @combine(gd, :n = sum(Between(:i, :t))) @test_throws ArgumentError @eval @combine(gd, :n = mean(:i) + mean(cols(1))) end @@ -287,7 +287,7 @@ gd = groupby(df, :g) newvar = :n @testset "limits of @by" begin - @test_throws MethodError @eval @by(df, :g, :n = sum(Between(:i, :t))) + @test_throws LoadError @eval @by(df, :g, :n = sum(Between(:i, :t))) @test_throws ArgumentError @eval @by(df, :g, :n = mean(:i) + mean(cols(1))) end diff --git a/test/multicol.jl b/test/multicol.jl new file mode 
100644 index 00000000..2e44607c --- /dev/null +++ b/test/multicol.jl @@ -0,0 +1,90 @@ +module TestMultiCol + +using Test +using DataFrames +using DataFramesMeta + +df = DataFrame(A = 1, AA = 2, B = 3) + +@testset "select_multi" begin + df = DataFrame(A = 1, AA = 2, B = 3) + + t = @select df Not(:A) + @test t == DataFrame(AA = 2, B = 3) + + t = @select df All() + @test t == DataFrame(A = 1, AA = 2, B = 3) + + t = @select df Cols(r"A") + @test t == DataFrame(A = 1, AA = 2) + + t = @select df Between(:AA, :B) + @test t == DataFrame(AA = 2, B = 3) +end + +@testset "othermacros_multi" begin + df = DataFrame(A = 1, AA = 2, B = 3) + + @test_throws LoadError @eval @with df Not(:A) + + @test_throws LoadError @eval @with df All() + + @test_throws LoadError @eval @with df Cols(r"A") + + @test_throws LoadError @eval @with df Between(:AA, :B) + + @test_throws LoadError @eval @with(df, begin + 1 + Not(:A) + end) + + @test_throws LoadError @eval @with df begin + 1 + All() + end + + @test_throws LoadError @eval @with df begin + 1 + Cols(r"A") + end + + @test_throws LoadError @eval @with df begin + 1 + Between(:AA, :B) + end +end + +@testset "othermacros_multi" begin + df = DataFrame(A = 1, AA = 2, B = 3) + + @test_throws LoadError @eval @select df :y = Not(:A) + + @test_throws LoadError @eval @select df :y = All() + + @test_throws LoadError @eval @select df :y = Cols(r"A") + + @test_throws LoadError @eval @select df :y = Between(:AA, :B) + + @test_throws LoadError @eval @select(df, :y = begin + 1 + Not(:A) + end) + + @test_throws LoadError @eval @select df :y = begin + 1 + All() + end + + @test_throws LoadError @eval @select df :y = begin + 1 + Cols(r"A") + end + + @test_throws LoadError @eval @select df :y = begin + 1 + Between(:AA, :B) + end +end + + +end # module \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 3cb43e93..4c12faf9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,7 +15,8 @@ my_tests = ["dataframes.jl", "byrow.jl", 
"astable.jl", "astable_flag.jl", - "passmissing.jl"] + "passmissing.jl", + "multicol.jl"] println("Running tests:")