add groupby and docs (#373)

* add groupby and docs * implementation * whatever * rebase * change implementation * docs
JuliaData · Dec 22, 2023 · cdfb733 · cdfb733
1 parent 8ebefa1
commit cdfb733
Show file tree

Hide file tree

Showing 5 changed files with 86 additions and 11 deletions.
diff --git a/docs/src/dplyr.md b/docs/src/dplyr.md
@@ -93,7 +93,7 @@ DataFramesMeta.jl macro | By-row version | Description | `dplyr` equivalent
 `@subset` | `@rsubset` | filter rows | `filter`
 `@orderby` | `@rorderby` | re-order or arrange rows | `arrange`
 `@combine` | | summarise values | `summarize` (but `@combine` is more flexible)
-`groupby` | | allows for group operations in the "split-apply-combine" concept | `group_by`
+`@groupby` | | allows for group operations in the "split-apply-combine" concept | `group_by`
 
 # DataFramesMeta.jl Verbs In Action
 
@@ -341,15 +341,15 @@ DataFrames.jl also provides the function `describe` which performs many of these
 describe(msleep)
 ```
 
-## Group Operations using `groupby` and `@combine`
+## Group Operations using `@groupby` and `@combine`
 
-The `groupby` verb is an important function in DataFrames.jl (it does not live in DataFramesMeta.jl). As we mentioned before it's related to concept of "split-apply-combine". We literally want to split the data frame by some variable (e.g. taxonomic order), apply a function to the individual data frames and then combine the output.   
+The `@groupby` verb is the first step in the "split-apply-combine" workflow. We literally want to split the data frame by some variable (e.g. taxonomic order), apply a function to the individual data frames and then combine the output.   
 
 Let's do that: split the `msleep` data frame by the taxonomic order, then ask for the same summary statistics as above. We expect a set of summary statistics for each taxonomic order. 
 
 ```@repl 1
 @chain msleep begin 
-    groupby(:order)
+    @groupby :order
     @combine begin 
         :avg_sleep = mean(:sleep_total)
         :min_sleep = minimum(:sleep_total)
@@ -363,7 +363,7 @@ Split-apply-combine can also be used with `@transform` to add new variables to a
 
 ```@repl 1
 @chain msleep begin 
-    groupby(:order)
+    @groupby :order
     @transform :sleep_genus = :sleep_total .- mean(:sleep_total)
 end
 ```

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -16,6 +16,7 @@ In addition, DataFramesMeta provides
 * Row-wise versions of the above macros in the form of `@rtransform`, `@rtransform!`,
   `@rselect`, `@rselect!`, `@rorderby`, `@rsubset`, and `@rsubset!`.
 * `@rename` and `@rename!` for renaming columns
+* `@groupby` for grouping data
 * `@by`, for grouping and combining a data frame in a single step
 * `@with`, for working with the columns of a data frame with high performance and 
   convenient syntax
@@ -64,7 +65,7 @@ data frame.
 
 ```julia
 df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]);
-gd = groupby(df, :x);
+gd = @groupby(df, :x);
 @select(df, :x, :y)
 @select(df, :x2 = 2 * :x, :y)
 @select(gd, :x2 = 2 .* :y .* first(:y))
@@ -98,7 +99,7 @@ data frame.
 
 ```julia
 df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]);
-gd = groupby(df, :x);
+gd = @groupby(df, :x);
 @transform(df, :x2 = 2 * :x, :y)
 @transform(gd, :x2 = 2 .* :y .* first(:y))
 @transform!(df, :x, :y)
@@ -115,7 +116,7 @@ Select row subsets. Operates on both a `DataFrame` and a `GroupedDataFrame`.
 ```julia
 using Statistics
 df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]);
-gd = groupby(df, :x);
+gd = @groupby(df, :x);
 outside_var = 1;
 @subset(df, :x .> 1)
 @subset(df, :x .> outside_var)
@@ -134,11 +135,14 @@ acts like a `GroupedDataFrame` with one group.
 Like `@select` and `@transform`, transformations are called with the keyword-like 
 syntax `:y = f(:x)`. 
 
+To group data together into a `GroupedDataFrame`, use `@groupby`, a short-hand for
+the DataFrames.jl function `groupby`.
+
 Examples:
 
 ```julia
 df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]);
-gd = groupby(df, :x);
+gd = @groupby(df, :x);
 @combine(gd, :x2 = sum(:y))
 @combine(gd, :x2 = :y .- sum(:y))
 @combine(gd, $AsTable = (n1 = sum(:y), n2 = first(:y)))
@@ -161,6 +165,17 @@ gd = groupby(df, :x);
 @combine(gd, $AsTable = (a = sum(:x), b = sum(:y)))
 ```
 
+### `@by` 
+
+Perform the grouping and combining operations in one step with `@by`.
+
+```julia
+df = DataFrame(x = [1, 1, 2, 2], y = [1, 2, 101, 102]);
+@by df :x begin
+    :x2 = sum(:y)
+end
+```
+
 ## `@orderby`
 
 Sort rows in a `DataFrame` by values in one of several columns or a 
@@ -355,7 +370,7 @@ julia> @subset df @byrow begin
 however, like with `ByRow` in DataFrames.jl, when `@byrow` is
 used, functions do not take into account the grouping, so for
 example the result of `@transform(df, @byrow :y = f(:x))` and 
-`@transform(groupby(df, :g), @byrow :y = f(:x))` is the same.
+`@transform(@groupby(df, :g), @byrow :y = f(:x))` is the same.
 
 ## Propagating missing values with `@passmissing`
 
@@ -912,7 +927,7 @@ functions.
 | `@subset`    | `filter`         | `Where`      |
 | `@transform` | `mutate`         | `Select` (?) |
 | `@by`        |                  | `GroupBy`    |
-| `groupby`    | `group_by`       | `GroupBy`    |
+| `@groupby`   | `group_by`       | `GroupBy`    |
 | `@combine`   | `summarise`/`do` |              |
 | `@orderby`   | `arrange`        | `OrderBy`    |
 | `@select`    | `select`         | `Select`     |

diff --git a/src/DataFramesMeta.jl b/src/DataFramesMeta.jl
@@ -21,6 +21,7 @@ export @with,
        @distinct, @rdistinct, @distinct!, @rdistinct!,
        @eachrow, @eachrow!,
        @byrow, @passmissing, @astable, @kwarg,
+       @groupby,
        @based_on, @where # deprecated
 
 const DOLLAR = raw"$"

diff --git a/src/macros.jl b/src/macros.jl
@@ -3008,3 +3008,45 @@ macro rename!(x, args...)
     esc(rename!_helper(x, args...))
 end
 
+# Build the (unescaped) expression `groupby(df, Cols(args...))`.
+# `df` and each element of `args` are spliced in exactly as received;
+# no DataFramesMeta.jl column-parsing is applied to them.
+function groupby_helper(df, args...)
+    # Collect the selectors into a tuple expression so they can be
+    # splatted into `Cols` in the generated call.
+    selectors = Expr(:tuple, args...)
+    return :($groupby($df, ($Cols($selectors...))))
+end
+
+"""
+    @groupby(df, args...)
+
+Group a data frame by columns. An alias for
+
+```
+groupby(df, Cols(args...))
+```
+
+but with a few convenience features.
+
+## Details
+
+`@groupby` does not perform any transformations or allow the
+generation of new columns. New column generation must be done
+before `@groupby` is called.
+
+`@groupby` allows mixing of `Symbol`
+and `String` inputs, such that `@groupby df :A "B"`
+is supported.
+
+Arguments are passed to `Cols` as ordinary Julia expressions.
+DataFramesMeta.jl rules for column selection, such as
+`$DOLLAR` for escaping, do not apply.
+
+## Examples
+```julia-repl
+julia> df = DataFrame(A = [1, 1], B = [3, 4], C = [6, 6]);
+julia> @groupby df :A;
+julia> @groupby df :A :B;
+julia> @groupby df [:A, :B];
+julia> @groupby df :A [:B, :C];
+```
+"""
+macro groupby(df, args...)
+    # Escape the whole generated call so `df` and the selectors are
+    # evaluated in the caller's scope.
+    return esc(groupby_helper(df, args...))
+end
+
diff --git a/test/grouping.jl b/test/grouping.jl
@@ -349,4 +349,21 @@ end
 	@test @select(g, :a, @byrow :t = :a ^ 2).t ≅ d.a .^ 2
 end
 
+@testset "@groupby" begin
+    # Reference groupings produced by the plain DataFrames.jl function.
+    df = DataFrame(a = [1, 2], b = [3, 4], c = [5, 6])
+    by_a = groupby(df, [:a])
+    by_ab = groupby(df, [:a, :b])
+    by_abc = groupby(df, [:a, :b, :c])
+    cols_ab = [:a, :b]
+
+    # Symbols, in call syntax.
+    @test @groupby(df, :a) == by_a
+    @test @groupby(df, :a, :b) == by_ab
+    # A variable holding the selector, and mixed Symbol / index / String selectors.
+    @test (@groupby df cols_ab) == by_ab
+    @test (@groupby df :a 2) == by_ab
+    @test (@groupby df [:a, :b]) == by_ab
+    @test (@groupby df :a "b") == by_ab
+    # DataFrames.jl selector objects are passed through unchanged.
+    @test (@groupby df All()) == by_abc
+    @test (@groupby df Cols(:a, 2, "c")) == by_abc
+end
+
 end # module