diff --git a/NEWS.md b/NEWS.md index 14b1f1fd1..cee6c9b8e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,9 @@ columns only to a subset of the columns specified by the `cols` keyword argument ([#3386](https://github.com/JuliaData/DataFrames.jl/pull/3386)) +* add `findcols` that returns a vector of integer column indices + of a data frame that meet the passed condition function + ([#3389](https://github.com/JuliaData/DataFrames.jl/pull/3389)) ## Bug fixes diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md index 1cb8dfd66..fd510cec9 100644 --- a/docs/src/lib/functions.md +++ b/docs/src/lib/functions.md @@ -26,7 +26,7 @@ This is a list of operations that currently make use of multi-threading: * a transformation produces one row per group and the passed transformation is a custom function (i.e. not for standard reductions, which use optimized single-threaded methods). -- `dropmissing` when the provided data frame has more than 1 column and `view=false` +- `dropmissing` when the provided data frame has more than 1 column and `view=false` (subsetting of individual columns is spawned in separate tasks). In general at least Julia 1.4 is required to ensure that multi-threading is used @@ -170,6 +170,11 @@ unique unique! 
``` +## Filtering columns +```@docs +findcols +``` + ## Working with missing values ```@docs allowmissing diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 57809cbdb..784736a8d 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -67,6 +67,7 @@ export AbstractDataFrame, dropmissing!, dropmissing, fillcombinations, + findcols, flatten, groupby, groupindices, diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index a812365ee..54606a4b3 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -3252,3 +3252,29 @@ function Base.iterate(itr::Iterators.PartitionIterator{<:AbstractDataFrame}, sta r = min(state + itr.n - 1, last_idx) return view(itr.c, state:r, :), r + 1 end + +""" + findcols(f, df::AbstractDataFrame) + +Return an integer vector `I` of the column indices `i` of `df` for which `f` returns `true` when applied to the `i`-th element of `eachcol(df)`. +If there are no such columns of `df`, return `Int[]`. Throw a `TypeError` if `f` returns a non-Boolean value such as `1`. + +# Examples + +```jldoctest +julia> df = DataFrame(a=[1, missing], b=[2, 3], c=[missing, 4]) +2×3 DataFrame + Row │ a b c + │ Int64? Int64 Int64? 
+─────┼───────────────────────── + 1 │ 1 2 missing + 2 │ missing 3 4 + +julia> findcols(x -> any(ismissing, x), df) +2-element Vector{Int64}: + 1 + 3 +``` +""" +findcols(f::Function, df::AbstractDataFrame) = + findall(f, eachcol(df)) diff --git a/test/dataframe.jl b/test/dataframe.jl index fbc2ec0ca..dccf6d7f6 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -2387,4 +2387,17 @@ end @test eltype(collect(p)) <: DataFrames.DataFrameRows end +@testset "findcols" begin + df = DataFrame(a=[1, missing], b=[2, 3], c=[missing, 4]) + @test findcols(x -> any(ismissing, x), df) == [1, 3] + @test findcols(x -> true, df) == [1, 2, 3] + @test findcols(x -> false, df) == Int[] + @test_throws TypeError findcols(x -> 1, df) + + @test findcols(x -> any(ismissing, x), view(df, :, [1, 2])) == [1] + @test findcols(x -> true, view(df, :, [1, 2])) == [1, 2] + @test findcols(x -> false, view(df, :, [1, 2])) == Int[] + @test_throws TypeError findcols(x -> 1, view(df, :, [1, 2])) +end + end # module