Skip to content

Commit

Permalink
move FeatureSelector model to FeatureSelection.jl pkg
Browse files Browse the repository at this point in the history
  • Loading branch information
OkonSamuel committed Jun 2, 2024
1 parent 6517209 commit f900755
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 255 deletions.
198 changes: 21 additions & 177 deletions src/builtins/Transformers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -183,90 +183,7 @@ function MMI.fitted_params(::FillImputer, fr)
filler_given_feature=filler_given_feature)
end


# # FOR FEATURE (COLUMN) SELECTION

mutable struct FeatureSelector <: Unsupervised
    # Either a vector of feature names or a callable applied to each feature
    # name. An empty vector means "all features seen in `fit`".
    features::Union{Vector{Symbol}, Function}
    # When `true`, the features specified by `features` are excluded rather
    # than selected.
    ignore::Bool
end

# Keyword constructor.
#
# - `features`: vector of feature names, or a callable applied to feature
#   names; defaults to an empty vector.
# - `ignore`: whether the specified features are to be excluded instead of
#   selected; defaults to `false`.
#
# Throws an `ArgumentError` if `MMI.clean!` reports a problem with the
# requested hyper-parameter combination.
function FeatureSelector(;
    features::Union{AbstractVector{Symbol}, Function}=Symbol[],
    ignore::Bool=false
)
    # The `features` field only admits `Vector{Symbol}` (or a function), and
    # converting to a `Union` type does not try its member types. Normalize
    # other vector types here so that every `AbstractVector{Symbol}` the
    # signature accepts can actually be stored. A `Vector{Symbol}` argument is
    # returned by `convert` as-is, without copying.
    stored = features isa AbstractVector ? convert(Vector{Symbol}, features) : features
    transformer = FeatureSelector(stored, ignore)
    message = MMI.clean!(transformer)
    isempty(message) || throw(ArgumentError(message))
    return transformer
end

# Validate the hyper-parameters of `transformer`.
#
# Returns an error message when `ignore` is set but `features` is an empty
# vector (there would be nothing to ignore); otherwise returns an empty string.
function MMI.clean!(transformer::FeatureSelector)
    requested = transformer.features
    nothing_to_ignore =
        requested isa AbstractVector{Symbol} &&
        isempty(requested) &&
        transformer.ignore
    return nothing_to_ignore ?
        "Features to be ignored must be specified in features field." : ""
end

# Determine which features of the table `X` are to be kept.
#
# Returns `(fitresult, cache, report)` where `fitresult` is the vector of
# feature names to keep, `cache` is `nothing` and `report` is an empty named
# tuple.
#
# Throws an `ArgumentError` if a feature explicitly selected (`ignore=false`)
# is not a column of `X`, or if a callable specification leaves no feature.
function MMI.fit(transformer::FeatureSelector, verbosity::Int, X)
    available = collect(Tables.schema(X).names)
    spec = transformer.features

    if spec isa AbstractVector{Symbol}
        if isempty(spec)
            # an empty specification means "keep everything seen here"
            features = available
        elseif transformer.ignore
            # unknown names in an exclusion list are harmless; just warn
            if !issubset(spec, available) && verbosity > -1
                @warn("Excluding non-existent feature(s).")
            end
            features = filter(ftr -> !(ftr in spec), available)
        else
            if !issubset(spec, available)
                throw(ArgumentError("Attempting to select non-existent feature(s)."))
            end
            features = collect(spec)
        end
    else
        # `spec` is a callable applied to each feature name
        features = if transformer.ignore
            filter(ftr -> !(spec(ftr)), available)
        else
            filter(ftr -> spec(ftr), available)
        end
        if isempty(features)
            throw(
                ArgumentError("No feature(s) selected.\n The specified Bool-valued"*
                " callable with the `ignore` option set to `$(transformer.ignore)` "*
                "resulted in an empty feature set for selection")
            )
        end
    end

    return features, nothing, NamedTuple()
end

# Expose the fit result (the feature names chosen in `fit`) as the
# `features_to_keep` fitted parameter.
function MMI.fitted_params(::FeatureSelector, fitresult)
    return (features_to_keep=fitresult,)
end

# Select the columns named in `features` (the fit result) from the table `X`.
#
# Throws an `ArgumentError` if any of the previously selected features is not
# a column of `X`.
function MMI.transform(::FeatureSelector, features, X)
    # Look the schema up once, rather than once per selected feature.
    available = Tables.schema(X).names
    issubset(features, available) ||
        throw(ArgumentError("Supplied frame does not admit previously selected features."))
    return MMI.selectcols(X, features)
end


# # UNIVARIATE DISCRETIZER
## UNIVARIATE DISCRETIZER

# helper function:
reftype(::CategoricalArray{<:Any,<:Any,R}) where R = R
Expand Down Expand Up @@ -1027,9 +944,14 @@ function MMI.transform(transformer::ContinuousEncoder, fitresult, X)
features_to_keep, hot_encoder, hot_fitresult = values(fitresult)

# dump unseen or untransformable features:
selector = FeatureSelector(features=features_to_keep)
selector_fitresult, _, _ = MMI.fit(selector, 0, X)
X0 = transform(selector, selector_fitresult, X)
if !issubset(features_to_keep, MMI.schema(X).names)
throw(

Check warning on line 948 in src/builtins/Transformers.jl

View check run for this annotation

Codecov / codecov/patch

src/builtins/Transformers.jl#L948

Added line #L948 was not covered by tests
ArgumentError(
"Supplied frame does not admit previously selected features."
)
)
end
X0 = MMI.selectcols(X, features_to_keep)

# one-hot encode:
X1 = transform(hot_encoder, hot_fitresult, X0)
Expand Down Expand Up @@ -1080,11 +1002,18 @@ end
# # METADATA FOR ALL BUILT-IN TRANSFORMERS

metadata_pkg.(
(FeatureSelector, UnivariateStandardizer,
UnivariateDiscretizer, Standardizer,
UnivariateBoxCoxTransformer, UnivariateFillImputer,
OneHotEncoder, FillImputer, ContinuousEncoder,
UnivariateTimeTypeToContinuous, InteractionTransformer),
(
UnivariateStandardizer,
UnivariateDiscretizer,
Standardizer,
UnivariateBoxCoxTransformer,
UnivariateFillImputer,
OneHotEncoder,
FillImputer,
ContinuousEncoder,
UnivariateTimeTypeToContinuous,
InteractionTransformer
),
package_name = "MLJModels",
package_uuid = "d491faf4-2d78-11e9-2867-c94bc002c0b7",
package_url = "https://github.com/JuliaAI/MLJModels.jl",
Expand All @@ -1106,11 +1035,6 @@ metadata_model(FillImputer,
output_scitype = Table,
load_path = "MLJModels.FillImputer")

# Model metadata for `FeatureSelector`: input and output scitypes, and the
# path from which the model is loaded.
metadata_model(
    FeatureSelector;
    input_scitype=Table,
    output_scitype=Table,
    load_path="MLJModels.FeatureSelector",
)

metadata_model(UnivariateDiscretizer,
input_scitype = AbstractVector{<:Continuous},
output_scitype = AbstractVector{<:OrderedFactor},
Expand Down Expand Up @@ -1371,86 +1295,6 @@ See also [`UnivariateFillImputer`](@ref).
"""
FillImputer

"""
$(MLJModelInterface.doc_header(FeatureSelector))

Use this model to select features (columns) of a table, usually as
part of a model `Pipeline`.


# Training data

In MLJ or MLJBase, bind an instance `model` to data with

    mach = machine(model, X)

where

- `X`: any table of input features, where "table" is in the sense of Tables.jl

Train the machine using `fit!(mach, rows=...)`.


# Hyper-parameters

- `features`: one of the following, with the behavior indicated:

  - `[]` (empty, the default): filter out all features (columns) which
    were not encountered in training

  - non-empty vector of feature names (symbols): keep only the
    specified features (`ignore=false`) or keep only unspecified
    features (`ignore=true`)

  - function or other callable: keep a feature if the callable returns
    `true` on its name (`ignore=false`), or drop it in that case
    (`ignore=true`). For example, specifying
    `FeatureSelector(features = name -> name in [:x1, :x3], ignore =
    true)` has the same effect as `FeatureSelector(features = [:x1,
    :x3], ignore = true)`, namely to select all features, with the
    exception of `:x1` and `:x3`.

- `ignore`: whether to ignore or keep specified `features`, as
  explained above


# Operations

- `transform(mach, Xnew)`: select features from the table `Xnew` as
  specified by the model, taking features seen during training into
  account, if relevant


# Fitted parameters

The fields of `fitted_params(mach)` are:

- `features_to_keep`: the features that will be selected


# Example

```
using MLJ

X = (ordinal1 = [1, 2, 3],
     ordinal2 = coerce(["x", "y", "x"], OrderedFactor),
     ordinal3 = [10.0, 20.0, 30.0],
     ordinal4 = [-20.0, -30.0, -40.0],
     nominal = coerce(["Your father", "he", "is"], Multiclass));

selector = FeatureSelector(features=[:ordinal3, ], ignore=true);

julia> transform(fit!(machine(selector, X)), X)
(ordinal1 = [1, 2, 3],
 ordinal2 = CategoricalValue{String,UInt32}["x", "y", "x"],
 ordinal4 = [-20.0, -30.0, -40.0],
 nominal = CategoricalValue{String,UInt32}["Your father", "he", "is"],)

```
"""
FeatureSelector


"""
$(MLJModelInterface.doc_header(Standardizer))
Expand Down
77 changes: 0 additions & 77 deletions test/builtins/Transformers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,83 +13,6 @@ import MLJBase
_get(x) = CategoricalArrays.DataAPI.unwrap(x)


#### FEATURE SELECTOR ####

@testset "Feat Selector" begin
    N = 100
    X = (Zn = rand(N),
         Crim = rand(N),
         x3 = categorical(rand("YN", N)),
         x4 = categorical(rand("YN", N)))

    # Test feature selection with `features=Symbol[]`
    namesX = Tables.schema(X).names |> collect
    selector = FeatureSelector()
    f, = MLJBase.fit(selector, 1, X)
    @test f == namesX
    Xt = MLJBase.transform(selector, f, MLJBase.selectrows(X, 1:2))
    @test Set(Tables.schema(Xt).names) == Set(namesX)
    @test length(Xt.Zn) == 2

    # Test on selecting features if `features` keyword is defined
    selector = FeatureSelector(features=[:Zn, :Crim])
    f, = MLJBase.fit(selector, 1, X)
    @test MLJBase.transform(selector, f, MLJBase.selectrows(X, 1:2)) ==
        MLJBase.select(X, 1:2, [:Zn, :Crim])

    # with `ignore=true`, the features listed in `features` are dropped and
    # the remaining ones are kept
    selector.ignore = true
    f, = MLJBase.fit(selector, 1, X)
    Xnew = MLJBase.transform(selector, f, X)
    # the full-table transform is checked too, not merely executed
    @test Set(Tables.schema(Xnew).names) == Set([:x3, :x4])
    @test MLJBase.transform(selector, f, MLJBase.selectrows(X, 1:2)) ==
        MLJBase.select(X, 1:2, [:x3, :x4])

    # test error about features selected or excluded in fit.
    selector = FeatureSelector(features=[:x1, :mickey_mouse])
    @test_throws(
        ArgumentError,
        MLJBase.fit(selector, 1, X)
    )
    selector.ignore = true
    @test_logs(
        (:warn, r"Excluding non-existent"),
        MLJBase.fit(selector, 1, X)
    )

    # features must be specified if ignore=true
    @test_throws ArgumentError FeatureSelector(ignore=true)

    # test errors for no features selected when using the Bool-valued callable
    # interface:
    selector = FeatureSelector(features= x-> x == (:x1))
    @test_throws(
        ArgumentError,
        MLJBase.fit(selector, 1, X)
    )
    selector.ignore = true
    selector.features = x-> x in [:Zn, :Crim, :x3, :x4]
    @test_throws(
        ArgumentError,
        MLJBase.fit(selector, 1, X)
    )

    # Test model Metadata
    infos = MLJModels.info_dict(selector)
    @test infos[:input_scitype] == MLJBase.Table
    @test infos[:output_scitype] == MLJBase.Table
end


# To be added with FeatureSelectorRule X = (n1=["a", "b", "a"], n2=["g", "g", "g"], n3=[7, 8, 9],
# n4 =UInt8[3,5,10], o1=[4.5, 3.6, 4.0], )
# MLJBase.schema(X)
# Xc = coerce(X, :n1=>Multiclass, :n2=>Multiclass)

# t = Discretizer(features=[:o1, :n3, :n2, :n1])
# @test Xt.features == [:o1, :n3, :n2, :n1]
# @test Xt.is_ordinal == [true, false, false, false]
# @test Xt.A == [512 1 1 1; 1 2 1 2; 256 3 1 1]


#### UNIVARIATE DISCRETIZATION ####

@testset "U-Discr" begin
Expand Down
1 change: 0 additions & 1 deletion test/model_search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ end
DeterministicConstantRegressor,
ConstantClassifier,
ConstantRegressor,
FeatureSelector,
OneHotEncoder,
Standardizer,
UnivariateBoxCoxTransformer,
Expand Down

0 comments on commit f900755

Please sign in to comment.