Merge pull request #61 from tylerjthomas9/new-models
New models, automatically convert more fit results, and support for feature importances
tylerjthomas9 authored Oct 11, 2023
2 parents f2395ea + 2afc496 commit 530ce27
Showing 20 changed files with 799 additions and 256 deletions.
9 changes: 7 additions & 2 deletions Project.toml
@@ -1,15 +1,20 @@
name = "MLJScikitLearnInterface"
uuid = "5ae90465-5518-4432-b9d2-8a1def2f0cab"
authors = ["Thibaut Lienart, Anthony Blaom"]
version = "0.5.0"
version = "0.6.0"

[deps]
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
MLJBase = "1"
MLJModelInterface = "1.4"
PythonCall = "0.9"
Tables = "1.10"
julia = "1.6"

[extras]
@@ -19,4 +24,4 @@ StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["StableRNGs", "MLJTestInterface", "Test", "MLJBase"]
test = ["MLJBase", "MLJTestInterface", "StableRNGs", "Test"]
3 changes: 3 additions & 0 deletions src/MLJScikitLearnInterface.jl
@@ -5,6 +5,8 @@ import MLJModelInterface:
@mlj_model, _process_model_def, _model_constructor, _model_cleaner,
Table, Continuous, Count, Finite, OrderedFactor, Multiclass, Unknown
const MMI = MLJModelInterface
using Statistics
import Tables

include("ScikitLearnAPI.jl")
const SK = ScikitLearnAPI
@@ -49,6 +51,7 @@ const CV = "with built-in cross-validation"

include("macros.jl")
include("meta.jl")
include("tables.jl")

include("models/linear-regressors.jl")
include("models/linear-regressors-multi.jl")
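The new `src/tables.jl` (its diff is not expanded in this view) backs the `get_column_names` helper used by the fit methods in `src/macros.jl` below. As a minimal, hypothetical sketch — not the commit's actual implementation — such a helper could be built on Tables.jl like this:

# Hypothetical sketch of get_column_names; the real src/tables.jl may differ.
import Tables

function get_column_names(X)
    # MLJ inputs are usually Tables.jl tables with named columns
    Tables.istable(X) && return collect(Tables.columnnames(Tables.columns(X)))
    # fall back to positional names for plain matrices
    return [Symbol(:x, i) for i in 1:size(X, 2)]
end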
53 changes: 47 additions & 6 deletions src/macros.jl
@@ -113,11 +113,12 @@ end
Called as part of [`@sk_reg`](@ref), returns the expression corresponding to the
`fit` method for a ScikitLearn regression model.
"""
function _skmodel_fit_reg(modelname, params)
function _skmodel_fit_reg(modelname, params, save_std::Bool=false)
expr = quote
function MMI.fit(model::$modelname, verbosity::Int, X, y)
# set X and y into a format that can be processed by sklearn
Xmatrix = MMI.matrix(X)
names = get_column_names(X)
yplain = y
targnames = nothing
# check if it's a multi-target regression case, in that case keep
@@ -149,8 +150,12 @@ function _skmodel_fit_reg(modelname, params)
X_py = ScikitLearnAPI.numpy.array(Xmatrix)
y_py = ScikitLearnAPI.numpy.array(yplain)
fitres = SK.fit!(skmodel, X_py, y_py)
# TODO: we may want to use the report later on
report = NamedTuple()
if ScikitLearnAPI.pyhasattr(fitres, "coef_")
column_std = std(Xmatrix, dims=1) |> vec
report = (; column_std, names)
else
report = (; names)
end
# the first nothing is so that we can use the same predict for
# regressors and classifiers
return ((fitres, nothing, targnames), nothing, report)
@@ -168,6 +173,7 @@ function _skmodel_fit_clf(modelname, params)
quote
function MMI.fit(model::$modelname, verbosity::Int, X, y)
Xmatrix = MMI.matrix(X)
names = get_column_names(X)
yplain = MMI.int(y)
# See _skmodel_fit_reg, same story
sksym, skmod, mdl = $(Symbol(modelname, "_"))
@@ -177,8 +183,13 @@ function _skmodel_fit_clf(modelname, params)
skmodel = skconstr(
$((Expr(:kw, p, :(model.$p)) for p in params)...))
fitres = SK.fit!(skmodel, Xmatrix, yplain)
# TODO: we may want to use the report later on
report = NamedTuple()
report = (; names)
if ScikitLearnAPI.pyhasattr(fitres, "coef_")
column_std = std(Xmatrix, dims=1) |> vec
report = (; column_std, names)
else
report = (; names)
end
# pass y[1] for decoding in predict method, first nothing
# is targnames
return ((fitres, y[1], nothing), nothing, report)
@@ -329,3 +340,33 @@ macro sku_predict(modelname)
end
end
end

# helpers to collapse coefficient arrays into one nonnegative importance per feature
function _coef_vec(coef::AbstractVector)
return abs.(coef)
end

function _coef_vec(coef::AbstractMatrix)
return mean(abs.(coef), dims=1) |> vec
end

"""
macro sk_feature_importances(modelname)
Adds a `feature_importances` method to a declared scikit-learn model
when the model supports them.
"""
macro sk_feature_importances(modelname)
quote
MMI.reports_feature_importances(::Type{<:$modelname}) = true
function MMI.feature_importances(m::$modelname, fitres, r)
params = MMI.fitted_params(m, fitres)
feature_importances = if haskey(params, :feature_importances)
params.feature_importances
else
_coef_vec(params.coef) .* r.column_std
end
result = [(r.names[i] => x) for (i, x) in enumerate(feature_importances)]
end
end
end
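For orientation, a hedged sketch of how these importances might surface through MLJ once a model opts in via `@sk_feature_importances` (`BayesianLDA` does so in the discriminant-analysis diff below); the data loader here is illustrative only:

# Hypothetical usage sketch; assumes this release is in the MLJ model registry.
using MLJ

BayesianLDA = @load BayesianLDA pkg=MLJScikitLearnInterface
X, y = @load_iris   # any Continuous-feature classification data works

mach = machine(BayesianLDA(), X, y)
fit!(mach)

# For coefficient-based models the macro computes abs.(coef) .* column_std,
# averaging |coef| over rows when coef_ is a matrix (one row per class).
feature_importances(mach)   # Vector of :feature_name => importance pairs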
163 changes: 158 additions & 5 deletions src/models/clustering.jl
@@ -137,6 +137,48 @@ data which contains clusters of similar density.
"""
DBSCAN

# ============================================================================
const HDBSCAN_ = skcl(:HDBSCAN)
@sk_uns mutable struct HDBSCAN <: MMI.Unsupervised
min_cluster_size::Int = 5::(_ > 0)
min_samples::Option{Int} = nothing
cluster_selection_epsilon::Float64 = 0.0::(_ ≥ 0)
max_cluster_size::Option{Int} = nothing
metric::String = "euclidean"::(_ in ("euclidean", "precomputed"))
alpha::Float64 = 1.0::(_ > 0)
algorithm::String = "auto"::(_ in ("auto", "brute", "kdtree", "balltree"))
leaf_size::Int = 40::(_ > 1)
cluster_selection_method::String = "eom"::(_ in ("eom", "leaf"))
allow_single_cluster::Bool = false
store_centers::Option{String} = nothing
end
function MMI.fitted_params(m::HDBSCAN, f)
# sklearn labels noise as -1 and clusters as 0, 1, ...; shifting by 2 turns
# the labels into valid indices into the categorical pool built below
labels = pyconvert(Array, f.labels_) .+ 2
nc = length(unique(labels))
catv = MMI.categorical([-1, (1:nc)...])
return (
labels = catv[labels],
probabilities = pyconvert(Array, f.probabilities_)
)
end
meta(HDBSCAN,
input = Table(Continuous),
weights = false,
)

"""
$(MMI.doc_header(HDBSCAN))
Hierarchical Density-Based Spatial Clustering of Applications with
Noise. Performs [`DBSCAN`](@ref) over varying epsilon values and
integrates the result to find a clustering that gives the best
stability over epsilon. This allows HDBSCAN to find clusters of
varying densities (unlike [`DBSCAN`](@ref)), and be more robust to
parameter selection.
"""
HDBSCAN
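A hedged usage sketch (assumes HDBSCAN is registered for `@load`; the data is illustrative):

# Hypothetical usage sketch; toy data, model name as declared above.
using MLJ

HDBSCAN = @load HDBSCAN pkg=MLJScikitLearnInterface
X = MLJ.table(randn(200, 2))

mach = machine(HDBSCAN(min_cluster_size = 10), X)
fit!(mach)

fp = fitted_params(mach)
fp.labels          # categorical cluster labels; level -1 marks noise points
fp.probabilities   # per-point cluster membership strengths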

# ============================================================================
const FeatureAgglomeration_ = skcl(:FeatureAgglomeration)
@sk_uns mutable struct FeatureAgglomeration <: MMI.Unsupervised
@@ -191,7 +233,7 @@ const KMeans_ = skcl(:KMeans)
copy_x::Bool = true
algorithm::String = "lloyd"::(_ in ("elkane", "lloyd"))
# long
init::Union{AbstractArray,String} = "k-means++"::(_ isa AbstractArray || _ in ("k-means++", "random"))
end
@sku_transform KMeans
@sku_predict KMeans
@@ -217,6 +259,45 @@ K-Means algorithm: find K centroids corresponding to K clusters in the data.
"""
KMeans

# ============================================================================
const BisectingKMeans_ = skcl(:BisectingKMeans)
@sk_uns mutable struct BisectingKMeans <: MMI.Unsupervised
n_clusters::Int = 8::(_ ≥ 1)
n_init::Int = 1::(_ ≥ 1)
max_iter::Int = 300::(_ ≥ 1)
tol::Float64 = 1e-4::(_ > 0)
verbose::Int = 0::(_ ≥ 0)
random_state::Any = nothing
copy_x::Bool = true
algorithm::String = "lloyd"::(_ in ("elkane", "lloyd"))
# long
init::Union{AbstractArray,String} = "k-means++"::(_ isa AbstractArray || _ in ("k-means++", "random"))
bisecting_strategy::String = "biggest_inertia"::(_ in ("biggest_inertia", "largest_cluster"))
end
@sku_transform BisectingKMeans
# @sku_predict BisectingKMeans #TODO: Why does this fail?
function MMI.fitted_params(m::BisectingKMeans, f)
nc = pyconvert(Int, f.n_clusters)
catv = MMI.categorical(1:nc)
return (
cluster_centers = pyconvert(Array, f.cluster_centers_),
labels = catv[pyconvert(Array, f.labels_) .+ 1],
inertia = pyconvert(Float64, f.inertia_))
end
meta(BisectingKMeans,
input = Table(Continuous),
target = AbstractVector{Multiclass},
output = Table(Continuous),
weights = false)

"""
$(MMI.doc_header(BisectingKMeans))
Bisecting K-Means clustering: repeatedly splits one cluster in two with
ordinary K-Means until `n_clusters` clusters are obtained; the cluster
chosen for splitting is controlled by `bisecting_strategy`.
"""
BisectingKMeans
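Since `predict` is still disabled above (see the TODO), cluster assignments come from `fitted_params`, and `transform` maps points to center distances. A hedged sketch:

# Hypothetical usage sketch; transform follows the scikit-learn convention
# of returning distances from each point to each fitted center.
using MLJ

BisectingKMeans = @load BisectingKMeans pkg=MLJScikitLearnInterface
X = MLJ.table(randn(150, 3))

mach = machine(BisectingKMeans(n_clusters = 4), X)
fit!(mach)

W = transform(mach, X)         # table of point-to-center distances
fitted_params(mach).labels     # categorical cluster assignments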

# ============================================================================
const MiniBatchKMeans_ = skcl(:MiniBatchKMeans)
@sk_uns mutable struct MiniBatchKMeans <: MMI.Unsupervised
@@ -339,7 +420,7 @@ OPTICS
# ============================================================================
const SpectralClustering_ = skcl(:SpectralClustering)
@sk_uns mutable struct SpectralClustering <: MMI.Unsupervised
n_clusters::Int = 8::(_ ≥ 1)
eigen_solver::Option{String} = nothing::(_ === nothing || _ in ("arpack", "lobpcg", "amg"))
# n_components::Option{Int} = nothing::(_ === nothing || _ ≥ 1)
random_state::Any = nothing
@@ -378,11 +459,83 @@ SpectralClustering

# NOTE: the two models below are weird, not bothering with them for now
# # ============================================================================
# SpectralBiclustering_ = skcl(:SpectralBiclustering)
# const SpectralBiclustering_ = skcl(:SpectralBiclustering)
# @sk_uns mutable struct SpectralBiclustering <: MMI.Unsupervised
# n_clusters::Int = 3::(_ ≥ 1)
# method::String = "bistochastic"::(_ in ("bistochastic", "scale", "log"))
# n_components::Int = 6::(_ ≥ 1)
# n_best::Int = 3
# svd_method::String = "randomized"::(_ in ("arpack", "randomized"))
# n_svd_vecs::Option{Int} = nothing
# mini_batch::Bool = false
# init::Union{AbstractArray,String} = "k-means++"::(_ isa AbstractArray || _ in ("k-means++", "random"))
# n_init::Int = 10::(_ ≥ 1)
# random_state::Any = nothing
# end
#
# function MMI.fitted_params(m::SpectralBiclustering, f)
# return (
# rows = pyconvert(Array, f.rows_),
# columns = pyconvert(Array, f.columns_),
# row_labels = pyconvert(Array, f.row_labels_),
# column_labels = pyconvert(Array, f.column_labels_)
# )
# end
# meta(SpectralBiclustering,
# input = Table(Continuous),
# weights = false
# )

# """
# $(MMI.doc_header(SpectralBiclustering))

# Partitions rows and columns under the assumption that the data
# has an underlying checkerboard structure. For instance, if there
# are two row partitions and three column partitions, each row will
# belong to three biclusters, and each column will belong to two
# biclusters. The outer product of the corresponding row and column
# label vectors gives this checkerboard structure.

# """
# SpectralBiclustering

# # ============================================================================
# SpectralCoclustering_ = skcl(:SpectralCoclustering)
# const SpectralCoclustering_ = skcl(:SpectralCoclustering)
# @sk_uns mutable struct SpectralCoclustering <: MMI.Unsupervised
# n_clusters::Int = 3::(_ ≥ 1)
# svd_method::String = "randomized"::(_ in ("arpack", "randomized"))
# n_svd_vecs::Option{Int} = nothing
# mini_batch::Bool = false
# init::Union{AbstractArray,String} = "k-means++"::(_ isa AbstractArray || _ in ("k-means++", "random"))
# n_init::Int = 10::(_ ≥ 1)
# random_state::Any = nothing
# end
# function MMI.fitted_params(m::SpectralCoclustering, f)
# return (
# rows = pyconvert(Array, f.rows_),
# columns = pyconvert(Array, f.columns_),
# row_labels = pyconvert(Array, f.row_labels_),
# column_labels = pyconvert(Array, f.column_labels_),
# biclusters = Tuple(pyconvert(Array, i) for i in f.biclusters_)
# )
# end
# meta(SpectralCoclustering,
# input = Table(Continuous),
# weights = false
# )

# """
# $(MMI.doc_header(SpectralCoclustering))

# Clusters rows and columns of an array `X` to solve the
# relaxed normalized cut of the bipartite graph created
# from `X` as follows: the edge between row vertex `i` and
# column vertex `j` has weight `X[i, j]`.

# The resulting bicluster structure is block-diagonal, since
# each row and each column belongs to exactly one bicluster.

# Supports sparse matrices, as long as they are nonnegative.

# """
# SpectralCoclustering

29 changes: 15 additions & 14 deletions src/models/discriminant-analysis.jl
@@ -9,22 +9,23 @@ const BayesianLDA_ = skda(:LinearDiscriminantAnalysis)
covariance_estimator::Any = nothing
end
MMI.fitted_params(m::BayesianLDA, (f, _, _)) = (
coef = f.coef_,
intercept = f.intercept_,
covariance = m.store_covariance ? f.covariance_ : nothing,
means = f.means_,
priors = f.priors_,
scalings = f.scalings_,
xbar = f.xbar_,
classes = f.classes_,
explained_variance_ratio = f.explained_variance_ratio_
coef = pyconvert(Array, f.coef_),
intercept = pyconvert(Array, f.intercept_),
covariance = m.store_covariance ? pyconvert(Array, f.covariance_) : nothing,
explained_variance_ratio = pyconvert(Array, f.explained_variance_ratio_),
means = pyconvert(Array, f.means_),
priors = pyconvert(Array, f.priors_),
scalings = pyconvert(Array, f.scalings_),
xbar = pyconvert(Array, f.xbar_),
classes = pyconvert(Array, f.classes_)
)
meta(BayesianLDA,
input = Table(Continuous),
target = AbstractVector{<:Finite},
weights = false,
human_name = "Bayesian linear discriminant analysis"
)
@sk_feature_importances BayesianLDA

# ============================================================================
const BayesianQDA_ = skda(:QuadraticDiscriminantAnalysis)
@@ -35,11 +36,11 @@ const BayesianQDA_ = skda(:QuadraticDiscriminantAnalysis)
tol::Float64 = 1e-4::(_ > 0)
end
MMI.fitted_params(m::BayesianQDA, (f, _, _)) = (
covariance = m.store_covariance ? f.covariance_ : nothing,
means = f.means_,
priors = f.priors_,
rotations = f.rotations_,
scalings = f.scalings_
covariance = m.store_covariance ? pyconvert(Array, f.covariance_) : nothing,
means = pyconvert(Array, f.means_),
priors = pyconvert(Array, f.priors_),
rotations = pyconvert(Array, f.rotations_),
scalings = pyconvert(Array, f.scalings_),
)
meta(BayesianQDA,
input = Table(Continuous),
(diffs for the remaining changed files not loaded in this view)
