Merge pull request #61 from tylerjthomas9/new-models
New models, automatically convert more fit results, and support for feature importances
tylerjthomas9 authored Oct 11, 2023
2 parents f2395ea + 2afc496 commit 530ce27
Showing 20 changed files with 799 additions and 256 deletions.
9 changes: 7 additions & 2 deletions Project.toml
@@ -1,15 +1,20 @@
name = "MLJScikitLearnInterface"
uuid = "5ae90465-5518-4432-b9d2-8a1def2f0cab"
authors = ["Thibaut Lienart, Anthony Blaom"]
version = "0.5.0"
version = "0.6.0"

[deps]
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
MLJBase = "1"
MLJModelInterface = "1.4"
PythonCall = "0.9"
Tables = "1.10"
julia = "1.6"

[extras]
@@ -19,4 +24,4 @@ StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["StableRNGs", "MLJTestInterface", "Test", "MLJBase"]
test = ["MLJBase", "MLJTestInterface", "StableRNGs", "Test"]
3 changes: 3 additions & 0 deletions src/MLJScikitLearnInterface.jl
@@ -5,6 +5,8 @@ import MLJModelInterface:
@mlj_model, _process_model_def, _model_constructor, _model_cleaner,
Table, Continuous, Count, Finite, OrderedFactor, Multiclass, Unknown
const MMI = MLJModelInterface
using Statistics
import Tables

include("ScikitLearnAPI.jl")
const SK = ScikitLearnAPI
@@ -49,6 +51,7 @@ const CV = "with built-in cross-validation"

include("macros.jl")
include("meta.jl")
include("tables.jl")

include("models/linear-regressors.jl")
include("models/linear-regressors-multi.jl")
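The new `src/tables.jl` (its diff is not expanded in this view) backs the `get_column_names` helper used by the fit methods in `src/macros.jl` below. As a minimal, hypothetical sketch — not the commit's actual implementation — such a helper could be built on Tables.jl like this:

# Hypothetical sketch of get_column_names; the real src/tables.jl may differ.
import Tables

function get_column_names(X)
    # MLJ inputs are usually Tables.jl tables with named columns
    Tables.istable(X) && return collect(Tables.columnnames(Tables.columns(X)))
    # fall back to positional names for plain matrices
    return [Symbol(:x, i) for i in 1:size(X, 2)]
end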
53 changes: 47 additions & 6 deletions src/macros.jl
@@ -113,11 +113,12 @@ end
Called as part of [`@sk_reg`](@ref), returns the expression corresponding to the
`fit` method for a ScikitLearn regression model.
"""
function _skmodel_fit_reg(modelname, params)
function _skmodel_fit_reg(modelname, params, save_std::Bool=false)
expr = quote
function MMI.fit(model::$modelname, verbosity::Int, X, y)
# set X and y into a format that can be processed by sklearn
Xmatrix = MMI.matrix(X)
names = get_column_names(X)
yplain = y
targnames = nothing
# check if it's a multi-target regression case, in that case keep
@@ -149,8 +150,12 @@ function _skmodel_fit_reg(modelname, params)
X_py = ScikitLearnAPI.numpy.array(Xmatrix)
y_py = ScikitLearnAPI.numpy.array(yplain)
fitres = SK.fit!(skmodel, X_py, y_py)
# TODO: we may want to use the report later on
report = NamedTuple()
if ScikitLearnAPI.pyhasattr(fitres, "coef_")
column_std = std(Xmatrix, dims=1) |> vec
report = (; column_std, names)
else
report = (; names)
end
# the first nothing is so that we can use the same predict for
# regressors and classifiers
return ((fitres, nothing, targnames), nothing, report)
@@ -168,6 +173,7 @@ function _skmodel_fit_clf(modelname, params)
quote
function MMI.fit(model::$modelname, verbosity::Int, X, y)
Xmatrix = MMI.matrix(X)
names = get_column_names(X)
yplain = MMI.int(y)
# See _skmodel_fit_reg, same story
sksym, skmod, mdl = $(Symbol(modelname, "_"))
@@ -177,8 +183,13 @@ function _skmodel_fit_clf(modelname, params)
skmodel = skconstr(
$((Expr(:kw, p, :(model.$p)) for p in params)...))
fitres = SK.fit!(skmodel, Xmatrix, yplain)
# TODO: we may want to use the report later on
report = NamedTuple()
report = (; names)
if ScikitLearnAPI.pyhasattr(fitres, "coef_")
column_std = std(Xmatrix, dims=1) |> vec
report = (; column_std, names)
else
report = (; names)
end
# pass y[1] for decoding in predict method, first nothing
# is targnames
return ((fitres, y[1], nothing), nothing, report)
@@ -329,3 +340,33 @@ macro sku_predict(modelname)
end
end
end

# helpers to collapse coefficient arrays into one nonnegative importance per feature
function _coef_vec(coef::AbstractVector)
return abs.(coef)
end

function _coef_vec(coef::AbstractMatrix)
return mean(abs.(coef), dims=1) |> vec
end

"""
macro sk_feature_importances(modelname)
Adds a `feature_importances` method to a declared scikit-learn model
when the model supports them.
"""
macro sk_feature_importances(modelname)
quote
MMI.reports_feature_importances(::Type{<:$modelname}) = true
function MMI.feature_importances(m::$modelname, fitres, r)
params = MMI.fitted_params(m, fitres)
feature_importances = if haskey(params, :feature_importances)
params.feature_importances
else
_coef_vec(params.coef) .* r.column_std
end
result = [(r.names[i] => x) for (i, x) in enumerate(feature_importances)]
end
end
end
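For orientation, a hedged sketch of how these importances might surface through MLJ once a model opts in via `@sk_feature_importances` (`BayesianLDA` does so in the discriminant-analysis diff below); the data loader here is illustrative only:

# Hypothetical usage sketch; assumes this release is in the MLJ model registry.
using MLJ

BayesianLDA = @load BayesianLDA pkg=MLJScikitLearnInterface
X, y = @load_iris   # any Continuous-feature classification data works

mach = machine(BayesianLDA(), X, y)
fit!(mach)

# For coefficient-based models the macro computes abs.(coef) .* column_std,
# averaging |coef| over rows when coef_ is a matrix (one row per class).
feature_importances(mach)   # Vector of :feature_name => importance pairs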
163 changes: 158 additions & 5 deletions src/models/clustering.jl
@@ -137,6 +137,48 @@ data which contains clusters of similar density.
"""
DBSCAN

# ============================================================================
const HDBSCAN_ = skcl(:HDBSCAN)
@sk_uns mutable struct HDBSCAN <: MMI.Unsupervised
min_cluster_size::Int = 5::(_ > 0)
min_samples::Option{Int} = nothing
cluster_selection_epsilon::Float64 = 0.0::(_ ≥ 0)
max_cluster_size::Option{Int} = nothing
metric::String = "euclidean"::(_ in ("euclidean", "precomputed"))
alpha::Float64 = 1.0::(_ > 0)
algorithm::String = "auto"::(_ in ("auto", "brute", "kdtree", "balltree"))
leaf_size::Int = 40::(_ > 1)
cluster_selection_method::String = "eom"::(_ in ("eom", "leaf"))
allow_single_cluster::Bool = false
store_centers::Option{String} = nothing
end
function MMI.fitted_params(m::HDBSCAN, f)
# sklearn labels noise as -1 and clusters as 0, 1, ...; shifting by 2 turns
# the labels into valid indices into the categorical pool built below
labels = pyconvert(Array, f.labels_) .+ 2
nc = length(unique(labels))
catv = MMI.categorical([-1, (1:nc)...])
return (
labels = catv[labels],
probabilities = pyconvert(Array, f.probabilities_)
)
end
meta(HDBSCAN,
input = Table(Continuous),
weights = false,
)

"""
$(MMI.doc_header(HDBSCAN))
Hierarchical Density-Based Spatial Clustering of Applications with
Noise. Performs [`DBSCAN`](@ref) over varying epsilon values and
integrates the result to find a clustering that gives the best
stability over epsilon. This allows HDBSCAN to find clusters of
varying densities (unlike [`DBSCAN`](@ref)), and be more robust to
parameter selection.
"""
HDBSCAN
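A hedged usage sketch (assumes HDBSCAN is registered for `@load`; the data is illustrative):

# Hypothetical usage sketch; toy data, model name as declared above.
using MLJ

HDBSCAN = @load HDBSCAN pkg=MLJScikitLearnInterface
X = MLJ.table(randn(200, 2))

mach = machine(HDBSCAN(min_cluster_size = 10), X)
fit!(mach)

fp = fitted_params(mach)
fp.labels          # categorical cluster labels; level -1 marks noise points
fp.probabilities   # per-point cluster membership strengths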

# ============================================================================
const FeatureAgglomeration_ = skcl(:FeatureAgglomeration)
@sk_uns mutable struct FeatureAgglomeration <: MMI.Unsupervised
@@ -191,7 +233,7 @@ const KMeans_ = skcl(:KMeans)
copy_x::Bool = true
algorithm::String = "lloyd"::(_ in ("elkane", "lloyd"))
# long
init::Union{AbstractArray,String} = "k-means++"::(_ isa AbstractArray || _ in ("k-means++", "random"))
end
@sku_transform KMeans
@sku_predict KMeans
@@ -217,6 +259,45 @@ K-Means algorithm: find K centroids corresponding to K clusters in the data.
"""
KMeans

# ============================================================================
const BisectingKMeans_ = skcl(:BisectingKMeans)
@sk_uns mutable struct BisectingKMeans <: MMI.Unsupervised
n_clusters::Int = 8::(_ ≥ 1)
n_init::Int = 1::(_ ≥ 1)
max_iter::Int = 300::(_ ≥ 1)
tol::Float64 = 1e-4::(_ > 0)
verbose::Int = 0::(_ ≥ 0)
random_state::Any = nothing
copy_x::Bool = true
algorithm::String = "lloyd"::(_ in ("elkane", "lloyd"))
# long
init::Union{AbstractArray,String} = "k-means++"::(_ isa AbstractArray || _ in ("k-means++", "random"))
bisecting_strategy::String = "biggest_inertia"::(_ in ("biggest_inertia", "largest_cluster"))
end
@sku_transform BisectingKMeans
# @sku_predict BisectingKMeans #TODO: Why does this fail?
function MMI.fitted_params(m::BisectingKMeans, f)
nc = pyconvert(Int, f.n_clusters)
catv = MMI.categorical(1:nc)
return (
cluster_centers = pyconvert(Array, f.cluster_centers_),
labels = catv[pyconvert(Array, f.labels_) .+ 1],
inertia = pyconvert(Float64, f.inertia_))
end
meta(BisectingKMeans,
input = Table(Continuous),
target = AbstractVector{Multiclass},
output = Table(Continuous),
weights = false)

"""
$(MMI.doc_header(BisectingKMeans))
Bisecting K-Means clustering: repeatedly splits one cluster in two with
ordinary K-Means until `n_clusters` clusters are obtained; the cluster
chosen for splitting is controlled by `bisecting_strategy`.
"""
BisectingKMeans
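Since `predict` is still disabled above (see the TODO), cluster assignments come from `fitted_params`, and `transform` maps points to center distances. A hedged sketch:

# Hypothetical usage sketch; transform follows the scikit-learn convention
# of returning distances from each point to each fitted center.
using MLJ

BisectingKMeans = @load BisectingKMeans pkg=MLJScikitLearnInterface
X = MLJ.table(randn(150, 3))

mach = machine(BisectingKMeans(n_clusters = 4), X)
fit!(mach)

W = transform(mach, X)         # table of point-to-center distances
fitted_params(mach).labels     # categorical cluster assignments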

# ============================================================================
const MiniBatchKMeans_ = skcl(:MiniBatchKMeans)
@sk_uns mutable struct MiniBatchKMeans <: MMI.Unsupervised
@@ -339,7 +420,7 @@ OPTICS
# ============================================================================
const SpectralClustering_ = skcl(:SpectralClustering)
@sk_uns mutable struct SpectralClustering <: MMI.Unsupervised
n_clusters::Int = 8::(_ ≥ 1)
eigen_solver::Option{String} = nothing::(_ === nothing || _ in ("arpack", "lobpcg", "amg"))
# n_components::Option{Int} = nothing::(_ === nothing || _ ≥ 1)
random_state::Any = nothing
@@ -378,11 +459,83 @@ SpectralClustering

# NOTE: the two models below are weird, not bothering with them for now
# # ============================================================================
# SpectralBiclustering_ = skcl(:SpectralBiclustering)
# const SpectralBiclustering_ = skcl(:SpectralBiclustering)
# @sk_uns mutable struct SpectralBiclustering <: MMI.Unsupervised
# n_clusters::Int = 3::(_ ≥ 1)
# method::String = "bistochastic"::(_ in ("bistochastic", "scale", "log"))
# n_components::Int = 6::(_ ≥ 1)
# n_best::Int = 3
# svd_method::String = "randomized"::(_ in ("arpack", "randomized"))
# n_svd_vecs::Option{Int} = nothing
# mini_batch::Bool = false
# init::Union{AbstractArray,String} = "k-means++"::(_ isa AbstractArray || _ in ("k-means++", "random"))
# n_init::Int = 10::(_ ≥ 1)
# random_state::Any = nothing
# end
#
# function MMI.fitted_params(m::SpectralBiclustering, f)
# return (
# rows = pyconvert(Array, f.rows_),
# columns = pyconvert(Array, f.columns_),
# row_labels = pyconvert(Array, f.row_labels_),
# column_labels = pyconvert(Array, f.column_labels_)
# )
# end
# meta(SpectralBiclustering,
# input = Table(Continuous),
# weights = false
# )

# """
# $(MMI.doc_header(SpectralBiclustering))

# Partitions rows and columns under the assumption that the data
# has an underlying checkerboard structure. For instance, if there
# are two row partitions and three column partitions, each row will
# belong to three biclusters, and each column will belong to two
# biclusters. The outer product of the corresponding row and column
# label vectors gives this checkerboard structure.

# """
# SpectralBiclustering

# # ============================================================================
# SpectralCoclustering_ = skcl(:SpectralCoclustering)
# const SpectralCoclustering_ = skcl(:SpectralCoclustering)
# @sk_uns mutable struct SpectralCoclustering <: MMI.Unsupervised
# n_clusters::Int = 3::(_ ≥ 1)
# svd_method::String = "randomized"::(_ in ("arpack", "randomized"))
# n_svd_vecs::Option{Int} = nothing
# mini_batch::Bool = false
# init::Union{AbstractArray,String} = "k-means++"::(_ isa AbstractArray || _ in ("k-means++", "random"))
# n_init::Int = 10::(_ ≥ 1)
# random_state::Any = nothing
# end
# function MMI.fitted_params(m::SpectralCoclustering, f)
# return (
# rows = pyconvert(Array, f.rows_),
# columns = pyconvert(Array, f.columns_),
# row_labels = pyconvert(Array, f.row_labels_),
# column_labels = pyconvert(Array, f.column_labels_),
# biclusters = Tuple(pyconvert(Array, i) for i in f.biclusters_)
# )
# end
# meta(SpectralCoclustering,
# input = Table(Continuous),
# weights = false
# )

# """
# $(MMI.doc_header(SpectralCoclustering))

# Clusters rows and columns of an array `X` to solve the
# relaxed normalized cut of the bipartite graph created
# from `X` as follows: the edge between row vertex `i` and
# column vertex `j` has weight `X[i, j]`.

# The resulting bicluster structure is block-diagonal, since
# each row and each column belongs to exactly one bicluster.

# Supports sparse matrices, as long as they are nonnegative.

# """
# SpectralCoclustering

29 changes: 15 additions & 14 deletions src/models/discriminant-analysis.jl
@@ -9,22 +9,23 @@ const BayesianLDA_ = skda(:LinearDiscriminantAnalysis)
covariance_estimator::Any = nothing
end
MMI.fitted_params(m::BayesianLDA, (f, _, _)) = (
coef = f.coef_,
intercept = f.intercept_,
covariance = m.store_covariance ? f.covariance_ : nothing,
means = f.means_,
priors = f.priors_,
scalings = f.scalings_,
xbar = f.xbar_,
classes = f.classes_,
explained_variance_ratio = f.explained_variance_ratio_
coef = pyconvert(Array, f.coef_),
intercept = pyconvert(Array, f.intercept_),
covariance = m.store_covariance ? pyconvert(Array, f.covariance_) : nothing,
explained_variance_ratio = pyconvert(Array, f.explained_variance_ratio_),
means = pyconvert(Array, f.means_),
priors = pyconvert(Array, f.priors_),
scalings = pyconvert(Array, f.scalings_),
xbar = pyconvert(Array, f.xbar_),
classes = pyconvert(Array, f.classes_)
)
meta(BayesianLDA,
input = Table(Continuous),
target = AbstractVector{<:Finite},
weights = false,
human_name = "Bayesian linear discriminant analysis"
)
@sk_feature_importances BayesianLDA

# ============================================================================
const BayesianQDA_ = skda(:QuadraticDiscriminantAnalysis)
@@ -35,11 +36,11 @@ const BayesianQDA_ = skda(:QuadraticDiscriminantAnalysis)
tol::Float64 = 1e-4::(_ > 0)
end
MMI.fitted_params(m::BayesianQDA, (f, _, _)) = (
covariance = m.store_covariance ? f.covariance_ : nothing,
means = f.means_,
priors = f.priors_,
rotations = f.rotations_,
scalings = f.scalings_
covariance = m.store_covariance ? pyconvert(Array, f.covariance_) : nothing,
means = pyconvert(Array, f.means_),
priors = pyconvert(Array, f.priors_),
rotations = pyconvert(Array, f.rotations_),
scalings = pyconvert(Array, f.scalings_),
)
meta(BayesianQDA,
input = Table(Continuous),
(diffs for the remaining changed files not loaded in this view)
