diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3713c1b..361c6c9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -15,7 +15,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
+          - '1.10'
           - '1'
         os:
           - ubuntu-latest
diff --git a/Project.toml b/Project.toml
index 56b46d6..9f813f0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,10 +6,14 @@ version = "0.1.11"
 [deps]
 Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
 Clustering = "0.15"
 Distances = "0.9, 0.10"
+LinearAlgebra = "1"
 MLJModelInterface = "1.4"
-julia = "1.6"
+StatsBase = "0.34"
+julia = "1.10"
diff --git a/src/MLJClusteringInterface.jl b/src/MLJClusteringInterface.jl
index 4e9b17e..3dc83e0 100644
--- a/src/MLJClusteringInterface.jl
+++ b/src/MLJClusteringInterface.jl
@@ -13,10 +13,12 @@ import MLJModelInterface: Continuous, Count, Finite, Multiclass, Table, OrderedF
     @mlj_model, metadata_model, metadata_pkg
 
 using Distances
+using LinearAlgebra
+using StatsBase
 
 # ===================================================================
 ## EXPORTS
-export KMeans, KMedoids, DBSCAN, HierarchicalClustering
+export KMeans, KMedoids, AffinityPropagation, DBSCAN, HierarchicalClustering
 
 # ===================================================================
 ## CONSTANTS
@@ -95,7 +97,6 @@ function MMI.transform(model::KMedoids, fitresult, X)
     return MMI.table(X̃, prototype=X)
 end
 
-
 # # PREDICT FOR K_MEANS AND K_MEDOIDS
 
 function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
@@ -208,10 +209,66 @@ end
 
 MMI.reporting_operations(::Type{<:HierarchicalClustering}) = (:predict,)
 
+# # AFFINITY_PROPAGATION
+
+@mlj_model mutable struct AffinityPropagation <: MMI.Static
+    damp::Float64 = 0.5::(0.0 ≤ _ < 1.0)
+    maxiter::Int = 200::(_ > 0)
+    tol::Float64 = 1e-6::(_ > 0)
+    preference::Union{Nothing,Float64} = nothing
+    metric::SemiMetric = SqEuclidean()
+end
+
+function MMI.predict(model::AffinityPropagation, ::Nothing, X)
+    Xarray = MMI.matrix(X)'
+
+    # Compute similarity matrix using negative pairwise distances
+    S = -pairwise(model.metric, Xarray, dims=2)
+
+    diagonal_element = if !isnothing(model.preference)
+        model.preference
+    else
+        # Use the median of all pairwise similarities, that is, the values above
+        # the diagonal.
+        # This default choice is mentioned in the algorithm's Wikipedia article.
+        iuppertri = triu!(trues(size(S)), 1)
+        median(S[iuppertri])
+    end
+
+    fill!(view(S, diagind(S)), diagonal_element)
+
+    result = Cl.affinityprop(
+        S,
+        maxiter=model.maxiter,
+        tol=model.tol,
+        damp=model.damp
+    )
+
+    # Get number of clusters and labels
+    exemplars = result.exemplars
+    k = length(exemplars)
+    cluster_labels = MMI.categorical(1:k)
+
+    # Store exemplar points as centers (similar to KMeans/KMedoids)
+    centers = view(Xarray, :, exemplars)
+
+    report = (
+        exemplars=exemplars,
+        centers=centers,
+        cluster_labels=cluster_labels,
+        iterations=result.iterations,
+        converged=result.converged
+    )
+
+    return MMI.categorical(result.assignments), report
+end
+
+MMI.reporting_operations(::Type{<:AffinityPropagation}) = (:predict,)
+
 # # METADATA
 
 metadata_pkg.(
-    (KMeans, KMedoids, DBSCAN, HierarchicalClustering),
+    (KMeans, KMedoids, DBSCAN, HierarchicalClustering, AffinityPropagation),
     name="Clustering",
     uuid="aaaa29a8-35af-508c-8bc3-b662a17a0fe5",
     url="https://github.com/JuliaStats/Clustering.jl",
@@ -251,6 +308,13 @@ metadata_model(
     path = "$(PKG).HierarchicalClustering"
 )
 
+metadata_model(
+    AffinityPropagation,
+    human_name = "Affinity Propagation clusterer",
+    input_scitype = MMI.Table(Continuous),
+    path = "$(PKG).AffinityPropagation"
+)
+
 """
 $(MMI.doc_header(KMeans))
 
@@ -614,4 +678,73 @@ report(mach).cutter(h = 2.5)
 """
 HierarchicalClustering
 
+"""
+$(MMI.doc_header(AffinityPropagation))
+
+[Affinity Propagation](https://en.wikipedia.org/wiki/Affinity_propagation) is a clustering algorithm based on the concept of "message passing" between data points. More information is available at the [Clustering.jl documentation](https://juliastats.org/Clustering.jl/stable/index.html). Use `predict` to get cluster assignments. Indices of the exemplars, their values, etc., are accessed from the machine report (see below).
+
+This is a static implementation, i.e., it does not generalize to new data instances, and
+there is no training data. For clusterers that do generalize, see [`KMeans`](@ref) or
+[`KMedoids`](@ref).
+
+In MLJ or MLJBase, create a machine with
+
+    mach = machine(model)
+
+# Hyper-parameters
+
+- `damp = 0.5`: damping factor, in the range `[0, 1)`
+
+- `maxiter = 200`: maximum number of iterations
+
+- `tol = 1e-6`: tolerance for convergence
+
+- `preference = nothing`: the (single float) value used for the diagonal elements of the similarity matrix. If unspecified, the median of all pairwise (negative) similarities is used, as mentioned [here](https://en.wikipedia.org/wiki/Affinity_propagation#Algorithm)
+
+- `metric = Distances.SqEuclidean()`: the metric used to compute pairwise distances (see `Distances.jl` for available metrics)
+
+# Operations
+
+- `predict(mach, X)`: return cluster label assignments, as an unordered
+  `CategoricalVector`. Here `X` is any table of input features (eg, a `DataFrame`) whose
+  columns are of scitype `Continuous`; check column scitypes with `schema(X)`.
+
+# Report
+
+After calling `predict(mach, X)`, the fields of `report(mach)` are:
+
+- `exemplars`: indices of the observations in `X` picked as exemplars
+
+- `centers`: positions of the exemplars in the feature space
+
+- `cluster_labels`: the distinct cluster labels, one for each exemplar
+
+- `iterations`: the number of iterations run by the algorithm
+
+- `converged`: whether or not the algorithm converged within the maximum number of iterations
+
+# Examples
+
+```
+using MLJ
+
+X, labels = make_moons(400, noise=0.9, rng=1)
+
+AffinityPropagation = @load AffinityPropagation pkg=Clustering
+model = AffinityPropagation(preference=-10.0)
+mach = machine(model)
+
+# compute and output cluster assignments for observations in `X`:
+yhat = predict(mach, X)
+
+# get the positions of the exemplars:
+report(mach).centers
+
+# plot the clustering result:
+using GLMakie
+scatter(MLJ.matrix(X)', color=yhat.refs)
+```
+"""
+AffinityPropagation
+
 end # module
diff --git a/test/runtests.jl b/test/runtests.jl
index b0bc903..b42088e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -150,8 +150,40 @@ end
     @test report(mach).dendrogram.heights == dendro.heights
 end
 
+# # AffinityPropagation
+
+@testset "AffinityPropagation" begin
+    X = table(stack(Iterators.partition(0.5:0.5:20, 5))')
+
+    # Test case 1: preference == median (negative) similarity (i.e. unspecified)
+    mach = machine(AffinityPropagation())
+
+    yhat = predict(mach, X)
+    @test yhat == [1, 1, 1, 1, 2, 2, 2, 2]
+
+    _report = report(mach)
+    @test _report.exemplars == [2, 7]
+    @test _report.centers == [3.0 15.5; 3.5 16.0; 4.0 16.5; 4.5 17.0; 5.0 17.5]
+    @test _report.cluster_labels == [1, 2]
+    @test _report.iterations == 50
+    @test _report.converged == true
+
+    # Test case 2: |preference| too large
+    mach2 = machine(AffinityPropagation(preference=-20.0))
+
+    yhat = predict(mach2, X)
+    @test yhat == [1, 2, 3, 4, 5, 6, 7, 8]
+
+    _report = report(mach2)
+    @test _report.exemplars == [1, 2, 3, 4, 5, 6, 7, 8]
+    @test _report.centers == matrix(X)'
+    @test _report.cluster_labels == [1, 2, 3, 4, 5, 6, 7, 8]
+    @test _report.iterations == 32
+    @test _report.converged == true
+end
+
 @testset "MLJ interface" begin
-    models = [KMeans, KMedoids, DBSCAN, HierarchicalClustering]
+    models = [KMeans, KMedoids, DBSCAN, HierarchicalClustering, AffinityPropagation]
     failures, summary = MLJTestInterface.test(
         models,
         X;
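For reference, the default-preference logic introduced in the new `MMI.predict` method can be exercised directly against Clustering.jl, outside of MLJ. The sketch below is illustrative only: the toy matrix `X` is an assumption (chosen to match the observations used in the new test set), while the calls to `pairwise`, `triu!`, `median`, `diagind`, and `Clustering.affinityprop` mirror the patch with the model's default keyword values.

```
using Distances, LinearAlgebra, StatsBase
import Clustering

# Toy data: 5 features × 8 observations (columns are observations),
# the same shape used in the new test set.
X = reshape(collect(0.5:0.5:20), 5, 8)

# Similarities are negative pairwise distances, as in the new predict method.
S = -pairwise(SqEuclidean(), X, dims=2)

# Default preference: median of the similarities above the diagonal.
iuppertri = triu!(trues(size(S)), 1)
preference = median(S[iuppertri])
fill!(view(S, diagind(S)), preference)

result = Clustering.affinityprop(S, maxiter=200, tol=1e-6, damp=0.5)
result.assignments   # cluster label for each observation
result.exemplars     # indices of the exemplar observations
```

With this particular toy matrix the call should reproduce the two-cluster result asserted in test case 1 above (exemplars `[2, 7]`).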