From bc9fcaf18a0d3f1c972f6f5fc71f7a5b8ec7693d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlio=20Hoffimann?= Date: Mon, 4 Oct 2021 14:37:06 -0300 Subject: [PATCH 1/6] Add Clustering.DBSCAN to interface --- Project.toml | 8 ---- src/MLJClusteringInterface.jl | 88 ++++++++++++++++++++++++++++++++++- test/Project.toml | 7 +++ test/runtests.jl | 21 +++++---- 4 files changed, 105 insertions(+), 19 deletions(-) create mode 100644 test/Project.toml diff --git a/Project.toml b/Project.toml index 617dc3d..7cf658b 100644 --- a/Project.toml +++ b/Project.toml @@ -14,11 +14,3 @@ Distances = "0.9, 0.10" MLJModelInterface = "1.4" julia = "1.6" -[extras] -LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[targets] -test = ["LinearAlgebra", "MLJBase", "Random", "Test"] diff --git a/src/MLJClusteringInterface.jl b/src/MLJClusteringInterface.jl index 8bd084a..4aaedf8 100644 --- a/src/MLJClusteringInterface.jl +++ b/src/MLJClusteringInterface.jl @@ -13,10 +13,11 @@ import MLJModelInterface: Continuous, Count, Finite, Multiclass, Table, OrderedF @mlj_model, metadata_model, metadata_pkg using Distances +using NearestNeighbors # =================================================================== ## EXPORTS -export KMeans, KMedoids +export KMeans, KMedoids, DBSCAN # =================================================================== ## CONSTANTS @@ -124,6 +125,81 @@ function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew) return cluster_labels[pred] end +#### +#### DBSCAN +#### +""" +DBSCAN(; kwargs...) + +$DBSCANDescription + +$DBFields + +See also the +[package documentation](https://juliastats.org/Clustering.jl/stable/dbscan.html). +""" +@mlj_model mutable struct DBSCAN <: MMI.Unsupervised + radius::Real = 1.0::(_ > 0) + leafsize::Int = 20::(_ > 0) + min_neighbors::Int = 1::(_ > 0) + min_cluster_size::Int = 1::(_ > 0) +end + +function MMI.fit(model::DBSCAN, verbosity::Int, X) + Xarray = MMI.matrix(X, transpose=true) + clusters = Cl.dbscan(Xarray, model.radius; + leafsize=model.leafsize, + min_neighbors=model.min_neighbors, + min_cluster_size=model.min_cluster_size) + + # assignments and point types + npoints = size(Xarray, 2) + assignments = zeros(Int, npoints) + pointtypes = zeros(Int, npoints) + for (k, cluster) in enumerate(clusters) + for i in cluster.core_indices + assignments[i] = k + pointtypes[i] = 1 + end + for i in cluster.boundary_indices + assignments[i] = k + pointtypes[i] = 0 + end + end + + result = (Xarray, assignments, pointtypes) + cache = nothing + report = nothing + result, cache, report +end + +MMI.fitted_params(::DBSCAN, fitresult) = (assignments=fitresult[1][2], + pointtypes=fitresult[1][3]) + +function MMI.transform(::DBSCAN, fitresult, X) + # table with assignments in first column and + # point types in second column (core=1 vs. boundary=0) + _, assignments, pointtypes = fitresult[1] + X̃ = [assignments pointtypes] + MMI.table(X̃, prototype=X) +end + +function MMI.predict(::DBSCAN, fitresult, Xnew) + X1, assignments, _ = fitresult[1] + X2 = MMI.matrix(Xnew, transpose=true) + + labels = MMI.categorical(assignments) + + # construct KDtree with points in X1 + tree = KDTree(X1, Euclidean()) + + # find nearest neighbor of X2 in X1 + inds, _ = nn(tree, X2) + + # return assignment of nearest neighbor + labels[inds] +end + #### #### METADATA #### @@ -324,5 +400,13 @@ See also KMedoids -end # module +metadata_model( + DBSCAN, + input = MMI.Table(Continuous), + output = MMI.Table(Continuous), + weights = false, + descr = DBSCANDescription, + path = "$(PKG).DBSCAN" +) +end # module diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..b831062 --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,7 @@ +[deps] +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" +Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index cd2b3d0..b6f170c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,11 +4,9 @@ import LinearAlgebra: norm using MLJBase using MLJClusteringInterface -using Random:seed! +using Random: seed! using Test -const Dist = Distances - seed!(132442) X, y = @load_crabs @@ -16,7 +14,7 @@ X, y = @load_crabs #### KMEANS #### -@testset "Kmeans" begin +@testset "KMeans" begin barekm = KMeans() fitresult, cache, report = fit(barekm, 1, X) R = matrix(transform(barekm, fitresult, X)) @@ -28,25 +26,30 @@ X, y = @load_crabs p = predict(barekm, fitresult, X) @test argmin(R[1, :]) == p[1] @test argmin(R[10, :]) == p[10] - - end #### #### KMEDOIDS #### -@testset "Kmedoids" begin +@testset "KMedoids" begin barekm = KMedoids() fitresult, cache, report = fit(barekm, 1, X) X_array = matrix(X) R = matrix(transform(barekm, fitresult, X)) - @test R[1, 2] ≈ Dist.evaluate( + @test R[1, 2] ≈ Distances.evaluate( barekm.metric, view(X_array, 1, :), view(fitresult[1], :, 2) ) - @test R[10, 3] ≈ Dist.evaluate( + @test R[10, 3] ≈ Distances.evaluate( barekm.metric, view(X_array, 10, :), view(fitresult[1], :, 3) ) p = predict(barekm, fitresult, X) @test all(report.assignments .== p) end + +@testset "DBSCAN" begin + dbscan = DBSCAN() + fitresult = fit(dbscan, 1, X) + A = transform(dbscan, fitresult, X) + p = predict(dbscan, fitresult, X) +end \ No newline at end of file From 8bc74a793a4aaca042c13c1c39b31b96e9ee3ec1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlio=20Hoffimann?= Date: Tue, 5 Oct 2021 13:46:47 -0300 Subject: [PATCH 2/6] Add tests for DBSCAN --- Project.toml | 1 - test/runtests.jl | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/Project.toml b/Project.toml index 7cf658b..2b23a07 100644 --- a/Project.toml +++ b/Project.toml @@ -13,4 +13,3 @@ Clustering = "0.14" Distances = "0.9, 0.10" MLJModelInterface = "1.4" julia = "1.6" - diff --git a/test/runtests.jl b/test/runtests.jl index b6f170c..a16f269 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -48,8 +48,39 @@ end end @testset "DBSCAN" begin - dbscan = DBSCAN() - fitresult = fit(dbscan, 1, X) - A = transform(dbscan, fitresult, X) - p = predict(dbscan, fitresult, X) + # five spot pattern + X = [ + 0.0 0.0 + 1.0 0.0 + 1.0 1.0 + 0.0 1.0 + 0.5 0.5 + ] + + # radius < √2 ==> 5 clusters + dbscan = DBSCAN(radius=0.1) + fitresult = fit(dbscan, 1, X) + A = transform(dbscan, fitresult, X) + p = predict(dbscan, fitresult, X) + @test size(matrix(A)) == (5, 2) + @test A.x2 == [0,0,0,0,0] + @test Set(p) == Set(unique(p)) + + # radius > √2 ==> 1 cluster + dbscan = DBSCAN(radius=√2+eps()) + fitresult = fit(dbscan, 1, X) + A = transform(dbscan, fitresult, X) + p = predict(dbscan, fitresult, X) + @test size(matrix(A)) == (5, 2) + @test A.x2 == [1,1,1,1,1] + @test unique(p) == [1] + + # radius < √2 && min_cluster_size = 2 ==> all points are noise + dbscan = DBSCAN(radius=0.1, min_cluster_size=2) + fitresult = fit(dbscan, 1, X) + A = transform(dbscan, fitresult, X) + p = predict(dbscan, fitresult, X) + @test size(matrix(A)) == (5, 2) + @test A.x2 == [0,0,0,0,0] + @test unique(p) == [0] end \ No newline at end of file From 51243ac4f88238e85e22820ba99e68eb7245c223 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 23 Aug 2022 17:03:23 +1200 Subject: [PATCH 3/6] re-implement DBSCAN as a Static transformer with report --- src/MLJClusteringInterface.jl | 117 ++++++++++++---------------------- test/Project.toml | 1 + test/runtests.jl | 79 ++++++++++++++--------- 3 files changed, 92 insertions(+), 105 deletions(-) diff --git a/src/MLJClusteringInterface.jl b/src/MLJClusteringInterface.jl index 4aaedf8..45bbe95 100644 --- a/src/MLJClusteringInterface.jl +++ b/src/MLJClusteringInterface.jl @@ -13,7 +13,6 @@ import MLJModelInterface: Continuous, Count, Finite, Multiclass, Table, OrderedF @mlj_model, metadata_model, metadata_pkg using Distances -using NearestNeighbors # =================================================================== ## EXPORTS @@ -26,19 +25,14 @@ const MMI = MLJModelInterface const Cl = Clustering const PKG = "MLJClusteringInterface" -#### -#### KMeans -#### + +# # K_MEANS @mlj_model mutable struct KMeans <: MMI.Unsupervised k::Int = 3::(_ ≥ 2) metric::SemiMetric = SqEuclidean() end -#### -#### KMeans -#### - function MMI.fit(model::KMeans, verbosity::Int, X) # NOTE: using transpose here to get a LinearAlgebra.Transpose object # which Kmeans can handle. @@ -67,6 +61,8 @@ function MMI.transform(model::KMeans, fitresult, X) return MMI.table(X̃, prototype=X) end +# # K_MEDOIDS + @mlj_model mutable struct KMedoids <: MMI.Unsupervised k::Int = 3::(_ ≥ 2) metric::SemiMetric = SqEuclidean() @@ -101,9 +97,8 @@ function MMI.transform(model::KMedoids, fitresult, X) return MMI.table(X̃, prototype=X) end -#### -#### Predict methods -#### + +# # PREDICT FOR K_MEANS AND K_MEDOIDS function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew) locations, cluster_labels = fitresult @@ -125,84 +120,55 @@ function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew) return cluster_labels[pred] end -#### -#### DBSCAN -#### -""" -DBSCAN(; kwargs...) +# # DBSCAN -$DBSCANDescription - -$DBFields - -See also the -[package documentation](https://juliastats.org/Clustering.jl/stable/dbscan.html). -""" -@mlj_model mutable struct DBSCAN <: MMI.Unsupervised +@mlj_model mutable struct DBSCAN <: MMI.Static radius::Real = 1.0::(_ > 0) leafsize::Int = 20::(_ > 0) min_neighbors::Int = 1::(_ > 0) min_cluster_size::Int = 1::(_ > 0) end -function MMI.fit(model::DBSCAN, verbosity::Int, X) - Xarray = MMI.matrix(X, transpose=true) - clusters = Cl.dbscan(Xarray, model.radius; - leafsize=model.leafsize, - min_neighbors=model.min_neighbors, - min_cluster_size=model.min_cluster_size) +# As DBSCAN is `Static`, there is no `fit` to implement. + + +function MMI.predict(model::DBSCAN, ::Nothing, X) + + Xarray = MMI.matrix(X)' + + # output of core algorithm: + clusters = Cl.dbscan( + Xarray, model.radius; + leafsize=model.leafsize, + min_neighbors=model.min_neighbors, + min_cluster_size=model.min_cluster_size, + ) + nclusters = length(clusters) # assignments and point types npoints = size(Xarray, 2) assignments = zeros(Int, npoints) - pointtypes = zeros(Int, npoints) + point_types = fill(-1, npoints) for (k, cluster) in enumerate(clusters) for i in cluster.core_indices assignments[i] = k - pointtypes[i] = 1 + point_types[i] = 1 end for i in cluster.boundary_indices assignments[i] = k - pointtypes[i] = 0 + point_types[i] = 0 end end - result = (Xarray, assignments, pointtypes) - cache = nothing - report = nothing - result, cache, report + yhat = MMI.categorical(assignments) + report = (; point_types, nclusters, clusters) + return yhat, report end -MMI.fitted_params(::DBSCAN, fitresult) = (assignments=fitresult[1][2], - pointtypes=fitresult[1][3]) +MMI.reporting_operations(::Type{<:DBSCAN}) = (:predict,) -function MMI.transform(::DBSCAN, fitresult, X) - # table with assignments in first column and - # point types in second column (core=1 vs. boundary=0) - _, assignments, pointtypes = fitresult[1] - X̃ = [assignments pointtypes] - MMI.table(X̃, prototype=X) -end -function MMI.predict(::DBSCAN, fitresult, Xnew) - X1, assignments, _ = fitresult[1] - X2 = MMI.matrix(Xnew, transpose=true) - - labels = MMI.categorical(assignments) - - # construct KDtree with points in X1 - tree = KDTree(X1, Euclidean()) - - # find nearest neighbor of X2 in X1 - inds, _ = nn(tree, X2) - - # return assignment of nearest neighbor - labels[inds] -end - -#### -#### METADATA -#### +# # METADATA metadata_pkg.( (KMeans, KMedoids), @@ -219,7 +185,6 @@ metadata_model( human_name = "K-means clusterer", input = MMI.Table(Continuous), output = MMI.Table(Continuous), - weights = false, path = "$(PKG).KMeans" ) @@ -228,9 +193,18 @@ metadata_model( human_name = "K-medoids clusterer", input = MMI.Table(Continuous), output = MMI.Table(Continuous), - weights = false, path = "$(PKG).KMedoids" ) + +metadata_model( + DBSCAN, + human_name = "DBSCAN clusterer (density-based spatial clustering of "* + "applications with noise)", + input = MMI.Table(Continuous), + path = "$(PKG).DBSCAN" +) + + """ $(MMI.doc_header(KMeans)) @@ -400,13 +374,4 @@ See also KMedoids -metadata_model( - DBSCAN, - input = MMI.Table(Continuous), - output = MMI.Table(Continuous), - weights = false, - descr = DBSCANDescription, - path = "$(PKG).DBSCAN" -) - end # module diff --git a/test/Project.toml b/test/Project.toml index b831062..6e71f0c 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -3,5 +3,6 @@ Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" +MLJTestIntegration = "697918b4-fdc1-4f9e-8ff9-929724cee270" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/runtests.jl b/test/runtests.jl index a16f269..9c44aa0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,6 +3,7 @@ import Distances import LinearAlgebra: norm using MLJBase +using MLJTestIntegration using MLJClusteringInterface using Random: seed! using Test @@ -10,9 +11,8 @@ using Test seed!(132442) X, y = @load_crabs -#### -#### KMEANS -#### + +# # K_MEANS @testset "KMeans" begin barekm = KMeans() @@ -28,9 +28,8 @@ X, y = @load_crabs @test argmin(R[10, :]) == p[10] end -#### -#### KMEDOIDS -#### + +# # K_MEDOIDS @testset "KMedoids" begin barekm = KMedoids() @@ -47,7 +46,11 @@ end @test all(report.assignments .== p) end + +# # DBSCAN + @testset "DBSCAN" begin + # five spot pattern X = [ 0.0 0.0 @@ -55,32 +58,50 @@ end 1.0 1.0 0.0 1.0 0.5 0.5 - ] + ] |> MLJBase.table # radius < √2 ==> 5 clusters - dbscan = DBSCAN(radius=0.1) - fitresult = fit(dbscan, 1, X) - A = transform(dbscan, fitresult, X) - p = predict(dbscan, fitresult, X) - @test size(matrix(A)) == (5, 2) - @test A.x2 == [0,0,0,0,0] - @test Set(p) == Set(unique(p)) + dbscan = DBSCAN(radius=0.1) + yhat1, report1 = predict(dbscan, nothing, X) + @test report1.nclusters == 5 + @test report1.point_types == [0,0,0,0,0] + @test Set(yhat1) == Set(unique(yhat1)) + + # DbscanCluster fields: + @test propertynames(report1.clusters[1]) == (:size, :core_indices, :boundary_indices) # radius > √2 ==> 1 cluster - dbscan = DBSCAN(radius=√2+eps()) - fitresult = fit(dbscan, 1, X) - A = transform(dbscan, fitresult, X) - p = predict(dbscan, fitresult, X) - @test size(matrix(A)) == (5, 2) - @test A.x2 == [1,1,1,1,1] - @test unique(p) == [1] + dbscan = DBSCAN(radius=√2+eps()) + yhat, report = predict(dbscan, nothing, X) + @test report.nclusters == 1 + @test report.point_types == [1,1,1,1,1] + @test length(unique(yhat)) == 1 # radius < √2 && min_cluster_size = 2 ==> all points are noise - dbscan = DBSCAN(radius=0.1, min_cluster_size=2) - fitresult = fit(dbscan, 1, X) - A = transform(dbscan, fitresult, X) - p = predict(dbscan, fitresult, X) - @test size(matrix(A)) == (5, 2) - @test A.x2 == [0,0,0,0,0] - @test unique(p) == [0] -end \ No newline at end of file + dbscan = DBSCAN(radius=0.1, min_cluster_size=2) + yhat, report = predict(dbscan, nothing, X) + @test report.nclusters == 0 + @test report.point_types == [-1,-1,-1,-1,-1] + @test length(unique(yhat)) == 1 + + # MLJ integration: + model = DBSCAN(radius=0.1) + mach = machine(model) # no training data + yhat = predict(mach, X) + @test yhat == yhat1 + @test MLJBase.report(mach).point_types == report1.point_types + @test MLJBase.report(mach).nclusters == report1.nclusters + +end + +@testset "MLJ interface" begin + models = [KMeans, KMedoids, DBSCAN] + failures, summary = MLJTestIntegration.test( + models, + X; + mod=@__MODULE__, + verbosity=0, + throw=false, # set to true to debug + ) + @test isempty(failures) +end From e283bc7ba726a75aae0956b82bd96e1f00c6618f Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 25 Aug 2022 09:30:56 +1200 Subject: [PATCH 4/6] change point_type labels; add doc-string --- src/MLJClusteringInterface.jl | 113 ++++++++++++++++++++++++++++++++-- test/runtests.jl | 7 ++- 2 files changed, 112 insertions(+), 8 deletions(-) diff --git a/src/MLJClusteringInterface.jl b/src/MLJClusteringInterface.jl index 45bbe95..ca66beb 100644 --- a/src/MLJClusteringInterface.jl +++ b/src/MLJClusteringInterface.jl @@ -131,7 +131,6 @@ end # As DBSCAN is `Static`, there is no `fit` to implement. - function MMI.predict(model::DBSCAN, ::Nothing, X) Xarray = MMI.matrix(X)' @@ -148,20 +147,22 @@ function MMI.predict(model::DBSCAN, ::Nothing, X) # assignments and point types npoints = size(Xarray, 2) assignments = zeros(Int, npoints) - point_types = fill(-1, npoints) + raw_point_types = fill('N', npoints) for (k, cluster) in enumerate(clusters) for i in cluster.core_indices assignments[i] = k - point_types[i] = 1 + raw_point_types[i] = 'C' end for i in cluster.boundary_indices assignments[i] = k - point_types[i] = 0 + raw_point_types[i] = 'B' end end + point_types = MMI.categorical(raw_point_types) + cluster_labels = unique(assignments) yhat = MMI.categorical(assignments) - report = (; point_types, nclusters, clusters) + report = (; point_types, nclusters, cluster_labels, clusters) return yhat, report end @@ -373,5 +374,107 @@ See also """ KMedoids +""" +$(MMI.doc_header(DBSCAN)) + +[DBSCAN](https://en.wikipedia.org/wiki/DBSCAN) is a clustering algorithm that groups +together points that are closely packed together (points with many nearby neighbors), +marking as outliers points that lie alone in low-density regions (whose nearest neighbors +are too far away). More information is available at the [Clustering.jl +documentation](https://juliastats.org/Clustering.jl/stable/index.html). Use `predict` to +get cluster assignments. Point types - core, boundary or noise - are accessed from the +machine report (see below). + +This is a static implementation, i.e., it does not generalize to new data instances, and +there is no training data. For clusterers that do generalize, see [`KMeans`](@ref) or +[`KMedoids`](@ref). + +In MLJ or MLJBase, create a machine with + + mach = machine(model) + +# Hyper-parameters + +- `radius=1.0`: query radius. + +- `leafsize=20`: number of points binned in each leaf node of the nearest neighbor k-d + tree. + +- `min_neighbors=1`: minimum number of a core point neighbors. + +- `min_cluster_size=1`: minimum number of points in a valid cluster. + + +# Operations + +- `predict(mach, X)`: return cluster label assignments, as an unordered + `CategoricalVector`. Here `X` is any table of input features (eg, a `DataFrame`) whose + columns are of scitype `Continuous`; check column scitypes with `schema(X)`. Note that + points of type `noise` will always get a label of `0`. + + +# Report + +After calling `predict(mach)`, the fields of `report(mach)` are: + +- `point_types`: A `CategoricalVector` with the DBSCAN point type classification, one + element per row of `X`. Elements are either `'C'`" (core), `'B'` (boundary), or `'N'` + (noise). + +- `nclusters`: The number of clusters (excluding the noise "cluster") + +- `cluster_labels`: The unique list of cluster labels + +- `clusters`: A vector of `Clustering.DbscanCluster` objects from Clustering.jl, which + have these fields: + + - `size`: number of points in a cluster (core + boundary) + + - `core_indices`: indices of points in the cluster core + + - `boundary_indices`: indices of points on the cluster boundary + + +# Examples + +``` +using MLJ + +X, labels = make_moons(400, noise=0.09, rng=1) # synthetic data with 2 clusters; X +y = map(labels) do label + label == 0 ? "cookie" : "monster" +end; +y = coerce(y, Multiclass); + +DBSCAN = @load DBSCAN pkg=Clustering +model = DBSCAN(radius=0.13, min_cluster_size=5) +mach = machine(model) + +# compute and output cluster assignments for observations in `X`: +yhat = predict(mach, X) + +# get DBSCAN point types: +report(mach).point_types +report(mach).nclusters + +# compare cluster labels with actual labels: +compare = zip(yhat, y) |> collect; +compare[1:10] # clusters align with classes + +# visualize clusters, noise in red: +points = zip(X.x1, X.x2) |> collect +colors = map(yhat) do i + i == 0 ? :red : + i == 1 ? :blue : + i == 2 ? :green : + i == 3 ? :yellow : + :black +end +using Plots +scatter(points, color=colors) +``` + +""" +DBSCAN end # module diff --git a/test/runtests.jl b/test/runtests.jl index 9c44aa0..e4d7a97 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -64,8 +64,9 @@ end dbscan = DBSCAN(radius=0.1) yhat1, report1 = predict(dbscan, nothing, X) @test report1.nclusters == 5 - @test report1.point_types == [0,0,0,0,0] + @test report1.point_types == fill('B', 5) @test Set(yhat1) == Set(unique(yhat1)) + @test Set(report1.cluster_labels) == Set(unique(yhat1)) # DbscanCluster fields: @test propertynames(report1.clusters[1]) == (:size, :core_indices, :boundary_indices) @@ -74,14 +75,14 @@ end dbscan = DBSCAN(radius=√2+eps()) yhat, report = predict(dbscan, nothing, X) @test report.nclusters == 1 - @test report.point_types == [1,1,1,1,1] + @test report.point_types == fill('C', 5) @test length(unique(yhat)) == 1 # radius < √2 && min_cluster_size = 2 ==> all points are noise dbscan = DBSCAN(radius=0.1, min_cluster_size=2) yhat, report = predict(dbscan, nothing, X) @test report.nclusters == 0 - @test report.point_types == [-1,-1,-1,-1,-1] + @test report.point_types == fill('N', 5) @test length(unique(yhat)) == 1 # MLJ integration: From 63e60ec3c688d5ea9e3fe140fc02590d6888f22f Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 25 Aug 2022 09:42:22 +1200 Subject: [PATCH 5/6] add a test compat MLJTestIntegration = "0.2.2" --- test/Project.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/Project.toml b/test/Project.toml index 6e71f0c..2ca2a1a 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -6,3 +6,6 @@ MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" MLJTestIntegration = "697918b4-fdc1-4f9e-8ff9-929724cee270" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[compat] +MLJTestIntegration = "0.2.2" \ No newline at end of file From 85cfa5c9fb6f074fdd32911e5a7c45bf1a82e8aa Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Thu, 25 Aug 2022 10:12:15 +1200 Subject: [PATCH 6/6] minor doc-string fix; include DBSCAN in pkg metadata declaration --- src/MLJClusteringInterface.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/MLJClusteringInterface.jl b/src/MLJClusteringInterface.jl index ca66beb..68eae62 100644 --- a/src/MLJClusteringInterface.jl +++ b/src/MLJClusteringInterface.jl @@ -172,7 +172,7 @@ MMI.reporting_operations(::Type{<:DBSCAN}) = (:predict,) # # METADATA metadata_pkg.( - (KMeans, KMedoids), + (KMeans, KMedoids, DBSCAN), name="Clustering", uuid="aaaa29a8-35af-508c-8bc3-b662a17a0fe5", url="https://github.com/JuliaStats/Clustering.jl", @@ -418,7 +418,7 @@ In MLJ or MLJBase, create a machine with After calling `predict(mach)`, the fields of `report(mach)` are: - `point_types`: A `CategoricalVector` with the DBSCAN point type classification, one - element per row of `X`. Elements are either `'C'`" (core), `'B'` (boundary), or `'N'` + element per row of `X`. Elements are either `'C'` (core), `'B'` (boundary), or `'N'` (noise). - `nclusters`: The number of clusters (excluding the noise "cluster")