From bc9fcaf18a0d3f1c972f6f5fc71f7a5b8ec7693d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=BAlio=20Hoffimann?= <julio.hoffimann@gmail.com>
Date: Mon, 4 Oct 2021 14:37:06 -0300
Subject: [PATCH 1/6] Add Clustering.DBSCAN to interface

---
 Project.toml                  |  8 ----
 src/MLJClusteringInterface.jl | 88 ++++++++++++++++++++++++++++++++++-
 test/Project.toml             |  7 +++
 test/runtests.jl              | 21 +++++----
 4 files changed, 105 insertions(+), 19 deletions(-)
 create mode 100644 test/Project.toml

diff --git a/Project.toml b/Project.toml
index 617dc3d..7cf658b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,11 +14,3 @@ Distances = "0.9, 0.10"
 MLJModelInterface = "1.4"
 julia = "1.6"
 
-[extras]
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
-Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-
-[targets]
-test = ["LinearAlgebra", "MLJBase", "Random", "Test"]
diff --git a/src/MLJClusteringInterface.jl b/src/MLJClusteringInterface.jl
index 8bd084a..4aaedf8 100644
--- a/src/MLJClusteringInterface.jl
+++ b/src/MLJClusteringInterface.jl
@@ -13,10 +13,11 @@ import MLJModelInterface: Continuous, Count, Finite, Multiclass, Table, OrderedF
     @mlj_model, metadata_model, metadata_pkg
 
 using Distances
+using NearestNeighbors
 
 # ===================================================================
 ## EXPORTS
-export KMeans, KMedoids
+export KMeans, KMedoids, DBSCAN
 
 # ===================================================================
 ## CONSTANTS
@@ -124,6 +125,81 @@ function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
     return cluster_labels[pred]
 end
 
+####
+#### DBSCAN
+####
+"""
+DBSCAN(; kwargs...)
+
+$DBSCANDescription
+
+$DBFields
+
+See also the
+[package documentation](https://juliastats.org/Clustering.jl/stable/dbscan.html).
+"""
+@mlj_model mutable struct DBSCAN <: MMI.Unsupervised
+    radius::Real = 1.0::(_ > 0)
+    leafsize::Int = 20::(_ > 0)
+    min_neighbors::Int = 1::(_ > 0)
+    min_cluster_size::Int = 1::(_ > 0)
+end
+
+function MMI.fit(model::DBSCAN, verbosity::Int, X)
+    Xarray   = MMI.matrix(X, transpose=true)
+    clusters = Cl.dbscan(Xarray, model.radius;
+                       leafsize=model.leafsize,
+                       min_neighbors=model.min_neighbors,
+                       min_cluster_size=model.min_cluster_size)
+
+    # assignments and point types
+    npoints     = size(Xarray, 2)
+    assignments = zeros(Int, npoints)
+    pointtypes  = zeros(Int, npoints)
+    for (k, cluster) in enumerate(clusters)
+        for i in cluster.core_indices
+            assignments[i] = k
+            pointtypes[i] = 1
+        end
+        for i in cluster.boundary_indices
+            assignments[i] = k
+            pointtypes[i] = 0
+        end
+    end
+
+    result = (Xarray, assignments, pointtypes)
+    cache  = nothing
+    report = nothing
+    result, cache, report
+end
+
+MMI.fitted_params(::DBSCAN, fitresult) = (assignments=fitresult[1][2],
+                                          pointtypes=fitresult[1][3])
+
+function MMI.transform(::DBSCAN, fitresult, X)
+    # table with assignments in first column and
+    # point types in second column (core=1 vs. boundary=0)
+    _, assignments, pointtypes = fitresult[1]
+    X̃ = [assignments pointtypes]
+    MMI.table(X̃, prototype=X)
+end
+
+function MMI.predict(::DBSCAN, fitresult, Xnew)
+    X1, assignments, _ = fitresult[1]
+    X2 = MMI.matrix(Xnew, transpose=true)
+
+    labels = MMI.categorical(assignments)
+
+    # construct KDtree with points in X1
+    tree = KDTree(X1, Euclidean())
+
+    # find nearest neighbor of X2 in X1
+    inds, _ = nn(tree, X2)
+
+    # return assignment of nearest neighbor
+    labels[inds]
+end
+
 ####
 #### METADATA
 ####
@@ -324,5 +400,13 @@ See also
 KMedoids
 
 
-end # module
+metadata_model(
+    DBSCAN,
+    input = MMI.Table(Continuous),
+    output = MMI.Table(Continuous),
+    weights = false,
+    descr = DBSCANDescription,
+    path = "$(PKG).DBSCAN"
+)
 
+end # module
diff --git a/test/Project.toml b/test/Project.toml
new file mode 100644
index 0000000..b831062
--- /dev/null
+++ b/test/Project.toml
@@ -0,0 +1,7 @@
+[deps]
+Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
+Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/runtests.jl b/test/runtests.jl
index cd2b3d0..b6f170c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -4,11 +4,9 @@ import LinearAlgebra: norm
 
 using MLJBase
 using MLJClusteringInterface
-using Random:seed!
+using Random: seed!
 using Test
 
-const Dist = Distances
-
 seed!(132442)
 X, y = @load_crabs
 
@@ -16,7 +14,7 @@ X, y = @load_crabs
 #### KMEANS
 ####
 
-@testset "Kmeans" begin
+@testset "KMeans" begin
     barekm = KMeans()
     fitresult, cache, report = fit(barekm, 1, X)
     R = matrix(transform(barekm, fitresult, X))
@@ -28,25 +26,30 @@ X, y = @load_crabs
     p = predict(barekm, fitresult, X)
     @test argmin(R[1, :]) == p[1]
     @test argmin(R[10, :]) == p[10]
-
-
 end
 
 ####
 #### KMEDOIDS
 ####
 
-@testset "Kmedoids" begin
+@testset "KMedoids" begin
     barekm = KMedoids()
     fitresult, cache, report = fit(barekm, 1, X)
     X_array = matrix(X)
     R = matrix(transform(barekm, fitresult, X))
-    @test R[1, 2] ≈ Dist.evaluate(
+    @test R[1, 2] ≈ Distances.evaluate(
         barekm.metric, view(X_array, 1, :), view(fitresult[1], :, 2)
     )
-    @test R[10, 3] ≈ Dist.evaluate(
+    @test R[10, 3] ≈ Distances.evaluate(
         barekm.metric, view(X_array, 10, :), view(fitresult[1], :, 3)
     )
     p = predict(barekm, fitresult, X)
     @test all(report.assignments .== p)
 end
+
+@testset "DBSCAN" begin
+   dbscan = DBSCAN() 
+   fitresult = fit(dbscan, 1, X)
+   A = transform(dbscan, fitresult, X)
+   p = predict(dbscan, fitresult, X)
+end
\ No newline at end of file

From 8bc74a793a4aaca042c13c1c39b31b96e9ee3ec1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=BAlio=20Hoffimann?= <julio.hoffimann@gmail.com>
Date: Tue, 5 Oct 2021 13:46:47 -0300
Subject: [PATCH 2/6] Add tests for DBSCAN

---
 Project.toml     |  1 -
 test/runtests.jl | 39 +++++++++++++++++++++++++++++++++++----
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/Project.toml b/Project.toml
index 7cf658b..2b23a07 100644
--- a/Project.toml
+++ b/Project.toml
@@ -13,4 +13,3 @@ Clustering = "0.14"
 Distances = "0.9, 0.10"
 MLJModelInterface = "1.4"
 julia = "1.6"
-
diff --git a/test/runtests.jl b/test/runtests.jl
index b6f170c..a16f269 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -48,8 +48,39 @@ end
 end
 
 @testset "DBSCAN" begin
-   dbscan = DBSCAN() 
-   fitresult = fit(dbscan, 1, X)
-   A = transform(dbscan, fitresult, X)
-   p = predict(dbscan, fitresult, X)
+    # five spot pattern
+    X = [
+        0.0 0.0
+        1.0 0.0
+        1.0 1.0
+        0.0 1.0
+        0.5 0.5
+    ]
+
+    # radius < √2 ==> 5 clusters
+    dbscan = DBSCAN(radius=0.1) 
+    fitresult = fit(dbscan, 1, X)
+    A = transform(dbscan, fitresult, X)
+    p = predict(dbscan, fitresult, X)
+    @test size(matrix(A)) == (5, 2)
+    @test A.x2 == [0,0,0,0,0]
+    @test Set(p) == Set(unique(p))
+
+    # radius > √2 ==> 1 cluster
+    dbscan = DBSCAN(radius=√2+eps()) 
+    fitresult = fit(dbscan, 1, X)
+    A = transform(dbscan, fitresult, X)
+    p = predict(dbscan, fitresult, X)
+    @test size(matrix(A)) == (5, 2)
+    @test A.x2 == [1,1,1,1,1]
+    @test unique(p) == [1]
+
+    # radius < √2 && min_cluster_size = 2 ==> all points are noise
+    dbscan = DBSCAN(radius=0.1, min_cluster_size=2) 
+    fitresult = fit(dbscan, 1, X)
+    A = transform(dbscan, fitresult, X)
+    p = predict(dbscan, fitresult, X)
+    @test size(matrix(A)) == (5, 2)
+    @test A.x2 == [0,0,0,0,0]
+    @test unique(p) == [0]
 end
\ No newline at end of file

From 51243ac4f88238e85e22820ba99e68eb7245c223 Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom" <anthony.blaom@gmail.com>
Date: Tue, 23 Aug 2022 17:03:23 +1200
Subject: [PATCH 3/6] re-implement DBSCAN as a Static transformer with report

---
 src/MLJClusteringInterface.jl | 117 ++++++++++++----------------------
 test/Project.toml             |   1 +
 test/runtests.jl              |  79 ++++++++++++++---------
 3 files changed, 92 insertions(+), 105 deletions(-)

diff --git a/src/MLJClusteringInterface.jl b/src/MLJClusteringInterface.jl
index 4aaedf8..45bbe95 100644
--- a/src/MLJClusteringInterface.jl
+++ b/src/MLJClusteringInterface.jl
@@ -13,7 +13,6 @@ import MLJModelInterface: Continuous, Count, Finite, Multiclass, Table, OrderedF
     @mlj_model, metadata_model, metadata_pkg
 
 using Distances
-using NearestNeighbors
 
 # ===================================================================
 ## EXPORTS
@@ -26,19 +25,14 @@ const MMI = MLJModelInterface
 const Cl = Clustering
 const PKG = "MLJClusteringInterface"
 
-####
-#### KMeans
-####
+
+# # K_MEANS
 
 @mlj_model mutable struct KMeans <: MMI.Unsupervised
     k::Int = 3::(_ ≥ 2)
     metric::SemiMetric = SqEuclidean()
 end
 
-####
-#### KMeans
-####
-
 function MMI.fit(model::KMeans, verbosity::Int, X)
     # NOTE: using transpose here to get a LinearAlgebra.Transpose object
     # which Kmeans can handle.
@@ -67,6 +61,8 @@ function MMI.transform(model::KMeans, fitresult, X)
     return MMI.table(X̃, prototype=X)
 end
 
+# # K_MEDOIDS
+
 @mlj_model mutable struct KMedoids <: MMI.Unsupervised
     k::Int = 3::(_ ≥ 2)
     metric::SemiMetric = SqEuclidean()
@@ -101,9 +97,8 @@ function MMI.transform(model::KMedoids, fitresult, X)
     return MMI.table(X̃, prototype=X)
 end
 
-####
-#### Predict methods
-####
+
+# # PREDICT FOR K_MEANS AND K_MEDOIDS
 
 function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
     locations, cluster_labels = fitresult
@@ -125,84 +120,55 @@ function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
     return cluster_labels[pred]
 end
 
-####
-#### DBSCAN
-####
-"""
-DBSCAN(; kwargs...)
+# # DBSCAN
 
-$DBSCANDescription
-
-$DBFields
-
-See also the
-[package documentation](https://juliastats.org/Clustering.jl/stable/dbscan.html).
-"""
-@mlj_model mutable struct DBSCAN <: MMI.Unsupervised
+@mlj_model mutable struct DBSCAN <: MMI.Static
     radius::Real = 1.0::(_ > 0)
     leafsize::Int = 20::(_ > 0)
     min_neighbors::Int = 1::(_ > 0)
     min_cluster_size::Int = 1::(_ > 0)
 end
 
-function MMI.fit(model::DBSCAN, verbosity::Int, X)
-    Xarray   = MMI.matrix(X, transpose=true)
-    clusters = Cl.dbscan(Xarray, model.radius;
-                       leafsize=model.leafsize,
-                       min_neighbors=model.min_neighbors,
-                       min_cluster_size=model.min_cluster_size)
+# As DBSCAN is `Static`, there is no `fit` to implement.
+
+
+function MMI.predict(model::DBSCAN, ::Nothing, X)
+
+    Xarray   = MMI.matrix(X)'
+
+    # output of core algorithm:
+    clusters = Cl.dbscan(
+        Xarray, model.radius;
+        leafsize=model.leafsize,
+        min_neighbors=model.min_neighbors,
+        min_cluster_size=model.min_cluster_size,
+    )
+    nclusters = length(clusters)
 
     # assignments and point types
     npoints     = size(Xarray, 2)
     assignments = zeros(Int, npoints)
-    pointtypes  = zeros(Int, npoints)
+    point_types  = fill(-1, npoints)
     for (k, cluster) in enumerate(clusters)
         for i in cluster.core_indices
             assignments[i] = k
-            pointtypes[i] = 1
+            point_types[i] = 1
         end
         for i in cluster.boundary_indices
             assignments[i] = k
-            pointtypes[i] = 0
+            point_types[i] = 0
         end
     end
 
-    result = (Xarray, assignments, pointtypes)
-    cache  = nothing
-    report = nothing
-    result, cache, report
+    yhat = MMI.categorical(assignments)
+    report = (; point_types, nclusters, clusters)
+    return yhat, report
 end
 
-MMI.fitted_params(::DBSCAN, fitresult) = (assignments=fitresult[1][2],
-                                          pointtypes=fitresult[1][3])
+MMI.reporting_operations(::Type{<:DBSCAN}) = (:predict,)
 
-function MMI.transform(::DBSCAN, fitresult, X)
-    # table with assignments in first column and
-    # point types in second column (core=1 vs. boundary=0)
-    _, assignments, pointtypes = fitresult[1]
-    X̃ = [assignments pointtypes]
-    MMI.table(X̃, prototype=X)
-end
 
-function MMI.predict(::DBSCAN, fitresult, Xnew)
-    X1, assignments, _ = fitresult[1]
-    X2 = MMI.matrix(Xnew, transpose=true)
-
-    labels = MMI.categorical(assignments)
-
-    # construct KDtree with points in X1
-    tree = KDTree(X1, Euclidean())
-
-    # find nearest neighbor of X2 in X1
-    inds, _ = nn(tree, X2)
-
-    # return assignment of nearest neighbor
-    labels[inds]
-end
-
-####
-#### METADATA
-####
+# # METADATA
 
 metadata_pkg.(
     (KMeans, KMedoids),
@@ -219,7 +185,6 @@ metadata_model(
     human_name = "K-means clusterer",
     input = MMI.Table(Continuous),
     output = MMI.Table(Continuous),
-    weights = false,
     path = "$(PKG).KMeans"
 )
 
@@ -228,9 +193,18 @@ metadata_model(
     human_name = "K-medoids clusterer",
     input = MMI.Table(Continuous),
     output = MMI.Table(Continuous),
-    weights = false,
     path = "$(PKG).KMedoids"
 )
+
+metadata_model(
+    DBSCAN,
+    human_name = "DBSCAN clusterer (density-based spatial clustering of "*
+    "applications with noise)",
+    input = MMI.Table(Continuous),
+    path = "$(PKG).DBSCAN"
+)
+
+
 """
 $(MMI.doc_header(KMeans))
 
@@ -400,13 +374,4 @@ See also
 KMedoids
 
 
-metadata_model(
-    DBSCAN,
-    input = MMI.Table(Continuous),
-    output = MMI.Table(Continuous),
-    weights = false,
-    descr = DBSCANDescription,
-    path = "$(PKG).DBSCAN"
-)
-
 end # module
diff --git a/test/Project.toml b/test/Project.toml
index b831062..6e71f0c 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -3,5 +3,6 @@ Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
+MLJTestIntegration = "697918b4-fdc1-4f9e-8ff9-929724cee270"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/runtests.jl b/test/runtests.jl
index a16f269..9c44aa0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,6 +3,7 @@ import Distances
 import LinearAlgebra: norm
 
 using MLJBase
+using MLJTestIntegration
 using MLJClusteringInterface
 using Random: seed!
 using Test
@@ -10,9 +11,8 @@ using Test
 seed!(132442)
 X, y = @load_crabs
 
-####
-#### KMEANS
-####
+
+# # K_MEANS
 
 @testset "KMeans" begin
     barekm = KMeans()
@@ -28,9 +28,8 @@ X, y = @load_crabs
     @test argmin(R[10, :]) == p[10]
 end
 
-####
-#### KMEDOIDS
-####
+
+# # K_MEDOIDS
 
 @testset "KMedoids" begin
     barekm = KMedoids()
@@ -47,7 +46,11 @@ end
     @test all(report.assignments .== p)
 end
 
+
+# # DBSCAN
+
 @testset "DBSCAN" begin
+
     # five spot pattern
     X = [
         0.0 0.0
@@ -55,32 +58,50 @@ end
         1.0 1.0
         0.0 1.0
         0.5 0.5
-    ]
+    ] |> MLJBase.table
 
     # radius < √2 ==> 5 clusters
-    dbscan = DBSCAN(radius=0.1) 
-    fitresult = fit(dbscan, 1, X)
-    A = transform(dbscan, fitresult, X)
-    p = predict(dbscan, fitresult, X)
-    @test size(matrix(A)) == (5, 2)
-    @test A.x2 == [0,0,0,0,0]
-    @test Set(p) == Set(unique(p))
+    dbscan = DBSCAN(radius=0.1)
+    yhat1, report1 = predict(dbscan, nothing, X)
+    @test report1.nclusters == 5
+    @test report1.point_types == [0,0,0,0,0]
+    @test Set(yhat1) == Set(unique(yhat1))
+
+    # DbscanCluster fields:
+    @test propertynames(report1.clusters[1]) == (:size, :core_indices, :boundary_indices)
 
     # radius > √2 ==> 1 cluster
-    dbscan = DBSCAN(radius=√2+eps()) 
-    fitresult = fit(dbscan, 1, X)
-    A = transform(dbscan, fitresult, X)
-    p = predict(dbscan, fitresult, X)
-    @test size(matrix(A)) == (5, 2)
-    @test A.x2 == [1,1,1,1,1]
-    @test unique(p) == [1]
+    dbscan = DBSCAN(radius=√2+eps())
+    yhat, report = predict(dbscan, nothing, X)
+    @test report.nclusters == 1
+    @test report.point_types == [1,1,1,1,1]
+    @test length(unique(yhat)) == 1
 
     # radius < √2 && min_cluster_size = 2 ==> all points are noise
-    dbscan = DBSCAN(radius=0.1, min_cluster_size=2) 
-    fitresult = fit(dbscan, 1, X)
-    A = transform(dbscan, fitresult, X)
-    p = predict(dbscan, fitresult, X)
-    @test size(matrix(A)) == (5, 2)
-    @test A.x2 == [0,0,0,0,0]
-    @test unique(p) == [0]
-end
\ No newline at end of file
+    dbscan = DBSCAN(radius=0.1, min_cluster_size=2)
+    yhat, report = predict(dbscan, nothing, X)
+    @test report.nclusters == 0
+    @test report.point_types == [-1,-1,-1,-1,-1]
+    @test length(unique(yhat)) == 1
+
+    # MLJ integration:
+    model = DBSCAN(radius=0.1)
+    mach = machine(model) # no training data
+    yhat = predict(mach, X)
+    @test yhat == yhat1
+    @test MLJBase.report(mach).point_types == report1.point_types
+    @test MLJBase.report(mach).nclusters == report1.nclusters
+
+end
+
+@testset "MLJ interface" begin
+    models = [KMeans, KMedoids, DBSCAN]
+    failures, summary = MLJTestIntegration.test(
+        models,
+        X;
+        mod=@__MODULE__,
+        verbosity=0,
+        throw=false, # set to true to debug
+    )
+    @test isempty(failures)
+end

From e283bc7ba726a75aae0956b82bd96e1f00c6618f Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom" <anthony.blaom@gmail.com>
Date: Thu, 25 Aug 2022 09:30:56 +1200
Subject: [PATCH 4/6] change point_type labels; add doc-string

---
 src/MLJClusteringInterface.jl | 113 ++++++++++++++++++++++++++++++++--
 test/runtests.jl              |   7 ++-
 2 files changed, 112 insertions(+), 8 deletions(-)

diff --git a/src/MLJClusteringInterface.jl b/src/MLJClusteringInterface.jl
index 45bbe95..ca66beb 100644
--- a/src/MLJClusteringInterface.jl
+++ b/src/MLJClusteringInterface.jl
@@ -131,7 +131,6 @@ end
 
 # As DBSCAN is `Static`, there is no `fit` to implement.
 
-
 function MMI.predict(model::DBSCAN, ::Nothing, X)
 
     Xarray   = MMI.matrix(X)'
@@ -148,20 +147,22 @@ function MMI.predict(model::DBSCAN, ::Nothing, X)
     # assignments and point types
     npoints     = size(Xarray, 2)
     assignments = zeros(Int, npoints)
-    point_types  = fill(-1, npoints)
+    raw_point_types  = fill('N', npoints)
     for (k, cluster) in enumerate(clusters)
         for i in cluster.core_indices
             assignments[i] = k
-            point_types[i] = 1
+            raw_point_types[i] = 'C'
         end
         for i in cluster.boundary_indices
             assignments[i] = k
-            point_types[i] = 0
+            raw_point_types[i] = 'B'
         end
     end
+    point_types = MMI.categorical(raw_point_types)
+    cluster_labels = unique(assignments)
 
     yhat = MMI.categorical(assignments)
-    report = (; point_types, nclusters, clusters)
+    report = (; point_types, nclusters, cluster_labels, clusters)
     return yhat, report
 end
 
@@ -373,5 +374,107 @@ See also
 """
 KMedoids
 
+"""
+$(MMI.doc_header(DBSCAN))
+
+[DBSCAN](https://en.wikipedia.org/wiki/DBSCAN) is a clustering algorithm that groups
+together points that are closely packed together (points with many nearby neighbors),
+marking as outliers points that lie alone in low-density regions (whose nearest neighbors
+are too far away). More information is available at the [Clustering.jl
+documentation](https://juliastats.org/Clustering.jl/stable/index.html). Use `predict` to
+get cluster assignments. Point types - core, boundary or noise - are accessed from the
+machine report (see below).
+
+This is a static implementation, i.e., it does not generalize to new data instances, and
+there is no training data. For clusterers that do generalize, see [`KMeans`](@ref) or
+[`KMedoids`](@ref).
+
+In MLJ or MLJBase, create a machine with
+
+    mach = machine(model)
+
+# Hyper-parameters
+
+- `radius=1.0`: query radius.
+
+- `leafsize=20`: number of points binned in each leaf node of the nearest neighbor k-d
+  tree.
+
+- `min_neighbors=1`: minimum number of a core point neighbors.
+
+- `min_cluster_size=1`: minimum number of points in a valid cluster.
+
+
+# Operations
+
+- `predict(mach, X)`: return cluster label assignments, as an unordered
+  `CategoricalVector`. Here `X` is any table of input features (eg, a `DataFrame`) whose
+  columns are of scitype `Continuous`; check column scitypes with `schema(X)`. Note that
+  points of type `noise` will always get a label of `0`.
+
+
+# Report
+
+After calling `predict(mach)`, the fields of `report(mach)`  are:
+
+- `point_types`: A `CategoricalVector` with the DBSCAN point type classification, one
+  element per row of `X`. Elements are either `'C'`" (core), `'B'` (boundary), or `'N'`
+  (noise).
+
+- `nclusters`: The number of clusters (excluding the noise "cluster")
+
+- `cluster_labels`: The unique list of cluster labels
+
+- `clusters`: A vector of `Clustering.DbscanCluster` objects from Clustering.jl, which
+  have these fields:
+
+  - `size`: number of points in a cluster (core + boundary)
+
+  - `core_indices`: indices of points in the cluster core
+
+  - `boundary_indices`: indices of points on the cluster boundary
+
+
+# Examples
+
+```
+using MLJ
+
+X, labels  = make_moons(400, noise=0.09, rng=1) # synthetic data with 2 clusters; X
+y = map(labels) do label
+    label == 0 ? "cookie" : "monster"
+end;
+y = coerce(y, Multiclass);
+
+DBSCAN = @load DBSCAN pkg=Clustering
+model = DBSCAN(radius=0.13, min_cluster_size=5)
+mach = machine(model)
+
+# compute and output cluster assignments for observations in `X`:
+yhat = predict(mach, X)
+
+# get DBSCAN point types:
+report(mach).point_types
+report(mach).nclusters
+
+# compare cluster labels with actual labels:
+compare = zip(yhat, y) |> collect;
+compare[1:10] # clusters align with classes
+
+# visualize clusters, noise in red:
+points = zip(X.x1, X.x2) |> collect
+colors = map(yhat) do i
+   i == 0 ? :red :
+   i == 1 ? :blue :
+   i == 2 ? :green :
+   i == 3 ? :yellow :
+   :black
+end
+using Plots
+scatter(points, color=colors)
+```
+
+"""
+DBSCAN
 
 end # module
diff --git a/test/runtests.jl b/test/runtests.jl
index 9c44aa0..e4d7a97 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -64,8 +64,9 @@ end
     dbscan = DBSCAN(radius=0.1)
     yhat1, report1 = predict(dbscan, nothing, X)
     @test report1.nclusters == 5
-    @test report1.point_types == [0,0,0,0,0]
+    @test report1.point_types == fill('B', 5)
     @test Set(yhat1) == Set(unique(yhat1))
+    @test Set(report1.cluster_labels) == Set(unique(yhat1))
 
     # DbscanCluster fields:
     @test propertynames(report1.clusters[1]) == (:size, :core_indices, :boundary_indices)
@@ -74,14 +75,14 @@ end
     dbscan = DBSCAN(radius=√2+eps())
     yhat, report = predict(dbscan, nothing, X)
     @test report.nclusters == 1
-    @test report.point_types == [1,1,1,1,1]
+    @test report.point_types == fill('C', 5)
     @test length(unique(yhat)) == 1
 
     # radius < √2 && min_cluster_size = 2 ==> all points are noise
     dbscan = DBSCAN(radius=0.1, min_cluster_size=2)
     yhat, report = predict(dbscan, nothing, X)
     @test report.nclusters == 0
-    @test report.point_types == [-1,-1,-1,-1,-1]
+    @test report.point_types == fill('N', 5)
     @test length(unique(yhat)) == 1
 
     # MLJ integration:

From 63e60ec3c688d5ea9e3fe140fc02590d6888f22f Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom" <anthony.blaom@gmail.com>
Date: Thu, 25 Aug 2022 09:42:22 +1200
Subject: [PATCH 5/6] add a test compat MLJTestIntegration = "0.2.2"

---
 test/Project.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/Project.toml b/test/Project.toml
index 6e71f0c..2ca2a1a 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -6,3 +6,6 @@ MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 MLJTestIntegration = "697918b4-fdc1-4f9e-8ff9-929724cee270"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[compat]
+MLJTestIntegration = "0.2.2"
\ No newline at end of file

From 85cfa5c9fb6f074fdd32911e5a7c45bf1a82e8aa Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom" <anthony.blaom@gmail.com>
Date: Thu, 25 Aug 2022 10:12:15 +1200
Subject: [PATCH 6/6] minor doc-string fix; include DBSCAN in pkg metadata
 declaration

---
 src/MLJClusteringInterface.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/MLJClusteringInterface.jl b/src/MLJClusteringInterface.jl
index ca66beb..68eae62 100644
--- a/src/MLJClusteringInterface.jl
+++ b/src/MLJClusteringInterface.jl
@@ -172,7 +172,7 @@ MMI.reporting_operations(::Type{<:DBSCAN}) = (:predict,)
 # # METADATA
 
 metadata_pkg.(
-    (KMeans, KMedoids),
+    (KMeans, KMedoids, DBSCAN),
     name="Clustering",
     uuid="aaaa29a8-35af-508c-8bc3-b662a17a0fe5",
     url="https://github.com/JuliaStats/Clustering.jl",
@@ -418,7 +418,7 @@ In MLJ or MLJBase, create a machine with
 After calling `predict(mach)`, the fields of `report(mach)`  are:
 
 - `point_types`: A `CategoricalVector` with the DBSCAN point type classification, one
-  element per row of `X`. Elements are either `'C'`" (core), `'B'` (boundary), or `'N'`
+  element per row of `X`. Elements are either `'C'` (core), `'B'` (boundary), or `'N'`
   (noise).
 
 - `nclusters`: The number of clusters (excluding the noise "cluster")