Skip to content

Commit

Permalink
Merge pull request #17 from JuliaAI/dbscan
Browse files Browse the repository at this point in the history
Add interface for DBSCAN
  • Loading branch information
ablaom authored Aug 28, 2022
2 parents 867f7f0 + 85cfa5c commit 39b1fdb
Show file tree
Hide file tree
Showing 4 changed files with 252 additions and 42 deletions.
9 changes: 0 additions & 9 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,3 @@ Clustering = "0.14"
Distances = "0.9, 0.10"
MLJModelInterface = "1.4"
julia = "1.6"

[extras]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["LinearAlgebra", "MLJBase", "Random", "Test"]
188 changes: 170 additions & 18 deletions src/MLJClusteringInterface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ using Distances

# ===================================================================
## EXPORTS
export KMeans, KMedoids
export KMeans, KMedoids, DBSCAN

# ===================================================================
## CONSTANTS
Expand All @@ -25,19 +25,14 @@ const MMI = MLJModelInterface
const Cl = Clustering
const PKG = "MLJClusteringInterface"

####
#### KMeans
####

# # K_MEANS

@mlj_model mutable struct KMeans <: MMI.Unsupervised
k::Int = 3::(_ ≥ 2)
metric::SemiMetric = SqEuclidean()
end

####
#### KMeans
####

function MMI.fit(model::KMeans, verbosity::Int, X)
# NOTE: using transpose here to get a LinearAlgebra.Transpose object
# which Kmeans can handle.
Expand Down Expand Up @@ -66,6 +61,8 @@ function MMI.transform(model::KMeans, fitresult, X)
return MMI.table(X̃, prototype=X)
end

# # K_MEDOIDS

@mlj_model mutable struct KMedoids <: MMI.Unsupervised
k::Int = 3::(_ ≥ 2)
metric::SemiMetric = SqEuclidean()
Expand Down Expand Up @@ -100,9 +97,8 @@ function MMI.transform(model::KMedoids, fitresult, X)
return MMI.table(X̃, prototype=X)
end

####
#### Predict methods
####

# # PREDICT FOR K_MEANS AND K_MEDOIDS

function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
locations, cluster_labels = fitresult
Expand All @@ -124,12 +120,59 @@ function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
return cluster_labels[pred]
end

####
#### METADATA
####
# # DBSCAN

@mlj_model mutable struct DBSCAN <: MMI.Static
radius::Real = 1.0::(_ > 0)
leafsize::Int = 20::(_ > 0)
min_neighbors::Int = 1::(_ > 0)
min_cluster_size::Int = 1::(_ > 0)
end

# As DBSCAN is `Static`, there is no `fit` to implement.

function MMI.predict(model::DBSCAN, ::Nothing, X)

Xarray = MMI.matrix(X)'

# output of core algorithm:
clusters = Cl.dbscan(
Xarray, model.radius;
leafsize=model.leafsize,
min_neighbors=model.min_neighbors,
min_cluster_size=model.min_cluster_size,
)
nclusters = length(clusters)

# assignments and point types
npoints = size(Xarray, 2)
assignments = zeros(Int, npoints)
raw_point_types = fill('N', npoints)
for (k, cluster) in enumerate(clusters)
for i in cluster.core_indices
assignments[i] = k
raw_point_types[i] = 'C'
end
for i in cluster.boundary_indices
assignments[i] = k
raw_point_types[i] = 'B'
end
end
point_types = MMI.categorical(raw_point_types)
cluster_labels = unique(assignments)

yhat = MMI.categorical(assignments)
report = (; point_types, nclusters, cluster_labels, clusters)
return yhat, report
end

MMI.reporting_operations(::Type{<:DBSCAN}) = (:predict,)


# # METADATA

metadata_pkg.(
(KMeans, KMedoids),
(KMeans, KMedoids, DBSCAN),
name="Clustering",
uuid="aaaa29a8-35af-508c-8bc3-b662a17a0fe5",
url="https://github.com/JuliaStats/Clustering.jl",
Expand All @@ -143,7 +186,6 @@ metadata_model(
human_name = "K-means clusterer",
input = MMI.Table(Continuous),
output = MMI.Table(Continuous),
weights = false,
path = "$(PKG).KMeans"
)

Expand All @@ -152,9 +194,18 @@ metadata_model(
human_name = "K-medoids clusterer",
input = MMI.Table(Continuous),
output = MMI.Table(Continuous),
weights = false,
path = "$(PKG).KMedoids"
)

metadata_model(
DBSCAN,
human_name = "DBSCAN clusterer (density-based spatial clustering of "*
"applications with noise)",
input = MMI.Table(Continuous),
path = "$(PKG).DBSCAN"
)


"""
$(MMI.doc_header(KMeans))
Expand Down Expand Up @@ -323,6 +374,107 @@ See also
"""
KMedoids

"""
$(MMI.doc_header(DBSCAN))
end # module
[DBSCAN](https://en.wikipedia.org/wiki/DBSCAN) is a clustering algorithm that groups
together points that are closely packed together (points with many nearby neighbors),
marking as outliers points that lie alone in low-density regions (whose nearest neighbors
are too far away). More information is available at the [Clustering.jl
documentation](https://juliastats.org/Clustering.jl/stable/index.html). Use `predict` to
get cluster assignments. Point types - core, boundary or noise - are accessed from the
machine report (see below).
This is a static implementation, i.e., it does not generalize to new data instances, and
there is no training data. For clusterers that do generalize, see [`KMeans`](@ref) or
[`KMedoids`](@ref).
In MLJ or MLJBase, create a machine with
mach = machine(model)
# Hyper-parameters
- `radius=1.0`: query radius.
- `leafsize=20`: number of points binned in each leaf node of the nearest neighbor k-d
tree.
- `min_neighbors=1`: minimum number of a core point neighbors.
- `min_cluster_size=1`: minimum number of points in a valid cluster.
# Operations
- `predict(mach, X)`: return cluster label assignments, as an unordered
`CategoricalVector`. Here `X` is any table of input features (eg, a `DataFrame`) whose
columns are of scitype `Continuous`; check column scitypes with `schema(X)`. Note that
points of type `noise` will always get a label of `0`.
# Report
After calling `predict(mach)`, the fields of `report(mach)` are:
- `point_types`: A `CategoricalVector` with the DBSCAN point type classification, one
element per row of `X`. Elements are either `'C'` (core), `'B'` (boundary), or `'N'`
(noise).
- `nclusters`: The number of clusters (excluding the noise "cluster")
- `cluster_labels`: The unique list of cluster labels
- `clusters`: A vector of `Clustering.DbscanCluster` objects from Clustering.jl, which
have these fields:
- `size`: number of points in a cluster (core + boundary)
- `core_indices`: indices of points in the cluster core
- `boundary_indices`: indices of points on the cluster boundary
# Examples
```
using MLJ
X, labels = make_moons(400, noise=0.09, rng=1) # synthetic data with 2 clusters; X
y = map(labels) do label
label == 0 ? "cookie" : "monster"
end;
y = coerce(y, Multiclass);
DBSCAN = @load DBSCAN pkg=Clustering
model = DBSCAN(radius=0.13, min_cluster_size=5)
mach = machine(model)
# compute and output cluster assignments for observations in `X`:
yhat = predict(mach, X)
# get DBSCAN point types:
report(mach).point_types
report(mach).nclusters
# compare cluster labels with actual labels:
compare = zip(yhat, y) |> collect;
compare[1:10] # clusters align with classes
# visualize clusters, noise in red:
points = zip(X.x1, X.x2) |> collect
colors = map(yhat) do i
i == 0 ? :red :
i == 1 ? :blue :
i == 2 ? :green :
i == 3 ? :yellow :
:black
end
using Plots
scatter(points, color=colors)
```
"""
DBSCAN

end # module
11 changes: 11 additions & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[deps]
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJTestIntegration = "697918b4-fdc1-4f9e-8ff9-929724cee270"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[compat]
MLJTestIntegration = "0.2.2"
86 changes: 71 additions & 15 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,18 @@ import Distances
import LinearAlgebra: norm

using MLJBase
using MLJTestIntegration
using MLJClusteringInterface
using Random:seed!
using Random: seed!
using Test

const Dist = Distances

seed!(132442)
X, y = @load_crabs

####
#### KMEANS
####

@testset "Kmeans" begin
# # K_MEANS

@testset "KMeans" begin
barekm = KMeans()
fitresult, cache, report = fit(barekm, 1, X)
R = matrix(transform(barekm, fitresult, X))
Expand All @@ -28,25 +26,83 @@ X, y = @load_crabs
p = predict(barekm, fitresult, X)
@test argmin(R[1, :]) == p[1]
@test argmin(R[10, :]) == p[10]


end

####
#### KMEDOIDS
####

@testset "Kmedoids" begin
# # K_MEDOIDS

@testset "KMedoids" begin
barekm = KMedoids()
fitresult, cache, report = fit(barekm, 1, X)
X_array = matrix(X)
R = matrix(transform(barekm, fitresult, X))
@test R[1, 2] Dist.evaluate(
@test R[1, 2] Distances.evaluate(
barekm.metric, view(X_array, 1, :), view(fitresult[1], :, 2)
)
@test R[10, 3] Dist.evaluate(
@test R[10, 3] Distances.evaluate(
barekm.metric, view(X_array, 10, :), view(fitresult[1], :, 3)
)
p = predict(barekm, fitresult, X)
@test all(report.assignments .== p)
end


# # DBSCAN

@testset "DBSCAN" begin

# five spot pattern
X = [
0.0 0.0
1.0 0.0
1.0 1.0
0.0 1.0
0.5 0.5
] |> MLJBase.table

# radius < √2 ==> 5 clusters
dbscan = DBSCAN(radius=0.1)
yhat1, report1 = predict(dbscan, nothing, X)
@test report1.nclusters == 5
@test report1.point_types == fill('B', 5)
@test Set(yhat1) == Set(unique(yhat1))
@test Set(report1.cluster_labels) == Set(unique(yhat1))

# DbscanCluster fields:
@test propertynames(report1.clusters[1]) == (:size, :core_indices, :boundary_indices)

# radius > √2 ==> 1 cluster
dbscan = DBSCAN(radius=2+eps())
yhat, report = predict(dbscan, nothing, X)
@test report.nclusters == 1
@test report.point_types == fill('C', 5)
@test length(unique(yhat)) == 1

# radius < √2 && min_cluster_size = 2 ==> all points are noise
dbscan = DBSCAN(radius=0.1, min_cluster_size=2)
yhat, report = predict(dbscan, nothing, X)
@test report.nclusters == 0
@test report.point_types == fill('N', 5)
@test length(unique(yhat)) == 1

# MLJ integration:
model = DBSCAN(radius=0.1)
mach = machine(model) # no training data
yhat = predict(mach, X)
@test yhat == yhat1
@test MLJBase.report(mach).point_types == report1.point_types
@test MLJBase.report(mach).nclusters == report1.nclusters

end

@testset "MLJ interface" begin
models = [KMeans, KMedoids, DBSCAN]
failures, summary = MLJTestIntegration.test(
models,
X;
mod=@__MODULE__,
verbosity=0,
throw=false, # set to true to debug
)
@test isempty(failures)
end

0 comments on commit 39b1fdb

Please sign in to comment.