Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add interface for DBSCAN #17

Merged
merged 6 commits into from
Aug 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,3 @@ Clustering = "0.14"
Distances = "0.9, 0.10"
MLJModelInterface = "1.4"
julia = "1.6"

[extras]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["LinearAlgebra", "MLJBase", "Random", "Test"]
188 changes: 170 additions & 18 deletions src/MLJClusteringInterface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ using Distances

# ===================================================================
## EXPORTS
export KMeans, KMedoids
export KMeans, KMedoids, DBSCAN

# ===================================================================
## CONSTANTS
Expand All @@ -25,19 +25,14 @@ const MMI = MLJModelInterface
const Cl = Clustering
const PKG = "MLJClusteringInterface"

####
#### KMeans
####

# # K_MEANS

# K-means clustering model. The `@mlj_model` macro generates a keyword
# constructor and enforces each field's trailing constraint (e.g. `k ≥ 2`)
# at construction time.
@mlj_model mutable struct KMeans <: MMI.Unsupervised
k::Int = 3::(_ ≥ 2)                  # number of clusters; must be at least 2
metric::SemiMetric = SqEuclidean()   # (semi)metric for point-to-center distances
end

####
#### KMeans
####

function MMI.fit(model::KMeans, verbosity::Int, X)
# NOTE: using transpose here to get a LinearAlgebra.Transpose object
# which Kmeans can handle.
Expand Down Expand Up @@ -66,6 +61,8 @@ function MMI.transform(model::KMeans, fitresult, X)
return MMI.table(X̃, prototype=X)
end

# # K_MEDOIDS

@mlj_model mutable struct KMedoids <: MMI.Unsupervised
k::Int = 3::(_ ≥ 2)
metric::SemiMetric = SqEuclidean()
Expand Down Expand Up @@ -100,9 +97,8 @@ function MMI.transform(model::KMedoids, fitresult, X)
return MMI.table(X̃, prototype=X)
end

####
#### Predict methods
####

# # PREDICT FOR K_MEANS AND K_MEDOIDS

function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
locations, cluster_labels = fitresult
Expand All @@ -124,12 +120,59 @@ function MMI.predict(model::Union{KMeans,KMedoids}, fitresult, Xnew)
return cluster_labels[pred]
end

####
#### METADATA
####
# # DBSCAN

# DBSCAN clustering model (density-based spatial clustering of applications
# with noise). Declared `Static`: it has no training phase, so `predict` is
# applied directly to the data. Constraints after each default are enforced
# by the `@mlj_model` macro.
@mlj_model mutable struct DBSCAN <: MMI.Static
radius::Real = 1.0::(_ > 0)           # neighborhood query radius
leafsize::Int = 20::(_ > 0)           # points binned per k-d tree leaf node
min_neighbors::Int = 1::(_ > 0)       # minimum neighbors for a core point
min_cluster_size::Int = 1::(_ > 0)    # minimum points in a valid cluster
end

# As DBSCAN is `Static`, there is no `fit` to implement.

"""
    MMI.predict(model::DBSCAN, ::Nothing, X)

Run DBSCAN on the table `X` and return `(yhat, report)`, where `yhat` is a
categorical vector of integer cluster assignments (label `0` marks noise)
and `report` is a named tuple with fields `point_types`, `nclusters`,
`cluster_labels` and `clusters`. As `DBSCAN` is `Static`, there is no
fitresult and the second argument is always `nothing`.
"""
function MMI.predict(model::DBSCAN, ::Nothing, X)

    # Clustering.jl expects observations as columns, hence the transpose.
    data = MMI.matrix(X)'

    # Core algorithm from Clustering.jl:
    found = Cl.dbscan(
        data, model.radius;
        leafsize=model.leafsize,
        min_neighbors=model.min_neighbors,
        min_cluster_size=model.min_cluster_size,
    )
    n_clusters = length(found)

    # Build per-point cluster labels and point-type codes. Points that fall
    # in no cluster keep label 0 and type 'N' (noise).
    n_points = size(data, 2)
    labels = zeros(Int, n_points)
    codes = fill('N', n_points)
    for (j, cluster) in enumerate(found)
        for idx in cluster.core_indices
            labels[idx] = j
            codes[idx] = 'C'   # core point
        end
        for idx in cluster.boundary_indices
            labels[idx] = j
            codes[idx] = 'B'   # boundary point
        end
    end

    yhat = MMI.categorical(labels)
    report = (
        point_types=MMI.categorical(codes),
        nclusters=n_clusters,
        cluster_labels=unique(labels),
        clusters=found,
    )
    return yhat, report
end

MMI.reporting_operations(::Type{<:DBSCAN}) = (:predict,)


# # METADATA

metadata_pkg.(
(KMeans, KMedoids),
(KMeans, KMedoids, DBSCAN),
name="Clustering",
uuid="aaaa29a8-35af-508c-8bc3-b662a17a0fe5",
url="https://github.com/JuliaStats/Clustering.jl",
Expand All @@ -143,7 +186,6 @@ metadata_model(
human_name = "K-means clusterer",
input = MMI.Table(Continuous),
output = MMI.Table(Continuous),
weights = false,
path = "$(PKG).KMeans"
)

Expand All @@ -152,9 +194,18 @@ metadata_model(
human_name = "K-medoids clusterer",
input = MMI.Table(Continuous),
output = MMI.Table(Continuous),
weights = false,
path = "$(PKG).KMedoids"
)

# Register MLJ trait metadata for DBSCAN. No `output` scitype is declared
# because, as a `Static` model, DBSCAN implements `predict` rather than
# `transform`. NOTE(review): no explicit prediction scitype is declared here —
# presumably the MLJModelInterface defaults apply; confirm if needed.
metadata_model(
DBSCAN,
human_name = "DBSCAN clusterer (density-based spatial clustering of "*
"applications with noise)",
input = MMI.Table(Continuous),
path = "$(PKG).DBSCAN"
)


"""
$(MMI.doc_header(KMeans))
Expand Down Expand Up @@ -323,6 +374,107 @@ See also
"""
KMedoids

"""
$(MMI.doc_header(DBSCAN))

[DBSCAN](https://en.wikipedia.org/wiki/DBSCAN) is a clustering algorithm that groups
together points that are closely packed together (points with many nearby neighbors),
marking as outliers points that lie alone in low-density regions (whose nearest neighbors
are too far away). More information is available at the [Clustering.jl
documentation](https://juliastats.org/Clustering.jl/stable/index.html). Use `predict` to
get cluster assignments. Point types - core, boundary or noise - are accessed from the
machine report (see below).
This is a static implementation, i.e., it does not generalize to new data instances, and
there is no training data. For clusterers that do generalize, see [`KMeans`](@ref) or
[`KMedoids`](@ref).
In MLJ or MLJBase, create a machine with
mach = machine(model)
# Hyper-parameters
- `radius=1.0`: query radius.
- `leafsize=20`: number of points binned in each leaf node of the nearest neighbor k-d
tree.
- `min_neighbors=1`: minimum number of neighbors required for a point to qualify as a core point.
- `min_cluster_size=1`: minimum number of points in a valid cluster.
# Operations
- `predict(mach, X)`: return cluster label assignments, as an unordered
`CategoricalVector`. Here `X` is any table of input features (eg, a `DataFrame`) whose
columns are of scitype `Continuous`; check column scitypes with `schema(X)`. Note that
points of type `noise` will always get a label of `0`.
# Report
After calling `predict(mach)`, the fields of `report(mach)` are:
- `point_types`: A `CategoricalVector` with the DBSCAN point type classification, one
element per row of `X`. Elements are either `'C'` (core), `'B'` (boundary), or `'N'`
(noise).
- `nclusters`: The number of clusters (excluding the noise "cluster")
- `cluster_labels`: The unique list of cluster labels
- `clusters`: A vector of `Clustering.DbscanCluster` objects from Clustering.jl, which
have these fields:
- `size`: number of points in a cluster (core + boundary)
- `core_indices`: indices of points in the cluster core
- `boundary_indices`: indices of points on the cluster boundary
# Examples
```
using MLJ
X, labels = make_moons(400, noise=0.09, rng=1) # synthetic data with 2 clusters; X
y = map(labels) do label
label == 0 ? "cookie" : "monster"
end;
y = coerce(y, Multiclass);
DBSCAN = @load DBSCAN pkg=Clustering
model = DBSCAN(radius=0.13, min_cluster_size=5)
mach = machine(model)
# compute and output cluster assignments for observations in `X`:
yhat = predict(mach, X)
# get DBSCAN point types:
report(mach).point_types
report(mach).nclusters
# compare cluster labels with actual labels:
compare = zip(yhat, y) |> collect;
compare[1:10] # clusters align with classes
# visualize clusters, noise in red:
points = zip(X.x1, X.x2) |> collect
colors = map(yhat) do i
i == 0 ? :red :
i == 1 ? :blue :
i == 2 ? :green :
i == 3 ? :yellow :
:black
end
using Plots
scatter(points, color=colors)
```
"""
DBSCAN

end # module
11 changes: 11 additions & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[deps]
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJTestIntegration = "697918b4-fdc1-4f9e-8ff9-929724cee270"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[compat]
MLJTestIntegration = "0.2.2"
86 changes: 71 additions & 15 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,18 @@ import Distances
import LinearAlgebra: norm

using MLJBase
using MLJTestIntegration
using MLJClusteringInterface
using Random:seed!
using Random: seed!
using Test

const Dist = Distances

seed!(132442)
X, y = @load_crabs

####
#### KMEANS
####

@testset "Kmeans" begin
# # K_MEANS

@testset "KMeans" begin
barekm = KMeans()
fitresult, cache, report = fit(barekm, 1, X)
R = matrix(transform(barekm, fitresult, X))
Expand All @@ -28,25 +26,83 @@ X, y = @load_crabs
p = predict(barekm, fitresult, X)
@test argmin(R[1, :]) == p[1]
@test argmin(R[10, :]) == p[10]


end

####
#### KMEDOIDS
####

@testset "Kmedoids" begin
# # K_MEDOIDS

@testset "KMedoids" begin
barekm = KMedoids()
fitresult, cache, report = fit(barekm, 1, X)
X_array = matrix(X)
R = matrix(transform(barekm, fitresult, X))
@test R[1, 2] Dist.evaluate(
@test R[1, 2] Distances.evaluate(
barekm.metric, view(X_array, 1, :), view(fitresult[1], :, 2)
)
@test R[10, 3] Dist.evaluate(
@test R[10, 3] Distances.evaluate(
barekm.metric, view(X_array, 10, :), view(fitresult[1], :, 3)
)
p = predict(barekm, fitresult, X)
@test all(report.assignments .== p)
end


# # DBSCAN

@testset "DBSCAN" begin

    # Five-spot pattern: the four corners of the unit square plus its centre.
    X = MLJBase.table(
        [
            0.0 0.0
            1.0 0.0
            1.0 1.0
            0.0 1.0
            0.5 0.5
        ],
    )

    # radius < √2: no point reaches another, so each forms its own cluster.
    model = DBSCAN(radius=0.1)
    yhat1, report1 = predict(model, nothing, X)
    @test report1.nclusters == 5
    @test report1.point_types == fill('B', 5)
    @test Set(yhat1) == Set(unique(yhat1))
    @test Set(report1.cluster_labels) == Set(unique(yhat1))

    # `DbscanCluster` objects expose the documented fields:
    @test propertynames(report1.clusters[1]) ==
        (:size, :core_indices, :boundary_indices)

    # radius > √2: all points are mutually reachable, giving one cluster.
    model = DBSCAN(radius=2 + eps())
    yhat, report = predict(model, nothing, X)
    @test report.nclusters == 1
    @test report.point_types == fill('C', 5)
    @test length(unique(yhat)) == 1

    # radius < √2 with min_cluster_size = 2: every point is noise.
    model = DBSCAN(radius=0.1, min_cluster_size=2)
    yhat, report = predict(model, nothing, X)
    @test report.nclusters == 0
    @test report.point_types == fill('N', 5)
    @test length(unique(yhat)) == 1

    # MLJ integration: a `Static` machine is bound to no training data.
    mach = machine(DBSCAN(radius=0.1))
    yhat = predict(mach, X)
    @test yhat == yhat1
    @test MLJBase.report(mach).point_types == report1.point_types
    @test MLJBase.report(mach).nclusters == report1.nclusters

end

# Generic interface-compliance checks for all exposed models. (Stray page
# text that had been scraped into this block has been removed — it was not
# valid Julia and would have broken parsing.)
@testset "MLJ interface" begin
    models = [KMeans, KMedoids, DBSCAN]
    failures, summary = MLJTestIntegration.test(
        models,
        X;
        mod=@__MODULE__,
        verbosity=0,
        throw=false, # set to true to debug
    )
    @test isempty(failures)
end