diff --git a/assertx/float64.go b/assertx/float64.go new file mode 100644 index 0000000..36dd2bf --- /dev/null +++ b/assertx/float64.go @@ -0,0 +1,66 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package assertx + +import ( + "math" +) + +// Float64 values have minor numerical differences between Apple M2 and Linux EC2, +// so we use an epsilon value to compare them in UT. We could not use reflect.DeepEqual because +// of these minor differences. We cannot use assert.InEpsilon because it requires the first +// argument to be non-zero. +// +// The epsilon 1e-9 epsilon is based on the following observation: +// - `strconv.FormatFloat(float64(value), 'f', -1, 64)` we use for outputting float64 values. +// - precision difference between Apple M2 and Linux EC2 float64 results (approx 1e-13) +// - service_weaver: https://github.com/ServiceWeaver/weaver/blob/8e7c225542b8f8267ec0da3e01a098366b6f8daf/internal/weaver/load.go#L31 +// - google lib: https://github.com/google/differential-privacy/blob/91b4ecebf33e71b8a215b7ea8325fb5e47b12671/privacy-on-beam/pbeam/pbeamtest/pbeamtest_test.go#L1406 +const float64EqualityThreshold = 1e-9 + +// InEpsilonF64Slices returns true if all the elements in v1 and v2 are within epsilon of each other. 
+func InEpsilonF64Slices(want, got [][]float64) bool { + if len(want) != len(got) { + return false + } + + for i := 0; i < len(want); i++ { + if !InEpsilonF64Slice(want[i], got[i]) { + return false + } + } + return true +} + +// InEpsilonF64Slice returns true if all the elements in v1 and v2 are within epsilon of each other. +// assert.InEpsilonSlice requires v1 to be non-zero. +func InEpsilonF64Slice(want, got []float64) bool { + if len(want) != len(got) { + return false + } + + for i := 0; i < len(want); i++ { + if !InEpsilonF64(want[i], got[i]) { + return false + } + } + return true +} + +// InEpsilonF64 returns true if v1 and v2 are within epsilon of each other. +// assert.InEpsilon requires v1 to be non-zero. +func InEpsilonF64(want, got float64) bool { + return want == got || math.Abs(want-got) < float64EqualityThreshold || (math.IsNaN(want) && math.IsNaN(got)) +} diff --git a/assertx/float64_test.go b/assertx/float64_test.go new file mode 100644 index 0000000..4fabe41 --- /dev/null +++ b/assertx/float64_test.go @@ -0,0 +1,166 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package assertx + +import "testing" + +func TestInEpsilonF64(t *testing.T) { + type args struct { + want float64 + got float64 + } + tests := []struct { + name string + args args + want bool + }{ + + { + name: "Test 1 - difference is epsilon", + args: args{ + want: 2.0, + got: 2.0 + float64EqualityThreshold, + }, + want: false, + }, + { + name: "Test 2.a - difference is less than epsilon", + args: args{ + want: 2.0, + got: 2.0 + 0.5*float64EqualityThreshold, + }, + want: true, + }, + { + name: "Test 2.b - difference is more than epsilon", + args: args{ + want: 2.0, + got: 2.0 + 2*float64EqualityThreshold, + }, + want: false, + }, + { + name: "Test 2.c - difference is -ve of epsilon", + args: args{ + want: 2.0, + got: 2.0 - float64EqualityThreshold, + }, + want: false, + }, + { + name: "Test 3 - till 9th digit is same, 10th digit is different", + args: args{ + want: 1.732050800_0, + got: 1.732050800_9, + }, + want: true, + }, + { + name: "Test 4 - 9th digit is different", + args: args{ + want: 1.73205080_0, + got: 1.73205080_9, + }, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := InEpsilonF64(tt.args.want, tt.args.got); got != tt.want { + t.Errorf("%s InEpsilonF64() = %v, want %v", tt.name, got, tt.want) + } + }) + } +} + +func TestInEpsilonF64Slice(t *testing.T) { + type args struct { + want []float64 + got []float64 + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "Test 1 - difference is epsilon", + args: args{ + want: []float64{2.0, 3.0}, + got: []float64{2.0 + float64EqualityThreshold, 3.0 + float64EqualityThreshold}, + }, + want: false, + }, + { + name: "Test 2 - till 9th digit is same, 10th digit is different)", + args: args{ + want: []float64{1.732050800_0}, + got: []float64{1.732050800_9}, + }, + want: true, + }, + { + name: "Test 3 - 9th digit is different", + args: args{ + want: []float64{1.73205080_0}, + got: []float64{1.73205080_9}, + }, + want: false, + }, + } + 
for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := InEpsilonF64Slice(tt.args.want, tt.args.got); got != tt.want { + t.Errorf("InEpsilonF64Slice() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestInEpsilonF64Slices(t *testing.T) { + type args struct { + want [][]float64 + got [][]float64 + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "Test 1 - difference is epsilon", + args: args{ + want: [][]float64{{2.0, 3.0}, {4.0}}, + got: [][]float64{{2.0 + float64EqualityThreshold, 3.0 + float64EqualityThreshold}, {4.0}}, + }, + want: false, + }, + { + name: "Test 2 - difference is less than epsilon (next neg-power of epsilon & epsilon/2)", + args: args{ + want: [][]float64{{2.0, 3.0}, {4.0}}, + got: [][]float64{{2.0 + 1e-1*float64EqualityThreshold, 3.0 + float64EqualityThreshold/2}, {4.0}}, + }, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := InEpsilonF64Slices(tt.args.want, tt.args.got); got != tt.want { + t.Errorf("InEpsilonF64Slices() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/clusterer/clusterer_bench_test.go b/clusterer/clusterer_bench_test.go deleted file mode 100644 index 539fb90..0000000 --- a/clusterer/clusterer_bench_test.go +++ /dev/null @@ -1,81 +0,0 @@ -package clusterer - -import ( - "github.com/arjunsk/kmeans/containers" - "github.com/arjunsk/kmeans/initializer" - "math/rand" - "testing" -) - -/* -date : 2023-10-2 -goos: darwin -goarch: arm64 -pkg: github.com/arjunsk/kmeans/clusterer -cpu: Apple M2 Pro -rows: 5_000 -dims: 1024 -k: 100 -Benchmark_kmeans/KMEANS-10 1 44640829958 ns/op -Benchmark_kmeans/KMEANS++-10 1 89310750250 ns/op -Benchmark_kmeans/ELKAN-10 1 24660727625 ns/op -*/ -func Benchmark_kmeans(b *testing.B) { - rowCnt := 5_000 - dims := 1024 - data := make([][]float64, rowCnt) - loadData(rowCnt, dims, data) - - kmeans, _ := NewKmeans(data, 100, - 0.01, 500, - containers.EuclideanDistance, 
initializer.NewRandomInitializer()) - kmeanspp, _ := NewKmeansPlusPlus(data, 100, - 0.01, 500, - containers.EuclideanDistance) - elkan, _ := NewKmeansElkan(data, 100, - 0.01, 500, - containers.EuclideanDistance, initializer.NewRandomInitializer()) - - b.Run("KMEANS", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - clusters, err := kmeans.Cluster() - if err != nil { - panic(err) - } - b.Logf("SSE %v", clusters.SSE()) - } - }) - - b.Run("KMEANS++", func(b *testing.B) { - b.Skipf("KMEANS++ will take alot of time for k=100. Hence skipping") - b.ResetTimer() - for i := 0; i < b.N; i++ { - clusters, err := kmeanspp.Cluster() - if err != nil { - panic(err) - } - b.Logf("SSE %v", clusters.SSE()) - } - }) - - b.Run("ELKAN", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - clusters, err := elkan.Cluster() - if err != nil { - panic(err) - } - b.Logf("SSE %v", clusters.SSE()) - } - }) -} - -func loadData(nb int, d int, xb [][]float64) { - for r := 0; r < nb; r++ { - xb[r] = make([]float64, d) - for c := 0; c < d; c++ { - xb[r][c] = float64(rand.Float32() * 1000) - } - } -} diff --git a/clusterer/clusterer_test.go b/clusterer/clusterer_test.go deleted file mode 100644 index 2c36df3..0000000 --- a/clusterer/clusterer_test.go +++ /dev/null @@ -1,57 +0,0 @@ -package clusterer - -import ( - "fmt" - "github.com/arjunsk/kmeans/containers" - "github.com/arjunsk/kmeans/initializer" - "testing" -) - -var vectors = [][]float64{ - {20.0, 20.0, 20.0, 20.0}, - {21.0, 21.0, 21.0, 21.0}, - {100.5, 100.5, 100.5, 100.5}, - {50.1, 50.1, 50.1, 50.1}, - {64.2, 64.2, 64.2, 64.2}, -} - -func TestCluster_lloyd(t *testing.T) { - kmeans, _ := NewKmeans(vectors, 2, - 0.01, 500, - containers.EuclideanDistance, initializer.NewRandomInitializer()) - clusters, err := kmeans.Cluster() - if err != nil || clusters == nil || len(clusters) != 2 { - t.Log("\nClusters:", clusters) - t.Fail() - } - fmt.Println(clusters) - fmt.Println(clusters.SSE()) -} - -func 
TestCluster_kpp(t *testing.T) { - kmeans, _ := NewKmeansPlusPlus(vectors, 2, - 0.01, 500, - containers.EuclideanDistance) - clusters, err := kmeans.Cluster() - if err != nil || clusters == nil || len(clusters) != 2 { - t.Log("\nClusters:", clusters) - t.Fail() - } - fmt.Println(clusters) - fmt.Println(clusters.SSE()) -} - -func TestCluster_elkan(t *testing.T) { - kmeans, _ := NewKmeansElkan(vectors, 2, - 0.01, 500, - containers.EuclideanDistance, - initializer.NewRandomInitializer()) - clusters, err := kmeans.Cluster() - if err != nil || clusters == nil || len(clusters) != 2 { - t.Log("\nError:", err) - t.Log("\nClusters:", clusters) - t.Fail() - } - fmt.Println(clusters) - fmt.Println(clusters.SSE()) -} diff --git a/clusterer/docs.go b/clusterer/docs.go deleted file mode 100644 index 93a63f1..0000000 --- a/clusterer/docs.go +++ /dev/null @@ -1,8 +0,0 @@ -package clusterer - -// Package clusterer provides API for running Kmeans clustering algorithm. - -// The kmeans algorithms implemented are -// - Lloyd's algorithm -// - Lloyd's with Kmeans++ initializer -// - Elkan's algorithm diff --git a/clusterer/elkan.go b/clusterer/elkan.go deleted file mode 100644 index 524d6ca..0000000 --- a/clusterer/elkan.go +++ /dev/null @@ -1,282 +0,0 @@ -package clusterer - -import ( - "github.com/arjunsk/kmeans/containers" - "github.com/arjunsk/kmeans/initializer" - "golang.org/x/sync/errgroup" - "math" -) - -// KmeansElkan Ref Paper: https://cdn.aaai.org/ICML/2003/ICML03-022.pdf -// Slides: https://slideplayer.com/slide/9088301/ -type KmeansElkan struct { - deltaThreshold float64 - iterationThreshold int - - distFn containers.DistanceFunction - initializer initializer.Initializer - - assignments []int // maps vector index to cluster index - lowerBounds [][]float64 // distances for vector and all clusters centroids - upperBounds []float64 // distance between each point and its assigned cluster centroid ie d(x, c(x)) - - // local state - vectors [][]float64 // input vectors - 
clusterCnt int // number of clusters ie k -} - -var _ Clusterer = new(KmeansElkan) - -func NewKmeansElkan(vectors [][]float64, clusterCnt int, - deltaThreshold float64, - iterationThreshold int, - distFn containers.DistanceFunction, - init initializer.Initializer) (Clusterer, error) { - - err := validateArgs(vectors, clusterCnt, deltaThreshold, iterationThreshold) - if err != nil { - return nil, err - } - - n := len(vectors) - - el := KmeansElkan{ - deltaThreshold: deltaThreshold, - iterationThreshold: iterationThreshold, - distFn: distFn, - initializer: init, - vectors: vectors, - clusterCnt: clusterCnt, - assignments: make([]int, n), - upperBounds: make([]float64, n), - } - - el.lowerBounds = make([][]float64, n) - for i := range el.lowerBounds { - el.lowerBounds[i] = make([]float64, clusterCnt) - } - - return &el, nil -} - -func (el *KmeansElkan) Cluster() (containers.Clusters, error) { - - clusters, err := el.initializer.InitCentroids(el.vectors, el.clusterCnt) - if err != nil { - return nil, err - } - - err = el.kmeansElkan(clusters) - if err != nil { - return nil, err - } - - return clusters, nil -} - -// kmeansElkan -// During each iteration of the algorithm, the lower bounds l(x, c) are updated for all points x and centers -// c. These updates take O(nk) time, so the complexity of the algorithm remains at least O(nke), even -// though the number of distance calculations is roughly O(n) only. -// Ref:https://www.cse.iitd.ac.in/~rjaiswal/2015/col870/Project/Nipun.pdf -// This variant needs O(n*k) additional memory to store bounds. -func (el *KmeansElkan) kmeansElkan(clusters containers.Clusters) error { - for i := 0; ; i++ { - movement := 0 - el.reset(clusters) - clusters.Reset() - - // step 1.a - // For all centers c and c', compute d(c, c'). - centroidSelfDistances := el.calculateCentroidDistances(clusters, el.clusterCnt) - - // step 1.b - // For all centers c, compute s(c)=0.5 min {d(c, c') | c'!= c}. 
- sc := el.computeSc(centroidSelfDistances, el.clusterCnt) - - // step 2 and 3 - movement, err := el.assignData(centroidSelfDistances, sc, clusters, el.vectors, i) - if err != nil { - return err - } - - // step 4 - // For each center c, let m(c) be the mean of the points assigned to c. - // step 7 - // Replace each center c by m(c). - moveDistances, err := clusters.RecenterWithDeltaDistance(el.distFn) - if err != nil { - return err - } - - // step 5 and 6 - el.updateBounds(moveDistances, el.vectors) - - if el.isConverged(i, movement) { - break - } - } - - return nil -} - -func (el *KmeansElkan) calculateCentroidDistances(clusters containers.Clusters, k int) [][]float64 { - centroidDistances := make([][]float64, k) - for i := 0; i < k; i++ { - centroidDistances[i] = make([]float64, k) - } - - //NOTE: We can parallelize this because [i][j] is computed on lower triangle. - //[i][j] computed don't read any other [r][c] value. - eg := new(errgroup.Group) - for i := 0; i < k-1; i++ { - for j := i + 1; j < k; j++ { - func(r, c int) { - eg.Go(func() error { - var err error - centroidDistances[r][c], err = el.distFn(clusters[r].Center(), clusters[c].Center()) - if err != nil { - return err - } - centroidDistances[c][r] = centroidDistances[r][c] - return nil - }) - }(i, j) - } - } - if err := eg.Wait(); err != nil { - panic(err) - } - - return centroidDistances -} - -// s(c) = 0.5 * min{d(c, c') | c' != c} -func (el *KmeansElkan) computeSc(centroidDistances [][]float64, k int) []float64 { - sc := make([]float64, k) - for i := 0; i < k; i++ { - scMin := math.MaxFloat64 - for j := 0; j < k; j++ { - if i == j { - continue - } - scMin = math.Min(centroidDistances[i][j], scMin) - } - sc[i] = 0.5 * scMin - } - return sc -} - -func (el *KmeansElkan) assignData(centroidDistances [][]float64, - sc []float64, - clusters containers.Clusters, - vectors [][]float64, - iterationCount int) (int, error) { - - moves := 0 - k := len(centroidDistances) - - for x := range vectors { - - // c(x) 
in the paper - meanIndex := el.assignments[x] - - // step 2. - // Identify all points x such that u(x) <= s(c(x)). - if el.upperBounds[x] <= sc[meanIndex] { - continue - } - - r := true //indicates that upper bound needs to be recalculated - - // step 3. - // For all remaining points x and centers c such that - // (i) c != c(x) and - // (ii) u(x)>l(x, c) and - // (iii) u(x)> 0.5 d(c(x), c): - for c := 0; c < k; c++ { - - if c == meanIndex { - continue // Pruned because this cluster is already the assignment. - } - - if el.upperBounds[x] <= el.lowerBounds[x][c] { - continue // Pruned by triangle inequality on lower bound. - } - - if el.upperBounds[x] <= centroidDistances[meanIndex][c]*0.5 { - continue // Pruned by triangle inequality on cluster distances. - } - - //step 3.a - Bounds update - // If r(x) then compute d(x, c(x)) and assign r(x)= false. Otherwise, d(x, c(x))=u(x). - if r { - distance, err := el.distFn(vectors[x], clusters[meanIndex].Center()) - if err != nil { - return 0, err - } - el.upperBounds[x] = distance - el.lowerBounds[x][meanIndex] = distance - r = false - } - - //step 3.b - Update - // If d(x, c(x))>l(x, c) or d(x, c(x))> 0.5 d(c(x), c) then - // Compute d(x, c) - // If d(x, c) el.lowerBounds[x][c] || - el.upperBounds[x] > centroidDistances[meanIndex][c]*0.5 { - newDistance, _ := el.distFn(vectors[x], clusters[c].Center()) - el.lowerBounds[x][c] = newDistance - if newDistance < el.upperBounds[x] { - meanIndex = c - el.upperBounds[x] = newDistance - } - } - - } - - // Update Assigment/Membership & use this info to later update Mean Centroid - if meanIndex != el.assignments[x] { - el.assignments[x] = meanIndex - moves++ - } else if iterationCount == 0 { - moves++ - } - clusters[meanIndex].AddMember(vectors[x]) - } - return moves, nil -} - -func (el *KmeansElkan) updateBounds(moveDistances []float64, data [][]float64) { - k := len(moveDistances) - - for x := range data { - for c := 0; c < k; c++ { - //Step 5 - //For each point x and center c, 
assign - // l(x, c)= max{ l(x, c)-d(c, m(c)), 0 } - el.lowerBounds[x][c] = math.Max(el.lowerBounds[x][c]-moveDistances[c], 0) - } - // Step 6 - // For each point x, assign - // u(x)=u(x)+d(m(c(x)), c(x)) - // r(x)= true - el.upperBounds[x] += moveDistances[el.assignments[x]] - } -} - -func (el *KmeansElkan) isConverged(i int, movement int) bool { - vectorCnt := float64(len(el.vectors)) - if i == el.iterationThreshold || movement < int(vectorCnt*el.deltaThreshold) || movement == 0 { - return true - } - return false -} - -func (el *KmeansElkan) reset(clusters containers.Clusters) { - clusters.Reset() - for i := range el.upperBounds { - el.upperBounds[i] = math.MaxFloat64 - } -} diff --git a/clusterer/lloyd.go b/clusterer/lloyd.go deleted file mode 100644 index c03b115..0000000 --- a/clusterer/lloyd.go +++ /dev/null @@ -1,135 +0,0 @@ -package clusterer - -import ( - "github.com/arjunsk/kmeans/containers" - "github.com/arjunsk/kmeans/initializer" - "math/rand" -) - -type Lloyd struct { - deltaThreshold float64 - iterationThreshold int - - distFn containers.DistanceFunction - initializer initializer.Initializer - - // local state - vectors [][]float64 - clusterCnt int -} - -var _ Clusterer = new(Lloyd) - -func NewKmeans(vectors [][]float64, clusterCnt int, - deltaThreshold float64, - iterationThreshold int, - distFn containers.DistanceFunction, - init initializer.Initializer) (Clusterer, error) { - - err := validateArgs(vectors, clusterCnt, deltaThreshold, iterationThreshold) - if err != nil { - return nil, err - } - - m := Lloyd{ - deltaThreshold: deltaThreshold, - iterationThreshold: iterationThreshold, - distFn: distFn, - initializer: init, - vectors: vectors, - clusterCnt: clusterCnt, - } - - return m, nil -} - -func (ll Lloyd) Cluster() (containers.Clusters, error) { - - clusters, err := ll.initializer.InitCentroids(ll.vectors, ll.clusterCnt) - if err != nil { - return nil, err - } - - err = ll.kmeans(clusters) - if err != nil { - return nil, err - } - - return 
clusters, nil -} - -// kmeans Complexity := O(n*k*e*d); n = number of vectors, k = number of clusters, e = number of iterations, d = number of dimensions -func (ll Lloyd) kmeans(clusters containers.Clusters) (err error) { - - assignments := make([]int, len(ll.vectors)) - var movement int - - for i := 0; ; i++ { - //1. Reset the state - movement = 0 - clusters.Reset() - - // 2. Assign vectors to the nearest cluster - movement, err = ll.assignData(ll.vectors, clusters, assignments, movement) - if err != nil { - return err - } - - // 3.b Update the cluster centroids for Empty clusters - for clusterId := 0; clusterId < len(clusters); clusterId++ { - if len(clusters[clusterId].Members()) == 0 { - //vecIdx represents an index of a vector from a "cluster with more than one member" - var vecIdx int - for { - vecIdx = rand.Intn(len(ll.vectors)) - if len(clusters[assignments[vecIdx]].Members()) > 1 { - break - } - } - clusters[clusterId].AddMember(ll.vectors[vecIdx]) - assignments[vecIdx] = clusterId - movement = len(ll.vectors) - } - } - - // 4. Recenter the clusters - if movement > 0 { - err = clusters.Recenter() - if err != nil { - return err - } - } - - if ll.isConverged(i, movement) { - break - } - } - - return nil -} - -func (ll Lloyd) assignData(vectors [][]float64, clusters containers.Clusters, clusterIds []int, movement int) (int, error) { - // 2. 
Assign each vector to the nearest cluster - for vecIdx, vec := range vectors { - clusterId, _, err := clusters.Nearest(vec, ll.distFn) - if err != nil { - return 0, err - } - clusters[clusterId].AddMember(vec) - - // 3.a Update the cluster id of the vector - if clusterIds[vecIdx] != clusterId { - clusterIds[vecIdx] = clusterId - movement++ - } - } - return movement, nil -} - -func (ll Lloyd) isConverged(i int, movement int) bool { - vectorCnt := float64(len(ll.vectors)) - if i == ll.iterationThreshold || movement < int(vectorCnt*ll.deltaThreshold) || movement == 0 { - return true - } - return false -} diff --git a/clusterer/plus_plus.go b/clusterer/plus_plus.go deleted file mode 100644 index 4cc1ec7..0000000 --- a/clusterer/plus_plus.go +++ /dev/null @@ -1,36 +0,0 @@ -package clusterer - -import ( - "github.com/arjunsk/kmeans/containers" - "github.com/arjunsk/kmeans/initializer" -) - -type KmeansPP struct { - Lloyd -} - -var _ Clusterer = new(KmeansPP) - -func NewKmeansPlusPlus(vectors [][]float64, clusterCnt int, - deltaThreshold float64, - iterationThreshold int, - distFn containers.DistanceFunction) (Clusterer, error) { - - clusterer := Lloyd{ - deltaThreshold: deltaThreshold, - iterationThreshold: iterationThreshold, - distFn: distFn, - initializer: initializer.NewKmeansPlusPlusInitializer(distFn), - vectors: vectors, - clusterCnt: clusterCnt, - } - - return &KmeansPP{ - Lloyd: clusterer, - }, nil - -} - -func (kpp KmeansPP) Cluster() (containers.Clusters, error) { - return kpp.Lloyd.Cluster() -} diff --git a/clusterer/types.go b/clusterer/types.go deleted file mode 100644 index 6d70e51..0000000 --- a/clusterer/types.go +++ /dev/null @@ -1,55 +0,0 @@ -package clusterer - -import ( - "errors" - "github.com/arjunsk/kmeans/containers" - "sync" - "sync/atomic" -) - -type Clusterer interface { - // Cluster will run the clustering algorithm. - // NOTE: We will not support adding clusterCnt as argument ie Cluster(clusterCnt). 
- // This would require a factory pattern to create states - // (assignments, lowerBounds, upperBounds, r) etc. for each call. - // NOTE: it is good that we return concretion of `Clusters`. The client can - // decide on the abstraction on `Clusters` and decide which functions to expose. - Cluster() (containers.Clusters, error) -} - -func validateArgs(vectors [][]float64, clusterCnt int, deltaThreshold float64, iterationThreshold int) error { - if len(vectors) == 0 { - return errors.New("kmeans: The data set must not be empty") - } - - if clusterCnt > len(vectors) { - return errors.New("kmeans: The count of the data set must at least equal k") - } - - if deltaThreshold <= 0.0 || deltaThreshold >= 1.0 { - return errors.New("kmeans: threshold is out of bounds (must be >0.0 and <1.0, in percent)") - } - - if iterationThreshold <= 0 { - return errors.New("kmeans: iterationThreshold must be > 0") - } - - vecDim := len(vectors[0]) - var dimMismatch atomic.Bool - var wg sync.WaitGroup - for i := 1; i < len(vectors); i++ { - wg.Add(1) - go func(i int) { - defer wg.Done() - if len(vectors[i]) != vecDim { - dimMismatch.Store(true) - } - }(i) - } - wg.Wait() - if dimMismatch.Load() { - return errors.New("kmeans: The data set must contain vectors of the same dimension") - } - - return nil -} diff --git a/clusterer/types_test.go b/clusterer/types_test.go deleted file mode 100644 index 2e83415..0000000 --- a/clusterer/types_test.go +++ /dev/null @@ -1,65 +0,0 @@ -package clusterer - -import "testing" - -func Test_validateArgs(t *testing.T) { - type args struct { - vectors [][]float64 - clusterCnt int - deltaThreshold float64 - iterationThreshold int - } - tests := []struct { - name string - args args - wantErr bool - }{ - { - name: "Test 1 - dimension mismatch", - args: args{ - vectors: [][]float64{{1, 2, 3}, {1, 2}}, - clusterCnt: 1, - deltaThreshold: 0.7, - iterationThreshold: 100, - }, - wantErr: true, - }, - { - name: "Test 2 - k > len(vectors)", - args: args{ - vectors: 
[][]float64{{1, 2, 3}, {1, 2, 4}}, - clusterCnt: 3, - deltaThreshold: 0.7, - iterationThreshold: 100, - }, - wantErr: true, - }, - { - name: "Test 3 - delta threshold", - args: args{ - vectors: [][]float64{{1, 2, 3}, {1, 2, 4}}, - clusterCnt: 3, - deltaThreshold: 1.1, - iterationThreshold: 100, - }, - wantErr: true, - }, - { - name: "Test 4 - iteration threshold", - args: args{ - vectors: [][]float64{{1, 2, 3}, {1, 2, 4}}, - clusterCnt: 3, - deltaThreshold: 0.9, - iterationThreshold: 0, - }, - wantErr: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if err := validateArgs(tt.args.vectors, tt.args.clusterCnt, tt.args.deltaThreshold, tt.args.iterationThreshold); (err != nil) != tt.wantErr { - t.Errorf("validateArgs() error = %v, wantErr %v", err, tt.wantErr) - } - }) - } -} diff --git a/containers/cluster.go b/containers/cluster.go deleted file mode 100644 index d1bf782..0000000 --- a/containers/cluster.go +++ /dev/null @@ -1,89 +0,0 @@ -package containers - -import ( - "fmt" - "math" -) - -type Cluster struct { - center Vector - members []Vector -} - -func NewCluster(center Vector) Cluster { - return Cluster{ - center: center, - } -} - -func (c *Cluster) Recenter() { - memberCnt := len(c.members) - if memberCnt == 0 { - // If there is no members, keep the previous center only. 
- return - } - - // newCenter = "Mean" of the members - newCenter := make(Vector, len(c.center)) - for _, member := range c.members { - newCenter.Add(member) - } - newCenter.Mul(1 / float64(memberCnt)) - - c.center = newCenter -} - -func (c *Cluster) RecenterWithMovedDistance(distFn DistanceFunction) (moveDistance float64, err error) { - memberCnt := len(c.members) - if memberCnt == 0 { - //return 0, errors.New("kmeans: there is no mean for an empty cluster") - return 0, nil - } - - // newCenter is the "Mean" of the members - newCenter := make(Vector, len(c.center)) - for _, member := range c.members { - newCenter.Add(member) - } - newCenter.Mul(1 / float64(memberCnt)) - - moveDistance, err = distFn(c.center, newCenter) - if err != nil { - return 0, err - } - - c.center = newCenter - - return moveDistance, nil -} - -// SSE returns the sum of squared errors of the cluster -func (c *Cluster) SSE() float64 { - sse := 0.0 - for i := 0; i < len(c.members); i++ { - dist, _ := EuclideanDistance(c.center, c.members[i]) - sse += math.Pow(dist, 2) - } - return sse -} - -// Reset only resets the members of the cluster. The center is not reset. 
-func (c *Cluster) Reset() { - c.members = []Vector{} -} - -func (c *Cluster) String() string { - return fmt.Sprintf("center: %v, members: %v", c.center, c.members) -} - -func (c *Cluster) Center() Vector { - return c.center -} - -func (c *Cluster) Members() []Vector { - return c.members -} - -func (c *Cluster) AddMember(member Vector) { - c.members = append(c.members, member) -} diff --git a/containers/cluster_test.go b/containers/cluster_test.go deleted file mode 100644 index 84307c3..0000000 --- a/containers/cluster_test.go +++ /dev/null @@ -1,234 +0,0 @@ -package containers - -import ( - "reflect" - "testing" -) - -func TestCluster_Recenter(t *testing.T) { - type fields struct { - center Vector - members []Vector - } - tests := []struct { - name string - fields fields - wantCenter Vector - }{ - { - name: "test1", - fields: fields{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 2}}, - }, - wantCenter: Vector{1.5, 1.5}, - }, - { - name: "test2", - fields: fields{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - wantCenter: Vector{2, 2}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - c := &Cluster{ - center: tt.fields.center, - members: tt.fields.members, - } - c.Recenter() - if c.center.Compare(tt.wantCenter) != 0 { - t.Errorf("Recenter() gotCenter = %v, want %v", c.center, tt.wantCenter) - } - }) - } -} - -func TestCluster_RecenterReturningMovedDistance(t *testing.T) { - type fields struct { - center Vector - members []Vector - } - type args struct { - distFn DistanceFunction - } - tests := []struct { - name string - fields fields - args args - wantMoveDistances float64 - wantCenter Vector - wantErr bool - }{ - { - name: "empty cluster", - fields: fields{ - center: Vector{1, 1}, - members: []Vector{}, - }, - args: args{distFn: EuclideanDistance}, - wantCenter: Vector{1, 1}, // unchanged - }, - { - name: "non-empty cluster", - fields: fields{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 
2}}, - }, - args: args{distFn: EuclideanDistance}, - wantMoveDistances: 0.7071067811865476, - wantCenter: Vector{1.5, 1.5}, // changed - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - c := &Cluster{ - center: tt.fields.center, - members: tt.fields.members, - } - gotMoveDistance, err := c.RecenterWithMovedDistance(tt.args.distFn) - if (err != nil) != tt.wantErr { - t.Errorf("RecenterReturningMovedDistance() error = %v, wantErr %v", err, tt.wantErr) - return - } - if gotMoveDistance != tt.wantMoveDistances { - t.Errorf("RecenterReturningMovedDistance() gotMoveDistance = %v, want %v", gotMoveDistance, tt.wantMoveDistances) - } - if c.center.Compare(tt.wantCenter) != 0 { - t.Errorf("Recenter() gotCenter = %v, want %v", c.center, tt.wantCenter) - } - }) - } -} - -func TestCluster_Reset(t *testing.T) { - type fields struct { - center Vector - members []Vector - } - tests := []struct { - name string - fields fields - }{ - { - name: "Test1", - fields: fields{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - c := &Cluster{ - center: tt.fields.center, - members: tt.fields.members, - } - c.Reset() - if len(c.members) != 0 { - t.Errorf("Reset() = %v, want %v", c.members, []Vector{}) - } - if c.center.Compare(tt.fields.center) != 0 { - t.Errorf("Reset() = %v, want %v", c.center, tt.fields.center) - } - }) - } -} - -func TestCluster_SSE(t *testing.T) { - type fields struct { - center Vector - members []Vector - } - tests := []struct { - name string - fields fields - want float64 - }{ - { - name: "Test1", - fields: fields{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {3, 3}, {3, 3}}, - }, - want: 16.000000000000004, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - c := &Cluster{ - center: tt.fields.center, - members: tt.fields.members, - } - if got := c.SSE(); got != tt.want { - t.Errorf("SSE() = %v, want %v", got, 
tt.want) - } - }) - } -} - -func TestCluster_String(t *testing.T) { - type fields struct { - center Vector - members []Vector - } - tests := []struct { - name string - fields fields - want string - }{ - { - name: "Test1", - fields: fields{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {3, 3}, {3, 3}}, - }, - want: "center: [1 1], members: [[1 1] [3 3] [3 3]]", - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - c := &Cluster{ - center: tt.fields.center, - members: tt.fields.members, - } - if got := c.String(); got != tt.want { - t.Errorf("String() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestCluster_AddMember_Members(t *testing.T) { - - tests := []struct { - name string - cluster *Cluster - argMembers []Vector - wantMembers []Vector - }{ - { - name: "Test1", - cluster: &Cluster{ - center: Vector{1, 1}, - members: []Vector{{1, 2}, {2, 3}}, - }, - argMembers: []Vector{{4, 5}, {6, 7}}, - wantMembers: []Vector{{1, 2}, {2, 3}, {4, 5}, {6, 7}}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - for _, member := range tt.argMembers { - tt.cluster.AddMember(member) - } - - if !reflect.DeepEqual(tt.cluster.Members(), tt.wantMembers) { - t.Errorf("Members() = %v, want %v", tt.cluster.Members(), tt.wantMembers) - } - - }) - } -} diff --git a/containers/clusters.go b/containers/clusters.go deleted file mode 100644 index 36051d3..0000000 --- a/containers/clusters.go +++ /dev/null @@ -1,103 +0,0 @@ -package containers - -import ( - "fmt" - "golang.org/x/sync/errgroup" - "sync" -) - -// Clusters is a collection of Cluster. -// None of the methods are pointer receivers, since we don't want to mutate the Clusters. -// We mutate the Cluster instead of Clusters. Cluster has a pointer receiver. 
-type Clusters []Cluster - -// Nearest returns the index, distance of the cluster nearest to point -func (c Clusters) Nearest(point Vector, distFn DistanceFunction) (minClusterIdx int, minDistance float64, err error) { - if distFn == nil { - panic(fmt.Errorf("distance function is nil")) - } - - minClusterIdx = 0 - - var currDistance = 0.0 - minDistance, err = distFn(point, c[0].Center()) - if err != nil { - return 0, 0, err - } - - for i := 1; i < len(c); i++ { - currDistance, err = distFn(point, c[i].Center()) - if err != nil { - return 0, 0, err - } - if currDistance < minDistance { - minDistance = currDistance - minClusterIdx = i - } - } - - return minClusterIdx, minDistance, nil -} - -func (c Clusters) Recenter() error { - clusterCnt := len(c) - var wg sync.WaitGroup - for i := 0; i < clusterCnt; i++ { - wg.Add(1) - go (func(i int) { - defer wg.Done() - c[i].Recenter() - })(i) - } - wg.Wait() - return nil -} - -func (c Clusters) RecenterWithDeltaDistance(distFn DistanceFunction) (moveDistances []float64, err error) { - if distFn == nil { - return nil, fmt.Errorf("distance function is nil") - } - - clusterCnt := len(c) - moveDistances = make([]float64, clusterCnt) - - eg := new(errgroup.Group) - for i := 0; i < clusterCnt; i++ { - func(id int) { - eg.Go(func() error { - moveDistances[id], err = c[id].RecenterWithMovedDistance(distFn) - if err != nil { - return err - } - return nil - }) - }(i) - } - - if err = eg.Wait(); err != nil { - return nil, err - } - return moveDistances, nil -} - -func (c Clusters) Reset() { - for i := 0; i < len(c); i++ { - c[i].Reset() - } -} - -func (c Clusters) String() string { - var s = "" - for i := 0; i < len(c); i++ { - s += fmt.Sprintf("%d: %s\n", i, c[i].String()) - } - return s -} - -func (c Clusters) SSE() float64 { - var sse = 0.0 - for i := 0; i < len(c); i++ { - sse += c[i].SSE() - } - return sse -} diff --git a/containers/clusters_test.go b/containers/clusters_test.go deleted file mode 100644 index a5fcfbd..0000000 --- 
a/containers/clusters_test.go +++ /dev/null @@ -1,269 +0,0 @@ -package containers - -import ( - "reflect" - "testing" -) - -func TestClusters_Recenter(t *testing.T) { - tests := []struct { - name string - c Clusters - wantErr bool - wantCenters []Vector - }{ - { - name: "Test1", - c: Clusters{ - Cluster{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - Cluster{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 2}}, - }, - }, - wantErr: false, - wantCenters: []Vector{ - {2, 2}, - {1.5, 1.5}, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if err := tt.c.Recenter(); (err != nil) != tt.wantErr { - t.Errorf("Recenter() error = %v, wantErr %v", err, tt.wantErr) - } - for i, cluster := range tt.c { - if cluster.center.Compare(tt.wantCenters[i]) != 0 { - t.Errorf("Recenter() gotCenter = %v, want %v", cluster.center, tt.wantCenters[i]) - } - } - }) - } -} - -func TestClusters_RecenterWithDeltaDistance(t *testing.T) { - type args struct { - distFn DistanceFunction - } - tests := []struct { - name string - c Clusters - args args - wantMoveDistances []float64 - wantErr bool - wantCenters []Vector - }{ - { - name: "Test1", - c: Clusters{ - Cluster{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - Cluster{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 2}}, - }, - }, - wantErr: false, - wantMoveDistances: []float64{ - 1.4142135623730951, - 0.7071067811865476, - }, - wantCenters: []Vector{ - {2, 2}, - {1.5, 1.5}, - }, - args: args{ - distFn: EuclideanDistance, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - gotMoveDistances, err := tt.c.RecenterWithDeltaDistance(tt.args.distFn) - if (err != nil) != tt.wantErr { - t.Errorf("RecenterWithDeltaDistance() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !reflect.DeepEqual(gotMoveDistances, tt.wantMoveDistances) { - t.Errorf("RecenterWithDeltaDistance() gotMoveDistances = %v, want %v", 
gotMoveDistances, tt.wantMoveDistances) - } - for i, cluster := range tt.c { - if cluster.center.Compare(tt.wantCenters[i]) != 0 { - t.Errorf("Recenter() gotCenter = %v, want %v", cluster.center, tt.wantCenters[i]) - } - } - }) - } -} - -func TestClusters_Reset(t *testing.T) { - tests := []struct { - name string - c Clusters - }{ - { - name: "Test1", - c: Clusters{ - Cluster{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - Cluster{ - center: Vector{2, 2}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - tt.c.Reset() - for _, cluster := range tt.c { - if len(cluster.members) != 0 { - t.Errorf("Clusters.Reset() = %v, want %v", cluster.members, []Vector{}) - } - if cluster.center.Compare(Vector{}) == 0 { - // If center is cleared, then there is a problem. - t.Errorf("Clusters.Reset() = %v, want %v", Vector{}, cluster.center) - } - } - }) - } -} - -func TestClusters_Nearest(t *testing.T) { - type args struct { - point Vector - distFn DistanceFunction - } - tests := []struct { - name string - c Clusters - args args - wantMinClusterIdx int - wantMinDistance float64 - wantErr bool - }{ - { - name: "Test1", - c: Clusters{ - Cluster{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - Cluster{ - center: Vector{2, 2}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - }, - args: args{ - point: Vector{1, 1}, - distFn: EuclideanDistance, - }, - wantMinClusterIdx: 0, - wantMinDistance: 0, - wantErr: false, - }, - { - name: "Test2", - c: Clusters{ - Cluster{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - Cluster{ - center: Vector{2, 2}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - }, - args: args{ - point: Vector{3, 3}, - distFn: EuclideanDistance, - }, - wantMinClusterIdx: 1, - wantMinDistance: 1.4142135623730951, - wantErr: false, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - 
gotMinClusterIdx, gotMinDistance, err := tt.c.Nearest(tt.args.point, tt.args.distFn) - if (err != nil) != tt.wantErr { - t.Errorf("Nearest() error = %v, wantErr %v", err, tt.wantErr) - return - } - if gotMinClusterIdx != tt.wantMinClusterIdx { - t.Errorf("Nearest() gotMinClusterIdx = %v, want %v", gotMinClusterIdx, tt.wantMinClusterIdx) - } - if gotMinDistance != tt.wantMinDistance { - t.Errorf("Nearest() gotMinDistance = %v, want %v", gotMinDistance, tt.wantMinDistance) - } - }) - } -} - -func TestClusters_String(t *testing.T) { - tests := []struct { - name string - c Clusters - want string - }{ - { - name: "Test1", - c: Clusters{ - Cluster{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - Cluster{ - center: Vector{2, 2}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - }, - want: "0: center: [1 1], members: [[1 1] [2 2] [3 3]]\n1: center: [2 2], members: [[1 1] [2 2] [3 3]]\n", - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := tt.c.String(); got != tt.want { - t.Errorf("String() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestClusters_SSE(t *testing.T) { - tests := []struct { - name string - c Clusters - want float64 - }{ - { - name: "Test1", - c: Clusters{ - Cluster{ - center: Vector{1, 1}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - Cluster{ - center: Vector{2, 2}, - members: []Vector{{1, 1}, {2, 2}, {3, 3}}, - }, - }, - want: 14.000000000000004, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := tt.c.SSE(); got != tt.want { - t.Errorf("SSE() = %v, want %v", got, tt.want) - } - }) - } -} diff --git a/containers/vector.go b/containers/vector.go deleted file mode 100644 index 32c77d4..0000000 --- a/containers/vector.go +++ /dev/null @@ -1,45 +0,0 @@ -package containers - -type Vector []float64 - -// Add adds a vector to the current vector -func (v *Vector) Add(vec Vector) { - for c, val := range vec { - (*v)[c] += val - } -} - -// Mul 
multiplies the vector by a scalar -func (v *Vector) Mul(scalar float64) { - for c := range *v { - (*v)[c] *= scalar - } -} - -// Compare 2 vectors on a lexicographical order -// Ref: https://stackoverflow.com/a/23907444/1609570 -func (v *Vector) Compare(v2 Vector) int { - lenV1 := len(*v) - lenV2 := len(v2) - - minLen := lenV1 - if lenV2 < lenV1 { - minLen = lenV2 - } - - for c := 0; c < minLen; c++ { - if (*v)[c] < v2[c] { - return -1 - } else if (*v)[c] > v2[c] { - return +1 - } - } - - if lenV1 < lenV2 { - return -1 - } else if lenV1 > lenV2 { - return +1 - } - - return 0 -} diff --git a/containers/vector_distance.go b/containers/vector_distance.go deleted file mode 100644 index e322525..0000000 --- a/containers/vector_distance.go +++ /dev/null @@ -1,23 +0,0 @@ -package containers - -import ( - "fmt" - "math" -) - -// DistanceFunction is a function to find distance between 2 vectors -type DistanceFunction func(v1, v2 Vector) (float64, error) - -// EuclideanDistance returns the Euclidean distance between two vectors -// Ref: https://mathworld.wolfram.com/L2-Norm.html -func EuclideanDistance(v1, v2 Vector) (float64, error) { - if len(v1) != len(v2) { - return 0, fmt.Errorf("vectors must have the same length") - } - - distance := 0.0 - for c := range v1 { - distance += math.Pow(v1[c]-v2[c], 2) - } - return math.Sqrt(distance), nil -} diff --git a/containers/vector_distance_test.go b/containers/vector_distance_test.go deleted file mode 100644 index 8c215c8..0000000 --- a/containers/vector_distance_test.go +++ /dev/null @@ -1,55 +0,0 @@ -package containers - -import ( - "testing" -) - -func TestEuclideanDistance(t *testing.T) { - type args struct { - v1 Vector - v2 Vector - } - tests := []struct { - name string - args args - want float64 - wantErr bool - }{ - { - name: "Test1", - args: args{v1: Vector{1, 2, 3}, v2: Vector{1, 2, 3}}, - want: 0, - wantErr: false, - }, - { - name: "Test2", - args: args{v1: Vector{1, 2, 3}, v2: Vector{1, 2, 4}}, - want: 1, - wantErr: 
false, - }, - { - name: "Test3", - args: args{v1: Vector{1, 2, 3}, v2: Vector{1, 2}}, - want: 0, - wantErr: true, - }, - { - name: "Test4", - args: args{v1: Vector{1, 1}, v2: Vector{1.5, 1.5}}, - want: 0.7071067811865476, - wantErr: false, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := EuclideanDistance(tt.args.v1, tt.args.v2) - if (err != nil) != tt.wantErr { - t.Errorf("EuclideanDistance() error = %v, wantErr %v", err, tt.wantErr) - return - } - if got != tt.want { - t.Errorf("EuclideanDistance() got = %v, want %v", got, tt.want) - } - }) - } -} diff --git a/containers/vector_test.go b/containers/vector_test.go deleted file mode 100644 index 9806c61..0000000 --- a/containers/vector_test.go +++ /dev/null @@ -1,108 +0,0 @@ -package containers - -import "testing" - -func TestVector_Add(t *testing.T) { - type args struct { - vec Vector - } - tests := []struct { - name string - v Vector - args args - want Vector - }{ - { - name: "test1", - v: Vector{1, 2, 3}, - args: args{Vector{-1, 4, 6}}, - want: Vector{0, 6, 9}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - tt.v.Add(tt.args.vec) - if tt.v.Compare(tt.want) != 0 { - t.Errorf("Vector.Add() = %v, want %v", tt.v, tt.want) - } - }) - } -} - -func TestVector_Mul(t *testing.T) { - type args struct { - scalar float64 - } - tests := []struct { - name string - v Vector - args args - want Vector - }{ - { - name: "test1", - v: Vector{1, 2, 3}, - args: args{4}, - want: Vector{4, 8, 12}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - tt.v.Mul(tt.args.scalar) - if tt.v.Compare(tt.want) != 0 { - t.Errorf("Vector.Add() = %v, want %v", tt.v, tt.want) - } - }) - } -} - -func TestVector_Compare(t *testing.T) { - type args struct { - v2 Vector - } - tests := []struct { - name string - v Vector - args args - want int - }{ - - { - name: "test1", - v: Vector{1, 2, 3}, - args: args{Vector{1, 2, 3}}, - want: 0, - }, - { - name: "test2", 
- v: Vector{1, 2, 3}, - args: args{Vector{1, 2, 4}}, - want: -1, - }, - { - name: "test3", - v: Vector{1, 2, 3}, - args: args{Vector{1, 2, 2}}, - want: 1, - }, - { - name: "test4", - v: Vector{1, 2, 3}, - args: args{Vector{1, 2, 3, 4}}, - want: -1, - }, - { - name: "test5", - v: Vector{1, 2, 3, 4}, - args: args{Vector{1, 2, 3}}, - want: 1, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := tt.v.Compare(tt.args.v2); got != tt.want { - t.Errorf("Compare() = %v, want %v", got, tt.want) - } - }) - } -} diff --git a/elkans/clusterer.go b/elkans/clusterer.go new file mode 100644 index 0000000..da174f3 --- /dev/null +++ b/elkans/clusterer.go @@ -0,0 +1,456 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package elkans + +import ( + "github.com/arjunsk/kmeans" + "github.com/arjunsk/kmeans/moarray" + "github.com/arjunsk/kmeans/moerr" + "gonum.org/v1/gonum/mat" + "math" + "math/rand" + "sync" +) + +// ElkanClusterer is an improved kmeans algorithm which using the triangle inequality to reduce the number of +// distance calculations. As quoted from the paper: +// "The main contribution of this paper is an optimized version of the standard k-means method, with which the number +// of distance computations is in practice closer to `n` than to `nke`, where n is the number of vectors, k is the number +// of centroids, and e is the number of iterations needed until convergence." 
+// +// However, during each iteration of the algorithm, the lower bounds l(x, c) are updated for all points x and centers c. +// These updates take O(nk) time, so the complexity of the algorithm remains at least O(nke), even though the number +// of distance calculations is roughly O(n) only. +// NOTE that, distance calculation is very expensive for higher dimension vectors. +// +// Ref Paper: https://cdn.aaai.org/ICML/2003/ICML03-022.pdf +type ElkanClusterer struct { + + // for each of the n vectors, we keep track of the following data + vectorList []*mat.VecDense + vectorMetas []vectorMeta + assignments []int + + // for each of the k centroids, we keep track of the following data + centroids []*mat.VecDense + halfInterCentroidDistMatrix [][]float64 + minHalfInterCentroidDist []float64 + + // thresholds + maxIterations int // e in paper + deltaThreshold float64 // used for early convergence. we are not using it right now. + + // counts + clusterCnt int // k in paper + vectorCnt int // n in paper + + distFn kmeans.DistanceFunction + initType kmeans.InitType + rand *rand.Rand + normalize bool +} + +// vectorMeta holds required information for Elkan's kmeans pruning. +// lower is at-least distance of a vector to each of the k centroids. Thus, there are k values for each data point. +// upper is at-most (maximum possible) distance of a vector to its currently assigned or "closest" centroid. +// Hence, there's only one value for each data point. +// recompute is a flag to indicate if the distance to centroids needs to be recomputed. if false, +// the algorithm will rely on the 'upper' bound as an approximation instead of computing the exact distance. 
+type vectorMeta struct { + lower []float64 + upper float64 + recompute bool +} + +var _ kmeans.Clusterer = new(ElkanClusterer) + +func NewKMeans(vectors [][]float64, clusterCnt, + maxIterations int, deltaThreshold float64, + distanceType kmeans.DistanceType, initType kmeans.InitType, + normalize bool, +) (kmeans.Clusterer, error) { + + err := validateArgs(vectors, clusterCnt, maxIterations, deltaThreshold, distanceType, initType) + if err != nil { + return nil, err + } + + gonumVectors, err := moarray.ToGonumVectors[float64](vectors...) + if err != nil { + return nil, err + } + + assignments := make([]int, len(vectors)) + var metas = make([]vectorMeta, len(vectors)) + for i := range metas { + metas[i] = vectorMeta{ + lower: make([]float64, clusterCnt), + upper: 0, + recompute: true, + } + } + + centroidDist := make([][]float64, clusterCnt) + for i := range centroidDist { + centroidDist[i] = make([]float64, clusterCnt) + } + minCentroidDist := make([]float64, clusterCnt) + + distanceFunction, err := resolveDistanceFn(distanceType) + if err != nil { + return nil, err + } + + return &ElkanClusterer{ + maxIterations: maxIterations, + deltaThreshold: deltaThreshold, + + vectorList: gonumVectors, + assignments: assignments, + vectorMetas: metas, + + //centroids will be initialized by InitCentroids() + halfInterCentroidDistMatrix: centroidDist, + minHalfInterCentroidDist: minCentroidDist, + + distFn: distanceFunction, + initType: initType, + clusterCnt: clusterCnt, + vectorCnt: len(vectors), + + rand: rand.New(rand.NewSource(kmeans.DefaultRandSeed)), + normalize: normalize, + }, nil +} + +// InitCentroids initializes the centroids using initialization algorithms like random or kmeans++. 
+func (km *ElkanClusterer) InitCentroids() error { + var initializer Initializer + switch km.initType { + case kmeans.Random: + initializer = NewRandomInitializer() + case kmeans.KmeansPlusPlus: + initializer = NewKMeansPlusPlusInitializer(km.distFn) + default: + initializer = NewRandomInitializer() + } + km.centroids = initializer.InitCentroids(km.vectorList, km.clusterCnt) + return nil +} + +// Cluster returns the final centroids and the error if any. +func (km *ElkanClusterer) Cluster() ([][]float64, error) { + if km.normalize { + moarray.NormalizeGonumVectors(km.vectorList) + } + + if km.vectorCnt == km.clusterCnt { + return moarray.ToMoArrays[float64](km.vectorList), nil + } + + err := km.InitCentroids() // step 0.1 + if err != nil { + return nil, err + } + + km.initBounds() // step 0.2 + + res, err := km.elkansCluster() + if err != nil { + return nil, err + } + + return moarray.ToMoArrays[float64](res), nil +} + +func (km *ElkanClusterer) elkansCluster() ([]*mat.VecDense, error) { + + for iter := 0; ; iter++ { + km.computeCentroidDistances() // step 1 + + changes := km.assignData() // step 2 and 3 + + newCentroids := km.recalculateCentroids() // step 4 + + km.updateBounds(newCentroids) // step 5 and 6 + + km.centroids = newCentroids // step 7 + + //logutil.Debugf("kmeans iter=%d, changes=%d", iter, changes) + if iter != 0 && km.isConverged(iter, changes) { + break + } + } + return km.centroids, nil +} + +func validateArgs(vectorList [][]float64, clusterCnt, + maxIterations int, deltaThreshold float64, + distanceType kmeans.DistanceType, initType kmeans.InitType) error { + if len(vectorList) == 0 || len(vectorList[0]) == 0 { + return moerr.NewInternalErrorNoCtx("input vectors is empty") + } + if clusterCnt > len(vectorList) { + return moerr.NewInternalErrorNoCtx("cluster count is larger than vector count %d > %d", clusterCnt, len(vectorList)) + } + if maxIterations < 0 { + return moerr.NewInternalErrorNoCtx("max iteration is out of bounds (must be >= 0)") + } 
+ if deltaThreshold <= 0.0 || deltaThreshold >= 1.0 { + return moerr.NewInternalErrorNoCtx("delta threshold is out of bounds (must be > 0.0 and < 1.0)") + } + if distanceType > 2 { + return moerr.NewInternalErrorNoCtx("distance type is not supported") + } + if initType > 1 { + return moerr.NewInternalErrorNoCtx("init type is not supported") + } + + // We need to validate that all vectors have the same dimension. + // This is already done by moarray.ToGonumVectors, so skipping it here. + + if (clusterCnt * clusterCnt) > math.MaxInt { + return moerr.NewInternalErrorNoCtx("cluster count is too large for int*int") + } + + return nil +} + +// initBounds initializes the lower bounds, upper bound and assignment for each vector. +func (km *ElkanClusterer) initBounds() { + // step 0.2 + // Set the lower bound l(x, c)=0 for each point x and center c. + // Assign each x to its closest initial center c(x)=min{ d(x, c) }, using Lemma 1 to avoid + // redundant distance calculations. Each time d(x, c) is computed, set l(x, c)=d(x, c). + // Assign upper bounds u(x)=min_c d(x, c). + for x := range km.vectorList { + minDist := math.MaxFloat64 + closestCenter := 0 + for c := range km.centroids { + dist := km.distFn(km.vectorList[x], km.centroids[c]) + km.vectorMetas[x].lower[c] = dist + if dist < minDist { + minDist = dist + closestCenter = c + } + } + + km.vectorMetas[x].upper = minDist + km.assignments[x] = closestCenter + } +} + +// computeCentroidDistances computes the centroid distances and the min centroid distances. +// NOTE: here we are save 0.5 of centroid distance to avoid 0.5 multiplication in step 3(iii) and 3.b. +func (km *ElkanClusterer) computeCentroidDistances() { + + // step 1.a + // For all centers c and c', compute 0.5 x d(c, c'). 
+ var wg sync.WaitGroup + for r := 0; r < km.clusterCnt; r++ { + for c := r + 1; c < km.clusterCnt; c++ { + wg.Add(1) + go func(i, j int) { + defer wg.Done() + dist := 0.5 * km.distFn(km.centroids[i], km.centroids[j]) + km.halfInterCentroidDistMatrix[i][j] = dist + km.halfInterCentroidDistMatrix[j][i] = dist + }(r, c) + } + } + wg.Wait() + + // step 1.b + // For all centers c, compute s(c)=0.5 x min{d(c, c') | c'!= c}. + for i := 0; i < km.clusterCnt; i++ { + currMinDist := math.MaxFloat64 + for j := 0; j < km.clusterCnt; j++ { + if i == j { + continue + } + currMinDist = math.Min(currMinDist, km.halfInterCentroidDistMatrix[i][j]) + } + km.minHalfInterCentroidDist[i] = currMinDist + } +} + +// assignData assigns each vector to the nearest centroid. +// This is the place where most of the "distance computation skipping" happens. +func (km *ElkanClusterer) assignData() int { + + changes := 0 + + for currVector := range km.vectorList { + + // step 2 + // u(x) <= s(c(x)) + if km.vectorMetas[currVector].upper <= km.minHalfInterCentroidDist[km.assignments[currVector]] { + continue + } + + for c := range km.centroids { // c is nextPossibleCentroidIdx + // step 3 + // For all remaining points x and centers c such that + // (i) c != c(x) and + // (ii) u(x)>l(x, c) and + // (iii) u(x)> 0.5 x d(c(x), c) + if c != km.assignments[currVector] && + km.vectorMetas[currVector].upper > km.vectorMetas[currVector].lower[c] && + km.vectorMetas[currVector].upper > km.halfInterCentroidDistMatrix[km.assignments[currVector]][c] { + + //step 3.a - Bounds update + // If r(x) then compute d(x, c(x)) and assign r(x)= false. 
+ var dxcx float64 + if km.vectorMetas[currVector].recompute { + km.vectorMetas[currVector].recompute = false + + dxcx = km.distFn(km.vectorList[currVector], km.centroids[km.assignments[currVector]]) + km.vectorMetas[currVector].upper = dxcx + km.vectorMetas[currVector].lower[km.assignments[currVector]] = dxcx + + if km.vectorMetas[currVector].upper <= km.vectorMetas[currVector].lower[c] { + continue // Pruned by triangle inequality on lower bound. + } + + if km.vectorMetas[currVector].upper <= km.halfInterCentroidDistMatrix[km.assignments[currVector]][c] { + continue // Pruned by triangle inequality on cluster distances. + } + + } else { + dxcx = km.vectorMetas[currVector].upper // Otherwise, d(x, c(x))=u(x). + } + + //step 3.b - Update + // If d(x, c(x))>l(x, c) or d(x, c(x))> 0.5 d(c(x), c) then + // Compute d(x, c) + // If d(x, c) < d(x, c(x)) then assign c(x)=c + if dxcx > km.vectorMetas[currVector].lower[c] || + dxcx > km.halfInterCentroidDistMatrix[km.assignments[currVector]][c] { + + dxc := km.distFn(km.vectorList[currVector], km.centroids[c]) // d(x,c) in the paper + km.vectorMetas[currVector].lower[c] = dxc + if dxc < dxcx { + km.vectorMetas[currVector].upper = dxc + km.assignments[currVector] = c + changes++ + } + } + } + } + return changes +} + +// recalculateCentroids calculates the new mean centroids based on the new assignments. 
+func (km *ElkanClusterer) recalculateCentroids() []*mat.VecDense { + membersCount := make([]int64, km.clusterCnt) + + newCentroids := make([]*mat.VecDense, km.clusterCnt) + for c := range newCentroids { + newCentroids[c] = mat.NewVecDense(km.vectorList[0].Len(), nil) + } + + // sum of all the members of the cluster + for x, vec := range km.vectorList { + cx := km.assignments[x] + membersCount[cx]++ + newCentroids[cx].AddVec(newCentroids[cx], vec) + } + + // means of the clusters = sum of all the members of the cluster / number of members in the cluster + for c := range newCentroids { + if membersCount[c] == 0 { + // pick a vector randomly from existing vectors as the new centroid + //newCentroids[c] = km.vectorList[km.rand.Intn(km.vectorCnt)] + + //// if the cluster is empty, reinitialize it to a random vector, since you can't find the mean of an empty set + randVector := make([]float64, km.vectorList[0].Len()) + for l := range randVector { + randVector[l] = km.rand.Float64() + } + newCentroids[c] = mat.NewVecDense(km.vectorList[0].Len(), randVector) + + // normalize the random vector + if km.normalize { + moarray.NormalizeGonumVector(newCentroids[c]) + } + } else { + // find the mean of the cluster members + // note: we don't need to normalize here, since the vectors are already normalized + newCentroids[c].ScaleVec(1.0/float64(membersCount[c]), newCentroids[c]) + } + + } + + return newCentroids +} + +// updateBounds updates the lower and upper bounds for each vector. +func (km *ElkanClusterer) updateBounds(newCentroid []*mat.VecDense) { + + // compute the centroid shift distance matrix once. 
+ // d(c', m(c')) in the paper + centroidShiftDist := make([]float64, km.clusterCnt) + var wg sync.WaitGroup + for c := 0; c < km.clusterCnt; c++ { + wg.Add(1) + go func(cIdx int) { + defer wg.Done() + centroidShiftDist[cIdx] = km.distFn(km.centroids[cIdx], newCentroid[cIdx]) + //logutil.Debugf("centroidShiftDist[%d]=%f", cIdx, centroidShiftDist[cIdx]) + }(c) + } + wg.Wait() + + // step 5 + //For each point x and center c, assign + // l(x, c)= max{ l(x, c)-d(c, m(c)), 0 } + for x := range km.vectorList { + for c := range km.centroids { + shift := km.vectorMetas[x].lower[c] - centroidShiftDist[c] + km.vectorMetas[x].lower[c] = math.Max(shift, 0) + } + + // step 6 + // For each point x, assign + // u(x)= u(x) + d(m(c(x)), c(x)) + // r(x)= true + cx := km.assignments[x] + km.vectorMetas[x].upper += centroidShiftDist[cx] + km.vectorMetas[x].recompute = true + } +} + +// isConverged checks if the algorithm has converged. +func (km *ElkanClusterer) isConverged(iter int, changes int) bool { + if iter == km.maxIterations || changes == 0 { + return true + } + // NOTE: we are not using deltaThreshold right now. + //if changes < int(float64(km.vectorCnt)*km.deltaThreshold) { + // return true + //} + return false +} + +// SSE returns the sum of squared errors. +func (km *ElkanClusterer) SSE() float64 { + sse := 0.0 + for i := range km.vectorList { + distErr := km.distFn(km.vectorList[i], km.centroids[km.assignments[i]]) + sse += math.Pow(distErr, 2) + } + return sse +} diff --git a/elkans/clusterer_bench_test.go b/elkans/clusterer_bench_test.go new file mode 100644 index 0000000..15ec209 --- /dev/null +++ b/elkans/clusterer_bench_test.go @@ -0,0 +1,110 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package elkans + +import ( + "github.com/arjunsk/kmeans" + "math/rand" + "strconv" + "testing" +) + +/* +date : 2023-10-31 +goos: darwin +goarch: arm64 +cpu: Apple M2 Pro +row count: 10_000 +dims: 128 +k: 1000 +Benchmark_kmeans/Spherical_Elkan_Random-10 1 16735189750 ns/op +Benchmark_kmeans/Spherical_Elkan_Kmeans++-10 1 17543632667 ns/op +*/ +func Benchmark_kmeans(b *testing.B) { + //logutil.SetupMOLogger(&logutil.LogConfig{ + // Level: "debug", + // Format: "console", + //}) + + rowCnt := 1000_000 + k := 1000 + sampleCnt := CalcSampleCount(int64(k), int64(rowCnt)) + dims := 128 + + data := make([][]float64, sampleCnt) + populateRandData(int(sampleCnt), dims, data) + + b.Run("Spherical_Elkan_Random", func(b *testing.B) { + b.ResetTimer() + clusterRand, _ := NewKMeans(data, k, + 500, 0.01, + kmeans.L2Distance, kmeans.Random, true) + _, err := clusterRand.Cluster() + if err != nil { + panic(err) + } + b.Log("SSE - clusterRand", strconv.FormatFloat(clusterRand.SSE(), 'f', -1, 32)) + + }) + + //Will not work for large datasets without sampling + b.Run("Spherical_Elkan_Kmeans++", func(b *testing.B) { + b.ResetTimer() + kmeansPlusPlus, _ := NewKMeans(data, k, + 500, 0.01, + kmeans.L2Distance, kmeans.KmeansPlusPlus, true) + _, err := kmeansPlusPlus.Cluster() + if err != nil { + panic(err) + } + b.Log("SSE - clusterRand", strconv.FormatFloat(kmeansPlusPlus.SSE(), 'f', -1, 32)) + }) + +} + +func populateRandData(rowCnt int, dim int, vecs [][]float64) { + random := rand.New(rand.NewSource(kmeans.DefaultRandSeed)) + for r := 0; r < rowCnt; r++ { + vecs[r] 
= make([]float64, dim) + for c := 0; c < dim; c++ { + vecs[r][c] = float64(random.Float32() * 1000) + } + } +} + +const ( + KmeansSamplePerList = 50 + MaxSampleCount = 10_000 +) + +// CalcSampleCount is used to calculate the sample count for Kmeans index. +func CalcSampleCount(lists, totalCnt int64) (sampleCnt int64) { + + if totalCnt > lists*KmeansSamplePerList { + sampleCnt = lists * KmeansSamplePerList + } else { + sampleCnt = totalCnt + } + + if totalCnt > MaxSampleCount && sampleCnt < MaxSampleCount { + sampleCnt = MaxSampleCount + } + + if sampleCnt > MaxSampleCount { + sampleCnt = MaxSampleCount + } + + return sampleCnt +} diff --git a/elkans/clusterer_test.go b/elkans/clusterer_test.go new file mode 100644 index 0000000..bf6b951 --- /dev/null +++ b/elkans/clusterer_test.go @@ -0,0 +1,595 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package elkans + +import ( + "github.com/arjunsk/kmeans" + "github.com/arjunsk/kmeans/assertx" + "github.com/arjunsk/kmeans/moarray" + "reflect" + "testing" +) + +func Test_NewKMeans(t *testing.T) { + type constructorArgs struct { + vectorList [][]float64 + clusterCnt int + maxIterations int + deltaThreshold float64 + distType kmeans.DistanceType + initType kmeans.InitType + } + tests := []struct { + name string + fields constructorArgs + wantErr bool + }{ + { + name: "Test 1 - Dimension mismatch", + fields: constructorArgs{ + vectorList: [][]float64{ + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4}, + }, + clusterCnt: 2, + maxIterations: 500, + deltaThreshold: 0.01, + distType: kmeans.L2Distance, + initType: kmeans.Random, + }, + wantErr: true, + }, + { + name: "Test 2 - ClusterCnt> len(vectorList)", + fields: constructorArgs{ + vectorList: [][]float64{ + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 6}, + }, + clusterCnt: 4, + maxIterations: 500, + deltaThreshold: 0.01, + distType: kmeans.L2Distance, + initType: kmeans.Random, + }, + wantErr: true, + }, + { + name: "Test 3 - ClusterCnt> len(vectorList)", + fields: constructorArgs{ + vectorList: [][]float64{ + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 6}, + }, + clusterCnt: 4, + maxIterations: 500, + deltaThreshold: 0.01, + distType: kmeans.L2Distance, + initType: kmeans.Random, + }, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := NewKMeans(tt.fields.vectorList, tt.fields.clusterCnt, + tt.fields.maxIterations, tt.fields.deltaThreshold, + tt.fields.distType, tt.fields.initType, false) //<-- Not Spherical Kmeans UT + if (err != nil) != tt.wantErr { + t.Errorf("NewKMeans() error = %v, wantErr %v", err, tt.wantErr) + return + } + }) + } +} + +func Test_Cluster(t *testing.T) { + type constructorArgs struct { + vectorList [][]float64 + clusterCnt int + maxIterations int + deltaThreshold float64 + distType kmeans.DistanceType + initType kmeans.InitType + } + tests := 
[]struct { + name string + fields constructorArgs + want [][]float64 + wantErr bool + wantSSE float64 + }{ + { + name: "Test 1 - Skewed data (Random Init)", + fields: constructorArgs{ + vectorList: [][]float64{ + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {10, 2, 4, 5}, + {10, 3, 4, 5}, + {10, 5, 4, 5}, + {10, 2, 4, 5}, + {10, 3, 4, 5}, + {10, 5, 4, 5}, + }, + clusterCnt: 2, + maxIterations: 500, + deltaThreshold: 0.01, + distType: kmeans.L2Distance, + initType: kmeans.Random, + }, + want: [][]float64{ + //{0.15915269938161652, 0.31830539876323305, 0.5757527355814478, 0.7349054349630643}, // approx {1, 2, 3.6666666666666665, 4.666666666666666} + //{0.8077006350571528, 0.26637173227965466, 0.3230802540228611, 0.4038503175285764}, // approx {10, 3.333333333333333, 4, 5} + {10, 3.333333333333333, 4, 5}, + {1, 2, 3.6666666666666665, 4.666666666666666}, + }, + //wantSSE: 0.0657884123589134, + wantSSE: 12, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + clusterer, _ := NewKMeans(tt.fields.vectorList, tt.fields.clusterCnt, + tt.fields.maxIterations, tt.fields.deltaThreshold, + tt.fields.distType, tt.fields.initType, false) + got, err := clusterer.Cluster() + if (err != nil) != tt.wantErr { + t.Errorf("Cluster() error = %v, wantErr %v", err, tt.wantErr) + return + } + + if !assertx.InEpsilonF64Slices(tt.want, got) { + t.Errorf("Cluster() got = %v, want %v", got, tt.want) + } + + if !assertx.InEpsilonF64(tt.wantSSE, clusterer.SSE()) { + t.Errorf("SSE() got = %v, want %v", clusterer.SSE(), tt.wantSSE) + } + }) + } +} + +func TestElkanClusterer_initBounds(t *testing.T) { + type constructorArgs struct { + vectorList [][]float64 + clusterCnt int + maxIterations int + deltaThreshold float64 + distType kmeans.DistanceType + initType kmeans.InitType + } + type internalState struct { + centroids [][]float64 + } + type wantState struct { + vectorMetas []vectorMeta + assignment []int 
+ } + tests := []struct { + name string + fields constructorArgs + state internalState + want wantState + }{ + { + name: "Test 1", + fields: constructorArgs{ + vectorList: [][]float64{ + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {10, 20, 30, 40}, + {11, 23, 33, 47}, + }, + clusterCnt: 2, + maxIterations: 500, + deltaThreshold: 0.01, + distType: kmeans.L2Distance, + initType: kmeans.Random, + }, + state: internalState{ + centroids: [][]float64{ + {1, 2, 3, 4}, + {10, 20, 30, 40}, + }, + }, + want: wantState{ + vectorMetas: []vectorMeta{ + { + lower: []float64{0, 49.29503017546495}, + upper: 0, + recompute: true, + }, + { + lower: []float64{1.4142135623730951, 48.02082881417188}, + upper: 1.4142135623730951, + recompute: true, + }, + { + lower: []float64{1.4142135623730951, 48.02082881417188}, + upper: 1.4142135623730951, + recompute: true, + }, + { + lower: []float64{49.29503017546495, 0}, + upper: 0, + recompute: true, + }, + { + lower: []float64{57.35852159879994, 8.246211251235321}, + upper: 8.246211251235321, + recompute: true, + }, + }, + assignment: []int{0, 0, 0, 1, 1}, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + km, err := NewKMeans(tt.fields.vectorList, tt.fields.clusterCnt, + tt.fields.maxIterations, tt.fields.deltaThreshold, + tt.fields.distType, tt.fields.initType, false) + + // NOTE: here km.Normalize() is skipped as we not calling km.Cluster() in this test. + // Here we are only testing the working of initBounds() function. + if err != nil { + t.Errorf("Error while creating KMeans object %v", err) + } + if ekm, ok := km.(*ElkanClusterer); ok { + ekm.centroids, _ = moarray.ToGonumVectors[float64](tt.state.centroids...) 
+ ekm.initBounds() + if !reflect.DeepEqual(ekm.assignments, tt.want.assignment) { + t.Errorf("assignments got = %v, want %v", ekm.assignments, tt.want.assignment) + } + + for i := 0; i < len(tt.want.vectorMetas); i++ { + if !assertx.InEpsilonF64Slice(tt.want.vectorMetas[i].lower, ekm.vectorMetas[i].lower) { + t.Errorf("vectorMetas got = %v, want %v", ekm.vectorMetas, tt.want.vectorMetas) + } + + if !assertx.InEpsilonF64(tt.want.vectorMetas[i].upper, ekm.vectorMetas[i].upper) { + t.Errorf("vectorMetas got = %v, want %v", ekm.vectorMetas, tt.want.vectorMetas) + } + + if ekm.vectorMetas[i].recompute != tt.want.vectorMetas[i].recompute { + t.Errorf("vectorMetas got = %v, want %v", ekm.vectorMetas, tt.want.vectorMetas) + } + } + + } else if !ok { + t.Errorf("km not of type ElkanClusterer") + } + + }) + } +} + +func TestElkanClusterer_computeCentroidDistances(t *testing.T) { + type constructorArgs struct { + vectorList [][]float64 + clusterCnt int + maxIterations int + deltaThreshold float64 + distType kmeans.DistanceType + initType kmeans.InitType + } + type internalState struct { + centroids [][]float64 + } + type wantState struct { + halfInterCentroidDistMatrix [][]float64 + minHalfInterCentroidDist []float64 + } + tests := []struct { + name string + fields constructorArgs + state internalState + want wantState + }{ + { + name: "Test 1", + fields: constructorArgs{ + vectorList: [][]float64{ + // This is dummy data. Won't be used for this test function. 
+ {0}, {0}, {0}, {0}, {0}, {0}, + }, + clusterCnt: 2, + maxIterations: 500, + deltaThreshold: 0.01, + distType: kmeans.L2Distance, + initType: kmeans.Random, + }, + state: internalState{ + centroids: [][]float64{ + {1, 2, 3, 4}, + {10, 20, 30, 40}, + }, + }, + want: wantState{ + halfInterCentroidDistMatrix: [][]float64{ + {0, 24.647515087732476}, {24.647515087732476, 0}, + }, + minHalfInterCentroidDist: []float64{24.647515087732476, 24.647515087732476}, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + km, err := NewKMeans(tt.fields.vectorList, tt.fields.clusterCnt, + tt.fields.maxIterations, tt.fields.deltaThreshold, + tt.fields.distType, tt.fields.initType, false) + + if err != nil { + t.Errorf("Error while creating KMeans object %v", err) + } + if ekm, ok := km.(*ElkanClusterer); ok { + ekm.centroids, _ = moarray.ToGonumVectors[float64](tt.state.centroids...) + ekm.computeCentroidDistances() + + // NOTE: here we are not considering the vectors in the vectorList. Hence we don't need to worry about + // the normalization impact. Here we are only testing the working of computeCentroidDistances() function. 
+ + if !assertx.InEpsilonF64Slices(tt.want.halfInterCentroidDistMatrix, ekm.halfInterCentroidDistMatrix) { + t.Errorf("halfInterCentroidDistMatrix got = %v, want %v", ekm.halfInterCentroidDistMatrix, tt.want.halfInterCentroidDistMatrix) + } + + if !assertx.InEpsilonF64Slice(tt.want.minHalfInterCentroidDist, ekm.minHalfInterCentroidDist) { + t.Errorf("minHalfInterCentroidDist got = %v, want %v", ekm.minHalfInterCentroidDist, tt.want.minHalfInterCentroidDist) + } + + if !reflect.DeepEqual(ekm.minHalfInterCentroidDist, tt.want.minHalfInterCentroidDist) { + t.Errorf("minHalfInterCentroidDist got = %v, want %v", ekm.minHalfInterCentroidDist, tt.want.minHalfInterCentroidDist) + } + + } else if !ok { + t.Errorf("km not of type ElkanClusterer") + } + + }) + } +} + +func TestElkanClusterer_recalculateCentroids(t *testing.T) { + type constructorArgs struct { + vectorList [][]float64 + clusterCnt int + maxIterations int + deltaThreshold float64 + distType kmeans.DistanceType + initType kmeans.InitType + } + type internalState struct { + assignments []int + } + type wantState struct { + centroids [][]float64 + } + tests := []struct { + name string + fields constructorArgs + state internalState + want wantState + }{ + { + name: "Test 1", + fields: constructorArgs{ + vectorList: [][]float64{ + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {10, 20, 30, 40}, + {11, 23, 33, 47}, + }, + clusterCnt: 2, + maxIterations: 500, + deltaThreshold: 0.01, + distType: kmeans.L2Distance, + initType: kmeans.Random, + }, + state: internalState{ + assignments: []int{0, 0, 0, 1, 1}, + }, + want: wantState{ + centroids: [][]float64{ + {1, 2, 3.6666666666666665, 4.666666666666666}, + {10.5, 21.5, 31.5, 43.5}, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + km, err := NewKMeans(tt.fields.vectorList, tt.fields.clusterCnt, + tt.fields.maxIterations, tt.fields.deltaThreshold, + tt.fields.distType, tt.fields.initType, false) + + if err != nil { + t.Errorf("Error 
while creating KMeans object %v", err) + } + if ekm, ok := km.(*ElkanClusterer); ok { + ekm.assignments = tt.state.assignments + + // NOTE: here km.Normalize() is skipped as we not calling km.Cluster() in this test. + // Here we are only testing the working of recalculateCentroids() function. + + got := ekm.recalculateCentroids() + if !assertx.InEpsilonF64Slices(tt.want.centroids, moarray.ToMoArrays[float64](got)) { + t.Errorf("centroids got = %v, want %v", moarray.ToMoArrays[float64](got), tt.want.centroids) + } + + } else if !ok { + t.Errorf("km not of type ElkanClusterer") + } + + }) + } +} + +func TestElkanClusterer_updateBounds(t *testing.T) { + type constructorArgs struct { + vectorList [][]float64 + clusterCnt int + maxIterations int + deltaThreshold float64 + distType kmeans.DistanceType + initType kmeans.InitType + } + type internalState struct { + vectorMetas []vectorMeta + centroids [][]float64 + newCentroids [][]float64 + } + type wantState struct { + vectorMetas []vectorMeta + } + tests := []struct { + name string + fields constructorArgs + state internalState + want wantState + }{ + { + name: "Test 1", + fields: constructorArgs{ + vectorList: [][]float64{ + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {10, 20, 30, 40}, + {11, 23, 33, 47}, + }, + clusterCnt: 2, + maxIterations: 500, + deltaThreshold: 0.01, + distType: kmeans.L2Distance, + initType: kmeans.Random, + }, + state: internalState{ + vectorMetas: []vectorMeta{ + { + lower: []float64{0, 49.29503017546495}, + upper: 0, + recompute: false, + }, + { + lower: []float64{1.4142135623730951, 48.02082881417188}, + upper: 1.4142135623730951, + recompute: false, + }, + { + lower: []float64{1.4142135623730951, 48.02082881417188}, + upper: 1.4142135623730951, + recompute: false, + }, + { + lower: []float64{49.29503017546495, 0}, + upper: 0, + recompute: false, + }, + { + lower: []float64{57.358521598799946, 8.246211251235321}, + upper: 8.246211251235321, + recompute: false, + }, + }, + centroids: 
[][]float64{ + {1, 2, 3, 4}, + {10, 20, 30, 40}, + }, + newCentroids: [][]float64{ + {1, 2, 3.6666666666666665, 4.666666666666667}, + {10.5, 21.5, 31.5, 43.5}, + }, + }, + want: wantState{ + vectorMetas: []vectorMeta{ + { + lower: []float64{0, 45.17192454984729}, + upper: 0.9428090415820634, + recompute: true, + }, + { + lower: []float64{0.4714045207910318, 43.89772318855422}, + upper: 2.3570226039551585, + recompute: true, + }, + { + lower: []float64{0.4714045207910318, 43.89772318855422}, + upper: 2.3570226039551585, + recompute: true, + }, + { + lower: []float64{48.352221133882885, 0}, + upper: 0.9428090415820634, + recompute: true, + }, + { + lower: []float64{56.41571255721788, 4.123105625617661}, + upper: 9.189020292817384, + recompute: true, + }, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + km, err := NewKMeans(tt.fields.vectorList, tt.fields.clusterCnt, + tt.fields.maxIterations, tt.fields.deltaThreshold, + tt.fields.distType, tt.fields.initType, false) + + if err != nil { + t.Errorf("Error while creating KMeans object %v", err) + } + if ekm, ok := km.(*ElkanClusterer); ok { + ekm.vectorMetas = tt.state.vectorMetas + ekm.centroids, _ = moarray.ToGonumVectors[float64](tt.state.centroids...) + + // NOTE: here km.Normalize() is skipped as we not calling km.Cluster() in this test. + // Here we are only testing the working of updateBounds() function. + gonumVectors, _ := moarray.ToGonumVectors[float64](tt.state.newCentroids...) 
+ ekm.updateBounds(gonumVectors) + + for i := 0; i < len(tt.want.vectorMetas); i++ { + if !assertx.InEpsilonF64Slice(tt.want.vectorMetas[i].lower, ekm.vectorMetas[i].lower) { + t.Errorf("vectorMetas got = %v, want %v", ekm.vectorMetas, tt.want.vectorMetas) + } + + if !assertx.InEpsilonF64(tt.want.vectorMetas[i].upper, ekm.vectorMetas[i].upper) { + t.Errorf("vectorMetas got = %v, want %v", ekm.vectorMetas, tt.want.vectorMetas) + } + + if ekm.vectorMetas[i].recompute != tt.want.vectorMetas[i].recompute { + t.Errorf("vectorMetas got = %v, want %v", ekm.vectorMetas, tt.want.vectorMetas) + } + } + + } else if !ok { + t.Errorf("km not of type ElkanClusterer") + } + + }) + } +} diff --git a/elkans/distance_func.go b/elkans/distance_func.go new file mode 100644 index 0000000..ef60ea7 --- /dev/null +++ b/elkans/distance_func.go @@ -0,0 +1,75 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package elkans + +import ( + "github.com/arjunsk/kmeans" + "github.com/arjunsk/kmeans/moerr" + "gonum.org/v1/gonum/mat" +) + +// L2Distance is used for L2Distance distance in Euclidean Kmeans. +func L2Distance(v1, v2 *mat.VecDense) float64 { + diff := mat.NewVecDense(v1.Len(), nil) + diff.SubVec(v1, v2) + return mat.Norm(diff, 2) +} + +//// SphericalDistance is used for InnerProduct and CosineDistance in Spherical Kmeans. 
+//// NOTE: spherical distance between two points on a sphere is equal to the +//// angular distance between the two points, scaled by pi. +//// Refs: +//// https://en.wikipedia.org/wiki/Great-circle_distance#Vector_version +//func SphericalDistance(v1, v2 *mat.VecDense) float64 { +// // Compute the dot product of the two vectors. +// // The dot product of two vectors is a measure of their similarity, +// // and it can be used to calculate the angle between them. +// dp := mat.Dot(v1, v2) +// +// // Prevent NaN with acos with loss of precision. +// if dp > 1.0 { +// dp = 1.0 +// } else if dp < -1.0 { +// dp = -1.0 +// } +// +// theta := math.Acos(dp) +// +// //To scale the result to the range [0, 1], we divide by Pi. +// return theta / math.Pi +// +// // NOTE: +// // Cosine distance is a measure of the similarity between two vectors. [Not satisfy triangle inequality] +// // Angular distance is a measure of the angular separation between two points. [Satisfy triangle inequality] +// // Spherical distance is a measure of the spatial separation between two points on a sphere. [Satisfy triangle inequality] +//} + +// resolveDistanceFn returns the distance function corresponding to the distance type +// Distance function should satisfy triangle inequality. 
+// We use +// - L2Distance distance for L2Distance +// - SphericalDistance for InnerProduct and CosineDistance +func resolveDistanceFn(distType kmeans.DistanceType) (kmeans.DistanceFunction, error) { + var distanceFunction kmeans.DistanceFunction + switch distType { + case kmeans.L2Distance: + distanceFunction = L2Distance + //case kmeans.InnerProduct, kmeans.CosineDistance: + // distanceFunction = SphericalDistance + default: + return nil, moerr.NewInternalErrorNoCtx("invalid distance type") + } + return distanceFunction, nil +} diff --git a/elkans/distance_func_bench_test.go b/elkans/distance_func_bench_test.go new file mode 100644 index 0000000..6235c95 --- /dev/null +++ b/elkans/distance_func_bench_test.go @@ -0,0 +1,81 @@ +// Copyright 2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package elkans + +import ( + "github.com/arjunsk/kmeans/moarray" + "gonum.org/v1/gonum/mat" + "math/rand" + "testing" +) + +/* +Benchmark_L2Distance/L2_Distance-10 1570082 1014 ns/op +Benchmark_L2Distance/Normalize_L2-10 1277733 1064 ns/op +Benchmark_L2Distance/L2_Distance(v1,_NormalizeL2)-10 589376 1883 ns/op +*/ +func Benchmark_L2Distance(b *testing.B) { + dim := 128 + + b.Run("L2 Distance", func(b *testing.B) { + v1, v2 := randomGonumVectors(b.N, dim), randomGonumVectors(b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _ = L2Distance(v1[i], v2[i]) + } + }) + + b.Run("Normalize L2", func(b *testing.B) { + v1 := randomVectors(b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _ = moarray.NormalizeL2[float64](v1[i]) + } + }) + + b.Run("L2 Distance(v1, NormalizeL2)", func(b *testing.B) { + v1, v2 := randomGonumVectors(b.N, dim), randomVectors(b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + v21, _ := moarray.NormalizeL2[float64](v2[i]) + _ = L2Distance(v1[i], moarray.ToGonumVector(v21)) + } + }) + +} + +func randomVectors(size, dim int) [][]float64 { + vectors := make([][]float64, size) + for i := range vectors { + for j := 0; j < dim; j++ { + vectors[i] = append(vectors[i], rand.Float64()) + } + } + return vectors +} + +func randomGonumVectors(size, dim int) []*mat.VecDense { + vectors := make([]*mat.VecDense, size) + for i := range vectors { + vectors[i] = mat.NewVecDense(dim, nil) + for j := 0; j < dim; j++ { + vectors[i].SetVec(j, rand.Float64()) + } + } + return vectors +} diff --git a/elkans/distance_func_test.go b/elkans/distance_func_test.go new file mode 100644 index 0000000..383830f --- /dev/null +++ b/elkans/distance_func_test.go @@ -0,0 +1,169 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package elkans + +import ( + "github.com/arjunsk/kmeans/moarray" + "testing" +) + +func Test_L2Distance(t *testing.T) { + type args struct { + v1 []float64 + v2 []float64 + } + tests := []struct { + name string + args args + want float64 + }{ + { + name: "Test 1", + args: args{ + v1: []float64{1, 2, 3, 4}, + v2: []float64{1, 2, 4, 5}, + }, + want: 1.4142135623730951, + }, + { + name: "Test 2", + args: args{ + v1: []float64{10, 20, 30, 40}, + v2: []float64{10.5, 21.5, 31.5, 43.5}, + }, + want: 4.123105625617661, + }, + { + name: "Test 3.a", + args: args{ + v1: []float64{1, 1}, + v2: []float64{4, 1}, + }, + want: 3, + }, + { + name: "Test 3.b", + args: args{ + v1: []float64{4, 1}, + v2: []float64{1, 4}, + }, + want: 4.242640687119286, + }, + { + name: "Test 3.c", + args: args{ + v1: []float64{1, 4}, + v2: []float64{1, 1}, + }, + want: 3, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := L2Distance(moarray.ToGonumVector[float64](tt.args.v1), moarray.ToGonumVector[float64](tt.args.v2)); got != tt.want { + t.Errorf("L2Distance() = %v, want %v", got, tt.want) + } + }) + } +} + +//func Test_AngularDistance(t *testing.T) { +// type args struct { +// v1 []float64 +// v2 []float64 +// } +// tests := []struct { +// name string +// args args +// want float64 +// }{ +// { +// name: "Test 1", +// args: args{ +// v1: []float64{1, 2, 3, 4}, +// v2: []float64{1, 2, 4, 5}, +// }, +// want: 0, +// }, +// { +// name: "Test 2", +// args: args{ +// v1: []float64{10, 20, 30, 40}, +// v2: []float64{10.5, 21.5, 31.5, 43.5}, +// }, 
+// want: 0, +// }, +// // Test 3: Triangle Inequality check on **un-normalized** vector +// // A(1,0),B(2,2), C(0,1) => AB + AC !>= BC => 0 + 0 !>= 0.5 +// { +// name: "Test 3.a", +// args: args{ +// v1: []float64{1, 0}, +// v2: []float64{2, 2}, +// }, +// want: 0, +// }, +// { +// name: "Test 3.b", +// args: args{ +// v1: []float64{2, 2}, +// v2: []float64{0, 1}, +// }, +// want: 0, +// }, +// { +// name: "Test 3.c", +// args: args{ +// v1: []float64{0, 1}, +// v2: []float64{1, 0}, +// }, +// want: 0.5, +// }, +// // Test 4: Triangle Inequality check on **normalized** vector +// // A(1,0),B(2,2), C(0,1) => AB + AC >= BC => 0.25 + 0.25 >= 0.5 +// //{ +// // name: "Test 4.a", +// // args: args{ +// // v1: moarray.NormalizeMoVecf64([]float64{1, 0}), +// // v2: moarray.NormalizeMoVecf64([]float64{2, 2}), +// // }, +// // want: 0.25000000000000006, +// //}, +// //{ +// // name: "Test 4.b", +// // args: args{ +// // v1: moarray.NormalizeMoVecf64([]float64{2, 2}), +// // v2: moarray.NormalizeMoVecf64([]float64{0, 1}), +// // }, +// // want: 0.25000000000000006, +// //}, +// //{ +// // name: "Test 4.c", +// // args: args{ +// // v1: moarray.NormalizeMoVecf64([]float64{0, 1}), +// // v2: moarray.NormalizeMoVecf64([]float64{1, 0}), +// // }, +// // want: 0.5, +// //}, +// } +// for _, tt := range tests { +// t.Run(tt.name, func(t *testing.T) { +// +// if got := SphericalDistance(moarray.ToGonumVector[float64](tt.args.v1), moarray.ToGonumVector[float64](tt.args.v2)); !assertx.InEpsilonF64(got, tt.want) { +// t.Errorf("SphericalDistance() = %v, want %v", got, tt.want) +// } +// }) +// } +//} diff --git a/elkans/initializer.go b/elkans/initializer.go new file mode 100644 index 0000000..fee83e1 --- /dev/null +++ b/elkans/initializer.go @@ -0,0 +1,110 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package elkans + +import ( + "github.com/arjunsk/kmeans" + "gonum.org/v1/gonum/mat" + "math" + "math/rand" +) + +type Initializer interface { + InitCentroids(vectors []*mat.VecDense, k int) (centroids []*mat.VecDense) +} + +var _ Initializer = (*Random)(nil) + +// Random initializes the centroids with random centroids from the vector list. +type Random struct { + rand rand.Rand +} + +func NewRandomInitializer() Initializer { + return &Random{ + rand: *rand.New(rand.NewSource(kmeans.DefaultRandSeed)), + } +} + +func (r *Random) InitCentroids(vectors []*mat.VecDense, k int) (centroids []*mat.VecDense) { + centroids = make([]*mat.VecDense, k) + for i := 0; i < k; i++ { + randIdx := r.rand.Intn(len(vectors)) + centroids[i] = vectors[randIdx] + } + return centroids +} + +// KMeansPlusPlus initializes the centroids using kmeans++ algorithm. +// Complexity: O(k*n*k); n = number of vectors, k = number of clusters +// Ref Paper: https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf +// The reason why we have kmeans++ is that it is more stable than random initialization. +// For example, we have 3 clusters. +// Using random, we could get 3 centroids: 1&2 which are close to each other and part of cluster 1. 3 is in the middle of 2&3. +// Using kmeans++, we are sure that 3 centroids are farther away from each other. 
+type KMeansPlusPlus struct { + rand rand.Rand + distFn kmeans.DistanceFunction +} + +func NewKMeansPlusPlusInitializer(distFn kmeans.DistanceFunction) Initializer { + return &KMeansPlusPlus{ + rand: *rand.New(rand.NewSource(kmeans.DefaultRandSeed)), + distFn: distFn, + } +} + +func (kpp *KMeansPlusPlus) InitCentroids(vectors []*mat.VecDense, k int) (centroids []*mat.VecDense) { + numSamples := len(vectors) + centroids = make([]*mat.VecDense, k) + + // 1. start with a random center + centroids[0] = vectors[kpp.rand.Intn(numSamples)] + + distances := make([]float64, numSamples) + for j := range distances { + distances[j] = math.MaxFloat64 + } + + for nextCentroidIdx := 1; nextCentroidIdx < k; nextCentroidIdx++ { + + // 2. for each data point, compute the min distance to the existing centers + var totalDistToExistingCenters float64 + for vecIdx := range vectors { + // this is a deviation from standard kmeans.here we are not using minDistance to all the existing centers. + // This code was very slow: https://github.com/matrixorigin/matrixone/blob/77ff1452bd56cd93a10e3327632adebdbaf279cb/pkg/sql/plan/function/functionAgg/algos/kmeans/elkans/initializer.go#L81-L86 + // but instead we are using the distance to the last center that was chosen. + distance := kpp.distFn(vectors[vecIdx], centroids[nextCentroidIdx-1]) + distance *= distance + if distance < distances[vecIdx] { + distances[vecIdx] = distance + } + totalDistToExistingCenters += distances[vecIdx] + } + + // 3. 
choose the next random center, using a weighted probability distribution + // where it is chosen with probability proportional to D(x)^2 + // Ref: https://en.wikipedia.org/wiki/K-means%2B%2B#Improved_initialization_algorithm + target := kpp.rand.Float64() * totalDistToExistingCenters + for idx, distance := range distances { + target -= distance + if target <= 0 { + centroids[nextCentroidIdx] = vectors[idx] + break + } + } + } + return centroids +} diff --git a/elkans/initializer_test.go b/elkans/initializer_test.go new file mode 100644 index 0000000..aa1ca2e --- /dev/null +++ b/elkans/initializer_test.go @@ -0,0 +1,155 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package elkans + +import ( + "github.com/arjunsk/kmeans/moarray" + "reflect" + "testing" +) + +func TestRandom_InitCentroids(t *testing.T) { + type args struct { + vectors [][]float64 + k int + } + tests := []struct { + name string + args args + wantCentroids [][]float64 + }{ + { + name: "TestRandom_InitCentroids", + args: args{ + vectors: [][]float64{ + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {10, 2, 4, 5}, + {10, 3, 4, 5}, + {10, 5, 4, 5}, + {10, 2, 4, 5}, + {10, 3, 4, 5}, + {10, 5, 4, 5}, + }, + k: 2, + }, + wantCentroids: [][]float64{ + // NOTE: values of random initialization need not be farther apart, it is random. 
+ // NOTE: we get the same random values in the test case because we are using a constant seed value. + {1, 2, 4, 5}, + {1, 2, 3, 4}, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + r := NewRandomInitializer() + gonumVectors, _ := moarray.ToGonumVectors[float64](tt.args.vectors...) + if gotCentroids := r.InitCentroids(gonumVectors, tt.args.k); !reflect.DeepEqual(moarray.ToMoArrays[float64](gotCentroids), tt.wantCentroids) { + t.Errorf("InitCentroids() = %v, want %v", moarray.ToMoArrays[float64](gotCentroids), tt.wantCentroids) + } + }) + } +} + +func TestKMeansPlusPlus_InitCentroids(t *testing.T) { + type args struct { + vectors [][]float64 + k int + } + tests := []struct { + name string + args args + wantCentroids [][]float64 + }{ + { + name: "TestKMeansPlusPlus_InitCentroids", + args: args{ + vectors: [][]float64{ + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {10, 2, 4, 5}, + {10, 3, 4, 5}, + {10, 5, 4, 5}, + {10, 2, 4, 5}, + {10, 3, 4, 5}, + {10, 5, 4, 5}, + }, + k: 2, + }, + // Kmeans++ picked the relatively farthest points as the initial centroids + wantCentroids: [][]float64{ + {1, 2, 4, 5}, + {10, 5, 4, 5}, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + r := NewKMeansPlusPlusInitializer(L2Distance) + gonumVectors, _ := moarray.ToGonumVectors[float64](tt.args.vectors...) 
+ if gotCentroids := r.InitCentroids(gonumVectors, tt.args.k); !reflect.DeepEqual(moarray.ToMoArrays[float64](gotCentroids), tt.wantCentroids) { + t.Errorf("InitCentroids() = %v, want %v", moarray.ToMoArrays[float64](gotCentroids), tt.wantCentroids) + } + }) + } +} + +/* +date : 2023-11-20 +goos: darwin +goarch: arm64 +cpu: Apple M2 Pro +rows: 10_000 +dims: 1024 +k : 10 +Benchmark_InitCentroids/RANDOM-10 108 10574740 ns/op +Benchmark_InitCentroids/KMEANS++-10 1 1081363458 ns/op +*/ +func Benchmark_InitCentroids(b *testing.B) { + rowCnt := 10_000 + dims := 1024 + k := 10 + + data := make([][]float64, rowCnt) + populateRandData(rowCnt, dims, data) + + random := NewRandomInitializer() + kmeanspp := NewKMeansPlusPlusInitializer(L2Distance) + + b.Run("RANDOM", func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + gonumVectors, _ := moarray.ToGonumVectors[float64](data...) + _ = random.InitCentroids(gonumVectors, k) + } + }) + + b.Run("KMEANS++", func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + gonumVectors, _ := moarray.ToGonumVectors[float64](data...) 
+ _ = kmeanspp.InitCentroids(gonumVectors, k) + } + }) +} diff --git a/examples/custom/main.go b/examples/custom/main.go deleted file mode 100644 index 54a39bc..0000000 --- a/examples/custom/main.go +++ /dev/null @@ -1,50 +0,0 @@ -package main - -import ( - "fmt" - "github.com/arjunsk/kmeans" - "github.com/arjunsk/kmeans/containers" - "github.com/arjunsk/kmeans/initializer" -) - -func main() { - vectors := [][]float64{ - {1, 2, 3, 4}, - {0, 3, 4, 1}, - {0, 9, 3, 1}, - {0, 8, 4, 4}, - {130, 200, 343, 224}, - {100, 200, 300, 400}, - {300, 400, 200, 110}, - } - - clusterer, err := kmeans.NewCluster(kmeans.ELKAN, vectors, 2, kmeans.WithInitializer(initializer.NewKmeansPlusPlusInitializer(containers.EuclideanDistance))) - /* - To define custom initializer, you can use this syntax: - - type Custom struct {} - - func (c *Custom) InitCentroids(vectors [][]float64, clusterCnt int) (containers.Clusters, error) { - panic("implement me") - } - - var init initializer.Initializer = &Custom{} - kmeans.NewCluster(kmeans.ELKAN, vectors, 2, kmeans.WithInitializer(init) ) - */ - - if err != nil { - panic(err) - } - - clusters, err := clusterer.Cluster() - if err != nil { - panic(err) - } - - for _, cluster := range clusters { - fmt.Println(cluster.Center()) - } - // Output: - // [0.25 5.5 3.5 2.5] - // [176.66666666666666 266.66666666666663 281 244.66666666666666] -} diff --git a/examples/sampling/main.go b/examples/sampling/main.go deleted file mode 100644 index f8df14b..0000000 --- a/examples/sampling/main.go +++ /dev/null @@ -1,42 +0,0 @@ -package main - -import ( - "fmt" - "github.com/arjunsk/kmeans" - "github.com/arjunsk/kmeans/sampler" -) - -func main() { - vectors := [][]float64{ - {1, 2, 3, 4}, - {0, 3, 4, 1}, - {0, 9, 3, 1}, - {0, 8, 4, 4}, - {130, 200, 343, 224}, - {100, 200, 300, 400}, - {300, 400, 200, 110}, - } - - // sub-sample - var sampleFn sampler.Sampling[[]float64] = sampler.SrsSampling[[]float64] - subset := sampleFn(vectors, 50) - - // run clustering - clusterer, 
err := kmeans.NewCluster(kmeans.ELKAN, subset, 2) - if err != nil { - panic(err) - } - - clusters, err := clusterer.Cluster() - if err != nil { - panic(err) - } - - for _, cluster := range clusters { - fmt.Println(cluster.Center()) - } - // Output: - // [1 2 3 4] - // [130 200 343 224] - -} diff --git a/examples/simple/main.go b/examples/simple/main.go deleted file mode 100644 index 8c15459..0000000 --- a/examples/simple/main.go +++ /dev/null @@ -1,36 +0,0 @@ -package main - -import ( - "fmt" - "github.com/arjunsk/kmeans" -) - -func main() { - vectors := [][]float64{ - {1, 2, 3, 4}, - {0, 3, 4, 1}, - {0, 9, 3, 1}, - {0, 8, 4, 4}, - {130, 200, 343, 224}, - {100, 200, 300, 400}, - {300, 400, 200, 110}, - } - - clusterer, err := kmeans.NewCluster(kmeans.ELKAN, vectors, 2) - if err != nil { - panic(err) - } - - clusters, err := clusterer.Cluster() - if err != nil { - panic(err) - } - - for _, cluster := range clusters { - fmt.Println(cluster.Center()) - } - // Output: - // [1 2 3 4] - // [130 200 343 224] - -} diff --git a/go.mod b/go.mod index 70337df..846db74 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,9 @@ module github.com/arjunsk/kmeans go 1.20 -require golang.org/x/sync v0.4.0 +require ( + golang.org/x/exp v0.0.0-20231006140011-7918f672742d + golang.org/x/sync v0.4.0 +) + +require gonum.org/v1/gonum v0.14.0 diff --git a/go.sum b/go.sum index 6a07ba6..095ff49 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,8 @@ +github.com/matrixorigin/matrixone v1.1.1 h1:Rt3sZlDxVNtNbHb/HtoZqWK3YGoPiaMFptCbAuBaHP4= +github.com/matrixorigin/matrixone v1.1.1/go.mod h1:whaQNRSLcKnsWw/3qLdfs5IGMaYpsayoo8p8F4gwSA8= +golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI= +golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo= golang.org/x/sync v0.4.0 h1:zxkM55ReGkDlKSM+Fu41A+zmbZuaPVbGMzvvdUPznYQ= golang.org/x/sync v0.4.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= 
+gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0= +gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU= diff --git a/initializer/docs.go b/initializer/docs.go deleted file mode 100644 index 0f3d652..0000000 --- a/initializer/docs.go +++ /dev/null @@ -1,7 +0,0 @@ -package initializer - -// Package initializer provides API for initializing centroids. - -// Initializing algorithms implemented are -// - Random initialization -// - Kmeans++ initialization diff --git a/initializer/initializer_bench_test.go b/initializer/initializer_bench_test.go deleted file mode 100644 index 1bbc780..0000000 --- a/initializer/initializer_bench_test.go +++ /dev/null @@ -1,57 +0,0 @@ -package initializer - -import ( - "github.com/arjunsk/kmeans/containers" - "math/rand" - "testing" -) - -/* -date : 2023-10-1 -goos: darwin -goarch: arm64 -pkg: github.com/arjunsk/kmeans/initializer -cpu: Apple M2 Pro -rows: 1000 -dims: 1024 -Benchmark_InitCentroids/RANDOM-10 104032 11292 ns/op -Benchmark_InitCentroids/KMEANS++-10 1 3840350291 ns/op -*/ -func Benchmark_InitCentroids(b *testing.B) { - rowCnt := 1_000 - dims := 1024 - data := make([][]float64, rowCnt) - loadData(rowCnt, dims, data) - - random := NewRandomInitializer() - kmeanspp := NewKmeansPlusPlusInitializer(containers.EuclideanDistance) - - b.Run("RANDOM", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := random.InitCentroids(data, 100) - if err != nil { - panic(err) - } - } - }) - - b.Run("KMEANS++", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := kmeanspp.InitCentroids(data, 100) - if err != nil { - panic(err) - } - } - }) -} - -func loadData(nb int, d int, xb [][]float64) { - for r := 0; r < nb; r++ { - xb[r] = make([]float64, d) - for c := 0; c < d; c++ { - xb[r][c] = float64(rand.Float32() * 1000) - } - } -} diff --git a/initializer/initializer_test.go b/initializer/initializer_test.go deleted file mode 100644 index 
ffdfc91..0000000 --- a/initializer/initializer_test.go +++ /dev/null @@ -1,133 +0,0 @@ -package initializer - -import ( - "github.com/arjunsk/kmeans/containers" - "reflect" - "testing" -) - -// NOTE: This test is not used to check the output, but to compare the center choices with Kmeans++ initializer. -// Hence, using Logf instead of Errorf. -// NOTE: Kmeans initializer will return bad centers and will often throw warning more than Kmeans++ initializer test. -// This is expected behaviour. You can try re-running the test at package level, to see the difference. -func TestInitCentroids_random(t *testing.T) { - tests := []IO{ - genIO1(), - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - k := NewRandomInitializer() - got, err := k.InitCentroids(tt.args.vectors, tt.args.clusterCnt) - if (err != nil) != tt.wantErr { - t.Errorf("InitCentroids() error = %v, wantErr %v", err, tt.wantErr) - return - } - if len(got) != tt.args.clusterCnt { - t.Errorf("InitCentroids() got = %v, want = %v", len(got), tt.args.clusterCnt) - return - } - - oneMatched := false - for _, want := range tt.wantPossibilities { - if reflect.DeepEqual(got[0].Center(), want[0].Center()) && reflect.DeepEqual(got[1].Center(), want[1].Center()) || - reflect.DeepEqual(got[0].Center(), want[1].Center()) && reflect.DeepEqual(got[1].Center(), want[0].Center()) { - oneMatched = true - break - } - } - - if !oneMatched { - t.Logf("Kmeans initializer returned bad centers [Expected Behaviour]."+ - "Got = %v, want = %v", got, tt.wantPossibilities) - } - }) - } -} - -// NOTE: This test is not deterministic, but it is very unlikely to fail. -// Hence, using Logf instead of Errorf. 
-func TestInitCentroids_kmeansPlusPlus(t *testing.T) { - - tests := []struct { - IO - distFn containers.DistanceFunction - }{ - { - IO: genIO1(), - distFn: containers.EuclideanDistance, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - k := NewKmeansPlusPlusInitializer(tt.distFn) - got, err := k.InitCentroids(tt.args.vectors, tt.args.clusterCnt) - if (err != nil) != tt.wantErr { - t.Errorf("InitCentroids() error = %v, wantErr %v", err, tt.wantErr) - return - } - - if len(got) != tt.args.clusterCnt { - t.Errorf("InitCentroids() got = %v, want = %v", len(got), tt.args.clusterCnt) - return - } - - oneMatched := false - for _, want := range tt.wantPossibilities { - if reflect.DeepEqual(got[0].Center(), want[0].Center()) && reflect.DeepEqual(got[1].Center(), want[1].Center()) || - reflect.DeepEqual(got[0].Center(), want[1].Center()) && reflect.DeepEqual(got[1].Center(), want[0].Center()) { - oneMatched = true - break - } - } - - if !oneMatched { - t.Logf("Kmeans++ initializer returned bad centers [A Rare Occurance]."+ - "Got = %v, want = %v", got, tt.wantPossibilities) - } - }) - } -} - -func genIO1() IO { - return IO{ - name: "Test1", - args: Args{ - vectors: [][]float64{ - {1, 2, 3, 4}, {0, 3, 4, 1}, - {130, 200, 343, 224}, {100, 200, 300, 400}, - }, - clusterCnt: 2, - }, - wantPossibilities: []containers.Clusters{ - { - containers.NewCluster(containers.Vector{1, 2, 3, 4}), - containers.NewCluster(containers.Vector{100, 200, 300, 400}), - }, - { - containers.NewCluster(containers.Vector{1, 2, 3, 4}), - containers.NewCluster(containers.Vector{130, 200, 343, 224}), - }, - { - containers.NewCluster(containers.Vector{0, 3, 4, 1}), - containers.NewCluster(containers.Vector{100, 200, 300, 400}), - }, - { - containers.NewCluster(containers.Vector{0, 3, 4, 1}), - containers.NewCluster(containers.Vector{130, 200, 343, 224}), - }, - }, - wantErr: false, - } -} - -type Args struct { - vectors [][]float64 - clusterCnt int -} - -type IO struct { - name 
string - args Args - wantErr bool - wantPossibilities []containers.Clusters -} diff --git a/initializer/kmeans_plus_plus.go b/initializer/kmeans_plus_plus.go deleted file mode 100644 index 44e2793..0000000 --- a/initializer/kmeans_plus_plus.go +++ /dev/null @@ -1,77 +0,0 @@ -package initializer - -import ( - "errors" - "github.com/arjunsk/kmeans/containers" - "math/rand" -) - -type KmeansPlusPlus struct { - DistFn containers.DistanceFunction -} - -func NewKmeansPlusPlusInitializer(distFn containers.DistanceFunction) Initializer { - return &KmeansPlusPlus{ - DistFn: distFn, - } -} - -// InitCentroids initializes the centroids using kmeans++ algorithm -// Ref: https://www.youtube.com/watch?v=HatwtJSsj5Q -// Ref (animation):https://www.youtube.com/watch?v=efKGmOH4Y_A -// Complexity: O(k*n*k); n = number of vectors, k = number of clusters -// Reason: For k-1 times, compute the distance of each vector to its nearest center O(nk) -func (kpp *KmeansPlusPlus) InitCentroids(vectors [][]float64, clusterCnt int) (clusters containers.Clusters, err error) { - err = validateArgs(vectors, clusterCnt) - if err != nil { - return nil, err - } - - if kpp.DistFn == nil { - return nil, errors.New("KMeans: distance function cannot be nil") - } - - clusters = make([]containers.Cluster, clusterCnt) - - // 1. start with a random center - randIdx := rand.Intn(len(vectors)) - clusters[0] = containers.NewCluster(vectors[randIdx]) - - // O(k-1) - for i := 1; i < clusterCnt; i++ { - // NOTE: Since Nearest function is called on clusters-1, parallel handling - // can cause bugs, since all the clusters are not initialized. - distances := make([]float64, len(vectors)) - sum := 0.0 - minDistance := 0.0 - // 2. 
for each data point, compute the distance to the existing centers - // O(n) - for vecIdx, vec := range vectors { - - // O(k) - _, minDistance, err = clusters[:i].Nearest(vec, kpp.DistFn) - if err != nil { - return nil, err - } - - distances[vecIdx] = minDistance * minDistance // D(x)^2 - sum += distances[vecIdx] - } - - // 3. choose the next random center, using a weighted probability distribution - // where it is chosen with probability proportional to D(x)^2 - // Ref: https://en.wikipedia.org/wiki/K-means%2B%2B#Improved_initialization_algorithm - // Ref: https://stats.stackexchange.com/a/272133/397621 - target := rand.Float64() * sum - idx := 0 - for currSum := distances[0]; currSum < target; currSum += distances[idx] { - idx++ - } - - // Select a cluster center based on a probability distribution where vectors - // with larger distances have a higher chance of being chosen as the center. - clusters[i] = containers.NewCluster(vectors[idx]) - - } - return clusters, nil -} diff --git a/initializer/random.go b/initializer/random.go deleted file mode 100644 index 848b939..0000000 --- a/initializer/random.go +++ /dev/null @@ -1,30 +0,0 @@ -package initializer - -import ( - "github.com/arjunsk/kmeans/containers" - "math/rand" - "time" -) - -type Random struct{} - -func NewRandomInitializer() Initializer { - return &Random{} -} - -func (k *Random) InitCentroids(vectors [][]float64, clusterCnt int) (containers.Clusters, error) { - err := validateArgs(vectors, clusterCnt) - if err != nil { - return nil, err - } - - inputCnt := len(vectors) - var clusters containers.Clusters = make([]containers.Cluster, clusterCnt) - random := rand.New(rand.NewSource(time.Now().UnixNano())) - for i := 0; i < clusterCnt; i++ { - randIdx := random.Intn(inputCnt) - clusters[i] = containers.NewCluster(vectors[randIdx]) - } - - return clusters, nil -} diff --git a/initializer/types.go b/initializer/types.go deleted file mode 100644 index a64c5ed..0000000 --- a/initializer/types.go +++ /dev/null 
@@ -1,22 +0,0 @@ -package initializer - -import ( - "errors" - "github.com/arjunsk/kmeans/containers" -) - -// Initializer is an interface and not a function type because initializer like kmeans++ -// requires extra arguments. -type Initializer interface { - InitCentroids(vectors [][]float64, clusterCnt int) (containers.Clusters, error) -} - -func validateArgs(vectors [][]float64, clusterCnt int) error { - if clusterCnt <= 0 { - return errors.New("KMeans: k cannot be less than or equal to zero") - } - if vectors == nil || len(vectors) == 0 { - return errors.New("KMeans: data cannot be empty") - } - return nil -} diff --git a/initializer/types_test.go b/initializer/types_test.go deleted file mode 100644 index cc3e331..0000000 --- a/initializer/types_test.go +++ /dev/null @@ -1,55 +0,0 @@ -package initializer - -import "testing" - -func Test_validateArgs(t *testing.T) { - type args struct { - vectors [][]float64 - clusterCnt int - } - tests := []struct { - name string - args args - wantErr bool - }{ - { - name: "Test 1 - nil vectors", - args: args{ - vectors: nil, - clusterCnt: 1, - }, - wantErr: true, - }, - { - name: "Test 2 - empty vectors", - args: args{ - vectors: make([][]float64, 0), - clusterCnt: 1, - }, - wantErr: true, - }, - { - name: "Test 3 - no error", - args: args{ - vectors: [][]float64{{1, 2, 3}, {4, 5, 6}}, - clusterCnt: 1, - }, - wantErr: false, - }, - { - name: "Test 3 - k = 0", - args: args{ - vectors: [][]float64{{1, 2, 3}, {4, 5, 6}}, - clusterCnt: 0, - }, - wantErr: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if err := validateArgs(tt.args.vectors, tt.args.clusterCnt); (err != nil) != tt.wantErr { - t.Errorf("validateArgs() error = %v, wantErr %v", err, tt.wantErr) - } - }) - } -} diff --git a/moarray/external.go b/moarray/external.go new file mode 100644 index 0000000..a8c8c0b --- /dev/null +++ b/moarray/external.go @@ -0,0 +1,162 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache 
License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package moarray + +import ( + "github.com/arjunsk/kmeans/moerr" + "golang.org/x/exp/constraints" + "gonum.org/v1/gonum/mat" + "math" +) + +// Compare returns an integer comparing two arrays/vectors lexicographically. +// TODO: this function might not be correct. we need to compare using tolerance for float values. +// TODO: need to check if we need len(v1)==len(v2) check. +func Compare[T constraints.Float](v1, v2 []T) int { + minLen := len(v1) + if len(v2) < minLen { + minLen = len(v2) + } + + for i := 0; i < minLen; i++ { + if v1[i] < v2[i] { + return -1 + } else if v1[i] > v2[i] { + return 1 + } + } + + if len(v1) < len(v2) { + return -1 + } else if len(v1) > len(v2) { + return 1 + } + return 0 +} + +/* ------------ [START] Performance critical functions. 
------- */ + +func InnerProduct[T constraints.Float](v1, v2 []T) (float64, error) { + + vec, err := ToGonumVectors[T](v1, v2) + if err != nil { + return 0, err + } + + return mat.Dot(vec[0], vec[1]), nil +} + +func L2Distance[T constraints.Float](v1, v2 []T) (float64, error) { + vec, err := ToGonumVectors[T](v1, v2) + if err != nil { + return 0, err + } + + diff := mat.NewVecDense(vec[0].Len(), nil) + diff.SubVec(vec[0], vec[1]) + + return math.Sqrt(mat.Dot(diff, diff)), nil +} + +func CosineDistance[T constraints.Float](v1, v2 []T) (float64, error) { + cosineSimilarity, err := CosineSimilarity[T](v1, v2) + if err != nil { + return 0, err + } + + return 1 - cosineSimilarity, nil +} + +func CosineSimilarity[T constraints.Float](v1, v2 []T) (float64, error) { + + vec, err := ToGonumVectors[T](v1, v2) + if err != nil { + return 0, err + } + + dotProduct := mat.Dot(vec[0], vec[1]) + + normVec1 := mat.Norm(vec[0], 2) + normVec2 := mat.Norm(vec[1], 2) + + if normVec1 == 0 || normVec2 == 0 { + return 0, moerr.NewInternalErrorNoCtx("cosine_similarity: one of the vectors is zero") + } + + cosineSimilarity := dotProduct / (normVec1 * normVec2) + + // Handle precision issues. Clamp the cosine_similarity to the range [-1, 1]. + if cosineSimilarity > 1.0 { + cosineSimilarity = 1.0 + } else if cosineSimilarity < -1.0 { + cosineSimilarity = -1.0 + } + + // NOTE: Downcast the float64 cosine_similarity to float32 and check if it is + // 1.0 or -1.0 to avoid precision issue. 
+ // + // Example for corner case: + // - cosine_similarity(a,a) = 1: + // - Without downcasting check, we get the following results: + // cosine_similarity( [0.46323407, 23.498016, 563.923, 56.076736, 8732.958] , + // [0.46323407, 23.498016, 563.923, 56.076736, 8732.958] ) = 0.9999999999999998 + // - With downcasting, we get the following results: + // cosine_similarity( [0.46323407, 23.498016, 563.923, 56.076736, 8732.958] , + // [0.46323407, 23.498016, 563.923, 56.076736, 8732.958] ) = 1 + // + // Reason: + // The reason for this check is + // 1. gonums mat.Dot, mat.Norm returns float64. In other databases, we mostly do float32 operations. + // 2. float64 operations are not exact. + // mysql> select 76586261.65813679/(8751.35770370157 *8751.35770370157); + //+-----------------------------------------------------------+ + //| 76586261.65813679 / (8751.35770370157 * 8751.35770370157) | + //+-----------------------------------------------------------+ + //| 1.000000000000 | + //+-----------------------------------------------------------+ + //mysql> select cast(76586261.65813679 as double)/(8751.35770370157 * 8751.35770370157); + //+---------------------------------------------------------------------------+ + //| cast(76586261.65813679 as double) / (8751.35770370157 * 8751.35770370157) | + //+---------------------------------------------------------------------------+ + //| 0.9999999999999996 | + //+---------------------------------------------------------------------------+ + // 3. We only need to handle the case for 1.0 and -1.0 with float32 precision. + // Rest of the cases can have float64 precision. 
+ cosineSimilarityF32 := float32(cosineSimilarity) + if cosineSimilarityF32 == 1 { + cosineSimilarity = 1 + } else if cosineSimilarityF32 == -1 { + cosineSimilarity = -1 + } + + return cosineSimilarity, nil +} + +func NormalizeL2[T constraints.Float](v1 []T) ([]T, error) { + + vec := ToGonumVector[T](v1) + + norm := mat.Norm(vec, 2) + if norm == 0 { + // NOTE: don't throw error here. If you throw error, then when a zero vector comes in the Vector Index + // Mapping Query, the query will fail. Instead, return the same zero vector. + // This is consistent with FAISS:https://github.com/facebookresearch/faiss/blob/0716bde2500edb2e18509bf05f5dfa37bd698082/faiss/utils/distances.cpp#L97 + return v1, nil + } + + vec.ScaleVec(1/norm, vec) + + return ToMoArray[T](vec), nil +} diff --git a/moarray/external_test.go b/moarray/external_test.go new file mode 100644 index 0000000..1da8dc9 --- /dev/null +++ b/moarray/external_test.go @@ -0,0 +1,918 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package moarray + +import ( + "github.com/matrixorigin/matrixone/pkg/common/assertx" + "reflect" + "testing" +) + +func TestAdd(t *testing.T) { + type args struct { + leftArgF32 []float32 + rightArgF32 []float32 + + leftArgF64 []float64 + rightArgF64 []float64 + } + type testCase struct { + name string + args args + wantF32 []float32 + wantF64 []float64 + } + tests := []testCase{ + { + name: "Test1 - float32", + args: args{leftArgF32: []float32{1, 2, 3}, rightArgF32: []float32{2, 3, 4}}, + wantF32: []float32{3, 5, 7}, + }, + { + name: "Test2 - float64", + args: args{leftArgF64: []float64{1, 2, 3}, rightArgF64: []float64{2, 3, 4}}, + wantF64: []float64{3, 5, 7}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.rightArgF32 != nil { + if gotRes, err := Add[float32](tt.args.leftArgF32, tt.args.rightArgF32); err != nil || !reflect.DeepEqual(gotRes, tt.wantF32) { + t.Errorf("Add() = %v, want %v", gotRes, tt.wantF32) + } + } + if tt.args.rightArgF64 != nil { + if gotRes, err := Add[float64](tt.args.leftArgF64, tt.args.rightArgF64); err != nil || !assertx.InEpsilonF64Slice(gotRes, tt.wantF64) { + t.Errorf("Add() = %v, want %v", gotRes, tt.wantF64) + } + } + }) + } +} + +func TestSubtract(t *testing.T) { + type args struct { + leftArgF32 []float32 + rightArgF32 []float32 + + leftArgF64 []float64 + rightArgF64 []float64 + } + type testCase struct { + name string + args args + wantF32 []float32 + wantF64 []float64 + } + tests := []testCase{ + { + name: "Test1 - float32", + args: args{leftArgF32: []float32{1, 2, 3}, rightArgF32: []float32{2, 3, 4}}, + wantF32: []float32{-1, -1, -1}, + }, + { + name: "Test2 - float64", + args: args{leftArgF64: []float64{1, 4, 3}, rightArgF64: []float64{1, 3, 4}}, + wantF64: []float64{0, 1, -1}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.rightArgF32 != nil { + if gotRes, err := Subtract[float32](tt.args.leftArgF32, tt.args.rightArgF32); err != nil || 
!reflect.DeepEqual(gotRes, tt.wantF32) { + t.Errorf("Subtract() = %v, want %v", gotRes, tt.wantF32) + } + } + if tt.args.rightArgF64 != nil { + if gotRes, err := Subtract[float64](tt.args.leftArgF64, tt.args.rightArgF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { + t.Errorf("Subtract() = %v, want %v", gotRes, tt.wantF64) + } + } + }) + } +} + +func TestMultiply(t *testing.T) { + type args struct { + leftArgF32 []float32 + rightArgF32 []float32 + + leftArgF64 []float64 + rightArgF64 []float64 + } + type testCase struct { + name string + args args + wantF32 []float32 + wantF64 []float64 + } + tests := []testCase{ + { + name: "Test1 - float32", + args: args{leftArgF32: []float32{1, 2, 3}, rightArgF32: []float32{2, 3, 4}}, + wantF32: []float32{2, 6, 12}, + }, + { + name: "Test2 - float64", + args: args{leftArgF64: []float64{1, 4, 3}, rightArgF64: []float64{1, 3, 4}}, + wantF64: []float64{1, 12, 12}, + }, + { + name: "Test3 - float64", + args: args{leftArgF64: []float64{0.66616553}, rightArgF64: []float64{0.66616553}}, + wantF64: []float64{0.4437765133601809}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.rightArgF32 != nil { + if gotRes, err := Multiply[float32](tt.args.leftArgF32, tt.args.rightArgF32); err != nil || !reflect.DeepEqual(tt.wantF32, gotRes) { + t.Errorf("Multiply() = %v, want %v", gotRes, tt.wantF32) + } + } + if tt.args.rightArgF64 != nil { + if gotRes, err := Multiply[float64](tt.args.leftArgF64, tt.args.rightArgF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { + t.Errorf("Multiply() = %v, want %v", gotRes, tt.wantF64) + } + } + }) + } +} + +func TestDivide(t *testing.T) { + type args struct { + leftArgF32 []float32 + rightArgF32 []float32 + + leftArgF64 []float64 + rightArgF64 []float64 + } + type testCase struct { + name string + args args + wantF32 []float32 + wantF64 []float64 + wantErr bool + } + tests := []testCase{ + { + name: "Test1 - float32", + args: 
args{leftArgF32: []float32{1, 2, 3}, rightArgF32: []float32{2, 3, 4}}, + wantF32: []float32{0.5, 0.6666667, 0.75}, + }, + { + name: "Test2 - float32 - div by zero", + args: args{leftArgF32: []float32{1, 4, 3}, rightArgF32: []float32{1, 0, 4}}, + wantErr: true, + }, + { + name: "Test3 - float64", + args: args{leftArgF64: []float64{1, 4, 3}, rightArgF64: []float64{1, 3, 4}}, + wantF64: []float64{1, 1.3333333333333333, 0.75}, + }, + { + name: "Test4 - float64 - div by zero", + args: args{leftArgF64: []float64{1, 4, 3}, rightArgF64: []float64{1, 0, 4}}, + wantErr: true, + }, + { + name: "Test5 - float64 - dimension mismatch", + args: args{leftArgF64: []float64{1, 4}, rightArgF64: []float64{1, 1, 4}}, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.rightArgF32 != nil { + if tt.wantErr { + if _, gotErr := Divide[float32](tt.args.leftArgF32, tt.args.rightArgF32); gotErr == nil { + t.Errorf("Divide() should throw error") + } + } else if gotRes, err := Divide[float32](tt.args.leftArgF32, tt.args.rightArgF32); err != nil || !reflect.DeepEqual(gotRes, tt.wantF32) { + t.Errorf("Divide() = %v, want %v", gotRes, tt.wantF32) + } + } + if tt.args.rightArgF64 != nil { + if tt.wantErr { + if _, gotErr := Divide[float64](tt.args.leftArgF64, tt.args.rightArgF64); gotErr == nil { + t.Errorf("Divide() should throw error") + } + } else if gotRes, err := Divide[float64](tt.args.leftArgF64, tt.args.rightArgF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { + t.Errorf("Divide() = %v, want %v", gotRes, tt.wantF64) + } + } + }) + } +} + +func TestCompare(t *testing.T) { + type args struct { + leftArgF32 []float32 + rightArgF32 []float32 + + leftArgF64 []float64 + rightArgF64 []float64 + } + type testCase struct { + name string + args args + want int + } + tests := []testCase{ + { + name: "Test1 - float32-less", + args: args{leftArgF32: []float32{1, 2, 3}, rightArgF32: []float32{2, 3, 4}}, + want: -1, + }, + { + 
name: "Test2 - float32-large", + args: args{leftArgF32: []float32{3, 2, 3}, rightArgF32: []float32{2, 3, 4}}, + want: 1, + }, + { + name: "Test3 - float32-equal", + args: args{leftArgF32: []float32{3, 2, 3}, rightArgF32: []float32{3, 2, 3}}, + want: 0, + }, + { + name: "Test4 - float64-less", + args: args{leftArgF64: []float64{1, 2, 3}, rightArgF64: []float64{2, 3, 4}}, + want: -1, + }, + { + name: "Test5 - float64-large", + args: args{leftArgF64: []float64{3, 2, 3}, rightArgF64: []float64{2, 3, 4}}, + want: 1, + }, + { + name: "Test6 - float64-equal", + args: args{leftArgF64: []float64{3, 2, 3}, rightArgF64: []float64{3, 2, 3}}, + want: 0, + }, + { + name: "Test7 - float64 difference dims", + args: args{leftArgF64: []float64{3, 2}, rightArgF64: []float64{3, 2, 3}}, + want: -1, + }, + { + name: "Test7 - float64 difference dims", + args: args{leftArgF64: []float64{3, 2, 3}, rightArgF64: []float64{3, 2}}, + want: 1, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.rightArgF32 != nil { + if gotRes := Compare[float32](tt.args.leftArgF32, tt.args.rightArgF32); !reflect.DeepEqual(gotRes, tt.want) { + t.Errorf("CompareArray() = %v, want %v", gotRes, tt.want) + } + } + if tt.args.rightArgF64 != nil { + if gotRes := Compare[float64](tt.args.leftArgF64, tt.args.rightArgF64); !reflect.DeepEqual(gotRes, tt.want) { + t.Errorf("CompareArray() = %v, want %v", gotRes, tt.want) + } + } + + }) + } +} + +func TestCast(t *testing.T) { + type args struct { + argF32 []float32 + argF64 []float64 + } + type testCase struct { + name string + args args + wantF32 []float32 + wantF64 []float64 + } + tests := []testCase{ + { + name: "Test1 - float32 to float64", + args: args{argF32: []float32{1, 2, 3}}, + wantF64: []float64{1, 2, 3}, + }, + { + name: "Test2 - float64 to float32", + args: args{argF64: []float64{1, 4, 3}}, + wantF32: []float32{1, 4, 3}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.argF32 != 
nil && tt.wantF64 != nil { + if gotResF64, err := Cast[float32, float64](tt.args.argF32); err != nil || !assertx.InEpsilonF64Slice(gotResF64, tt.wantF64) { + t.Errorf("Cast() = %v, want %v", gotResF64, tt.wantF64) + } + } + if tt.args.argF64 != nil && tt.wantF32 != nil { + if gotResF32, err := Cast[float64, float32](tt.args.argF64); err != nil || !reflect.DeepEqual(gotResF32, tt.wantF32) { + t.Errorf("Cast() = %v, want %v", gotResF32, tt.wantF32) + } + } + }) + } +} + +func TestAbs(t *testing.T) { + type args struct { + argF32 []float32 + argF64 []float64 + } + type testCase struct { + name string + args args + + wantF32 []float32 + wantF64 []float64 + } + tests := []testCase{ + { + name: "Test1 - float32", + args: args{argF32: []float32{-1, 0, -3.4e+38}}, + wantF32: []float32{1, 0, 3.4e+38}, + }, + { + name: "Test2 - float64", + args: args{argF64: []float64{-1, 0, 3}}, + wantF64: []float64{1, 0, 3}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.argF32 != nil { + if gotRes, err := Abs[float32](tt.args.argF32); err != nil || !reflect.DeepEqual(gotRes, tt.wantF32) { + t.Errorf("Abs() = %v, want %v", gotRes, tt.wantF32) + } + } + if tt.args.argF64 != nil { + if gotRes, err := Abs[float64](tt.args.argF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { + t.Errorf("Abs() = %v, want %v", gotRes, tt.wantF64) + } + } + }) + } +} + +func TestNormalizeL2(t *testing.T) { + type args struct { + argF32 []float32 + argF64 []float64 + } + type testCase struct { + name string + args args + + wantF32 []float32 + wantF64 []float64 + wantErr bool + } + tests := []testCase{ + { + name: "Test1 - float32 - zero vector", + args: args{argF32: []float32{0, 0, 0}}, + wantF32: []float32{0, 0, 0}, + }, + { + name: "Test1.b - float32", + args: args{argF32: []float32{1, 2, 3}}, + wantF32: []float32{0.26726124, 0.5345225, 0.80178374}, + }, + { + name: "Test1.c - float32", + args: args{argF32: []float32{10, 3.333333333333333, 4, 5}}, + 
wantF32: []float32{0.8108108, 0.27027026, 0.32432434, 0.4054054}, + }, + { + name: "Test2 - float64 - zero vector", + args: args{argF64: []float64{0, 0, 0}}, + wantF64: []float64{0, 0, 0}, + }, + { + name: "Test3 - float64", + args: args{argF64: []float64{1, 2, 3}}, + wantF64: []float64{0.2672612419124244, 0.5345224838248488, 0.8017837257372732}, + }, + { + name: "Test4 - float64", + args: args{argF64: []float64{-1, 2, 3}}, + wantF64: []float64{-0.2672612419124244, 0.5345224838248488, 0.8017837257372732}, + }, + { + name: "Test5 - float64", + args: args{argF64: []float64{10, 3.333333333333333, 4, 5}}, + wantF64: []float64{0.8108108108108107, 0.27027027027027023, 0.3243243243243243, 0.4054054054054054}, + }, + { + name: "Test6 - float64", + args: args{argF64: []float64{1, 2, 3.6666666666666665, 4.666666666666666}}, + wantF64: []float64{0.15767649936829103, 0.31535299873658207, 0.5781471643504004, 0.7358236637186913}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.argF32 != nil { + if tt.wantErr { + if _, err := NormalizeL2[float32](tt.args.argF32); err == nil { + t.Errorf("NormalizeL2() should throw error") + } + } else if gotRes, err := NormalizeL2[float32](tt.args.argF32); err != nil || !reflect.DeepEqual(tt.wantF32, gotRes) { + t.Errorf("NormalizeL2() = %v, want %v", gotRes, tt.wantF32) + } + } + if tt.args.argF64 != nil { + if tt.wantErr { + if _, err := NormalizeL2[float64](tt.args.argF64); err == nil { + t.Errorf("NormalizeL2() should throw error") + } + } else if gotRes, err := NormalizeL2[float64](tt.args.argF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { + t.Errorf("NormalizeL2() = %v, want %v", gotRes, tt.wantF64) + } + } + }) + } +} + +func TestSqrt(t *testing.T) { + type args struct { + argF32 []float32 + argF64 []float64 + } + type testCase struct { + name string + args args + + wantF32 []float64 // ie result for argF32 is []float32 + wantF64 []float64 // ie result for argF64 is []float64 
+ wantErr bool + } + tests := []testCase{ + { + name: "Test1 - float32", + args: args{argF32: []float32{1, 0, 4}}, + wantF32: []float64{1, 0, 2}, + }, + { + name: "Test2 - float32 error case", + args: args{argF32: []float32{-1, 0, 4}}, + wantErr: true, + }, + { + name: "Test3 - float64", + args: args{argF64: []float64{1, 0, 4}}, + wantF64: []float64{1, 0, 2}, + }, + { + name: "Test4 - float64 error case", + args: args{argF64: []float64{-1, 0, 4}}, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.argF32 != nil { + if tt.wantErr { + if _, err := Sqrt[float32](tt.args.argF32); err == nil { + t.Errorf("Sqrt() should throw error") + } + } else if gotRes, err := Sqrt[float32](tt.args.argF32); err != nil || !reflect.DeepEqual(gotRes, tt.wantF32) { + t.Errorf("Sqrt() = %v, want %v", err, tt.wantErr) + t.Errorf("Sqrt() = %v, want %v", gotRes, tt.wantF32) + } + } + if tt.args.argF64 != nil { + if tt.wantErr { + if _, err := Sqrt[float64](tt.args.argF64); err == nil { + t.Errorf("Sqrt() should throw error") + } + } else if gotRes, err := Sqrt[float64](tt.args.argF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { + t.Errorf("Sqrt() = %v, want %v", gotRes, tt.wantF64) + } + } + + }) + } +} + +func TestSummation(t *testing.T) { + type args struct { + argF32 []float32 + argF64 []float64 + } + type testCase struct { + name string + args args + want float64 + } + tests := []testCase{ + { + name: "Test1 - float32", + args: args{argF32: []float32{1, 2, 3}}, + want: 6, + }, + { + name: "Test2 - float64", + args: args{argF64: []float64{1, 2, 3}}, + want: 6, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.argF32 != nil { + if gotRes, err := Summation[float32](tt.args.argF32); err != nil || !assertx.InEpsilonF64(tt.want, gotRes) { + t.Errorf("Summation() = %v, want %v", gotRes, tt.want) + } + } + if tt.args.argF64 != nil { + if gotRes, err := 
Summation[float64](tt.args.argF64); err != nil || !assertx.InEpsilonF64(tt.want, gotRes) { + t.Errorf("Summation() = %v, want %v", gotRes, tt.want) + } + } + + }) + } +} + +func TestInnerProduct(t *testing.T) { + type args struct { + argLeftF32 []float32 + argRightF32 []float32 + + argLeftF64 []float64 + argRightF64 []float64 + } + type testCase struct { + name string + args args + want float64 + } + tests := []testCase{ + { + name: "Test1 - float32", + args: args{argLeftF32: []float32{1, 2, 3}, argRightF32: []float32{1, 2, 3}}, + want: 14, + }, + { + name: "Test2 - float64", + args: args{argLeftF64: []float64{1, 2, 3}, argRightF64: []float64{1, 2, 3}}, + want: 14, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.argLeftF32 != nil { + if gotRes, _ := InnerProduct[float32](tt.args.argLeftF32, tt.args.argRightF32); !assertx.InEpsilonF64(tt.want, gotRes) { + t.Errorf("InnerProduct() = %v, want %v", gotRes, tt.want) + } + } + if tt.args.argLeftF64 != nil { + if gotRes, _ := InnerProduct[float64](tt.args.argLeftF64, tt.args.argRightF64); !assertx.InEpsilonF64(tt.want, gotRes) { + t.Errorf("InnerProduct() = %v, want %v", gotRes, tt.want) + } + } + + }) + } +} + +func TestL1Norm(t *testing.T) { + type args struct { + argF32 []float32 + argF64 []float64 + } + type testCase struct { + name string + args args + want float64 + } + tests := []testCase{ + { + name: "Test1 - float32", + args: args{argF32: []float32{1, 2, 3}}, + want: 6, + }, + { + name: "Test2 - float64", + args: args{argF64: []float64{1, 2, 3}}, + want: 6, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.argF32 != nil { + if gotRes, _ := L1Norm[float32](tt.args.argF32); !assertx.InEpsilonF64(tt.want, gotRes) { + t.Errorf("L1Norm() = %v, want %v", gotRes, tt.want) + } + } + if tt.args.argF64 != nil { + if gotRes, _ := L1Norm[float64](tt.args.argF64); !assertx.InEpsilonF64(tt.want, gotRes) { + t.Errorf("L1Norm() = %v, want %v", 
gotRes, tt.want) + } + } + + }) + } +} + +func TestL2Norm(t *testing.T) { + type args struct { + argF32 []float32 + argF64 []float64 + } + type testCase struct { + name string + args args + want float64 + } + tests := []testCase{ + { + name: "Test1 - float32", + args: args{argF32: []float32{1, 2, 3}}, + want: 3.741657386773941, + }, + { + name: "Test2 - float64", + args: args{argF64: []float64{1, 2, 3}}, + want: 3.741657386773941, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.argF32 != nil { + if gotRes, _ := L2Norm[float32](tt.args.argF32); !assertx.InEpsilonF64(tt.want, gotRes) { + t.Errorf("L2Norm() = %v, want %v", gotRes, tt.want) + } + } + if tt.args.argF64 != nil { + if gotRes, _ := L2Norm[float64](tt.args.argF64); !assertx.InEpsilonF64(tt.want, gotRes) { + t.Errorf("L2Norm() = %v, want %v", gotRes, tt.want) + } + } + + }) + } +} + +func TestCosineSimilarity(t *testing.T) { + type args struct { + argLeftF32 []float32 + argRightF32 []float32 + + argLeftF64 []float64 + argRightF64 []float64 + } + type testCase struct { + name string + args args + want float64 + } + tests := []testCase{ + { + name: "Test1.a - float32", + args: args{argLeftF32: []float32{1, 2, 3}, argRightF32: []float32{1, 2, 3}}, + want: 1, + }, + { + name: "Test1.b - float32", + args: args{argLeftF32: []float32{0.46323407, 23.498016, 563.923, 56.076736, 8732.958}, argRightF32: []float32{0.46323407, 23.498016, 563.923, 56.076736, 8732.958}}, + want: 1, + }, + { + name: "Test2.a - float64", + args: args{argLeftF64: []float64{1, 2, 3}, argRightF64: []float64{1, 2, 3}}, + want: 1, + }, + { + name: "Test2.b - float64", + args: args{argLeftF64: []float64{0.46323407, 23.498016, 563.923, 56.076736, 8732.958}, argRightF64: []float64{0.46323407, 23.498016, 563.923, 56.076736, 8732.958}}, + want: 1, + }, + { + name: "Test2.c - float64", + args: args{argLeftF64: []float64{0.8166459, 0.66616553, 0.4886152}, argRightF64: []float64{0.8166459, 0.66616553, 
0.4886152}}, + want: 1, + }, + { + name: "Test2.d - float64", + args: args{argLeftF64: []float64{8.5606893, 6.7903588, 821.977768}, argRightF64: []float64{8.5606893, 6.7903588, 821.977768}}, + want: 1, + }, + { + name: "Test2.e - float64", + args: args{argLeftF64: []float64{0.9260021, 0.26637346, 0.06567037}, argRightF64: []float64{0.9260021, 0.26637346, 0.06567037}}, + want: 1, + }, + { + name: "Test2.f - float64", + args: args{argLeftF64: []float64{0.45756745, 65.2996871, 321.623636, 3.60082066, 87.58445764}, argRightF64: []float64{0.45756745, 65.2996871, 321.623636, 3.60082066, 87.58445764}}, + want: 1, + }, + { + name: "Test2.g - float64", + args: args{argLeftF64: []float64{0.46323407, 23.49801546, 563.9229458, 56.07673508, 8732.9583881}, argRightF64: []float64{0.46323407, 23.49801546, 563.9229458, 56.07673508, 8732.9583881}}, + want: 1, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.argLeftF32 != nil { + if gotRes, _ := CosineSimilarity[float32](tt.args.argLeftF32, tt.args.argRightF32); !reflect.DeepEqual(tt.want, gotRes) { + t.Errorf("CosineSimilarity() = %v, want %v", gotRes, tt.want) + } + } + if tt.args.argLeftF64 != nil { + if gotRes, _ := CosineSimilarity[float64](tt.args.argLeftF64, tt.args.argRightF64); !reflect.DeepEqual(tt.want, gotRes) { + t.Errorf("CosineSimilarity() = %v, want %v", gotRes, tt.want) + } + } + + }) + } +} + +func TestL2Distance(t *testing.T) { + type args struct { + argLeftF32 []float32 + argRightF32 []float32 + + argLeftF64 []float64 + argRightF64 []float64 + } + type testCase struct { + name string + args args + want float64 + } + tests := []testCase{ + { + name: "Test1 - float32", + args: args{argLeftF32: []float32{1, 2, 3}, argRightF32: []float32{10, 20, 30}}, + want: 33.67491648096547, + }, + { + name: "Test2 - float64", + args: args{argLeftF64: []float64{1, 2, 3}, argRightF64: []float64{10, 20, 30}}, + want: 33.67491648096547, + }, + } + for _, tt := range tests { + t.Run(tt.name, 
func(t *testing.T) { + + if tt.args.argLeftF32 != nil { + if gotRes, _ := L2Distance[float32](tt.args.argLeftF32, tt.args.argRightF32); !assertx.InEpsilonF64(tt.want, gotRes) { + t.Errorf("L2Distance() = %v, want %v", gotRes, tt.want) + } + } + if tt.args.argLeftF64 != nil { + if gotRes, _ := L2Distance[float64](tt.args.argLeftF64, tt.args.argRightF64); !assertx.InEpsilonF64(tt.want, gotRes) { + t.Errorf("L2Distance() = %v, want %v", gotRes, tt.want) + } + } + + }) + } +} + +func TestCosineDistance(t *testing.T) { + type args struct { + argLeftF32 []float32 + argRightF32 []float32 + + argLeftF64 []float64 + argRightF64 []float64 + } + type testCase struct { + name string + args args + want float64 + } + tests := []testCase{ + { + name: "Test1 - float32", + args: args{argLeftF32: []float32{1, 2, 3}, argRightF32: []float32{-1, -2, -3}}, + want: 2, + }, + { + name: "Test2 - float64", + args: args{argLeftF64: []float64{1, 2, 3}, argRightF64: []float64{1, 2, 3}}, + want: 0, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.argLeftF32 != nil { + if gotRes, _ := CosineDistance[float32](tt.args.argLeftF32, tt.args.argRightF32); !assertx.InEpsilonF64(tt.want, gotRes) { + t.Errorf("CosineDistance() = %v, want %v", gotRes, tt.want) + } + } + if tt.args.argLeftF64 != nil { + if gotRes, _ := CosineDistance[float64](tt.args.argLeftF64, tt.args.argRightF64); !assertx.InEpsilonF64(tt.want, gotRes) { + t.Errorf("CosineDistance() = %v, want %v", gotRes, tt.want) + } + } + + }) + } +} + +func TestScalarOp(t *testing.T) { + type args struct { + argVecF32 []float32 + argVecF64 []float64 + argOp string + argSca float64 + } + type testCase struct { + name string + args args + wantVecF32 []float32 + wantVecF64 []float64 + } + tests := []testCase{ + { + name: "Test1 - float32", + args: args{argVecF32: []float32{1, 2, 3}, argOp: "+", argSca: 2}, + wantVecF32: []float32{3, 4, 5}, + }, + { + name: "Test2 - float32", + args: args{argVecF32: []float32{1, 
2, 3}, argOp: "-", argSca: 2}, + wantVecF32: []float32{-1, 0, 1}, + }, + { + name: "Test3 - float32", + args: args{argVecF32: []float32{1, 2, 3}, argOp: "*", argSca: 2}, + wantVecF32: []float32{2, 4, 6}, + }, + { + name: "Test4 - float32", + args: args{argVecF32: []float32{1, 2, 3}, argOp: "/", argSca: 2}, + wantVecF32: []float32{0.5, 1, 1.5}, + }, + + { + name: "Test5 - float64", + args: args{argVecF64: []float64{1, 2, 3}, argOp: "+", argSca: 2}, + wantVecF64: []float64{3, 4, 5}, + }, + { + name: "Test6 - float64", + args: args{argVecF64: []float64{1, 2, 3}, argOp: "-", argSca: 2}, + wantVecF64: []float64{-1, 0, 1}, + }, + { + name: "Test7 - float64", + args: args{argVecF64: []float64{1, 2, 3}, argOp: "*", argSca: 2}, + wantVecF64: []float64{2, 4, 6}, + }, + { + name: "Test8 - float64", + args: args{argVecF64: []float64{1, 2, 3}, argOp: "/", argSca: 2}, + wantVecF64: []float64{0.5, 1, 1.5}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.argVecF32 != nil { + if gotRes, _ := ScalarOp[float32](tt.args.argVecF32, tt.args.argOp, tt.args.argSca); !reflect.DeepEqual(gotRes, tt.wantVecF32) { + t.Errorf("ScalarOp() = %v, want %v", gotRes, tt.wantVecF32) + } + } else if tt.args.argVecF64 != nil { + if gotRes, _ := ScalarOp[float64](tt.args.argVecF64, tt.args.argOp, tt.args.argSca); !reflect.DeepEqual(gotRes, tt.wantVecF64) { + t.Errorf("ScalarOp() = %v, want %v", gotRes, tt.wantVecF64) + } + } + + }) + } +} diff --git a/moarray/gonums_utils.go b/moarray/gonums_utils.go new file mode 100644 index 0000000..6b85a8f --- /dev/null +++ b/moarray/gonums_utils.go @@ -0,0 +1,76 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package moarray + +import ( + "github.com/arjunsk/kmeans/moerr" + "golang.org/x/exp/constraints" + "gonum.org/v1/gonum/mat" +) + +func ToGonumVector[T constraints.Float](arr1 []T) *mat.VecDense { + + n := len(arr1) + _arr1 := make([]float64, n) + + //TODO: @arjun optimize this cast to retain float32 precision in float64 array + // if float64, just copy + // if float32, convert to float64 without losing precision + for i := 0; i < n; i++ { + _arr1[i] = float64(arr1[i]) + } + + return mat.NewVecDense(n, _arr1) +} + +func ToGonumVectors[T constraints.Float](arrays ...[]T) (res []*mat.VecDense, err error) { + + n := len(arrays) + if n == 0 { + return res, nil + } + + array0Dim := len(arrays[0]) + for i := 1; i < n; i++ { + if len(arrays[i]) != array0Dim { + return nil, moerr.NewArrayInvalidOpNoCtx(array0Dim, len(arrays[i])) + } + } + + res = make([]*mat.VecDense, n) + + for i, arr := range arrays { + res[i] = ToGonumVector[T](arr) + } + return res, nil +} + +func ToMoArray[T constraints.Float](vec *mat.VecDense) (arr []T) { + n := vec.Len() + arr = make([]T, n) + for i := 0; i < n; i++ { + //TODO: @arjun optimize this cast + arr[i] = T(vec.AtVec(i)) + } + return +} + +func ToMoArrays[T constraints.Float](vecs []*mat.VecDense) [][]T { + moVectors := make([][]T, len(vecs)) + for i, vec := range vecs { + moVectors[i] = ToMoArray[T](vec) + } + return moVectors +} diff --git a/moarray/gonums_utils_test.go b/moarray/gonums_utils_test.go new file mode 100644 index 0000000..844d093 --- /dev/null +++ b/moarray/gonums_utils_test.go @@ -0,0 +1,79 @@ +// 
Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package moarray + +import ( + "gonum.org/v1/gonum/mat" + "reflect" + "testing" +) + +func Test_ToGonumVectors(t *testing.T) { + type args struct { + vectors [][]float64 + } + tests := []struct { + name string + args args + want []*mat.VecDense + }{ + { + name: "Test1", + args: args{ + vectors: [][]float64{{1, 2, 3}, {4, 5, 6}}, + }, + want: []*mat.VecDense{ + mat.NewVecDense(3, []float64{1, 2, 3}), + mat.NewVecDense(3, []float64{4, 5, 6}), + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got, _ := ToGonumVectors[float64](tt.args.vectors...); !reflect.DeepEqual(got, tt.want) { + t.Errorf("ToGonumVectors() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_ToMoArrays(t *testing.T) { + type args struct { + vectors []*mat.VecDense + } + tests := []struct { + name string + args args + want [][]float64 + }{ + { + name: "Test1", + args: args{ + vectors: []*mat.VecDense{ + mat.NewVecDense(3, []float64{1, 2, 3}), + mat.NewVecDense(3, []float64{4, 5, 6}), + }, + }, + want: [][]float64{{1, 2, 3}, {4, 5, 6}}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := ToMoArrays[float64](tt.args.vectors); !reflect.DeepEqual(got, tt.want) { + t.Errorf("ToMoArrays() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/moarray/internal.go b/moarray/internal.go new file mode 100644 index 0000000..fad5099 --- 
/dev/null +++ b/moarray/internal.go @@ -0,0 +1,42 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package moarray + +import "gonum.org/v1/gonum/mat" + +// These functions are use internally by the kmeans algorithm and vector index etc. They are not exposed externally. + +// NormalizeGonumVector normalizes a vector in place. +// Note that this function is used by the kmeans algorithm. Here, if we get a zero vector, we do not normalize it and +// return it directly. This is because the zero vector is a valid vector in the kmeans algorithm. +func NormalizeGonumVector(vector *mat.VecDense) { + norm := mat.Norm(vector, 2) + if norm != 0 { + vector.ScaleVec(1/norm, vector) + } +} + +func NormalizeGonumVectors(vectors []*mat.VecDense) { + for i := range vectors { + NormalizeGonumVector(vectors[i]) + } +} + +//// NormalizeMoVecf64 is used only in test functions. +//func NormalizeMoVecf64(vector []float64) []float64 { +// res := ToGonumVector[float64](vector) +// //NormalizeGonumVector(res) +// return ToMoArray[float64](res) +//} diff --git a/moarray/internal_test.go b/moarray/internal_test.go new file mode 100644 index 0000000..82d9077 --- /dev/null +++ b/moarray/internal_test.go @@ -0,0 +1,69 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package moarray + +//func TestNormalizeMoArray(t *testing.T) { +// type args struct { +// vector []float64 +// } +// tests := []struct { +// name string +// args args +// want []float64 +// }{ +// { +// name: "Test1", +// args: args{ +// vector: []float64{1, 2, 3}, +// }, +// want: []float64{0.2672612419124244, 0.5345224838248488, 0.8017837257372732}, +// }, +// { +// name: "Test2", +// args: args{ +// vector: []float64{-1, 2, 3}, +// }, +// want: []float64{-0.2672612419124244, 0.5345224838248488, 0.8017837257372732}, +// }, +// { +// name: "Test3", +// args: args{ +// vector: []float64{0, 0, 0}, +// }, +// want: []float64{0, 0, 0}, +// }, +// { +// name: "Test4", +// args: args{ +// vector: []float64{10, 3.333333333333333, 4, 5}, +// }, +// want: []float64{0.8108108108108107, 0.27027027027027023, 0.3243243243243243, 0.4054054054054054}, +// }, +// { +// name: "Test5", +// args: args{ +// vector: []float64{1, 2, 3.6666666666666665, 4.666666666666666}, +// }, +// want: []float64{0.15767649936829103, 0.31535299873658207, 0.5781471643504005, 0.7358236637186913}, +// }, +// } +// for _, tt := range tests { +// t.Run(tt.name, func(t *testing.T) { +// if got := NormalizeMoVecf64(tt.args.vector); !assertx.InEpsilonF64Slice(tt.want, got) { +// t.Errorf("NormalizeMoVecf64() = %v, want %v", got, tt.want) +// } +// }) +// } +//} diff --git a/moerr/error.go b/moerr/error.go new file mode 100644 index 0000000..70a08ab --- /dev/null +++ b/moerr/error.go @@ -0,0 +1,20 @@ +package moerr + +import ( + "errors" + "fmt" +) + +func NewInternalErrorNoCtx(msg string, 
args ...any) error { + xmsg := fmt.Sprintf(msg, args...) + return errors.New(xmsg) +} + +func NewDivByZeroNoCtx() error { + return errors.New("division by zero") +} + +func NewArrayInvalidOpNoCtx(expected, actual int) error { + xmsg := fmt.Sprintf("vector ops between different dimensions (%v, %v) is not permitted.", expected, actual) + return errors.New(xmsg) +} diff --git a/options.go b/options.go deleted file mode 100644 index 8d82a7e..0000000 --- a/options.go +++ /dev/null @@ -1,116 +0,0 @@ -package kmeans - -import ( - "fmt" - "github.com/arjunsk/kmeans/clusterer" - "github.com/arjunsk/kmeans/containers" - "github.com/arjunsk/kmeans/initializer" -) - -var ( - defaultDeltaThreshold = 0.01 - defaultIterationThreshold = 500 -) - -type ClustererType int - -const ( - ELKAN ClustererType = iota - KMEANS - KMEANSPP // Kmeans Plus Plus -) - -type options struct { - clusterType ClustererType - vectors [][]float64 - clusterCnt int - distanceFn containers.DistanceFunction - initializer initializer.Initializer - deltaThreshold *float64 - iterationThreshold *int -} - -func (o *options) fillDefaults() { - if o.initializer == nil && o.clusterType != KMEANSPP { - o.initializer = initializer.NewRandomInitializer() - } - if o.distanceFn == nil { - o.distanceFn = containers.EuclideanDistance - } - if o.deltaThreshold == nil { - o.deltaThreshold = &defaultDeltaThreshold - } - if o.iterationThreshold == nil { - o.iterationThreshold = &defaultIterationThreshold - } - -} - -type Option func(options *options) error - -// NewCluster constructs an options instance with the provided functional options. 
-func NewCluster(clustererType ClustererType, vectors [][]float64, clusterCnt int, opts ...Option) (clusterer.Clusterer, error) { - o := &options{ - clusterType: clustererType, - vectors: vectors, - clusterCnt: clusterCnt, - } - for _, opt := range opts { - if err := opt(o); err != nil { - return nil, err - } - } - o.fillDefaults() - - switch o.clusterType { - case ELKAN: - return clusterer.NewKmeansElkan(o.vectors, o.clusterCnt, - *o.deltaThreshold, *o.iterationThreshold, - o.distanceFn, o.initializer) - case KMEANS: - return clusterer.NewKmeans(o.vectors, o.clusterCnt, - *o.deltaThreshold, *o.iterationThreshold, - o.distanceFn, o.initializer) - case KMEANSPP: - if o.initializer != nil { - panic("can't override initializer for KMEANS_PLUS_PLUS") - } - return clusterer.NewKmeansPlusPlus(o.vectors, o.clusterCnt, - *o.deltaThreshold, *o.iterationThreshold, - o.distanceFn) - default: - return nil, fmt.Errorf("invalid cluster type %v", o.clusterType) - } -} - -// WithDistanceFunction sets the distance function in options. -func WithDistanceFunction(fn containers.DistanceFunction) Option { - return func(o *options) error { - o.distanceFn = fn - return nil - } -} - -// WithInitializer sets the initializer in options. -func WithInitializer(init initializer.Initializer) Option { - return func(o *options) error { - o.initializer = init - return nil - } -} - -// WithDeltaThreshold sets the delta threshold in options. -func WithDeltaThreshold(delta float64) Option { - return func(o *options) error { - o.deltaThreshold = &delta - return nil - } -} - -// WithIterationThreshold sets the iteration threshold in options. 
-func WithIterationThreshold(iterations int) Option { - return func(o *options) error { - o.iterationThreshold = &iterations - return nil - } -} diff --git a/sampler/srs.go b/sampler/srs.go deleted file mode 100644 index 74f4fe0..0000000 --- a/sampler/srs.go +++ /dev/null @@ -1,33 +0,0 @@ -package sampler - -import ( - "math/rand" - "time" -) - -// SrsSampling ie Simple Random Sampling, sample input with equal probability. -func SrsSampling[T any](input []T, percent float64) []T { - if percent >= 100.0 { - return input - } - if percent <= 0.0 { - return nil - } - - r := rand.New(rand.NewSource(time.Now().UnixNano())) - - n := int(float64(len(input)) * percent / 100.0) // calculate how many items based on percentage - subset := make([]T, 0, n) - selected := make(map[int]bool) - - for i := 0; i < n; i++ { - idx := r.Intn(len(input)) - for selected[idx] { - idx = r.Intn(len(input)) - } - selected[idx] = true - subset = append(subset, input[idx]) - } - - return subset -} diff --git a/sampler/srs_test.go b/sampler/srs_test.go deleted file mode 100644 index 44d0c07..0000000 --- a/sampler/srs_test.go +++ /dev/null @@ -1,34 +0,0 @@ -package sampler - -import ( - "testing" -) - -func TestSrsSampling(t *testing.T) { - type args[T any] struct { - input []T - percent float64 - } - type testCase[T any] struct { - name string - args args[T] - wantCount int - } - tests := []testCase[int]{ - { - name: "Test1", - args: struct { - input []int - percent float64 - }{input: []int{1, 2, 3, 4, 5, 6}, percent: 50}, - wantCount: 3, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := SrsSampling(tt.args.input, tt.args.percent); len(got) != tt.wantCount { - t.Errorf("SrsSampling() actual wantCount = %v, expected wantCount %v", len(got), tt.wantCount) - } - }) - } -} diff --git a/sampler/types.go b/sampler/types.go deleted file mode 100644 index 87972d4..0000000 --- a/sampler/types.go +++ /dev/null @@ -1,5 +0,0 @@ -package sampler - -// This pkg implements 
sub-sampling algorithms to pick a subset of items from the input - -type Sampling[T any] func(input []T, percent float64) []T diff --git a/types.go b/types.go new file mode 100644 index 0000000..c7dc9d7 --- /dev/null +++ b/types.go @@ -0,0 +1,45 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kmeans + +import "gonum.org/v1/gonum/mat" + +const DefaultRandSeed = 1 + +type Clusterer interface { + InitCentroids() error + Cluster() ([][]float64, error) + SSE() float64 +} + +type DistanceType uint16 + +const ( + L2Distance DistanceType = iota + //InnerProduct + //CosineDistance +) + +type InitType uint16 + +const ( + Random InitType = iota + KmeansPlusPlus +) + +// DistanceFunction is a function that computes the distance between two vectors +// NOTE: clusterer already ensures that the all the input vectors are of the same length, +// so we don't need to check for that here again and return error if the lengths are different. +type DistanceFunction func(v1, v2 *mat.VecDense) float64