diff --git a/README.md b/README.md index 2d9a981..cd8d4b6 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,7 @@ This is a simple implementation of the [Elkan's Kmeans](https://cdn.aaai.org/ICML/2003/ICML03-022.pdf) -algorithm in Go. The library also contains [Kmeans++](https://en.wikipedia.org/wiki/K-means%2B%2B), -[Lloyd's kmeans](https://en.wikipedia.org/wiki/K-means_clustering#Standard_algorithm_(naive_k-means)) and -[Simple Random Sampling](https://en.wikipedia.org/wiki/Simple_random_sample) algorithms. +algorithm in Go. ### Installing @@ -24,36 +22,44 @@ package main import ( "fmt" "github.com/arjunsk/kmeans" + "github.com/arjunsk/kmeans/elkans" ) func main() { - vectors := [][]float64{ + vectorList := [][]float64{ {1, 2, 3, 4}, - {0, 3, 4, 1}, - {0, 9, 3, 1}, - {0, 8, 4, 4}, - {130, 200, 343, 224}, - {100, 200, 300, 400}, - {300, 400, 200, 110}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {10, 2, 4, 5}, + {10, 3, 4, 5}, + {10, 5, 4, 5}, + {10, 2, 4, 5}, + {10, 3, 4, 5}, + {10, 5, 4, 5}, } - clusterer, err := kmeans.NewCluster(kmeans.ELKAN, vectors, 2) + clusterer, err := elkans.NewKMeans(vectorList, 2, + 500, 0.5, + kmeans.L2Distance, kmeans.KmeansPlusPlus, false) if err != nil { panic(err) } - clusters, err := clusterer.Cluster() + centroids, err := clusterer.Cluster() if err != nil { panic(err) } - for _, cluster := range clusters { - fmt.Println(cluster.Center()) + for _, centroid := range centroids { + fmt.Println(centroid) } - // Output: - // [1 2 3 4] - // [130 200 343 224] - + /* + [1 2 3.6666666666666665 4.666666666666666] + [10 3.333333333333333 4 5] + */ } ``` @@ -61,57 +67,7 @@ func main() {
Read More -#### Why not Kmeans++ initialization in Elkan's? - -The default settings of Elkan's Kmeans is to use [random initialization](/initializer/random.go) -instead of [Kmeans++ initialization](/initializer/kmeans_plus_plus.go). - -Based on the excerpt -from [FAISS discussion](https://github.com/facebookresearch/faiss/issues/268#issuecomment-348184505), it was observed -that Kmeans++ overhead computation cost is not worth for large scale use case. - -> Scikitlearn uses k-means++ initialization by default (you can also use random points), which is good in the specific -> corner-case you consider. It should actually gives you perfect result even without any iteration with high -> probability, -> because the kind of evaluation you consider is exactly what k-means++ has be designed to better handle. -> We have not implemented it in Faiss, because with our former Yael library, which implements both k-means++ and regular -> random initialization, we observed that the overhead computational cost was not worth the saving (negligible) in all -> large-scale settings we have considered. - -#### When should you consider sub-sampling? - -As mentioned [here](https://github.com/facebookresearch/faiss/wiki/FAQ#can-i-ignore-warning-clustering-xxx-points-to-yyy-centroids), -when the number of vectors is large, it is recommended to use sub-sampling. - - -> When applying k-means algorithm to cluster n points to k centroids, there are several cases: -> -> - n < k: this raises an exception with an assertion because we cannot do anything meaningful -> - n < min_points_per_centroid * k: this produces the warning above. It means that usually there are too few points to - reliably estimate the centroids. This may still be ok if the dataset to index is as small as the training set. -> - n < max_points_per_centroid * k: comfort zone -> - n > max_points_per_centroid * k: there are too many points, making k-means unnecessarily slow. Then the training set - is sampled. -> ->The parameters {min,max}_points_per_centroids (39 and 256 by default) belong to the ClusteringParameters structure. - -#### What could be your sample size? -- [Apache Sedona](https://github.com/apache/sedona/blob/06e7d679ff979a4f052e0afe5df0b303bf8d70fb/spark/common/src/main/java/org/apache/sedona/core/utils/RDDSampleUtils.java#L36C10-L36C10) uses the following sampling rule. - -> Number of partitions (ie K) cannot exceed half the number of records. -> -> Returns total number of records if it is < 1000. Otherwise, returns 1% of the total number -> of records or 2x number of partitions whichever is larger. Never returns a -> number > Integer.MAX_VALUE. - -The 2x could be based on the dimension of vector (here is geo-coordinates). For example, if the vector is -1000 dimension, then the sample size could be Max( 1% * total vectors, 1000 x k). - -- Based on FAISS, the sample size could be `max_points_per_centroid * k` if `n > max_points_per_centroid * k`. - - - -#### What should be the ideal K? +#### What should be the ideal Centroids Count? Based on the recommendations from [PGVector](https://github.com/pgvector/pgvector/tree/master#ivfflat) IVF INDEX, the idea K should diff --git a/elkans/clusterer.go b/elkans/clusterer.go index da174f3..37476e3 100644 --- a/elkans/clusterer.go +++ b/elkans/clusterer.go @@ -16,8 +16,8 @@ package elkans import ( "github.com/arjunsk/kmeans" - "github.com/arjunsk/kmeans/moarray" - "github.com/arjunsk/kmeans/moerr" + moarray2 "github.com/arjunsk/kmeans/utils/moarray" + "github.com/arjunsk/kmeans/utils/moerr" "gonum.org/v1/gonum/mat" "math" "math/rand" @@ -87,7 +87,7 @@ func NewKMeans(vectors [][]float64, clusterCnt, return nil, err } - gonumVectors, err := moarray.ToGonumVectors[float64](vectors...) + gonumVectors, err := moarray2.ToGonumVectors[float64](vectors...) if err != nil { return nil, err } @@ -153,11 +153,11 @@ func (km *ElkanClusterer) InitCentroids() error { // Cluster returns the final centroids and the error if any. func (km *ElkanClusterer) Cluster() ([][]float64, error) { if km.normalize { - moarray.NormalizeGonumVectors(km.vectorList) + moarray2.NormalizeGonumVectors(km.vectorList) } if km.vectorCnt == km.clusterCnt { - return moarray.ToMoArrays[float64](km.vectorList), nil + return moarray2.ToMoArrays[float64](km.vectorList), nil } err := km.InitCentroids() // step 0.1 @@ -172,7 +172,7 @@ func (km *ElkanClusterer) Cluster() ([][]float64, error) { return nil, err } - return moarray.ToMoArrays[float64](res), nil + return moarray2.ToMoArrays[float64](res), nil } func (km *ElkanClusterer) elkansCluster() ([]*mat.VecDense, error) { @@ -384,7 +384,7 @@ func (km *ElkanClusterer) recalculateCentroids() []*mat.VecDense { // normalize the random vector if km.normalize { - moarray.NormalizeGonumVector(newCentroids[c]) + moarray2.NormalizeGonumVector(newCentroids[c]) } } else { // find the mean of the cluster members diff --git a/elkans/clusterer_test.go b/elkans/clusterer_test.go index bf6b951..a9708a8 100644 --- a/elkans/clusterer_test.go +++ b/elkans/clusterer_test.go @@ -16,8 +16,8 @@ package elkans import ( "github.com/arjunsk/kmeans" - "github.com/arjunsk/kmeans/assertx" - "github.com/arjunsk/kmeans/moarray" + "github.com/arjunsk/kmeans/utils/assertx" + "github.com/arjunsk/kmeans/utils/moarray" "reflect" "testing" ) diff --git a/elkans/distance_func.go b/elkans/distance_func.go index ef60ea7..02f1859 100644 --- a/elkans/distance_func.go +++ b/elkans/distance_func.go @@ -16,8 +16,9 @@ package elkans import ( "github.com/arjunsk/kmeans" - "github.com/arjunsk/kmeans/moerr" + "github.com/arjunsk/kmeans/utils/moerr" "gonum.org/v1/gonum/mat" + "math" ) // L2Distance is used for L2Distance distance in Euclidean Kmeans. @@ -27,34 +28,34 @@ func L2Distance(v1, v2 *mat.VecDense) float64 { return mat.Norm(diff, 2) } -//// SphericalDistance is used for InnerProduct and CosineDistance in Spherical Kmeans. -//// NOTE: spherical distance between two points on a sphere is equal to the -//// angular distance between the two points, scaled by pi. -//// Refs: -//// https://en.wikipedia.org/wiki/Great-circle_distance#Vector_version -//func SphericalDistance(v1, v2 *mat.VecDense) float64 { -// // Compute the dot product of the two vectors. -// // The dot product of two vectors is a measure of their similarity, -// // and it can be used to calculate the angle between them. -// dp := mat.Dot(v1, v2) -// -// // Prevent NaN with acos with loss of precision. -// if dp > 1.0 { -// dp = 1.0 -// } else if dp < -1.0 { -// dp = -1.0 -// } -// -// theta := math.Acos(dp) -// -// //To scale the result to the range [0, 1], we divide by Pi. -// return theta / math.Pi -// -// // NOTE: -// // Cosine distance is a measure of the similarity between two vectors. [Not satisfy triangle inequality] -// // Angular distance is a measure of the angular separation between two points. [Satisfy triangle inequality] -// // Spherical distance is a measure of the spatial separation between two points on a sphere. [Satisfy triangle inequality] -//} +// SphericalDistance is used for InnerProduct and CosineDistance in Spherical Kmeans. +// NOTE: spherical distance between two points on a sphere is equal to the +// angular distance between the two points, scaled by pi. +// Refs: +// https://en.wikipedia.org/wiki/Great-circle_distance#Vector_version +func SphericalDistance(v1, v2 *mat.VecDense) float64 { + // Compute the dot product of the two vectors. + // The dot product of two vectors is a measure of their similarity, + // and it can be used to calculate the angle between them. + dp := mat.Dot(v1, v2) + + // Prevent NaN with acos with loss of precision. + if dp > 1.0 { + dp = 1.0 + } else if dp < -1.0 { + dp = -1.0 + } + + theta := math.Acos(dp) + + //To scale the result to the range [0, 1], we divide by Pi. + return theta / math.Pi + + // NOTE: + // Cosine distance is a measure of the similarity between two vectors. [Not satisfy triangle inequality] + // Angular distance is a measure of the angular separation between two points. [Satisfy triangle inequality] + // Spherical distance is a measure of the spatial separation between two points on a sphere. [Satisfy triangle inequality] +} // resolveDistanceFn returns the distance function corresponding to the distance type // Distance function should satisfy triangle inequality. @@ -66,8 +67,8 @@ func resolveDistanceFn(distType kmeans.DistanceType) (kmeans.DistanceFunction, e switch distType { case kmeans.L2Distance: distanceFunction = L2Distance - //case kmeans.InnerProduct, kmeans.CosineDistance: - // distanceFunction = SphericalDistance + case kmeans.InnerProduct, kmeans.CosineDistance: + distanceFunction = SphericalDistance default: return nil, moerr.NewInternalErrorNoCtx("invalid distance type") } diff --git a/elkans/distance_func_bench_test.go b/elkans/distance_func_bench_test.go index 6235c95..e644359 100644 --- a/elkans/distance_func_bench_test.go +++ b/elkans/distance_func_bench_test.go @@ -15,7 +15,7 @@ package elkans import ( - "github.com/arjunsk/kmeans/moarray" + moarray2 "github.com/arjunsk/kmeans/utils/moarray" "gonum.org/v1/gonum/mat" "math/rand" "testing" @@ -43,7 +43,7 @@ func Benchmark_L2Distance(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - _, _ = moarray.NormalizeL2[float64](v1[i]) + _, _ = moarray2.NormalizeL2[float64](v1[i]) } }) @@ -52,8 +52,8 @@ func Benchmark_L2Distance(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { - v21, _ := moarray.NormalizeL2[float64](v2[i]) - _ = L2Distance(v1[i], moarray.ToGonumVector(v21)) + v21, _ := moarray2.NormalizeL2[float64](v2[i]) + _ = L2Distance(v1[i], moarray2.ToGonumVector(v21)) } }) diff --git a/elkans/distance_func_test.go b/elkans/distance_func_test.go index 383830f..23f7ba2 100644 --- a/elkans/distance_func_test.go +++ b/elkans/distance_func_test.go @@ -15,7 +15,8 @@ package elkans import ( - "github.com/arjunsk/kmeans/moarray" + "github.com/arjunsk/kmeans/utils/assertx" + "github.com/arjunsk/kmeans/utils/moarray" "testing" ) @@ -79,91 +80,91 @@ func Test_L2Distance(t *testing.T) { } } -//func Test_AngularDistance(t *testing.T) { -// type args struct { -// v1 []float64 -// v2 []float64 -// } -// tests := []struct { -// name string -// args args -// want float64 -// }{ -// { -// name: "Test 1", -// args: args{ -// v1: []float64{1, 2, 3, 4}, -// v2: []float64{1, 2, 4, 5}, -// }, -// want: 0, -// }, -// { -// name: "Test 2", -// args: args{ -// v1: []float64{10, 20, 30, 40}, -// v2: []float64{10.5, 21.5, 31.5, 43.5}, -// }, -// want: 0, -// }, -// // Test 3: Triangle Inequality check on **un-normalized** vector -// // A(1,0),B(2,2), C(0,1) => AB + AC !>= BC => 0 + 0 !>= 0.5 -// { -// name: "Test 3.a", -// args: args{ -// v1: []float64{1, 0}, -// v2: []float64{2, 2}, -// }, -// want: 0, -// }, -// { -// name: "Test 3.b", -// args: args{ -// v1: []float64{2, 2}, -// v2: []float64{0, 1}, -// }, -// want: 0, -// }, -// { -// name: "Test 3.c", -// args: args{ -// v1: []float64{0, 1}, -// v2: []float64{1, 0}, -// }, -// want: 0.5, -// }, -// // Test 4: Triangle Inequality check on **normalized** vector -// // A(1,0),B(2,2), C(0,1) => AB + AC >= BC => 0.25 + 0.25 >= 0.5 -// //{ -// // name: "Test 4.a", -// // args: args{ -// // v1: moarray.NormalizeMoVecf64([]float64{1, 0}), -// // v2: moarray.NormalizeMoVecf64([]float64{2, 2}), -// // }, -// // want: 0.25000000000000006, -// //}, -// //{ -// // name: "Test 4.b", -// // args: args{ -// // v1: moarray.NormalizeMoVecf64([]float64{2, 2}), -// // v2: moarray.NormalizeMoVecf64([]float64{0, 1}), -// // }, -// // want: 0.25000000000000006, -// //}, -// //{ -// // name: "Test 4.c", -// // args: args{ -// // v1: moarray.NormalizeMoVecf64([]float64{0, 1}), -// // v2: moarray.NormalizeMoVecf64([]float64{1, 0}), -// // }, -// // want: 0.5, -// //}, -// } -// for _, tt := range tests { -// t.Run(tt.name, func(t *testing.T) { -// -// if got := SphericalDistance(moarray.ToGonumVector[float64](tt.args.v1), moarray.ToGonumVector[float64](tt.args.v2)); !assertx.InEpsilonF64(got, tt.want) { -// t.Errorf("SphericalDistance() = %v, want %v", got, tt.want) -// } -// }) -// } -//} +func Test_AngularDistance(t *testing.T) { + type args struct { + v1 []float64 + v2 []float64 + } + tests := []struct { + name string + args args + want float64 + }{ + { + name: "Test 1", + args: args{ + v1: []float64{1, 2, 3, 4}, + v2: []float64{1, 2, 4, 5}, + }, + want: 0, + }, + { + name: "Test 2", + args: args{ + v1: []float64{10, 20, 30, 40}, + v2: []float64{10.5, 21.5, 31.5, 43.5}, + }, + want: 0, + }, + // Test 3: Triangle Inequality check on **un-normalized** vector + // A(1,0),B(2,2), C(0,1) => AB + AC !>= BC => 0 + 0 !>= 0.5 + { + name: "Test 3.a", + args: args{ + v1: []float64{1, 0}, + v2: []float64{2, 2}, + }, + want: 0, + }, + { + name: "Test 3.b", + args: args{ + v1: []float64{2, 2}, + v2: []float64{0, 1}, + }, + want: 0, + }, + { + name: "Test 3.c", + args: args{ + v1: []float64{0, 1}, + v2: []float64{1, 0}, + }, + want: 0.5, + }, + ////Test 4: Triangle Inequality check on **normalized** vector + ////A(1,0),B(2,2), C(0,1) => AB + AC >= BC => 0.25 + 0.25 >= 0.5 + //{ + // name: "Test 4.a", + // args: args{ + // v1: moarray.NormalizeMoVecf64([]float64{1, 0}), + // v2: moarray.NormalizeMoVecf64([]float64{2, 2}), + // }, + // want: 0.25000000000000006, + //}, + //{ + // name: "Test 4.b", + // args: args{ + // v1: moarray.NormalizeMoVecf64([]float64{2, 2}), + // v2: moarray.NormalizeMoVecf64([]float64{0, 1}), + // }, + // want: 0.25000000000000006, + //}, + //{ + // name: "Test 4.c", + // args: args{ + // v1: moarray.NormalizeMoVecf64([]float64{0, 1}), + // v2: moarray.NormalizeMoVecf64([]float64{1, 0}), + // }, + // want: 0.5, + //}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if got := SphericalDistance(moarray.ToGonumVector[float64](tt.args.v1), moarray.ToGonumVector[float64](tt.args.v2)); !assertx.InEpsilonF64(got, tt.want) { + t.Errorf("SphericalDistance() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/elkans/initializer_test.go b/elkans/initializer_test.go index aa1ca2e..39310b0 100644 --- a/elkans/initializer_test.go +++ b/elkans/initializer_test.go @@ -15,7 +15,7 @@ package elkans import ( - "github.com/arjunsk/kmeans/moarray" + "github.com/arjunsk/kmeans/utils/moarray" "reflect" "testing" ) diff --git a/examples/sample1/main.go b/examples/sample1/main.go new file mode 100644 index 0000000..6299cff --- /dev/null +++ b/examples/sample1/main.go @@ -0,0 +1,40 @@ +package main + +import ( + "fmt" + "github.com/arjunsk/kmeans" + "github.com/arjunsk/kmeans/elkans" +) + +func main() { + vectorList := [][]float64{ + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {1, 2, 3, 4}, + {1, 2, 4, 5}, + {1, 2, 4, 5}, + {10, 2, 4, 5}, + {10, 3, 4, 5}, + {10, 5, 4, 5}, + {10, 2, 4, 5}, + {10, 3, 4, 5}, + {10, 5, 4, 5}, + } + + clusterer, err := elkans.NewKMeans(vectorList, 2, + 500, 0.5, + kmeans.L2Distance, kmeans.KmeansPlusPlus, false) + if err != nil { + panic(err) + } + + centroids, err := clusterer.Cluster() + if err != nil { + panic(err) + } + + for _, centroid := range centroids { + fmt.Println(centroid) + } +} diff --git a/moarray/external.go b/moarray/external.go deleted file mode 100644 index a8c8c0b..0000000 --- a/moarray/external.go +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright 2023 Matrix Origin -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package moarray - -import ( - "github.com/arjunsk/kmeans/moerr" - "golang.org/x/exp/constraints" - "gonum.org/v1/gonum/mat" - "math" -) - -// Compare returns an integer comparing two arrays/vectors lexicographically. -// TODO: this function might not be correct. we need to compare using tolerance for float values. -// TODO: need to check if we need len(v1)==len(v2) check. -func Compare[T constraints.Float](v1, v2 []T) int { - minLen := len(v1) - if len(v2) < minLen { - minLen = len(v2) - } - - for i := 0; i < minLen; i++ { - if v1[i] < v2[i] { - return -1 - } else if v1[i] > v2[i] { - return 1 - } - } - - if len(v1) < len(v2) { - return -1 - } else if len(v1) > len(v2) { - return 1 - } - return 0 -} - -/* ------------ [START] Performance critical functions. ------- */ - -func InnerProduct[T constraints.Float](v1, v2 []T) (float64, error) { - - vec, err := ToGonumVectors[T](v1, v2) - if err != nil { - return 0, err - } - - return mat.Dot(vec[0], vec[1]), nil -} - -func L2Distance[T constraints.Float](v1, v2 []T) (float64, error) { - vec, err := ToGonumVectors[T](v1, v2) - if err != nil { - return 0, err - } - - diff := mat.NewVecDense(vec[0].Len(), nil) - diff.SubVec(vec[0], vec[1]) - - return math.Sqrt(mat.Dot(diff, diff)), nil -} - -func CosineDistance[T constraints.Float](v1, v2 []T) (float64, error) { - cosineSimilarity, err := CosineSimilarity[T](v1, v2) - if err != nil { - return 0, err - } - - return 1 - cosineSimilarity, nil -} - -func CosineSimilarity[T constraints.Float](v1, v2 []T) (float64, error) { - - vec, err := ToGonumVectors[T](v1, v2) - if err != nil { - return 0, err - } - - dotProduct := mat.Dot(vec[0], vec[1]) - - normVec1 := mat.Norm(vec[0], 2) - normVec2 := mat.Norm(vec[1], 2) - - if normVec1 == 0 || normVec2 == 0 { - return 0, moerr.NewInternalErrorNoCtx("cosine_similarity: one of the vectors is zero") - } - - cosineSimilarity := dotProduct / (normVec1 * normVec2) - - // Handle precision issues. Clamp the cosine_similarity to the range [-1, 1]. - if cosineSimilarity > 1.0 { - cosineSimilarity = 1.0 - } else if cosineSimilarity < -1.0 { - cosineSimilarity = -1.0 - } - - // NOTE: Downcast the float64 cosine_similarity to float32 and check if it is - // 1.0 or -1.0 to avoid precision issue. - // - // Example for corner case: - // - cosine_similarity(a,a) = 1: - // - Without downcasting check, we get the following results: - // cosine_similarity( [0.46323407, 23.498016, 563.923, 56.076736, 8732.958] , - // [0.46323407, 23.498016, 563.923, 56.076736, 8732.958] ) = 0.9999999999999998 - // - With downcasting, we get the following results: - // cosine_similarity( [0.46323407, 23.498016, 563.923, 56.076736, 8732.958] , - // [0.46323407, 23.498016, 563.923, 56.076736, 8732.958] ) = 1 - // - // Reason: - // The reason for this check is - // 1. gonums mat.Dot, mat.Norm returns float64. In other databases, we mostly do float32 operations. - // 2. float64 operations are not exact. - // mysql> select 76586261.65813679/(8751.35770370157 *8751.35770370157); - //+-----------------------------------------------------------+ - //| 76586261.65813679 / (8751.35770370157 * 8751.35770370157) | - //+-----------------------------------------------------------+ - //| 1.000000000000 | - //+-----------------------------------------------------------+ - //mysql> select cast(76586261.65813679 as double)/(8751.35770370157 * 8751.35770370157); - //+---------------------------------------------------------------------------+ - //| cast(76586261.65813679 as double) / (8751.35770370157 * 8751.35770370157) | - //+---------------------------------------------------------------------------+ - //| 0.9999999999999996 | - //+---------------------------------------------------------------------------+ - // 3. We only need to handle the case for 1.0 and -1.0 with float32 precision. - // Rest of the cases can have float64 precision. - cosineSimilarityF32 := float32(cosineSimilarity) - if cosineSimilarityF32 == 1 { - cosineSimilarity = 1 - } else if cosineSimilarityF32 == -1 { - cosineSimilarity = -1 - } - - return cosineSimilarity, nil -} - -func NormalizeL2[T constraints.Float](v1 []T) ([]T, error) { - - vec := ToGonumVector[T](v1) - - norm := mat.Norm(vec, 2) - if norm == 0 { - // NOTE: don't throw error here. If you throw error, then when a zero vector comes in the Vector Index - // Mapping Query, the query will fail. Instead, return the same zero vector. - // This is consistent with FAISS:https://github.com/facebookresearch/faiss/blob/0716bde2500edb2e18509bf05f5dfa37bd698082/faiss/utils/distances.cpp#L97 - return v1, nil - } - - vec.ScaleVec(1/norm, vec) - - return ToMoArray[T](vec), nil -} diff --git a/moarray/external_test.go b/moarray/external_test.go deleted file mode 100644 index 1da8dc9..0000000 --- a/moarray/external_test.go +++ /dev/null @@ -1,918 +0,0 @@ -// Copyright 2023 Matrix Origin -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package moarray - -import ( - "github.com/matrixorigin/matrixone/pkg/common/assertx" - "reflect" - "testing" -) - -func TestAdd(t *testing.T) { - type args struct { - leftArgF32 []float32 - rightArgF32 []float32 - - leftArgF64 []float64 - rightArgF64 []float64 - } - type testCase struct { - name string - args args - wantF32 []float32 - wantF64 []float64 - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{leftArgF32: []float32{1, 2, 3}, rightArgF32: []float32{2, 3, 4}}, - wantF32: []float32{3, 5, 7}, - }, - { - name: "Test2 - float64", - args: args{leftArgF64: []float64{1, 2, 3}, rightArgF64: []float64{2, 3, 4}}, - wantF64: []float64{3, 5, 7}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.rightArgF32 != nil { - if gotRes, err := Add[float32](tt.args.leftArgF32, tt.args.rightArgF32); err != nil || !reflect.DeepEqual(gotRes, tt.wantF32) { - t.Errorf("Add() = %v, want %v", gotRes, tt.wantF32) - } - } - if tt.args.rightArgF64 != nil { - if gotRes, err := Add[float64](tt.args.leftArgF64, tt.args.rightArgF64); err != nil || !assertx.InEpsilonF64Slice(gotRes, tt.wantF64) { - t.Errorf("Add() = %v, want %v", gotRes, tt.wantF64) - } - } - }) - } -} - -func TestSubtract(t *testing.T) { - type args struct { - leftArgF32 []float32 - rightArgF32 []float32 - - leftArgF64 []float64 - rightArgF64 []float64 - } - type testCase struct { - name string - args args - wantF32 []float32 - wantF64 []float64 - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{leftArgF32: []float32{1, 2, 3}, rightArgF32: []float32{2, 3, 4}}, - wantF32: []float32{-1, -1, -1}, - }, - { - name: "Test2 - float64", - args: args{leftArgF64: []float64{1, 4, 3}, rightArgF64: []float64{1, 3, 4}}, - wantF64: []float64{0, 1, -1}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.rightArgF32 != nil { - if gotRes, err := Subtract[float32](tt.args.leftArgF32, tt.args.rightArgF32); err != nil || !reflect.DeepEqual(gotRes, tt.wantF32) { - t.Errorf("Subtract() = %v, want %v", gotRes, tt.wantF32) - } - } - if tt.args.rightArgF64 != nil { - if gotRes, err := Subtract[float64](tt.args.leftArgF64, tt.args.rightArgF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { - t.Errorf("Subtract() = %v, want %v", gotRes, tt.wantF64) - } - } - }) - } -} - -func TestMultiply(t *testing.T) { - type args struct { - leftArgF32 []float32 - rightArgF32 []float32 - - leftArgF64 []float64 - rightArgF64 []float64 - } - type testCase struct { - name string - args args - wantF32 []float32 - wantF64 []float64 - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{leftArgF32: []float32{1, 2, 3}, rightArgF32: []float32{2, 3, 4}}, - wantF32: []float32{2, 6, 12}, - }, - { - name: "Test2 - float64", - args: args{leftArgF64: []float64{1, 4, 3}, rightArgF64: []float64{1, 3, 4}}, - wantF64: []float64{1, 12, 12}, - }, - { - name: "Test3 - float64", - args: args{leftArgF64: []float64{0.66616553}, rightArgF64: []float64{0.66616553}}, - wantF64: []float64{0.4437765133601809}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.rightArgF32 != nil { - if gotRes, err := Multiply[float32](tt.args.leftArgF32, tt.args.rightArgF32); err != nil || !reflect.DeepEqual(tt.wantF32, gotRes) { - t.Errorf("Multiply() = %v, want %v", gotRes, tt.wantF32) - } - } - if tt.args.rightArgF64 != nil { - if gotRes, err := Multiply[float64](tt.args.leftArgF64, tt.args.rightArgF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { - t.Errorf("Multiply() = %v, want %v", gotRes, tt.wantF64) - } - } - }) - } -} - -func TestDivide(t *testing.T) { - type args struct { - leftArgF32 []float32 - rightArgF32 []float32 - - leftArgF64 []float64 - rightArgF64 []float64 - } - type testCase struct { - name string - args args - wantF32 []float32 - wantF64 []float64 - wantErr bool - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{leftArgF32: []float32{1, 2, 3}, rightArgF32: []float32{2, 3, 4}}, - wantF32: []float32{0.5, 0.6666667, 0.75}, - }, - { - name: "Test2 - float32 - div by zero", - args: args{leftArgF32: []float32{1, 4, 3}, rightArgF32: []float32{1, 0, 4}}, - wantErr: true, - }, - { - name: "Test3 - float64", - args: args{leftArgF64: []float64{1, 4, 3}, rightArgF64: []float64{1, 3, 4}}, - wantF64: []float64{1, 1.3333333333333333, 0.75}, - }, - { - name: "Test4 - float64 - div by zero", - args: args{leftArgF64: []float64{1, 4, 3}, rightArgF64: []float64{1, 0, 4}}, - wantErr: true, - }, - { - name: "Test5 - float64 - dimension mismatch", - args: args{leftArgF64: []float64{1, 4}, rightArgF64: []float64{1, 1, 4}}, - wantErr: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.rightArgF32 != nil { - if tt.wantErr { - if _, gotErr := Divide[float32](tt.args.leftArgF32, tt.args.rightArgF32); gotErr == nil { - t.Errorf("Divide() should throw error") - } - } else if gotRes, err := Divide[float32](tt.args.leftArgF32, tt.args.rightArgF32); err != nil || !reflect.DeepEqual(gotRes, tt.wantF32) { - t.Errorf("Divide() = %v, want %v", gotRes, tt.wantF32) - } - } - if tt.args.rightArgF64 != nil { - if tt.wantErr { - if _, gotErr := Divide[float64](tt.args.leftArgF64, tt.args.rightArgF64); gotErr == nil { - t.Errorf("Divide() should throw error") - } - } else if gotRes, err := Divide[float64](tt.args.leftArgF64, tt.args.rightArgF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { - t.Errorf("Divide() = %v, want %v", gotRes, tt.wantF64) - } - } - }) - } -} - -func TestCompare(t *testing.T) { - type args struct { - leftArgF32 []float32 - rightArgF32 []float32 - - leftArgF64 []float64 - rightArgF64 []float64 - } - type testCase struct { - name string - args args - want int - } - tests := []testCase{ - { - name: "Test1 - float32-less", - args: args{leftArgF32: []float32{1, 2, 3}, rightArgF32: []float32{2, 3, 4}}, - want: -1, - }, - { - name: "Test2 - float32-large", - args: args{leftArgF32: []float32{3, 2, 3}, rightArgF32: []float32{2, 3, 4}}, - want: 1, - }, - { - name: "Test3 - float32-equal", - args: args{leftArgF32: []float32{3, 2, 3}, rightArgF32: []float32{3, 2, 3}}, - want: 0, - }, - { - name: "Test4 - float64-less", - args: args{leftArgF64: []float64{1, 2, 3}, rightArgF64: []float64{2, 3, 4}}, - want: -1, - }, - { - name: "Test5 - float64-large", - args: args{leftArgF64: []float64{3, 2, 3}, rightArgF64: []float64{2, 3, 4}}, - want: 1, - }, - { - name: "Test6 - float64-equal", - args: args{leftArgF64: []float64{3, 2, 3}, rightArgF64: []float64{3, 2, 3}}, - want: 0, - }, - { - name: "Test7 - float64 difference dims", - args: args{leftArgF64: []float64{3, 2}, rightArgF64: []float64{3, 2, 3}}, - want: -1, - }, - { - name: "Test7 - float64 difference dims", - args: args{leftArgF64: []float64{3, 2, 3}, rightArgF64: []float64{3, 2}}, - want: 1, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.rightArgF32 != nil { - if gotRes := Compare[float32](tt.args.leftArgF32, tt.args.rightArgF32); !reflect.DeepEqual(gotRes, tt.want) { - t.Errorf("CompareArray() = %v, want %v", gotRes, tt.want) - } - } - if tt.args.rightArgF64 != nil { - if gotRes := Compare[float64](tt.args.leftArgF64, tt.args.rightArgF64); !reflect.DeepEqual(gotRes, tt.want) { - t.Errorf("CompareArray() = %v, want %v", gotRes, tt.want) - } - } - - }) - } -} - -func TestCast(t *testing.T) { - type args struct { - argF32 []float32 - argF64 []float64 - } - type testCase struct { - name string - args args - wantF32 []float32 - wantF64 []float64 - } - tests := []testCase{ - { - name: "Test1 - float32 to float64", - args: args{argF32: []float32{1, 2, 3}}, - wantF64: []float64{1, 2, 3}, - }, - { - name: "Test2 - float64 to float32", - args: args{argF64: []float64{1, 4, 3}}, - wantF32: []float32{1, 4, 3}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.argF32 != nil && tt.wantF64 != nil { - if gotResF64, err := Cast[float32, float64](tt.args.argF32); err != nil || !assertx.InEpsilonF64Slice(gotResF64, tt.wantF64) { - t.Errorf("Cast() = %v, want %v", gotResF64, tt.wantF64) - } - } - if tt.args.argF64 != nil && tt.wantF32 != nil { - if gotResF32, err := Cast[float64, float32](tt.args.argF64); err != nil || !reflect.DeepEqual(gotResF32, tt.wantF32) { - t.Errorf("Cast() = %v, want %v", gotResF32, tt.wantF32) - } - } - }) - } -} - -func TestAbs(t *testing.T) { - type args struct { - argF32 []float32 - argF64 []float64 - } - type testCase struct { - name string - args args - - wantF32 []float32 - wantF64 []float64 - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{argF32: []float32{-1, 0, -3.4e+38}}, - wantF32: []float32{1, 0, 3.4e+38}, - }, - { - name: "Test2 - float64", - args: args{argF64: []float64{-1, 0, 3}}, - wantF64: []float64{1, 0, 3}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.argF32 != nil { - if gotRes, err := Abs[float32](tt.args.argF32); err != nil || !reflect.DeepEqual(gotRes, tt.wantF32) { - t.Errorf("Abs() = %v, want %v", gotRes, tt.wantF32) - } - } - if tt.args.argF64 != nil { - if gotRes, err := Abs[float64](tt.args.argF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { - t.Errorf("Abs() = %v, want %v", gotRes, tt.wantF64) - } - } - }) - } -} - -func TestNormalizeL2(t *testing.T) { - type args struct { - argF32 []float32 - argF64 []float64 - } - type testCase struct { - name string - args args - - wantF32 []float32 - wantF64 []float64 - wantErr bool - } - tests := []testCase{ - { - name: "Test1 - float32 - zero vector", - args: args{argF32: []float32{0, 0, 0}}, - wantF32: []float32{0, 0, 0}, - }, - { - name: "Test1.b - float32", - args: args{argF32: []float32{1, 2, 3}}, - wantF32: []float32{0.26726124, 0.5345225, 0.80178374}, - }, - { - name: "Test1.c - float32", - args: args{argF32: []float32{10, 3.333333333333333, 4, 5}}, - wantF32: []float32{0.8108108, 0.27027026, 0.32432434, 0.4054054}, - }, - { - name: "Test2 - float64 - zero vector", - args: args{argF64: []float64{0, 0, 0}}, - wantF64: []float64{0, 0, 0}, - }, - { - name: "Test3 - float64", - args: args{argF64: []float64{1, 2, 3}}, - wantF64: []float64{0.2672612419124244, 0.5345224838248488, 0.8017837257372732}, - }, - { - name: "Test4 - float64", - args: args{argF64: []float64{-1, 2, 3}}, - wantF64: []float64{-0.2672612419124244, 0.5345224838248488, 0.8017837257372732}, - }, - { - name: "Test5 - float64", - args: args{argF64: []float64{10, 3.333333333333333, 4, 5}}, - wantF64: []float64{0.8108108108108107, 0.27027027027027023, 0.3243243243243243, 0.4054054054054054}, - }, - { - name: "Test6 - float64", - args: args{argF64: []float64{1, 2, 3.6666666666666665, 4.666666666666666}}, - wantF64: []float64{0.15767649936829103, 0.31535299873658207, 0.5781471643504004, 0.7358236637186913}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.argF32 != nil { - if tt.wantErr { - if _, err := NormalizeL2[float32](tt.args.argF32); err == nil { - t.Errorf("NormalizeL2() should throw error") - } - } else if gotRes, err := NormalizeL2[float32](tt.args.argF32); err != nil || !reflect.DeepEqual(tt.wantF32, gotRes) { - t.Errorf("NormalizeL2() = %v, want %v", gotRes, tt.wantF32) - } - } - if tt.args.argF64 != nil { - if tt.wantErr { - if _, err := NormalizeL2[float64](tt.args.argF64); err == nil { - t.Errorf("NormalizeL2() should throw error") - } - } else if gotRes, err := NormalizeL2[float64](tt.args.argF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { - t.Errorf("NormalizeL2() = %v, want %v", gotRes, tt.wantF64) - } - } - }) - } -} - -func TestSqrt(t *testing.T) { - type args struct { - argF32 []float32 - argF64 []float64 - } - type testCase struct { - name string - args args - - wantF32 []float64 // ie result for argF32 is []float32 - wantF64 []float64 // ie result for argF64 is []float64 - wantErr bool - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{argF32: []float32{1, 0, 4}}, - wantF32: []float64{1, 0, 2}, - }, - { - name: "Test2 - float32 error case", - args: args{argF32: []float32{-1, 0, 4}}, - wantErr: true, - }, - { - name: "Test3 - float64", - args: args{argF64: []float64{1, 0, 4}}, - wantF64: []float64{1, 0, 2}, - }, - { - name: "Test4 - float64 error case", - args: args{argF64: []float64{-1, 0, 4}}, - wantErr: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.argF32 != nil { - if tt.wantErr { - if _, err := Sqrt[float32](tt.args.argF32); err == nil { - t.Errorf("Sqrt() should throw error") - } - } else if gotRes, err := Sqrt[float32](tt.args.argF32); err != nil || !reflect.DeepEqual(gotRes, tt.wantF32) { - t.Errorf("Sqrt() = %v, want %v", err, tt.wantErr) - t.Errorf("Sqrt() = %v, want %v", gotRes, tt.wantF32) - } - } - if tt.args.argF64 != nil { - if tt.wantErr { - if _, err := Sqrt[float64](tt.args.argF64); err == nil { - t.Errorf("Sqrt() should throw error") - } - } else if gotRes, err := Sqrt[float64](tt.args.argF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { - t.Errorf("Sqrt() = %v, want %v", gotRes, tt.wantF64) - } - } - - }) - } -} - -func TestSummation(t *testing.T) { - type args struct { - argF32 []float32 - argF64 []float64 - } - type testCase struct { - name string - args args - want float64 - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{argF32: []float32{1, 2, 3}}, - want: 6, - }, - { - name: "Test2 - float64", - args: args{argF64: []float64{1, 2, 3}}, - want: 6, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.argF32 != nil { - if gotRes, err := Summation[float32](tt.args.argF32); err != nil || !assertx.InEpsilonF64(tt.want, gotRes) { - t.Errorf("Summation() = %v, want %v", gotRes, tt.want) - } - } - if tt.args.argF64 != nil { - if gotRes, err := Summation[float64](tt.args.argF64); err != nil || !assertx.InEpsilonF64(tt.want, gotRes) { - t.Errorf("Summation() = %v, want %v", gotRes, tt.want) - } - } - - }) - } -} - -func TestInnerProduct(t *testing.T) { - type args struct { - argLeftF32 []float32 - argRightF32 []float32 - - argLeftF64 []float64 - argRightF64 []float64 - } - type testCase struct { - name string - args args - want float64 - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{argLeftF32: []float32{1, 2, 3}, argRightF32: []float32{1, 2, 3}}, - want: 14, - }, - { - name: "Test2 - float64", - args: args{argLeftF64: []float64{1, 2, 3}, argRightF64: []float64{1, 2, 3}}, - want: 14, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.argLeftF32 != nil { - if gotRes, _ := InnerProduct[float32](tt.args.argLeftF32, tt.args.argRightF32); !assertx.InEpsilonF64(tt.want, gotRes) { - t.Errorf("InnerProduct() = %v, want %v", gotRes, tt.want) - } - } - if tt.args.argLeftF64 != nil { - if gotRes, _ := InnerProduct[float64](tt.args.argLeftF64, tt.args.argRightF64); !assertx.InEpsilonF64(tt.want, gotRes) { - t.Errorf("InnerProduct() = %v, want %v", gotRes, tt.want) - } - } - - }) - } -} - -func TestL1Norm(t *testing.T) { - type args struct { - argF32 []float32 - argF64 []float64 - } - type testCase struct { - name string - args args - want float64 - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{argF32: []float32{1, 2, 3}}, - want: 6, - }, - { - name: "Test2 - float64", - args: args{argF64: []float64{1, 2, 3}}, - want: 6, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.argF32 != nil { - if gotRes, _ := L1Norm[float32](tt.args.argF32); !assertx.InEpsilonF64(tt.want, gotRes) { - t.Errorf("L1Norm() = %v, want %v", gotRes, tt.want) - } - } - if tt.args.argF64 != nil { - if gotRes, _ := L1Norm[float64](tt.args.argF64); !assertx.InEpsilonF64(tt.want, gotRes) { - t.Errorf("L1Norm() = %v, want %v", gotRes, tt.want) - } - } - - }) - } -} - -func TestL2Norm(t *testing.T) { - type args struct { - argF32 []float32 - argF64 []float64 - } - type testCase struct { - name string - args args - want float64 - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{argF32: []float32{1, 2, 3}}, - want: 3.741657386773941, - }, - { - name: "Test2 - float64", - args: args{argF64: []float64{1, 2, 3}}, - want: 3.741657386773941, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.argF32 != nil { - if gotRes, _ := L2Norm[float32](tt.args.argF32); !assertx.InEpsilonF64(tt.want, gotRes) { - t.Errorf("L2Norm() = %v, want %v", gotRes, tt.want) - } - } - if tt.args.argF64 != nil { - if gotRes, _ := L2Norm[float64](tt.args.argF64); !assertx.InEpsilonF64(tt.want, gotRes) { - t.Errorf("L2Norm() = %v, want %v", gotRes, tt.want) - } - } - - }) - } -} - -func TestCosineSimilarity(t *testing.T) { - type args struct { - argLeftF32 []float32 - argRightF32 []float32 - - argLeftF64 []float64 - argRightF64 []float64 - } - type testCase struct { - name string - args args - want float64 - } - tests := []testCase{ - { - name: "Test1.a - float32", - args: args{argLeftF32: []float32{1, 2, 3}, argRightF32: []float32{1, 2, 3}}, - want: 1, - }, - { - name: "Test1.b - float32", - args: args{argLeftF32: []float32{0.46323407, 23.498016, 563.923, 56.076736, 8732.958}, argRightF32: []float32{0.46323407, 23.498016, 563.923, 56.076736, 8732.958}}, - want: 1, - }, - { - name: "Test2.a - float64", - args: args{argLeftF64: []float64{1, 2, 3}, argRightF64: []float64{1, 2, 3}}, - want: 1, - }, - { - name: "Test2.b - float64", - args: args{argLeftF64: []float64{0.46323407, 23.498016, 563.923, 56.076736, 8732.958}, argRightF64: []float64{0.46323407, 23.498016, 563.923, 56.076736, 8732.958}}, - want: 1, - }, - { - name: "Test2.c - float64", - args: args{argLeftF64: []float64{0.8166459, 0.66616553, 0.4886152}, argRightF64: []float64{0.8166459, 0.66616553, 0.4886152}}, - want: 1, - }, - { - name: "Test2.d - float64", - args: args{argLeftF64: []float64{8.5606893, 6.7903588, 821.977768}, argRightF64: []float64{8.5606893, 6.7903588, 821.977768}}, - want: 1, - }, - { - name: "Test2.e - float64", - args: args{argLeftF64: []float64{0.9260021, 0.26637346, 0.06567037}, argRightF64: []float64{0.9260021, 0.26637346, 0.06567037}}, - want: 1, - }, - { - name: "Test2.f - float64", - args: args{argLeftF64: []float64{0.45756745, 65.2996871, 321.623636, 3.60082066, 87.58445764}, argRightF64: []float64{0.45756745, 65.2996871, 321.623636, 3.60082066, 87.58445764}}, - want: 1, - }, - { - name: "Test2.g - float64", - args: args{argLeftF64: []float64{0.46323407, 23.49801546, 563.9229458, 56.07673508, 8732.9583881}, argRightF64: []float64{0.46323407, 23.49801546, 563.9229458, 56.07673508, 8732.9583881}}, - want: 1, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.argLeftF32 != nil { - if gotRes, _ := CosineSimilarity[float32](tt.args.argLeftF32, tt.args.argRightF32); !reflect.DeepEqual(tt.want, gotRes) { - t.Errorf("CosineSimilarity() = %v, want %v", gotRes, tt.want) - } - } - if tt.args.argLeftF64 != nil { - if gotRes, _ := CosineSimilarity[float64](tt.args.argLeftF64, tt.args.argRightF64); !reflect.DeepEqual(tt.want, gotRes) { - t.Errorf("CosineSimilarity() = %v, want %v", gotRes, tt.want) - } - } - - }) - } -} - -func TestL2Distance(t *testing.T) { - type args struct { - argLeftF32 []float32 - argRightF32 []float32 - - argLeftF64 []float64 - argRightF64 []float64 - } - type testCase struct { - name string - args args - want float64 - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{argLeftF32: []float32{1, 2, 3}, argRightF32: []float32{10, 20, 30}}, - want: 33.67491648096547, - }, - { - name: "Test2 - float64", - args: args{argLeftF64: []float64{1, 2, 3}, argRightF64: []float64{10, 20, 30}}, - want: 33.67491648096547, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.argLeftF32 != nil { - if gotRes, _ := L2Distance[float32](tt.args.argLeftF32, tt.args.argRightF32); !assertx.InEpsilonF64(tt.want, gotRes) { - t.Errorf("L2Distance() = %v, want %v", gotRes, tt.want) - } - } - if tt.args.argLeftF64 != nil { - if gotRes, _ := L2Distance[float64](tt.args.argLeftF64, tt.args.argRightF64); !assertx.InEpsilonF64(tt.want, gotRes) { - t.Errorf("L2Distance() = %v, want %v", gotRes, tt.want) - } - } - - }) - } -} - -func TestCosineDistance(t *testing.T) { - type args struct { - argLeftF32 []float32 - argRightF32 []float32 - - argLeftF64 []float64 - argRightF64 []float64 - } - type testCase struct { - name string - args args - want float64 - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{argLeftF32: []float32{1, 2, 3}, argRightF32: []float32{-1, -2, -3}}, - want: 2, - }, - { - name: "Test2 - float64", - args: args{argLeftF64: []float64{1, 2, 3}, argRightF64: []float64{1, 2, 3}}, - want: 0, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.argLeftF32 != nil { - if gotRes, _ := CosineDistance[float32](tt.args.argLeftF32, tt.args.argRightF32); !assertx.InEpsilonF64(tt.want, gotRes) { - t.Errorf("CosineDistance() = %v, want %v", gotRes, tt.want) - } - } - if tt.args.argLeftF64 != nil { - if gotRes, _ := CosineDistance[float64](tt.args.argLeftF64, tt.args.argRightF64); !assertx.InEpsilonF64(tt.want, gotRes) { - t.Errorf("CosineDistance() = %v, want %v", gotRes, tt.want) - } - } - - }) - } -} - -func TestScalarOp(t *testing.T) { - type args struct { - argVecF32 []float32 - argVecF64 []float64 - argOp string - argSca float64 - } - type testCase struct { - name string - args args - wantVecF32 []float32 - wantVecF64 []float64 - } - tests := []testCase{ - { - name: "Test1 - float32", - args: args{argVecF32: []float32{1, 2, 3}, argOp: "+", argSca: 2}, - wantVecF32: []float32{3, 4, 5}, - }, - { - name: "Test2 - float32", - args: args{argVecF32: []float32{1, 2, 3}, argOp: "-", argSca: 2}, - wantVecF32: []float32{-1, 0, 1}, - }, - { - name: "Test3 - float32", - args: args{argVecF32: []float32{1, 2, 3}, argOp: "*", argSca: 2}, - wantVecF32: []float32{2, 4, 6}, - }, - { - name: "Test4 - float32", - args: args{argVecF32: []float32{1, 2, 3}, argOp: "/", argSca: 2}, - wantVecF32: []float32{0.5, 1, 1.5}, - }, - - { - name: "Test5 - float64", - args: args{argVecF64: []float64{1, 2, 3}, argOp: "+", argSca: 2}, - wantVecF64: []float64{3, 4, 5}, - }, - { - name: "Test6 - float64", - args: args{argVecF64: []float64{1, 2, 3}, argOp: "-", argSca: 2}, - wantVecF64: []float64{-1, 0, 1}, - }, - { - name: "Test7 - float64", - args: args{argVecF64: []float64{1, 2, 3}, argOp: "*", argSca: 2}, - wantVecF64: []float64{2, 4, 6}, - }, - { - name: "Test8 - float64", - args: args{argVecF64: []float64{1, 2, 3}, argOp: "/", argSca: 2}, - wantVecF64: []float64{0.5, 1, 1.5}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - - if tt.args.argVecF32 != nil { - if gotRes, _ := ScalarOp[float32](tt.args.argVecF32, tt.args.argOp, tt.args.argSca); !reflect.DeepEqual(gotRes, tt.wantVecF32) { - t.Errorf("ScalarOp() = %v, want %v", gotRes, tt.wantVecF32) - } - } else if tt.args.argVecF64 != nil { - if gotRes, _ := ScalarOp[float64](tt.args.argVecF64, tt.args.argOp, tt.args.argSca); !reflect.DeepEqual(gotRes, tt.wantVecF64) { - t.Errorf("ScalarOp() = %v, want %v", gotRes, tt.wantVecF64) - } - } - - }) - } -} diff --git a/types.go b/types.go index c7dc9d7..49fbf67 100644 --- a/types.go +++ b/types.go @@ -28,8 +28,8 @@ type DistanceType uint16 const ( L2Distance DistanceType = iota - //InnerProduct - //CosineDistance + InnerProduct + CosineDistance ) type InitType uint16 diff --git a/assertx/float64.go b/utils/assertx/float64.go similarity index 100% rename from assertx/float64.go rename to utils/assertx/float64.go diff --git a/assertx/float64_test.go b/utils/assertx/float64_test.go similarity index 100% rename from assertx/float64_test.go rename to utils/assertx/float64_test.go diff --git a/utils/moarray/external.go b/utils/moarray/external.go new file mode 100644 index 0000000..ddb4050 --- /dev/null +++ b/utils/moarray/external.go @@ -0,0 +1,37 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package moarray + +import ( + "golang.org/x/exp/constraints" + "gonum.org/v1/gonum/mat" +) + +func NormalizeL2[T constraints.Float](v1 []T) ([]T, error) { + + vec := ToGonumVector[T](v1) + + norm := mat.Norm(vec, 2) + if norm == 0 { + // NOTE: don't throw error here. If you throw error, then when a zero vector comes in the Vector Index + // Mapping Query, the query will fail. Instead, return the same zero vector. + // This is consistent with FAISS:https://github.com/facebookresearch/faiss/blob/0716bde2500edb2e18509bf05f5dfa37bd698082/faiss/utils/distances.cpp#L97 + return v1, nil + } + + vec.ScaleVec(1/norm, vec) + + return ToMoArray[T](vec), nil +} diff --git a/utils/moarray/external_test.go b/utils/moarray/external_test.go new file mode 100644 index 0000000..3692a3c --- /dev/null +++ b/utils/moarray/external_test.go @@ -0,0 +1,101 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package moarray + +import ( + "github.com/arjunsk/kmeans/utils/assertx" + "reflect" + "testing" +) + +func TestNormalizeL2(t *testing.T) { + type args struct { + argF32 []float32 + argF64 []float64 + } + type testCase struct { + name string + args args + + wantF32 []float32 + wantF64 []float64 + wantErr bool + } + tests := []testCase{ + { + name: "Test1 - float32 - zero vector", + args: args{argF32: []float32{0, 0, 0}}, + wantF32: []float32{0, 0, 0}, + }, + { + name: "Test1.b - float32", + args: args{argF32: []float32{1, 2, 3}}, + wantF32: []float32{0.26726124, 0.5345225, 0.80178374}, + }, + { + name: "Test1.c - float32", + args: args{argF32: []float32{10, 3.333333333333333, 4, 5}}, + wantF32: []float32{0.8108108, 0.27027026, 0.32432434, 0.4054054}, + }, + { + name: "Test2 - float64 - zero vector", + args: args{argF64: []float64{0, 0, 0}}, + wantF64: []float64{0, 0, 0}, + }, + { + name: "Test3 - float64", + args: args{argF64: []float64{1, 2, 3}}, + wantF64: []float64{0.2672612419124244, 0.5345224838248488, 0.8017837257372732}, + }, + { + name: "Test4 - float64", + args: args{argF64: []float64{-1, 2, 3}}, + wantF64: []float64{-0.2672612419124244, 0.5345224838248488, 0.8017837257372732}, + }, + { + name: "Test5 - float64", + args: args{argF64: []float64{10, 3.333333333333333, 4, 5}}, + wantF64: []float64{0.8108108108108107, 0.27027027027027023, 0.3243243243243243, 0.4054054054054054}, + }, + { + name: "Test6 - float64", + args: args{argF64: []float64{1, 2, 3.6666666666666665, 4.666666666666666}}, + wantF64: []float64{0.15767649936829103, 0.31535299873658207, 0.5781471643504004, 0.7358236637186913}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if tt.args.argF32 != nil { + if tt.wantErr { + if _, err := NormalizeL2[float32](tt.args.argF32); err == nil { + t.Errorf("NormalizeL2() should throw error") + } + } else if gotRes, err := NormalizeL2[float32](tt.args.argF32); err != nil || !reflect.DeepEqual(tt.wantF32, gotRes) { + t.Errorf("NormalizeL2() = %v, want %v", gotRes, tt.wantF32) + } + } + if tt.args.argF64 != nil { + if tt.wantErr { + if _, err := NormalizeL2[float64](tt.args.argF64); err == nil { + t.Errorf("NormalizeL2() should throw error") + } + } else if gotRes, err := NormalizeL2[float64](tt.args.argF64); err != nil || !assertx.InEpsilonF64Slice(tt.wantF64, gotRes) { + t.Errorf("NormalizeL2() = %v, want %v", gotRes, tt.wantF64) + } + } + }) + } +} diff --git a/moarray/gonums_utils.go b/utils/moarray/gonums_utils.go similarity index 97% rename from moarray/gonums_utils.go rename to utils/moarray/gonums_utils.go index 6b85a8f..ae9d6f2 100644 --- a/moarray/gonums_utils.go +++ b/utils/moarray/gonums_utils.go @@ -15,7 +15,7 @@ package moarray import ( - "github.com/arjunsk/kmeans/moerr" + "github.com/arjunsk/kmeans/utils/moerr" "golang.org/x/exp/constraints" "gonum.org/v1/gonum/mat" ) diff --git a/moarray/gonums_utils_test.go b/utils/moarray/gonums_utils_test.go similarity index 100% rename from moarray/gonums_utils_test.go rename to utils/moarray/gonums_utils_test.go diff --git a/moarray/internal.go b/utils/moarray/internal.go similarity index 85% rename from moarray/internal.go rename to utils/moarray/internal.go index fad5099..803faad 100644 --- a/moarray/internal.go +++ b/utils/moarray/internal.go @@ -34,9 +34,9 @@ func NormalizeGonumVectors(vectors []*mat.VecDense) { } } -//// NormalizeMoVecf64 is used only in test functions. -//func NormalizeMoVecf64(vector []float64) []float64 { -// res := ToGonumVector[float64](vector) -// //NormalizeGonumVector(res) -// return ToMoArray[float64](res) -//} +// NormalizeMoVecf64 is used only in test functions. +func NormalizeMoVecf64(vector []float64) []float64 { + res := ToGonumVector[float64](vector) + //NormalizeGonumVector(res) + return ToMoArray[float64](res) +} diff --git a/moarray/internal_test.go b/utils/moarray/internal_test.go similarity index 100% rename from moarray/internal_test.go rename to utils/moarray/internal_test.go diff --git a/moerr/error.go b/utils/moerr/error.go similarity index 82% rename from moerr/error.go rename to utils/moerr/error.go index 70a08ab..02f9d28 100644 --- a/moerr/error.go +++ b/utils/moerr/error.go @@ -10,10 +10,6 @@ func NewInternalErrorNoCtx(msg string, args ...any) error { return errors.New(xmsg) } -func NewDivByZeroNoCtx() error { - return errors.New("division by zero") -} - func NewArrayInvalidOpNoCtx(expected, actual int) error { xmsg := fmt.Sprintf("vector ops between different dimensions (%v, %v) is not permitted.", expected, actual) return errors.New(xmsg)