From d7b4841c42660c098ab860762fbce2e3706b44b0 Mon Sep 17 00:00:00 2001 From: NicklasXYZ Date: Sun, 17 Mar 2024 00:03:50 +0100 Subject: [PATCH 1/4] Add set similarity measures --- src/gleam_community/maths/metrics.gleam | 129 +++++++++++++++++- test/gleam_community/maths/metrics_test.gleam | 22 +++ 2 files changed, 147 insertions(+), 4 deletions(-) diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index e35e336..cd9bfa6 100644 --- a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -30,6 +30,9 @@ //// * [`manhatten_distance`](#float_manhatten_distance) //// * [`minkowski_distance`](#minkowski_distance) //// * [`euclidean_distance`](#euclidean_distance) +//// * [`jaccard_index`](#jaccard_index) +//// * [`sorensen_dice_coefficient`](#sorensen_dice_coefficient) +//// * [`tversky_index`](#tversky_index) //// * **Basic statistical measures** //// * [`mean`](#mean) //// * [`median`](#median) @@ -44,6 +47,7 @@ import gleam_community/maths/predicates import gleam_community/maths/conversion import gleam/list import gleam/pair +import gleam/set import gleam/float ///
@@ -292,7 +296,7 @@ pub fn euclidean_distance( } ///
-/// +/// /// Spot a typo? Open an issue! /// ///
@@ -347,7 +351,7 @@ pub fn mean(arr: List(Float)) -> Result(Float, String) { } ///
-/// +/// /// Spot a typo? Open an issue! /// ///
@@ -414,7 +418,7 @@ pub fn median(arr: List(Float)) -> Result(Float, String) { } ///
-/// +/// /// Spot a typo? Open an issue! /// ///
@@ -490,7 +494,7 @@ pub fn variance(arr: List(Float), ddof: Int) -> Result(Float, String) { } ///
-/// +/// /// Spot a typo? Open an issue! /// ///
@@ -555,3 +559,120 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) } } } + +///
+/// +/// Spot a typo? Open an issue! +/// +///
+/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// } +///
+/// +///
+/// +/// Back to top ↑ +/// +///
+/// +pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float { + let assert Ok(result) = tversky_index(aset, bset, 1.0, 1.0) + result +} + +///
+/// +/// Spot a typo? Open an issue! +/// +///
+/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// } +///
+/// +///
+/// +/// Back to top ↑ +/// +///
+/// +pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { + let assert Ok(result) = tversky_index(aset, bset, 0.5, 0.5) + result +} + +///
+/// +/// Spot a typo? Open an issue! +/// +///
+/// +/// The Tversky index is a generalization of the Sørensen–Dice coefficient and the Jaccard index. +/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// } +///
+/// +///
+/// +/// Back to top ↑ +/// +///
+/// +pub fn tversky_index( + aset: set.Set(a), + bset: set.Set(a), + alpha: Float, + beta: Float, +) -> Result(Float, String) { + case alpha >=. 0.0, beta >=. 0.0 { + True, True -> { + let intersection: Float = + set.intersection(aset, bset) + |> set.size() + |> conversion.int_to_float() + let difference1: Float = + set.difference(aset, bset) + |> set.size() + |> conversion.int_to_float() + let difference2: Float = + set.difference(bset, aset) + |> set.size() + |> conversion.int_to_float() + intersection + /. { intersection +. alpha *. difference1 +. beta *. difference2 } + |> Ok + } + False, True -> { + "Invalid input argument: alpha < 0. Valid input is alpha >= 0." + |> Error + } + True, False -> { + "Invalid input argument: beta < 0. Valid input is beta >= 0." + |> Error + } + _, _ -> { + "Invalid input argument: alpha < 0 and beta < 0. Valid input is alpha >= 0 and beta >= 0." + |> Error + } + } +} diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index 8e407e6..cbd8d5e 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -2,6 +2,7 @@ import gleam_community/maths/elementary import gleam_community/maths/metrics import gleam_community/maths/predicates import gleeunit/should +import gleam/set pub fn float_list_norm_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) @@ -212,3 +213,24 @@ pub fn example_standard_deviation_test() { |> metrics.standard_deviation(ddof) |> should.equal(Ok(1.0)) } + +pub fn example_jaccard_index_test() { + metrics.jaccard_index(set.from_list([]), set.from_list([])) + |> should.equal(0.0) + + let set_a: set.Set(Int) = set.from_list([0, 1, 2, 5, 6, 8, 9]) + let set_b: set.Set(Int) = set.from_list([0, 2, 3, 4, 5, 7, 9]) + metrics.jaccard_index(set_a, set_b) + |> should.equal(4.0 /. 
10.0) + + let set_c: set.Set(Int) = set.from_list([0, 1, 2, 3, 4, 5]) + let set_d: set.Set(Int) = set.from_list([6, 7, 8, 9, 10]) + metrics.jaccard_index(set_c, set_d) + |> should.equal(0.0 /. 11.0) + + let set_e: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) + let set_f: set.Set(String) = + set.from_list(["monkey", "rhino", "ostrich", "salmon"]) + metrics.jaccard_index(set_e, set_f) + |> should.equal(1.0 /. 7.0) +} From c825bb522f6a3228107e254dd8149deb92c4d6f3 Mon Sep 17 00:00:00 2001 From: NicklasXYZ Date: Sun, 17 Mar 2024 00:18:45 +0100 Subject: [PATCH 2/4] Add overlap coefficient --- src/gleam_community/maths/metrics.gleam | 35 +++++++++++++++++++ test/gleam_community/maths/metrics_test.gleam | 22 ++++++++++++ 2 files changed, 57 insertions(+) diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index cd9bfa6..4c45285 100644 --- a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -49,6 +49,7 @@ import gleam/list import gleam/pair import gleam/set import gleam/float +import gleam/int ///
/// @@ -676,3 +677,37 @@ pub fn tversky_index( } } } + +///
+/// +/// Spot a typo? Open an issue! +/// +///
+/// +/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// } +///
+/// +///
+/// +/// Back to top ↑ +/// +///
+/// +pub fn overlap_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { + let intersection: Float = + set.intersection(aset, bset) + |> set.size() + |> conversion.int_to_float() + let minsize: Float = + piecewise.minimum(set.size(aset), set.size(bset), int.compare) + |> conversion.int_to_float() + intersection /. minsize +} diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index cbd8d5e..2cb5ef1 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -234,3 +234,25 @@ pub fn example_jaccard_index_test() { metrics.jaccard_index(set_e, set_f) |> should.equal(1.0 /. 7.0) } + +pub fn example_overlap_coefficient_test() { + metrics.overlap_coefficient(set.from_list([]), set.from_list([])) + |> should.equal(0.0) + + let set_a: set.Set(Int) = set.from_list([0, 1, 2, 5, 6, 8, 9]) + let set_b: set.Set(Int) = set.from_list([0, 2, 3, 4, 5, 7, 9]) + metrics.overlap_coefficient(set_a, set_b) + |> should.equal(4.0 /. 7.0) + + let set_c: set.Set(Int) = set.from_list([0, 1, 2, 3, 4, 5]) + let set_d: set.Set(Int) = set.from_list([6, 7, 8, 9, 10]) + metrics.overlap_coefficient(set_c, set_d) + |> should.equal(0.0 /. 5.0) + + let set_e: set.Set(String) = + set.from_list(["cat", "dog", "hippo", "monkey", "rhino"]) + let set_f: set.Set(String) = + set.from_list(["monkey", "rhino", "ostrich", "salmon"]) + metrics.overlap_coefficient(set_e, set_f) + |> should.equal(2.0 /. 
4.0) +} From 24e496a4a844a3386b94f4a672340b44bdf1f617 Mon Sep 17 00:00:00 2001 From: NicklasXYZ Date: Tue, 19 Mar 2024 15:04:44 +0100 Subject: [PATCH 3/4] Add new distance & similarity measures --- src/gleam_community/maths/metrics.gleam | 204 ++++++++++++++++-- test/gleam_community/maths/metrics_test.gleam | 55 ++++- 2 files changed, 242 insertions(+), 17 deletions(-) diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index 4c45285..a4ee8ab 100644 --- a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -137,7 +137,7 @@ pub fn norm(arr: List(Float), p: Float) -> Float { /// let assert Ok(tol) = elementary.power(-10.0, -6.0) /// /// // Empty lists returns 0.0 -/// metrics.float_manhatten_distance([], []) +/// metrics.manhatten_distance([], []) /// |> should.equal(Ok(0.0)) /// /// // Differing lengths returns error @@ -567,13 +567,36 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) /// ///
/// +/// The Jaccard index measures similarity between two sets of elements. Mathematically, the Jaccard index +/// is defined as: +/// +/// \\[ +/// \text{JI}(X, Y) = \frac{|X \cap Y|}{|X \cup Y|} \in \left[0, 1\right] +/// \\] +/// +/// where: +/// +/// - $$X$$ and $$Y$$ are two sets being compared, +/// - $$|X \cap Y|$$ represents the size of the intersection of the two sets +/// - $$|X \cup Y|$$ denotes the size of the union of the two sets +/// +/// The value of the Jaccard index ranges from 0 to 1, where 0 indicates that the two sets share no elements +/// and 1 indicates that the sets are identical. The Jaccard index is a special case of the +/// [Tversky index](#tversky_index) (with $$\alpha=\beta=1$$). +/// ///
/// Example: /// /// import gleeunit/should /// import gleam_community/maths/metrics +/// import gleam/set /// /// pub fn example () { +/// let xset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) +/// let yset: set.Set(String) = +/// set.from_list(["monkey", "rhino", "ostrich", "salmon"]) +/// metrics.jaccard_index(xset, yset) +/// |> should.equal(1.0 /. 7.0) /// } ///
/// @@ -583,8 +606,8 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) /// ///
/// -pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float { - let assert Ok(result) = tversky_index(aset, bset, 1.0, 1.0) +pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float { + let assert Ok(result) = tversky_index(xset, yset, 1.0, 1.0) result } @@ -594,13 +617,36 @@ pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float { /// /// /// +/// The Sørensen-Dice coefficient measures the similarity between two sets of elements. Mathematically, the +/// coefficient is defined as: +/// +/// \\[ +/// \text{DSC}(X, Y) = \frac{2 \times |X \cap Y|}{|X| + |Y|} \in \left[0, 1\right] +/// \\] +/// +/// where: +/// - $$X$$ and $$Y$$ are two sets being compared +/// - $$|X \cap Y|$$ is the size of the intersection of the two sets (i.e., the number of elements common to both sets) +/// - $$|X|$$ and $$|Y|$$ are the sizes of the sets $$X$$ and $$Y$$, respectively +/// +/// The coefficient ranges from 0 to 1, where 0 indicates no similarity (the sets share no elements) and 1 +/// indicates perfect similarity (the sets are identical). The higher the coefficient, the greater the similarity +/// between the two sets. The Sørensen-Dice coefficient is a special case of the +/// [Tversky index](#tversky_index) (with $$\alpha=\beta=0.5$$). +/// ///
/// Example: /// /// import gleeunit/should /// import gleam_community/maths/metrics +/// import gleam/set /// /// pub fn example () { +/// let xset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) +/// let yset: set.Set(String) = +/// set.from_list(["monkey", "rhino", "ostrich", "salmon", "spider"]) +/// metrics.sorensen_dice_coefficient(xset, yset) +/// |> should.equal(2.0 *. 1.0 /. { 4.0 +. 5.0 }) /// } ///
/// @@ -610,8 +656,8 @@ pub fn jaccard_index(aset: set.Set(a), bset: set.Set(a)) -> Float { /// /// /// -pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { - let assert Ok(result) = tversky_index(aset, bset, 0.5, 0.5) +pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { + let assert Ok(result) = tversky_index(xset, yset, 0.5, 0.5) result } @@ -621,15 +667,39 @@ pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { /// /// /// -/// The Tversky index is a generalization of the Sørensen–Dice coefficient and the Jaccard index. +/// The Tversky index is a generalization of the Jaccard index and Sørensen-Dice coefficient, which adds +/// flexibility through two parameters, $$\alpha$$ and $$\beta$$, allowing for asymmetric similarity +/// measures between sets. The Tversky index is defined as: /// +/// \\[ +/// \text{TI}(X, Y) = \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|} +/// \\] +/// +/// where: +/// +/// - $$X$$ and $$Y$$ are the sets being compared +/// - $$|X - Y|$$ and $$|Y - X|$$ are the sizes of the relative complements of $$Y$$ in $$X$$ and $$X$$ in $$Y$$, respectively, +/// - $$\alpha$$ and $$\beta$$ are parameters that weigh the relative importance of the elements unique to $$X$$ and $$Y$$ +/// +/// The Tversky index reduces to the Jaccard index when $$\alpha = \beta = 1$$ and to the Sørensen-Dice +/// coefficient when $$\alpha = \beta = 0.5$$. Because $$\alpha$$ and $$\beta$$ must be non-negative, the index always lies between 0 and 1. +/// The index equals 0 when there is no intersection between the two sets, indicating no similarity, and 1 when two non-empty sets are identical. However, unlike the +/// Jaccard index and the Sørensen-Dice coefficient, the Tversky index is in general not symmetric in $$X$$ and $$Y$$ when $$\alpha \neq \beta$$. +/// ///
/// Example: /// /// import gleeunit/should /// import gleam_community/maths/metrics +/// import gleam/set /// /// pub fn example () { +/// let yset: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) +/// let xset: set.Set(String) = +/// set.from_list(["monkey", "rhino", "ostrich", "salmon"]) +/// // Test Jaccard index (alpha = beta = 1) +/// metrics.tversky_index(xset, yset, 1.0, 1.0) +/// |> should.equal(1.0 /. 7.0) /// } ///
/// @@ -640,23 +710,23 @@ pub fn sorensen_dice_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { /// /// pub fn tversky_index( - aset: set.Set(a), - bset: set.Set(a), + xset: set.Set(a), + yset: set.Set(a), alpha: Float, beta: Float, ) -> Result(Float, String) { case alpha >=. 0.0, beta >=. 0.0 { True, True -> { let intersection: Float = - set.intersection(aset, bset) + set.intersection(xset, yset) |> set.size() |> conversion.int_to_float() let difference1: Float = - set.difference(aset, bset) + set.difference(xset, yset) |> set.size() |> conversion.int_to_float() let difference2: Float = - set.difference(bset, aset) + set.difference(yset, xset) |> set.size() |> conversion.int_to_float() intersection @@ -684,14 +754,39 @@ pub fn tversky_index( /// /// /// -/// +/// The Overlap coefficient, also known as the Szymkiewicz–Simpson coefficient, is a measure of +/// similarity between two sets that focuses on the size of the intersection relative to the +/// smaller of the two sets. It is defined mathematically as: +/// +/// \\[ +/// \text{OC}(X, Y) = \frac{|X \cap Y|}{\min(|X|, |Y|)} \in \left[0, 1\right] +/// \\] +/// +/// where: +/// +/// - $$X$$ and $$Y$$ are the sets being compared +/// - $$|X \cap Y|$$ is the size of the intersection of the sets +/// - $$\min(|X|, |Y|)$$ is the size of the smaller set among $$X$$ and $$Y$$ +/// +/// The coefficient ranges from 0 to 1, where 0 indicates no overlap and 1 indicates that the +/// smaller set is a subset of the larger set. This measure is especially useful in situations +/// where the similarity in terms of the proportion of overlap is more relevant than the +/// difference in sizes between the two sets. +/// ///
/// Example: /// /// import gleeunit/should /// import gleam_community/maths/metrics +/// import gleam/set /// /// pub fn example () { +/// let set_a: set.Set(String) = +/// set.from_list(["horse", "dog", "hippo", "monkey", "bird"]) +/// let set_b: set.Set(String) = +/// set.from_list(["monkey", "bird", "ostrich", "salmon"]) +/// metrics.overlap_coefficient(set_a, set_b) +/// |> should.equal(2.0 /. 4.0) /// } ///
/// @@ -701,13 +796,92 @@ pub fn tversky_index( /// /// /// -pub fn overlap_coefficient(aset: set.Set(a), bset: set.Set(a)) -> Float { +pub fn overlap_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { let intersection: Float = - set.intersection(aset, bset) + set.intersection(xset, yset) |> set.size() |> conversion.int_to_float() let minsize: Float = - piecewise.minimum(set.size(aset), set.size(bset), int.compare) + piecewise.minimum(set.size(xset), set.size(yset), int.compare) |> conversion.int_to_float() intersection /. minsize } + +///
+/// +/// Spot a typo? Open an issue! +/// +///
+/// +/// Calculate the cosine similarity between two lists (representing vectors): +/// +/// \\[ +/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}} +/// \\] +/// +/// In the formula, $n$ is the length of the two lists and $x_i, y_i$ are the values in the respective input lists indexed by $i$. The numerator +/// represents the dot product of the two vectors, while the denominator is the product of the magnitudes (Euclidean norms) of the two vectors. +/// The cosine similarity provides a value between -1 and 1, where 1 means the vectors are in the same direction, -1 means they are in exactly +/// opposite directions, and 0 indicates orthogonality. +/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// // Two orthogonal vectors +/// metrics.cosine_similarity([-1.0, 1.0, 0.0], [1.0, 1.0, -1.0]) +/// |> should.equal(Ok(0.0)) +/// +/// // Two identical (parallel) vectors +/// metrics.cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) +/// |> should.equal(Ok(1.0)) +/// +/// // Two parallel, but oppositely oriented vectors +/// metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0]) +/// |> should.equal(Ok(-1.0)) +/// } +///
+/// +///
+/// +/// Back to top ↑ +/// +///
+/// +pub fn cosine_similarity( + xarr: List(Float), + yarr: List(Float), +) -> Result(Float, String) { + let xlen: Int = list.length(xarr) + let ylen: Int = list.length(yarr) + case xarr, yarr { + [], _ -> + "Invalid input argument: The list xarr is empty." + |> Error + _, [] -> + "Invalid input argument: The list yarr is empty." + |> Error + _, _ -> { + case xlen == ylen { + False -> + "Invalid input argument: length(xarr) != length(yarr). Valid input is when length(xarr) == length(yarr)." + |> Error + True -> { + list.fold( + list.zip(xarr, yarr), + 0.0, + fn(acc: Float, a: #(Float, Float)) -> Float { + let result: Float = pair.first(a) *. pair.second(a) + result +. acc + }, + ) + /. { norm(xarr, 2.0) *. norm(yarr, 2.0) } + |> Ok + } + } + } + } +} diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index 2cb5ef1..e2f7307 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -235,6 +235,27 @@ pub fn example_jaccard_index_test() { |> should.equal(1.0 /. 7.0) } +pub fn example_sorensen_dice_coefficient_test() { + metrics.sorensen_dice_coefficient(set.from_list([]), set.from_list([])) + |> should.equal(0.0) + + let set_a: set.Set(Int) = set.from_list([0, 1, 2, 5, 6, 8, 9]) + let set_b: set.Set(Int) = set.from_list([0, 2, 3, 4, 5, 7, 9]) + metrics.sorensen_dice_coefficient(set_a, set_b) + |> should.equal(2.0 *. 4.0 /. { 7.0 +. 7.0 }) + + let set_c: set.Set(Int) = set.from_list([0, 1, 2, 3, 4, 5]) + let set_d: set.Set(Int) = set.from_list([6, 7, 8, 9, 10]) + metrics.sorensen_dice_coefficient(set_c, set_d) + |> should.equal(2.0 *. 0.0 /. { 6.0 +. 5.0 }) + + let set_e: set.Set(String) = set.from_list(["cat", "dog", "hippo", "monkey"]) + let set_f: set.Set(String) = + set.from_list(["monkey", "rhino", "ostrich", "salmon", "spider"]) + metrics.sorensen_dice_coefficient(set_e, set_f) + |> should.equal(2.0 *. 1.0 /. { 4.0 +. 
5.0 }) +} + pub fn example_overlap_coefficient_test() { metrics.overlap_coefficient(set.from_list([]), set.from_list([])) |> should.equal(0.0) @@ -250,9 +271,39 @@ pub fn example_overlap_coefficient_test() { |> should.equal(0.0 /. 5.0) let set_e: set.Set(String) = - set.from_list(["cat", "dog", "hippo", "monkey", "rhino"]) + set.from_list(["horse", "dog", "hippo", "monkey", "bird"]) let set_f: set.Set(String) = - set.from_list(["monkey", "rhino", "ostrich", "salmon"]) + set.from_list(["monkey", "bird", "ostrich", "salmon"]) metrics.overlap_coefficient(set_e, set_f) |> should.equal(2.0 /. 4.0) } + +pub fn example_cosine_similarity_test() { + // Empty lists returns an error + metrics.cosine_similarity([], []) + |> should.be_error() + + // One empty list returns an error + metrics.cosine_similarity([1.0, 2.0, 3.0], []) + |> should.be_error() + + // One empty list returns an error + metrics.cosine_similarity([], [1.0, 2.0, 3.0]) + |> should.be_error() + + // Different sized lists returns an error + metrics.cosine_similarity([1.0, 2.0], [1.0, 2.0, 3.0, 4.0]) + |> should.be_error() + + // Two orthogonal vectors (represented by lists) + metrics.cosine_similarity([-1.0, 1.0, 0.0], [1.0, 1.0, -1.0]) + |> should.equal(Ok(0.0)) + + // Two identical (parallel) vectors (represented by lists) + metrics.cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) + |> should.equal(Ok(1.0)) + + // Two parallel, but oppositely oriented vectors (represented by lists) + metrics.cosine_similarity([-1.0, -2.0, -3.0], [1.0, 2.0, 3.0]) + |> should.equal(Ok(-1.0)) +} From fc4a4a8b0994534b0533c162efc0028a6f63da10 Mon Sep 17 00:00:00 2001 From: NicklasXYZ Date: Tue, 19 Mar 2024 15:18:07 +0100 Subject: [PATCH 4/4] Fix typos --- src/gleam_community/maths/metrics.gleam | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index a4ee8ab..772b80d 100644 ---
a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -25,14 +25,17 @@ //// //// Metrics: A module offering functions for calculating distances and other types of metrics. //// -//// * **Distances** +//// * **Distance measures** //// * [`norm`](#norm) -//// * [`manhatten_distance`](#float_manhatten_distance) +//// * [`manhatten_distance`](#manhatten_distance) //// * [`minkowski_distance`](#minkowski_distance) //// * [`euclidean_distance`](#euclidean_distance) +//// * [`cosine_similarity`](#cosine_similarity) +//// * **Set & string similarity measures** //// * [`jaccard_index`](#jaccard_index) //// * [`sorensen_dice_coefficient`](#sorensen_dice_coefficient) //// * [`tversky_index`](#tversky_index) +//// * [`overlap_coefficient`](#overlap_coefficient) //// * **Basic statistical measures** //// * [`mean`](#mean) //// * [`median`](#median) @@ -571,7 +574,7 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) /// is defined as: /// /// \\[ -/// \text{JI}(X, Y) = \frac{|X \cap Y|}{|X \cup Y|} \in \left[0, 1\right] +/// \frac{|X \cap Y|}{|X \cup Y|} \\; \in \\; \left[0, 1\right] /// \\] /// /// where: @@ -621,7 +624,7 @@ pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float { /// coefficient is defined as: /// /// \\[ -/// \text{DSC}(X, Y) = \frac{2 \times |X \cap Y|}{|X| + |Y|} \in \left[0, 1\right] +/// \frac{2 |X \cap Y|}{|X| + |Y|} \\; \in \\; \left[0, 1\right] /// \\] /// /// where: @@ -672,7 +675,7 @@ pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { /// measures between sets. The Tversky index is defined as: /// /// \\[ -/// \text{TI}(X, Y) = \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|} +/// \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|} /// \\] /// /// where: @@ -759,7 +762,7 @@ pub fn tversky_index( /// smaller of the two sets. 
It is defined mathematically as: /// /// \\[ -/// \text{OC}(X, Y) = \frac{|X \cap Y|}{\min(|X|, |Y|)} \in \left[0, 1\right] +/// \frac{|X \cap Y|}{\min(|X|, |Y|)} \\; \in \\; \left[0, 1\right] /// \\] /// /// where: @@ -816,13 +819,13 @@ pub fn overlap_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { /// Calculate the cosine similarity between two lists (representing vectors): /// /// \\[ -/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}} +/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}} \\; \in \\; \left[-1, 1\right] /// \\] /// -/// In the formula, $n$ is the length of the two lists and $x_i, y_i$ are the values in the respective input lists indexed by $i$. The numerator -/// represents the dot product of the two vectors, while the denominator is the product of the magnitudes (Euclidean norms) of the two vectors. -/// The cosine similarity provides a value between -1 and 1, where 1 means the vectors are in the same direction, -1 means they are in exactly -/// opposite directions, and 0 indicates orthogonality. +/// In the formula, $$n$$ is the length of the two lists and $$x_i$$, $$y_i$$ are the values in the respective input lists indexed by $$i$$. +/// The numerator represents the dot product of the two vectors, while the denominator is the product of the magnitudes (Euclidean norms) of +/// the two vectors. The cosine similarity provides a value between -1 and 1, where 1 means the vectors are in the same direction, -1 means +/// they are in exactly opposite directions, and 0 indicates orthogonality. /// ///
/// Example: