From e8bbcce58762b8d30edda512a854db1da8d7ec91 Mon Sep 17 00:00:00 2001 From: NicklasXYZ <18580183+NicklasXYZ@users.noreply.github.com> Date: Wed, 10 Apr 2024 23:09:02 +0200 Subject: [PATCH 1/2] Add Levenshtein distance, fix typos, align doc examples with tests --- src/gleam_community/maths/metrics.gleam | 254 ++++++++++++++---- test/gleam_community/maths/metrics_test.gleam | 87 ++++-- 2 files changed, 257 insertions(+), 84 deletions(-) diff --git a/src/gleam_community/maths/metrics.gleam b/src/gleam_community/maths/metrics.gleam index 8b204cb..d5ddc69 100644 --- a/src/gleam_community/maths/metrics.gleam +++ b/src/gleam_community/maths/metrics.gleam @@ -23,11 +23,12 @@ //// //// --- //// -//// Metrics: A module offering functions for calculating distances and other types of metrics. +//// Metrics: A module offering functions for calculating distances and other +//// types of metrics. //// //// * **Distance measures** //// * [`norm`](#norm) -//// * [`manhatten_distance`](#manhatten_distance) +//// * [`manhattan_distance`](#manhattan_distance) //// * [`euclidean_distance`](#euclidean_distance) //// * [`chebyshev_distance`](#chebyshev_distance) //// * [`minkowski_distance`](#minkowski_distance) @@ -54,6 +55,7 @@ import gleam/pair import gleam/set import gleam/float import gleam/int +import gleam/string ///
/// @@ -67,7 +69,8 @@ import gleam/int /// \left( \sum_{i=1}^n \left|x_i\right|^{p} \right)^{\frac{1}{p}} /// \\] /// -/// In the formula, $$n$$ is the length of the list and $$x_i$$ is the value in the input list indexed by $$i$$. +/// In the formula, $$n$$ is the length of the list and $$x_i$$ is the value in +/// the input list indexed by $$i$$. /// ///
/// Example: @@ -121,13 +124,14 @@ pub fn norm(arr: List(Float), p: Float) -> Float { /// ///
/// -/// Calculate the Manhatten distance between two lists (representing vectors): +/// Calculate the Manhattan distance between two lists (representing vectors): /// /// \\[ /// \sum_{i=1}^n \left|x_i - y_i \right| /// \\] /// -/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. +/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the +/// values in the respective input lists indexed by $$i$$. /// ///
/// Example: @@ -141,14 +145,14 @@ pub fn norm(arr: List(Float), p: Float) -> Float { /// let assert Ok(tol) = elementary.power(-10.0, -6.0) /// /// // Empty lists returns an error -/// metrics.manhatten_distance([], []) +/// metrics.manhattan_distance([], []) /// |> should.be_error() /// /// // Differing lengths returns error -/// metrics.manhatten_distance([], [1.0]) +/// metrics.manhattan_distance([], [1.0]) /// |> should.be_error() /// -/// let assert Ok(result) = metrics.manhatten_distance([0.0, 0.0], [1.0, 2.0]) +/// let assert Ok(result) = metrics.manhattan_distance([0.0, 0.0], [1.0, 2.0]) /// result /// |> predicates.is_close(3.0, 0.0, tol) /// |> should.be_true() @@ -161,7 +165,7 @@ pub fn norm(arr: List(Float), p: Float) -> Float { /// /// /// -pub fn manhatten_distance( +pub fn manhattan_distance( xarr: List(Float), yarr: List(Float), ) -> Result(Float, String) { @@ -180,9 +184,11 @@ pub fn manhatten_distance( /// \left( \sum_{i=1}^n \left|x_i - y_i \right|^{p} \right)^{\frac{1}{p}} /// \\] /// -/// In the formula, $$p >= 1$$ is the order, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. +/// In the formula, $$p >= 1$$ is the order, $$n$$ is the length of the two lists +/// and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. /// -/// The Minkowski distance is a generalization of both the Euclidean distance ($$p=2$$) and the Manhattan distance ($$p = 1$$). +/// The Minkowski distance is a generalization of both the Euclidean distance +/// ($$p=2$$) and the Manhattan distance ($$p = 1$$). /// ///
/// Example: @@ -195,7 +201,7 @@ pub fn manhatten_distance( /// pub fn example () { /// let assert Ok(tol) = elementary.power(-10.0, -6.0) /// -/// // Empty lists returns 0.0 +/// // Empty lists returns an error /// metrics.minkowski_distance([], [], 1.0) /// |> should.be_error() /// @@ -269,7 +275,8 @@ pub fn minkowski_distance( /// \left( \sum_{i=1}^n \left|x_i - y_i \right|^{2} \right)^{\frac{1}{2}} /// \\] /// -/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. +/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the +/// values in the respective input lists indexed by $$i$$. /// ///
/// Example: @@ -282,11 +289,11 @@ pub fn minkowski_distance( /// pub fn example () { /// let assert Ok(tol) = elementary.power(-10.0, -6.0) /// -/// // Empty lists returns 0.0 +/// // Empty lists returns an error /// metrics.euclidean_distance([], []) /// |> should.be_error() /// -/// // Differing lengths returns error +/// // Differing lengths returns an error /// metrics.euclidean_distance([], [1.0]) /// |> should.be_error() /// @@ -322,7 +329,8 @@ pub fn euclidean_distance( /// \text{max}_{i=1}^n \left|x_i - y_i \right| /// \\] /// -/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the values in the respective input lists indexed by $$i$$. +/// In the formula, $$n$$ is the length of the two lists and $$x_i, y_i$$ are the +/// values in the respective input lists indexed by $$i$$. /// ///
/// Example: @@ -397,8 +405,8 @@ pub fn chebyshev_distance( /// \bar{x} = \frac{1}{n}\sum_{i=1}^n x_i /// \\] /// -/// In the formula, $$n$$ is the sample size (the length of the list) and -/// $$x_i$$ is the sample point in the input list indexed by $$i$$. +/// In the formula, $$n$$ is the sample size (the length of the list) and $$x_i$$ +/// is the sample point in the input list indexed by $$i$$. /// ///
/// Example: @@ -514,12 +522,13 @@ pub fn median(arr: List(Float)) -> Result(Float, String) { /// /// /// Calculate the sample variance of the elements in a list: +/// /// \\[ /// s^{2} = \frac{1}{n - d} \sum_{i=1}^{n}(x_i - \bar{x}) /// \\] /// -/// In the formula, $$n$$ is the sample size (the length of the list) and -/// $$x_i$$ is the sample point in the input list indexed by $$i$$. +/// In the formula, $$n$$ is the sample size (the length of the list) and $$x_i$$ +/// is the sample point in the input list indexed by $$i$$. /// Furthermore, $$\bar{x}$$ is the sample mean and $$d$$ is the "Delta /// Degrees of Freedom", and is by default set to $$d = 0$$, which gives a biased /// estimate of the sample variance. Setting $$d = 1$$ gives an unbiased estimate. @@ -594,11 +603,12 @@ pub fn variance(arr: List(Float), ddof: Int) -> Result(Float, String) { /// s = \left(\frac{1}{n - d} \sum_{i=1}^{n}(x_i - \bar{x})\right)^{\frac{1}{2}} /// \\] /// -/// In the formula, $$n$$ is the sample size (the length of the list) and -/// $$x_i$$ is the sample point in the input list indexed by $$i$$. +/// In the formula, $$n$$ is the sample size (the length of the list) and $$x_i$$ +/// is the sample point in the input list indexed by $$i$$. /// Furthermore, $$\bar{x}$$ is the sample mean and $$d$$ is the "Delta /// Degrees of Freedom", and is by default set to $$d = 0$$, which gives a biased -/// estimate of the sample standard deviation. Setting $$d = 1$$ gives an unbiased estimate. +/// estimate of the sample standard deviation. Setting $$d = 1$$ gives an unbiased +/// estimate. /// ///
/// Example: @@ -656,8 +666,8 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) /// /// /// -/// The Jaccard index measures similarity between two sets of elements. Mathematically, the Jaccard index -/// is defined as: +/// The Jaccard index measures similarity between two sets of elements. +/// Mathematically, the Jaccard index is defined as: /// /// \\[ /// \frac{|X \cap Y|}{|X \cup Y|} \\; \in \\; \left[0, 1\right] @@ -669,9 +679,10 @@ pub fn standard_deviation(arr: List(Float), ddof: Int) -> Result(Float, String) /// - $$|X \cap Y|$$ represents the size of the intersection of the two sets /// - $$|X \cup Y|$$ denotes the size of the union of the two sets /// -/// The value of the Jaccard index ranges from 0 to 1, where 0 indicates that the two sets share no elements -/// and 1 indicates that the sets are identical. The Jaccard index is a special case of the -/// [Tversky index](#tversky_index) (with $$\alpha=\beta=1$$). +/// The value of the Jaccard index ranges from 0 to 1, where 0 indicates that the +/// two sets share no elements and 1 indicates that the sets are identical. The +/// Jaccard index is a special case of the [Tversky index](#tversky_index) (with +/// $$\alpha=\beta=1$$). /// ///
/// Example: @@ -706,8 +717,8 @@ pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float { /// /// /// -/// The Sørensen-Dice coefficient measures the similarity between two sets of elements. Mathematically, the -/// coefficient is defined as: +/// The Sørensen-Dice coefficient measures the similarity between two sets of +/// elements. Mathematically, the coefficient is defined as: /// /// \\[ /// \frac{2 |X \cap Y|}{|X| + |Y|} \\; \in \\; \left[0, 1\right] @@ -715,12 +726,14 @@ pub fn jaccard_index(xset: set.Set(a), yset: set.Set(a)) -> Float { /// /// where: /// - $$X$$ and $$Y$$ are two sets being compared -/// - $$|X \cap Y|$$ is the size of the intersection of the two sets (i.e., the number of elements common to both sets) +/// - $$|X \cap Y|$$ is the size of the intersection of the two sets (i.e., the +/// number of elements common to both sets) /// - $$|X|$$ and $$|Y|$$ are the sizes of the sets $$X$$ and $$Y$$, respectively /// -/// The coefficient ranges from 0 to 1, where 0 indicates no similarity (the sets share no elements) and 1 -/// indicates perfect similarity (the sets are identical). The higher the coefficient, the greater the similarity -/// between the two sets. The Sørensen-Dice coefficient is a special case of the +/// The coefficient ranges from 0 to 1, where 0 indicates no similarity (the sets +/// share no elements) and 1 indicates perfect similarity (the sets are identical). +/// The higher the coefficient, the greater the similarity between the two sets. +/// The Sørensen-Dice coefficient is a special case of the /// [Tversky index](#tversky_index) (with $$\alpha=\beta=0.5$$). /// ///
@@ -756,9 +769,10 @@ pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { /// /// /// -/// The Tversky index is a generalization of the Jaccard index and Sørensen-Dice coefficient, which adds -/// flexibility through two parameters, $$\alpha$$ and $$\beta$$, allowing for asymmetric similarity -/// measures between sets. The Tversky index is defined as: +/// The Tversky index is a generalization of the Jaccard index and Sørensen-Dice +/// coefficient, which adds flexibility through two parameters, $$\alpha$$ and +/// $$\beta$$, allowing for asymmetric similarity measures between sets. The +/// Tversky index is defined as: /// /// \\[ /// \frac{|X \cap Y|}{|X \cap Y| + \alpha|X - Y| + \beta|Y - X|} @@ -767,13 +781,17 @@ pub fn sorensen_dice_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { /// where: /// /// - $$X$$ and $$Y$$ are the sets being compared -/// - $$|X - Y|$$ and $$|Y - X|$$ are the sizes of the relative complements of $$Y$$ in $$X$$ and $$X$$ in $$Y$$, respectively, -/// - $$\alpha$$ and $$\beta$$ are parameters that weigh the relative importance of the elements unique to $$X$$ and $$Y$$ +/// - $$|X - Y|$$ and $$|Y - X|$$ are the sizes of the relative complements of +/// $$Y$$ in $$X$$ and $$X$$ in $$Y$$, respectively, +/// - $$\alpha$$ and $$\beta$$ are parameters that weigh the relative importance +/// of the elements unique to $$X$$ and $$Y$$ /// -/// The Tversky index reduces to the Jaccard index when \(\alpha = \beta = 1\) and to the Sorensen-Dice -/// coefficient when \(\alpha = \beta = 0.5\). In general, the Tversky index can take on any non-negative value, including 0. -/// The index equals 0 when there is no intersection between the two sets, indicating no similarity. However, unlike similarity -/// measures bounded strictly between 0 and 1, the Tversky index does not have a strict upper limit of 1 when $$\alpha \neq \beta$$. 
+/// The Tversky index reduces to the Jaccard index when $$\alpha = \beta = 1$$ and +/// to the Sørensen-Dice coefficient when $$\alpha = \beta = 0.5$$. In general, the +/// Tversky index can take on any non-negative value, including 0. The index equals +/// 0 when there is no intersection between the two sets, indicating no similarity. +/// However, unlike similarity measures bounded strictly between 0 and 1, the +/// Tversky index does not have a strict upper limit of 1 when $$\alpha \neq \beta$$. /// ///
/// Example: @@ -843,9 +861,10 @@ pub fn tversky_index( /// /// /// -/// The Overlap coefficient, also known as the Szymkiewicz–Simpson coefficient, is a measure of -/// similarity between two sets that focuses on the size of the intersection relative to the -/// smaller of the two sets. It is defined mathematically as: +/// The Overlap coefficient, also known as the Szymkiewicz–Simpson coefficient, is +/// a measure of similarity between two sets that focuses on the size of the +/// intersection relative to the smaller of the two sets. It is defined +/// mathematically as: /// /// \\[ /// \frac{|X \cap Y|}{\min(|X|, |Y|)} \\; \in \\; \left[0, 1\right] @@ -857,10 +876,11 @@ pub fn tversky_index( /// - $$|X \cap Y|$$ is the size of the intersection of the sets /// - $$\min(|X|, |Y|)$$ is the size of the smaller set among $$X$$ and $$Y$$ /// -/// The coefficient ranges from 0 to 1, where 0 indicates no overlap and 1 indicates that the -/// smaller set is a suyset of the larger set. This measure is especially useful in situations -/// where the similarity in terms of the proportion of overlap is more relevant than the -/// difference in sizes between the two sets. +/// The coefficient ranges from 0 to 1, where 0 indicates no overlap and 1 +/// indicates that the smaller set is a subset of the larger set. This +/// measure is especially useful in situations where the similarity in terms +/// of the proportion of overlap is more relevant than the difference in sizes +/// between the two sets. /// ///
/// Example: @@ -905,13 +925,18 @@ pub fn overlap_coefficient(xset: set.Set(a), yset: set.Set(a)) -> Float { /// Calculate the cosine similarity between two lists (representing vectors): /// /// \\[ -/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}} \\; \in \\; \left[-1, 1\right] +/// \frac{\sum_{i=1}^n x_i \cdot y_i}{\left(\sum_{i=1}^n x_i^2\right)^{\frac{1}{2}} +/// \cdot \left(\sum_{i=1}^n y_i^2\right)^{\frac{1}{2}}} +/// \\; \in \\; \left[-1, 1\right] /// \\] /// -/// In the formula, $$n$$ is the length of the two lists and $$x_i$$, $$y_i$$ are the values in the respective input lists indexed by $$i$$. -/// The numerator represents the dot product of the two vectors, while the denominator is the product of the magnitudes (Euclidean norms) of -/// the two vectors. The cosine similarity provides a value between -1 and 1, where 1 means the vectors are in the same direction, -1 means -/// they are in exactly opposite directions, and 0 indicates orthogonality. +/// In the formula, $$n$$ is the length of the two lists and $$x_i$$, $$y_i$$ are +/// the values in the respective input lists indexed by $$i$$. The numerator +/// represents the dot product of the two vectors, while the denominator is the +/// product of the magnitudes (Euclidean norms) of the two vectors. The cosine +/// similarity provides a value between -1 and 1, where 1 means the vectors are +/// in the same direction, -1 means they are in exactly opposite directions, +/// and 0 indicates orthogonality. /// ///
/// Example: @@ -974,3 +999,122 @@ pub fn cosine_similarity( } } } + +/// +/// +/// Calculate the Levenshtein distance between two strings, i.e., measure the +/// difference between two strings (essentially sequences). It is defined as +/// the minimum number of single-character edits required to change one string +/// into the other, using operations: +/// - insertions +/// - deletions +/// - substitutions +/// +/// Note: The implementation is primarily based on the Elixir implementation +/// [levenshtein](https://hex.pm/packages/levenshtein). +/// +///
+/// Example: +/// +/// import gleeunit/should +/// import gleam_community/maths/metrics +/// +/// pub fn example () { +/// metrics.levenshtein_distance("hello", "hello") +/// |> should.equal(0) +/// +/// metrics.levenshtein_distance("cat", "cut") +/// |> should.equal(1) +/// +/// metrics.levenshtein_distance("kitten", "sitting") +/// |> should.equal(3) +/// } +///
+/// +/// +/// +/// +pub fn levenshtein_distance(xstring: String, ystring: String) -> Int { + case xstring, ystring { + xstring, ystring if xstring == ystring -> { + 0 + } + xstring, ystring if xstring == "" -> { + string.length(ystring) + } + xstring, ystring if ystring == "" -> { + string.length(xstring) + } + _, _ -> { + let xstring_graphemes = string.to_graphemes(xstring) + let ystring_graphemes = string.to_graphemes(ystring) + let ystring_length = list.length(ystring_graphemes) + let distance_list = list.range(0, ystring_length) + + do_edit_distance(xstring_graphemes, ystring_graphemes, distance_list, 1) + } + } +} + +fn do_edit_distance( + xstring: List(String), + ystring: List(String), + distance_list: List(Int), + step: Int, +) -> Int { + case xstring { + // Safe as 'distance_list' is never empty + [] -> { + let assert Ok(last) = list.last(distance_list) + last + } + [xstring_head, ..xstring_tail] -> { + let new_distance_list = + distance_list_helper(ystring, distance_list, xstring_head, [step], step) + do_edit_distance(xstring_tail, ystring, new_distance_list, step + 1) + } + } +} + +fn distance_list_helper( + ystring: List(String), + distance_list: List(Int), + grapheme: String, + new_distance_list: List(Int), + last_distance: Int, +) -> List(Int) { + case ystring { + [] -> list.reverse(new_distance_list) + [ystring_head, ..ystring_tail] -> { + let assert [distance_list_head, ..distance_list_tail] = distance_list + let difference = case ystring_head == grapheme { + True -> { + 0 + } + False -> { + 1 + } + } + let assert [first, ..] 
= distance_list_tail + let min = + last_distance + 1 + |> piecewise.minimum(first + 1, int.compare) + |> piecewise.minimum(distance_list_head + difference, int.compare) + distance_list_helper( + ystring_tail, + distance_list_tail, + grapheme, + [min, ..new_distance_list], + min, + ) + } + } +} diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index 4ba9cd9..8fca446 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -50,38 +50,24 @@ pub fn float_list_norm_test() { |> should.be_true() } -pub fn float_list_manhatten_test() { +pub fn float_list_manhattan_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) // Empty lists returns an error - metrics.manhatten_distance([], []) + metrics.manhattan_distance([], []) |> should.be_error() // Differing lengths returns error - metrics.manhatten_distance([], [1.0]) + metrics.manhattan_distance([], [1.0]) |> should.be_error() - // Manhatten distance (p = 1) - let assert Ok(result) = metrics.manhatten_distance([0.0, 0.0], [1.0, 2.0]) + // manhattan distance (p = 1) + let assert Ok(result) = metrics.manhattan_distance([0.0, 0.0], [1.0, 2.0]) result |> predicates.is_close(3.0, 0.0, tol) |> should.be_true() } -// pub fn int_list_manhatten_test() { -// // Empty lists returns 0 -// metrics.int_manhatten_distance([], []) -// |> should.equal(Ok(0)) - -// // Differing lengths returns error -// metrics.int_manhatten_distance([], [1]) -// |> should.be_error() - -// let assert Ok(result) = metrics.int_manhatten_distance([0, 0], [1, 2]) -// result -// |> should.equal(3) -// } - pub fn float_list_minkowski_test() { let assert Ok(tol) = elementary.power(-10.0, -6.0) @@ -130,7 +116,7 @@ pub fn float_list_minkowski_test() { |> predicates.is_close(2.23606797749979, 0.0, tol) |> should.be_true() - // Manhatten distance (p = 1) + // Manhattan distance (p = 1) let assert Ok(result) = metrics.minkowski_distance([0.0, 0.0], 
[1.0, 2.0], 1.0) result @@ -156,7 +142,7 @@ pub fn float_list_euclidean_test() { |> should.be_true() } -pub fn example_mean_test() { +pub fn mean_test() { // An empty list returns an error [] |> metrics.mean() @@ -168,7 +154,7 @@ pub fn example_mean_test() { |> should.equal(Ok(2.0)) } -pub fn example_median_test() { +pub fn median_test() { // An empty list returns an error [] |> metrics.median() @@ -184,7 +170,7 @@ pub fn example_median_test() { |> should.equal(Ok(2.5)) } -pub fn example_variance_test() { +pub fn variance_test() { // Degrees of freedom let ddof: Int = 1 @@ -199,7 +185,7 @@ pub fn example_variance_test() { |> should.equal(Ok(1.0)) } -pub fn example_standard_deviation_test() { +pub fn standard_deviation_test() { // Degrees of freedom let ddof: Int = 1 @@ -214,7 +200,7 @@ pub fn example_standard_deviation_test() { |> should.equal(Ok(1.0)) } -pub fn example_jaccard_index_test() { +pub fn jaccard_index_test() { metrics.jaccard_index(set.from_list([]), set.from_list([])) |> should.equal(0.0) @@ -235,7 +221,7 @@ pub fn example_jaccard_index_test() { |> should.equal(1.0 /. 7.0) } -pub fn example_sorensen_dice_coefficient_test() { +pub fn sorensen_dice_coefficient_test() { metrics.sorensen_dice_coefficient(set.from_list([]), set.from_list([])) |> should.equal(0.0) @@ -256,7 +242,7 @@ pub fn example_sorensen_dice_coefficient_test() { |> should.equal(2.0 *. 1.0 /. { 4.0 +. 5.0 }) } -pub fn example_overlap_coefficient_test() { +pub fn overlap_coefficient_test() { metrics.overlap_coefficient(set.from_list([]), set.from_list([])) |> should.equal(0.0) @@ -278,7 +264,7 @@ pub fn example_overlap_coefficient_test() { |> should.equal(2.0 /. 
4.0) } -pub fn example_cosine_similarity_test() { +pub fn cosine_similarity_test() { // Empty lists returns an error metrics.cosine_similarity([], []) |> should.be_error() @@ -308,7 +294,7 @@ pub fn example_cosine_similarity_test() { |> should.equal(Ok(-1.0)) } -pub fn example_chebyshev_distance() { +pub fn chebyshev_distance_test() { // Empty lists returns an error metrics.chebyshev_distance([], []) |> should.be_error() @@ -330,6 +316,9 @@ pub fn example_chebyshev_distance() { |> should.equal(Ok(2.0)) metrics.chebyshev_distance([1.0, 0.0], [2.0, 0.0]) + |> should.equal(Ok(1.0)) + + metrics.chebyshev_distance([1.0, 0.0], [-2.0, 0.0]) |> should.equal(Ok(3.0)) metrics.chebyshev_distance([-5.0, -10.0, -3.0], [-1.0, -12.0, -3.0]) @@ -338,3 +327,43 @@ pub fn example_chebyshev_distance() { metrics.chebyshev_distance([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) |> should.equal(Ok(0.0)) } + +pub fn edit_distance_test() { + // Try different types of valid input... + + // Requires 5 insertions to transform the empty string into "hello" + metrics.levenshtein_distance("", "hello") + |> should.equal(5) + // Requires 5 deletions to remove all characters from "hello" to match the empty string + metrics.levenshtein_distance("hello", "") + |> should.equal(5) + + // Requires 2 deletions to remove two 'b's and 1 substitution to change 'b' to 'a' + metrics.levenshtein_distance("bbb", "a") + |> should.equal(3) + // Requires 2 insertions to add two 'b's and 1 substitution to change 'a' to 'b' + metrics.levenshtein_distance("a", "bbb") + |> should.equal(3) + + // No changes needed, since the strings are identical + metrics.levenshtein_distance("hello", "hello") + |> should.equal(0) + + // Requires 1 substitution to change 'a' to 'u' + metrics.levenshtein_distance("cat", "cut") + |> should.equal(1) + + // Requires 2 substitutions (k -> s, e -> i) and 1 insertion (g at the end) + metrics.levenshtein_distance("kitten", "sitting") + |> should.equal(3) + + // Some more complex cases, involving multiple 
insertions, deletions, and substitutions + metrics.levenshtein_distance("gggtatccat", "cctaggtccct") + |> should.equal(6) + + metrics.levenshtein_distance( + "This is a longer string", + "This is also a much longer string", + ) + |> should.equal(10) +} From e135ea33183bf1ca9a2f0786b881fa6ff50f4503 Mon Sep 17 00:00:00 2001 From: NicklasXYZ <18580183+NicklasXYZ@users.noreply.github.com> Date: Wed, 10 Apr 2024 23:11:14 +0200 Subject: [PATCH 2/2] Rename metrics test function --- test/gleam_community/maths/metrics_test.gleam | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gleam_community/maths/metrics_test.gleam b/test/gleam_community/maths/metrics_test.gleam index 8fca446..e5888a0 100644 --- a/test/gleam_community/maths/metrics_test.gleam +++ b/test/gleam_community/maths/metrics_test.gleam @@ -328,7 +328,7 @@ pub fn chebyshev_distance_test() { |> should.equal(Ok(0.0)) } -pub fn edit_distance_test() { +pub fn levenshtein_distance_test() { // Try different types of valid input... // Requires 5 insertions to transform the empty string into "hello"