From 41e65732f295c396bd913cf0f34b40e724c7df7a Mon Sep 17 00:00:00 2001 From: Paul Hoadley Date: Thu, 8 Nov 2012 17:01:15 +1030 Subject: [PATCH 1/3] Factors distance() implementation into new method. #312 This is really just a cut and paste of the implementation of 'double distance(String, String)' into 'int levenshteinDistance(String, String)', and a replacement of the body of the former method with a call to the latter. (We don't even need a cast to go from int to double.) We also take the opportunity to deprecate distance(), and mark it for potential future removal. Note that distance() still passes all the tests in ERXStringUtilitiesTest.testDistance(). --- .../foundation/ERXStringUtilities.java | 75 ++++++++++++------- 1 file changed, 48 insertions(+), 27 deletions(-) diff --git a/Frameworks/Core/ERExtensions/Sources/er/extensions/foundation/ERXStringUtilities.java b/Frameworks/Core/ERExtensions/Sources/er/extensions/foundation/ERXStringUtilities.java index 0db7f8343ee..b26d11020d0 100644 --- a/Frameworks/Core/ERExtensions/Sources/er/extensions/foundation/ERXStringUtilities.java +++ b/Frameworks/Core/ERExtensions/Sources/er/extensions/foundation/ERXStringUtilities.java @@ -82,7 +82,28 @@ public class ERXStringUtilities { * a single entry for English. */ private static NSArray _defaultTargetDisplayLanguages = new NSArray(DEFAULT_TARGET_DISPLAY_LANGUAGE); - + + /** + * Returns the Levenshtein + * distance between {@code a} and {@code b} as a {@code double}. (This + * method is being retained for backwards compatibility, and will be removed + * at some future point. New code should use + * {@link #levenshteinDistance(String, String)}.) 
+ * + * @param a + * first string + * @param b + * second string + * @return Levenshtein distance between {@code a} and {@code b} + * @deprecated Use {@link #levenshteinDistance(String, String)}, which + * correctly returns an {@code int} result + */ + @Deprecated + public static double distance(String a, String b) { + return levenshteinDistance(a, b); + } + /** * Java port of the distance algorithm. * @@ -156,32 +177,32 @@ public class ERXStringUtilities { * @param b second string * @return the distance between the two strings */ - public static double distance(String a, String b) { - int n = a.length(); - int m = b.length(); - int c[][] = new int[n+1][m+1]; - for(int i = 0; i<=n; i++){ - c[i][0] = i; - } - for(int j = 0; j<=m; j++){ - c[0][j] = j; - } - for(int i = 1; i<=n; i++){ - for(int j = 1; j<=m; j++){ - int x = c[i-1][j] + 1; - int y = c[i][j-1] + 1; - int z = 0; - if(a.charAt(i-1) == b.charAt(j-1)) - z = c[i-1][j-1]; - else - z = c[i-1][j-1] + 1; - int temp = Math.min(x,y); - c[i][j] = Math.min(z, temp); - } - } - return c[n][m]; - } - + public static int levenshteinDistance(String a, String b) { + int n = a.length(); + int m = b.length(); + int c[][] = new int[n + 1][m + 1]; + for (int i = 0; i <= n; i++) { + c[i][0] = i; + } + for (int j = 0; j <= m; j++) { + c[0][j] = j; + } + for (int i = 1; i <= n; i++) { + for (int j = 1; j <= m; j++) { + int x = c[i - 1][j] + 1; + int y = c[i][j - 1] + 1; + int z = 0; + if (a.charAt(i - 1) == b.charAt(j - 1)) + z = c[i - 1][j - 1]; + else + z = c[i - 1][j - 1] + 1; + int temp = Math.min(x, y); + c[i][j] = Math.min(z, temp); + } + } + return c[n][m]; + } + /** holds the base adjustment for fuzzy matching */ // FIXME: Not thread safe // MOVEME: Needs to go with the fuzzy matching stuff From 888ef95db7a6c21df533977618a982bd97dacce7 Mon Sep 17 00:00:00 2001 From: Paul Hoadley Date: Thu, 8 Nov 2012 17:17:57 +1030 Subject: [PATCH 2/3] Adds test for ERXStringUtilities.levenshteinDistance(). 
#312 --- .../foundation/ERXStringUtilitiesTest.java | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/Tests/ERXTest/Sources/er/extensions/foundation/ERXStringUtilitiesTest.java b/Tests/ERXTest/Sources/er/extensions/foundation/ERXStringUtilitiesTest.java index 9701d258ad3..bbf39555853 100644 --- a/Tests/ERXTest/Sources/er/extensions/foundation/ERXStringUtilitiesTest.java +++ b/Tests/ERXTest/Sources/er/extensions/foundation/ERXStringUtilitiesTest.java @@ -44,7 +44,7 @@ private class LevenshteinExample { /** * Levenshtein distance between {@code s1} and {@code s2} */ - public double d; + public int d; /** * Constructor @@ -56,7 +56,7 @@ private class LevenshteinExample { * @param d * Levenshtein distance */ - public LevenshteinExample(String s1, String s2, double d) { + public LevenshteinExample(String s1, String s2, int d) { this.s1 = s1; this.s2 = s2; this.d = d; @@ -235,7 +235,7 @@ public void testMaskStringWithCharacter4() { } /** - * Tests {@code ERXStringUtilities.distance(String, String)}. + * Tests {@link ERXStringUtilities#distance(String, String)}. */ @Test public void testDistance() { @@ -243,4 +243,15 @@ public void testDistance() { assertEquals(l.d, ERXStringUtilities.distance(l.s1, l.s2), 0.00001); } } + + /** + * Tests {@link ERXStringUtilities#levenshteinDistance(String, String)}. + */ + @Test + public void testLevenshteinDistance() { + for (LevenshteinExample l : levs) { + assertEquals(l.d, + ERXStringUtilities.levenshteinDistance(l.s1, l.s2)); + } + } } From b081b450f08f130f9adb6c8041c990c30e6ee74c Mon Sep 17 00:00:00 2001 From: Paul Hoadley Date: Thu, 8 Nov 2012 18:32:28 +1030 Subject: [PATCH 3/3] Improves Javadoc for levenshteinDistance(). 
#312 --- .../foundation/ERXStringUtilities.java | 153 +++++++++--------- 1 file changed, 80 insertions(+), 73 deletions(-) diff --git a/Frameworks/Core/ERExtensions/Sources/er/extensions/foundation/ERXStringUtilities.java b/Frameworks/Core/ERExtensions/Sources/er/extensions/foundation/ERXStringUtilities.java index b26d11020d0..9da4bcf67da 100644 --- a/Frameworks/Core/ERExtensions/Sources/er/extensions/foundation/ERXStringUtilities.java +++ b/Frameworks/Core/ERExtensions/Sources/er/extensions/foundation/ERXStringUtilities.java @@ -104,79 +104,86 @@ public static double distance(String a, String b) { return levenshteinDistance(a, b); } - /** - * Java port of the distance algorithm. - * - * The code below comes from the following post on http://mail.python.org - * Fuzzy string matching - * Magnus L. Hetland mlh@idt.ntnu.no - * 27 Aug 1999 15:51:03 +0200 - * - * Explanation of the distance algorithm... - * - * The algorithm: - * - * def distance(a,b): - * c = {} - * n = len(a); m = len(b) - * - * for i in range(0,n+1): - * c[i,0] = i - * for j in range(0,m+1): - * c[0,j] = j - * - * for i in range(1,n+1): - * for j in range(1,m+1): - * x = c[i-1,j]+1 - * y = c[i,j-1]+1 - * if a[i-1] == b[j-1]: - * z = c[i-1,j-1] - * else: - * z = c[i-1,j-1]+1 - * c[i,j] = min(x,y,z) - * return c[n,m] - * - * It calculates the following: Given two strings, a and b, and three - * operations, adding, subtracting and exchanging single characters, what - * is the minimal number of steps needed to translate a into b? - * - * The method is based on the following idea: - * - * We want to find the distance between a[:x] and b[:y]. 
To do this, we - * first calculate - * - * 1) the distance between a[:x-1] and b[:y], adding the cost of a - * subtract-operation, used to get from a[:x] to a[:z-1]; - * - * 2) the distance between a[:x] and b[:y-1], adding the cost of an - * addition-operation, used to get from b[:y-1] to b[:y]; - * - * 3) the distance between a[:x-1] and b[:y-1], adding the cost of a - * *possible* exchange of the letter b[y] (with a[x]). - * - * The cost of the subtraction and addition operations are 1, while the - * exchange operation has a cost of 1 if a[x] and b[y] are different, and - * 0 otherwise. - * - * After calculating these costs, we choose the least one of them - * (since we want to use the best solution.) - * - * Instead of doing this recursively, i.e. calculating ourselves "back" - * from the final value, we build a cost-matrix c containing the optimal - * costs, so we can reuse them when calculating the later values. The - * costs c[i,0] (from string of length n to empty string) are all i, and - * correspondingly all c[0,j] (from empty string to string of length j) - * are j. - * - * Finally, the cost of translating between the full strings a and b - * (c[n,m]) is returned. - * - * I guess that ought to cover it... - * -------------------------- - * @param a first string - * @param b second string - * @return the distance between the two strings - */ + /** + *

+ * Returns the Levenshtein + * distance between {@code a} and {@code b}. This code is based on some Python code posted to a mailing list by Magnus L. Hetland + * &lt;mlh@idt.ntnu.no&gt;, and assumed to be in the public domain. + *

+ * + *

Algorithm

+ * + *
+	 * def distance(a,b):
+	 *   c = {}
+	 *   n = len(a); m = len(b)
+	 * 
+	 *   for i in range(0,n+1):
+	 *     c[i,0] = i
+	 *   for j in range(0,m+1):
+	 *     c[0,j] = j
+	 * 
+	 *   for i in range(1,n+1):
+	 *     for j in range(1,m+1):
+	 *       x = c[i-1,j]+1
+	 *       y = c[i,j-1]+1
+	 *       if a[i-1] == b[j-1]:
+	 *         z = c[i-1,j-1]
+	 *       else:
+	 *         z = c[i-1,j-1]+1
+	 *       c[i,j] = min(x,y,z)
+	 *   return c[n,m]
+	 * 
+ * + *

+ * It calculates the following: Given two strings, {@code a} and {@code b}, + * and three operations, adding, subtracting and exchanging single + * characters, what is the minimal number of steps needed to translate + * {@code a} into {@code b}? The method is based on the following idea. We + * want to find the distance between {@code a[:x]} and {@code b[:y]}. To do + * this, we first calculate: + *

+ * + *
    + *
  1. the distance between {@code a[:x-1]} and {@code b[:y]}, adding the + * cost of a subtract-operation, used to get from {@code a[:x]} to + * {@code a[:x-1]};
  2. + *
  3. the distance between {@code a[:x]} and {@code b[:y-1]}, adding the + * cost of an addition-operation, used to get from {@code b[:y-1]} to + * {@code b[:y]};
  4. + *
  5. the distance between {@code a[:x-1]} and {@code b[:y-1]}, adding the + * cost of a possible exchange of the letter {@code b[y]} (with + * {@code a[x]}).
  6. + *
+ * + *

+ * The cost of each subtraction and addition operation is 1, while the + * exchange operation has a cost of 1 if {@code a[x]} and {@code b[y]} are + * different, and 0 otherwise. After calculating these costs, we choose the + * least one of them (since we want to use the best solution). + *

+ * + *

+ * Instead of doing this recursively, i.e. calculating ourselves "back" from + * the final value, we build a cost-matrix {@code c} containing the optimal + * costs, so we can reuse them when calculating the later values. The costs + * {@code c[i,0]} (from string of length {@code i} to empty string) are all + * {@code i}, and correspondingly all {@code c[0,j]} (from empty string to + * string of length {@code j}) are {@code j}. Finally, the cost of + * translating between the full strings {@code a} and {@code b} + * ({@code c[n,m]}) is returned. + *

+ * + * @param a + * first string + * @param b + * second string + * @return the distance between the two strings + */ public static int levenshteinDistance(String a, String b) { int n = a.length(); int m = b.length();