Skip to content

Commit

Permalink
Merge pull request #314 from paulhoadley/distance-refactor
Browse files Browse the repository at this point in the history
Minor refactoring of ERXStringUtilities.distance(). #312
  • Loading branch information
darkv committed Nov 8, 2012
2 parents a988e11 + b081b45 commit 0ed93d2
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 100 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -82,106 +82,134 @@ public class ERXStringUtilities {
* a single entry for English.
*/
private static NSArray _defaultTargetDisplayLanguages = new NSArray(DEFAULT_TARGET_DISPLAY_LANGUAGE);

/**
* Java port of the distance algorithm.
*
* The code below comes from the following post on http://mail.python.org
* Fuzzy string matching
* Magnus L. Hetland [email protected]
* 27 Aug 1999 15:51:03 +0200
*
* Explanation of the distance algorithm...
*
* The algorithm:
*
* def distance(a,b):
* c = {}
* n = len(a); m = len(b)
*
* for i in range(0,n+1):
* c[i,0] = i
* for j in range(0,m+1):
* c[0,j] = j
*
* for i in range(1,n+1):
* for j in range(1,m+1):
* x = c[i-1,j]+1
* y = c[i,j-1]+1
* if a[i-1] == b[j-1]:
* z = c[i-1,j-1]
* else:
* z = c[i-1,j-1]+1
* c[i,j] = min(x,y,z)
* return c[n,m]
*
* It calculates the following: Given two strings, a and b, and three
* operations, adding, subtracting and exchanging single characters, what
* is the minimal number of steps needed to translate a into b?
*
* The method is based on the following idea:
*
* We want to find the distance between a[:x] and b[:y]. To do this, we
* first calculate
*
* 1) the distance between a[:x-1] and b[:y], adding the cost of a
* subtract-operation, used to get from a[:x] to a[:z-1];
*
* 2) the distance between a[:x] and b[:y-1], adding the cost of an
* addition-operation, used to get from b[:y-1] to b[:y];
*
* 3) the distance between a[:x-1] and b[:y-1], adding the cost of a
* *possible* exchange of the letter b[y] (with a[x]).
*
* The cost of the subtraction and addition operations are 1, while the
* exchange operation has a cost of 1 if a[x] and b[y] are different, and
* 0 otherwise.
*
* After calculating these costs, we choose the least one of them
* (since we want to use the best solution.)
*
* Instead of doing this recursively, i.e. calculating ourselves "back"
* from the final value, we build a cost-matrix c containing the optimal
* costs, so we can reuse them when calculating the later values. The
* costs c[i,0] (from string of length n to empty string) are all i, and
* correspondingly all c[0,j] (from empty string to string of length j)
* are j.
*
* Finally, the cost of translating between the full strings a and b
* (c[n,m]) is returned.
*
* I guess that ought to cover it...
* --------------------------
* @param a first string
* @param b second string
* @return the distance between the two strings
*/

/**
* Returns the <a
* href="http://en.wikipedia.org/wiki/Levenshtein_distance">Levenshtein
* distance</a> between {@code a} and {@code b} as a {@code double}. (This
* method is being retained for backwards compatibility, and will be removed
* at some future point. New code should use
* {@link #levenshteinDistance(String, String)}.)
*
* @param a
* first string
* @param b
* second string
* @return Levenshtein distance between {@code a} and {@code b}
* @deprecated Use {@link #levenshteinDistance(String, String)}, which
* correctly returns an {@code int} result
*/
@Deprecated
public static double distance(String a, String b) {
int n = a.length();
int m = b.length();
int c[][] = new int[n+1][m+1];
for(int i = 0; i<=n; i++){
c[i][0] = i;
}
for(int j = 0; j<=m; j++){
c[0][j] = j;
}
for(int i = 1; i<=n; i++){
for(int j = 1; j<=m; j++){
int x = c[i-1][j] + 1;
int y = c[i][j-1] + 1;
int z = 0;
if(a.charAt(i-1) == b.charAt(j-1))
z = c[i-1][j-1];
else
z = c[i-1][j-1] + 1;
int temp = Math.min(x,y);
c[i][j] = Math.min(z, temp);
}
}
return c[n][m];
return levenshteinDistance(a, b);
}

/**
* <p>
* Returns the <a
* href="http://en.wikipedia.org/wiki/Levenshtein_distance">Levenshtein
* distance</a> between {@code a} and {@code b}. This code is based on <a
* href
* ="http://mail.python.org/pipermail/python-list/1999-August/006031.html"
* >some Python code posted to a mailing list</a> by Magnus L. Hetland
* &lt;[email protected]&gt;, and assumed to be in the public domain.
* </p>
*
* <h3>Algorithm</h3>
*
* <pre>
* <code>def distance(a,b):
* c = {}
* n = len(a); m = len(b)
*
* for i in range(0,n+1):
* c[i,0] = i
* for j in range(0,m+1):
* c[0,j] = j
*
* for i in range(1,n+1):
* for j in range(1,m+1):
* x = c[i-1,j]+1
* y = c[i,j-1]+1
* if a[i-1] == b[j-1]:
* z = c[i-1,j-1]
* else:
* z = c[i-1,j-1]+1
* c[i,j] = min(x,y,z)
* return c[n,m]</code>
* </pre>
*
* <p>
* It calculates the following: Given two strings, {@code a} and {@code b},
* and three operations, adding, subtracting and exchanging single
* characters, what is the minimal number of steps needed to translate
* {@code a} into {@code b}? The method is based on the following idea. We
* want to find the distance between {@code a[:x]} and {@code b[:y]}. To do
* this, we first calculate:
* </p>
*
* <ol>
* <li>the distance between {@code a[:x-1]} and {@code b[:y]}, adding the
* cost of a subtract-operation, used to get from {@code a[:x]} to
* {@code a[:z-1]};</li>
* <li>the distance between {@code a[:x]} and {@code b[:y-1]}, adding the
* cost of an addition-operation, used to get from {@code b[:y-1]} to
* {@code b[:y]};</li>
* <li>the distance between {@code a[:x-1]} and {@code b[:y-1]}, adding the
* cost of a <em>possible</em> exchange of the letter {@code b[y]} (with
* {@code a[x]}).</li>
* </ol>
*
* <p>
* The cost of the subtraction and addition operations are 1, while the
* exchange operation has a cost of 1 if {@code a[x]} and {@code b[y]} are
* different, and 0 otherwise. After calculating these costs, we choose the
* least one of them (since we want to use the best solution.)
* </p>
*
* <p>
* Instead of doing this recursively, i.e. calculating ourselves "back" from
* the final value, we build a cost-matrix {@code c} containing the optimal
* costs, so we can reuse them when calculating the later values. The costs
* {@code c[i,0]} (from string of length {@code n} to empty string) are all
* {@code i}, and correspondingly all {@code c[0,j]} (from empty string to
* string of length {@code j}) are {@code j}. Finally, the cost of
* translating between the full strings {@code a} and {@code b} (
* {@code c[n,m]}) is returned.
* </p>
*
* @param a
* first string
* @param b
* second string
* @return the distance between the two strings
*/
public static int levenshteinDistance(String a, String b) {
int n = a.length();
int m = b.length();
int c[][] = new int[n + 1][m + 1];
for (int i = 0; i <= n; i++) {
c[i][0] = i;
}
for (int j = 0; j <= m; j++) {
c[0][j] = j;
}
for (int i = 1; i <= n; i++) {
for (int j = 1; j <= m; j++) {
int x = c[i - 1][j] + 1;
int y = c[i][j - 1] + 1;
int z = 0;
if (a.charAt(i - 1) == b.charAt(j - 1))
z = c[i - 1][j - 1];
else
z = c[i - 1][j - 1] + 1;
int temp = Math.min(x, y);
c[i][j] = Math.min(z, temp);
}
}
return c[n][m];
}

/** holds the base adjustment for fuzzy matching */
// FIXME: Not thread safe
// MOVEME: Needs to go with the fuzzy matching stuff
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ private class LevenshteinExample {
/**
* Levenshtein distance between {@code s1} and {@code s2}
*/
public double d;
public int d;

/**
* Constructor
Expand All @@ -56,7 +56,7 @@ private class LevenshteinExample {
* @param d
* Levenshtein distance
*/
public LevenshteinExample(String s1, String s2, double d) {
public LevenshteinExample(String s1, String s2, int d) {
this.s1 = s1;
this.s2 = s2;
this.d = d;
Expand Down Expand Up @@ -235,12 +235,23 @@ public void testMaskStringWithCharacter4() {
}

/**
* Tests {@code ERXStringUtilities.distance(String, String)}.
* Tests {@link ERXStringUtilities#distance(String, String)}.
*/
@Test
public void testDistance() {
for (LevenshteinExample l : levs) {
assertEquals(l.d, ERXStringUtilities.distance(l.s1, l.s2), 0.00001);
}
}

/**
* Tests {@link ERXStringUtilities#levenshteinDistance(String, String)}.
*/
@Test
public void testLevenshteinDistance() {
for (LevenshteinExample l : levs) {
assertEquals(l.d,
ERXStringUtilities.levenshteinDistance(l.s1, l.s2));
}
}
}

0 comments on commit 0ed93d2

Please sign in to comment.