-
Notifications
You must be signed in to change notification settings - Fork 166
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #314 from paulhoadley/distance-refactor
Minor refactoring of ERXStringUtilities.distance(). #312
- Loading branch information
Showing
2 changed files
with
139 additions
and
100 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -82,106 +82,134 @@ public class ERXStringUtilities { | |
* a single entry for English. | ||
*/ | ||
private static NSArray _defaultTargetDisplayLanguages = new NSArray(DEFAULT_TARGET_DISPLAY_LANGUAGE); | ||
|
||
/** | ||
* Java port of the distance algorithm. | ||
* | ||
* The code below comes from the following post on http://mail.python.org | ||
* Fuzzy string matching | ||
* Magnus L. Hetland [email protected] | ||
* 27 Aug 1999 15:51:03 +0200 | ||
* | ||
* Explanation of the distance algorithm... | ||
* | ||
* The algorithm: | ||
* | ||
* def distance(a,b): | ||
* c = {} | ||
* n = len(a); m = len(b) | ||
* | ||
* for i in range(0,n+1): | ||
* c[i,0] = i | ||
* for j in range(0,m+1): | ||
* c[0,j] = j | ||
* | ||
* for i in range(1,n+1): | ||
* for j in range(1,m+1): | ||
* x = c[i-1,j]+1 | ||
* y = c[i,j-1]+1 | ||
* if a[i-1] == b[j-1]: | ||
* z = c[i-1,j-1] | ||
* else: | ||
* z = c[i-1,j-1]+1 | ||
* c[i,j] = min(x,y,z) | ||
* return c[n,m] | ||
* | ||
* It calculates the following: Given two strings, a and b, and three | ||
* operations, adding, subtracting and exchanging single characters, what | ||
* is the minimal number of steps needed to translate a into b? | ||
* | ||
* The method is based on the following idea: | ||
* | ||
* We want to find the distance between a[:x] and b[:y]. To do this, we | ||
* first calculate | ||
* | ||
* 1) the distance between a[:x-1] and b[:y], adding the cost of a | ||
* subtract-operation, used to get from a[:x] to a[:z-1]; | ||
* | ||
* 2) the distance between a[:x] and b[:y-1], adding the cost of an | ||
* addition-operation, used to get from b[:y-1] to b[:y]; | ||
* | ||
* 3) the distance between a[:x-1] and b[:y-1], adding the cost of a | ||
* *possible* exchange of the letter b[y] (with a[x]). | ||
* | ||
* The cost of the subtraction and addition operations are 1, while the | ||
* exchange operation has a cost of 1 if a[x] and b[y] are different, and | ||
* 0 otherwise. | ||
* | ||
* After calculating these costs, we choose the least one of them | ||
* (since we want to use the best solution.) | ||
* | ||
* Instead of doing this recursively, i.e. calculating ourselves "back" | ||
* from the final value, we build a cost-matrix c containing the optimal | ||
* costs, so we can reuse them when calculating the later values. The | ||
* costs c[i,0] (from string of length n to empty string) are all i, and | ||
* correspondingly all c[0,j] (from empty string to string of length j) | ||
* are j. | ||
* | ||
* Finally, the cost of translating between the full strings a and b | ||
* (c[n,m]) is returned. | ||
* | ||
* I guess that ought to cover it... | ||
* -------------------------- | ||
* @param a first string | ||
* @param b second string | ||
* @return the distance between the two strings | ||
*/ | ||
|
||
/** | ||
* Returns the <a | ||
* href="http://en.wikipedia.org/wiki/Levenshtein_distance">Levenshtein | ||
* distance</a> between {@code a} and {@code b} as a {@code double}. (This | ||
* method is being retained for backwards compatibility, and will be removed | ||
* at some future point. New code should use | ||
* {@link #levenshteinDistance(String, String)}.) | ||
* | ||
* @param a | ||
* first string | ||
* @param b | ||
* second string | ||
* @return Levenshtein distance between {@code a} and {@code b} | ||
* @deprecated Use {@link #levenshteinDistance(String, String)}, which | ||
* correctly returns an {@code int} result | ||
*/ | ||
@Deprecated | ||
public static double distance(String a, String b) { | ||
int n = a.length(); | ||
int m = b.length(); | ||
int c[][] = new int[n+1][m+1]; | ||
for(int i = 0; i<=n; i++){ | ||
c[i][0] = i; | ||
} | ||
for(int j = 0; j<=m; j++){ | ||
c[0][j] = j; | ||
} | ||
for(int i = 1; i<=n; i++){ | ||
for(int j = 1; j<=m; j++){ | ||
int x = c[i-1][j] + 1; | ||
int y = c[i][j-1] + 1; | ||
int z = 0; | ||
if(a.charAt(i-1) == b.charAt(j-1)) | ||
z = c[i-1][j-1]; | ||
else | ||
z = c[i-1][j-1] + 1; | ||
int temp = Math.min(x,y); | ||
c[i][j] = Math.min(z, temp); | ||
} | ||
} | ||
return c[n][m]; | ||
return levenshteinDistance(a, b); | ||
} | ||
|
||
/** | ||
* <p> | ||
* Returns the <a | ||
* href="http://en.wikipedia.org/wiki/Levenshtein_distance">Levenshtein | ||
* distance</a> between {@code a} and {@code b}. This code is based on <a | ||
* href | ||
* ="http://mail.python.org/pipermail/python-list/1999-August/006031.html" | ||
* >some Python code posted to a mailing list</a> by Magnus L. Hetland | ||
* <[email protected]>, and assumed to be in the public domain. | ||
* </p> | ||
* | ||
* <h3>Algorithm</h3> | ||
* | ||
* <pre> | ||
* <code>def distance(a,b): | ||
* c = {} | ||
* n = len(a); m = len(b) | ||
* | ||
* for i in range(0,n+1): | ||
* c[i,0] = i | ||
* for j in range(0,m+1): | ||
* c[0,j] = j | ||
* | ||
* for i in range(1,n+1): | ||
* for j in range(1,m+1): | ||
* x = c[i-1,j]+1 | ||
* y = c[i,j-1]+1 | ||
* if a[i-1] == b[j-1]: | ||
* z = c[i-1,j-1] | ||
* else: | ||
* z = c[i-1,j-1]+1 | ||
* c[i,j] = min(x,y,z) | ||
* return c[n,m]</code> | ||
* </pre> | ||
* | ||
* <p> | ||
* It calculates the following: Given two strings, {@code a} and {@code b}, | ||
* and three operations, adding, subtracting and exchanging single | ||
* characters, what is the minimal number of steps needed to translate | ||
* {@code a} into {@code b}? The method is based on the following idea. We | ||
* want to find the distance between {@code a[:x]} and {@code b[:y]}. To do | ||
* this, we first calculate: | ||
* </p> | ||
* | ||
* <ol> | ||
* <li>the distance between {@code a[:x-1]} and {@code b[:y]}, adding the | ||
* cost of a subtract-operation, used to get from {@code a[:x]} to | ||
* {@code a[:z-1]};</li> | ||
* <li>the distance between {@code a[:x]} and {@code b[:y-1]}, adding the | ||
* cost of an addition-operation, used to get from {@code b[:y-1]} to | ||
* {@code b[:y]};</li> | ||
* <li>the distance between {@code a[:x-1]} and {@code b[:y-1]}, adding the | ||
* cost of a <em>possible</em> exchange of the letter {@code b[y]} (with | ||
* {@code a[x]}).</li> | ||
* </ol> | ||
* | ||
* <p> | ||
* The cost of the subtraction and addition operations are 1, while the | ||
* exchange operation has a cost of 1 if {@code a[x]} and {@code b[y]} are | ||
* different, and 0 otherwise. After calculating these costs, we choose the | ||
* least one of them (since we want to use the best solution.) | ||
* </p> | ||
* | ||
* <p> | ||
* Instead of doing this recursively, i.e. calculating ourselves "back" from | ||
* the final value, we build a cost-matrix {@code c} containing the optimal | ||
* costs, so we can reuse them when calculating the later values. The costs | ||
* {@code c[i,0]} (from string of length {@code n} to empty string) are all | ||
* {@code i}, and correspondingly all {@code c[0,j]} (from empty string to | ||
* string of length {@code j}) are {@code j}. Finally, the cost of | ||
* translating between the full strings {@code a} and {@code b} ( | ||
* {@code c[n,m]}) is returned. | ||
* </p> | ||
* | ||
* @param a | ||
* first string | ||
* @param b | ||
* second string | ||
* @return the distance between the two strings | ||
*/ | ||
public static int levenshteinDistance(String a, String b) { | ||
int n = a.length(); | ||
int m = b.length(); | ||
int c[][] = new int[n + 1][m + 1]; | ||
for (int i = 0; i <= n; i++) { | ||
c[i][0] = i; | ||
} | ||
for (int j = 0; j <= m; j++) { | ||
c[0][j] = j; | ||
} | ||
for (int i = 1; i <= n; i++) { | ||
for (int j = 1; j <= m; j++) { | ||
int x = c[i - 1][j] + 1; | ||
int y = c[i][j - 1] + 1; | ||
int z = 0; | ||
if (a.charAt(i - 1) == b.charAt(j - 1)) | ||
z = c[i - 1][j - 1]; | ||
else | ||
z = c[i - 1][j - 1] + 1; | ||
int temp = Math.min(x, y); | ||
c[i][j] = Math.min(z, temp); | ||
} | ||
} | ||
return c[n][m]; | ||
} | ||
|
||
/** holds the base adjustment for fuzzy matching */ | ||
// FIXME: Not thread safe | ||
// MOVEME: Needs to go with the fuzzy matching stuff | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters