// NewStringDiff will create a new instance of StringDiff
func NewStringDiff(s1, s2 string) *StringDiff {
return &StringDiff{
@@ -288,3 +293,140 @@ func (sd *StringDiff) JaroWinklerDistance(p float32) float32 {
return dw
+// DamerauLevenshteinDistance Algorithm is an extension to the Levenshtein
+// Algorithm which solves the edit distance problem between a source string and
+// a target string with the following operations:
+// Read https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
+func DamerauLevenshteinDistance(s1, s2 string) int {
+ sd := NewStringDiff(s1, s2)
+ return sd.DamerauLevenshteinDistance(1,1,1,1)
+// DamerauLevenshteinDistance Algorithm is an extension to the Levenshtein
+// Algorithm which solves the edit distance problem between a source string and
+// a target string with the following operations:
+// - Character Insertion
+// - Character Deletion
+// - Character Replacement
+// - Adjacent Character Swap
+// Note that the adjacent character swap operation is an edit that may be
+// applied when two adjacent characters in the source string match two adjacent
+// characters in the target string, but in reverse order, rather than a general
+// allowance for adjacent character swaps.
+// This implementation allows the client to specify the costs of the various
+// edit operations with the restriction that the cost of two swap operations
+// must not be less than the cost of a delete operation followed by an insert
+// operation. This restriction is required to preclude two swaps involving the
+// same character being required for optimality which, in turn, enables a fast
+// dynamic programming solution.
+// The running time of the Damerau-Levenshtein algorithm is O(n*m) where n is
+// the length of the source string and m is the length of the target string.
+// This implementation consumes O(n*m) space.
+// This code is an adaptation from https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java
+func (sd *StringDiff) DamerauLevenshteinDistance(deleteCost, insertCost,
+ replaceCost, swapCost int) int {
+ if 2 * swapCost < insertCost + deleteCost {
+ panic(fmt.Sprintf("Unsupported cost assignment. Expression 2 * %d(swapCost) < %d(insertCost) + %d(deleteCost) is detected", swapCost, insertCost, deleteCost))
+ }
+ source := []byte(sd.S1)
+ target := []byte(sd.S2)
+ if len(source) == 0 {
+ return len(target) * insertCost
+ }
+ if len(target) == 0 {
+ return len(source) * deleteCost
+ }
+ table := make([][]int, len(source))
+ for i := range table {
+ table[i] = make([]int, len(target))
+ }
+ sourceIndexByCharacter := make(map[byte]int)
+ if source[0] != target[0] {
+ table[0][0] = minInt(replaceCost, deleteCost + insertCost)
+ }
+ sourceIndexByCharacter[source[0]] = 0
+ for i := 1; i < len(source); i++ {
+ deleteDistance := table[i - 1][0] + deleteCost
+ insertDistance := (i + 1) * deleteCost + insertCost
+ ops := replaceCost
+ if source[i] == target[0] {
+ ops = 0
+ }
+ matchDistance := i * deleteCost + ops
+ table[i][0] = minInt(minInt(deleteDistance, insertDistance),
+ matchDistance)
+ }
+ for j := 1; j < len(target); j++ {
+ deleteDistance := (j + 1) * insertCost + deleteCost;
+ insertDistance := table[0][j - 1] + insertCost
+ ops := replaceCost
+ if source[0] == target[j] {
+ ops = 0
+ }
+ matchDistance := j * insertCost + ops
+ table[0][j] = minInt(minInt(deleteDistance, insertDistance),
+ matchDistance)
+ }
+ for i := 1; i < len(source); i++ {
+ maxSourceLetterMatchIndex := -1
+ if source[i] == target[0] {
+ maxSourceLetterMatchIndex = 0
+ }
+ for j := 1; j < len(target); j++ {
+ sourceIndexByCharacterNil := true
+ var candidateSwapIndex int
+ if v, ok := sourceIndexByCharacter[target[j]]; ok {
+ candidateSwapIndex = v
+ sourceIndexByCharacterNil = false
+ }
+ jSwap := maxSourceLetterMatchIndex
+ deleteDistance := table[i - 1][j] + deleteCost
+ insertDistance := table[i][j - 1] + insertCost
+ matchDistance := table[i - 1][j - 1]
+ if source[i] != target[j] {
+ matchDistance += replaceCost
+ } else {
+ maxSourceLetterMatchIndex = j
+ }
+ var swapDistance int
+ if sourceIndexByCharacterNil != true && jSwap != -1 {
+ iSwap := candidateSwapIndex
+ var preSwapCost int
+ if iSwap == 0 && jSwap == 0 {
+ preSwapCost = 0
+ } else {
+ preSwapCost = table[maxInt(0, iSwap - 1)][maxInt(0, jSwap - 1)]
+ }
+ swapDistance = preSwapCost + (i - iSwap - 1) * deleteCost + (j - jSwap - 1) * insertCost + swapCost
+ } else {
+ swapDistance = math.MaxInt32
+ }
+ table[i][j] = minInt(minInt(minInt(deleteDistance, insertDistance), matchDistance), swapDistance)
+ }
+ sourceIndexByCharacter[source[i]] = i
+ }
+ return table[len(source) - 1][len(target) - 1]
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b > a {
		return b
	}
	return a
}
\ No newline at end of file
+func TestDamerauLevenshteinDistance(t *testing.T) {
+ testData := make([]*TestLehvenstein, 0)
+ testData = append(testData, &TestLehvenstein{
+ S1: "abc",
+ S2: "abd",
+ D: 1,
+ }, &TestLehvenstein{
+ S1: "abc",
+ S2: "abc",
+ D: 0,
+ }, &TestLehvenstein{
+ S1: "abc",
+ S2: "ade",
+ D: 2,
+ }, &TestLehvenstein{
+ S1: "abc",
+ S2: "def",
+ D: 3,
+ }, &TestLehvenstein{
+ S1: "abc",
+ S2: "abca",
+ D: 1,
+ }, &TestLehvenstein{
+ S1: "abc",
+ S2: "abcabc",
+ D: 3,
+ }, &TestLehvenstein{
+ S1: "abc",
+ S2: "ab",
+ D: 1,
+ }, &TestLehvenstein{
+ S1: "abc",
+ S2: "",
+ D: 3,
+ }, &TestLehvenstein{ // test swap
+ S1: "abcde",
+ S2: "abced",
+ D: 1,
+ }, &TestLehvenstein{ // test swap
+ S1: "abcde",
+ S2: "ebcda",
+ D: 2,
+ })
+ for _, td := range testData {
+ sd := NewStringDiff(td.S1, td.S2)
+ if sd.DamerauLevenshteinDistance(1,1,1,1) != td.D {
+ t.Error("Distance between", td.S1, "and", td.S2, "expected to", td.D, "but", sd.DamerauLevenshteinDistance(1,1,1,1))
+ }
+ }
fmt.Printf("Distance is %d \n", beda.LevenshteinDistance("abcd", "bc"))
+### Damerau-Levenshtein Distance
+(From [Wikipedia](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance))
+Damerau-Levenshtein Distance is a string metric for measuring the edit distance between two
+sequences. Informally, the Damerau–Levenshtein distance between two words is the minimum
+number of operations (consisting of insertions, deletions or substitutions of a single
+character, or transposition of two adjacent characters) required to change one word into the other.
+The Damerau–Levenshtein distance differs from the classical Levenshtein distance by
+including transpositions among its allowable operations in addition to the three classical
+single-character edit operations (insertions, deletions and substitutions).
+Reading :
+- [https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
+API :
+func DamerauLevenshteinDistance(s1, s2 string) int
+func (sd *StringDiff) DamerauLevenshteinDistance(deleteCost, insertCost, replaceCost, swapCost int) int
+`func DamerauLevenshteinDistance` takes 2 arguments,
+`s1` is the first string to compare
+`s2` is the second string to compare
+The closer the return value is to 0, the more similar the two words are.
+This function uses the default value of 1 for each of `deleteCost`, `insertCost`, `replaceCost` and `swapCost`.
+`func (sd *StringDiff) DamerauLevenshteinDistance` takes 4 arguments,
+`deleteCost` is the multiplier factor for a delete operation
+`insertCost` is the multiplier factor for an insert operation
+`replaceCost` is the multiplier factor for a replace operation
+`swapCost` is the multiplier factor for a swap operation
+The multiplier values let us weight how much each kind of operation
+contributes to the resulting distance.
+The costs must satisfy `2 * swapCost >= insertCost + deleteCost`; otherwise the function panics.
+Example :
+sd := beda.NewStringDiff("abcd", "bc")
+lDist := sd.DamerauLevenshteinDistance(1,1,1,1)
+fmt.Printf("Distance is %d \n", lDist) // prints : Distance is 2
+fmt.Printf("Distance is %d \n", beda.DamerauLevenshteinDistance("abcd", "bc"))
### TriGram Compare
TrigramCompare is a case of n-gram, a contiguous sequence of n (three, in this case) items from a given sample.
