diff --git a/README.md b/README.md index ecff68b..0e2c3b2 100644 --- a/README.md +++ b/README.md @@ -44,20 +44,21 @@ Designed to be fully compatible with Unicode characters!
This library is 100% test covered ๐Ÿ˜ ## Features -- [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) ✨ -- [LCS](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) (Longest common subsequence) with edit distance, backtrack and diff functions ✨ -- [Hamming](https://en.wikipedia.org/wiki/Hamming_distance) ✨ -- [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance), with following variants : - - OSA (Optimal string alignment) ✨ - - Adjacent transpositions ✨ -- [Jaro & Jaro-Winkler](https://fr.wikipedia.org/wiki/Distance_de_Jaro-Winkler) similarity algorithms ✨ -- [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) algorithm to compare strings ✨ -- [Jaccard Index](https://en.wikipedia.org/wiki/Jaccard_index) ✨ - -- Computed similarity percentage functions based on all available edit distance algorithms in this lib ✨ -- Fuzzy search functions based on edit distance with unique or multiples strings output ✨ -- Unicode compatibility ! 🥳 -- And many more to come ! 
+
+- [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance)
+- [LCS](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) (Longest common subsequence) with edit distance, backtrack and diff functions
+- [Hamming](https://en.wikipedia.org/wiki/Hamming_distance)
+- [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance), with following variants:
+  - OSA (Optimal string alignment)
+  - Adjacent transpositions
+- [Jaro & Jaro-Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) similarity algorithms
+- [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
+- [Jaccard Index](https://en.wikipedia.org/wiki/Jaccard_index)
+- [QGram](https://en.wikipedia.org/wiki/N-gram)
+- [Sorensen-Dice](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
+- Computed similarity percentage functions based on all available edit distance algorithms in this lib
+- Fuzzy search functions based on edit distance with unique or multiples strings output
+- Unicode compatibility 🥳
 
 ## Benchmarks
 You can check an interactive Google chart with few benchmark cases for all similarity algorithms in this library through **StringsSimilarity** function [here](http://benchgraph.codingberg.com/q5)
diff --git a/qgram.go b/qgram.go
new file mode 100644
index 0000000..118cab5
--- /dev/null
+++ b/qgram.go
@@ -0,0 +1,56 @@
+package edlib
+
+// QgramDistance computes the q-gram distance between two strings: the sum, over every
+// q-gram found in either string, of the absolute difference between its two counts.
+// Takes two strings as parameters, a split length which defines the k-gram shingle length
+func QgramDistance(str1, str2 string, splitLength int) int {
+	return QgramDistanceCustomNgram(Shingle(str1, splitLength), Shingle(str2, splitLength))
+}
+
+// QgramDistanceCustomNgram computes the q-gram distance between two custom n-gram maps.
+// Takes two n-gram maps as parameters (n-gram -> count); an n-gram missing from one
+// of the maps counts as 0 on that side.
+func QgramDistanceCustomNgram(splittedStr1, splittedStr2 map[string]int) int {
+	abs := func(n int) int {
+		if n < 0 {
+			return -n
+		}
+		return n
+	}
+
+	res := 0
+	// N-grams of the first map; a key missing from the second map reads as 0.
+	for gram, count := range splittedStr1 {
+		res += abs(count - splittedStr2[gram])
+	}
+	// N-grams that only exist in the second map.
+	for gram, count := range splittedStr2 {
+		if _, shared := splittedStr1[gram]; !shared {
+			res += abs(count)
+		}
+	}
+
+	return res
+}
+
+// QgramSimilarity computes a similarity index (between 0 and 1) between two strings from a Qgram distance
+// Takes two strings as parameters, a split length which defines the k-gram shingle length.
+// Returns 0 when neither string yields any shingle (the ratio would otherwise be 0/0, i.e. NaN).
+func QgramSimilarity(str1, str2 string, splitLength int) float32 {
+	splittedStr1 := Shingle(str1, splitLength)
+	splittedStr2 := Shingle(str2, splitLength)
+
+	total := len(splittedStr1) + len(splittedStr2)
+	if total == 0 {
+		return 0
+	}
+
+	res := float32(QgramDistanceCustomNgram(splittedStr1, splittedStr2))
+	sim := 1 - (res / float32(total))
+	if sim < 0 {
+		// The distance adds up per-shingle count differences while total only counts
+		// distinct shingles, so counts above 1 can push the ratio past 1: clamp to 0.
+		return 0
+	}
+	return sim
+}
diff --git a/qgram_test.go b/qgram_test.go
new file mode 100644
index 0000000..a566d15
--- /dev/null
+++ b/qgram_test.go
@@ -0,0 +1,35 @@
+package edlib
+
+import (
+	"testing"
+)
+
+func TestQgramDistance(t *testing.T) {
+	type args struct {
+		str1        string
+		str2        string
+		splitLength int
+	}
+	tests := []struct {
+		name string
+		args args
+		want int
+	}{
+		{"Qgram sim 1", args{"Radiohead", "Radiohead", 2}, 0.0},
+		{"Qgram sim 2", args{"ABCD", "ABCE", 2}, 2.0},
+		{"Qgram sim 3", args{"Radiohead", "Carly Rae Jepsen", 2}, 21.0},
+		{"Qgram sim 4", args{"I love horror movies", "Lights out is a horror movie", 2}, 22.0},
+		{"Qgram sim 5", args{"love horror movies", "Lights out horror movie", 2}, 15.0},
+		{"Qgram sim 6", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", 2}, 5},
+		{"Qgram sim 7", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", 2}, 4},
+		{"Qgram sim 8", args{"", "", 2}, 0.0},
+	}
+
+	for _, 
tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := QgramDistance(tt.args.str1, tt.args.str2, tt.args.splitLength); got != tt.want {
+				t.Errorf("QgramDistance() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
diff --git a/sorensen-dice.go b/sorensen-dice.go
new file mode 100644
index 0000000..214f088
--- /dev/null
+++ b/sorensen-dice.go
@@ -0,0 +1,26 @@
+package edlib
+
+// SorensenDiceCoefficient computes the Sorensen-Dice coefficient between two strings
+// Takes two strings as parameters, a split length which defines the k-gram shingle length
+// Returns 0 when both strings are empty or when neither of them yields any shingle.
+func SorensenDiceCoefficient(str1, str2 string, splitLength int) float32 {
+	if str1 == "" && str2 == "" {
+		return 0
+	}
+	shingle1 := Shingle(str1, splitLength)
+	shingle2 := Shingle(str2, splitLength)
+
+	total := len(shingle1) + len(shingle2)
+	if total == 0 {
+		// No shingle on either side: the ratio below would be 0/0, i.e. NaN.
+		return 0
+	}
+
+	intersection := 0
+	for gram := range shingle1 {
+		if _, ok := shingle2[gram]; ok {
+			intersection++
+		}
+	}
+	return 2.0 * float32(intersection) / float32(total)
+}
diff --git a/sorensen-dice_test.go b/sorensen-dice_test.go
new file mode 100644
index 0000000..4c93eae
--- /dev/null
+++ b/sorensen-dice_test.go
@@ -0,0 +1,35 @@
+package edlib
+
+import (
+	"testing"
+)
+
+func TestSorensenDiceCoefficient(t *testing.T) {
+	type args struct {
+		str1        string
+		str2        string
+		splitLength int
+	}
+	tests := []struct {
+		name string
+		args args
+		want float32
+	}{
+		{"SorensenDiceCoefficient 1", args{"night", "nacht", 2}, 0.25},
+		{"SorensenDiceCoefficient 2", args{"Radiohead", "Radiohead", 2}, 1.0},
+		{"SorensenDiceCoefficient 3", args{"", "", 2}, 0.0},
+		{"SorensenDiceCoefficient 4", args{"Radiohead", "Carly Rae Jepsen", 2}, 0.09090909},
+		{"SorensenDiceCoefficient 5", args{"I love horror movies", "Lights out is a horror movie", 2}, 0.52380955},
+		{"SorensenDiceCoefficient 6", args{"love horror movies", "Lights out horror movie", 2}, 0.6111111},
+		{"SorensenDiceCoefficient 7", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", 2}, 0.7619048},
+		
{"SorensenDiceCoefficient 8", args{"๐Ÿ™‚๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„ ๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„", "๐Ÿ™‚๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„ ๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„ ๐Ÿ™‚๐Ÿ˜„๐Ÿ™‚", 2}, 0.8888889}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := SorensenDiceCoefficient(tt.args.str1, tt.args.str2, tt.args.splitLength); got != tt.want { + t.Errorf("SorensenDiceCoefficient() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/string-analysis.go b/string-analysis.go index a8e2795..6f10f82 100644 --- a/string-analysis.go +++ b/string-analysis.go @@ -20,6 +20,8 @@ const ( JaroWinkler Cosine Jaccard + SorensenDice + Qgram ) // StringsSimilarity return a similarity index [0..1] between two strings based on given edit distance algorithm in parameter. @@ -49,6 +51,10 @@ func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error return CosineSimilarity(str1, str2, 2), nil case Jaccard: return JaccardSimilarity(str1, str2, 2), nil + case SorensenDice: + return SorensenDiceCoefficient(str1, str2, 2), nil + case Qgram: + return QgramSimilarity(str1, str2, 2), nil default: return 0.0, errors.New("Illegal argument for algorithm method") } diff --git a/string-analysis_test.go b/string-analysis_test.go index 46ea4d7..374e676 100644 --- a/string-analysis_test.go +++ b/string-analysis_test.go @@ -153,6 +153,42 @@ func TestStringsSimilarity(t *testing.T) { {"Jaccard : Sentence 4", args{"็งใฎๅๅ‰ใฏใ‚ธใƒงใƒณใงใ™", "็งใฎๅๅ‰ใฏใ‚ธใƒงใƒณใƒปใƒ‰ใ‚ฅใงใ™", Jaccard}, 0.61538464, false}, {"Jaccard : Sentence 5", args{"๐Ÿ™‚๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„ ๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„", "๐Ÿ™‚๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„ ๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„ ๐Ÿ™‚๐Ÿ˜„๐Ÿ™‚", Jaccard}, 0.8, false}, + // SorensenDice method + {"SorensenDice : First arg empty", args{"", "abcde", SorensenDice}, 0.0, false}, + {"SorensenDice : Second arg empty", args{"abcde", "", SorensenDice}, 0.0, false}, + {"SorensenDice : Same args", args{"abcde", "abcde", SorensenDice}, 1.0, false}, + {"SorensenDice : No characters match", args{"abcd", "effgghh", SorensenDice}, 0.0, false}, + 
{"SorensenDice : CRATE/TRACE", args{"CRATE", "TRACE", SorensenDice}, 0.25, false}, + {"SorensenDice : MARTHA/MARHTA", args{"MARTHA", "MARHTA", SorensenDice}, 0.4, false}, + {"SorensenDice : DIXON/DICKSONX", args{"DIXON", "DICKSONX", SorensenDice}, 0.36363637, false}, + {"SorensenDice Sentence 1", args{"night", "nacht", SorensenDice}, 0.25, false}, + {"SorensenDice Sentence 2", args{"Radiohead", "Radiohead", SorensenDice}, 1.0, false}, + {"SorensenDice Sentence 3", args{"", "", SorensenDice}, 0.0, false}, + {"SorensenDice Sentence 4", args{"Radiohead", "Carly Rae Jepsen", SorensenDice}, 0.09090909, false}, + {"SorensenDice Sentence 5", args{"I love horror movies", "Lights out is a horror movie", SorensenDice}, 0.52380955, false}, + {"SorensenDice Sentence 6", args{"love horror movies", "Lights out horror movie", SorensenDice}, 0.6111111, false}, + {"SorensenDice Sentence 7", args{"็งใฎๅๅ‰ใฏใ‚ธใƒงใƒณใงใ™", "็งใฎๅๅ‰ใฏใ‚ธใƒงใƒณใƒปใƒ‰ใ‚ฅใงใ™", SorensenDice}, 0.7619048, false}, + {"SorensenDice Sentence 8", args{"๐Ÿ™‚๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„ ๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„", "๐Ÿ™‚๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„ ๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„ ๐Ÿ™‚๐Ÿ˜„๐Ÿ™‚", SorensenDice}, 0.8888889, false}, + + // Qgram method + {"Qgram: First arg empty", args{"", "abcde", Qgram}, 0.0, false}, + {"Qgram : Second arg empty", args{"abcde", "", Qgram}, 0.0, false}, + {"Qgram : Same args", args{"abcde", "abcde", Qgram}, 1.0, false}, + {"Qgram : No characters match", args{"abcd", "effgghh", Qgram}, 0.0, false}, + {"Qgram : CRATE/TRACE", args{"CRATE", "TRACE", Qgram}, 0.25, false}, + {"Qgram : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Qgram}, 0.39999998, false}, + {"Qgram : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Qgram}, 0.36363637, false}, + {"Qgram Sentence 1", args{"Radiohead", "Radiohead", Qgram}, 1.0, false}, + {"Qgram Sentence 2", args{"ABCD", "ABCE", Qgram}, 0.6666666, false}, + {"Qgram Sentence 3", args{"Radiohead", "Carly Rae Jepsen", Qgram}, 0.04545456, false}, + {"Qgram Sentence 4", args{"I love horror movies", "Lights out is a 
horror movie", Qgram}, 0.47619045, false}, + {"Qgram Sentence 5", args{"love horror movies", "Lights out horror movie", Qgram}, 0.5833334, false}, + {"Qgram Sentence 6", args{"็งใฎๅๅ‰ใฏใ‚ธใƒงใƒณใงใ™", "็งใฎๅๅ‰ใฏใ‚ธใƒงใƒณใƒปใƒ‰ใ‚ฅใงใ™", Qgram}, 0.7619048, false}, + {"Qgram Sentence 7", args{"๐Ÿ™‚๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„ ๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„", "๐Ÿ™‚๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„ ๐Ÿ˜„๐Ÿ™‚๐Ÿ˜„ ๐Ÿ™‚๐Ÿ˜„๐Ÿ™‚", Qgram}, 0.5555556, false}, + + // TODO: Must refactor compare method to handle NaN values + // {"Qgram Sentence 8", args{"", "", Qgram}, float32(math.NaN()), false}, + // Illegal argument error {"Undefined integer value for method", args{"abc", "abcde", 42}, 0.0, true}, }