From be2829850fc605b96ebc373b94e2b34e7a86a435 Mon Sep 17 00:00:00 2001 From: Tom Crouch Date: Sun, 8 Oct 2017 19:02:38 +0100 Subject: [PATCH] add bounded version of restricted edit distance --- README.md | 10 +- lib/edits.rb | 1 + lib/edits/compare.rb | 36 ++++++++ lib/edits/levenshtein.rb | 32 +------ lib/edits/restricted_edit.rb | 81 ++++++++++++++++ spec/edits/damerau_levenshtein_spec.rb | 2 +- spec/edits/levenshtein_spec.rb | 2 +- spec/edits/restricted_edit_spec.rb | 122 ++++++++++++++++++++----- tasks/benchmark/levenshtein.rake | 61 ++++++++++++- 9 files changed, 289 insertions(+), 58 deletions(-) create mode 100644 lib/edits/compare.rb diff --git a/README.md b/README.md index 10e3e43..e021d76 100644 --- a/README.md +++ b/README.md @@ -42,8 +42,8 @@ Edits::Levenshtein.distance "acer", "earn" # Max distance Edits::Levenshtein.distance_with_max "iota", "atom", 2 # => 2 -Edits::Levenshtein.most_similar "atom", %w[tram atlas rota racer] -# => "atlas" +Edits::Levenshtein.most_similar "atom", %w[tree rota toes racer] +# => "toes" ``` ### Restricted Edit (Optimal Alignment) @@ -59,6 +59,12 @@ Edits::RestrictedEdit.distance "iota", "atom" # => 3 Edits::RestrictedEdit.distance "acer", "earn" # => 4 + +# Max distance +Edits::RestrictedEdit.distance_with_max "iota", "atom", 2 +# => 2 +Edits::RestrictedEdit.most_similar "atom", %w[tree rota toes racer] +# => "rota" ``` ### Damerau-Levenshtein diff --git a/lib/edits.rb b/lib/edits.rb index 6262213..8bcdb9b 100644 --- a/lib/edits.rb +++ b/lib/edits.rb @@ -2,6 +2,7 @@ require "edits/version" +require "edits/compare" require "edits/damerau_levenshtein" require "edits/hamming" require "edits/jaro" diff --git a/lib/edits/compare.rb b/lib/edits/compare.rb new file mode 100644 index 0000000..c24727b --- /dev/null +++ b/lib/edits/compare.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +module Edits + # Comparison helpers + module Compare + # Given a prototype string and an array of strings, determines which + # string is most 
similar to the prototype. + # + # `most_similar("foo", strings)` is functionally equivalent to + # `strings.min_by { |s| distance("foo", s) }`, leveraging + # {.distance_with_max}. + # + # @example + # most_similar("atom", %w[tram atlas rota racer]) + # # => "tram" + # @param prototype [String] + # @param strings [Array<String>] + # @return [String, nil] most similar string, or nil for empty array + def most_similar(prototype, strings) + return nil if strings.empty? + min_s = strings[0] + min_d = distance(prototype, min_s) + + strings[1..-1].each do |s| + return min_s if min_d.zero? + d = distance_with_max(prototype, s, min_d) + if d < min_d + min_d = d + min_s = s + end + end + + min_s + end + end +end diff --git a/lib/edits/levenshtein.rb b/lib/edits/levenshtein.rb index 572c199..47defd6 100644 --- a/lib/edits/levenshtein.rb +++ b/lib/edits/levenshtein.rb @@ -8,6 +8,8 @@ module Edits # * Deletion # * Substitution module Levenshtein + extend Compare + # Calculate the Levenshtein (edit) distance of two sequences. # # @note A true distance metric, satisfies triangle inequality. @@ -125,35 +127,5 @@ def self.distance_with_max(seq1, seq2, max) last_row[cols] > max ? max : last_row[cols] end - - # Given a prototype string and an array of strings, determines which - # string is most similar to the prototype. - # - # `Levenshtein.most_similar("foo", strings)` is functionally equivalent to - # `strings.min_by { |s| Levenshtein.distance("foo", s) }`, leveraging - # {.distance_with_max}. - # - # @example - # Edits::Levenshtein.most_similar("atom", %w[tram atlas rota racer]) - # # => "atlas" - # @param prototype [String] - # @param strings [] - # @return [String, nil] most similar string, or nil for empty array - def self.most_similar(prototype, strings) - return nil if strings.empty? - min_s = strings[0] - min_d = distance(prototype, min_s) - - strings[1..-1].each do |s| - return min_s if min_d.zero? 
- d = distance_with_max(prototype, s, min_d) - if d < min_d - min_d = d - min_s = s - end - end - - min_s - end end end diff --git a/lib/edits/restricted_edit.rb b/lib/edits/restricted_edit.rb index 3a2cd6f..bcd1c26 100644 --- a/lib/edits/restricted_edit.rb +++ b/lib/edits/restricted_edit.rb @@ -13,6 +13,8 @@ module Edits # This variant is restricted by the condition that no sub-string is edited # more than once. module RestrictedEdit + extend Compare + # Calculate the Restricted Damerau-Levenshtein distance (Optimal Alignment) # of two sequences. # @@ -82,5 +84,84 @@ def self.distance(seq1, seq2) curr_row[cols] end + + # Calculate the Restricted Damerau-Levenshtein distance (Optimal Alignment) + # of two sequences, bounded by a maximum value. + # + # @example + # Edits::RestrictedEdit.distance("cloud", "crayon") + # # => 5 + # Edits::RestrictedEdit.distance_with_max("cloud", "crayon", 2) + # # => 2 + # @param seq1 [String, Array] + # @param seq2 [String, Array] + # @param max [Integer] maximum distance + # @return [Integer] + def self.distance_with_max(seq1, seq2, max) + seq1, seq2 = seq2, seq1 if seq1.length > seq2.length + + rows = seq1.length + cols = seq2.length + return cols > max ? max : cols if rows.zero? + return rows > max ? max : rows if cols.zero? + return max if (cols - rows) >= max + + # array of codepoints outperforms String + seq1 = seq1.codepoints if seq1.is_a? String + seq2 = seq2.codepoints if seq2.is_a? String + + # 'infinite' edit distance for padding cost matrix. + # Can be any value > max[rows, cols] + inf = cols + 1 + + # retain previous two rows of cost matrix, + # padded with "inf" as matrix is not fully evaluated + lastlast_row = Array.new(inf, inf) + last_row = Array.new(inf, inf) + curr_row = 0.upto(cols).to_a + + rows.times do |row| + # rotate row arrays + curr_row, last_row, lastlast_row = lastlast_row, curr_row, last_row + + # Ukkonen cut-off + min_col = row > max ? 
row - max : 0 + max_col = row + max + max_col = cols - 1 if max_col > cols - 1 + + curr_row[min_col] = min_col.zero? ? row + 1 : inf + seq1_item = seq1[row] + diagonal = cols - rows + row + + min_col.upto(max_col) do |col| + return max if diagonal == col && last_row[col] >= max + + sub_cost = seq1_item == seq2[col] ? 0 : 1 + is_swap = sub_cost.positive? && + row.positive? && col.positive? && + seq1_item == seq2[col - 1] && + seq1[row - 1] == seq2[col] + + # | Xt | | | + # | | Xs | Xd | + # | | Xi | ? | + # substitution, deletion, insertion, transposition + cost = [ + last_row[col] + sub_cost, + last_row[col + 1] + 1, + curr_row[col] + 1 + ].min + + if is_swap + swap = lastlast_row[col - 1] + 1 + cost = swap if swap < cost + end + + curr_row[col + 1] = cost + end + end + + curr_row[cols] > max ? max : curr_row[cols] + end end end diff --git a/spec/edits/damerau_levenshtein_spec.rb b/spec/edits/damerau_levenshtein_spec.rb index 1978a92..78f9ceb 100644 --- a/spec/edits/damerau_levenshtein_spec.rb +++ b/spec/edits/damerau_levenshtein_spec.rb @@ -10,7 +10,7 @@ include_examples "levenshtein" [ - # swaps + # simple transpositions ["a cat", "an act", 2], ["abc", "acb", 1], ["abc", "bac", 1], diff --git a/spec/edits/levenshtein_spec.rb b/spec/edits/levenshtein_spec.rb index 51a0f20..45d010c 100644 --- a/spec/edits/levenshtein_spec.rb +++ b/spec/edits/levenshtein_spec.rb @@ -5,7 +5,7 @@ RSpec.describe Edits::Levenshtein do cases = [ - # swaps + # simple transpositions ["a cat", "an act", 3], ["abc", "acb", 2], ["abc", "bac", 2], diff --git a/spec/edits/restricted_edit_spec.rb b/spec/edits/restricted_edit_spec.rb index ae0cad6..6644983 100644 --- a/spec/edits/restricted_edit_spec.rb +++ b/spec/edits/restricted_edit_spec.rb @@ -4,34 +4,36 @@ require "edits/levenshtein_shared" RSpec.describe Edits::RestrictedEdit do + cases = [ + # simple transpositions + ["a cat", "an act", 2], + ["abc", "acb", 1], + ["abc", "bac", 1], + ["abcdef", "abcdfe", 1], + ["abcdefghij", "acbdegfhji", 
3], + ["acre", "acer", 1], + ["art", "ran", 2], + ["caned", "acned", 1], + ["iota", "atom", 3], + ["minion", "noir", 4], + + # complex transpositions + ["a cat", "a tc", 3], + ["a cat", "an abct", 4], + ["acer", "earn", 4], + ["craned", "read", 4], + ["information", "informant", 4], + ["raced", "dear", 5], + ["roam", "art", 4], + ["tram", "rota", 4] + ] + describe ".distance" do subject { described_class.distance a, b } include_examples "levenshtein" - [ - # swaps - ["a cat", "an act", 2], - ["abc", "acb", 1], - ["abc", "bac", 1], - ["abcdef", "abcdfe", 1], - ["abcdefghij", "acbdegfhji", 3], - ["acre", "acer", 1], - ["art", "ran", 2], - ["caned", "acned", 1], - ["iota", "atom", 3], - ["minion", "noir", 4], - - # complex transpositions - ["a cat", "a tc", 3], - ["a cat", "an abct", 4], - ["acer", "earn", 4], - ["craned", "read", 4], - ["information", "informant", 4], - ["raced", "dear", 5], - ["roam", "art", 4], - ["tram", "rota", 4] - ].each do |(a, b, distance)| + cases.each do |(a, b, distance)| context "with '#{a}', '#{b}'" do let(:a) { a } let(:b) { b } @@ -40,4 +42,78 @@ end end end + + describe ".distance_with_max" do + subject { described_class.distance_with_max a, b, max } + + context "when max is 100" do + let(:max) { 100 } + + include_examples "levenshtein" + + cases.each do |(a, b, distance)| + context "with '#{a}', '#{b}'" do + let(:a) { a } + let(:b) { b } + + it { is_expected.to eq distance } + end + end + end + + context "when max is 4" do + let(:max) { 4 } + + cases.each do |(a, b, distance)| + context "with '#{a}', '#{b}'" do + let(:a) { a } + let(:b) { b } + + it { is_expected.to eq(distance > max ? 
max : distance) } + end + end + + context "with '', 'abcdfe'" do + let(:a) { "" } + let(:b) { "abcdfe" } + + it { is_expected.to eq max } + end + + context "with 'abcdfe', ''" do + let(:a) { "abcdfe" } + let(:b) { "" } + + it { is_expected.to eq max } + end + end + end + + describe ".most_similar" do + let(:prototype) { "atom" } + + subject { described_class.most_similar prototype, words } + + context "with empty array" do + let(:words) { [] } + + it { is_expected.to be_nil } + end + + context "when a single word has the lowest distance" do + let(:words) { %w[light at atlas beer iota train] } + + it "returns the word with lowest distance from prototype" do + expect(subject).to eq "at" + end + end + + context "when two words share the lowest distance" do + let(:words) { %w[light beer iota train] } + + it "returns the first with lowest distance from prototype" do + expect(subject).to eq "iota" + end + end + end end diff --git a/tasks/benchmark/levenshtein.rake b/tasks/benchmark/levenshtein.rake index 6c07d46..dbeea88 100644 --- a/tasks/benchmark/levenshtein.rake +++ b/tasks/benchmark/levenshtein.rake @@ -5,7 +5,7 @@ require "benchmark/ips" require "edits" namespace :benchmark do - desc "distance vs. distance_with_max (x100)" + desc "levenshtein distance vs. distance_with_max (x100)" task :lev_max do words = File.read("/usr/share/dict/words") .split(/\n/).compact.shuffle(random: Random.new(1)) @@ -64,6 +64,65 @@ namespace :benchmark do end end + desc "restricted distance vs. 
distance_with_max (x100)" + task :restricted_max do + words = File.read("/usr/share/dict/words") + .split(/\n/).compact.shuffle(random: Random.new(1)) + .take(101) + + Benchmark.ips do |x| + x.report("distance") do + words.each_cons(2) do |a, b| + Edits::RestrictedEdit.distance a, b + end + end + + x.report("with max 1") do + words.each_cons(2) do |a, b| + Edits::RestrictedEdit.distance_with_max a, b, 1 + end + end + + x.report("with max 2") do + words.each_cons(2) do |a, b| + Edits::RestrictedEdit.distance_with_max a, b, 2 + end + end + + x.report("with max 3") do + words.each_cons(2) do |a, b| + Edits::RestrictedEdit.distance_with_max a, b, 3 + end + end + + x.report("with max 4") do + words.each_cons(2) do |a, b| + Edits::RestrictedEdit.distance_with_max a, b, 4 + end + end + + x.report("with max 6") do + words.each_cons(2) do |a, b| + Edits::RestrictedEdit.distance_with_max a, b, 6 + end + end + + x.report("with max 8") do + words.each_cons(2) do |a, b| + Edits::RestrictedEdit.distance_with_max a, b, 8 + end + end + + x.report("with max 50") do + words.each_cons(2) do |a, b| + Edits::RestrictedEdit.distance_with_max a, b, 100 + end + end + + x.compare! + end + end + desc "most_similar vs. min_by (100 words)" task :lev_similar do words = File.read("/usr/share/dict/words")