Skip to content

Commit

Permalink
add bounded version of restricted edit distance
Browse files Browse the repository at this point in the history
  • Loading branch information
tcrouch committed Oct 8, 2017
1 parent f19e792 commit be28298
Show file tree
Hide file tree
Showing 9 changed files with 289 additions and 58 deletions.
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ Edits::Levenshtein.distance "acer", "earn"
# Max distance
Edits::Levenshtein.distance_with_max "iota", "atom", 2
# => 2
Edits::Levenshtein.most_similar "atom", %w[tram atlas rota racer]
# => "atlas"
Edits::Levenshtein.most_similar "atom", %w[tree rota toes racer]
# => "toes"
```

### Restricted Edit (Optimal Alignment)
Expand All @@ -59,6 +59,12 @@ Edits::RestrictedEdit.distance "iota", "atom"
# => 3
Edits::RestrictedEdit.distance "acer", "earn"
# => 4

# Max distance
Edits::RestrictedEdit.distance_with_max "iota", "atom", 2
# => 2
Edits::RestrictedEdit.most_similar "atom", %w[tree rota toes racer]
# => "rota"
```

### Damerau-Levenshtein
Expand Down
1 change: 1 addition & 0 deletions lib/edits.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

require "edits/version"

require "edits/compare"
require "edits/damerau_levenshtein"
require "edits/hamming"
require "edits/jaro"
Expand Down
36 changes: 36 additions & 0 deletions lib/edits/compare.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# frozen_string_literal: true

module Edits
  # Comparison helpers.
  #
  # Meant to be added to a distance module with `extend`. The host module
  # must respond to `distance(seq1, seq2)` and
  # `distance_with_max(seq1, seq2, max)`, both of which are called here with
  # an implicit receiver.
  module Compare
    # Given a prototype string and an array of strings, determines which
    # string is most similar to the prototype.
    #
    # `most_similar("foo", strings)` is functionally equivalent to
    # `strings.min_by { |s| distance("foo", s) }`, leveraging
    # {.distance_with_max}.
    #
    # When several strings share the lowest distance, the one that appears
    # first in `strings` is returned.
    #
    # @example Using Levenshtein distance
    #   Edits::Levenshtein.most_similar("atom", %w[tree rota toes racer])
    #   # => "toes"
    # @param prototype [String]
    # @param strings [<String>]
    # @return [String, nil] most similar string, or nil for empty array
    def most_similar(prototype, strings)
      return nil if strings.empty?

      min_s = strings[0]
      min_d = distance(prototype, min_s)

      # Visit the remaining candidates by index so the array is not copied.
      1.upto(strings.length - 1) do |index|
        # an exact match cannot be improved upon
        return min_s if min_d.zero?

        s = strings[index]
        # the best distance so far serves as the bound for the next search
        d = distance_with_max(prototype, s, min_d)

        # strict comparison keeps the earliest string on ties
        if d < min_d
          min_d = d
          min_s = s
        end
      end

      min_s
    end
  end
end
32 changes: 2 additions & 30 deletions lib/edits/levenshtein.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ module Edits
# * Deletion
# * Substitution
module Levenshtein
extend Compare

# Calculate the Levenshtein (edit) distance of two sequences.
#
# @note A true distance metric, satisfies triangle inequality.
Expand Down Expand Up @@ -125,35 +127,5 @@ def self.distance_with_max(seq1, seq2, max)

last_row[cols] > max ? max : last_row[cols]
end

# Picks, from an array of strings, the one with the lowest Levenshtein
# distance to a prototype string.
#
# The result matches
# `strings.min_by { |s| Levenshtein.distance(prototype, s) }`, but each
# candidate is measured with {.distance_with_max}, bounded by the best
# distance found so far. On a tie the earliest string is kept.
#
# @example
#   Edits::Levenshtein.most_similar("atom", %w[tree rota toes racer])
#   # => "toes"
# @param prototype [String]
# @param strings [<String>]
# @return [String, nil] most similar string, or nil for empty array
def self.most_similar(prototype, strings)
  return nil if strings.empty?

  closest = strings[0]
  lowest = distance(prototype, closest)

  strings[1..-1].each do |candidate|
    # nothing can beat an exact match
    return closest if lowest.zero?

    bounded = distance_with_max(prototype, candidate, lowest)
    next unless bounded < lowest

    lowest = bounded
    closest = candidate
  end

  closest
end
end
end
81 changes: 81 additions & 0 deletions lib/edits/restricted_edit.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ module Edits
# This variant is restricted by the condition that no sub-string is edited
# more than once.
module RestrictedEdit
extend Compare

# Calculate the Restricted Damerau-Levenshtein distance (Optimal Alignment)
# of two sequences.
#
Expand Down Expand Up @@ -82,5 +84,84 @@ def self.distance(seq1, seq2)

curr_row[cols]
end

# Calculate the Restricted Damerau-Levenshtein distance (Optimal Alignment)
# of two sequences, bounded by a maximum value.
#
# Only a diagonal band of the cost matrix is evaluated (at most `max`
# columns either side of the current row), and evaluation stops early once
# the result is known to reach `max`.
#
# @example
#   Edits::RestrictedEdit.distance("cloud", "crayon")
#   # => 5
#   Edits::RestrictedEdit.distance_with_max("cloud", "crayon", 2)
#   # => 2
# @param seq1 [String, Array]
# @param seq2 [String, Array]
# @param max [Integer] maximum distance
# @return [Integer] the distance, or `max` when the distance exceeds it
def self.distance_with_max(seq1, seq2, max)
  # make seq1 the shorter sequence, so that rows <= cols from here on
  seq1, seq2 = seq2, seq1 if seq1.length > seq2.length

  rows = seq1.length
  cols = seq2.length
  # As rows <= cols, an empty seq2 implies an empty seq1; this single guard
  # therefore covers every case involving an empty sequence.
  return cols > max ? max : cols if rows.zero?
  # the difference in length is a lower bound on the distance
  return max if (cols - rows) >= max

  # array of codepoints outperforms String
  seq1 = seq1.codepoints if seq1.is_a? String
  seq2 = seq2.codepoints if seq2.is_a? String

  # 'infinite' edit distance for padding cost matrix.
  # Can be any value > max[rows, cols]
  inf = cols + 1

  # retain previous two rows of cost matrix,
  # padded with "inf" as matrix is not fully evaluated
  lastlast_row = Array.new(inf, inf)
  last_row = Array.new(inf, inf)
  curr_row = 0.upto(cols).to_a

  rows.times do |row|
    # rotate row arrays
    curr_row, last_row, lastlast_row = lastlast_row, curr_row, last_row

    # Ukkonen cut-off
    min_col = row > max ? row - max : 0
    max_col = row + max
    max_col = cols - 1 if max_col > cols - 1

    # left edge of the band: the true first-column cost when the band
    # starts at column 0, otherwise padding
    curr_row[min_col] = min_col.zero? ? row + 1 : inf
    seq1_item = seq1[row]
    # column of this row that lies on the diagonal ending in the final cell
    diagonal = cols - rows + row

    min_col.upto(max_col) do |col|
      # stop once the cell preceding this one on that diagonal reaches max
      return max if diagonal == col && last_row[col] >= max

      sub_cost = seq1_item == seq2[col] ? 0 : 1
      # adjacent items are swapped between the two sequences
      is_swap = sub_cost.positive? &&
                row.positive? && col.positive? &&
                seq1_item == seq2[col - 1] &&
                seq1[row - 1] == seq2[col]

      # | Xt |    |    |
      # |    | Xs | Xd |
      # |    | Xi | ?  |
      # substitution, deletion, insertion, transposition
      cost = [
        last_row[col] + sub_cost,
        last_row[col + 1] + 1,
        curr_row[col] + 1
      ].min

      if is_swap
        swap = lastlast_row[col - 1] + 1
        cost = swap if swap < cost
      end

      curr_row[col + 1] = cost
    end
  end

  curr_row[cols] > max ? max : curr_row[cols]
end
end
end
2 changes: 1 addition & 1 deletion spec/edits/damerau_levenshtein_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
include_examples "levenshtein"

[
# swaps
# simple transpositions
["a cat", "an act", 2],
["abc", "acb", 1],
["abc", "bac", 1],
Expand Down
2 changes: 1 addition & 1 deletion spec/edits/levenshtein_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

RSpec.describe Edits::Levenshtein do
cases = [
# swaps
# simple transpositions
["a cat", "an act", 3],
["abc", "acb", 2],
["abc", "bac", 2],
Expand Down
122 changes: 99 additions & 23 deletions spec/edits/restricted_edit_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,36 @@
require "edits/levenshtein_shared"

RSpec.describe Edits::RestrictedEdit do
cases = [
# simple transpositions
["a cat", "an act", 2],
["abc", "acb", 1],
["abc", "bac", 1],
["abcdef", "abcdfe", 1],
["abcdefghij", "acbdegfhji", 3],
["acre", "acer", 1],
["art", "ran", 2],
["caned", "acned", 1],
["iota", "atom", 3],
["minion", "noir", 4],

# complex transpositions
["a cat", "a tc", 3],
["a cat", "an abct", 4],
["acer", "earn", 4],
["craned", "read", 4],
["information", "informant", 4],
["raced", "dear", 5],
["roam", "art", 4],
["tram", "rota", 4]
]

describe ".distance" do
subject { described_class.distance a, b }

include_examples "levenshtein"

[
# swaps
["a cat", "an act", 2],
["abc", "acb", 1],
["abc", "bac", 1],
["abcdef", "abcdfe", 1],
["abcdefghij", "acbdegfhji", 3],
["acre", "acer", 1],
["art", "ran", 2],
["caned", "acned", 1],
["iota", "atom", 3],
["minion", "noir", 4],

# complex transpositions
["a cat", "a tc", 3],
["a cat", "an abct", 4],
["acer", "earn", 4],
["craned", "read", 4],
["information", "informant", 4],
["raced", "dear", 5],
["roam", "art", 4],
["tram", "rota", 4]
].each do |(a, b, distance)|
cases.each do |(a, b, distance)|
context "with '#{a}', '#{b}'" do
let(:a) { a }
let(:b) { b }
Expand All @@ -40,4 +42,78 @@
end
end
end

# Bounded distance: with a generous bound the result must equal the plain
# distance; with a tight bound the result is capped at the bound.
describe ".distance_with_max" do
  subject { described_class.distance_with_max(a, b, max) }

  context "when max is 100" do
    let(:max) { 100 }

    include_examples "levenshtein"

    # the bound exceeds every expected distance, so nothing is capped
    cases.each do |(first, second, expected)|
      context "with '#{first}', '#{second}'" do
        let(:a) { first }
        let(:b) { second }

        it { is_expected.to eq expected }
      end
    end
  end

  context "when max is 4" do
    let(:max) { 4 }

    # expected distances above the bound are reported as the bound
    cases.each do |(first, second, expected)|
      context "with '#{first}', '#{second}'" do
        let(:a) { first }
        let(:b) { second }

        it { is_expected.to eq [expected, max].min }
      end
    end

    # an empty sequence on either side, with the other longer than max
    [["", "abcdfe"], ["abcdfe", ""]].each do |(first, second)|
      context "with '#{first}', '#{second}'" do
        let(:a) { first }
        let(:b) { second }

        it { is_expected.to eq max }
      end
    end
  end
end

# Selection of the closest word to a fixed prototype.
describe ".most_similar" do
  subject { described_class.most_similar(prototype, words) }

  let(:prototype) { "atom" }

  context "with empty array" do
    let(:words) { [] }

    it { is_expected.to be_nil }
  end

  context "when a single word has the lowest distance" do
    let(:words) { %w[light at atlas beer iota train] }

    it("returns the word with lowest distance from prototype") do
      is_expected.to eq "at"
    end
  end

  # ties are resolved in favour of the earliest word
  context "when two words share the lowest distance" do
    let(:words) { %w[light beer iota train] }

    it("returns the first with lowest distance from prototype") do
      is_expected.to eq "iota"
    end
  end
end
end
Loading

0 comments on commit be28298

Please sign in to comment.