|
1 | 1 | module Text.Fuzzy.Levenshtein where |
2 | 2 |
|
3 | | -import Data.Function (fix) |
4 | 3 | import Data.List (sortOn) |
5 | | -import Data.MemoTrie |
| 4 | +import Data.Text (Text) |
6 | 5 | import qualified Data.Text as T |
7 | | -import qualified Data.Text.Array as T |
8 | | -import Data.Text.Internal (Text (..)) |
| 6 | +import Text.EditDistance |
9 | 7 | import Text.Fuzzy.Parallel |
10 | 8 |
|
11 | | --- | Same caveats apply w.r.t. ASCII as in 'Text.Fuzzy.Parallel'. |
12 | | --- Might be worth optimizing this at some point, but it's good enoughᵗᵐ for now |
13 | | -levenshtein :: Text -> Text -> Int |
14 | | -levenshtein a b | T.null a = T.length b |
15 | | -levenshtein a b | T.null b = T.length a |
16 | | -levenshtein (Text aBuf aOff aLen) (Text bBuf bOff bLen) = do |
17 | | - let aTot = aOff + aLen |
18 | | - bTot = bOff + bLen |
19 | | - go' _ (!aIx, !bIx) | aIx >= aTot || bIx >= bTot = max (aTot - aIx) (bTot - bIx) |
20 | | - go' f (!aIx, !bIx) | T.unsafeIndex aBuf aIx == T.unsafeIndex bBuf bIx = f (aIx + 1, bIx + 1) |
21 | | - go' f (!aIx, !bIx) = |
22 | | - minimum |
23 | | - [ 2 + f (aIx + 1, bIx + 1), -- Give substitutions a heavier cost, so multiple typos cost more |
24 | | - 1 + f (aIx + 1, bIx), |
25 | | - 1 + f (aIx, bIx + 1) |
26 | | - ] |
27 | | - go = fix (memo . go') |
28 | | - go (aOff, bOff) |
29 | | - |
30 | 9 | -- | Sort the given list according to it's levenshtein distance relative to the |
31 | 10 | -- given string. |
32 | 11 | levenshteinScored :: Int -> Text -> [Text] -> [Scored Text] |
33 | | -levenshteinScored chunkSize needle haystack = |
| 12 | +levenshteinScored chunkSize needle haystack = do |
| 13 | + let levenshtein = levenshteinDistance $ defaultEditCosts {substitutionCosts=ConstantCost 2} |
34 | 14 | sortOn score $ |
35 | 15 | matchPar chunkSize needle haystack id $ |
36 | | - \a b -> Just $ levenshtein a b |
| 16 | + \a b -> Just $ levenshtein (T.unpack a) (T.unpack b) |
0 commit comments