Skip to content

Commit acb6bf3

Browse files
committed
wip Aho Corasick with HashMap
1 parent 1d3ca55 commit acb6bf3

File tree

3 files changed

+36
-28
lines changed

3 files changed

+36
-28
lines changed

ac-library-hs.cabal

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,16 @@ common warnings
3535

3636
common dependencies
3737
build-depends:
38-
, base >=4.9 && <4.22
39-
, bitvec <1.2
40-
, bytestring <0.14
41-
, containers <0.9
42-
, primitive >=0.6.4.0 && <0.10
43-
, random >=1.2.0 && <1.4
44-
, transformers >=0.2.0.0
45-
, vector >=0.13.0 && <0.14
46-
, vector-algorithms <0.10
47-
, wide-word <0.2
38+
, base >=4.9 && <4.22
39+
, bitvec <1.2
40+
, bytestring <0.14
41+
, primitive >=0.6.4.0 && <0.10
42+
, random >=1.2.0 && <1.4
43+
, transformers >=0.2.0.0
44+
, unordered-containers <0.3
45+
, vector >=0.13.0 && <0.14
46+
, vector-algorithms <0.10
47+
, wide-word <0.2
4848

4949
default-language: GHC2021
5050

src/AtCoder/Extra/AhoCorasick.hs

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@
77
-- >>> import AtCoder.Extra.AhoCorasick qualified as AC
88
-- >>> import Data.Vector.Unboxed qualified as VU
99
-- >>> let patterns = V.fromList [VU.fromList [0, 1], VU.fromList [0, 2], VU.fromList [2, 3]]
10-
-- >>> let ac = AC.build 26 patterns
10+
-- >>> let ac = AC.build patterns
1111
-- >>> AC.size ac
1212
-- 6
13+
--
1314
-- >>> AC.retrieve ac (VU.singleton 2)
1415
-- 4
1516
--
@@ -21,6 +22,7 @@ module AtCoder.Extra.AhoCorasick
2122
next,
2223
nextN,
2324
retrieve,
25+
match,
2426
)
2527
where
2628

@@ -29,7 +31,7 @@ import Control.Monad (when)
2931
import Control.Monad.Fix (fix)
3032
import Control.Monad.ST (runST)
3133
import Data.Foldable (for_)
32-
import Data.IntMap.Strict qualified as IM
34+
import Data.HashMap.Strict qualified as HM
3335
import Data.Maybe (fromJust)
3436
import Data.Vector qualified as V
3537
import Data.Vector.Generic qualified as VG
@@ -50,7 +52,7 @@ data AhoCorasick = AhoCorasick
5052
-- | Vertex -> (Char -> Vertex)
5153
--
5254
-- @since 1.5.3.0
53-
nextAc :: !(V.Vector (IM.IntMap Int)),
55+
nextAc :: !(V.Vector (HM.HashMap Int Int)),
5456
-- | Links to parent vertex.
5557
--
5658
-- @since 1.5.3.0
@@ -61,7 +63,7 @@ data AhoCorasick = AhoCorasick
6163
suffixAc :: !(VU.Vector Int)
6264
}
6365

64-
-- | \(O(\Gamma \sum_i |S_i|)\)
66+
-- | \(O(\sum_i |S_i| \Gamma)\)
6567
--
6668
-- ==== Constraints
6769
-- - \(|S_i| > 0\)
@@ -78,7 +80,7 @@ build patterns
7880
| VG.null patterns =
7981
AhoCorasick
8082
1
81-
(V.singleton IM.empty)
83+
(V.singleton HM.empty)
8284
(VU.replicate 1 0)
8385
(VU.replicate 1 0)
8486
| otherwise =
@@ -113,13 +115,13 @@ next ::
113115
Int
114116
next AhoCorasick {..} v c0 =
115117
let !c' = inner c0
116-
!v' = fromJust $ IM.lookup c' (nextAc VG.! v)
118+
!v' = fromJust $ HM.lookup c' (nextAc VG.! v)
117119
in v'
118120
where
119121
inner c
120122
-- fallback to a suffix
121123
-- TODO: why suffixAc -> Char?
122-
| IM.notMember c (nextAc VG.! v) = inner $! suffixAc VG.! c
124+
| not (HM.member c (nextAc VG.! v)) = inner $! suffixAc VG.! c
123125
| otherwise = c
124126

125127
-- | \(O(|S_i|)\) Applies `next` N times for a given input string.
@@ -162,25 +164,32 @@ retrieve ::
162164
Int
163165
retrieve ac = nextN ac 0
164166

165-
-- | \(O(\Gamma \sum_i |S_i|)\)
167+
-- | \(O(|S_i|)\) TODO
168+
--
169+
-- @since 1.5.3.0
170+
{-# INLINEABLE match #-}
171+
match :: (HasCallStack) => AhoCorasick -> VU.Vector Int -> VU.Vector (Int, Int)
172+
match ac t = VU.empty
173+
174+
-- | \(O(\sum_i |S_i| \Gamma)\)
166175
{-# INLINEABLE buildTrie #-}
167-
buildTrie :: (HasCallStack) => V.Vector (VU.Vector Int) -> (Int, V.Vector (IM.IntMap Int), VU.Vector Int)
176+
buildTrie :: (HasCallStack) => V.Vector (VU.Vector Int) -> (Int, V.Vector (HM.HashMap Int Int), VU.Vector Int)
168177
buildTrie patterns = runST $ do
169178
let !nMaxNodes = (1 +) . V.sum $ V.map VU.length patterns
170179
-- (Vertex, Char) -> Vertex
171-
nextVec <- VM.replicate nMaxNodes IM.empty
180+
nextVec <- VM.replicate nMaxNodes HM.empty
172181
parentVec <- VUM.replicate nMaxNodes (0 :: Int)
173182
nNodesVec <- VUM.replicate 1 (1 :: Int)
174183

175184
VG.forM_ patterns $ \pat -> do
176185
VG.foldM'
177186
( \ !u c -> do
178-
v0 <- IM.lookup c <$> VGM.read nextVec u
187+
v0 <- HM.lookup c <$> VGM.read nextVec u
179188
case v0 of
180189
Nothing -> do
181190
v <- VGM.read nNodesVec 0
182191
VGM.write nNodesVec 0 $! v + 1
183-
VGM.modify nextVec (IM.insert c v) u
192+
VGM.modify nextVec (HM.insert c v) u
184193
VGM.write parentVec v u
185194
pure v
186195
Just v -> pure v
@@ -193,15 +202,15 @@ buildTrie patterns = runST $ do
193202
!parent <- VG.take nNodes <$> VU.unsafeFreeze parentVec
194203
pure (nNodes, next, parent)
195204

196-
-- | \(O(\Gamma \sum_i |S_i|)\)
205+
-- | \(O(\sum_i |S_i| \Gamma)\)
197206
{-# INLINEABLE runBfs #-}
198-
runBfs :: (HasCallStack) => Int -> V.Vector (IM.IntMap Int) -> VU.Vector Int
207+
runBfs :: (HasCallStack) => Int -> V.Vector (HM.HashMap Int Int) -> VU.Vector Int
199208
runBfs nNodes next = VU.create $ do
200209
-- BFS
201210
suffixVec <- VUM.replicate nNodes (0 :: Int)
202211
que <- Q.new @_ @Int nNodes
203212

204-
for_ (IM.elems (next VG.! 0)) $ \v -> do
213+
for_ (HM.elems (next VG.! 0)) $ \v -> do
205214
when (v /= -1) $ do
206215
Q.pushBack que v
207216

@@ -211,12 +220,12 @@ runBfs nNodes next = VU.create $ do
211220
Nothing -> pure ()
212221
Just u -> do
213222
-- visit neighbors
214-
for_ (IM.assocs (next VG.! u)) $ \(!c, !v) -> do
223+
for_ (HM.toList (next VG.! u)) $ \(!c, !v) -> do
215224
Q.pushBack que v
216225
-- find the longest suffix to continue with `c`
217226
flip fix u $ \suffixLoop p -> do
218227
!suf <- VGM.read suffixVec p
219-
case IM.lookup c (next VG.! suf) of
228+
case HM.lookup c (next VG.! suf) of
220229
Just sufC -> do
221230
VGM.write suffixVec v sufC
222231
Nothing

test/Tests/Extra/AhoCorasick.hs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ module Tests.Extra.AhoCorasick where
22

33
import AtCoder.Extra.AhoCorasick qualified as Ac
44
import Data.Vector qualified as V
5-
import Data.Vector.Algorithms.Intro qualified as VAI
65
import Data.Vector.Generic qualified as VG
76
import Data.Vector.Unboxed qualified as VU
87
import Test.Tasty

0 commit comments

Comments
 (0)