Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
hkspirt authored Sep 13, 2018
1 parent 14bdc36 commit 4d38a1b
Show file tree
Hide file tree
Showing 4 changed files with 10,564 additions and 0 deletions.
136 changes: 136 additions & 0 deletions ahocorasick.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
//----------------
//Func : Aho Corasick Word Match 敏感词匹配
//Author: xjh
//Date : 2018/09/13
//Note : 基于github.com/gansidui/ahocorasick
// 支持线程安全
// 支持中文(UTF8)
// 比使用正则匹配regexp有上千倍性能提升
//----------------

package ahocorasick

import (
"container/list"
)

// trieNode is a single state of the Aho-Corasick automaton.
type trieNode struct {
	// count is the number of dictionary words that end exactly at this node.
	// It is greater than one when the dictionary contains duplicates.
	count int
	// fail is the failure link: the node reached by the longest proper suffix
	// of this node's path that is also a path in the trie. It is nil on the
	// root and is set by build for every other node.
	fail *trieNode
	// output is the nearest non-root node on the failure chain at which a
	// dictionary word ends, or nil when there is none. It is set by build and
	// lets Match and Has find suffix matches without walking every failure
	// link.
	output *trieNode
	// child maps the next rune to the corresponding child node.
	child map[rune]*trieNode
	// index is the dictionary position of the last inserted word that ends at
	// this node, or -1 when no word ends here.
	index int
}

// newTrieNode returns an empty node with no children and no word ending at it.
func newTrieNode() *trieNode {
	return &trieNode{
		count: 0,
		fail:  nil,
		child: make(map[rune]*trieNode),
		index: -1,
	}
}

// ACMatcher is an Aho-Corasick multi-pattern matcher operating on runes.
//
// Match and Has never modify the automaton once NewMatcher has returned.
type ACMatcher struct {
	// root is the start state of the automaton.
	root *trieNode
	// size is the number of words inserted so far; it supplies the index
	// assigned to the next inserted word.
	size int
}

// NewMatcher builds a matcher for the words in dict. The index reported by
// Match for a word is its position in dict.
func NewMatcher(dict []string) *ACMatcher {
	m := &ACMatcher{
		root: newTrieNode(),
		size: 0,
	}

	for i := range dict {
		m.insert(dict[i])
	}
	m.build()
	return m
}

// Match reports which dictionary words occur in s.
//
// The result holds the dictionary index of every distinct word found, in the
// order in which the words are first completed while scanning s (longer words
// first when several end at the same rune). A word is reported only for its
// first occurrence. If the dictionary contains the same word several times,
// the index of its last entry is appended once per duplicate. The returned
// slice is never nil.
func (m *ACMatcher) Match(s string) []int {
	ret := make([]int, 0)
	seen := make(map[int]bool)
	cur := m.root
	for _, r := range s {
		cur = m.step(cur, r)

		// Start from the current state if a word ends here, otherwise from the
		// nearest suffix state where one does.
		p := cur
		if p.count == 0 {
			p = p.output
		}
		// Everything reachable from an already reported node has been reported
		// as well, so the walk can stop at the first seen node.
		for ; p != nil && p != m.root && !seen[p.index]; p = p.output {
			seen[p.index] = true
			for i := 0; i < p.count; i++ {
				ret = append(ret, p.index)
			}
		}
	}
	return ret
}

// Has reports whether s contains at least one dictionary word. It returns as
// soon as the first word is found.
func (m *ACMatcher) Has(s string) bool {
	cur := m.root
	for _, r := range s {
		cur = m.step(cur, r)
		// A word ends at the current rune if it ends at this state or at any
		// suffix state reachable through the output link.
		if cur != m.root && (cur.count > 0 || cur.output != nil) {
			return true
		}
	}
	return false
}

// step returns the state reached from cur after consuming r, following
// failure links until a transition on r exists or the root is reached.
func (m *ACMatcher) step(cur *trieNode, r rune) *trieNode {
	for cur != m.root && cur.child[r] == nil {
		cur = cur.fail
	}
	if next := cur.child[r]; next != nil {
		return next
	}
	return m.root
}

// build computes the failure and output links of every node with a
// breadth-first traversal of the trie. It must run after all words have been
// inserted.
func (m *ACMatcher) build() {
	queue := list.New()
	queue.PushBack(m.root)
	for queue.Len() > 0 {
		parent := queue.Remove(queue.Front()).(*trieNode)
		for r, node := range parent.child {
			// Children of the root, and nodes without any matching suffix,
			// fall back to the root.
			node.fail = m.root
			if parent != m.root {
				for p := parent.fail; p != nil; p = p.fail {
					if next := p.child[r]; next != nil {
						node.fail = next
						break
					}
				}
			}

			// The failure target is strictly shallower than node, so the
			// breadth-first order guarantees its output link is already set.
			if f := node.fail; f != m.root && f.count > 0 {
				node.output = f
			} else {
				node.output = f.output
			}
			queue.PushBack(node)
		}
	}
}

// insert adds s to the trie and records the current word counter as its
// index. Inserting a word that is already present increments the node's count
// and overwrites its index with the newer one.
func (m *ACMatcher) insert(s string) {
	cur := m.root
	for _, r := range s {
		next := cur.child[r]
		if next == nil {
			next = newTrieNode()
			cur.child[r] = next
		}
		cur = next
	}
	cur.count++
	cur.index = m.size
	m.size++
}
78 changes: 78 additions & 0 deletions ahocorasick_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package ahocorasick

import (
"fmt"
"math/rand"
"regexp"
"testing"
)

// Shared fixtures for the tests and benchmarks in this file.
var (
	// RegMatcher is the regexp baseline, compiled from the pattern that
	// BuildSensitiveStr returns for the word list file.
	// NOTE(review): BuildSensitiveStr is defined outside this file; it
	// presumably joins the words into one alternation - confirm.
	RegMatcher = regexp.MustCompile(BuildSensitiveStr("./sensitive_words.csv"))
	// words is the dictionary returned by BuildSensitiveArray for the same
	// file (helper defined outside this file).
	words = BuildSensitiveArray("./sensitive_words.csv")
	// AcMatcher is the Aho-Corasick matcher built from words.
	AcMatcher = NewMatcher(words)

	// hasSensStr and noSensStr are the sample inputs assigned in init.
	hasSensStr string
	noSensStr  string
)

// init prepares the two sample inputs: hasSensStr embeds three randomly
// chosen dictionary words between fixed ASCII markers, and noSensStr is a
// fixed sentence the tests expect to produce no match.
func init() {
	// pick returns one dictionary word chosen at random.
	pick := func() string {
		return words[rand.Intn(len(words))]
	}

	first, second, third := pick(), pick(), pick()
	hasSensStr = fmt.Sprintf("AA%sBB%s%sCC", first, second, third)
	noSensStr = "你真是个伟人哈哈呵呵火啊物abcd"
}

// TestACMatcher_Match checks that Match reports hits for an input built from
// dictionary words and reports nothing for the clean sample input.
//
// The number of hits is deliberately not pinned to an exact value: the input
// is assembled from randomly chosen words, Match reports each word only once,
// and words may repeat or contain one another, so the count varies from run
// to run.
func TestACMatcher_Match(t *testing.T) {
	hits := AcMatcher.Match(hasSensStr)
	if len(hits) == 0 {
		t.Fatalf("Match(%q) returned no hits, want at least one", hasSensStr)
	}
	for _, idx := range hits {
		// Every reported index must refer to an entry of the dictionary.
		if idx < 0 || idx >= len(words) {
			t.Fatalf("Match(%q) returned index %d, want 0 <= index < %d", hasSensStr, idx, len(words))
		}
	}

	if clean := AcMatcher.Match(noSensStr); len(clean) != 0 {
		t.Fatalf("Match(%q) = %v, want no hits", noSensStr, clean)
	}
}

// TestACMatcher_Has checks that Has returns true for the input built from
// dictionary words and false for the clean sample input.
func TestACMatcher_Has(t *testing.T) {
	if !AcMatcher.Has(hasSensStr) {
		t.Fatal(hasSensStr)
	}
	if AcMatcher.Has(noSensStr) {
		t.Fatal(noSensStr)
	}
}

// BenchmarkRegMatcher_Reg_Has measures one regexp MatchString call per
// iteration on the input built from dictionary words.
func BenchmarkRegMatcher_Reg_Has(b *testing.B) {
	// The loop must run b.N times; the testing package divides the elapsed
	// time by b.N to compute the reported ns/op.
	for i := 0; i < b.N; i++ {
		RegMatcher.MatchString(hasSensStr)
	}
}

// BenchmarkRegMatcher_Reg_No measures one regexp MatchString call per
// iteration on the clean sample input.
func BenchmarkRegMatcher_Reg_No(b *testing.B) {
	// The loop must run b.N times; the testing package divides the elapsed
	// time by b.N to compute the reported ns/op.
	for i := 0; i < b.N; i++ {
		RegMatcher.MatchString(noSensStr)
	}
}

// BenchmarkACMatcher_Ac_Has measures one ACMatcher.Has call per iteration on
// the input built from dictionary words.
func BenchmarkACMatcher_Ac_Has(b *testing.B) {
	// The loop must run b.N times; the testing package divides the elapsed
	// time by b.N to compute the reported ns/op.
	for i := 0; i < b.N; i++ {
		AcMatcher.Has(hasSensStr)
	}
}

// BenchmarkACMatcher_Ac_No measures one ACMatcher.Has call per iteration on
// the clean sample input.
func BenchmarkACMatcher_Ac_No(b *testing.B) {
	// The loop must run b.N times; the testing package divides the elapsed
	// time by b.N to compute the reported ns/op.
	for i := 0; i < b.N; i++ {
		AcMatcher.Has(noSensStr)
	}
}

// BenchmarkACMatcher_Ac_MatchHas measures one ACMatcher.Match call per
// iteration on the input built from dictionary words.
func BenchmarkACMatcher_Ac_MatchHas(b *testing.B) {
	// The loop must run b.N times; the testing package divides the elapsed
	// time by b.N to compute the reported ns/op.
	for i := 0; i < b.N; i++ {
		AcMatcher.Match(hasSensStr)
	}
}

// BenchmarkACMatcher_Ac_MatchNo measures one ACMatcher.Match call per
// iteration on the clean sample input.
func BenchmarkACMatcher_Ac_MatchNo(b *testing.B) {
	// The loop must run b.N times; the testing package divides the elapsed
	// time by b.N to compute the reported ns/op.
	for i := 0; i < b.N; i++ {
		AcMatcher.Match(noSensStr)
	}
}
Loading

0 comments on commit 4d38a1b

Please sign in to comment.