-
Notifications
You must be signed in to change notification settings - Fork 121
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 1690214
Showing
6 changed files
with
1,154 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
node_modules | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
'use strict'; | ||
|
||
/**
 * Keyword scanner backed by a trie.
 *
 * @constructor
 * @param {string[]} words - keywords the scanner should recognise.
 */
function FastScanner(words) {
    var trie = buildTree(words);
    // Root node of the keyword trie; all scanning starts from here.
    this.root = trie;
}
|
||
/**
 * Build the keyword trie for a list of words.
 *
 * The words are first normalised by dedupAndSort, then inserted one by one
 * with addWord, and finally the fallback pointers are wired by fallbackAll.
 *
 * @param {string[]} words - raw keyword list.
 * @returns {Object} the root node of the trie.
 */
function buildTree(words) {
    // Normalise the vocabulary before building anything.
    var vocabulary = dedupAndSort(words);
    var root = {
        next: {},      // child nodes, keyed by character
        val: null,     // character stored in this node; null marks the root
        back: null,    // fallback (failure) pointer
        parent: null,  // parent node
        depth: 0,      // distance from the root
        accept: false  // true when the path to this node spells a full word (inner nodes may be true too)
    };
    // Grow the trie, one word at a time.
    vocabulary.forEach(function (word) {
        addWord(root, word);
    });
    // Fix up the fallback pointers once every word is in place.
    fallbackAll(root);
    return root;
}
|
||
/**
 * Normalise a keyword list: trim every word, drop empty ones, remove
 * duplicates and return the result sorted with the default string order.
 *
 * @param {string[]} words - raw keyword list; the input array is not modified.
 * @returns {string[]} a new, sorted array of unique, non-empty, trimmed words.
 */
function dedupAndSort(words) {
    // A prototype-less map is required here: with a plain {} the members
    // inherited from Object.prototype ("constructor", "toString", ...) look
    // as if they had already been seen, and such keywords would be dropped.
    var seen = Object.create(null);
    var out = [];
    for (var i = 0; i < words.length; i++) {
        // Strip surrounding whitespace before comparing.
        var word = words[i].trim();
        // Skip blanks and words that were already collected.
        if (word.length === 0 || seen[word]) {
            continue;
        }
        seen[word] = true;
        out.push(word);
    }
    return out.sort();
}
|
||
/**
 * Insert one word into the trie, creating the missing nodes along its path
 * and flagging the final node as accepting.
 *
 * New nodes get the root as their initial fallback pointer.
 *
 * @param {Object} root - root node of the trie.
 * @param {string} word - word to insert; an empty word is ignored.
 */
function addWord(root, word) {
    // An empty word would flag the root itself as accepting, which makes
    // the scanner report zero-length matches. Ignore it instead.
    if (word.length === 0) {
        return;
    }
    var current = root;
    for (var i = 0; i < word.length; i++) {
        var c = word[i];
        if (!current.next[c]) {
            current.next[c] = {
                next: {},
                val: c,
                accept: false,
                back: root,
                parent: current,
                depth: current.depth + 1
            };
        }
        current = current.next[c];
    }
    current.accept = true;
}
|
||
/**
 * Recompute the fallback pointer of every node, level by level.
 *
 * The scanner rewinds by `current.depth - back.depth - 1` characters when it
 * follows a fallback pointer, so a fallback node is only usable when it
 * spells the current path without its first character, i.e. when it sits
 * exactly one level above the node. Every other node falls back to the root.
 * Accepting a shallower node (for example a first-level node for a node of
 * depth 3) makes the scanner re-read a character and report words that are
 * not in the content.
 *
 * @param {Object} root - root node of the trie; nodes are updated in place.
 */
function fallbackAll(root) {
    var level = [root];
    while (level.length > 0) {
        var nextLevel = [];
        for (var i = 0; i < level.length; i++) {
            var node = level[i];
            for (var key in node.next) {
                nextLevel.push(node.next[key]);
            }
            // The root has no fallback at all.
            if (node.back == null) {
                continue;
            }
            var parentBack = node.parent.back;
            // First-level nodes (children of the root) keep the root as fallback.
            if (parentBack == null) {
                continue;
            }
            // Children are keyed by their character, so the matching child of
            // the parent's fallback can be looked up directly.
            var candidate = parentBack.next[node.val];
            if (candidate && candidate.depth === node.depth - 1) {
                node.back = candidate;
            } else {
                node.back = root;
            }
        }
        level = nextLevel;
    }
}
|
||
/**
 * Refresh the fallback pointers after a word has been inserted into an
 * already built trie.
 *
 * A new word can change the fallback pointer of nodes that are not on its
 * own path (adding "bc" affects the node for "abc"), so walking only the
 * path of the new word is not enough. The whole trie is recomputed with
 * fallbackAll instead, which costs one pass over all nodes per call.
 *
 * @param {Object} root - root node of the trie.
 * @param {string} word - the word that was just inserted; kept for interface
 *     compatibility, the recomputation does not need it.
 */
function fallback(root, word) {
    fallbackAll(root);
}
|
||
/**
 * Keep only the longest word for every offset.
 *
 * @param {Array} offsetWords - list of [offset, word] pairs.
 * @returns {Array} [offset, word] pairs, one per offset, ordered by
 *     ascending offset. On equal length the first word seen wins.
 */
function selectLongest(offsetWords) {
    // offset -> longest word seen so far at that offset
    var longestAt = {};
    offsetWords.forEach(function (pair) {
        var offset = pair[0];
        var candidate = pair[1];
        var best = longestAt[offset];
        if (best && best.length >= candidate.length) {
            return;
        }
        longestAt[offset] = candidate;
    });

    // Object keys are strings; turn them back into numbers and order them.
    var offsets = [];
    var keys = Object.keys(longestAt);
    for (var k = 0; k < keys.length; k++) {
        offsets.push(parseInt(keys[k]));
    }
    offsets.sort(function (left, right) {
        return left - right;
    });

    var result = [];
    for (var j = 0; j < offsets.length; j++) {
        result.push([offsets[j], longestAt[offsets[j]]]);
    }
    return result;
}
|
||
/**
 * Add one more keyword to an existing scanner.
 *
 * The word goes through the same normalisation the constructor applies to
 * its word list: surrounding whitespace is removed and blank words are
 * ignored.
 *
 * @param {string} word - keyword to add.
 */
FastScanner.prototype.add = function add(word) {
    var normalized = word.trim();
    // A blank keyword would flag the root node as accepting; skip it.
    if (normalized.length === 0) {
        return;
    }
    addWord(this.root, normalized);
    fallback(this.root, normalized);
};
|
||
/**
 * Spell the word that ends at the given node by walking the parent
 * pointers up to the root.
 *
 * @param {Object} node - a trie node.
 * @returns {string} the characters on the path from the root to the node;
 *     an empty string for the root itself.
 */
function collect(node) {
    var spelled = '';
    // The root is the only node whose val is null, so it ends the walk.
    for (var cursor = node; cursor.val != null; cursor = cursor.parent) {
        spelled = cursor.val + spelled;
    }
    return spelled;
}
|
||
/**
 * List the distinct keywords found in the content.
 *
 * @param {string} content - text to scan.
 * @param {Object} [options] - passed through to search() unchanged.
 * @returns {string[]} every matched word once, in the order of its first
 *     occurrence in the search result.
 */
FastScanner.prototype.hits = function hits(content, options) {
    var offWords = this.search(content, options);
    // Prototype-less map so that words such as "constructor" are handled
    // like any other word.
    var seen = Object.create(null);
    var words = [];
    for (var i = 0; i < offWords.length; i++) {
        // Each entry is an [offset, word] pair.
        var word = offWords[i][1];
        if (!seen[word]) {
            seen[word] = true;
            words.push(word);
        }
    }
    return words;
};
|
||
/**
 * Find every occurrence of every keyword in the content.
 *
 * The trie is walked from the root once per start offset and every
 * accepting node reached on the way is reported. This deliberately does not
 * rely on the fallback pointers: jumping to a fallback node skipped the
 * accepting nodes above it, matches that end at the very end of the content
 * were never reported, and an imprecise fallback pointer could produce words
 * that are not in the content at all. The worst-case cost is
 * content length x longest keyword, the same bound as a rewinding scan.
 *
 * @param {string} content - text to scan.
 * @param {Object} [options]
 * @param {boolean} [options.quick] - return as soon as the first match is found.
 * @param {boolean} [options.longest] - keep only the longest word per offset.
 * @returns {Array} [offset, word] pairs ordered by offset, and by word
 *     length within the same offset.
 */
FastScanner.prototype.search = function search(content, options) {
    options = options || {};
    var offWords = [];
    var root = this.root;
    for (var start = 0; start < content.length; start++) {
        var node = root;
        for (var end = start; end < content.length; end++) {
            node = node.next[content[end]];
            // No keyword continues with this character from this offset.
            if (!node) {
                break;
            }
            // Collect the matched word.
            if (node.accept) {
                offWords.push([start, content.substring(start, end + 1)]);
                // Only the first match is wanted.
                if (options.quick) {
                    return offWords;
                }
            }
        }
    }
    // Keep the longest word for each offset.
    if (options.longest) {
        return selectLongest(offWords);
    }
    return offWords;
};
|
||
module.exports = FastScanner; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
var assert = require('assert'); | ||
var fs = require('fs'); | ||
var FastScanner = require('./index'); | ||
// Scenarios for a scanner that knows exactly one keyword.
describe('测试单一词汇', function () {
    const scanner = new FastScanner(["江泽民"]);
    // The keyword occurs twice in the content.
    it('成功扫出来', function () {
        const content = "我不是江泽民的儿子,我跟江泽民没有任何关系";
        let offWords = scanner.search(content);
        assert.deepEqual([[3, '江泽民'], [12, '江泽民']], offWords);
        offWords = scanner.search(content, { quick: true });
        assert.deepEqual([[3, '江泽民']], offWords);
        offWords = scanner.search(content, { longest: true });
        assert.deepEqual([[3, '江泽民'], [12, '江泽民']], offWords);
    });
    // The content shares only the first character with the keyword.
    it('扫不出来的', function () {
        const content = "我不喜欢喝江小白,我喜欢喝鸡尾酒";
        // The same variable is reassigned before each assertion so that every
        // assertion checks the result of the call right above it.
        let offWords = scanner.search(content);
        assert.equal(0, offWords.length);
        offWords = scanner.search(content, { quick: true });
        assert.equal(0, offWords.length);
        offWords = scanner.search(content, { longest: true });
        assert.equal(0, offWords.length);
    });
});
// Scenarios for several keywords that do not overlap each other.
describe('测试多个独立词汇', function () {
    const scanner = new FastScanner(["江泽民", "习近平", "胡锦涛"]);
    // Every keyword occurs once in the content.
    it('成功扫出来', function () {
        const content = "我不是江泽民的儿子,也不是习近平的儿子,更不是胡锦涛的儿子";
        const everyMatch = [[3, '江泽民'], [13, '习近平'], [23, '胡锦涛']];
        const all = scanner.search(content);
        assert.deepEqual(everyMatch, all);
        const first = scanner.search(content, { quick: true });
        assert.deepEqual([[3, '江泽民']], first);
        const longest = scanner.search(content, { longest: true });
        assert.deepEqual(everyMatch, longest);
    });
    // None of the keywords occurs in the content.
    it('扫不出来的', function () {
        const content = "我不喜欢喝江小白,我喜欢喝鸡尾酒";
        const all = scanner.search(content);
        assert.equal(0, all.length);
        const first = scanner.search(content, { quick: true });
        assert.equal(0, first.length);
        const longest = scanner.search(content, { longest: true });
        assert.equal(0, longest.length);
    });
});
// Scenarios for keywords that contain or extend one another.
describe('测试叠加词汇', function () {
    // Only the short keyword is complete; the longer ones share a prefix
    // with the content but do not finish.
    it('简单扫一下', function () {
        const scanner = new FastScanner(["近平", "习近平棒", "习近平好"]);
        const found = scanner.search("习近平拽");
        assert.deepEqual([[1, '近平']], found);
    });
    // Nested keywords are reported per offset, shortest first.
    it('扫的狠一点', function () {
        const scanner = new FastScanner(["近平", "习近平", "习近平好"]);
        const content = "我不说习近平好,也不是习近平坏";
        const all = scanner.search(content);
        assert.deepEqual([[3, '习近平'], [3, '习近平好'], [4, '近平'], [11, '习近平'], [12, '近平']], all);
        const first = scanner.search(content, { quick: true });
        assert.deepEqual([[3, '习近平']], first);
        const longest = scanner.search(content, { longest: true });
        assert.deepEqual([[3, '习近平好'], [4, '近平'], [11, '习近平'], [12, '近平']], longest);
    });
});
// Smoke test with a large vocabulary loaded from the words.test fixture.
describe('猛量单词测试', function () {
    // Resolve the fixture next to this file so the test does not depend on
    // the directory the test runner was started from.
    const fixture = require('path').join(__dirname, 'words.test');
    const words = fs.readFileSync(fixture)
        .toString()
        .split("\n")
        .filter(function (word) {
            return word.length > 0;
        });
    const scanner = new FastScanner(words);
    it('扫啊扫啊扫的痛啊', function () {
        const content = `
1995年中共执政当局开始寻求强化法轮功的组织构架及与政府的关系。
中国政府的国家体委、公共健康部和气功科研会,访问李洪志,要求联合成立法轮功协会,但李洪志表示拒绝。
同年,气功科研会通过一项新规定,命令所有气功分会必须建立中国共产党党支部,但李洪志再次表示拒绝。
李洪志与中国气功科研会的关系在1996年持续恶化。
1996 年3月,法轮功因拒不接受中国气功协会新负责人在“气功团体内部收取会员费创收”和“成立中国共产党党支部组织”的要求,
主动申请退出中国气功协会和中国 气功科研会, 以独立非政府形式运作。
自此,李洪志及其法轮功脱离了中国气功协会中的人脉和利益交换,同时失去了功派在中国政府体制系统的保护。
法轮功申请退出中国气功协会,是与中国政府对气功的态度产生变化相对应的;
当时随气功激进反对者在政府部门中的影响力增加,中国政府开始控制和影响各气功组织。
90年代中期,中国政府主管的媒体开始发表文章批评气功。
法轮功起初并没有受批评,但在1996年3月退出中国气功协会后,失去了政府体制的保护。
`;
        const all = scanner.search(content);
        const first = scanner.search(content, { quick: true });
        const longest = scanner.search(content, { longest: true });
        console.log(all);
        console.log(first);
        console.log(longest);
        // Structural checks that hold whatever the fixture contains.
        assert.ok(Array.isArray(all));
        assert.ok(first.length <= 1);
        assert.ok(longest.length <= all.length);
        for (let i = 1; i < longest.length; i++) {
            // The longest mode yields one entry per offset, in ascending order.
            assert.ok(longest[i - 1][0] < longest[i][0]);
        }
    });
});
Oops, something went wrong.