Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
pyloque committed Oct 23, 2018
0 parents commit 1690214
Show file tree
Hide file tree
Showing 6 changed files with 1,154 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
node_modules

210 changes: 210 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
'use strict';

/**
 * Multi-word scanner. Builds a trie (with fallback links) from the given
 * vocabulary and stores its root node on `this.root`.
 *
 * @constructor
 * @param {string[]} [words] Initial vocabulary. When omitted or null the
 *     scanner starts with an empty vocabulary.
 */
function FastScanner(words) {
  // The tree builder maps over the list, so a missing argument would throw;
  // treat it as "no words yet" instead.
  this.root = buildTree(words || []);
}

/**
 * Build the trie used for scanning.
 *
 * The word list is normalised by dedupAndSort, every remaining word is
 * inserted with addWord, and finally fallbackAll assigns the fallback
 * pointers.
 *
 * @param {string[]} words Raw vocabulary.
 * @returns {Object} The root node of the trie.
 */
function buildTree(words) {
  var vocabulary = dedupAndSort(words);
  var root = {
    next: {},      // child nodes, keyed by character
    val: null,     // character held by this node; null marks the root
    back: null,    // fallback (failure) pointer; null on the root
    parent: null,  // parent node
    depth: 0,      // distance from the root
    accept: false  // true when the path to this node spells a whole word
                   // (inner nodes can be accepting too)
  };
  // Insert every word into the trie.
  vocabulary.forEach(function (entry) {
    addWord(root, entry);
  });
  // Fix up the fallback pointers once the trie is complete.
  fallbackAll(root);
  return root;
}

/**
 * Normalise a vocabulary: trim every word, drop empty ones, remove
 * duplicates and return the result sorted with the default string order.
 * The input array is not modified.
 *
 * @param {string[]} words Raw words.
 * @returns {string[]} Trimmed, non-empty, unique words in sorted order.
 */
function dedupAndSort(words) {
  // Strip surrounding whitespace.
  var trimmed = words.map(function (word) {
    return word.trim();
  });
  // Drop empty strings.
  var nonEmpty = trimmed.filter(function (word) {
    return word.length > 0;
  });
  // A null-prototype object is used as the "seen" set. A plain {} inherits
  // properties such as "constructor", "toString" and "__proto__", whose
  // truthy values would make those words look already seen and drop them.
  var seen = Object.create(null);
  var out = [];
  for (var i = 0; i < nonEmpty.length; i++) {
    var word = nonEmpty[i];
    if (!seen[word]) {
      seen[word] = true;
      out.push(word);
    }
  }
  return out.sort();
}

/**
 * Insert one word into the trie rooted at `root`, creating any missing
 * nodes along the way, and mark the final node as accepting.
 *
 * New nodes start with their fallback pointer set to the root.
 *
 * @param {Object} root Root node of the trie.
 * @param {string} word Word to insert.
 */
function addWord(root, word) {
  var node = root;
  for (var pos = 0; pos < word.length; pos++) {
    var ch = word[pos];
    var children = node.next;
    if (!children[ch]) {
      children[ch] = {
        next: {},
        val: ch,
        accept: false,
        back: root,
        parent: node,
        depth: node.depth + 1
      };
    }
    node = children[ch];
  }
  node.accept = true;
}

/**
 * Assign the fallback pointer of every node, visiting the trie level by
 * level so that a parent is always processed before its children.
 *
 * For a node at depth >= 2 the candidate is the child of the parent's
 * fallback node that carries the same character. Only that single node is
 * consulted: when it has no such child the node keeps the pointer it
 * already has. The root and the depth-1 nodes are left untouched.
 *
 * @param {Object} root Root node of the trie.
 */
function fallbackAll(root) {
  var level = [root];
  while (level.length > 0) {
    var nextLevel = [];
    for (var idx = 0; idx < level.length; idx++) {
      var node = level[idx];
      for (var key in node.next) {
        nextLevel.push(node.next[key]);
      }
      // The root has no fallback pointer and is alone on its level.
      if (node.back == null) {
        continue;
      }
      var parentBack = node.parent.back;
      // Depth-1 nodes: the parent is the root, nothing to fall back to.
      if (parentBack == null) {
        continue;
      }
      // Children are keyed by their own character, so a keyed lookup finds
      // the child of the parent's fallback node with the same character.
      var candidate = parentBack.next[node.val];
      if (candidate) {
        node.back = candidate;
      }
    }
    level = nextLevel;
  }
}

/**
 * Refresh the fallback pointers of the nodes that spell `word`, walking the
 * path from the root downwards so each parent is handled before its child.
 *
 * For a node at depth >= 2 the pointer is set to the child of the parent's
 * fallback node that carries the same character, when such a child exists;
 * otherwise the node keeps its current pointer. Depth-1 nodes are left
 * untouched. Nodes outside the word's path are not modified.
 *
 * The walk stops quietly if the word is empty or is not fully present in
 * the trie.
 *
 * @param {Object} root Root node of the trie.
 * @param {string} word Word whose path should be relinked.
 */
function fallback(root, word) {
  var current = root;
  for (var i = 0; i < word.length; i++) {
    current = current.next[word[i]];
    if (!current) {
      // The word is not (fully) stored in the trie; nothing more to link.
      return;
    }
    var parentBack = current.parent.back;
    // Depth-1 nodes: the parent is the root, which has no fallback pointer.
    if (parentBack == null) {
      continue;
    }
    // Keyed lookup; keeps the loop variable for the word intact and also
    // covers the last node of the path.
    var candidate = parentBack.next[current.val];
    if (candidate) {
      current.back = candidate;
    }
  }
}

/**
 * Keep only the longest word for every start offset.
 *
 * When several words of the same length share an offset, the one that
 * appears first in the input wins.
 *
 * @param {Array} offsetWords List of [offset, word] pairs.
 * @returns {Array} One [offset, word] pair per distinct offset, ordered by
 *     ascending offset.
 */
function selectLongest(offsetWords) {
  var longestAt = {};
  for (var idx = 0; idx < offsetWords.length; idx++) {
    var offset = offsetWords[idx][0];
    var candidate = offsetWords[idx][1];
    var best = longestAt[offset];
    if (!best || best.length < candidate.length) {
      longestAt[offset] = candidate;
    }
  }
  // Object keys are strings; turn them back into numbers before sorting.
  var sortedOffsets = Object.keys(longestAt).map(function (key) {
    return parseInt(key, 10);
  });
  sortedOffsets.sort(function (left, right) {
    return left - right;
  });
  var result = [];
  for (var k = 0; k < sortedOffsets.length; k++) {
    result.push([sortedOffsets[k], longestAt[sortedOffsets[k]]]);
  }
  return result;
}

/**
 * Add one word to an existing scanner.
 *
 * The word is trimmed and ignored when empty, which is the same
 * normalisation the constructor applies to its word list. Without it an
 * empty word would mark the root node itself as accepting.
 *
 * @param {string} word Word to add.
 */
FastScanner.prototype.add = function add(word) {
  var normalized = word.trim();
  if (normalized.length === 0) {
    return;
  }
  addWord(this.root, normalized);
  // Recompute the fallback pointers of the whole trie rather than only the
  // new word's path: the new branch can also be a fallback target for nodes
  // on branches that already existed. This costs one pass over the trie
  // per added word.
  fallbackAll(this.root);
};

/**
 * Rebuild the string spelled by the path from the root to `node` by
 * following parent pointers until a node without a character (the root)
 * is reached.
 *
 * @param {Object} node Trie node.
 * @returns {string} Characters on the path, in root-to-node order.
 */
function collect(node) {
  var text = '';
  for (var cursor = node; cursor.val != null; cursor = cursor.parent) {
    // Walking upwards, so each character goes in front.
    text = cursor.val + text;
  }
  return text;
}

/**
 * Return the distinct words found in `content`.
 *
 * @param {string} content Text to scan.
 * @param {Object} [options] Passed through unchanged to `search`.
 * @returns {string[]} Each matched word once, in order of first appearance
 *     in the search result.
 */
FastScanner.prototype.hits = function hits(content, options) {
  var offWords = this.search(content, options);
  // Null-prototype object so that words such as "constructor" are not
  // mistaken for already-seen entries.
  var seen = Object.create(null);
  var words = [];
  for (var i = 0; i < offWords.length; i++) {
    // Each entry is an [offset, word] pair.
    var word = offWords[i][1];
    if (!seen[word]) {
      seen[word] = true;
      words.push(word);
    }
  }
  return words;
};

/**
 * Find every occurrence of every vocabulary word in `content`.
 *
 * Matches are reported as [offset, word] pairs ordered by start offset;
 * words starting at the same offset are ordered from shortest to longest.
 *
 * @param {string} content Text to scan; null/undefined is treated as empty.
 * @param {Object} [options]
 * @param {boolean} [options.quick] Return as soon as the first match is
 *     found (the result then holds exactly that one pair).
 * @param {boolean} [options.longest] Keep only the longest word per start
 *     offset (ignored when `quick` already returned).
 * @returns {Array} List of [offset, word] pairs.
 */
FastScanner.prototype.search = function search(content, options) {
  options = options || {};
  content = content == null ? '' : String(content);
  var root = this.root;
  var offWords = [];
  var current = root;
  var i = 0;
  // Keep looping after the last character while a partial match is still
  // open: words that start inside that partial match must still be found.
  while (i < content.length || current !== root) {
    var next = i < content.length ? current.next[content[i]] : undefined;
    if (next) {
      current = next;
      i++;
      // Collect the word ending here, if any.
      if (current.accept) {
        offWords.push([i - current.depth, collect(current)]);
        // Only the first word is wanted.
        if (options.quick) {
          return offWords;
        }
      }
      continue;
    }
    // No partial match open: just move on to the next character.
    if (current === root) {
      i++;
      continue;
    }
    var back = current.back;
    if (back && back !== root && back.depth === current.depth - 1) {
      // The fallback node is exactly one level shallower, so it spells the
      // current path without its first character. Everything read so far
      // is therefore already matched from the next start offset and no
      // character has to be re-read. Words that are prefixes of that
      // shorter path start at the new offset as well; report them from
      // shortest to longest.
      var start = i - back.depth;
      var pending = [];
      for (var node = back; node.depth > 0; node = node.parent) {
        if (node.accept) {
          pending.unshift(node);
        }
      }
      for (var k = 0; k < pending.length; k++) {
        offWords.push([start, collect(pending[k])]);
        if (options.quick) {
          return offWords;
        }
      }
      current = back;
      continue;
    }
    // Any shallower fallback node only matches the tail of the path, not
    // the text right after the old start offset, so it cannot be resumed
    // from. Restart from the root one character after the old start.
    i = i - current.depth + 1;
    current = root;
  }
  // Keep only the longest word per offset.
  if (options.longest) {
    return selectLongest(offWords);
  }
  return offWords;
};

// The module's only export is the FastScanner constructor.
module.exports = FastScanner;
89 changes: 89 additions & 0 deletions index.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
var assert = require('assert');
var fs = require('fs');
var FastScanner = require('./index');
// Scanner built from a single word.
describe('测试单一词汇', function () {
  var scanner = new FastScanner(["江泽民"]);
  // The word occurs twice in the text; check all three search modes.
  it('成功扫出来', function () {
    var content = "我不是江泽民的儿子,我跟江泽民没有任何关系";
    var offWords = scanner.search(content);
    assert.deepEqual([[3, '江泽民'], [12, '江泽民']], offWords);
    offWords = scanner.search(content, { quick: true });
    assert.deepEqual([[3, '江泽民']], offWords);
    offWords = scanner.search(content, { longest: true });
    assert.deepEqual([[3, '江泽民'], [12, '江泽民']], offWords);
  });
  // The text only shares the first character with the word: no match.
  it('扫不出来的', function () {
    var content = "我不喜欢喝江小白,我喜欢喝鸡尾酒";
    var offWords = scanner.search(content);
    assert.equal(0, offWords.length);
    // Reuse the same variable for every mode so that each assertion checks
    // the result of the call directly above it.
    offWords = scanner.search(content, { quick: true });
    assert.equal(0, offWords.length);
    offWords = scanner.search(content, { longest: true });
    assert.equal(0, offWords.length);
  });
});
// Scanner built from several words that do not overlap each other.
describe('测试多个独立词汇', function () {
  var scanner = new FastScanner(["江泽民", "习近平", "胡锦涛"]);
  // Every word occurs once; check all three search modes.
  it('成功扫出来', function () {
    var text = "我不是江泽民的儿子,也不是习近平的儿子,更不是胡锦涛的儿子";
    var plain = scanner.search(text);
    assert.deepEqual([[3, '江泽民'], [13, '习近平'], [23, '胡锦涛']], plain);
    var quick = scanner.search(text, { quick: true });
    assert.deepEqual([[3, '江泽民']], quick);
    var longest = scanner.search(text, { longest: true });
    assert.deepEqual([[3, '江泽民'], [13, '习近平'], [23, '胡锦涛']], longest);
  });
  // None of the words occurs in the text.
  it('扫不出来的', function () {
    var text = "我不喜欢喝江小白,我喜欢喝鸡尾酒";
    var plain = scanner.search(text);
    assert.equal(0, plain.length);
    var quick = scanner.search(text, { quick: true });
    assert.equal(0, quick.length);
    var longest = scanner.search(text, { longest: true });
    assert.equal(0, longest.length);
  });
});
// Vocabularies whose words overlap (one word is contained in another).
describe('测试叠加词汇', function () {
  // The longer words only match partially; the contained word must still
  // be reported.
  it('简单扫一下', function () {
    var scanner = new FastScanner(["近平", "习近平棒", "习近平好"]);
    var text = "习近平拽";
    var found = scanner.search(text);
    assert.deepEqual([[1, '近平']], found);
  });
  // Overlapping matches at the same and at neighbouring offsets, in all
  // three search modes.
  it('扫的狠一点', function () {
    var scanner = new FastScanner(["近平", "习近平", "习近平好"]);
    var text = "我不说习近平好,也不是习近平坏";
    var plain = scanner.search(text);
    assert.deepEqual([[3, '习近平'], [3, '习近平好'], [4, '近平'], [11, '习近平'], [12, '近平']], plain);
    var quick = scanner.search(text, { quick: true });
    assert.deepEqual([[3, '习近平']], quick);
    var longest = scanner.search(text, { longest: true });
    assert.deepEqual([[3, '习近平好'], [4, '近平'], [11, '习近平'], [12, '近平']], longest);
  });
});
// Smoke test with a large vocabulary read from the file "./words.test",
// one word per line. The file is read synchronously when this describe
// callback runs; the path is relative, so it is resolved against the
// working directory of the process running the tests.
describe('猛量单词测试', function () {
var words = fs.readFileSync("./words.test")
words = words.toString().split("\n")
// Drop empty lines (for example the one after a trailing newline).
words = words.filter(function (word) {
return word.length > 0;
});
var scanner = new FastScanner(words);
// This case makes no assertions: it only prints the result of each of the
// three search modes for the sample text below.
it('扫啊扫啊扫的痛啊', function () {
var content = `
1995年中共执政当局开始寻求强化法轮功的组织构架及与政府的关系。
中国政府的国家体委、公共健康部和气功科研会,访问李洪志,要求联合成立法轮功协会,但李洪志表示拒绝。
同年,气功科研会通过一项新规定,命令所有气功分会必须建立中国共产党党支部,但李洪志再次表示拒绝。
李洪志与中国气功科研会的关系在1996年持续恶化。
1996 年3月,法轮功因拒不接受中国气功协会新负责人在“气功团体内部收取会员费创收”和“成立中国共产党党支部组织”的要求,
主动申请退出中国气功协会和中国 气功科研会, 以独立非政府形式运作。
自此,李洪志及其法轮功脱离了中国气功协会中的人脉和利益交换,同时失去了功派在中国政府体制系统的保护。
法轮功申请退出中国气功协会,是与中国政府对气功的态度产生变化相对应的;
当时随气功激进反对者在政府部门中的影响力增加,中国政府开始控制和影响各气功组织。
90年代中期,中国政府主管的媒体开始发表文章批评气功。
法轮功起初并没有受批评,但在1996年3月退出中国气功协会后,失去了政府体制的保护。
`;
console.log(scanner.search(content));
console.log(scanner.search(content, {quick: true}));
console.log(scanner.search(content, {longest: true}));
});
});
Loading

0 comments on commit 1690214

Please sign in to comment.