-
Notifications
You must be signed in to change notification settings - Fork 2
/
dadot.js
28 lines (27 loc) · 1.07 KB
/
dadot.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
var cheerio = require('cheerio'),
_ = require('underscore');
/**
 * Heuristic extractor for the main body text of an HTML page.
 *
 * The heuristic keys on the literal "다." (a common way for a Korean
 * sentence to end): the element whose own text contains the most such
 * sentence endings is taken to be the article.
 */
var Dadot = module.exports = {
  /**
   * Returns the cleaned text of the most article-like element in `html`.
   *
   * Candidates are the parents of every <br> and <p> element, restricted to
   * those whose text contains "다.". For each candidate the child elements
   * are dropped, the remaining markup is stripped, and the candidate with
   * the most "다." occurrences is returned.
   *
   * @param {string} html - HTML source to analyse.
   * @returns {string} Text of the best candidate, or "" when no element
   *     qualifies.
   */
  extract: function(html) {
    var $ = cheerio.load(html);
    var article_candidate = $('br,p').parent(':contains("다.")');

    var refined_list = _.map(article_candidate, function(el) {
      // _.clone makes a shallow copy only.
      // NOTE(review): the copy presumably still shares its child nodes with
      // the parsed tree, so the removal below may also alter the document
      // held by `$`. That document is local to this call - confirm if `$`
      // is ever reused.
      el = _.clone(el);
      $(el).children().remove();

      var fragment = $(el).html();
      if (!fragment) {
        return "";
      }

      return fragment
        // [\s\S]*? instead of (.|\s)*?: same matches, but without the
        // overlapping alternation that can backtrack catastrophically on an
        // unterminated comment.
        .replace(/<!--[\s\S]*?-->/g, "")
        .replace(/\r?\n|\r/g, "") // remove line breaks (CR, LF, CRLF)
        .replace(/\t/g, "")
        // Normalise every non-breaking space, in entity or raw form. The
        // previous pattern had no `g` flag and touched the first match only.
        .replace(/&nbsp;|&#xA0;|&#160;|\u00a0/gi, " ")
        // Non-greedy, so text between two separate blocks is not swallowed.
        .replace(/<style[\s\S]*?<\/style>/gi, "") // remove <style>
        .replace(/<script[\s\S]*?<\/script>/gi, "") // remove <script>
        .replace(/<[^>]+>/g, " "); // remove any other tag
    });

    // _.max returns -Infinity for an empty list; keep the return type a string.
    if (refined_list.length === 0) {
      return "";
    }

    return _.max(refined_list, function(refined) {
      return refined.split('다.').length;
    });
  }
};