forked from spencermountain/dumpster-dive
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscratch.js
65 lines (58 loc) · 2.16 KB
/
scratch.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
const dumpster = require('./src');
const drop = require('./src/lib/drop-db');
// Scratch runner: picks one wikipedia dump file, derives the database name
// from its filename, clears that database via the drop-db helper, then runs
// dumpster on the file.

//144mb → 2.5 minutes = 57mb per worker per minute
// const path = '/Users/spencer/data/wikipedia/afwiki-latest-pages-articles.xml' //4.3mins
// const path = '/Users/spencer/data/wikipedia/simplewiki-latest-pages-articles.xml'; //5mins //144 MB each
// const path = '/Users/spencer/data/wikipedia/eswiki-latest-pages-articles.xml' //2hrs - 12gb→5gb
// const path = '/media/spencer/07d11766-2ce6-4f8a-8ec0-a3d144a3d4cd/big_data/wikipedia/afwiki-latest-pages-articles.xml' //6hrs
const path = '/home/spencer/mountain/dumpster-dive/tests/tinywiki-latest-pages-articles.xml';
// const path = '/media/spencer/07d11766-2ce6-4f8a-8ec0-a3d144a3d4cd/big_data/wikipedia/eswiki-latest-pages-articles.xml'
// const path = './tests/smallwiki-latest-pages-articles.xml' //3s
// const path = './tests/tinywiki-latest-pages-articles.xml' //2s

// The db name is the filename prefix before "-latest-pages", e.g. "tinywiki"
// for ".../tinywiki-latest-pages-articles.xml". Fail with a readable message
// instead of a null-dereference TypeError when the path does not fit.
const nameMatch = path.match(/\/([a-z-]+)-latest-pages/);
if (nameMatch === null) {
  throw new Error(`cannot derive a db name from dump path: ${path}`);
}
const dbName = nameMatch[1];

const options = {
  file: path,
  db: dbName,
  templates: false,
  verbose: true,
  skip_redirects: true,
  skip_disambig: true,
  workers: 1,
  // custom: function(doc) {
  //   return {
  //     // _id: doc.title(),
  //     foo: 'bar',
  //     // categories: doc.categories(),
  //   };
  // }
};

//delete all pages, then start the import
drop(options)
  .then(() => {
    dumpster(options);
  })
  .catch((err) => {
    // Surface a failed drop (or a synchronous throw from dumpster) instead of
    // leaving the rejection unhandled, and mark the run as failed.
    console.error(err);
    process.exitCode = 1;
  });
// const fs = require('fs');
// let str = fs.readFileSync(path).toString()
// let str = `
// <text xml:space="preserve">
// this duplicate should stay
// from here too
// </text>`
// console.log(str.match(/<text xml:space="preserve">([\s\S]*?)<\/text>/))
// half- 6021472
// Euston Road - 5888070
//Turquoise-browed motmot - 9030127
//Holyoke, Massachusetts - 9030314
//Jacobabad District - 2765854
//Tubize - 2833911
//Ribes, Ardèche - 2911022
//Saint-Germain-l'Aiguill - 3490581
//Adolphe-Marie Hardy - 5958564
//White Hart Lane railway - 4146036
//List of cities in Somal- 7925458
//Computer data storage - 8852258
// end - 12042945
// 604084071
// worker #0 : 0→154021017
// worker #1 : 150021017→304042034
// worker #2 : 301042034→455063051
// worker #3 : 452063051→606084068