forked from robertpitt/plus-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmining.js
71 lines (61 loc) · 1.48 KB
/
mining.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
/*
* Google Profile Information Fun
*/
/*
* Require Libraries
*/
var http = require('http');
var url = require('url');
var fs = require('fs');
// URLs gathered from the root sitemap; filled in once the initial request completes.
var packages = [];
// Captures the text between <loc> and </loc> (case-insensitive). The "g" flag
// makes exec() stateful so it can be called in a loop to walk every match.
var locRegex = /<loc>(.*?)<\/loc>/gi;
/*
 * Request options for the root sitemap, i.e. the first document fetched.
 * Every <loc> entry found in it is later downloaded as a separate segment.
 */
var baseOptions = {
    host: 'www.gstatic.com',
    port: 80,
    path: '/s2/sitemaps/profiles-sitemap.xml'
};
/*
 * Fetch the root sitemap described by baseOptions, push the contents of every
 * <loc>...</loc> element into `packages`, then call processPackages().
 * If the sitemap cannot be fetched (non-200 status) the process exits with a
 * failure status.
 */
http.get(baseOptions, function(response){
    if(response.statusCode !== 200)
    {
        console.error("Initial stack was not found");
        // Exit non-zero so a calling shell/script can tell the run failed.
        process.exit(1);
    }

    // Decode as UTF-8 before concatenating: appending raw Buffer chunks to a
    // string can split a multi-byte character across two chunks and corrupt it.
    response.setEncoding('utf8');

    var stack = '';
    response.on('data', function(chunk){
        stack += chunk;
    });

    response.on('end', function(){
        var match = null;
        // locRegex has the "g" flag, so exec() resumes from lastIndex;
        // make sure the scan starts at the beginning of the document.
        locRegex.lastIndex = 0;
        while((match = locRegex.exec(stack)) !== null)
        {
            packages.push(match[1]);
        }
        processPackages();
    });
}).on('error', function(e) {
    console.log("Got error<getting base>: " + e.message);
});
/*
 * Download every URL currently held in `packages`.
 *
 * One HTTP GET is issued per entry (all requests are started without waiting
 * for earlier ones) and each successful response body is streamed into
 * ./data/segment_<index>.txt, where <index> is the entry's position in
 * `packages`. Failures are logged and do not stop the other downloads.
 *
 * Takes no arguments and returns nothing.
 */
var processPackages = function()
{
    var totalPackages = packages.length;

    // forEach gives each callback its own `index`, so the asynchronous
    // response handlers never observe a shared, already-advanced loop counter.
    packages.forEach(function(packageUrl, index){
        var destination = "./data/segment_" + index + ".txt";

        http.get(url.parse(packageUrl), function(response){
            if(response.statusCode !== 200)
            {
                console.log("failed to get package form google");
                // Discard the unread body so the underlying socket is released.
                response.resume();
                return;
            }

            var output = fs.createWriteStream(destination);
            // Without a listener, a write failure (e.g. ./data does not exist)
            // would surface as an unhandled 'error' event and kill the process.
            output.on('error', function(e) {
                console.log("Got error<writing packages>: " + e.message);
            });

            response.pipe(output);
            console.log("Piping " + index + " of " + totalPackages + " into " + destination);
        }).on('error', function(e) {
            console.log("Got error<getting packages>: " + e.message);
        });
    });
};