This repository has been archived by the owner on Nov 6, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathgoogle-ranking.js
197 lines (153 loc) · 6.13 KB
/
google-ranking.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
/*
== HTML structure ==
search results:
- body > #main #cnt .mw #rcnt #center_col #search ol li
- [simpler] #search ol li
inside each li (note: the parser below selects li.g and div.rc, not div.vsc as written in these notes):
> div.vsc > h3 > a = link, take .text() for title, don't take the href
> div.vsc > div.s > .f > cite = visible url, take .text()
> div.vsc > div.s > .st = description
pager:
- body > #main #cnt #foot #xjs #navcnt #nav [td.navend] > a | a#pnnext (differs in js/js-less modes)
- [simpler] #nav a#pnnext
*/
var jscrape = require('jscrape'), // lazy combo of jquery+jsdom+request
async = require('async');
var gBase = 'http://www.google.com'; // maybe expand to other languages?

// Builds the search URL for the first results page of a query.
// searchPhrase: the raw (unencoded) query string.
// Returns: gBase + '/search?hl=en&output=search&q=<encoded phrase>&'.
// Throws: TypeError if searchPhrase has no string form, URIError if it
// contains a lone surrogate (both raised by encodeURIComponent).
var searchUrl = function searchUrl(searchPhrase) {
  // encodeURIComponent produces UTF-8 percent-encoding and escapes a literal
  // '+' as %2B. The deprecated escape() did neither: it emitted %uXXXX for
  // non-Latin-1 characters and left '+' alone, so "c++" was sent as "c  ".
  // Spaces are then written as '+', the usual query-string form.
  var encodedPhrase = encodeURIComponent(searchPhrase).replace(/%20/g, '+');

  // Later pages are reached by following the "next" link found in each
  // results page, so no page/start parameter is added here.
  return gBase + '/search?hl=en&output=search&q=' + encodedPhrase + '&';
};
module.exports.searchUrl = searchUrl;
// Given a search URL (for a single results page), request it and parse the results.
// url: absolute URL of the results page to fetch.
// callback: called as callback(error) on failure, or as callback(null, res) where
//   res.results     - array of { title, url, description, ranking }; ranking is the
//                     1-based position of the entry within this page
//   res.nextPageUrl - absolute URL of the next results page, or null if the page
//                     has no "next" link
var getGoogleResultsPage = function getGoogleResultsPage(url, callback) {
  // (default 'Windows NT 6.0' probably looks fishy coming from a Linux server)
  jscrape.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.52 Safari/536.5';

  jscrape(url, function (error, $, response) {
    // All failures are reported through `callback`. (These paths used to call
    // `next`, which does not exist in this scope and raised a ReferenceError.)
    if (error) return callback(error);
    if (!$) return callback(new Error("Missing jQuery object"));

    // (highly unlikely)
    var statusCode = response ? response.statusCode : undefined;
    if (statusCode !== 200) return callback(new Error("Bad status code " + statusCode));

    var res = {
      nextPageUrl: null,
      results: []
    };

    // parse results
    $('#search ol li.g').each(function () {
      var $rc = $(this).find('div.rc');
      res.results.push({
        // link text is the title; the href is deliberately not used
        title: $rc.find('> h3 a').text(),
        // the visible URL shown under the title
        url: $rc.find('> div.s .f cite').text(),
        description: $rc.find('> div.s .st').text(),
        ranking: res.results.length + 1
      });
    });

    // parse the Next link
    var nextPageUrl = $('#nav a#pnnext').attr('href');
    if (typeof nextPageUrl === 'undefined' || nextPageUrl === null || nextPageUrl === '') {
      // no further page
      res.nextPageUrl = null;
    }
    else if (/^http/.test(nextPageUrl)) {
      // the link should be relative; anything else means the markup changed
      return callback(new Error("Next-page link is not in expected format"));
    }
    else {
      res.nextPageUrl = gBase + nextPageUrl;
    }

    callback(null, res);
  });
};
// Locate the first search result that satisfies urlChecker, walking at most
// 10 result pages and stopping as soon as a match is seen (later pages are
// not requested once a match is found).
// urlChecker:
// - a string: turned into a matcher via defaultUrlChecker(string).
// - a function: called with each result object (as produced by
//   getGoogleResultsPage); only a return value of exactly `true` is a match.
// - anything else: an Error is thrown synchronously.
// callback gets (error) on failure, otherwise (null, result): the matching
// result object with a `page` property added, or false when nothing matched.
var getGoogleRanking = function getGoogleRanking(searchPhrase, urlChecker, callback) {
  var checkerKind = typeof urlChecker;
  if (checkerKind !== 'string' && checkerKind !== 'function') {
    throw new Error('urlChecker needs to be a string or a function');
  }
  var isMatch = (checkerKind === 'string') ? defaultUrlChecker(urlChecker) : urlChecker;

  var currentPage = 1;
  var currentUrl = searchUrl(searchPhrase); // first page; later ones come from each page's "next" link
  var match = false;

  // Keep going while nothing matched, a next page exists and the page cap is not hit.
  function shouldFetchMore() {
    return !match && currentUrl != null && currentPage <= 10;
  }

  // Fetch one page, remember where the next one lives, then scan its entries.
  // (Following the page's own next link rather than building start=N URLs.)
  function fetchAndScan(next) {
    getGoogleResultsPage(currentUrl, function (error, pageResults) {
      if (error) {
        return next(error);
      }

      currentUrl = pageResults.nextPageUrl || null;

      var idx = 0;
      while (idx < pageResults.results.length) {
        var candidate = pageResults.results[idx];
        if (isMatch(candidate) === true) {
          candidate.page = currentPage;
          match = candidate;
          // the loop test sees a non-falsy match and stops
          return next();
        }
        idx += 1;
      }

      currentPage += 1;
      next();
    });
  }

  function finish(error) {
    if (error) {
      return callback(error);
    }
    callback(null, match);
  }

  async.whilst(shouldFetchMore, fetchAndScan, finish);
};
module.exports.getGoogleRanking = getGoogleRanking;
// Gather the results of up to 10 consecutive result pages for a query.
// searchPhrase: string to search for
// callback: gets (error) if any page fails, otherwise (null, results) where
//           results is every page's result list concatenated in page order
var getGoogleResults = function getGoogleResults(searchPhrase, callback) {
  var pagesFetched = 0;
  var nextUrl = searchUrl(searchPhrase);
  var collected = [];

  // Stop after 10 pages or as soon as a page offers no next link.
  function hasMorePages() {
    return nextUrl != null && pagesFetched < 10;
  }

  // Each page supplies the URL of the one after it; that link is followed
  // instead of computing a start=N parameter.
  function fetchPage(next) {
    getGoogleResultsPage(nextUrl, function (error, pageResults) {
      if (error) {
        return next(error);
      }
      nextUrl = pageResults.nextPageUrl || null;
      collected = collected.concat(pageResults.results);
      pagesFetched += 1;
      next();
    });
  }

  function finish(error) {
    if (error) {
      return callback(error);
    }
    callback(null, collected);
  }

  async.whilst(hasMorePages, fetchPage, finish);
};
module.exports.getGoogleResults = getGoogleResults;
// Default urlChecker for a string match. Returns a checker function.
// url: the address to look for; a leading http:// or https:// (any letter
//      case) is stripped before comparing.
// The returned function takes a result object and returns true when
// result.url is a string containing the stripped address, false otherwise.
// It always returns a boolean and never throws for a missing or non-string
// url (previously it returned undefined on a miss and threw on a null url).
var defaultUrlChecker = function(url) {
  // Remove protocol prefix so the needle can match a URL shown without one
  var needle = url.replace(/^https?:\/\//i, '');

  return function(result) {
    if (!result || typeof result.url !== 'string') {
      return false;
    }
    return result.url.indexOf(needle) !== -1;
  };
};