forked from HTTPArchive/legacy.httparchive.org
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathurls.inc
230 lines (176 loc) · 7.16 KB
/
urls.inc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
<?php
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
require_once("utils.inc");
require_once("dbapi.inc");
// Get all the info about a particular URL.
//
// $url      - the URL to look up; it is compared against both the urlOrig and
//             urlFixed columns of the urls table.
// $bMassage - if true and the exact URL is not found, permutations of $url are
//             attempted in this order:
//               1. a trailing "/" is appended (only when $url has no "/" at or
//                  after offset 10 - presumably meaning it has no path part),
//               2. "www." is inserted after "http://" / "https://" (only when
//                  the URL does not already contain it).
//
// Returns the first matching row with an extra 'url' element holding the best
// value for the URL (urlFixed if set, otherwise urlOrig), or null if no
// candidate matched.
//
// NOTE(review): $url is interpolated into the SQL text as-is - confirm that
// callers escape it before calling this function.
function getUrl($url, $bMassage=false) {
    global $gUrlsTable;

    // Ordered list of URLs to try. Queries are still issued one at a time and
    // we stop at the first hit.
    $aCandidates = array($url);
    if ( $bMassage ) {
        // Start from the original URL so the "www." permutation is well-defined
        // even when no trailing "/" gets appended.
        $alturl = $url;

        // The strlen() guard is needed because strpos() rejects an offset that
        // lies beyond the end of the string.
        if ( strlen($url) < 10 || FALSE === strpos($url, "/", 10) ) {
            // add a trailing "/"
            $alturl = $url . "/";
            $aCandidates[] = $alturl;
        }

        if ( FALSE === strpos($alturl, "http://www.") && FALSE === strpos($alturl, "https://www.") ) {
            // add "www."
            $alturl = str_replace("http://", "http://www.", $alturl);
            $alturl = str_replace("https://", "https://www.", $alturl);
            $aCandidates[] = $alturl;
        }
    }

    // TODO - What if there are multiple results?
    foreach ( $aCandidates as $candidate ) {
        $query = "select * from $gUrlsTable where urlOrig='$candidate' or urlFixed='$candidate';";
        $row = doRowQuery($query);

        // empty() keeps this quiet when the query produced no row at all.
        $existingUrl = null;
        if ( !empty($row['urlFixed']) ) {
            $existingUrl = $row['urlFixed'];
        }
        else if ( !empty($row['urlOrig']) ) {
            $existingUrl = $row['urlOrig'];
        }

        if ( $existingUrl ) {
            // Set the best value for URL.
            $row['url'] = $existingUrl;
            return $row;
        }
    }

    return null;
}
// Return the URL to Alexa for more info about the website's rank.
//
// The urlOrig stored for $url is looked up in the urls table and turned into:
//   - "http://www.alexa.com/siteinfo/<rest>" when urlOrig starts with
//     "http(s)://www.",
//   - an Alexa search URL for any other urlOrig starting with "htt",
//   - the bare "http://www.alexa.com/siteinfo" page when nothing usable was
//     found.
//
// NOTE(review): $url is interpolated into the SQL text as-is - confirm that
// callers escape it before calling this function.
function rankUrl($url) {
    global $gUrlsTable;

    // `rank` is backtick-quoted because RANK is a reserved word as of
    // MySQL 8.0.2; the quoted form is valid on older servers too.
    $query = "select urlOrig from $gUrlsTable where " . getUrlhashCond($url) . " and (urlFixed='$url' or urlOrig='$url') order by `rank` asc;";

    // Cast so that an empty lookup result reaches preg_match() as "" rather
    // than as a non-string value.
    $urlOrig = (string) doSimpleQuery($query);

    $aMatches = array();
    if ( preg_match('/^http[s]*:\/\/www\.(.*)/', $urlOrig, $aMatches) ) {
        return "http://www.alexa.com/siteinfo/" . $aMatches[1];
    }

    if ( preg_match('/^http[s]*:\/\/(.*)\//', $urlOrig, $aMatches) || preg_match('/^http[s]*:\/\/(.*)/', $urlOrig, $aMatches) ) {
        // No "www." prefix: strip the scheme (and a trailing "/" when there is
        // one) and let Alexa search for the site.
        return "http://www.alexa.com/search?q=" . $aMatches[1] . "&r=site_siteinfo&p=bigtop";
    }

    if ( preg_match('/^(htt.*)/', $urlOrig, $aMatches) ) {
        // Starts with "htt" but is not a well-formed http(s):// URL: search
        // for the whole string.
        return "http://www.alexa.com/search?q=" . $aMatches[1] . "&r=site_siteinfo&p=bigtop";
    }

    return "http://www.alexa.com/siteinfo";
}
// Return the website's rank as stored in the urls table, or the string "n/a"
// when the lookup yields nothing (or a falsy value).
//
// $url    - matched against the urlOrig column only.
// $pageid - currently unused; see the TODO below.
//
// TODO - Eventually we might want to return the rank RELATIVE to a specific pageid.
// For example, foobar.com might be rank 123 today but was 456 at the time of the run.
//
// NOTE(review): $url is interpolated into the SQL text as-is - confirm that
// callers escape it before calling this function.
function rank($url, $pageid=null) {
    global $gUrlsTable;

    // `rank` is backtick-quoted because RANK is a reserved word as of
    // MySQL 8.0.2; the quoted form is valid on older servers too.
    $query = "select `rank` from $gUrlsTable where " . getUrlhashCond($url) . " and (urlOrig='$url') order by `rank` asc;";
    $rank = doSimpleQuery($query);

    return ( $rank ? $rank : "n/a" );
}
// Queue a request to add $url: a row with action='add' and the current Unix
// time is inserted into the urls change table.
// Returns whatever doSimpleCommand() returns for that insert.
function addSite($url) {
    global $gUrlsChangeTable;

    $createdate = time();

    return doSimpleCommand("insert into $gUrlsChangeTable set url='$url', action='add', createdate = $createdate;");
}
// Queue a request to remove $url: a row with action='remove' and the current
// Unix time is inserted into the urls change table.
// Returns whatever doSimpleCommand() returns for that insert.
function removeSite($url) {
    global $gUrlsChangeTable;

    $createdate = time();

    return doSimpleCommand("insert into $gUrlsChangeTable set url='$url', action='remove', createdate = $createdate;");
}
// Return a truthy value (the stored URL) if the URL is already in the list.
//
// $url    - looked up as given; if that finds nothing and the URL does not
//           already contain "http://www." / "https://www.", a second lookup is
//           made with "www." inserted after the scheme.
// $other  - (out) the 'other' column of the last row examined, or null when
//           no row was found.
// $optout - (out) the 'optout' column of the last row examined, or null when
//           no row was found.
//
// Returns urlFixed if set, otherwise urlOrig; a falsy value when the URL is
// not in the list.
//
// NOTE(review): $url is interpolated into the SQL text as-is - confirm that
// callers escape it before calling this function.
function urlExists($url, &$other=NULL, &$optout=NULL) {
    global $gUrlsTable;

    $aCandidates = array($url);
    if ( FALSE === strpos($url, "http://www.") && FALSE === strpos($url, "https://www.") ) {
        $wwwUrl = str_replace("http://", "http://www.", $url);
        $wwwUrl = str_replace("https://", "https://www.", $wwwUrl);
        $aCandidates[] = $wwwUrl;
    }

    $existingUrl = null;
    foreach ( $aCandidates as $candidate ) {
        $query = "select urlOrig, urlFixed, other, optout from $gUrlsTable where urlOrig='$candidate' or urlFixed='$candidate';";
        $row = doRowQuery($query);

        // isset()/empty() keep this quiet when the query produced no row.
        if ( !empty($row['urlFixed']) ) {
            $existingUrl = $row['urlFixed'];
        }
        else {
            $existingUrl = ( isset($row['urlOrig']) ? $row['urlOrig'] : null );
        }
        $other = ( isset($row['other']) ? $row['other'] : null );
        $optout = ( isset($row['optout']) ? $row['optout'] : null );

        if ( $existingUrl ) {
            break;
        }
    }

    return $existingUrl;
}
// Fetch all pending url changes (url, createDate, action), ordered by action
// and then by createDate ascending.
// Returns the result of doQuery() - a mysql result instance according to the
// original comment - or -1 when that result is falsy.
function pendingUrls() {
    global $gUrlsChangeTable;

    $pending = doQuery("select url, createDate, action from $gUrlsChangeTable order by action, createDate asc;");
    if ( ! $pending ) {
        return -1;
    }

    return $pending;
}
// Approve a pending "add": the url is added to the urls table via
// addOtherUrl() and then deleted from the urls change table.
function approveAddUrl($url) {
    global $gUrlsChangeTable;

    addOtherUrl($url);

    // The request has been handled, so drop it from the queue.
    doSimpleCommand("delete from $gUrlsChangeTable where url='$url';");
}
// Approve a pending "remove": the url's data is removed via removeUrlData()
// and the url is then deleted from the urls change table.
function approveRemoveUrl($url) {
    global $gUrlsChangeTable;

    removeUrlData($url);

    // The request has been handled, so drop it from the queue.
    doSimpleCommand("delete from $gUrlsChangeTable where url='$url';");
}
// Store $url in the urls table flagged with other=1.
// Uses "replace into", setting urlOrig, other, timeAdded (current Unix time)
// and urlhash (the SQL expression produced by getUrlhashFunc()).
// Returns nothing.
function addOtherUrl($url) {
    global $gUrlsTable;

    $timeAdded = time();
    $urlhash = getUrlhashFunc($url);

    doSimpleCommand("replace into $gUrlsTable set urlOrig='$url', other=1, timeAdded=$timeAdded, urlhash=$urlhash;");
}
// Reject a pending url change by deleting every row for $url from the urls
// change table. Nothing else is touched.
function rejectUrl($url) {
    global $gUrlsChangeTable;

    doSimpleCommand("delete from $gUrlsChangeTable where url='$url';");
}
// Remove a URL and the data collected for it:
//   1. delete it from the urls table so it's no longer crawled,
//   2. look up every pageid recorded for the URL in the pages table,
//   3. delete those pages and their requests.
// Returns nothing.
//
// NOTE(review): $url is interpolated into the SQL text as-is - confirm that
// callers escape it before calling this function.
function removeUrlData($url) {
    global $gPagesTable, $gRequestsTable, $gUrlsTable;

    // delete it from the urls table so it's no longer crawled
    $cmd = "delete from $gUrlsTable where urlOrig='$url' or urlFixed='$url';";
    doSimpleCommand($cmd);

    $result = doQuery("select pageid from $gPagesTable where url='$url';");
    if ( ! $result ) {
        // The lookup failed: there is nothing to fetch from or to free.
        return;
    }

    $aPageids = array();
    while ( $row = mysqli_fetch_assoc($result) ) {
        // Cast so that only integers end up in the "in (...)" lists below.
        $aPageids[] = (int) $row['pageid'];
    }
    mysqli_free_result($result);

    $sPageids = implode(",", $aPageids);
    if ( $sPageids ) {
        $cmd = "delete from $gPagesTable where pageid in ($sPageids);";
        doSimpleCommand($cmd);
        $cmd = "delete from $gRequestsTable where pageid in ($sPageids);";
        doSimpleCommand($cmd);
    }

    // TODO - we should recompute all the stats at this point - ouch
}
// Mark a URL as opted out: sets optout=true on every row of the urls table
// whose urlOrig or urlFixed equals $url.
function optoutUrl($url) {
    global $gUrlsTable;

    $where = "urlOrig='$url' or urlFixed='$url'";

    doSimpleCommand("update $gUrlsTable set optout=true where $where;");
}
// Helper so nobody has to remember the SQL for converting a URL to its
// urlhash: returns a ready-made SQL where condition of the form
// "urlhash = <expression from getUrlhashFunc($url)>".
function getUrlhashCond($url) {
    $hashExpr = getUrlhashFunc($url);

    return "urlhash = $hashExpr";
}
// Helper so nobody has to remember the SQL for converting a URL to its
// urlhash: returns the SQL expression (as text) that takes the first 4 hex
// characters of md5($url) and converts them from base 16 to base 10.
// Nothing is computed in PHP; the expression is meant to be embedded in a query.
function getUrlhashFunc($url) {
    return "conv(substring(md5('" . $url . "'), 1, 4), 16, 10)";
}
?>