crawler_new.py
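# A simple web crawler: starting from a seed URL, it fetches pages,
# scrapes every '<a href=' target with plain string searching, and
# returns the list of pages visited.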
import sys
import requests

# browser-like User-Agent header; some servers reject requests without one
user_agent = {'User-agent': 'Mozilla/5.0'}

# fetching the HTML of a page; returns an empty string on any request error
def get_page(url):
    try:
        r = requests.get(url, headers=user_agent)
        return r.text
    except requests.RequestException:
        return ''
# finding the next '<a href=' link in the page text; returns the URL and
# the position where scanning should resume, or (None, 0) if none is left
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
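# e.g. get_next_target('x <a href="http://a.com"> y') returns ('http://a.com', 23)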
# appending each link in q to p, skipping duplicates; p is mutated in place
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)
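# e.g. after union(p, q) with p = [1, 2] and q = [2, 3], p is [1, 2, 3]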
# tocrawl = list of all pages left to crawl
# crawled = pages which have already been crawled
def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            # fetch the page before extracting its links; relative links are
            # collected as-is and will simply fail to fetch on a later pass
            union(tocrawl, get_all_links(get_page(page)))
            crawled.append(page)
    return crawled
# getting a list of all URLs found in a page
def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
# the seed URL is taken from the command line
url = sys.argv[1]
c = crawl_web(url)
for l in c:
    print(l)
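# Example run (https://example.com is just a placeholder seed):
#   python crawler_new.py https://example.com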