-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_search.py
67 lines (51 loc) · 2.75 KB
/
web_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from lxml import html
import re
from urllib.request import Request, urlopen
def stringify_children(node):
from lxml.etree import tostring
from itertools import chain
parts = ([node.text] +
list(chain(*([c.text, tostring(c), c.tail] for c in node.getchildren()))) +
[node.tail])
parts = [(i if type(i) == str else i.decode()) for i in parts if(i)]
parts = [re.sub("<.*?>","",i ) for i in parts]
# filter removes possible Nones in texts and tails
return ''.join(filter(None, parts))
hex_chars = {
'%21': '!', '%22': '"', '%23': '#', '%24': '$', '%25': '%', '%26': '&', '%27': "'", '%28': '(', '%29': ')', '%2A': '*',
'%2B': '+', '%2C': ',', '%2D': '-', r'%2E': '.', r'%2F': '/', '%3A': ':', '%3B': ';','%3C': '<', '%3D': '=', r'%3E': '>',
r'%3F': '?', '%40': '@', '%5B': '[', '%5C': '\\','%5D': ']', r'%5E': '^', r'%5F': '_', '%60': '`', '%7B': '{', '%7C': '|', '%7D': '}',r'%7E': '~'
}
def decode_url(link):
#Fixes URL which are encased with hex characters
for hex_keys in hex_chars.keys():
if(hex_keys in link):
link = link.replace(hex_keys,hex_chars[hex_keys])
return(link)
#Google search results using xPath
def google_results(search_string):
#Takes in a search string to google and returns a google search results which gives description, url and heading
searching = "https://www.google.com/search?q="+"%20".join(search_string.split())
req = Request(searching,headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
tree = html.fromstring(webpage)
parsed_info = {}
url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
hyper_links = tree.xpath('//*[@id="main"]/div[*]/div/div[1]/a')#[7:]
results = len(hyper_links)
descriptions = tree.xpath('//*[@id="main"]/div[*]/div/div[3]/div/div/div/div/div')
descriptions = [stringify_children(x) for x in descriptions ]
headings = tree.xpath('//*[@id="main"]/div[*]/div/div[1]/a/h3/div/text()')
results = [len(headings),len(hyper_links),len(descriptions)]
for iter in range(min(results)):
dirty_url = hyper_links[iter].attrib["href"].split("&sa=")[0]
cleaned_url = decode_url(dirty_url)
cleaned_url = re.findall(url_regex,cleaned_url)
if(len(cleaned_url) == 0):
continue
parsed_info[headings[iter]] = {}
parsed_info[headings[iter]]['hyper_links'] = cleaned_url[0][0]
parsed_info[headings[iter]]['description'] = descriptions[iter]#.text
return(parsed_info)
# google_results("Parsing made easy")
# google_results("Create paper planes")