-
Notifications
You must be signed in to change notification settings - Fork 0
/
with_class.py
83 lines (73 loc) · 2.66 KB
/
with_class.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import urllib
import pandas as pd
from urllib.request import urlopen
from tqdm import trange
def tiny_url(url):
    """Shorten *url* via the TinyURL API and return the short URL as a str.

    Raises urllib.error.URLError/HTTPError on network or API failure.
    """
    # Local imports keep this function self-contained; `import urllib` at the
    # top of the file does not guarantee the `parse` submodule is loaded.
    import urllib.parse
    import urllib.request

    apiurl = "http://tinyurl.com/api-create.php?url="
    # BUG FIX: the target must be percent-encoded — otherwise any '?', '&'
    # or '#' in *url* is parsed as part of the API request itself.
    encoded = urllib.parse.quote(url, safe="")
    # `with` closes the HTTP response deterministically (the original leaked
    # the connection until garbage collection).
    with urllib.request.urlopen(apiurl + encoded) as resp:
        return resp.read().decode("utf-8")
class scrape():
    """Scrape one HTML listing page.

    For every container element matching *in_what*, extract the first match
    of each (tag, attrs) pair listed in *what_find*.
    """

    def __init__(self, url1, in_what, what_find):
        # url1: URL of the page to fetch.
        # in_what: [tag, attrs] pair identifying each result container.
        # what_find: list of [tag, attrs] pairs to locate inside a container.
        self.url1 = url1
        self.in_what = in_what
        self.what_find = what_find

    def fetch_data(self):
        """Fetch the page and return a list of rows.

        Each row is a list of BeautifulSoup Tags — one entry per pair in
        ``self.what_find`` — with ``None`` where a sub-element is missing.
        """
        # Browser-like headers: some sites reject the default requests UA.
        headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}
        # BUG FIX: the headers dict was built but never sent; pass it so the
        # request actually presents itself as a browser.
        r = requests.get(self.url1, headers=headers)
        soup = BeautifulSoup(r.content, features="html.parser")
        rows = []
        for container in soup.findAll(self.in_what[0], attrs=self.in_what[1]):
            # find(tag, attrs) returns the first match or None.
            rows.append([container.find(tag, attrs)
                         for tag, attrs in self.what_find])
        return rows
# --- Scrape the Ghent University library catalog for "machine learning". ---
outp = []
how_many = 3  # pages 1 .. how_many-1 are scraped (range is half-open)
for z in range(1, how_many):
    this = scrape(
        "https://lib.ugent.be/en/catalog?page=" + str(z) + "&q=machine+learning",
        ["article", {'class': 'col-md-12 search-result'}],
        [["h2", {"class": "search-result__title meta-title"}],
         ["span", {"class": "meta-type"}],
         ["small", {"class": "help-block text-center"}]],
    )
    test = this.fetch_data()
    for i in (t := trange(len(test))):
        t.set_description("the page %.2f in the page %.2f" % (z, i))
        tm = []
        for j in range(len(test[i])):
            if j == 0:
                # Title cell: record the title text plus a shortened link
                # to the full catalog record.
                inb = test[i][j].a.text.strip().replace("\n", "")
                url1 = "https://lib.ugent.be/" + test[i][j].a["href"]
                url2 = tiny_url(url1)
                tm.append(inb)
                tm.append(url2)
            elif j == 2:
                # The "help-block" element is present only when the item is
                # NOT freely readable online.
                # BUG FIX: the original used a bare `except:` as a None check,
                # which also swallowed KeyboardInterrupt/SystemExit and any
                # real bug; an explicit None test is the intended semantics
                # (only `None.text` could raise — Tag.text never does).
                if test[i][j] is not None:
                    tm.append("i CAN'T read it:(")
                else:
                    tm.append("i can read it:)")
            else:
                # Plain text cell (e.g. the record's media type).
                inb = test[i][j].text.strip().replace("\n", "")
                tm.append(inb)
        outp.append(tm)

# Print only the records that are readable online.
for i in range(len(outp)):
    if outp[i][-1] != "i CAN'T read it:(":
        print(outp[i])