-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
MODULE.py
51 lines (38 loc) · 1.27 KB
/
MODULE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import requests
from bs4 import BeautifulSoup
from time import sleep
# Accumulators filled by parseSite(): one title per parsed article,
# with the matching absolute URL at the same index in urlList.
titleList = []
urlList = []
# Returned by getInfo(); nothing in this file appends to them —
# presumably populated elsewhere or vestigial. TODO confirm with callers.
spisNews = []
spisUrl = []
# Parse info
# Browser-like User-Agent sent with every request so the target site
# does not reject the scraper's default python-requests identity.
headers = {"User-Agent":
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5 (.NET CLR 3.5.30729)"}
def parseSite(url, divLocation, titleAtr, titleClass):
    """Scrape one listing page and collect article titles and links.

    Fetches `url`, finds every <div> whose class is `divLocation`, and
    inside each looks up the `titleAtr` tag with class `titleClass`.
    The tag's text is appended to the module-level `titleList` and its
    href (made absolute against the site's '.ru' base) to `urlList`.

    Args:
        url: Page to fetch; must contain '.ru' so the base URL can be
            derived for relative hrefs.
        divLocation: CSS class of the <div> wrapping each article.
        titleAtr: Tag name of the title element (e.g. 'a', 'h2').
        titleClass: CSS class of the title element.

    Returns:
        None — results accumulate in titleList / urlList.
    """
    # NOTE: the original decorated this free function with @property,
    # which turns it into a property object — calling parseSite(...)
    # raised TypeError. The decorator is removed.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")
    # Everything before (and including) '.ru' is the site base used to
    # absolutize relative hrefs; hoisted out of the loop (invariant).
    base = url.rpartition('.ru')[0] + '.ru'
    for article in soup.findAll("div", class_=divLocation):
        # Single lookup instead of the original duplicated find() call.
        title_tag = article.find(titleAtr, class_=titleClass)
        if title_tag is None:
            # Skip blocks without a title element instead of crashing
            # with AttributeError on .text.
            continue
        titleList.append(title_tag.text)
        urlList.append(base + str(title_tag.get('href')))
def getInfo(fileLocation):
    """Append every collected title/URL pair to a data file.

    Writes one "title | url" line per entry gathered by parseSite()
    into the file at `fileLocation` (opened in append mode, UTF-8).

    Args:
        fileLocation: Path of the text file to append to.

    Returns:
        The module-level (spisUrl, spisNews) pair, preserving the
        original interface (nothing in this file populates them).
    """
    # NOTE: the original decorated this free function with @property,
    # making it uncallable; the decorator is removed.
    # 'with' guarantees the file is closed — the original wrote
    # `file.close` without parentheses, so the handle was never closed.
    # "utf-8" fixes the invalid encoding name "utf=8", which raised
    # LookupError on open().
    with open(fileLocation, 'a', encoding="utf-8") as file:
        # zip pairs titles with URLs directly. The original manual
        # counter incremented before indexing, silently skipping the
        # first entry and relying on IndexError to stop the loop.
        for title, link in zip(titleList, urlList):
            file.write(str(title) + ' | ' + str(link) + "\n")
    print("You parsed all news in data file")
    return spisUrl, spisNews