-
Notifications
You must be signed in to change notification settings - Fork 2
/
site_parser.py
39 lines (36 loc) · 1.14 KB
/
site_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#-*- coding: utf-8 -*-
"""
Parsowanie strony z ogłoszeniami
"""
import re
# One match per announcement table. Capture groups, in order:
#   1. the header cell spanning two columns,
#   2-4. the next three <td> cells that precede the closing </tr><tfoot>.
# DOTALL lets ".*?" run across line breaks in the page markup.
PATTERN_ANNOUNCEMENTS = re.compile(
    r'<thead>.*?<tr>.*?<td colspan=\"2\">(.*?)</td>.*?</tr>.*?</thead>'
    r'.*?<td>(.*?)</td>'
    r'.*?<td>(.*?)</td>'
    r'.*?<td>(.*?)</td>'
    r'.*?</tr><tfoot>',
    flags=re.DOTALL,
)
def repl(matchobj):
    """
    Return the plain-text replacement for one matched HTML fragment.

    Written to be passed as the replacement callable of ``re.sub``.

    :param matchobj: match object; only its whole match (``group(0)``) is used
    :return: "" for ``<br />``, the decoded character for ``&oacute;``,
             ``&Oacute;`` and ``&quot;``; any other matched text is
             returned unchanged
    """
    replacements = {
        "<br />": "",
        "&oacute;": "ó",
        "&Oacute;": "Ó",
        "&quot;": "\"",
    }
    matched = matchobj.group(0)
    # Fall back to the matched text itself: returning None from a re.sub
    # callback would silently delete whatever was matched.
    return replacements.get(matched, matched)
def announcements_from_html(html):
    """
    Parse the announcements page into a list of announcement records.

    Every match of ``PATTERN_ANNOUNCEMENTS`` found in ``html`` yields one
    dict. In the content field ``<br />`` tags are removed and the
    entities ``&oacute;``, ``&Oacute;`` and ``&quot;`` are decoded by
    ``repl``.

    :param html: content of https://librus.synergia.pl/ogloszenia
    :return: list [{"title": title,
                    "author": author,
                    "time": time,
                    "content": content}]
    """
    # Must list every fragment repl() knows how to translate; "&quot;" was
    # previously missing here, so it was never decoded.
    markup_pattern = r"<br />|&oacute;|&Oacute;|&quot;"
    return [
        {
            "title": title,
            # The first character of these two cells is dropped
            # (presumably leading whitespace in the page markup - TODO confirm).
            "author": author[1:],
            "time": posted[1:],
            "content": re.sub(markup_pattern, repl, content),
        }
        for title, author, posted, content in PATTERN_ANNOUNCEMENTS.findall(html)
    ]