-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathread_parser.py
123 lines (112 loc) · 3.11 KB
/
read_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import re
from collections import defaultdict
from lxml import html
from lxml import etree
from book import Book
def get_rating_from_title(rating_title: str):
    """Extract the numeric rating from a rating title string.

    The rating is read from the third whitespace-separated word counted
    from the end of the title.

    Returns:
        An int when that word is a whole number, a float when it only
        parses as a float, and None when the title has fewer than three
        words or cannot be parsed. Unexpected failures are printed and
        also produce None.
    """
    try:
        token = rating_title.split()[-3]
        try:
            value = int(token)
        except ValueError:
            # Not a whole number; a fractional rating is still acceptable.
            value = float(token)
    except IndexError:
        # Fewer than three words: the 'нет рейтинга' ("no rating") title.
        return None
    except Exception as ex:
        print('get_rating_from_title("%s"): %s' % (rating_title, ex))
        return None
    else:
        return value
def get_max_rating_from_title(rating_title: str):
    """Extract the maximum possible rating from a rating title string.

    The maximum is read from the last whitespace-separated word of the
    title and must be a whole number.

    Returns:
        The maximum rating as an int, or None when the last word is not an
        integer. Any other failure is printed and also produces None.
    """
    try:
        max_rating = int(rating_title.split()[-1])
    except ValueError:
        # Last word is not a number: the 'нет рейтинга' ("no rating") title.
        return None
    except Exception as ex:
        print('get_max_rating_from_title("%s"): %s' % (rating_title, ex))
        return None
    else:
        return max_rating
def try_get_link(link: str):
    """Return ``link`` if it points to a book page, otherwise None.

    A link is treated as a book link when it contains the '/book/' path
    segment. A missing link (None) is reported as "not a book link" instead
    of raising TypeError, because callers pass the raw value of an element's
    'href' attribute, which may be absent.
    """
    if link is not None and "/book/" in link:
        return link
    return None
def parse_book(row, last_date: str, without_rating: bool):
    """Try to build a Book from one table row of the read list.

    Every element yielded by ``row.iter()`` is inspected until both a rating
    and a book link have been found.

    Args:
        row: Row element; must provide ``iter()`` and its elements must
            provide ``xpath()`` and ``get()``.
        last_date: Date string passed through to the created Book.
        without_rating: When True, a row without a parsed rating is still
            accepted and gets the placeholder rating -1.

    Returns:
        A Book when both link and rating are available, otherwise None.
        When only one of the two was found, a diagnostic with the row markup
        is printed.
    """
    link = None
    rating = None
    max_rating = None
    for cell in row.iter():
        if rating is None:
            spans = cell.xpath('.//span')
            # The rating title is read from the second span, and only when
            # the element contains exactly two spans.
            if len(spans) == 2:
                rating_title = spans[1].get('title')
                rating = get_rating_from_title(rating_title)
                max_rating = get_max_rating_from_title(rating_title)
        if link is None:
            # Check each anchor in turn (previously only the first anchor was
            # tested, once per anchor) and stop at the first book link.
            for anchor in cell.xpath('.//a'):
                href = anchor.get('href')
                if href is None:
                    # Anchor without an href attribute cannot be a book link.
                    continue
                link = try_get_link(href)
                if link is not None:
                    break

    if rating is None and without_rating:
        # Placeholder so that unrated books are still exported.
        rating = -1

    if link is not None and rating is not None:
        return Book(link, rating, max_rating, last_date)

    # Exactly one of the two values was found: the row looked like a book
    # row but could not be parsed completely, so report it.
    if link is not None or rating is not None:
        if link is None:
            print('Parsing error (link is not parsed):')
        if rating is None:
            print('Parsing error (rating is not parsed):')
        print(etree.tostring(row))
        print('')
    return None
# Russian month names mapped to their two-digit month numbers.
_MONTH_NUMBERS = {
    'Январь': '01',
    'Февраль': '02',
    'Март': '03',
    'Апрель': '04',
    'Май': '05',
    'Июнь': '06',
    'Июль': '07',
    'Август': '08',
    'Сентябрь': '09',
    'Октябрь': '10',
    'Ноябрь': '11',
    'Декабрь': '12',
}


def try_parse_month(raw_month: str):
    """Convert a Russian month name into a two-digit month number string.

    Args:
        raw_month: Month name exactly as it appears in the mapping
            (capitalized, e.g. 'Март').

    Returns:
        The month number as a zero-padded string ('01'..'12'). Unknown
        names fall back to '01'.
    """
    # A constant mapping with .get() avoids shadowing the builtin ``dict``,
    # rebuilding the table on every call, and inserting unknown keys.
    return _MONTH_NUMBERS.get(raw_month, '01')
def try_parse_date(row):
    """Try to read a "<month> <year> г." date header from a table row.

    Looks at the text of every ``td/h2`` element under ``row`` and uses the
    first one that contains a four-digit year followed by ' г'.

    Args:
        row: Row element; must provide ``xpath()``.

    Returns:
        A date string in the form 'YYYY-MM-01', or None when the row has no
        matching header. The month comes from the first word of the header
        text; an unrecognized month is mapped by try_parse_month().
    """
    for header in row.xpath('.//td/h2'):
        raw_text = header.text
        if raw_text is None:
            continue
        # Raw string: '\d' in a normal string literal is an invalid escape.
        # The trailing '.' is intentionally left as "any character", exactly
        # as the original pattern behaved.
        m = re.search(r'(\d{4}) г.', raw_text)
        if m is None:
            continue
        year = m.group(1)
        # split() without an argument ignores leading whitespace, so padded
        # header text does not yield an empty month word.
        raw_month = raw_text.split()[0]
        month = try_parse_month(raw_month)
        return '%s-%s-01' % (year, month)
    return None
# ReadParser - parse read list in html format
class ReadParser:
    """Parser for a read list stored as an HTML document.

    Typical use: call load_from_file() and, if it returned True, call
    parse_books() to get the list of parsed Book objects.
    """

    def __init__(this):
        # Raw HTML text; stays None until load_from_file() succeeds, so that
        # parse_books() can be called safely on a fresh instance.
        this.content = None

    def load_from_file(this, file_name: str):
        """Read the UTF-8 encoded HTML file into ``content``.

        Returns:
            True on success. On any failure the error is printed,
            ``content`` is reset to None and False is returned.
        """
        try:
            with open(file_name, 'r', encoding="utf-8") as file:
                this.content = file.read()
        except Exception as ex:
            print('load_from_file("%s"): %s' % (file_name, ex))
            this.content = None
            return False
        return True

    def parse_books(this, without_rating: bool) -> list[Book]:
        """Parse all books from the loaded HTML content.

        Rows are processed in document order. A row that is not a book is
        checked for a date header, and the most recent date found is passed
        to parse_book() for the rows that follow.

        Args:
            without_rating: Passed through to parse_book().

        Returns:
            List of parsed books; empty when no content is loaded.
        """
        if not this.content:
            # Nothing was loaded (or loading failed): report it and return an
            # empty result rather than failing inside the HTML parser.
            print('parse_books(): content is not loaded')
            return []

        books = []
        last_date = None
        for row in html.fromstring(this.content).xpath('//tr'):
            book = parse_book(row, last_date, without_rating)
            if book is not None:
                books.append(book)
                continue
            # Not a book row: it may be a date header for the next books.
            date = try_parse_date(row)
            if date is not None:
                last_date = date
        return books