-
-
Notifications
You must be signed in to change notification settings - Fork 416
/
Copy patharatext.py
55 lines (43 loc) · 1.92 KB
/
aratext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import re
from urllib.parse import urlparse
import dateparser
from scrapy import Request
from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider
class BaseAratextSpider(BaseGazetteSpider):
    """Base spider for sites that list editions in an ``#edicoes-anteriores`` table.

    Each table row links to an intermediary page, which in turn links to the
    gazette file itself. The spider reads ``start_urls``, ``start_date``,
    ``end_date`` and ``power`` from ``self``; these are expected to be provided
    by the base spider or the concrete subclass.
    """

    def parse(self, response, page=1):
        """Parse one listing page and schedule the follow-up requests.

        Args:
            response: Listing page response containing the editions table.
            page: Number of the listing page being parsed; used to build the
                ``?page=`` query of the next listing request.

        Yields:
            One ``Request`` per edition whose date lies within
            ``[self.start_date, self.end_date]`` (handled by
            ``parse_intermediary_page``), followed by at most one ``Request``
            for the next listing page.
        """
        # Stays None when the table has no rows, so the pagination check below
        # can tell "nothing parsed" apart from a real date instead of hitting
        # an unbound local.
        edition_date = None

        for item in response.css("#edicoes-anteriores tbody tr"):
            # The third cell holds the date after the first comma of its text.
            raw_edition_date = (
                item.css("td")[2].css("::text").get().split(",")[1].strip()
            )
            edition_date = dateparser.parse(raw_edition_date, languages=["pt"]).date()

            if not self.start_date <= edition_date <= self.end_date:
                continue

            # The link text carries the edition number right before a slash.
            raw_edition_number = item.css("a::text").get().strip()
            edition_number = re.search(r"(\d+)/", raw_edition_number).group(1)
            intermediary_page = self._build_url(item.css("a").attrib["href"])

            gazette = {
                "date": edition_date,
                "edition_number": edition_number,
                "is_extra_edition": False,
                "power": self.power,
            }
            yield Request(
                intermediary_page,
                callback=self.parse_intermediary_page,
                cb_kwargs={"gazette": gazette},
            )

        has_next_page = bool(
            response.xpath('//*[@class="pagination"]//*[@rel="next"]')
        )
        # Keep paginating only while the last row seen is still newer than the
        # requested start date.
        # NOTE(review): this presumes the listing is sorted newest first - confirm.
        if (
            edition_date is not None
            and edition_date > self.start_date
            and has_next_page
        ):
            next_page = page + 1
            yield Request(
                f"{self.start_urls[0]}?page={next_page}",
                callback=self.parse,
                cb_kwargs={"page": next_page},
            )

    def parse_intermediary_page(self, response, gazette):
        """Build the final ``Gazette`` item from an edition's intermediary page.

        Args:
            response: Intermediary page response; the file link is taken from
                the ``href`` of the anchor matched by ``#Box-area-title a``.
            gazette: Dict of gazette fields collected by ``parse``.

        Yields:
            A ``Gazette`` built from ``gazette`` plus the resolved ``file_urls``.
        """
        file_path = response.css("#Box-area-title a").attrib["href"]
        yield Gazette(**gazette, file_urls=[self._build_url(file_path)])

    def _build_url(self, path):
        """Return the first start URL with its path component replaced by ``path``."""
        return urlparse(self.start_urls[0])._replace(path=path).geturl()