import datetime
import math

import scrapy

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class BaseInstarSpider(BaseGazetteSpider):
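    # Base spider for gazettes published on the Instar platform. Concrete
    # spiders subclass this and provide `base_url` (see the sketch at the
    # bottom of the file).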
    power = "executive_legislative"

    def start_requests(self):
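        # Request the first listing page for the whole date range; the
        # remaining pages are scheduled once the first response reveals the
        # total result count.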
        page = 1
        start_date = self.start_date.strftime("%d-%m-%Y")
        end_date = self.end_date.strftime("%d-%m-%Y")
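        # Listing URL layout: <base_url>/<page>/<start>/<end>/0/0/, with
        # dates formatted as dd-mm-YYYY.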
        start_url = f"{self.base_url}/{page}/{start_date}/{end_date}/0/0/"
        yield scrapy.Request(
            start_url,
            cb_kwargs={"page": page, "start_date": start_date, "end_date": end_date},
        )

    def _pagination_requests(self, response, page, start_date, end_date):
        if page == 1:
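            # Only the first response carries the total result counter; fall
            # back to "0" so int() still succeeds if the element is missing.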
            num_results = int(response.css(".sw_qtde_resultados::text").get("0"))
            results_per_page = 50
            total_pages = math.ceil(num_results / results_per_page)
            for next_page in range(2, total_pages + 1):
                next_page_url = (
                    f"{self.base_url}/{next_page}/{start_date}/{end_date}/0/0/"
                )
                yield scrapy.Request(
                    next_page_url,
                    cb_kwargs={
                        "page": next_page,
                        "start_date": start_date,
                        "end_date": end_date,
                    },
                )

    def parse(self, response, page, start_date, end_date):
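        # Each result block carries the gazette date, the edition number, and
        # a link to the gazette's detail page.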
        gazettes = response.css(".dof_publicacao_diario")
        for gazette in gazettes:
            raw_gazette_date = gazette.css("span::text").re_first(
                r"\d{2}\/\d{2}\/\d{4}"
            )
            gazette_date = datetime.datetime.strptime(
                raw_gazette_date, "%d/%m/%Y"
            ).date()
            edition_number = gazette.css(".dof_titulo_publicacao span::text").re_first(
                r"\d+"
            )
            gazette_url = response.urljoin(gazette.css("a::attr(href)").get())
            item = Gazette(
                date=gazette_date,
                edition_number=edition_number,
                is_extra_edition=False,
                power=self.power,
            )
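            # The downloadable file URL lives on the detail page, so the item
            # is completed in parse_gazette_url.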
            yield scrapy.Request(
                gazette_url, callback=self.parse_gazette_url, cb_kwargs={"item": item}
            )

        yield from self._pagination_requests(response, page, start_date, end_date)

    def parse_gazette_url(self, response, item):
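        # Resolve the (possibly relative) link to the gazette file and emit
        # the item assembled in parse().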
        gazette_url = response.urljoin(
            response.css(".d_titulo_edicao a::attr(href)").get()
        )
        yield Gazette(
            file_urls=[
                gazette_url,
            ],
            **item,
        )
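

# A minimal sketch of how a city spider might subclass this base. All values
# below are hypothetical placeholders, not taken from this file; real spiders
# define their own name, territory id, dates, and Instar base_url.
#
#     class SpExampleSpider(BaseInstarSpider):
#         name = "sp_example"
#         TERRITORY_ID = "3500000"
#         allowed_domains = ["example.instar.com.br"]
#         start_date = datetime.date(2015, 1, 1)
#         base_url = "https://example.instar.com.br/portal/diario-oficial"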