-
-
Notifications
You must be signed in to change notification settings - Fork 416
/
Copy pathatende_v2.py
92 lines (75 loc) · 2.84 KB
/
atende_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import re
import dateparser
from scrapy import FormRequest
from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider
class BaseAtendeV2Spider(BaseGazetteSpider):
    """
    Base spider for Gazettes that are available from cities listed on
    https://{city_subdomain}.atende.net

    This base class deals with 'Layout 2' gazette pages, usually requested
    from 'https://{city_subdomain}.atende.net/diariooficial'.

    Child classes must set ``city_subdomain``. The listing is fetched page by
    page through GET requests whose query string is built by ``get_params``.
    """

    allowed_domains = ["atende.net"]
    # Filled in by start_requests() once city_subdomain is known.
    BASE_URL = ""
    # Must be defined into child classes
    city_subdomain = ""

    # Keywords in the edition type that mark an extra edition. The alternatives
    # must not be padded with spaces: without re.VERBOSE the spaces would be
    # matched literally and e.g. a bare "Extra" would never match.
    _EXTRA_EDITION_PATTERN = re.compile(
        r"suplementar|retificação|extraordinária|extra",
        re.IGNORECASE,
    )

    def start_requests(self):
        """Build the endpoint URL for the city and request the first page."""
        self.BASE_URL = f"https://{self.city_subdomain}.atende.net/diariooficial/edicao/pagina/atende.php"
        yield self._page_request(1)

    def parse(self, response, page):
        """
        Yield a Gazette for every listed edition inside the requested date
        range, then request the next listing page when there is one.

        ``page`` is the number of the listing page that produced ``response``.
        """
        for item in response.css("div.nova_listagem div.linha"):
            date_raw = item.css("div.data::text").get()
            # dateparser.parse() rejects None and returns None for text it
            # cannot parse, so guard both before calling .date().
            parsed_date = (
                dateparser.parse(date_raw, languages=["pt"]) if date_raw else None
            )
            if parsed_date is None:
                self.logger.warning(
                    "Skipping row with missing or unparseable date: %r", date_raw
                )
                continue
            date = parsed_date.date()

            if date > self.end_date:
                continue
            if date < self.start_date:
                # Stops the whole crawl, including pagination: rows are
                # presumably listed newest first -- TODO confirm.
                return

            # A row without type text is treated as a regular edition.
            edition_type = item.css("div.tipo::text").get() or ""
            is_extra = bool(self._EXTRA_EDITION_PATTERN.search(edition_type))
            edition_number = item.css("div.titulo::text").re_first(r"\d+")

            download_urls = item.css("button::attr(data-link)").getall()
            if not download_urls:
                self.logger.warning(
                    "Skipping edition dated %s: no download link found", date
                )
                continue

            yield Gazette(
                date=date,
                edition_number=edition_number,
                is_extra_edition=is_extra,
                # The last button carrying a data-link is the one used.
                file_urls=[download_urls[-1]],
                power="executive_legislative",
            )

        if page < self.get_last_page(response):
            yield self._page_request(page + 1)

    def get_params(self, filtro, value):
        """
        Return the query parameters for a request to the gazette plugin.

        ``filtro`` selects the filter: ``"pagina"`` (listing page number) or
        ``"edicao"`` (edition code); ``value`` is embedded in the JSON-like
        ``parametro`` string. Any other ``filtro`` yields only the fixed
        parameters, without ``parametro``.
        """
        params = {
            "rot": "54015",
            "aca": "101",
            "ajax": "t",
            "processo": "loadPluginDiarioOficial",
        }
        if filtro == "pagina":
            params[
                "parametro"
            ] = f'{{"codigoPlugin":1,"filtroPlugin":{{"pagina":"{value}"}}}}'
        elif filtro == "edicao":
            params[
                "parametro"
            ] = f'{{"codigoPlugin":2,"filtroPlugin":{{"codigoEdicao":"{value}"}}}}'
        return params

    def get_last_page(self, response):
        """
        Return the last page number advertised by the pagination widget.

        Returns 1 when the response has no pagination buttons, so callers
        comparing against the current page simply stop paginating.
        """
        page_values = response.css(
            "div#paginacao li.dst button::attr(value)"
        ).getall()
        if not page_values:
            return 1
        return int(page_values[-1])

    def _page_request(self, page):
        """Build the GET request for listing page ``page``."""
        return FormRequest(
            url=self.BASE_URL,
            method="GET",
            formdata=self.get_params("pagina", page),
            cb_kwargs={"page": page},
        )