sigpub.py
import json
from datetime import date

import scrapy
from dateutil.rrule import DAILY, rrule

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class BaseSigpubSpider(BaseGazetteSpider):
    """www.diariomunicipal.com.br (Sigpub) base spider

    Documents obtained by this kind of spider are text PDFs with many cities in
    them, because these websites are usually made for associations of cities.

    TODO:
    - All variations have a "possible" start date of 01/01/2009, but that may
      cause many unnecessary requests if a given website only started making
      documents available later. The actual start date of each website should
      be investigated in that case.

    Observations:
    - These websites have an "Advanced Search", but it is protected by
      reCAPTCHA.
    """

    start_date = date(2009, 1, 1)

    def start_requests(self):
        """Requests the start page, where the calendar widget is available."""
        yield scrapy.Request(self.CALENDAR_URL, callback=self.parse_calendar)

    def parse_calendar(self, response):
        """Makes requests for each date to see if a document is available."""
        default_form_fields = {
            "calendar[_token]": response.xpath(
                "//input[@id='calendar__token']/@value"
            ).get()
        }
        for gazette_date, date_form_fields in self.available_dates_form_fields():
            formdata = {**default_form_fields, **date_form_fields}
            yield scrapy.FormRequest(
                url=response.urljoin("materia/calendario"),
                formdata=formdata,
                meta={"date": gazette_date, "edition_type": "regular"},
                callback=self.parse_gazette_info,
            )
            yield scrapy.FormRequest(
                url=response.urljoin("materia/calendario/extra"),
                formdata=formdata,
                meta={"date": gazette_date, "edition_type": "extra"},
                callback=self.parse_gazette_info,
            )

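    # For illustration (values hypothetical, token shortened): each request to
    # "materia/calendario" posts a payload like
    #     {
    #         "calendar[_token]": "a1b2c3",
    #         "calendar[day]": "15",
    #         "calendar[month]": "3",
    #         "calendar[year]": "2020",
    #     }
    # combining the page token with the per-date fields produced by
    # available_dates_form_fields().
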
    def parse_gazette_info(self, response):
        """Parses the availability endpoint and gets the document URL if available."""
        body = json.loads(response.text)
        meta = response.meta
        if "error" in body:
            self.logger.debug(
                f"{meta['edition_type'].capitalize()} Gazette not available for {meta['date'].date()}"
            )
            return
        for edition in body["edicao"]:
            url = f"{body['url_arquivos']}{edition['link_diario']}.pdf"
            yield Gazette(
                date=meta["date"].date(),
                file_urls=[url],
                power="executive_legislative",
                is_extra_edition=(meta["edition_type"] == "extra"),
                edition_number=edition.get("numero_edicao", ""),
            )

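    # The parsing above assumes the endpoint answers with JSON roughly shaped
    # like the sketch below (field names taken from the code; values invented):
    #     {
    #         "url_arquivos": "https://www.diariomunicipal.com.br/arquivos/",
    #         "edicao": [{"link_diario": "XYZ", "numero_edicao": "42"}]
    #     }
    # or {"error": "..."} when no gazette was published for the requested date.
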
    def available_dates_form_fields(self):
        """Generates dates and corresponding form fields for the availability endpoint."""
        available_dates = rrule(
            freq=DAILY, dtstart=self.start_date, until=self.end_date
        )
        for query_date in available_dates:
            form_fields = {
                "calendar[day]": str(query_date.day),
                "calendar[month]": str(query_date.month),
                "calendar[year]": str(query_date.year),
            }
            yield query_date, form_fields
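
# Illustrative usage, not part of the original module: a concrete spider
# subclasses BaseSigpubSpider and supplies the calendar page of its city
# association. The name, TERRITORY_ID, and CALENDAR_URL below are hypothetical
# placeholders, not real spider values.
#
# class ExampleAssociationSpider(BaseSigpubSpider):
#     name = "xx_example_association"
#     TERRITORY_ID = "0000000"
#     CALENDAR_URL = "https://www.diariomunicipal.com.br/example/"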