-
Notifications
You must be signed in to change notification settings - Fork 0
/
schools.py
93 lines (78 loc) · 4.24 KB
/
schools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import requests
import re
from bs4 import BeautifulSoup
from pathlib import Path
from globals import tables_data, faulty_tables
# Browser-like User-Agent passed as ``headers=`` to the page requests below.
headers = {"User-Agent": "Mozilla/5.0"}

# URL the login form is posted to; a response that still points here after
# the POST is treated as a failed login.
login_url = "http://www.edumich.gob.mx/sigem_tel/index/1/"

# Page that is scanned for the links to the individual table pages.
page_with_tables = (
    "http://www.edumich.gob.mx/sigem_tel/sisat_registro_2223/"
    "3c2a3acc3ebccb7e62352756b14fd812b6913fe1/I2223/"
)
class School:
    """Log in as one school and download the spreadsheet tables listed for it.

    Every downloaded (or failed) table is registered in the module-level
    ``tables_data`` dict; tables that could not be downloaded, or whose file
    name carries no course/group marker, are also appended to
    ``faulty_tables``.
    """

    # Initialised to zero; nothing inside this class updates it.
    # NOTE(review): presumably maintained by external callers - confirm.
    table_count = 0
    # Incremented once for every table file written to disk (shared by all
    # instances because it is updated on the class itself).
    table_count_check = 0

    # Media type the server must report for a download to be accepted.
    _XLSX_CONTENT_TYPE = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'

    def __init__(self, credentials):
        """Parse a ``login:password`` credentials string.

        ``name`` is the part of the login before the first underscore.

        Raises:
            ValueError: if ``credentials`` contains no ``:`` separator.
        """
        # Split on the first colon only, so a password containing ':' is kept intact.
        self.login, self.password = credentials.split(':', 1)
        self.name = self.login.split('_')[0]

    @staticmethod
    def _file_name_from_disposition(content_disposition):
        """Return the quoted value at the end of a Content-Disposition header.

        Returns ``None`` when the header is missing/empty or has no
        ``="..."`` part at its end.
        """
        if not content_disposition:
            return None
        match = re.search(r'^.*="(.+)"$', content_disposition)
        return match.group(1) if match else None

    @staticmethod
    def _parse_class_data(file_name):
        """Return ``(course_year, group)`` taken from a ``"<digit><letter>"`` marker.

        The marker must be wrapped in double quotes inside ``file_name``;
        ``(None, None)`` is returned when no such marker is present.
        """
        match = re.search(r'"(\d)([a-zA-Z])"', file_name)
        if match is None:
            return None, None
        return match.group(1), match.group(2)

    def _record_faulty_table(self, count, download_url):
        """Register a table that could not be downloaded and return its placeholder name."""
        file_name = f"Unknown table {count + 1} from {self.name}"
        faulty_tables.append(file_name)
        tables_data[file_name] = {'school': self.name, 'course': None, 'group': None,
                                  'download_url': download_url}
        return file_name

    def _login_to_school(self, session):
        """Post the login form through ``session``.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            SystemExit: with code 1 if the response URL is still the login
                URL, which is treated as a rejected login.
        """
        # NOTE(review): the e-mail field is posted empty and self.login is never
        # sent to the server - confirm this is what the login form expects.
        response = session.post(login_url, data={'inputEmail': '',
                                                 'inputPassword': self.password,
                                                 'grabar': 'si'})
        response.raise_for_status()
        if response.url == login_url:
            print(f"Could not login to the school {self.login} with the password '{self.password}'!")
            print("Check credentials for that school and start again!")
            # Same exception the interactive ``exit()`` helper raises, but it
            # does not depend on the ``site`` module being loaded.
            raise SystemExit(1)

    def _get_tables_urls(self, session):
        """Return the ``href`` of every element titled "Concentrado de Información".

        Elements without an ``href`` attribute are skipped.

        Raises:
            requests.HTTPError: if the listing page answers with an error status.
        """
        response = session.get(page_with_tables, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        tables_page_elements = soup.find_all(title="Concentrado de Información")
        return [element.get('href') for element in tables_page_elements
                if element.get('href')]

    def _download_table(self, session, table_url, count, number_of_tables):
        """Download the spreadsheet linked from one table page into ``tables/``.

        Args:
            session: logged-in HTTP session used for all requests.
            table_url: URL of the page that carries the download button.
            count: zero-based index of this table (used in messages).
            number_of_tables: total number of tables (used in messages).

        Returns:
            The file name announced by the server, or a placeholder name
            ``"Unknown table N from <school>"`` when the table could not be
            downloaded. In the latter case the table has already been
            registered in ``faulty_tables`` and ``tables_data``.

        Raises:
            requests.HTTPError: if the table page answers with an error status.
        """
        table_page = session.get(table_url, headers=headers)
        table_page.raise_for_status()
        table_page_soup = BeautifulSoup(table_page.content, 'html.parser')
        download_button = table_page_soup.find(class_="btn btn-warning btn-sm")
        download_table_url = download_button.get('href') if download_button is not None else None
        if not download_table_url:
            print(f"Could not find the download link on the page: {table_url}")
            return self._record_faulty_table(count, table_url)

        # The carriage return lets the next message overwrite this one.
        print(f"Descargando tabla {count + 1} de {number_of_tables}...\r", end='', flush=True)

        # The context manager closes the streamed response on every path, so
        # the connection is released even when the body is never read.
        with session.get(download_table_url, allow_redirects=True, stream=True) as table_file:
            # Compare the media type only; ignore parameters such as "; charset=...".
            media_type = table_file.headers.get('Content-Type', '').split(';', 1)[0].strip().lower()
            file_name = self._file_name_from_disposition(
                table_file.headers.get('Content-Disposition'))
            if media_type != self._XLSX_CONTENT_TYPE or file_name is None:
                print("Something wrong with the file!\n\n"
                      f"Log in manually in the school {self.name} and then check the link: "
                      f"{download_table_url}\n\n"
                      "You can download this file manually and put it in the folder 'tables'")
                return self._record_faulty_table(count, download_table_url)

            # Keep only the last path component of the server-supplied name so
            # the file always lands inside the tables directory.
            file_path = Path("tables") / f"{self.name}_{Path(file_name).name}"
            file_path.parent.mkdir(parents=True, exist_ok=True)
            with open(file_path, 'wb') as file:
                # Write the body in chunks instead of loading it into memory at once.
                for chunk in table_file.iter_content(chunk_size=1024):
                    if chunk:
                        file.write(chunk)

        School.table_count_check += 1
        return file_name

    def get_tables(self):
        """
        Downloads and saves tables for the school.

        Logs in, collects the table page URLs and downloads each table. Every
        downloaded table is stored in ``tables_data`` under its file name with
        the school name, course year, group and table page URL. Tables whose
        file name has no course/group marker are also added to
        ``faulty_tables``.
        """
        with requests.Session() as session:
            self._login_to_school(session)
            tables_urls = self._get_tables_urls(session)
            number_of_tables = len(tables_urls)
            print(f"Encontradas {number_of_tables} tablas para la escuela '{self.name}'.")
            for count, table_url in enumerate(tables_urls):
                faulty_before = len(faulty_tables)
                file_name = self._download_table(session, table_url, count, number_of_tables)
                if len(faulty_tables) != faulty_before:
                    # _download_table already registered this table as faulty;
                    # recording it again would duplicate the entry and replace
                    # the stored download URL with the table page URL.
                    continue
                course_year, group = self._parse_class_data(file_name)
                if course_year is None:
                    faulty_tables.append(file_name)
                tables_data[file_name] = {'school': self.name, 'course': course_year,
                                          'group': group, 'download_url': table_url}