-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
123 lines (104 loc) · 4.19 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import json
from bs4 import BeautifulSoup
def getDependencies(raw):
    '''Normalize a raw prerequisite ("intertravamento") string into a list
    of bare discipline codes.

    The catalog joins codes with the Portuguese connectives 'ou' (or) and
    'e' (and); parentheses, asterisks and whitespace are stripped out, and
    credit-total markers (tokens containing OBR or TOT) are discarded.
    Returns [] when any surviving token is empty, i.e. there is no real
    prerequisite.
    '''
    # Split on the 'ou' connective first, then on 'e', and flatten.
    tokens = []
    for alternative in raw.split('ou'):
        tokens.extend(alternative.split('e'))
    # Scrub each token down to the bare code.
    cleaned = []
    for token in tokens:
        token = token.replace('(', '').strip()
        token = token.replace(')', '').replace(' ', '').replace('*', '')
        cleaned.append(token)
    # Drop the credit-total markers, keeping only discipline codes.
    cleaned = [t for t in cleaned if 'OBR' not in t and 'TOT' not in t]
    # An empty token means the field held no real prerequisite.
    if "" in cleaned:
        return []
    return cleaned
def getCourses():
    '''Fetch the curriculum ("matriz") page of every course in the UFV
    catalog and save each one as an HTML file under ./pages.

    Side effects: launches a Firefox WebDriver, creates the ./pages
    directory if missing, writes one UTF-8 .html file per course, and
    always quits the driver on exit (even on error).
    '''
    driver = webdriver.Firefox()
    try:
        driver.get('http://www.catalogo.ufv.br/')
        print(driver.title)
        # Wait until the course blocks have been rendered into the DOM.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "bloco"))
        )
        names = []
        links = []
        # Collect link/name pairs; only anchors pointing at 'interno.php'
        # are course pages. Guard against anchors with no href (None).
        for anchor in driver.find_elements(By.TAG_NAME, 'a'):
            link = anchor.get_attribute('href')
            if link and 'interno.php' in link:
                names.append(anchor.get_attribute('title'))
                links.append(link)
        # The curriculum matrix lives at 'matriz.php' on the same path.
        links = [link.replace('interno', 'matriz') for link in links]
        os.makedirs('pages', exist_ok=True)
        for index, link in enumerate(links):
            driver.get(link)
            filename = "pages/" + names[index].replace(" ", "").lower() + '.html'
            # Write UTF-8 explicitly so scrapePages can reread it reliably.
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(driver.page_source)
    finally:
        # Always release the browser, even if the wait or a fetch fails.
        driver.quit()
def scrapePages():
    '''Parse every saved course page under ./pages and extract the relevant
    curriculum data into two JSON files in the current directory.

    Writes:
        courses.json     - {"course": [{"id", "name", "disciplines"}]}
        disciplines.json - {"disciplines": [{"code", "name", "dependencies"}]}

    Assumes the pages were saved by getCourses (UTF-8 HTML with the
    curriculum table as the second div.col-md-12) — TODO confirm layout
    against a live catalog page.
    '''
    ID = 1  # sequential course id
    # JSON structure for courses
    courses = {
        "course": []
    }
    disciplines = {
        "disciplines": []
    }
    for filename in os.listdir(os.path.join(os.getcwd(), 'pages')):
        # Open all pages; read as UTF-8 to match how getCourses wrote them.
        with open(os.path.join(os.getcwd(), 'pages', filename), encoding='utf-8') as f:
            page = BeautifulSoup(f, "html.parser")
        # Get the name of the course from the <h2 id="titulo"> header.
        name = page.find('h2', {'id': 'titulo'}).text.strip()
        discs = []
        table = page.select("div.col-md-12")[1]
        trs = table.select("tr")
        i = 0  # current semester number; 50 flags the optional-disciplines bucket
        # Discipline information starts in tr 8
        for row in trs[8:]:
            per = row.select("th.periodo")
            if per:
                # A "periodo" header row either advances the semester or
                # switches to the optional disciplines section (bucket 50).
                i = i + 1 if "Optativas" not in per[0].text else 50
                continue
            th = row.find("th")
            if "Total" not in th.text:
                # Some disciplines start on tr 7, so I have to check
                if i == 0:
                    i += 1
                code = th.text
                tds = row.select("td")
                discName = tds[0].text.strip()
                discs.append({"Code": code, "Semester": i})
                # Column 3 holds the prerequisite ("intertravamento") text.
                deps = getDependencies(tds[3].text)
                disc = {'code': code, 'name': discName, 'dependencies': deps}
                disciplines['disciplines'].append(disc)
        course = {
            "id": ID,
            "name": name,
            "disciplines": discs
        }
        courses['course'].append(course)
        ID += 1
    # ensure_ascii=False emits raw accented characters, so the files must
    # be opened with an explicit UTF-8 encoding to be portable.
    with open(os.path.join(os.getcwd(), 'courses.json'), 'w', encoding='utf-8') as f:
        json.dump(courses, f, ensure_ascii=False)
    with open(os.path.join(os.getcwd(), 'disciplines.json'), 'w', encoding='utf-8') as f:
        json.dump(disciplines, f, ensure_ascii=False)
# Only run the scraper when executed as a script — importing this module
# must not launch a browser or touch the filesystem.
if __name__ == "__main__":
    # Download the catalog pages first, then scrape them into JSON.
    getCourses()
    scrapePages()