scrap_teachers.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-- General imports --#
from __future__ import division
from codecs import decode
import requests
import re
import json
import os
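# A minimal sketch of the expected './Data/urls_upc.json' layout (illustrative,
# not taken from the repository): a flat mapping from department key to that
# department's directory listing URL, e.g.
#   {
#     "cs": "http://directori.upc.edu/directori/...",
#     "ac": "http://directori.upc.edu/directori/..."
#   }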
class Directory(object):
    """This class contains all the necessary procedures to scrape http://directori.upc.edu

    Attributes:
        type (:obj:`str`): string like 'cs', 'ac' or 'essi' corresponding to one of UPC FIB's departments
        teacher_url (:obj:`str`): URL that can easily be formatted to access a teacher's page
        url (:obj:`str`): URL of the root site for the department (contains IDs for all teachers of the department)
        data (:obj:`dict`): dictionary with the following format that contains the information about the teachers
            {
                name: {
                    'mail': value,
                    'office': value
                }
            }
            where value can either be None or a (:obj:`str`)
    """
    def __init__(self, key):
        self.type = key
        print("Scraper defined for the {} department".format(self.type))
        # URL template for a single teacher's page; formatted with the teacher's numeric id
        self.teacher_url = 'http://directori.upc.edu/directori/dadesPersona.jsp?id={}'
        # The root URL for each department is kept in a key -> URL mapping
        with open('./Data/urls_upc.json', 'r') as fp:
            self.url = json.load(fp)[key]
        self.data = {}
        # Start/end markers used to locate each field inside the raw HTML
        self.start_id = '<a href="dadesPersona.jsp?id='
        self.end_id = '">'
        self.start_name = '<td colspan="2"><b>'
        self.end_name = '</b></td>'
        self.start_mail = '<span class="mail">'
        self.end_mail = '</span>'
        self.start_office = '</a><br />'
        self.end_office = '<br/>C. JORDI GIRONA, 1-3<br/>'
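    # The marker pairs above bracket fields in the raw HTML; for instance the id
    # markers match anchors of the form '<a href="dadesPersona.jsp?id=12345">'
    # (the id value here is made up), one per teacher in the listing page.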
"""
This function scraps the directory saving the ids of each teacher, and uses
the teacher url formatted with each id to get the information parsing the html
file using regular expressions.
"""
def scrap_directory(self):
print("scraping {} ...".format(self.url))
response = requests.get(self.url)
content = str(response.content)
ids = self.get_ids(content)
total = len(ids)
current = 0
for item in ids:
print("{}%".format(int(float(current)/float(total) *100)))
query_url = self.teacher_url.format(item)
content = requests.get(query_url).content
mail = self.get_mail(content)
name = self.get_name(content)
office = self.get_office(content)
if office:
self.data[name] = {
'mail': mail,
'office': office
}
else:
self.data[name] = {
'mail': mail,
'office': None
}
current +=1
self.dump_data()
print("Scraping done succesfully!")
print("Scraped {} teachers.".format(len(ids)))
"""
Helper function that parses html page to extract each teacher id
"""
def get_ids(self, content):
ids = []
for i in str(content).split(self.start_id):
for item in i.split(self.end_id):
try:
ids.append(int(item))
except:
pass
return ids
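    # Illustrative behaviour (the id is made up): given the fragment
    #   '<a href="dadesPersona.jsp?id=123">'
    # get_ids returns [123].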
"""
Helper function that parses html page to extract a mail from a teachers page
"""
def get_mail(self, content):
try:
mail = str(re.findall('%s(.*)%s' % (self.start_mail, self.end_mail), str(content))[0])
mail = mail.replace('\\n','').replace('\\t','')
mail = mail.replace('<img src="img/arrobaG.gif" align="top"/>', '@')
mail = mail.split('<')[0]
return mail
except:
print("Teacher without mail...")
return None
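    # Illustrative input/output (address made up): a fragment such as
    #   <span class="mail">john.doe<img src="img/arrobaG.gif" align="top"/>upc.edu</span>
    # yields 'john.doe@upc.edu'.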
"""
Helper function that parses html page to extract a office from a teachers page
"""
def get_office(self, content):
content = str(content).replace('\\n','').replace('\\t','')
try:
office = str(re.findall('%s(.*)%s' % (self.start_office, self.end_office), str(content))[0])
office = office.replace('<br/>',' ').title()
office = office.split('C. Jordi Girona')[0]
return office
except:
print ("Teacher without office...")
return None
"""
Helper function that parses html page to extract the name from a teachers page
"""
def get_name(self, content):
name = re.findall('%s(.*)%s' % (self.start_name, self.end_name), str(content))[0]
name = str(name.replace('\\n','').replace('\\t','').replace(' ', ' ').split('<b>')[-1])
name = decode(name, 'unicode_escape')
if name[-1] == ' ': name = name[:-1]
return name.lower()
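    # Illustrative behaviour (name made up): the fragment
    #   <td colspan="2"><b>JANE DOE </b></td>
    # yields 'jane doe'.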
"""
Helper function that stores the results into persistence
files can be found at Data/teachers/*.json
"""
def dump_data(self):
print("Dumping data into persistence...")
try:
os.remove('./Data/teachers/{}.json'.format(self.type))
except:
pass
with open('./Data/teachers/{}.json'.format(self.type), 'w') as fp:
json.dump(self.data, fp, indent = 2)
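    # Illustrative shape of the resulting ./Data/teachers/<type>.json
    # (name and values made up):
    #   {
    #     "jane doe": {"mail": "jane.doe@upc.edu", "office": "..."}
    #   }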
if __name__ == "__main__":
    directory = input("Which directory do you want to scrape? (type 'all' to scrape every directory)\n")
    if directory != 'all':
        directory = Directory(directory)
        directory.scrap_directory()
    else:
        directories = ['essi', 'cs', 'ac', 'esaii', 'fis', 'eio', 'mat', 'oe', 'thatc', 'iri']
        for directory in directories:
            directory = Directory(directory)
            directory.scrap_directory()
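# Example session (illustrative): running `python scrap_teachers.py` and
# answering 'cs' scrapes the CS department listing and writes the results to
# ./Data/teachers/cs.json.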