h_scrape.py
import csv, requests, time, sys, re
from bs4 import BeautifulSoup as bs
from lxml import etree
import xml.etree.ElementTree as ET
import urllib2
from lxml.html.clean import clean_html
from yattag import Doc
import datetime
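# h_scrape.py reads a CSV listing of "albo pretorio" XML sources (Halley-based
# municipal notice boards), scrapes each one, and writes an AlboPOP-style RSS 2.0
# feed per source. Usage below is inferred from the argv handling that follows
# and is a non-authoritative sketch (the script targets Python 2):
#   python h_scrape.py [elenco_albi.csv] [output_dir]
# Both arguments are optional; they default to "elenco_albi.csv" and "./".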
# the input CSV file is a list of XML source sites and their respective attributes
if len(sys.argv) > 1:
    elenco_albi = sys.argv[1]
else:
    elenco_albi = "elenco_albi.csv"
# the output directory for the generated XML feeds
if len(sys.argv) > 2:
    output_dir = sys.argv[2]
else:
    output_dir = "./"
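# Expected column order of the input CSV, inferred from the indices used below
# (the actual header names in elenco_albi.csv may differ):
#   0 albo, 1 url, 2 title_xpath, 3 pubDate_xpath, 4 href_xpath, 5 partial_url,
#   6 pubEnd_xpath, 7 uid_xpath, 8 type_xpath, 9 RSS_title, 10 RSS_link,
#   11 RSS_description, 12-20 channel_category_* (type, municipality, province,
#   region, latitude, longitude, country, name, uid), 21 time_format,
#   22 channel_category_webmaster, 23 halley_code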
with open(elenco_albi, "rb") as csvfile:
    reader = csv.reader(csvfile)
    reader.next()  # skip the header row
    for line in reader:
        albo = line[0]
        url = line[1]
        title_xpath = line[2]
        pubDate_xpath = line[3]
        href_xpath = line[4]
        partial_url = line[5]
        pubEnd_xpath = line[6]
        uid_xpath = line[7]
        type_xpath = line[8]
        # META NEW SPECS
        RSS_title = line[9]
        RSS_link = line[10]
        RSS_description = line[11]
        channel_category_type = line[12]
        channel_category_municipality = line[13]
        channel_category_province = line[14]
        channel_category_region = line[15]
        channel_category_latitude = line[16]
        channel_category_longitude = line[17]
        channel_category_country = line[18]
        channel_category_name = line[19]
        channel_category_uid = line[20]
        time_format = line[21]
        channel_category_webmaster = line[22]
        current_date = datetime.date.today().strftime("%a, %d %b %Y %H:%M:%S %z +0200")
        halley_code = line[23]
        print albo
        def clean_title(x):
            match = re.search(r'^">', x)
            if match:
                x = re.sub(r'^">', '', x)
                return x
            else:
                return x

        def clean_date(x):
            try:
                d = datetime.datetime.strptime(x.text, time_format)
                clean_d = d.strftime("%a, %d %b %Y %H:%M:%S %z +0200")
                return clean_d
            except Exception as e:
                # print e
                clean_d = datetime.date.today().strftime("%a, %d %b %Y %H:%M:%S %z +0200")  # TODO: improve this fallback (defaults to today's date)
                return clean_d

        def clean_detail_link(x):
            match = re.search(r'\'(.*?)\'', str(x))
            if match:
                x = match.group().replace("'", "")
                return x
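        # clean_detail_link extracts the first single-quoted argument from an
        # onclick attribute value. For a hypothetical value such as
        # "apri_documento('123456789.pdf', ...)" it would return 123456789.pdf
        # (the quoted match with the quotes stripped); the exact onclick format
        # used on halleyweb.com detail pages is an assumption here.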
        title_list = []
        href_list = []
        pubDate_list = []
        pubEnd_list = []
        uid_list = []
        type_list = []
        guid_list = []
        details_list_of_lists = []
        raw_datalist = []
        def open_page():
            global page
            response = urllib2.urlopen(url)
            resp_data = response.read()
            page = etree.XML(resp_data)
            response.close()
        def scrape_data():
            global raw_datalist
            year = datetime.date.today().year
            # get and clean titles
            title_tags = page.findall(title_xpath)
            for item in title_tags:
                rawname = bs(item.text.encode('utf-8'), "xml")
                title_item = rawname.find('a').text
                title_list.append(title_item.encode('utf-8'))
            # get hrefs and detail (attachment) links
            href_tags = page.findall(href_xpath)
            for item in href_tags:
                guid_list.append(item.get('id'))
                href_clean = partial_url + item.get('id').encode('utf-8')
                href_list.append(href_clean)
                detail_res = requests.get(href_clean)
                detail_page = bs(detail_res.text, "lxml")
                detail_tags = detail_page.find_all('a')
                detail_link_list = []
                for a in detail_tags:
                    detail_link_url = "http://halleyweb.com/" + halley_code + "/mc/" + clean_detail_link(a['onclick'])
                    detail_link_list.append(detail_link_url)
                details_list_of_lists.append(detail_link_list)
            # get pubdates
            pubDate_tags = page.findall(pubDate_xpath)
            for item in pubDate_tags:
                pubDate_list.append(clean_date(item))
            # get pubends
            pubEnd_tags = page.findall(pubEnd_xpath)
            for item in pubEnd_tags:
                pubEnd_list.append(clean_date(item))
            # get uids
            uid_tags = page.findall(uid_xpath)
            for item in uid_tags:
                uid_list.append(str(year) + "/" + item.text)
            # get types
            type_tags = page.findall(type_xpath)
            for item in type_tags:
                type_list.append(item.text.encode('utf-8'))
            raw_datalist = zip(title_list, href_list, pubDate_list, pubEnd_list, uid_list, type_list, guid_list, details_list_of_lists)
            # print raw_datalist
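        # Each entry of raw_datalist is a tuple whose layout matches the row[i]
        # indices used in generate_csv/generate_feed below:
        # (title, link, pubDate, pubEnd, uid, type, guid, detail_links)
        #    0     1      2        3      4    5     6        7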
        def generate_csv():
            with open(albo + '_data.csv', 'wb') as f:
                writer = csv.writer(f)
                for row in raw_datalist:
                    writer.writerow(row)
        def generate_feed():
            doc, tag, text, line = Doc().ttl()
            doc.asis('<?xml version="1.0" encoding="UTF-8"?>')
            with tag('rss',
                     ('xmlns:atom', 'http://www.w3.org/2005/Atom'),
                     ('version', '2.0')
                     ):
                with tag('channel'):
                    line('title', RSS_title)
                    line('link', RSS_link)
                    line('description', RSS_description)
                    line('language', 'it')
                    line('pubDate', current_date)
                    line('docs', 'http://albopop.it/comune/')  # TODO
                    line('category', channel_category_type, domain="http://albopop.it/specs#channel-category-type")
                    line('category', channel_category_municipality, domain="http://albopop.it/specs#channel-category-municipality")
                    line('category', channel_category_province, domain="http://albopop.it/specs#channel-category-province")
                    line('category', channel_category_region, domain="http://albopop.it/specs#channel-category-region")
                    line('category', channel_category_latitude, domain="http://albopop.it/specs#channel-category-latitude")
                    line('category', channel_category_longitude, domain="http://albopop.it/specs#channel-category-longitude")
                    line('category', channel_category_country, domain="http://albopop.it/specs#channel-category-country")
                    line('category', channel_category_name, domain="http://albopop.it/specs#channel-category-name")
                    line('category', channel_category_uid, domain="http://albopop.it/specs#channel-category-uid")
                    line('webMaster', channel_category_webmaster)
                    for row in raw_datalist:
                        with tag('item'):
                            line('title', row[0])
                            line('link', row[1])
                            line('description', row[0])
                            line('pubDate', row[2])
                            line('guid', row[1], isPermaLink='true')
                            line('category', row[2], domain="http://albopop.it/specs#item-category-pubStart")
                            line('category', row[3], domain="http://albopop.it/specs#item-category-pubEnd")
                            line('category', row[4], domain="http://albopop.it/specs#item-category-uid")
                            line('category', row[5], domain="http://albopop.it/specs#item-category-type")
                            for link in row[7]:
                                doc.stag('enclosure', url=str(link), length="3000", type="application/pdf")
            # print(doc.getvalue())
            with open(output_dir + '/' + albo + '_feed.xml', 'w') as f:
                f.write(doc.getvalue())
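        # For reference, each <item> emitted above has roughly this shape
        # (values are illustrative placeholders, not real scraped data):
        #   <item>
        #     <title>TITLE</title>
        #     <link>PARTIAL_URL + ID</link>
        #     <description>TITLE</description>
        #     <pubDate>Mon, 01 Jan 2024 00:00:00 +0200</pubDate>
        #     <guid isPermaLink="true">PARTIAL_URL + ID</guid>
        #     <category domain="http://albopop.it/specs#item-category-pubStart">...</category>
        #     ... plus pubEnd, uid and type categories ...
        #     <enclosure url="http://halleyweb.com/HALLEY_CODE/mc/..." length="3000" type="application/pdf"/>
        #   </item>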
        open_page()
        scrape_data()
        # generate_csv()  # for inspection only, but can serve as a backup
        generate_feed()