-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodels.py
114 lines (93 loc) · 3.64 KB
/
models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from django.db import models
import urllib2
from bs4 import BeautifulSoup
from sets import Set
from urlparse import urljoin
class AggregatedEventState:
    """Workflow states for a scraped event.

    Plain constants holder (not a Django model); the integer values are
    stored in EventScrapped.state and CHOICES feeds that field's
    `choices=` option.
    """
    NEW = 0              # freshly scraped, awaiting review
    ADDED_TO_AGORA = 1   # event has been pushed to Agora
    NOT_OF_INTEREST = 2  # reviewed and rejected
    HAS_ERROR = 3        # something went wrong while processing
    # (value, human-readable label) pairs in Django choices format.
    CHOICES = (
        (NEW, u'New'),
        (ADDED_TO_AGORA, u"Added to Agora"),
        (NOT_OF_INTEREST, u"Not of interest"),
        (HAS_ERROR, u"Has error"),
    )
class EventScrapped(models.Model):
    """A single event extracted from an external site.

    NOTE(review): the name is presumably a typo for "EventScraped", but
    renaming would require a schema migration and break callers — left as-is.
    """
    # Source page the event was scraped from.
    url = models.CharField(max_length=100)
    title = models.CharField(max_length=255)
    description = models.TextField()
    # Stored as raw text, not a DateField — the scraper keeps whatever
    # string the page contained.
    start_date = models.CharField(max_length=255)
    # Set automatically when the row is first created.
    downloaded_on = models.DateTimeField(auto_now_add=True)
    # Review/workflow state; see AggregatedEventState.
    state = models.IntegerField(choices=AggregatedEventState.CHOICES,
                                default=AggregatedEventState.NEW)

    def __unicode__(self):
        return self.title
class PagePattern(models.Model):
base_url = models.URLField(max_length=255, null=True)
site_name = models.CharField(max_length=20, null=True)
title = models.CharField(max_length=255)
description = models.CharField(max_length=255)
start_time = models.CharField(max_length=255)
def __unicode__(self):
return self.site_name
def attributes(self):
return {"title": self.title,
"description": self.description,
"start_date": self.start_time}
def storeEvent(self, event):
EventScrapped(url=event['url'], title=event['title'],
description=event['description'],
start_date=event['start_date']).save()
def fetch_event(self, page_url, storeFetched=True):
response = urllib2.urlopen(page_url)
html = response.read()
soup = BeautifulSoup(html)
event = {"url": page_url}
for attribute, css in self.attributes().iteritems():
value_list = soup.select(css)
try:
value = ""
for string in value_list[0].stripped_strings:
value = value + string
event[attribute] = value
except Exception as exception:
print exception
print attribute, "not found"
return None
if storeFetched:
self.storeEvent(event)
return event
class ScraperLog(models.Model):
    """Record of one scraping run: which crawler ran, when, and how many
    events it parsed. Written by SiteCrawler.fetch_events()."""
    # Set automatically when the log row is created.
    run_on = models.DateTimeField(auto_now_add=True)
    # Forward reference: SiteCrawler is declared below in this module.
    crawler = models.ForeignKey('SiteCrawler', editable=False)
    num_parsed_events = models.IntegerField()
class SiteCrawler(models.Model):
    """Crawls one site's index page, finds event-page links, and delegates
    per-page extraction to a PagePattern."""
    # Listing page containing links to individual event pages.
    index_page_url = models.URLField(max_length=255)
    # URL prefix that identifies links pointing at event pages.
    event_urls = models.URLField(max_length=255)
    event_page_crawler = models.ForeignKey(PagePattern,
                                           related_name='index_pages')
    site_name = models.CharField(max_length=30)

    def __unicode__(self):
        return self.site_name

    def fetch_events(self, storeFetched=True):
        """Fetch the index page, follow every event link, and return the
        list of successfully parsed event dicts.

        Writes a ScraperLog row when at least one event was parsed.
        storeFetched is passed through to PagePattern.fetch_event().
        """
        response = urllib2.urlopen(self.index_page_url)
        html = response.read()
        soup = BeautifulSoup(html)
        # Builtin set (the deprecated `sets.Set` is unnecessary) to
        # deduplicate links pointing at the same event page.
        all_urls = set()
        for link in soup.find_all('a'):
            relative_url = link.get('href')
            if relative_url is None:
                # <a> without href (e.g. named anchors): urljoin(base, None)
                # raises AttributeError, so skip these.
                continue
            absolute_url = urljoin(self.index_page_url, relative_url)
            if absolute_url.startswith(self.event_urls):
                all_urls.add(absolute_url)
        events = []
        event_crawler = self.event_page_crawler
        for event_url in all_urls:
            event = event_crawler.fetch_event(event_url, storeFetched)
            # fetch_event returns None when any selector failed.
            if event:
                events.append(event)
        num_events = len(events)
        if num_events > 0:
            ScraperLog(crawler=self, num_parsed_events=num_events).save()
        return events