-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathDomzSpider.py
67 lines (64 loc) · 2.47 KB
/
DomzSpider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
#from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.dupefilter import RFPDupeFilter
from scrapy.selector import HtmlXPathSelector
from searchEngine.items import SearchengineItem
class DmozSpider(CrawlSpider):
    """Crawl spider that collects page metadata from Xinhua news pages.

    Links extracted from the pages listed in ``start_urls`` and matched by
    ``rules`` are handed to :meth:`item_parse`, which records the page URL,
    the ``<title>`` text and the ``keywords`` / ``description`` meta tags in
    a ``SearchengineItem``.

    NOTE: do not add a ``parse`` method to this class. According to the
    Scrapy documentation, ``CrawlSpider`` uses ``parse`` itself to apply
    ``rules``; that is why the callback is named ``item_parse``.
    """

    name = "dmoz"
    allowed_domains = [
        "news.cn",
        "news.xinhuanet.com",
    ]
    start_urls = [
        "http://www.news.cn/",
        "http://www.news.cn/mil/index.htm",
        # The comma after this entry used to be missing, so Python joined it
        # with the next literal into a single malformed URL and neither the
        # politics nor the world section was ever requested.
        "http://www.news.cn/politics/",
        "http://www.news.cn/world/index.htm",
        "http://www.news.cn/tech/index.htm",
    ]
    rules = (
        # The '/' pattern is deliberately permissive: effectively every
        # extracted link is passed to item_parse. Per the Scrapy Rule
        # documentation, ``follow`` defaults to False when a callback is
        # given, so links found on those pages are not followed further.
        Rule(SgmlLinkExtractor(allow=('/', )), callback='item_parse'),
    )

    def item_parse(self, response, dont_filter=False):
        """Build a ``SearchengineItem`` from a crawled page.

        Args:
            response: Response of the crawled page; its ``url`` and
                ``selector`` attributes are read.
            dont_filter: Unused; kept so the signature stays compatible
                with existing callers.

        Returns:
            The populated item. ``url`` is the response URL; ``title``,
            ``keywords`` and ``description`` hold the lists returned by
            ``extract()`` (empty when the page has no matching node).
        """
        selector = response.selector

        item = SearchengineItem()
        item['url'] = response.url
        item['title'] = selector.xpath('//title/text()').extract()
        item['keywords'] = selector.xpath(
            '//meta[@name="keywords"]/@content').extract()
        item['description'] = selector.xpath(
            '//meta[@name="description"]/@content').extract()

        # Echo the extracted values for debugging, in the same order as
        # before. The single-argument call form prints exactly what the
        # former print statement did under Python 2 and is also valid
        # Python 3 syntax.
        for field in ('title', 'keywords', 'description'):
            for text in item[field]:
                print(text.encode('utf-8'))

        return item