5
5
from scrapy .selector import HtmlXPathSelector
6
6
from scrmusic .items import XiamiItem
7
7
from scrapy .http import Request
8
- cnt = 2
8
+ cnt = 4
9
9
class DmozSpider (BaseSpider ):
10
10
name = "xiami"
11
11
allowed_domains = ["xiami.com" ]
12
12
#设置爬取速度
13
- download_delay = 1
13
+ # download_delay = 1
14
14
start_urls = [
15
15
# 第一个网页地址
16
- "http://www.xiami.com/space/charts-recent/u/40753994?spm=a1z1s.6928797.1561534497.9.itdx5s" ,
17
- # "http://www.xiami.com/space/charts-recent/u/5447372?spm=a1z1s.6928793.1561534497.9.LVnEOi",
18
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/2",
19
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/3",
20
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/4",
21
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/5",
22
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/6",
23
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/7",
24
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/8",
25
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/9",
26
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/10",
27
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/11",
28
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/12",
29
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/13",
30
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/14",
31
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/15",
32
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/16",
33
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/17",
34
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/18",
35
- # "http://www.xiami.com/space/charts-recent/u/5447372/page/19",
16
+ # "http://www.xiami.com/space/charts-recent/u/40753994?spm=a1z1s.6928797.1561534497.9.itdx5s",
17
+ "http://www.xiami.com/space/charts-recent/u/5447372?spm=a1z1s.6928793.1561534497.9.LVnEOi" ,
18
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/2" ,
19
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/3" ,
20
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/4" ,
21
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/5" ,
22
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/6" ,
23
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/7" ,
24
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/8" ,
25
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/9" ,
26
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/10" ,
27
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/11" ,
28
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/12" ,
29
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/13" ,
30
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/14" ,
31
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/15" ,
32
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/16" ,
33
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/17" ,
34
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/18" ,
35
+ "http://www.xiami.com/space/charts-recent/u/5447372/page/19" ,
36
36
]
37
37
"""
38
38
rules = (
@@ -48,24 +48,34 @@ def parse(self, response):
48
48
# 标记是哪个用户
49
49
user = hxs .x ('//head/title/text()' ).extract ()[0 ][:- 7 ].encode ('utf-8' )
50
50
# 标记用户总共有多少条收听记录
51
- #sum = hxs.x('//span').extract()[-3].encode('utf-8').split('共')[1].split('条')[0]
51
+ sum = hxs .x ('//span' ).extract ()[- 3 ].encode ('utf-8' ).split ('共' )[1 ].split ('条' )[0 ]
52
+ currentPage = hxs .x ('//span' ).extract ()[- 3 ].encode ('utf-8' ).split ('第' )[1 ].split ('页' )[0 ]
53
+
52
54
#for site in sites:
53
- for i in range (1 , 2 ):
54
- item = XiamiItem ()
55
- item ['user' ] = user
56
- item ['song' ] = sites .x ('tr[' + str (i ) + ']/td[2]/a' ).extract ()[0 ].split ('\" ' )[3 ].encode ('utf-8' )
57
- print '_______________' + item ['song' ]
58
- item ['artist' ] = sites .x ('tr[' + str (i ) + ']/td[2]/a/text()' ).extract ()[1 ].encode ('utf-8' )
59
- items .append (item )
60
- yield items
61
- #return items
62
- yield items
63
- if cnt > 1 :
55
+ if int (currentPage ) <= int (sum ) / 50 :
56
+ for i in range (1 , 50 ):
57
+ item = XiamiItem ()
58
+ item ['user' ] = user
59
+ item ['song' ] = sites .x ('tr[' + str (i ) + ']/td[2]/a' ).extract ()[0 ].split ('\" ' )[3 ].encode ('utf-8' )
60
+ print '_______________' + item ['song' ]
61
+ item ['artist' ] = sites .x ('tr[' + str (i ) + ']/td[2]/a/text()' ).extract ()[1 ].encode ('utf-8' )
62
+ print '+++++++++++++++' + item ['artist' ]
63
+ items .append (item )
64
+ #yield item
65
+ return items
66
+ #yield items
67
+ """
68
+ if cnt < 10:
64
69
urls = hxs.x('//div[@class="all_page"]/a/@href').extract()
65
- cnt = cnt - 1
70
+ print urls
71
+ cnt = cnt + 1
66
72
#for url in urls:
67
- link = 'http://www.xiami.com' + urls [- 1 ]#.split('/page/')[0] + '/page/' + "2"
73
+ link = 'http://www.xiami.com/space/charts-recent/u/5447372/page/' + str(cnt) # + urls[-1]#.split('/page/')[0] + '/page/' + "2"
68
74
print "+++++++++++++" + link
69
- req = Request (url = link , callback = self .parse )
75
+ req = Request(url = link, meta = {
76
+ 'dont_redirect': True,
77
+ 'handle_httpstatus_list': [302]
78
+
79
+ }, callback=self.parse)
70
80
yield req
71
-
81
+ """
0 commit comments