Skip to content

Commit a7923f8

Browse files
committed
edit003
1 parent 73af9a3 commit a7923f8

File tree

7 files changed: 882 additions and 41 deletions.

scrmusic/items.json

Lines changed: 833 additions & 2 deletions
Large diffs are not rendered by default.

scrmusic/scrmusic/pipelines.pyc

0 Bytes
Binary file not shown.

scrmusic/scrmusic/settings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
NEWSPIDER_MODULE = 'scrmusic.spiders'
1515
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
1616
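# Disable cookies to avoid being banned. NOTE(review): the setting below is True (cookies enabled), and commenting it out presumably falls back to Scrapy's default, which is also enabled; use COOKIES_ENABLED = False to actually disable cookies.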
17-
COOKIES_ENABLED = True
17+
#COOKIES_ENABLED = True
1818
ITEM_PIPELINES = {
1919
'scrmusic.pipelines.ScrmusicPipeline':300
2020
}

scrmusic/scrmusic/settings.pyc

-37 Bytes
Binary file not shown.

scrmusic/scrmusic/spiders/items.json

Whitespace-only changes.

scrmusic/scrmusic/spiders/xiami_spider.py

Lines changed: 48 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,34 @@
55
from scrapy.selector import HtmlXPathSelector
66
from scrmusic.items import XiamiItem
77
from scrapy.http import Request
8-
cnt = 2
8+
cnt = 4
99
class DmozSpider(BaseSpider):
1010
name = "xiami"
1111
allowed_domains = ["xiami.com"]
1212
# Set the crawl speed (delay between downloads)
13-
download_delay = 1
13+
#download_delay = 1
1414
start_urls = [
1515
# URL of the first page
16-
"http://www.xiami.com/space/charts-recent/u/40753994?spm=a1z1s.6928797.1561534497.9.itdx5s",
17-
# "http://www.xiami.com/space/charts-recent/u/5447372?spm=a1z1s.6928793.1561534497.9.LVnEOi",
18-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/2",
19-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/3",
20-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/4",
21-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/5",
22-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/6",
23-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/7",
24-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/8",
25-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/9",
26-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/10",
27-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/11",
28-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/12",
29-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/13",
30-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/14",
31-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/15",
32-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/16",
33-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/17",
34-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/18",
35-
#"http://www.xiami.com/space/charts-recent/u/5447372/page/19",
16+
#"http://www.xiami.com/space/charts-recent/u/40753994?spm=a1z1s.6928797.1561534497.9.itdx5s",
17+
"http://www.xiami.com/space/charts-recent/u/5447372?spm=a1z1s.6928793.1561534497.9.LVnEOi",
18+
"http://www.xiami.com/space/charts-recent/u/5447372/page/2",
19+
"http://www.xiami.com/space/charts-recent/u/5447372/page/3",
20+
"http://www.xiami.com/space/charts-recent/u/5447372/page/4",
21+
"http://www.xiami.com/space/charts-recent/u/5447372/page/5",
22+
"http://www.xiami.com/space/charts-recent/u/5447372/page/6",
23+
"http://www.xiami.com/space/charts-recent/u/5447372/page/7",
24+
"http://www.xiami.com/space/charts-recent/u/5447372/page/8",
25+
"http://www.xiami.com/space/charts-recent/u/5447372/page/9",
26+
"http://www.xiami.com/space/charts-recent/u/5447372/page/10",
27+
"http://www.xiami.com/space/charts-recent/u/5447372/page/11",
28+
"http://www.xiami.com/space/charts-recent/u/5447372/page/12",
29+
"http://www.xiami.com/space/charts-recent/u/5447372/page/13",
30+
"http://www.xiami.com/space/charts-recent/u/5447372/page/14",
31+
"http://www.xiami.com/space/charts-recent/u/5447372/page/15",
32+
"http://www.xiami.com/space/charts-recent/u/5447372/page/16",
33+
"http://www.xiami.com/space/charts-recent/u/5447372/page/17",
34+
"http://www.xiami.com/space/charts-recent/u/5447372/page/18",
35+
"http://www.xiami.com/space/charts-recent/u/5447372/page/19",
3636
]
3737
"""
3838
rules = (
@@ -48,24 +48,34 @@ def parse(self, response):
4848
# Record which user this page belongs to
4949
user = hxs.x('//head/title/text()').extract()[0][:-7].encode('utf-8')
5050
# Record the total number of listening records the user has
51-
#sum = hxs.x('//span').extract()[-3].encode('utf-8').split('共')[1].split('条')[0]
51+
sum = hxs.x('//span').extract()[-3].encode('utf-8').split('共')[1].split('条')[0]
52+
currentPage = hxs.x('//span').extract()[-3].encode('utf-8').split('第')[1].split('页')[0]
53+
5254
#for site in sites:
53-
for i in range(1, 2):
54-
item = XiamiItem()
55-
item['user'] = user
56-
item['song'] = sites.x('tr[' + str(i) + ']/td[2]/a').extract()[0].split('\"')[3].encode('utf-8')
57-
print '_______________' + item['song']
58-
item['artist'] = sites.x('tr[' + str(i) + ']/td[2]/a/text()').extract()[1].encode('utf-8')
59-
items.append(item)
60-
yield items
61-
#return items
62-
yield items
63-
if cnt > 1:
55+
if int(currentPage) <= int(sum) / 50:
56+
for i in range(1, 50):
57+
item = XiamiItem()
58+
item['user'] = user
59+
item['song'] = sites.x('tr[' + str(i) + ']/td[2]/a').extract()[0].split('\"')[3].encode('utf-8')
60+
print '_______________' + item['song']
61+
item['artist'] = sites.x('tr[' + str(i) + ']/td[2]/a/text()').extract()[1].encode('utf-8')
62+
print '+++++++++++++++' + item['artist']
63+
items.append(item)
64+
#yield item
65+
return items
66+
#yield items
67+
"""
68+
if cnt < 10:
6469
urls = hxs.x('//div[@class="all_page"]/a/@href').extract()
65-
cnt = cnt - 1
70+
print urls
71+
cnt = cnt + 1
6672
#for url in urls:
67-
link = 'http://www.xiami.com' + urls[-1]#.split('/page/')[0] + '/page/' + "2"
73+
link = 'http://www.xiami.com/space/charts-recent/u/5447372/page/' + str(cnt) #+ urls[-1]#.split('/page/')[0] + '/page/' + "2"
6874
print "+++++++++++++" + link
69-
req = Request(url = link, callback=self.parse)
75+
req = Request(url = link, meta = {
76+
'dont_redirect': True,
77+
'handle_httpstatus_list': [302]
78+
79+
}, callback=self.parse)
7080
yield req
71-
81+
"""
1.16 KB
Binary file not shown.

0 commit comments

Comments
 (0)