Commit f5148e6

Author: CodeJCSON
Commit message: Crawl: scrape profile information from the Youyuan (有缘网) dating site

1 parent 10b0268 commit f5148e6

File tree

12 files changed: +588 additions, -0 deletions

CrawlYouYuan/.idea/CrawlYouYuan.iml

Lines changed: 11 additions & 0 deletions (generated file; diff not rendered)

CrawlYouYuan/.idea/misc.xml

Lines changed: 4 additions & 0 deletions (generated file; diff not rendered)

CrawlYouYuan/.idea/modules.xml

Lines changed: 8 additions & 0 deletions (generated file; diff not rendered)

CrawlYouYuan/.idea/workspace.xml

Lines changed: 293 additions & 0 deletions (generated file; diff not rendered)

CrawlYouYuan/CrawlYouYuan/__init__.py

Whitespace-only changes.

CrawlYouYuan/CrawlYouYuan/items.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class CrawlyouyuanItem(scrapy.Item):
    # username
    username = scrapy.Field()
    # age
    age = scrapy.Field()
    # URL of the avatar image
    header_url = scrapy.Field()
    # URLs of the album images
    images_url = scrapy.Field()
    # inner monologue (profile blurb)
    content = scrapy.Field()
    # place of origin
    place_from = scrapy.Field()
    # education level
    education = scrapy.Field()
    # hobbies and interests
    hobby = scrapy.Field()
    # personal profile page URL
    source_url = scrapy.Field()
    # source website of the data
    source = scrapy.Field()
    # UTC time
    time = scrapy.Field()
    # spider name
    spidername = scrapy.Field()
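A scrapy.Item behaves like a dict restricted to its declared fields, so a misspelled key fails loudly at assignment time. A minimal usage sketch (the values are invented for illustration):

from CrawlYouYuan.items import CrawlyouyuanItem

item = CrawlyouyuanItem()
item['username'] = 'example_user'  # invented value
item['age'] = u'25岁'              # invented value
print(dict(item))                  # {'username': 'example_user', 'age': '25岁'}
# item['usernme'] = '...'          # would raise KeyError: undeclared field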
CrawlYouYuan/CrawlYouYuan/pipelines.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import codecs

class CrawlyouyuanPipeline(object):

    def __init__(self):
        self.filename = codecs.open('content.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Serialize each item as one JSON object per line (JSON Lines).
        line = json.dumps(dict(item), ensure_ascii=False)
        self.filename.write(line + '\n')
        return item

    def close_spider(self, spider):
        # Called automatically by Scrapy when the spider finishes.
        self.filename.close()
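Because process_item writes one JSON object per line, content.json is effectively a JSON Lines file. A minimal sketch for loading the results back (assumes a finished crawl has produced content.json):

import json

with open('content.json', encoding='utf-8') as f:
    profiles = [json.loads(line) for line in f if line.strip()]

print(len(profiles), 'profiles loaded')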

CrawlYouYuan/CrawlYouYuan/settings.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-

# Scrapy settings for CrawlYouYuan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'CrawlYouYuan'

SPIDER_MODULES = ['CrawlYouYuan.spiders']
NEWSPIDER_MODULE = 'CrawlYouYuan.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'CrawlYouYuan.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'CrawlYouYuan.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'CrawlYouYuan.pipelines.CrawlyouyuanPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
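ITEM_PIPELINES maps each pipeline class to a priority from 0 to 999, and items pass through pipelines in ascending order. As a hypothetical sketch, if Scrapy's built-in ImagesPipeline were added later to download the avatar and album pictures, the ordering would look like this (the priorities and IMAGES_STORE value are assumptions, not part of this commit):

# Hypothetical extension of settings.py, not part of this commit.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,          # runs first (lower number)
    'CrawlYouYuan.pipelines.CrawlyouyuanPipeline': 300,   # then writes content.json
}
IMAGES_STORE = 'images'  # ImagesPipeline requires a storage directory
# Note: by default ImagesPipeline reads image URLs from an 'image_urls' item field.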
CrawlYouYuan/CrawlYouYuan/spiders/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
CrawlYouYuan/CrawlYouYuan/spiders/youyuan.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from CrawlYouYuan.items import CrawlyouyuanItem
import re

class YouyuanSpider(CrawlSpider):
    name = 'youyuan'
    allowed_domains = ['youyuan.com']
    start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
    # The generated spider needs no other edits; just add Rule entries to rules.
    # Pattern matching each listing page
    page_links = LinkExtractor(allow=(r"youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/"))
    # Pattern matching each personal profile page
    profile_links = LinkExtractor(allow=(r"youyuan.com/\d+-profile/"))
    rules = (
        # No callback, so follow defaults to True
        Rule(page_links),
        # With a callback, follow defaults to False, so it is set explicitly
        Rule(profile_links, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = CrawlyouyuanItem()

        item['username'] = self.get_username(response)
        # age
        item['age'] = self.get_age(response)
        # URL of the avatar image
        item['header_url'] = self.get_header_url(response)
        # URLs of the album images
        item['images_url'] = self.get_images_url(response)
        # inner monologue
        item['content'] = self.get_content(response)
        # place of origin
        item['place_from'] = self.get_place_from(response)
        # education level
        item['education'] = self.get_education(response)
        # hobbies and interests
        item['hobby'] = self.get_hobby(response)
        # personal profile page URL
        item['source_url'] = response.url
        # source website of the data
        item['source'] = "youyuan"

        yield item

    def get_username(self, response):
        username = response.xpath("//dl[@class='personal_cen']//div[@class='main']/strong/text()").extract()
        if len(username):
            username = username[0]
        else:
            username = "NULL"
        return username.strip()

    def get_age(self, response):
        age = response.xpath("//dl[@class='personal_cen']//dd/p/text()").extract()
        if len(age):
            # Pull the "<digits>岁" (years-old) fragment out of the text, if present.
            matches = re.findall(u"\d+岁", age[0])
            age = matches[0] if matches else "NULL"
        else:
            age = "NULL"
        return age.strip()

    def get_header_url(self, response):
        header_url = response.xpath("//dl[@class='personal_cen']/dt/img/@src").extract()
        if len(header_url):
            header_url = header_url[0]
        else:
            header_url = "NULL"
        return header_url.strip()

    def get_images_url(self, response):
        images_url = response.xpath("//div[@class='ph_show']/ul/li/a/img/@src").extract()
        if len(images_url):
            images_url = ", ".join(images_url)
        else:
            images_url = "NULL"
        return images_url

    def get_content(self, response):
        content = response.xpath("//div[@class='pre_data']/ul/li/p/text()").extract()
        if len(content):
            content = content[0]
        else:
            content = "NULL"
        return content.strip()

    def get_place_from(self, response):
        place_from = response.xpath("//div[@class='pre_data']/ul/li[2]//ol[1]/li[1]/span/text()").extract()
        if len(place_from):
            place_from = place_from[0]
        else:
            place_from = "NULL"
        return place_from.strip()

    def get_education(self, response):
        education = response.xpath("//div[@class='pre_data']/ul/li[3]//ol[2]/li[2]/span/text()").extract()
        if len(education):
            education = education[0]
        else:
            education = "NULL"
        return education.strip()

    def get_hobby(self, response):
        hobby = response.xpath("//dl[@class='personal_cen']//ol/li/text()").extract()
        if len(hobby):
            hobby = ",".join(hobby).replace(" ", "")
        else:
            hobby = "NULL"
        return hobby.strip()
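The XPath expressions above are tied to youyuan.com's markup at the time of this commit. One way to sanity-check them offline is to run them against a saved profile page with Scrapy's Selector; a minimal sketch (sample_profile.html is a hypothetical local copy of a profile page):

from scrapy.selector import Selector

with open('sample_profile.html', encoding='utf-8') as f:
    sel = Selector(text=f.read())

# The same expression the spider uses for the username field.
print(sel.xpath("//dl[@class='personal_cen']//div[@class='main']/strong/text()").extract_first())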

CrawlYouYuan/begin.py

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
from scrapy import cmdline
cmdline.execute('scrapy crawl youyuan'.split())
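begin.py simply shells out to the Scrapy CLI so the crawl can be started from an IDE's run button; it is equivalent to running scrapy crawl youyuan in the project root. A sketch of the same entry point using Scrapy's crawler API directly:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# get_project_settings() reads scrapy.cfg / settings.py, so the pipeline
# and USER_AGENT configured above still apply.
process = CrawlerProcess(get_project_settings())
process.crawl('youyuan')
process.start()  # blocks until the crawl finishes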

CrawlYouYuan/scrapy.cfg

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = CrawlYouYuan.settings

[deploy]
#url = http://localhost:6800/
project = CrawlYouYuan
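The [deploy] section only matters if the project is pushed to a scrapyd server: with the url line uncommented and scrapyd running, the scrapyd-deploy tool from the scrapyd-client package would upload the project as CrawlYouYuan. Nothing in this commit depends on it.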
