Commit f5148e6

Author: CodeJCSON
Commit message: Crawl: scrape profile information from the Youyuan (有缘网) dating site

1 parent 10b0268 commit f5148e6

File tree

12 files changed: +588 additions, -0 deletions

CrawlYouYuan/.idea/CrawlYouYuan.iml

Lines changed: 11 additions & 0 deletions (generated file; diff not rendered)

CrawlYouYuan/.idea/misc.xml

Lines changed: 4 additions & 0 deletions (generated file; diff not rendered)

CrawlYouYuan/.idea/modules.xml

Lines changed: 8 additions & 0 deletions (generated file; diff not rendered)

CrawlYouYuan/.idea/workspace.xml

Lines changed: 293 additions & 0 deletions (generated file; diff not rendered)

CrawlYouYuan/CrawlYouYuan/__init__.py

Whitespace-only changes.

CrawlYouYuan/CrawlYouYuan/items.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class CrawlyouyuanItem(scrapy.Item):
    # username
    username = scrapy.Field()
    # age
    age = scrapy.Field()
    # URL of the avatar image
    header_url = scrapy.Field()
    # URLs of the album images
    images_url = scrapy.Field()
    # inner monologue (profile blurb)
    content = scrapy.Field()
    # place of origin
    place_from = scrapy.Field()
    # education level
    education = scrapy.Field()
    # hobbies and interests
    hobby = scrapy.Field()
    # personal profile page URL
    source_url = scrapy.Field()
    # source website of the data
    source = scrapy.Field()
    # UTC time
    time = scrapy.Field()
    # spider name
    spidername = scrapy.Field()
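A scrapy.Item behaves like a dict restricted to its declared fields, so a misspelled key fails loudly at assignment time. A minimal usage sketch (the values are invented for illustration):

from CrawlYouYuan.items import CrawlyouyuanItem

item = CrawlyouyuanItem()
item['username'] = 'example_user'  # invented value
item['age'] = u'25岁'              # invented value
print(dict(item))                  # {'username': 'example_user', 'age': '25岁'}
# item['usernme'] = '...'          # would raise KeyError: undeclared field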
CrawlYouYuan/CrawlYouYuan/pipelines.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import codecs

class CrawlyouyuanPipeline(object):

    def __init__(self):
        self.filename = codecs.open('content.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Serialize each item as one JSON object per line (JSON Lines).
        line = json.dumps(dict(item), ensure_ascii=False)
        self.filename.write(line + '\n')
        return item

    def close_spider(self, spider):
        # Called automatically by Scrapy when the spider finishes.
        self.filename.close()
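Because process_item writes one JSON object per line, content.json is effectively a JSON Lines file. A minimal sketch for loading the results back (assumes a finished crawl has produced content.json):

import json

with open('content.json', encoding='utf-8') as f:
    profiles = [json.loads(line) for line in f if line.strip()]

print(len(profiles), 'profiles loaded')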

CrawlYouYuan/CrawlYouYuan/settings.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-

# Scrapy settings for CrawlYouYuan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'CrawlYouYuan'

SPIDER_MODULES = ['CrawlYouYuan.spiders']
NEWSPIDER_MODULE = 'CrawlYouYuan.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'CrawlYouYuan.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'CrawlYouYuan.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'CrawlYouYuan.pipelines.CrawlyouyuanPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
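ITEM_PIPELINES maps each pipeline class to a priority from 0 to 999, and items pass through pipelines in ascending order. As a hypothetical sketch, if Scrapy's built-in ImagesPipeline were added later to download the avatar and album pictures, the ordering would look like this (the priorities and IMAGES_STORE value are assumptions, not part of this commit):

# Hypothetical extension of settings.py, not part of this commit.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,          # runs first (lower number)
    'CrawlYouYuan.pipelines.CrawlyouyuanPipeline': 300,   # then writes content.json
}
IMAGES_STORE = 'images'  # ImagesPipeline requires a storage directory
# Note: by default ImagesPipeline reads image URLs from an 'image_urls' item field.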
CrawlYouYuan/CrawlYouYuan/spiders/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
CrawlYouYuan/CrawlYouYuan/spiders/youyuan.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from CrawlYouYuan.items import CrawlyouyuanItem
import re

class YouyuanSpider(CrawlSpider):
    name = 'youyuan'
    allowed_domains = ['youyuan.com']
    start_urls = ['http://www.youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p1/']
    # The generated spider needs no other edits; just add Rule entries to rules.
    # Pattern matching each listing page
    page_links = LinkExtractor(allow=(r"youyuan.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/"))
    # Pattern matching each personal profile page
    profile_links = LinkExtractor(allow=(r"youyuan.com/\d+-profile/"))
    rules = (
        # No callback, so follow defaults to True
        Rule(page_links),
        # With a callback, follow defaults to False, so it is set explicitly
        Rule(profile_links, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = CrawlyouyuanItem()

        item['username'] = self.get_username(response)
        # age
        item['age'] = self.get_age(response)
        # URL of the avatar image
        item['header_url'] = self.get_header_url(response)
        # URLs of the album images
        item['images_url'] = self.get_images_url(response)
        # inner monologue
        item['content'] = self.get_content(response)
        # place of origin
        item['place_from'] = self.get_place_from(response)
        # education level
        item['education'] = self.get_education(response)
        # hobbies and interests
        item['hobby'] = self.get_hobby(response)
        # personal profile page URL
        item['source_url'] = response.url
        # source website of the data
        item['source'] = "youyuan"

        yield item

    def get_username(self, response):
        username = response.xpath("//dl[@class='personal_cen']//div[@class='main']/strong/text()").extract()
        if len(username):
            username = username[0]
        else:
            username = "NULL"
        return username.strip()

    def get_age(self, response):
        age = response.xpath("//dl[@class='personal_cen']//dd/p/text()").extract()
        if len(age):
            # Pull the "<digits>岁" (years-old) fragment out of the text, if present.
            matches = re.findall(u"\d+岁", age[0])
            age = matches[0] if matches else "NULL"
        else:
            age = "NULL"
        return age.strip()

    def get_header_url(self, response):
        header_url = response.xpath("//dl[@class='personal_cen']/dt/img/@src").extract()
        if len(header_url):
            header_url = header_url[0]
        else:
            header_url = "NULL"
        return header_url.strip()

    def get_images_url(self, response):
        images_url = response.xpath("//div[@class='ph_show']/ul/li/a/img/@src").extract()
        if len(images_url):
            images_url = ", ".join(images_url)
        else:
            images_url = "NULL"
        return images_url

    def get_content(self, response):
        content = response.xpath("//div[@class='pre_data']/ul/li/p/text()").extract()
        if len(content):
            content = content[0]
        else:
            content = "NULL"
        return content.strip()

    def get_place_from(self, response):
        place_from = response.xpath("//div[@class='pre_data']/ul/li[2]//ol[1]/li[1]/span/text()").extract()
        if len(place_from):
            place_from = place_from[0]
        else:
            place_from = "NULL"
        return place_from.strip()

    def get_education(self, response):
        education = response.xpath("//div[@class='pre_data']/ul/li[3]//ol[2]/li[2]/span/text()").extract()
        if len(education):
            education = education[0]
        else:
            education = "NULL"
        return education.strip()

    def get_hobby(self, response):
        hobby = response.xpath("//dl[@class='personal_cen']//ol/li/text()").extract()
        if len(hobby):
            hobby = ",".join(hobby).replace(" ", "")
        else:
            hobby = "NULL"
        return hobby.strip()
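The XPath expressions above are tied to youyuan.com's markup at the time of this commit. One way to sanity-check them offline is to run them against a saved profile page with Scrapy's Selector; a minimal sketch (sample_profile.html is a hypothetical local copy of a profile page):

from scrapy.selector import Selector

with open('sample_profile.html', encoding='utf-8') as f:
    sel = Selector(text=f.read())

# The same expression the spider uses for the username field.
print(sel.xpath("//dl[@class='personal_cen']//div[@class='main']/strong/text()").extract_first())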

CrawlYouYuan/begin.py

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
from scrapy import cmdline
cmdline.execute('scrapy crawl youyuan'.split())
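begin.py simply shells out to the Scrapy CLI so the crawl can be started from an IDE's run button; it is equivalent to running scrapy crawl youyuan in the project root. A sketch of the same entry point using Scrapy's crawler API directly:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# get_project_settings() reads scrapy.cfg / settings.py, so the pipeline
# and USER_AGENT configured above still apply.
process = CrawlerProcess(get_project_settings())
process.crawl('youyuan')
process.start()  # blocks until the crawl finishes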

CrawlYouYuan/scrapy.cfg

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = CrawlYouYuan.settings

[deploy]
#url = http://localhost:6800/
project = CrawlYouYuan
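The [deploy] section only matters if the project is pushed to a scrapyd server: with the url line uncommented and scrapyd running, the scrapyd-deploy tool from the scrapyd-client package would upload the project as CrawlYouYuan. Nothing in this commit depends on it.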
