View the data

(screenshot of the scraped data)

Implemented features

  • From scraping all the way through to:
    • Sentiment analysis (a minimal scoring sketch follows this list)
    • Word cloud
    • Scatter plot
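As a minimal sketch of the sentiment-scoring step used later: SnowNLP's sentiments property returns a score in [0, 1], where values close to 1 read as positive. The sample sentence below is made up purely for illustration.

from snownlp import SnowNLP

# hypothetical sample comment, for illustration only
score = SnowNLP('这个视频做得真不错').sentiments
print(round(score, 4))  # closer to 1 means more positive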
Weibo hot search

import json
import csv
import requests
import openpyxl  # only used in the optional .xlsx export sketch below

cookies = {
    'SUB': '_2AkMRcyacf8NxqwFRmfsQxG3lb4t3wwrEieKnL9dHJRMxHRl-yT9vqhA7tRB6OvMIc71hiVugltoaHcZ3tShbEbigs6j1',
    'SUBP': '0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWlN0z94LR6Vq-2CZGGpQC7',
    'XSRF-TOKEN': 'ALBZ5zY_YLCch7bfNKBZmnqA',
    'WBPSESS': 'gJ7ElPMf_3q2cdj5JUfmvOmOpSk7C1JxpzOkDaH8sK1Kcrj3y9bjLcsXbcXwDAKNQ5BVj_MmHDUbEie0S_8hpMBP57KXkPC2z7FS_xrFiJH7jJhopjBRFKQtskOuMVg8',
    '_s_tentry': 'weibo.com',
    'Apache': '9474795353437.963.1714399695446',
    'SINAGLOBAL': '9474795353437.963.1714399695446',
    'ULV': '1714399695470:1:1:1:9474795353437.963.1714399695446:',
}

headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'client-version': 'v2.45.7',
    # 'cookie': 'SUB=_2AkMRcyacf8NxqwFRmfsQxG3lb4t3wwrEieKnL9dHJRMxHRl-yT9vqhA7tRB6OvMIc71hiVugltoaHcZ3tShbEbigs6j1; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWlN0z94LR6Vq-2CZGGpQC7; XSRF-TOKEN=ALBZ5zY_YLCch7bfNKBZmnqA; WBPSESS=gJ7ElPMf_3q2cdj5JUfmvOmOpSk7C1JxpzOkDaH8sK1Kcrj3y9bjLcsXbcXwDAKNQ5BVj_MmHDUbEie0S_8hpMBP57KXkPC2z7FS_xrFiJH7jJhopjBRFKQtskOuMVg8; _s_tentry=weibo.com; Apache=9474795353437.963.1714399695446; SINAGLOBAL=9474795353437.963.1714399695446; ULV=1714399695470:1:1:1:9474795353437.963.1714399695446:',
    'priority': 'u=1, i',
    'referer': 'https://weibo.com/newlogin?tabtype=weibo&gid=102803&openLoginLayer=0&url=https%3A%2F%2Fwww.weibo.com%2F',
    'sec-ch-ua': '"Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'server-version': 'v2024.04.29.1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
    'x-requested-with': 'XMLHttpRequest',
    'x-xsrf-token': 'ALBZ5zY_YLCch7bfNKBZmnqA',
}

# Weibo hot-search sidebar API
response = requests.get('https://weibo.com/ajax/side/hotSearch', cookies=cookies, headers=headers)
# print(response.text)


ws = []
ws.append(['顺序', '热搜分类', '热搜关键词'])
data = json.loads(response.text)['data']['realtime']
for i in data:
    try:
        print(f'热搜:{i["realpos"]}, 热搜分类[{i["category"]}], 热搜关键词:{i["word"]}')
        ws.append([i["realpos"], i["category"], i["word"].replace(' ', '')])
    except KeyError:
        # promoted/pinned entries may lack "realpos" or "category"; skip them
        pass

with open("热搜.csv", 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for i in ws:
        writer.writerow(i)
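openpyxl is imported but the script only writes a CSV; as an optional sketch (not part of the original), the same rows could also be saved to an .xlsx workbook:

wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = '热搜'
for row in ws:  # ws is the list of rows built above
    sheet.append(row)
wb.save('热搜.xlsx')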

Bilibili comments

import requests  # 发送请求
import pandas as pd  # 保存csv文件
import os  # 判断文件是否存在
import time
from time import sleep  # 设置等待,防止反爬
import random  # 生成随机数
from PIL import Image
import jieba.analyse
from pprint import pprint
import numpy as np
from wordcloud import WordCloud
from snownlp import SnowNLP
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import matplotlib.dates as mdates
plt.rcParams['font.sans-serif'] = ['SimHei'] # 设置显示中文字体
plt.rcParams['axes.unicode_minus'] = False  # 设置正常显示符号
my_font = FontProperties(fname=r"/Users/easyo/Library/Fonts/方正像素16.ttf", size=12)


# 请求头
headers = {
   'authority': 'api.bilibili.com',
   'accept': 'application/json, text/plain, */*',
   'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
   # 需定期更换cookie,否则location爬不到
   'cookie': "buvid3=400B449F-552C-4641-2982-7A9D4008CE3D06541infoc; b_nut=1713932206; _uuid=F41AF179-6FB2-6FE9-F898-E5D46565D77202875infoc; enable_web_push=DISABLE; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; buvid4=A926CCEE-259E-5574-F40D-6A4529C094C708059-024042404-8M%2Fk2zYeusDF%2FagXt%2FweLhkVEVvdqLmWsrQNaxTc8cqHcQKXI6g%2Br0n2Iy%2BS4hA7; CURRENT_FNVAL=4048; rpdid=0zbfVFVf3J|CY59vGJe|41u|3w1RZu3X; bsource=search_sougo; header_theme_version=CLOSE; DedeUserID=44780233; DedeUserID__ckMd5=a3f833e80f6ac30a; home_feed_column=5; fingerprint=6bc4ba3263b0340544fd36f78b75cdca; buvid_fp_plain=undefined; buvid_fp=6bc4ba3263b0340544fd36f78b75cdca; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MTQzMTQ5NzksImlhdCI6MTcxNDA1NTcxOSwicGx0IjotMX0.y8ksDiq8et6P8vEePKcXDScX32YUtTCQFhCIlSVIHLs; bili_ticket_expires=1714314919; browser_resolution=1708-818; SESSDATA=206ce534%2C1729749695%2Cf9ef4%2A41CjDNW93e15MamYUOxdoJjwGBTp0griKXnb-tskr20ghsRvgwLTPT7mQ8jof1k7j0wD8SVkpZSUllT3MxUGE4NVFNZ3pTeVVRNE9GMVV1ajc3MkpCcElRZTlGYUdlTmNtTWdIbGdPZ3dQVDJDRHd1UDdqWWdDMFU5ME1VWDd1RXVrbW1KWUtpOGpBIIEC; bili_jct=c32cacedc61a1ed9fb9313455af62794; sid=788cfgno; bp_video_offset_44780233=925053543733264419; b_lsid=101052C68B_18F1FB82769; PVID=4",
   'origin': 'https://www.bilibili.com',
   'referer': 'https://www.bilibili.com/video/BV1FG4y1Z7po/?spm_id_from=333.337.search-card.all.click&vd_source=69a50ad969074af9e79ad13b34b1a548',
   'sec-ch-ua': '"Chromium";v="106", "Microsoft Edge";v="106", "Not;A=Brand";v="99"',
   'sec-ch-ua-mobile': '?0',
   'sec-ch-ua-platform': '"Windows"',
   'sec-fetch-dest': 'empty',
   'sec-fetch-mode': 'cors',
   'sec-fetch-site': 'same-site',
   'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
}


def trans_date(v_timestamp):
   """10位时间戳转换为时间字符串"""
   timeArray = time.localtime(v_timestamp)
   otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
   return otherStyleTime


def bv2av(bid):
   """把哔哩哔哩视频的bv号转成av号"""
   table = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
   tr = {}
   for i in range(58):
       tr[table[i]] = i
   s = [11, 10, 3, 8, 4, 6]
   r = 0
   for i in range(6):
       r += tr[bid[s[i]]] * 58 ** i
   aid = (r - 8728348608) ^ 177451812
   return aid
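For a quick sanity check, the inverse mapping (av back to bv) can be sketched as below; this is an assumption based on the same classic table and offsets used above, and the round trip av2bv(bv2av(x)) == x holds for ids covered by this scheme.

def av2bv(aid):
    """Inverse of bv2av above (classic scheme; sketch, not from the original script)"""
    table = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
    s = [11, 10, 3, 8, 4, 6]
    x = (aid ^ 177451812) + 8728348608
    r = list('BV1  4 1 7  ')  # fixed characters of a BV id; blanks get filled below
    for i in range(6):
        r[s[i]] = table[x // 58 ** i % 58]
    return ''.join(r)

# round-trip check
# print(av2bv(bv2av('BV1FG4y1Z7po')))  # should print the same BV id back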


def get_comment(v_aid, v_bid):
    """
    爬取B站评论数据
    :param v_aid: B站视频的aid号
    :param v_bid: B站视频的bid号
    :return: None
    """
    # 循环页码爬取评论
    for i in range(max_page):
        try:
            sleep(random.uniform(0, 5))  # 随机等待,防止反爬
            # 请求地址:原代码写死了带 wbi 签名的单个 oid,翻页和换视频参数都不生效;
            # 这里改用按 oid 和页码拼接的评论接口(需保持 cookie 有效)
            url = 'https://api.bilibili.com/x/v2/reply?type=1&oid={}&pn={}&sort=2'.format(v_aid, i + 1)
            response = requests.get(url, headers=headers)  # 发送请求
            data_list = response.json()['data']['replies']  # 解析评论数据
            print('正在爬取B站评论[{}]: 第{}页,共{}条评论'.format(v_bid, i + 1, len(data_list)))
            comment_list = []  # 评论内容
            location_list = []  # IP属地
            time_list = []  # 评论时间
            user_list = []  # 评论作者
            like_list = []  # 点赞数
            # 循环解析每一条评论数据
            for a in data_list:
                # 评论内容
                comment_list.append(a['content']['message'])
                # IP属地(部分评论没有该字段)
                try:
                    location = a['reply_control']['location']
                except KeyError:
                    location = ""
                location_list.append(location.replace("IP属地:", ""))
                # 评论时间(避免用 time 作变量名,以免遮蔽 time 模块)
                time_list.append(trans_date(a['ctime']))
                # 评论作者
                user_list.append(a['member']['uname'])
                # 点赞数
                like_list.append(a['like'])
            # 把列表拼装为DataFrame数据
            df = pd.DataFrame({
                '视频链接': 'https://www.bilibili.com/video/' + v_bid,
                '评论页码': (i + 1),
                '评论作者': user_list,
                '评论时间': time_list,
                'IP属地': location_list,
                '点赞数': like_list,
                '评论内容': comment_list,
            })
            # 首次写入带表头,之后追加
            header = not os.path.exists(outfile)
            # 把评论数据保存到csv文件
            df.to_csv(outfile, mode='a+', encoding='utf_8_sig', index=False, header=header)
        except Exception as e:
            print('爬评论发生异常: {},继续..'.format(str(e)))


if __name__ == '__main__':
   print('爬虫开始执行!')
   bid_list = ['BV1sx421Q7nb', 'BV1Hz421Q7Wf', 'BV1Jt421V7dE']
   # 评论最大爬取页(每页20条评论)
   max_page = 10
   # 获取当前时间戳
   now = time.strftime("%Y%m%d%H%M%S", time.localtime())
   # 输出文件名
   outfile = 'b站评论_{}.csv'.format(now)
   # 循环爬取这几个视频的评论
   for bid in bid_list:
       # 转换aid
       aid = bv2av(bid=bid)
       # 爬取评论
       get_comment(v_aid=aid, v_bid=bid)
   print('爬虫正常结束!')
   # 去除重复
   df = pd.read_csv(outfile)
   # 去重
   df.drop_duplicates(subset=['评论作者','评论时间','评论内容'], inplace=True, keep='first')
   # 再次保存csv文件
   df.to_csv('最新的评论数据.csv' , index=False, encoding='utf_8_sig')
   print('数据清洗完成')

# 情感分析打标(定义在模块层级,供下面第二个 __main__ 代码块调用;
# 原代码缩进在上一个 __main__ 块内且层级不一致,会触发 IndentationError)
def sentiment_analyse(v_cmt_list):
    """
    情感分析打分
    :param v_cmt_list: 需要处理的评论列表
    :return:
    """
    score_list = []  # 情感评分值
    tag_list = []  # 打标分类结果
    pos_count = 0  # 计数器-积极
    neg_count = 0  # 计数器-气愤
    nog_count = 0  # 计数器-焦虑
    for comment in v_cmt_list:
        sentiments_score = SnowNLP(str(comment)).sentiments
        if sentiments_score < 0.1:
            tag = '气愤'
            neg_count += 1
        elif sentiments_score < 0.4:
            tag = '焦虑'
            nog_count += 1
        else:
            # 原代码在积极分支里还多加了一次 neg_count,会把占比算错
            tag = '积极'
            pos_count += 1
        score_list.append(sentiments_score)  # 得分值
        tag_list.append(tag)  # 判定结果
    total = pos_count + neg_count + nog_count
    print('积极评价占比:', round(pos_count / total, 4))
    print('焦虑评价占比:', round(nog_count / total, 4))
    print('气愤评价占比:', round(neg_count / total, 4))
    df['情感得分'] = score_list
    df['分析结果'] = tag_list
    # 把情感分析结果保存到excel文件(原代码重复保存了两次)
    df.to_excel('关于提案_情感评分结果.xlsx', index=False)
    print('情感分析结果已生成:关于提案_情感评分结果.xlsx')
    # 情感倾向分布饼图
    labels = ['气愤', '焦虑', '积极']
    sizes = [neg_count, nog_count, pos_count]
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    plt.axis('equal')
    plt.title('情感倾向分布—饼图')
    plt.savefig('情感倾向分布-饼图.png', dpi=300)
    plt.show()
    plt.close()

def make_wordcloud(v_str, v_stopwords, v_outfile):
  """
  绘制词云图
  :param v_str: 输入字符串
  :param v_stopwords: 停用词
  :param v_outfile: 输出文件
  :return: None
  """
  print('开始生成词云图:{}'.format(v_outfile))
  try:
     stopwords = v_stopwords  # 停用词
     backgroud_Image = np.array(Image.open('47.png'))  # 读取背景图片
     wc = WordCloud(
        background_color="white",  # 背景颜色
        width=1500,  # 图宽
        height=1200,  # 图高
        max_words=1000,  # 最多字数
         font_path=r"C:\Windows\Fonts\simhei.ttf",  # 字体文件路径(加 r 前缀避免 \ 被当作转义),根据实际情况(Windows)替换
        stopwords=stopwords,  # 停用词
        mask=backgroud_Image,  # 背景图片
     )
     jieba_text = " ".join(jieba.lcut(v_str))  # jieba分词
     wc.generate_from_text(jieba_text)  # 生成词云图
     wc.to_file(v_outfile)  # 保存图片文件
     print('词云文件保存成功:{}'.format(v_outfile))
  except Exception as e:
     print('make_wordcloud except: {}'.format(str(e)))

def plot(df_cmt_date):
   df_cmt_date.columns = ['评论日期', '评论数量']
   import matplotlib.dates as mdates
   df_cmt_date.plot('评论日期', '评论数量', kind='scatter')
   plt.title('评论数量统计-散点图')
   plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
   plt.savefig('评论数量统计-散点图.png', dpi=300)
   plt.show()

if __name__ == '__main__':
  df =pd.read_csv('最新的评论数据.csv' )
  df['评论时间2'] = pd.to_datetime(df['评论时间'])
  df['评论日期'] = df['评论时间2'].dt.date
  df_cmt_date = df['评论日期'].value_counts()
  df_cmt_date = df_cmt_date.reset_index()
  v_cmt_list=df["评论内容"].values.tolist()
  print('length of v_cmt_list is:{}'.format(len(v_cmt_list)))
  sentiment_analyse(v_cmt_list=v_cmt_list)
  v_cmt_str=' '.join(str(i) for i in v_cmt_list)
  keywords_top20=jieba.analyse.extract_tags(v_cmt_str,withWeight=True,topK=20)
  print('top20关键词及权重:')
  pprint(keywords_top20)
  make_wordcloud(v_str=v_cmt_str,v_stopwords=['有','13','21','50'],v_outfile='提案_词云图.jpg')
  plot(df_cmt_date)

Results

(screenshots: sentiment-distribution pie chart, word cloud, and comment-count scatter plot)

Bilibili trending ranking

import requests
import json
import pandas as pd

url = 'https://api.bilibili.com/x/web-interface/ranking/v2?rid=0&type=all'
headers = {
    'Referer': 'https://www.bilibili.com/v/popular/rank/all/',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
}
response = requests.get(url, headers=headers)
data = json.loads(response.text)

video_list = data['data']['list']
rank_list = []
title_list = []
play_cnt_list = []
danmu_cnt_list = []
coin_cnt_list = []
author_list = []
video_url = []

for idx, video in enumerate(video_list):
    rank_list.append(idx + 1)
    title_list.append(video['title'])
    play_cnt_list.append(video['stat']['view'])
    danmu_cnt_list.append(video['stat']['danmaku'])
    coin_cnt_list.append(video['stat']['coin'])
    author_list.append(video['owner']['name'])
    video_url.append(f'https://www.bilibili.com/video/{video["bvid"]}')

df = pd.DataFrame({
    '排行': rank_list,
    '视频标题': title_list,
    '视频地址': video_url,
    '作者': author_list,
    '播放数': play_cnt_list,
    '弹幕数': danmu_cnt_list,
    '硬币数': coin_cnt_list,
})

df.to_csv('/users/easyo/B站热榜.csv', index=False, encoding="utf_8_sig")
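As a small follow-up sketch (not in the original), the saved CSV can be reloaded to list the top entries by play count; the path simply mirrors the one used above:

top = pd.read_csv('/users/easyo/B站热榜.csv')
# sort by play count and show the ten most-played videos
print(top.sort_values('播放数', ascending=False).head(10)[['排行', '视频标题', '播放数']])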

Xiaohongshu notes

import requests  # 发送请求
import random
from time import sleep  # 设置等待,防止反爬
import time
import pandas as pd  # 保存csv
import datetime
import os


def trans_date(v_timestamp):
	"""13位时间戳转换为时间字符串"""
	v_timestamp = int(str(v_timestamp)[:10])
	timeArray = time.localtime(v_timestamp)
	otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
	return otherStyleTime


# 当前时间戳
now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
# 爬取目标
page_id = '5d2dc8564c11180001712db3'
# 保存文件名
result_file = '小红书TAG_{}_{}.csv'.format(page_id, now)

# 请求头
h1 = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}

# 初始化页码
page = 1
while True:
	title_list = []  # 笔记标题
	note_id_list = []  # 笔记id
	note_url_list = []  # 笔记链接
	likes_list = []  # 点赞数
	author_name_list = []  # 作者昵称
	author_id_list = []  # 作者id
	author_url_list = []  # 作者链接
	create_time_list = []  # 发布时间
	# 随机等待,防止反爬
	sleep(random.uniform(0, 2))
	# 判断是否首页
	if page == 1:
		url = 'https://www.xiaohongshu.com/web_api/sns/v3/page/notes?page_size=6&sort=hot&page_id={}&sid=&limit=false'.format(
			page_id)
	else:
		url = 'https://www.xiaohongshu.com/web_api/sns/v3/page/notes?page_size=6&sort=hot&page_id={}&sid=&cursor={}&limit=false'.format(
			page_id, next_cursor)
	# 发送请求
	r = requests.get(url, headers=h1)
	# 解析数据
	json_data = r.json()
	note_num = len(json_data['data']['notes'])
	print('开始爬第{}页,笔记数量:{}'.format(page, note_num))
	# 解析
	for i in json_data['data']['notes']:
		# 笔记标题
		title = i['title']
		title_list.append(title)
		# 笔记id
		note_id = i['id']
		note_id_list.append(note_id)
		# 笔记链接
		note_url_list.append('https://www.xiaohongshu.com/explore/' + note_id)
		# 点赞数
		like_count = i['likes']
		likes_list.append(like_count)
		# 作者昵称
		author_name = i['user']['nickname']
		author_name_list.append(author_name)
		# 作者id
		author_id = i['user']['userid']
		author_id_list.append(author_id)
		# 作者链接
		author_url_list.append('https://www.xiaohongshu.com/user/profile/' + author_id)
		# 发布时间
		create_time = trans_date(i['create_time'])
		create_time_list.append(create_time)
	# 保存数据到DF
	df = pd.DataFrame(
		{
			'页码': page,
			'笔记标题': title_list,
			'笔记id': note_id_list,
			'笔记链接': note_url_list,
			'点赞数': likes_list,
			'作者昵称': author_name_list,
			'作者id': author_id_list,
			'作者链接': author_url_list,
			'发布时间': create_time_list,
		}
	)
	# 设置csv文件表头
	if os.path.exists(result_file):
		header = False
	else:
		header = True
	# 保存到csv
	df.to_csv(result_file, mode='a+', header=header, index=False, encoding='utf_8_sig')
	print('文件保存成功:', result_file)
	# 判断终止条件
	next_cursor = json_data['data']['cursor']
	if not json_data['data']['has_more']:
		print('没有下一页了,终止循环!')
		break
	page += 1
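Mirroring the dedupe step applied to the Bilibili comments earlier, here is a sketch (not part of the original) of removing duplicate notes from the file just written, keyed on the note id:

df_all = pd.read_csv(result_file)
# drop duplicate notes by note id, keep the first occurrence
df_all.drop_duplicates(subset=['笔记id'], inplace=True, keep='first')
df_all.to_csv(result_file, index=False, encoding='utf_8_sig')
print('去重完成,共{}条笔记'.format(len(df_all)))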

Douban comments

import requests
from lxml import etree
#import pandas as pd
#import time

def page(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/116.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
               'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
               # 'Accept-Encoding': 'gzip, deflate, br',
               'Connection': 'keep-alive',
               'Referer': 'https://movie.douban.com/',
               'Upgrade-Insecure-Requests': '1',
               'Sec-Fetch-Dest': 'iframe',
               'Sec-Fetch-Mode': 'navigate',
               'Sec-Fetch-Site': 'cross-site',
               'Cookie': 'll="118381"; bid=u8W938yW3Rw; __utma=30149280.1427340690.1714406596.1714476685.1714485673.6; __utmz=30149280.1714485673.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; dbcl2="230302338:Y8MCu494BKI"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.23030; __gads=ID=20dba742ff0011d4:T=1714406703:RT=1714485675:S=ALNI_MZz8jZp982rT48QdQdDBjh5a_vi0w; __gpi=UID=00000dff8c542177:T=1714406703:RT=1714485675:S=ALNI_MZxicDjcSAMi_pdkeduQ3otQCutaQ; __eoi=ID=4874846fe6a3305c:T=1714406703:RT=1714485675:S=AA-Afjb2qzoXLCtivOMEIgch2k1e; __utmc=30149280; ck=mJ0K; frodotk_db="f723232f0ba97d93e569698d22fc78b9"; __utmb=30149280.2.10.1714485673; ap_v=0,6.0; __utmt=1',
               }
    response = requests.get(url=url, headers=headers)
    c_html = response.content
    return c_html

def next_p(c_html,base_url):
    link = None
    html_n = etree.HTML(c_html)
    lu_next = html_n.xpath('//div[@id="paginator"]/a[@class="next"]/@href')
    if lu_next:
        link = base_url + lu_next[0]
   # time.sleep(5)
    return link

def pin(c_html):
    """解析当前页评论文本所在的 span 节点"""
    et_html = etree.HTML(c_html)
    lujing = et_html.xpath("/html/body/div[3]/div[1]/div/div[1]/div[4]/div/div[2]/p/span")
    return lujing

def tol():
    base_url = 'https://movie.douban.com/subject/4321270/comments'
    link = base_url
    data = []  # 累积所有页的评论(原代码每页都重置列表,最终只会返回最后一页)
    while link:
        c_html = page(link)
        page_data = [j.text for j in pin(c_html)]
        print(page_data)
        data.extend(page_data)
        link = next_p(c_html, base_url)
    return data

if __name__ == '__main__':
    #df = pd.DataFrame(tol())
    #df.to_csv('output.csv',index = False, encoding='utf-8')
    tol()
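The commented-out lines above hint at exporting the result; as a sketch under that assumption (uncomment the pandas import first), the collected comments could be saved like this:

comments = tol()
df = pd.DataFrame({'评论内容': comments})  # column name chosen here, not from the original
df.to_csv('豆瓣评论.csv', index=False, encoding='utf_8_sig')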