-
Notifications
You must be signed in to change notification settings - Fork 3
/
video_info_for_up_bilibili.py
136 lines (125 loc) · 4.89 KB
/
video_info_for_up_bilibili.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 28 15:22:37 2018
@author: Administrator
"""
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
def get_av_list(up_num, max_empty_retries=30):
    """Collect the av numbers of the videos listed on an uploader's space page.

    Opens ``http://space.bilibili.com/<up_num>#/video`` in headless Chrome,
    scans every ``<a>`` whose href ends in ``av<digits>`` and keeps clicking
    the ``be-pager-next`` element until the click fails.

    Args:
        up_num: Uploader ID substituted into the space URL.
        max_empty_retries: Maximum number of re-scans performed while no av
            link has been found yet and the pager cannot be clicked (the page
            may still be rendering). Previously this retried forever, which
            hung on uploaders without any video.

    Returns:
        The de-duplicated av numbers as ints, sorted in descending order.
        The list is empty when nothing was found within the retry budget.
    """
    # Imported here so the block does not depend on a new file-level import.
    from selenium.common.exceptions import WebDriverException

    # Matches the digits of an href that ends in "av<digits>".
    av_pattern = re.compile(r'(?<=av)([0-9]+)$')

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run Chrome without a window
    driver = webdriver.Chrome(chrome_options=chrome_options)

    av_found = set()
    empty_retries = 0
    try:
        driver.get('http://space.bilibili.com/%s#/video' % (up_num,))
        while True:
            soup = BeautifulSoup(driver.page_source, features='lxml')
            for link in soup.find_all('a'):
                href = link.get('href')
                if not href:
                    continue
                matches = av_pattern.findall(href)
                if matches:
                    av_found.add(int(matches[0]))

            # Locate the "next page" button, then give the page a moment.
            next_buttons = driver.find_elements_by_class_name('be-pager-next')
            time.sleep(1)
            try:
                next_buttons[0].click()
            except (IndexError, WebDriverException):
                # No button, or it cannot be clicked.
                if av_found:
                    break  # something was collected: treat as the last page
                empty_retries += 1
                if empty_retries >= max_empty_retries:
                    break  # give up instead of spinning forever
                continue
            time.sleep(1)  # let the next page load before re-scanning
    finally:
        # Always release the browser process, even when scraping fails.
        driver.quit()

    return sorted(av_found, reverse=True)
def _parse_video_row(av, mid, name, html):
    """Build one complete record (column name -> value) from a video page."""
    soup = BeautifulSoup(html, features='lxml')

    try:
        top_rank = int(re.findall(r'最高全站日排行([0-9]+)名', html)[0])
    except IndexError:
        # The page carries no ranking text; the original recorded 0 here.
        top_rank = 0

    # Each crumb / tag entry is followed by a single space, as before.
    catalog = ''.join(
        item.find('a').contents[0] + ' '
        for item in soup.find_all('span', attrs={'class': 'crumb'})
    )
    tags = ''.join(
        item.text + ' '
        for item in soup.find_all('li', attrs={'class': 'tag'})
    )

    return {
        'av': av,
        'up_id': mid,
        'up_name': name,
        'title': soup.find('h1').text.strip(),
        'play_num': int(re.findall(r'总播放数([0-9]+)', html)[0]),
        'danmu_num': int(re.findall(r'总弹幕数([0-9]+)', html)[0]),
        'top_rank': top_rank,
        'coin_num': int(re.findall(r'投硬币枚数([0-9]+)', html)[0]),
        'favorite_num': int(re.findall(r'收藏人数([0-9]+)', html)[0]),
        'time': soup.find('time').text,
        'catalog': catalog,
        'comment_num': int(
            re.findall(r'itemprop="commentCount" content="([0-9]+)"', html)[0]
        ),
        # Local scrape timestamp, formatted like 2016-05-05 20:28:54.
        'part_date': time.strftime("%Y-%m-%d %H:%M:%S",
                                   time.localtime(time.time())),
        'tags': tags,
    }


def get_av_info(mid, name, av_list, dict_whole):
    """Scrape every video in ``av_list`` and append its record to ``dict_whole``.

    Each ``https://www.bilibili.com/video/av<av>`` page is loaded in headless
    Chrome. Pages whose HTML contains '视频去哪了呢' or '追番' are treated as
    missing videos: the av number is removed from ``av_list`` and nothing is
    recorded for it.

    Args:
        mid: Uploader ID stored in the ``up_id`` column.
        name: Uploader name stored in the ``up_name`` column.
        av_list: av numbers to visit. Mutated in place: missing videos are
            removed.
        dict_whole: Mapping of column name to list; one value per column is
            appended for every video that was parsed.

    Returns:
        The same ``av_list`` object, now holding only the videos that exist.

    Raises:
        IndexError, AttributeError: If an expected field is absent from a
            page. The record is only appended once it is complete, so no
            half-written row is left behind.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # run Chrome without a window
    driver = webdriver.Chrome(chrome_options=chrome_options)
    try:
        # Iterate over a snapshot: av_list is pruned below, and removing from
        # a list while iterating over it skips the element that follows.
        for av in list(av_list):
            driver.get('https://www.bilibili.com/video/av%s' % (str(av),))
            html = driver.page_source
            if '视频去哪了呢' in html or '追番' in html:
                av_list.remove(av)  # video is gone: drop it and move on
                print(str(av)+'removed')
                continue

            row = _parse_video_row(av, mid, name, html)
            for column, value in row.items():
                dict_whole[column].append(value)
            print(str(av)+' info_done')
    finally:
        # Always release the browser process, even when a page fails to parse.
        driver.quit()
    return av_list
def main():
    """Scrape video statistics for every uploader listed in 'output_up.xlsx'.

    Reads the ``mid`` and ``name`` columns of the input workbook, collects
    each uploader's av numbers and per-video details, and always writes what
    has been gathered so far to 'up_video_info.xlsx', even when the crawl
    stops early because of an error.
    """
    # Kept module-level, as before, so the data can be inspected after a run.
    global dict_whole
    # Created before the try block: the finally clause below needs it even
    # when reading the input workbook fails.
    dict_whole = {
        'av': [], 'up_id': [], 'up_name': [], 'play_num': [],
        'danmu_num': [], 'coin_num': [], 'favorite_num': [], 'time': [],
        'title': [], 'top_rank': [], 'tags': [], 'catalog': [],
        'comment_num': [], 'part_date': [],
    }
    try:
        df = pd.read_excel('output_up.xlsx')
        up_id = list(df['mid'])  # IDs of the uploaders to crawl
        up_name = list(df['name'])
        for mid, name in zip(up_id, up_name):
            print((mid, name))
            av_list = get_av_list(mid)
            print(len(av_list))
            time.sleep(1)
            av_list_final = get_av_info(mid, name, av_list, dict_whole)
            print(len(av_list_final))
    except Exception as e:
        # Top-level boundary: report the error and still save partial results.
        print(e)
    finally:
        # orient='index' followed by transpose tolerates columns of unequal
        # length (shorter ones are padded with NaN).
        df = pd.DataFrame.from_dict(dict_whole, orient='index').transpose()
        # The context manager saves and closes the workbook; ExcelWriter.save()
        # no longer exists in pandas 2.x.
        with pd.ExcelWriter('up_video_info.xlsx') as writer:
            df.to_excel(writer, index=False)
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()