scraper.py
# -*- coding: utf-8 -*-
"""Scrape top-level YouTube comments for a list of video IDs via the
YouTube Data API v3 (routed through an HTTP proxy) and dump them to JSON."""
import os
import json
import time
from configparser import ConfigParser
from datetime import datetime

import googleapiclient.discovery
import httplib2
import pandas as pd

import common.logger as logger

_logger = logger.Logger('info')

def read_config(filename):
    """Read API and proxy settings from an INI file."""
    conf = ConfigParser()
    conf.read(filename)
    api_service_name = conf.get("youtube", "api_service_name")
    api_version = conf.get("youtube", "api_version")
    API_KEY = conf.get("youtube", "API_KEY")
    proxy_host = conf.get("proxy", "host")
    proxy_port = conf.getint("proxy", "port")
    return api_service_name, api_version, API_KEY, proxy_host, proxy_port

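
# A minimal config.ini sketch matching the keys read above; all values are
# placeholders, not real credentials (for the YouTube Data API these are
# typically api_service_name = youtube and api_version = v3):
#
#   [youtube]
#   api_service_name = youtube
#   api_version = v3
#   API_KEY = <your-api-key>
#
#   [proxy]
#   host = 127.0.0.1
#   port = 7890
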
def build_client(api_service_name, api_version, API_KEY, proxy_host, proxy_port):
    """Build a YouTube API client whose requests go through an HTTP proxy."""
    proxy_info = httplib2.ProxyInfo(proxy_type=httplib2.socks.PROXY_TYPE_HTTP,
                                    proxy_host=proxy_host,
                                    proxy_port=proxy_port)
    http = httplib2.Http(timeout=10, proxy_info=proxy_info,
                         disable_ssl_certificate_validation=False)
    youtube = googleapiclient.discovery.build(api_service_name, api_version,
                                              developerKey=API_KEY, http=http)
    return youtube

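
# If no proxy is needed, googleapiclient builds its own transport; a minimal
# sketch with the same parameters, minus the proxied http object:
#
#   youtube = googleapiclient.discovery.build(api_service_name, api_version,
#                                             developerKey=API_KEY)
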
def get_comments(youtube, videoId, pageToken):
    """Fetch one page (up to 100 threads) of top-level comments for a video."""
    request = youtube.commentThreads().list(part="id,snippet",
                                            maxResults=100,
                                            videoId=videoId,
                                            pageToken=pageToken)
    try:
        response = request.execute()
        return response
    except Exception as error:
        # Typical failures: comments disabled on the video, quota exhausted,
        # or a proxy/network error. The caller treats None as "skip this video".
        _logger.error(error)
        return None

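
# If transient failures (rate limits, flaky proxy) matter, the single
# request.execute() above could be retried with exponential backoff -- a
# hedged sketch, not what the script currently does:
#
#   for attempt in range(3):
#       try:
#           return request.execute()
#       except Exception as error:
#           _logger.error(error)
#           time.sleep(2 ** attempt)
#   return None
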
def process_response(response):
    """Flatten one API response page into a list of comment dicts."""
    nextPageToken = response.get("nextPageToken")  # absent on the last page
    result = []
    for index, item in enumerate(response["items"]):
        video_id = item["snippet"]["videoId"]
        comment_id = item["snippet"]["topLevelComment"]["id"]
        comment = item["snippet"]["topLevelComment"]["snippet"]
        comment_display = comment["textDisplay"]
        comment_original = comment["textOriginal"]
        author_name = comment["authorDisplayName"]
        author_profile_img_url = comment["authorProfileImageUrl"]
        author_channel_url = comment["authorChannelUrl"]
        # Prefer the API's authorChannelId field; fall back to slicing the
        # channel URL (channel IDs are its last 24 characters).
        author_channel_id = comment.get("authorChannelId", {}).get(
            "value", author_channel_url[-24:])
        publish_date = comment["publishedAt"]
        result.append({
            'video_id': video_id,
            'comment_id': comment_id,
            'comment_display': comment_display,
            'comment_original': comment_original,
            'author_name': author_name,
            'author_profile_img_url': author_profile_img_url,
            'author_channel_url': author_channel_url,
            'author_channel_id': author_channel_id,
            'publish_date': publish_date
        })
        print("{0}: video {1} || comment {2}".format(index, video_id, comment_id))
    return nextPageToken, result

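
# An abridged sketch of the commentThread item shape the loop above reads
# (fields the code does not touch are omitted):
#
#   {
#     "snippet": {
#       "videoId": "...",
#       "topLevelComment": {
#         "id": "...",
#         "snippet": {
#           "textDisplay": "...", "textOriginal": "...",
#           "authorDisplayName": "...", "authorProfileImageUrl": "...",
#           "authorChannelUrl": "...", "authorChannelId": {"value": "..."},
#           "publishedAt": "..."
#         }
#       }
#     }
#   }
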
if __name__ == "__main__":
    api_service_name, api_version, API_KEY, proxy_host, proxy_port = read_config('config.ini')
    youtube = build_client(api_service_name, api_version, API_KEY, proxy_host, proxy_port)
    # One video ID per row, in a column named "video_id".
    df_id = pd.read_csv('VideoId.csv', engine='python')
    videoIdList = df_id.video_id.to_list()
    comments = []  # defined before the try so the finally clause can see it
    try:
        for index, videoId in enumerate(videoIdList):
            nextPageToken = None
            while True:
                # time.sleep(0.5)  # optional throttle between requests
                response = get_comments(youtube, videoId, nextPageToken)
                if not response:
                    break
                nextPageToken, result = process_response(response)
                comments += result
                if not nextPageToken:  # all comments for this video have been fetched
                    break
    except Exception as error:
        _logger.error(error)
    finally:
        if comments:
            os.makedirs('./output', exist_ok=True)
            filename = "./output/all_comments_" + str(datetime.timestamp(datetime.now())) + ".json"
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(comments, f, indent=4, ensure_ascii=False)
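
# The dump loads straight back into pandas for analysis -- a minimal sketch,
# where the timestamped filename is a placeholder for a real output file:
#
#   df = pd.read_json('./output/all_comments_<timestamp>.json')
#   print(df['author_name'].value_counts().head())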