-
Notifications
You must be signed in to change notification settings - Fork 65
/
电影天堂.py
79 lines (67 loc) · 2.75 KB
/
电影天堂.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import requests
from bs4 import BeautifulSoup
import re
import csv
import _thread
import time
User_Agent = 'Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11'
headers = {
'User-Agent': User_Agent,
}
# 获取每一页的所有电影下载链接
def get_new_movie(page):
r = requests.get(url="https://www.ygdy8.com/html/gndy/dyzz/list_23_{}.html".format(page), headers=headers)
r.encoding = 'gb2312' #因为该网站的编码是gb2312,所以这里我们需要设置一下编码,否则会报错
html = r.text
name_list = [] # 用来装获取到的电影名称
download_list = [] # 用来装获取到的电影下载链接
bs = BeautifulSoup(html, "html.parser") # 用BeautifulSoup进行html解析
b = bs.findAll(class_="co_content8")
b = b[0].findAll(class_="ulink") # 此处拿到了每一页中的电影列表
for i in range(0, len(b)):
name = b[i].get_text() # 获取每个电影的名称
href = "https://www.ygdy8.com/"+b[i].get("href") # 获取每个电影的详情页面的url
print(b[i].get_text())
r1 = requests.get(url=href,headers=headers) # 访问每部电影的详情页面
r1.encoding = 'gb2312'
html1 = r1.text
bs1 = BeautifulSoup(html1, "html.parser")
b1 = bs1.find("tbody").find_next("td").find_next("a")
download_url = b1.get("href") # 获取到下载链接
print(download_url)
name_list.append(name)
download_list.append(download_url)
return name_list,download_list
def get_total_page(url):
r = requests.get(url=url,headers=headers)
r.encoding = 'gb2312'
pattern = re.compile(r'(?<=页/)\d+') # re解析
t = pattern.findall(r.text)
return int(t[0])
def wirte_into_csv(name,down_url):
f = open('最新电影.csv', 'a+', encoding='utf-8') # a+表示追加
csv_writer = csv.writer(f)
csv_writer.writerow([name,down_url])
f.close()
def run(start_page, end_page):
for p in range(start_page, end_page):
name_list, down_list = get_new_movie(p)
for i in range(0, len(name_list)):
wirte_into_csv(name_list[i],down_list[i])
time.sleep(3)
if __name__ == '__main__':
# get_new_movie()
total_page = get_total_page("https://www.ygdy8.com/html/gndy/oumei/list_7_1.html")
total_page = int(total_page/25+1)
end = int(total_page/4)
print(end)
# 创建四个线程,并对其分配任务
try:
_thread.start_new_thread(run, (1, end))
_thread.start_new_thread(run, (end+1, end*2))
_thread.start_new_thread(run, (end*2 + 1, end * 3))
_thread.start_new_thread(run, (end*3 + 1, end * 4))
except:
print("Error: 无法启动线程")
while(1):
pass