# main.py — Yandex.Music playlist parser/exporter
# (web-page scrape chrome and line-number gutter removed)
import argparse
import json
import re
from pathlib import Path
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
DEFAULT_SOURCE = 'Youtube'
def parse_track_json(index, track_json):
    """Convert a raw Yandex.Music track JSON object into a flat track dict.

    Args:
        index: zero-based position of the track in the playlist.
        track_json: raw track mapping; must contain 'title' and 'artists',
            may contain 'albums' and a cover URI under 'coverUri' (API,
            camelCase) or 'cover_uri' (snake_case).

    Returns:
        dict with 'title', 'artist' (comma-joined artist names), a 1-based
        'index', and optionally 'album' and 'thumbnail'.
    """
    track = {
        "title": track_json['title'],
        "artist": ', '.join(artist['name'] for artist in track_json['artists']),
        'index': index + 1,
    }
    # The first album's title stands in for the track's album, if any.
    if track_json.get('albums'):
        track['album'] = track_json['albums'][0]['title']
    if 'coverUri' in track_json or 'cover_uri' in track_json:
        # Prefer the camelCase key; fall back to snake_case only when the
        # camelCase value is absent/None (matches the original selection).
        track_uri = track_json.get('coverUri')
        if track_uri is None:
            track_uri = track_json.get('cover_uri')
        if track_uri:
            # The URI ends in a '%%' size placeholder; strip the percent
            # signs and let urljoin append the '50x50' size segment.
            url = track_uri.replace('%', '').strip()
            track['thumbnail'] = urljoin(f'https://{url}', '50x50')
    return track
def save_to_file(tracks, tracklist_title):
    """Dump *tracks* to '<tracklist_title>.json' in the export schema.

    Args:
        tracks: list of track dicts (see parse_track_json).
        tracklist_title: playlist name; also used as the output file stem.
            NOTE(review): a title containing path separators would produce
            an invalid/unsafe filename — confirm titles are sanitized upstream.
    """
    playlist_data = {
        'name': tracklist_title,
        'numberOfTrack': len(tracks),
        'source': DEFAULT_SOURCE,
        'tracks': tracks,
    }
    file_name = f"{tracklist_title}.json"
    # Use a context manager so the handle is flushed and closed even on
    # error — the original passed a bare open() to json.dump and leaked it.
    # ensure_ascii=False keeps Cyrillic titles human-readable in the output.
    with open(file_name, 'w', encoding='utf-8') as out:
        json.dump(playlist_data, out, ensure_ascii=False)
    print(f'Exported: {file_name} with {len(tracks)} tracks')
def get_tracks_from_js(soup):
    """Extract playlist tracks from the inline 'var Mu={...}' script tag.

    Args:
        soup: BeautifulSoup document of a Yandex.Music playlist page.

    Returns:
        (tracks, title) — list of track dicts and the playlist title;
        ([], '') when the script tag, the assignment, or the expected
        JSON structure is absent.
    """
    js_script = soup.find('script', string=re.compile('var Mu={'))
    if not js_script:
        return [], ''
    # re.search avoids the IndexError that findall(...)[0] raised when the
    # tag exists but the 'var Mu=...;' assignment does not match.
    match = re.search(r'var Mu=(.*);', js_script.text)
    if not match:
        print("Couldn't parse from js script.")
        return [], ''
    json_data = json.loads(match.group(1))
    playlist = json_data.get('pageData', {}).get('playlist')
    if playlist is None:
        print("Couldn't parse from js script.")
        return [], ''
    tracks = [parse_track_json(index, track_json)
              for index, track_json in enumerate(playlist['tracks'])]
    return tracks, playlist['title']
def get_tracks_from_html(soup):
    """Scrape playlist tracks from the rendered HTML markup.

    Fallback path for pages where the inline JS state is unavailable.

    Args:
        soup: BeautifulSoup document of a Yandex.Music playlist page.

    Returns:
        (tracks, title); ([], '') when no title element can be located.
    """
    tracklist_title_element = soup.find(class_='page-playlist__title')
    if not tracklist_title_element:
        # Alternative layout keeps the title in the sidebar instead.
        tracklist_title_element = soup.find('div', class_='sidebar__title typo-h2')
    if tracklist_title_element is None:
        # Guard: the original raised AttributeError here when neither
        # layout matched (e.g. an unexpected or error page).
        print("Couldn't parse playlist title from html.")
        return [], ''
    tracklist_title = tracklist_title_element.text.strip()
    if not tracklist_title:
        # Some layouts store the title in a 'value' attribute instead of text.
        tracklist_title = tracklist_title_element.attrs['value']
    tracks = []
    for index, track_element in enumerate(soup.find_all('div', class_='d-track')):
        title_element = track_element.find('div', class_='d-track__name')
        artist_element = track_element.find('span', class_='d-track__artists')
        image = track_element.find('img', class_='entity-cover__image deco-pane')
        track = {
            "title": title_element.text.strip(),
            "artist": artist_element.text.strip(),
            'index': index + 1,
        }
        if image:
            # The 'src' attribute is protocol-relative in the markup.
            track['thumbnail'] = 'https:' + image.attrs['src']
        tracks.append(track)
    return tracks, tracklist_title
def proceed_tracks(client, tracks_data, tracklist_title):
    """Resolve short track entries into full track objects and export them.

    Args:
        client: initialized yandex_music Client.
        tracks_data: iterable of short track mappings carrying an 'id'.
        tracklist_title: name used for the exported JSON file.
    """
    track_ids = [entry['id'] for entry in tracks_data]
    full_tracks = client.tracks(track_ids=track_ids)
    parsed = []
    for position, raw_track in enumerate(full_tracks):
        parsed.append(parse_track_json(position, raw_track.__dict__))
    save_to_file(parsed, tracklist_title)
def get_tracks_by_api(token):
    """Export the liked-tracks list and every user playlist via the API.

    Args:
        token: Yandex.Music OAuth token for account access.
    """
    # Imported lazily so the scraping modes work without yandex_music installed.
    from yandex_music import Client

    client = Client(token).init()
    likes = client.users_likes_tracks()
    if likes:
        proceed_tracks(client, likes.tracks, 'likes')
    kinds = [entry['kind'] for entry in client.users_playlists_list()]
    for playlist in client.users_playlists(kind=kinds):
        proceed_tracks(client, playlist['tracks'], playlist['title'])
def get_html(url):
    """Fetch *url* and return the response body, or None on a non-2xx status.

    Args:
        url: playlist page URL to download.
    """
    # Present a minimal curl-like identity; the session's default headers
    # are cleared so only the explicit ones below are sent.
    request_headers = {
        "User-Agent": "curl/8.7.3",
        "Accept": "*/*",
        "Connection": "keep-alive",
    }
    session = requests.Session()
    session.headers = {}
    response = session.get(url, headers=request_headers)
    if not response.ok:
        print(f"Couldn't load {url} - [{response.status_code}]")
        return None
    return response.text
def get_file(file):
    """Return the UTF-8 contents of *file*, or None (with a message) if absent.

    Args:
        file: path to a saved playlist HTML page.
    """
    path = Path(file)  # construct once instead of twice
    if path.exists():
        return path.read_text(encoding='utf-8')
    # Message fix: the original printed the typo "not exits!".
    print(f"File {file} does not exist!")
    return None
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Парсер плейлистов Яндекс.Музыки')
    parser.add_argument('-u', '--urls', nargs='+', help='Ссылки на плейлисты')
    parser.add_argument("-f", "--files", nargs='+', help="Файлы с путем до html файла с плейлистом.")
    parser.add_argument("-t", "--token", help="Токен Яндекс.Музыки - для доступа к аккаунту через API")
    args = parser.parse_args()
    if args.token:
        # API mode exports likes + every playlist by itself.
        get_tracks_by_api(args.token)
    elif args.urls or args.files:
        # URLs take precedence over local files when both are supplied.
        if args.urls:
            htmls = [get_html(url) for url in args.urls]
        else:
            htmls = [get_file(file) for file in args.files]
        for html in htmls:
            if html is None:
                # Download/read already reported its failure; skip this entry.
                continue
            soup = BeautifulSoup(html, 'lxml')
            # Yandex serves a captcha page instead of the playlist when it
            # suspects automation; parsing further would be meaningless.
            if soup.find('div', class_='CheckboxCaptcha-Label') is not None:
                raise Exception("Captcha showed!")
            # Prefer the structured JS state; fall back to scraping markup.
            tracks, title = get_tracks_from_js(soup)
            if not tracks:
                tracks, title = get_tracks_from_html(soup)
            save_to_file(tracks, title)
    else:
        # Fix: with no arguments the script previously exited silently.
        parser.print_help()