lyrics_scraper.py
import json
import os
import random
from hashlib import sha1
from time import sleep

import requests
from bs4 import BeautifulSoup
# Directory where the scraped lyric text files are written.
save_path = '/home/mathuryash5/6th Sem/Natural Language Processing/Project/Data/'
# Local cache of downloaded pages, kept next to this script.
CACHE_DIR = os.path.join(os.path.dirname(__file__), 'cache')
# Pool of User-Agent strings; one is picked at random for each request.
user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.142 Safari/535.19',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:8.0.1) Gecko/20100101 Firefox/8.0.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.151 Safari/535.19',
]
# HTTP proxies rotated per request to reduce the chance of getting blocked.
proxies = [{"http": "http://107.170.13.140:3128"}, {"http": "http://198.23.67.90:3128"}]

def url_to_filename(url):
    """Turn a URL into a unique cache file name using its SHA1 hash."""
    hash_file = sha1(url.encode('utf-8')).hexdigest() + '.html'
    return os.path.join(CACHE_DIR, hash_file)
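# Illustration only (the digest shown is a placeholder, not a real hash, and the URL is hypothetical):
#   url_to_filename("https://www.azlyrics.com/lyrics/who/somesong.html")
#     -> os.path.join(CACHE_DIR, "<40-char sha1 hex digest>.html")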

def store_local(url, content):
    """Save a local copy of the downloaded page under the cache directory."""
    # Create the cache directory on first use.
    if not os.path.isdir(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    # Write the raw response bytes to disk under the hashed file name.
    local_path = url_to_filename(url)
    with open(local_path, 'wb') as f:
        f.write(content)

def load_local(url):
    """Read the cached copy of a URL, or return None if it has not been cached."""
    local_path = url_to_filename(url)
    if not os.path.exists(local_path):
        return None
    with open(local_path, 'rb') as f:
        return f.read()
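# Cache round-trip sketch (illustrative only; variable names here are not part of the script):
#   html = load_local(url)            # None on a cache miss
#   if html is None:
#       response = requests.get(url)
#       store_local(url, response.content)
#       html = response.content
# get_lyric_AZ() below follows exactly this pattern, adding rotating headers, proxies and delays.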

# Download the lyrics for one AZLyrics song URL, using the local cache when possible.
def get_lyric_AZ(url, proxies, user_agents):
    cached = load_local(url)
    if cached:
        print("Using cached copy of", url)
        soup = BeautifulSoup(cached, 'lxml')
    else:
        print("Fetching", url)
        # Random delay so the request pattern looks less like a bot.
        sleep(random.randint(0, 20))
        # Rotate User-Agent headers and proxies to lower the chance of being blocked.
        response = requests.get(url,
                                headers={'User-Agent': random.choice(user_agents)},
                                proxies=random.choice(proxies))
        # Cache the raw page so the same song is never scraped twice.
        store_local(url, response.content)
        soup = BeautifulSoup(response.content, 'lxml')
    # The lyrics are expected in the first non-empty <div> that has no class attribute.
    for goods in soup.find_all("div", {"class": None}):
        if len(goods.text) == 0:
            continue
        return goods.text
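# Example call (hypothetical song URL, built the same way main() builds them):
#   lyrics = get_lyric_AZ("https://www.azlyrics.com/lyrics/who/somesong.html",
#                         proxies, user_agents)
# Returns the lyrics text, or None if no class-less <div> with text was found.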

def main():
    print("Starting AZLyrics scrape")
    # Read the song list from JSON; song and artist names are used to build each request URL.
    with open('run_results.json') as data_file:
        data = json.load(data_file)
    for song_data in data["selection1"]:
        song_name = song_data["song_name"]
        artist = song_data["artist"]
        # AZLyrics URL slugs are lowercase with spaces and apostrophes stripped.
        l_artist = artist.replace(" ", "").replace("'", "").lower()
        l_song = song_name.replace(" ", "").replace("'", "").lower()
        print(l_artist, l_song)
        # Special cases where the derived slug does not match the actual AZLyrics URL.
        if l_artist == "thewho":
            l_artist = "who"
        elif l_artist == "rarycharles":
            l_song = "whatdisayparts12"
        # Build the request URL for this song.
        url = "https://www.azlyrics.com/lyrics/" + l_artist + "/" + l_song + ".html"
        print("\nURL_SONG =", url)
        # Fetch the lyrics (from the web or the local cache).
        goodies = get_lyric_AZ(url, proxies, user_agents)
        if goodies is None:
            continue
        # Write the lyrics to a text file named after the song.
        complete_name = os.path.join(save_path, l_song + ".txt")
        with open(complete_name, "w") as lyrics_file:
            lyrics_file.write(goodies)
        print("Successfully scraped song:", l_song)


if __name__ == "__main__":
    main()
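# Expected shape of run_results.json (field names taken from main(); the entries
# below are placeholders, not real data from the file):
#
# {
#   "selection1": [
#     {"song_name": "Some Song", "artist": "Some Artist"},
#     {"song_name": "Another Song", "artist": "Another Artist"}
#   ]
# }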