-
Notifications
You must be signed in to change notification settings - Fork 0
/
beatles.py
75 lines (48 loc) · 1.96 KB
/
beatles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Write-up: http://techn.ology.net/predicting-beatles-song-authorship-with-scikit-learn/
import re
import os.path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Index file: one song per line, tab-separated, with the song title in the
# first column and the credited composer in the second.
www = 'beatles-who-wrote-what.txt'
# Prefix of the lyrics files, one '<sanitised title>.txt' per song.
path = 'Beatles/'

# Only songs credited to exactly one of these names are kept.
_AUTHORS = ('Lennon', 'McCartney')

# Maps sanitised song title -> {'author': <name>, 'lyrics': <file contents>}.
d = {}
with open(www) as index:
    for row in index:
        # Album heading lines carry no song entry.
        if row.startswith('Album: '):
            continue
        # Drop lines without any word character and lines mentioning
        # 'composer' (presumably the column header -- confirm against the
        # index file).
        if re.search(r'\w', row) is None or 'composer' in row:
            continue
        columns = row.split('\t')
        if len(columns) < 2:
            continue
        # Non-word characters become underscores so the title matches the
        # lyrics file name on disk.
        title = re.sub(r'\W', '_', columns[0].strip())
        composer = columns[1].strip()
        if composer not in _AUTHORS:
            continue
        lyrics_file = path + title + '.txt'
        # Songs without a lyrics file are silently skipped.
        if not os.path.isfile(lyrics_file):
            continue
        with open(lyrics_file, encoding='latin1') as handle:
            d[title] = {'author': composer, 'lyrics': handle.read()}
# Build the parallel document/label lists in sorted-title order, so X[i] and
# y[i] always describe the same song and the order is reproducible.
ordered_titles = sorted(d)
X = [d[title]['lyrics'] for title in ordered_titles]  # documents: lyrics text
y = [d[title]['author'] for title in ordered_titles]  # labels: author name
# Hold out part of the songs for evaluation; the fixed random_state makes
# the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Bag-of-words term counts with default settings.
vect = CountVectorizer() #stop_words='english' #ngram_range=(1, 2)
# The vocabulary is learned from the training lyrics only; the test lyrics
# are then encoded with that same vocabulary.
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# Multinomial naive Bayes on the raw counts, with a small smoothing value.
nb = MultinomialNB(alpha=0.001)
nb.fit(X_train_dtm, y_train)
y_pred = nb.predict(X_test_dtm)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(accuracy) # 0.625
# TF-IDF SCALE THE VOCABULARY
# Re-weight the raw term counts with TF-IDF and train a second naive Bayes
# classifier on the re-weighted features.
tfidf_transformer = TfidfTransformer()
# The IDF weights are learned from the training matrix only ...
X_train_tfidf = tfidf_transformer.fit_transform(X_train_dtm)
# ... and applied unchanged to the test matrix. Using fit_transform here
# would re-fit the IDF weights on the test songs, so training and test
# features would be scaled differently and test-set statistics would leak
# into the evaluation.
X_test_tfidf = tfidf_transformer.transform(X_test_dtm)
nb = MultinomialNB(alpha=0.001)
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_test_tfidf)
# NOTE: the author recorded 0.625 here while the transformer was still being
# re-fitted on the test set; the figure has not been re-measured since.
print(metrics.accuracy_score(y_test, y_pred))