-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathonlyfakenews.py
93 lines (74 loc) · 3.35 KB
/
onlyfakenews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import numpy as np # linear algebra
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn import svm
# Load the dataset. The rest of the script reads its "country", "type",
# "language" and "text" columns.
news = pd.read_csv("fake.csv")

# Integer code for each country label. Labels that are not listed here
# (including missing values) are encoded as 0.
#
# NOTE(review): the original script assigned "US" three times (1, then 18,
# then 19). The last write won, so "US" ended up as 19 and the codes 1 and 18
# were never used. That effective outcome is preserved here; the duplicate
# lines were presumably meant for other countries -- confirm the intended
# labels and fill in 1 and 18 if so.
COUNTRY_CODES = {
    "US": 19,
    "CO": 2,
    "FR": 3,
    "DE": 4,
    "GB": 5,
    "CA": 6,
    "AU": 7,
    "EU": 8,
    "NL": 9,
    "LI": 10,
    "SG": 11,
    "IO": 12,
    "ME": 13,
    "TV": 14,
    "ES": 15,
    "RU": 16,
    "IN": 17,
    "EE": 20,
    "SE": 21,
    "ZA": 22,
    "IS": 23,
    "BG": 24,
    "CH": 25,
}

# Build the column in one pass instead of the chained form
# `news["country_id"][mask] = n`. Chained assignment writes through an
# intermediate Series: it triggers SettingWithCopyWarning on older pandas and
# does not modify `news` at all when Copy-on-Write is active.
news["country_id"] = news["country"].map(COUNTRY_CODES).fillna(0).astype("int64")
# Integer class label for each article type. Types that are not listed here
# (including missing values) are encoded as 0. These values are the targets
# the classifiers below are trained on.
TYPE_CODES = {
    "bias": 1,
    "conspiracy": 2,
    "fake": 3,
    "bs": 4,
    "satire": 5,
    "hate": 6,
    "junksci": 7,
    "state": 8,
}

# Build the column in one pass instead of the chained form
# `news["type_id"][mask] = n`. Chained assignment writes through an
# intermediate Series: it triggers SettingWithCopyWarning on older pandas and
# does not modify `news` at all when Copy-on-Write is active, which would
# leave every label at 0.
news["type_id"] = news["type"].map(TYPE_CODES).fillna(0).astype("int64")
from sklearn.model_selection import train_test_split

# Only English-language rows are used. Compute the row filter once and apply
# it to both the article text (features) and the type code (labels).
is_english = news["language"] == "english"
X_train, X_test, y_train, y_test = train_test_split(
    news["text"][is_english],
    news["type_id"][is_english],
    test_size=0.2,    # hold out 20% of the rows for evaluation
    random_state=53,  # fixed seed so the split is reproducible
)

# Coerce every document to `str` once per split and reuse the result for both
# vectorizers. This replaces four `.values.astype('U')` calls: each of those
# built a fixed-width unicode array in which every row is padded to the length
# of the longest article. The resulting strings are the same, so a missing
# text (NaN) still becomes the literal "nan".
train_docs = [str(doc) for doc in X_train]
test_docs = [str(doc) for doc in X_test]

# Raw term counts. The vocabulary is learned from the training split only and
# then applied to the test split.
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(train_docs)
count_test = count_vectorizer.transform(test_docs)

# TF-IDF weights, with max_df=0.7 (drops terms found in over 70% of the
# training documents). Fitted on the training split only, as above.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(train_docs)
tfidf_test = tfidf_vectorizer.transform(test_docs)
# Two classifiers are compared on each feature representation:
# multinomial naive Bayes and a linear-kernel SVM.
clf = MultinomialNB()
clf_svm = svm.SVC(kernel='linear', C=1.0)

# Each entry pairs a training matrix with the test matrix produced by the SAME
# vectorizer. The original script scored the TF-IDF-trained SVM on
# `count_test`, so the features did not match what the model was fitted on and
# the reported "TFIDF (SVM)" accuracy was meaningless. Iterating over matched
# pairs rules that mix-up out.
feature_sets = (
    ("Count", count_train, count_test),
    ("TFIDF", tfidf_train, tfidf_test),
)

for label, train_matrix, test_matrix in feature_sets:
    # The same estimator objects are refitted for each representation;
    # fit() discards whatever was learned in the previous iteration.
    clf.fit(train_matrix, y_train)
    clf_svm.fit(train_matrix, y_train)

    pred = clf.predict(test_matrix)
    pred_svm = clf_svm.predict(test_matrix)

    score = metrics.accuracy_score(y_test, pred)
    print("Accuracy for %s Vectorizer: %0.3f" % (label, score))
    score = metrics.accuracy_score(y_test, pred_svm)
    print("Accuracy for %s Vectorizer (SVM): %0.3f" % (label, score))