# Sentimental1.py: sentiment analysis with scikit-learn
import pandas as pd
import numpy as np
import re
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
import joblib
import nltk
from nltk.stem import SnowballStemmer
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)  # show full column width
# Reading Data
df = pd.read_csv('train.csv', encoding='latin1')
df_test = pd.read_csv('test.csv', encoding='latin1')
df_ = df.copy()
df_test_ = df_test.copy()
df.info()
# Data Cleaning
print(df.isna().sum())  # inspect missing values
df = df.dropna(axis=0)  # rows with null text carry no useful information; drop them
print(df_test.isna().sum())
df = df.dropna(how='all', axis=0)  # some rows are completely null; drop those too
df_test = df_test.dropna(how='all', axis=0)
def text_clean(text):
    text = re.sub(pattern=r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', repl='', string=text)  # remove e-mail addresses
    text = re.sub(pattern=r'http\S+|www\S+', repl=' ', string=text)  # remove URLs
    text = re.sub(pattern=r'(\d|\W)+', repl=' ', string=text)  # remove digits and non-word characters
    text = re.sub(pattern=r'@\w+|#', repl=' ', string=text)  # remove Twitter handles (@username) and hashtags (#)
    text = re.sub(pattern=r'[^\w\s`]', repl=' ', string=text)  # remove anything that is not alphanumeric, whitespace, or a backtick
    text = re.sub(pattern=r'[0-9]', repl=' ', string=text)  # remove any leftover digits
    text = re.sub(pattern=r'\b\w\b', repl=' ', string=text)  # remove single characters
    text = re.sub(pattern=r'[^\w\s]', repl=' ', string=text)  # finally drop the backtick and any other leftover punctuation
    return text
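# Sanity check on a made-up sample string (illustrative only, not from the dataset):
sample = 'Contact user@example.com, visit https://example.com @someone #topic 123!'
print(text_clean(sample))  # only plain words should survive the cleaning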
df["text"]=df["text"].apply(text_clean)
df["text"]=df["text"].str.strip()### For the white space
# Stemming
nltk.download('punkt')  # ensure the punkt tokenizer is available
stemmer = SnowballStemmer(language='english')
def tokenization_stemming(text):
    tokens = nltk.word_tokenize(text)
    return [stemmer.stem(token) for token in tokens]
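# Quick check of the stemmer on a made-up phrase (illustrative only):
print(tokenization_stemming('running runners easily'))  # Snowball stems, e.g. ['run', 'runner', 'easili']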
# For each country, population, land area and density must be identical across all rows
country_mistake = []
for country in df['Country'].unique():
    if df[df['Country'] == country]['Population -2020'].nunique() != 1:
        country_mistake.append(country)
        country_mistake.append('Population -2020')
    if df[df['Country'] == country]['Land Area (Km²)'].nunique() != 1:
        country_mistake.append(country)
        country_mistake.append('Land Area (Km²)')
    if df[df['Country'] == country]['Density (P/Km²)'].nunique() != 1:
        country_mistake.append(country)
        country_mistake.append('Density (P/Km²)')
# country_mistake is empty, so the data is consistent: no country shows conflicting values
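# The same check, vectorized with groupby (a sketch, assuming the three column names above):
consistency = df.groupby('Country')[['Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']].nunique()
print(consistency[(consistency > 1).any(axis=1)])  # any row printed here flags an inconsistent country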
# Checking the 20 most frequent words per sentiment
def frequency_of_words(labels, number_of_words=20, tokenize=None):
    for label in labels:
        cv = CountVectorizer(stop_words='english', tokenizer=tokenize)
        matrix = cv.fit_transform(df[df['sentiment'] == label]['text'])
        freqs = zip(cv.get_feature_names_out(), matrix.sum(axis=0).tolist()[0])
        print(f"Top {number_of_words} words used for {label} reviews.")
        print(sorted(freqs, key=lambda x: -x[1])[:number_of_words])  # sorted from most to least frequent
frequency_of_words(['positive', 'negative', 'neutral'], tokenize=tokenization_stemming)
# If the top 20 words look like nonsense, revisit the data-cleaning step
# Machine Learning Sentiment Analysis
# Train and test data
X_train = df['text']
y_train = df['sentiment']
X_test = df_test['text']
y_test = df_test['sentiment']
# Model Selection
# Candidate models, mapped to short names for the pipeline step
models = {LinearSVC(): 'svc', LogisticRegression(max_iter=1000): 'lr', MultinomialNB(): 'nb',
          KNeighborsClassifier(): 'knn', GradientBoostingClassifier(): 'gb'}
def model_selection(expected_models):
    # Each candidate model must be imported before calling this function
    for model, shortcut in expected_models.items():
        pipe = Pipeline([('tfidf', TfidfVectorizer(stop_words='english', tokenizer=tokenization_stemming)),
                         (shortcut, model)])
        pipe.fit(X_train, y_train)
        y_predict = pipe.predict(X_test)
        print(f'{shortcut} Results', classification_report(y_test, y_predict))
        print(f'{shortcut} Results', accuracy_score(y_test, y_predict))
model_selection(models)
# LogisticRegression has the highest accuracy. Now let's tune its hyperparameters.
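# One way to pick C is a small grid search over the pipeline (a sketch; the cv and
# scoring values here are assumptions, and the pipeline below then fixes the chosen settings):
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(
    Pipeline([('tfidf', TfidfVectorizer(stop_words='english', tokenizer=tokenization_stemming)),
              ('lr', LogisticRegression(max_iter=10000, class_weight='balanced'))]),
    param_grid={'lr__C': [0.1, 1, 10]}, cv=3, scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_params_)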
pipe_lr = Pipeline([('tfidf', TfidfVectorizer(stop_words='english', tokenizer=tokenization_stemming)),
                    ('lr', LogisticRegression(max_iter=10000, class_weight='balanced', C=1))])
pipe_lr.fit(X_train, y_train)
y_predict_lr = pipe_lr.predict(X_test)
print(classification_report(y_test, y_predict_lr))
print(accuracy_score(y_test, y_predict_lr))
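# Visualize per-class errors with the ConfusionMatrixDisplay imported above
# (a sketch; assumes matplotlib is installed and scikit-learn >= 1.0 for from_predictions):
import matplotlib.pyplot as plt
ConfusionMatrixDisplay.from_predictions(y_test, y_predict_lr)
plt.show()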
# Save the model
joblib.dump(pipe_lr, 'finalmodel.pkl')
# Test
model = joblib.load('finalmodel.pkl')
# Now, you can use the loaded model for prediction
prediction = model.predict(['It is awesome'])
print(prediction)
# Export the cleaned data for further analysis
df=df.dropna()
df.to_csv('Cleaned.csv')