-
Notifications
You must be signed in to change notification settings - Fork 0
/
Dashboard.py
147 lines (94 loc) · 5.33 KB
/
Dashboard.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import pandas as pd
import numpy as np
import re
import joblib
import streamlit as st
import plotly.express as px
import pycountry
import seaborn as sns
import nltk
import matplotlib.pyplot as plt
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer,CountVectorizer
import time
nltk.download('punkt') # Ensure the punkt tokenizer model is available before nltk.word_tokenize is called
# Snowball stemmer shared by tokenizaton_stemming() below.
stemmer = SnowballStemmer(language='english')
def tokenizaton_stemming(text):
    """Tokenize *text* with NLTK's word tokenizer and return the list of
    Snowball-stemmed tokens.

    NOTE(review): relies on the module-level ``stemmer`` defined above.
    """
    return [stemmer.stem(word) for word in nltk.word_tokenize(text)]
def frequency_of_words(labels, number_of_words=20, tokenize=None):
    """Return, per sentiment label, the most frequent (term, count) pairs.

    Parameters
    ----------
    labels : iterable of str
        Sentiment labels to look up in ``df['sentiment']``.
    number_of_words : int
        How many of the top terms to keep per label.
    tokenize : callable or None
        Optional tokenizer passed through to ``CountVectorizer``.

    NOTE(review): reads the module-level DataFrame ``df`` (needs 'sentiment'
    and 'text' columns) — consider passing the frame in explicitly.
    """
    top_terms = {}
    for label in labels:
        vectorizer = CountVectorizer(stop_words='english', tokenizer=tokenize)
        texts = df[df['sentiment'] == label]['text'].values.astype('U')
        counts = vectorizer.fit_transform(texts)
        totals = counts.sum(axis=0).tolist()[0]
        pairs = zip(vectorizer.get_feature_names_out(), totals)
        # Largest counts first; sorted() is stable so ties keep vocabulary order.
        top_terms[label] = sorted(pairs, key=lambda p: p[1], reverse=True)[:number_of_words]
    return top_terms
# Data Cleaning
# Load the pre-cleaned tweet dataset; latin1 avoids decode errors in the raw text.
df=pd.read_csv("Cleaned.csv",encoding='latin1')
df=df.set_index(df['textID'])
df=df.drop(columns='Unnamed: 0')  # drop the stale CSV row-index column
# Data Preparation
# Map full country names to ISO-3166 alpha-3 codes (used for geo plotting).
iso_alpha_dict = {country.name: country.alpha_3 for country in pycountry.countries}
# Countries in the data that pycountry does not recognise map to None.
country_to_iso_dict = {country: iso_alpha_dict.get(country, None) for country in df['Country']}
df['ISO_alpha'] = df['Country'].map(country_to_iso_dict)
# Per-country tweet counts, merged back so every row carries its country's total.
number_of_tweets = df.groupby('Country').size().reset_index(name='Number of Tweets')
df = pd.merge(df, number_of_tweets, on='Country')
# For the Word Cloud
from wordcloud import WordCloud
# Top-20 stemmed terms per sentiment label: {label: [(word, count), ...]}.
word_frequencies=frequency_of_words(['positive', 'negative', 'neutral'], tokenize=tokenizaton_stemming)
positive_frequencies = word_frequencies.get('positive', [])
neutral_frequencies=word_frequencies.get('neutral', [])
negative_frequencies=word_frequencies.get('negative', [])
# Pre-trained sentiment model — presumably a full sklearn pipeline
# (vectorizer + classifier), since raw text is passed to .predict(); TODO confirm.
nlp= joblib.load('finalmodel.pkl')
# Deployment: Streamlit page chrome, sidebar navigation, and the three views.
st.set_page_config(page_title="Sentiment Analysis and WordCloud", page_icon="🤗", layout="wide")
st.title("NLP - Sentiment Analysis ☁️☁️")  # head of the website and its title

with st.sidebar:
    st.title('X Tweets Sentiment Analysis ☁️')
sidebar = st.sidebar.selectbox(label='', options=("Model Predicton", "Data Frame", "WordCloud"))

if sidebar == "Model Predicton":
    text = st.text_area("Please write a tweet that you want to learn its sentiment 🤗")
    if st.button("Lets Predict🤗") and len(text) > 1:
        # Fix: predict only after the button is pressed and the input is
        # non-trivial — the original called nlp.predict on every rerun,
        # including on empty text.
        # Fix: index [0] to get the scalar label; predict() returns an array,
        # so the original's `result == 'positive'` compared an array and the
        # success message printed "['positive']".
        result = nlp.predict([text])[0]
        if result == 'positive':
            st.write("This is a positive tweet :sunglasses:")
        elif result == 'neutral':
            st.write("This is a neutral tweet 😐")
        elif result == 'negative':
            st.write("This is a negative tweet 😕")
        else:
            st.write('There is a mistake')
        st.success('Congrats!! you have just learned your tweets sentiment and it is {results} tweet'.format(results=result))
    with st.expander('About', expanded=True):
        st.write('''
- :orange[**Linkedn**]: https://www.linkedin.com/in/nevzatayhan/.
- :orange[**GitHub**]: https://github.com/NevzatTaha
- For any cooperations or suggestions please send an Email: [email protected]
''')
elif sidebar == "Data Frame":
    st.subheader('Dataframe')
    st.write(df[['text', 'sentiment',
                 'Time of Tweet', 'Age of User', 'Country', 'Population -2020',
                 'Land Area (Km²)', 'Density (P/Km²)']])
    with st.expander('Details', expanded=True):
        st.write('''
- :orange[**Resource of the Project**]: https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset/data)
''')
# Disabled world-map view, kept for reference:
# elif sidebar == "Country Information":
#     st.subheader('This is world map that shows number of tweets and the size of the countries.')
#     earth_map = px.scatter_geo(data_frame=df, locations='ISO_alpha', color="Land Area (Km²)",
#                                hover_name='Country', size="Number of Tweets", projection="natural earth")
#     earth_map.update_layout(width=1000)
#     st.plotly_chart(earth_map)
elif sidebar == "WordCloud":
    # Generate one word cloud per sentiment from the precomputed frequency pairs.
    st.subheader("These are words cloud that explains which words are most used for the specific labels.")
    # Fix: dropped stopwords="english" — generate_from_frequencies ignores the
    # stopwords setting entirely (and WordCloud expects a set of words, not a
    # language name); stop-word filtering already happened in frequency_of_words().
    wordcloud_positive = WordCloud(width=600, height=400, background_color='white').generate_from_frequencies(dict(positive_frequencies))
    wordcloud_neutral = WordCloud(width=600, height=400, background_color='white').generate_from_frequencies(dict(neutral_frequencies))
    wordcloud_negative = WordCloud(width=600, height=400, background_color='white', colormap='cool').generate_from_frequencies(dict(negative_frequencies))
    # Fix: removed an unused `fig, ax = plt.subplots(...)` — the figure was
    # created every rerun and never drawn to (matplotlib figure leak).
    st.image(wordcloud_positive.to_array(), caption='Word Cloud for Positive Reviews')
    st.image(wordcloud_neutral.to_array(), caption='Word Cloud for Neutral Reviews')  # fixed stray '#' in caption
    st.image(wordcloud_negative.to_array(), caption=' Word Cloud for Negative Reviews')