-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiment_analysis.py
163 lines (123 loc) · 5.37 KB
/
sentiment_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from transformers import pipeline
import mysql.connector
import pandas as pd
from tqdm import tqdm
import pickle
from config import *
from nltk.corpus import stopwords
from wordcloud import WordCloud
import os.path
import numpy as np
from collections import Counter
import scipy.stats as stats
def get_posts(type):
    """Fetch every stored post of one subreddit as a DataFrame.

    Connects to MySQL using the module-level ``HOST``, ``USER``, ``PASSWORD``
    and ``DATABASE_NAME`` settings (presumably provided by ``config``) and
    selects all rows of the ``posts`` table whose ``subreddit`` column equals
    ``type``.

    Args:
        type: Subreddit name to filter on (the parameter name shadows the
            builtin but is kept for backward compatibility with callers).

    Returns:
        A ``pandas.DataFrame`` with the columns ``post_id``, ``post_title``,
        ``post_body`` and ``subreddit_name``; the ``timestamp`` column is
        dropped.
    """
    reddit_db = mysql.connector.connect(
        host=HOST, user=USER, password=PASSWORD, database=DATABASE_NAME
    )
    try:
        cursor = reddit_db.cursor()
        try:
            # Bind the value as a query parameter instead of interpolating it
            # into the SQL text: avoids SQL injection and quoting errors.
            cursor.execute(
                "select * from posts where subreddit = %s", (type,)
            )
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        reddit_db.close()

    return pd.DataFrame(
        rows,
        columns=["post_id", "post_title",
                 "post_body", "subreddit_name", "timestamp"],
    ).drop(columns=["timestamp"])
def get_posts_sentiment(type):
    """Return the sentiment classifier output for every post of ``type``.

    Results are cached on disk in ``data/<type>_sentiment.pickle``. When that
    file exists it is unpickled and returned as-is; otherwise every post body
    is run through the
    ``distilbert-base-uncased-finetuned-sst-2-english`` sentiment pipeline,
    the results are pickled to that path and then returned.

    Args:
        type: Subreddit name; used both to fetch posts and to name the cache.

    Returns:
        A dict mapping ``post_id`` to the first element of the pipeline
        output for that post's body (presumably a list of
        ``{"label", "score"}`` dicts -- that is how other functions in this
        file consume it).
    """
    cache_path = f"data/{type}_sentiment.pickle"

    # Cache hit: reuse the stored results.
    # NOTE: pickle.load executes arbitrary code; only load trusted cache files.
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    # Cache miss: build the classifier, score every post, then persist.
    analyzer = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
        top_k=None,
        truncation=True,
    )
    posts = get_posts(type)
    scores_by_post = {
        row.post_id: analyzer(row.post_body)[0]
        for _, row in tqdm(posts.iterrows(), total=len(posts))
    }

    with open(cache_path, "wb") as cache_file:
        pickle.dump(scores_by_post, cache_file)
    return scores_by_post
def get_posts_emotions(type):
    """Return the emotion classifier output for every post of ``type``.

    Results are cached on disk in ``data/<type>_emotions.pickle``. When that
    file exists it is unpickled and returned as-is; otherwise every post body
    is run through the ``bhadresh-savani/distilbert-base-uncased-emotion``
    text-classification pipeline, the results are pickled to that path and
    then returned.

    Args:
        type: Subreddit name; used both to fetch posts and to name the cache.

    Returns:
        A dict mapping ``post_id`` to the first element of the pipeline
        output for that post's body (presumably a list of
        ``{"label", "score"}`` dicts -- that is how other functions in this
        file consume it).
    """
    cache_path = f"data/{type}_emotions.pickle"

    # Cache hit: reuse the stored results.
    # NOTE: pickle.load executes arbitrary code; only load trusted cache files.
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    # Cache miss: build the classifier, score every post, then persist.
    emotion_model = pipeline(
        "text-classification",
        model='bhadresh-savani/distilbert-base-uncased-emotion',
        top_k=None,
        truncation=True,
    )
    posts = get_posts(type)
    scores_by_post = {
        row.post_id: emotion_model(row.post_body)[0]
        for _, row in tqdm(posts.iterrows(), total=len(posts))
    }

    with open(cache_path, "wb") as cache_file:
        pickle.dump(scores_by_post, cache_file)
    return scores_by_post
def get_sentiment_max(type):
    """Count, per sentiment label, the posts where that label scores highest.

    Args:
        type: Subreddit name forwarded to ``get_posts_sentiment``.

    Returns:
        A ``(counts, total)`` tuple: ``counts`` maps ``"POSITIVE"`` and
        ``"NEGATIVE"`` to the number of posts whose top-scoring label it is,
        and ``total`` is the number of evaluated posts.

    Raises:
        KeyError: If a post's top label is neither of the two known labels.
    """
    per_post = get_posts_sentiment(type)
    tally = {"POSITIVE": 0, "NEGATIVE": 0}
    for label_scores in per_post.values():
        # Winner is the entry with the highest score for this post.
        winner = max(label_scores, key=lambda entry: entry["score"])
        tally[winner["label"]] += 1
    return tally, len(per_post)
def get_emotions_avg(type):
    """Average each emotion's score over all posts of ``type``.

    Args:
        type: Subreddit name forwarded to ``get_posts_emotions``.

    Returns:
        A dict mapping each of the six emotion labels (``love``, ``joy``,
        ``anger``, ``sadness``, ``surprise``, ``fear``) to the mean score
        across the evaluated posts.

    Raises:
        ZeroDivisionError: If there are no evaluated posts.
        KeyError: If a post carries a label outside the six known emotions.
    """
    per_post = get_posts_emotions(type)
    totals = dict.fromkeys(
        ("love", "joy", "anger", "sadness", "surprise", "fear"), 0
    )
    for label_scores in per_post.values():
        for entry in label_scores:
            totals[entry["label"]] += entry["score"]

    post_count = len(per_post)
    return {label: total / post_count for label, total in totals.items()}
def get_emotions_max(emotions):
    """Count, per emotion, the posts where that emotion scores highest.

    Args:
        emotions: Dict mapping a post id to a list of
            ``{"label": ..., "score": ...}`` dicts.

    Returns:
        A dict mapping each of the six emotion labels (``love``, ``joy``,
        ``anger``, ``sadness``, ``surprise``, ``fear``) to the number of
        posts for which it is the top-scoring label. On a tie the entry
        listed first wins.

    Raises:
        KeyError: If a post's top label is not one of the six known emotions.
    """
    tally = dict.fromkeys(
        ("love", "joy", "anger", "sadness", "surprise", "fear"), 0
    )
    for label_scores in emotions.values():
        winner = max(label_scores, key=lambda entry: entry["score"])
        tally[winner["label"]] += 1
    return tally
def get_posts_emotion_scores(personality, emotion):
    """Collect the score of one emotion for every post of a personality type.

    Args:
        personality: Subreddit / personality-type name forwarded to
            ``get_posts_emotions``.
        emotion: Emotion label whose scores should be extracted.

    Returns:
        A list with one score per matching ``{"label", "score"}`` entry, in
        the iteration order of the per-post evaluations.
    """
    # get_posts_emotions expects the type name (it derives both the cache
    # path and the SQL filter from it). The previous code passed the posts
    # DataFrame instead, which produced a bogus cache path and query.
    emotions = get_posts_emotions(personality)

    emotion_scores = []
    for post_evaluation in emotions.values():
        for emotion_evaluation in post_evaluation:
            if emotion_evaluation["label"] == emotion:
                emotion_scores.append(emotion_evaluation["score"])
    return emotion_scores
def get_trait_types(trait):
    """List the sixteen-type codes that contain ``trait`` as a substring.

    Args:
        trait: Lower-case string to look for (e.g. a single letter such as
            ``"i"``); matching is a plain substring test.

    Returns:
        The matching four-letter type codes, in the fixed order of the
        internal table.
    """
    all_types = (
        "infj", "intj", "infp", "intp", "istj", "isfj", "istp", "isfp",
        "enfj", "enfp", "entp", "entj", "estj", "esfj", "estp", "esfp",
    )
    matches = []
    for candidate in all_types:
        if trait in candidate:
            matches.append(candidate)
    return matches
def get_correlation(trait, non_trait, emotion):
    """Point-biserial correlation between a trait and an emotion score.

    Emotion scores are gathered for every type containing ``trait`` (flagged
    1) and then for every type containing ``non_trait`` (flagged 0); the
    flags and scores are passed to ``scipy.stats.pointbiserialr``.

    Args:
        trait: Substring selecting the types counted as having the trait.
        non_trait: Substring selecting the types counted as lacking it.
        emotion: Emotion label whose per-post scores are correlated.

    Returns:
        A ``(corr, pvalue)`` tuple as produced by ``pointbiserialr``.
    """
    presence_flags = []
    scores = []

    # Same collection step for both groups; only the flag value differs.
    for group, make_flags in ((trait, np.ones), (non_trait, np.zeros)):
        for personality in tqdm(get_trait_types(group)):
            group_scores = get_posts_emotion_scores(personality, emotion)
            scores.extend(group_scores)
            presence_flags.extend(make_flags(len(group_scores)))

    corr, pvalue = stats.pointbiserialr(presence_flags, scores)
    return corr, pvalue
def get_wordcloud(type):
    """Build a word cloud from the bodies of all posts of ``type``.

    All post bodies are joined, split on whitespace, lower-cased and
    stripped of English stop words before being handed to ``WordCloud``.

    Args:
        type: Subreddit name forwarded to ``get_posts``.

    Returns:
        The ``WordCloud`` object (1600x800) generated from the filtered text.
    """
    type_posts = get_posts(type)

    # Load the stop-word list once and keep it as a set. The previous code
    # re-evaluated stopwords.words("english") and scanned the resulting list
    # for every single word.
    stop_words = set(stopwords.words("english"))

    lowered = (
        word.lower() for word in " ".join(type_posts["post_body"]).split()
    )
    text = " ".join(word for word in lowered if word not in stop_words)

    wordcloud = WordCloud(width=1600, height=800).generate(text)
    return wordcloud
def get_types_avg_emotion(types):
    """Average the per-type mean emotion scores over several types.

    Args:
        types: Sized collection of subreddit / personality-type names; each
            is passed to ``get_emotions_avg``.

    Returns:
        A dict mapping each emotion label to the unweighted mean of the
        per-type averages. The six known labels (``love``, ``joy``,
        ``anger``, ``sadness``, ``surprise``, ``fear``) are always present.
        When ``types`` is empty every label maps to ``0.0``.
    """
    totals = {"love": 0, "joy": 0, "anger": 0,
              "sadness": 0, "surprise": 0, "fear": 0}

    type_count = len(types)
    if type_count == 0:
        # Nothing to average; previously this raised ZeroDivisionError.
        return {label: 0.0 for label in totals}

    for personality in types:
        personality_avg = get_emotions_avg(personality)
        # Accumulate explicitly: Counter addition silently drops every key
        # whose running total is not strictly positive.
        for label, score in personality_avg.items():
            totals[label] = totals.get(label, 0) + score

    return {label: total / type_count for label, total in totals.items()}
def get_types_sentiment_max(types):
    """Share of posts per dominant sentiment label across several types.

    Args:
        types: Iterable of subreddit / personality-type names; each is
            passed to ``get_sentiment_max``.

    Returns:
        A dict mapping each sentiment label to the fraction of all evaluated
        posts (summed over ``types``) for which it is the top-scoring label.
        ``"POSITIVE"`` and ``"NEGATIVE"`` are always present, also when
        their count is zero. When no posts were evaluated at all, every
        label maps to ``0.0``.
    """
    totals = {"POSITIVE": 0, "NEGATIVE": 0}
    total_cardinality = 0

    for personality in types:
        sentiment, cardinality = get_sentiment_max(personality)
        total_cardinality += cardinality
        # Accumulate explicitly: Counter addition drops labels whose count
        # is zero, which made e.g. "POSITIVE" vanish from the result when no
        # post was classified as positive.
        for label, count in sentiment.items():
            totals[label] = totals.get(label, 0) + count

    if total_cardinality == 0:
        # No posts to take a share of; avoid dividing by zero.
        return {label: 0.0 for label in totals}

    return {
        label: count / total_cardinality for label, count in totals.items()
    }