-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtopic_modeling.py
31 lines (23 loc) · 1.18 KB
/
topic_modeling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from preprocessing import run_preprocessing
from result_visualization import run_plot_topwords
def run_topic_modeling(text):
# Preprocessing and Wordembedding(bags of words)
text_preprocessed = text.apply(run_preprocessing)
vectorizer = CountVectorizer(max_df=0.90, min_df=2, stop_words="english")
text_preprocessed = vectorizer.fit_transform(text_preprocessed)
# Apply LDA topic modeling
num_topics = 6 # Specify the number of topics you want to discover
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_features = lda_model.fit_transform(text_preprocessed)
# Print topic top words
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda_model.components_):
print(f"Topic {topic_idx + 1}:")
print(" ".join([feature_names[i] for i in topic.argsort()[:-15:-1]]))
# plot the top words
n_topword = 15
title = "Topics in LDA model"
path = "./images/Topic Modeling.png"
run_plot_topwords(lda_model, n_topword, feature_names, title, path)