
Commit 9b08981

Remove stopwords; extract from the training set and convert into theme and sentiment vocabularies
1 parent 59e6001 commit 9b08981

6 files changed: +6975 −32 lines

analysis.py

Lines changed: 38 additions & 32 deletions
@@ -17,52 +17,56 @@
 from sklearn.decomposition import LatentDirichletAllocation
 import pyLDAvis
 import pyLDAvis.sklearn
-
-
-# use all the words as features
-def bag_of_words(words):
-    return dict([(word, True) for word in words])
-
-
-# use bigram collocations as features
-def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
-    bigram_finder = BigramCollocationFinder.from_words(words)  # turn the text into bigram collocations
-    bigrams = bigram_finder.nbest(score_fn, n)  # chi-squared scoring, keep the top 1000 bigrams
-    return bag_of_words(bigrams)
-
-
-# use all the words plus bigram collocations as features
-def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
-    bigram_finder = BigramCollocationFinder.from_words(words)
-    bigrams = bigram_finder.nbest(score_fn, n)
-    return bag_of_words(words + bigrams)  # all words plus the (most informative) bigrams as features
+from gensim.models import word2vec
+import jieba.analyse
 
 def pre_process(filename='data/train.xlsx'):
     df = pd.read_excel(filename)
     # drop rows whose theme is empty, i.e. rows that contain nothing at all
     NONE_VIN = (df["theme-主题"].isnull()) | (df["theme-主题"].apply(lambda x: str(x).strip("NULL;").isspace()))
-    df_null = df[NONE_VIN]
     df_not_null = df[~NONE_VIN]
-    # drop rows whose theme and sentiment keywords are both NULL, since those are useless
-    #dd = df_not_null[~(df_not_null['theme-主题'].apply(lambda x: str(x).strip("NULL;").strip()==''))]
+    # add the content_cutted column: the comment text with stopwords removed, then segmented
+    df_not_null["content_cutted"] = df_not_null['content-评论内容'].apply(seg_sentence)
+
     return df_not_null
+
+
+
 def chinese_word_cut(mytext):
+    # remove stopwords
+    #jieba.analyse.set_stop_words(stpwrdlst)
+    #a = jieba.analyse.extract_tags(mytext, topK=20, withWeight=False, allowPOS=())
+    # word segmentation
+    #print(jieba.cut(mytext))
     return " ".join(jieba.cut(mytext))
 
-def word_cut(df):
-    nwordall = []
-    for t in df['content-评论内容']:
-        words = pseg.cut(t)
-        nword = ['']
-        for w in words:
-            if ((w.flag == 'n' or w.flag == 'v' or w.flag == 'a') and len(w.word) > 1):
-                nword.append(w.word)
-        nwordall.append(nword)
-    print(nwordall)
+# build the stopword list
+def stopwordslist(filepath):
+    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
+    return stopwords
+
+
+# apply stopword filtering to a piece of text
+def seg_sentence(sentence):
+    sentence = sentence
+    sentence_seged = jieba.cut(sentence)
+    stopwords = stopwordslist('data/stopwords.txt')  # path of the stopword file loaded here
+    outstr = ''
+    for word in sentence_seged:
+        if word not in stopwords:
+            if word != '\t':
+                outstr += word
+                outstr += " "
+    return outstr
+
+# main processing algorithms: TF-IDF, LDA
 def deal(df):
+    stopwords = stopwordslist('data/stopwords.txt')
     # extract the 1000 most important feature keywords from the text, then stop
     n_features = 1000
     # keyword extraction and vector conversion
     tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                     max_features=n_features,
-                                    stop_words='english',
+                                    stop_words=stopwords,
                                     max_df=0.5,
                                     min_df=10)
     tf = tf_vectorizer.fit_transform(df["content_cutted"])
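
Note: the hunk above threads a custom Chinese stopword list through both the new seg_sentence preprocessing and CountVectorizer (replacing stop_words='english'). A minimal, self-contained sketch of that path, with a small inline stopword list standing in for data/stopwords.txt (which is not part of this file):

# Minimal sketch: jieba segmentation with stopword filtering, then CountVectorizer
# reusing the same stopword list, mirroring stop_words=stopwords in deal().
import jieba
from sklearn.feature_extraction.text import CountVectorizer

stopwords = ["的", "了", "很", "有点", "但是"]   # stand-in for stopwordslist('data/stopwords.txt')

def seg_sentence(sentence):
    # cut with jieba, drop stopwords and tab characters, re-join with spaces
    return " ".join(w for w in jieba.cut(sentence) if w not in stopwords and w != '\t')

docs = ["这个车的外观很漂亮", "油耗有点高,但是动力不错"]
cutted = [seg_sentence(d) for d in docs]

tf_vectorizer = CountVectorizer(stop_words=stopwords, max_features=1000)
tf = tf_vectorizer.fit_transform(cutted)
print(tf.shape, tf_vectorizer.vocabulary_)

Passing the list to CountVectorizer as well catches any stopwords that survive segmentation.
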
@@ -78,6 +82,9 @@ def deal(df):
     tf_feature_names = tf_vectorizer.get_feature_names()
     print_top_words(lda, tf_feature_names, n_top_words)
     #pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
+    #model = word2vec.Word2Vec(df["content_cutted"], min_count=5, size=50)
+    #model.save('word2vec_model')
+
 
 def print_top_words(model, feature_names, n_top_words):
     for topic_idx, topic in enumerate(model.components_):
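
Note: the Word2Vec training added in this hunk stays commented out. If it were enabled, gensim's Word2Vec expects tokenized sentences (lists of tokens) rather than the space-joined content_cutted strings, and gensim 4.x renamed size= to vector_size=. A hedged sketch under those assumptions (the inline list stands in for df["content_cutted"]; 'word2vec_model' is the path from the commented-out lines):

# Sketch of enabling the commented-out Word2Vec training.
from gensim.models import word2vec

cutted = ["外观 漂亮 空间 大", "油耗 高 动力 不错"]   # stands in for df["content_cutted"]
sentences = [text.split() for text in cutted]          # Word2Vec wants lists of tokens

# min_count=1 only so this tiny example keeps its words; the commit uses min_count=5.
# gensim >= 4.0 uses vector_size= (older releases call it size=, as in the commented-out call).
model = word2vec.Word2Vec(sentences, min_count=1, vector_size=50)
model.save('word2vec_model')
# new_model = word2vec.Word2Vec.load('word2vec_model')
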
@@ -88,6 +95,5 @@ def print_top_words(model, feature_names, n_top_words):
 
 
 df = pre_process()
-df["content_cutted"] = df['content-评论内容'].apply(chinese_word_cut)
 deal(df)
-
+#new_model=word2vec.Word2Vec.load('word2vec_model')
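
Note: the body of print_top_words lies outside the hunks shown above; only its signature and the loop over model.components_ appear as context. For reference, a minimal version of the usual sklearn pattern (sort each topic row and print the highest-weighted feature names) might look like the sketch below; it is not necessarily the file's exact implementation.

# Sketch of the usual top-words helper for an sklearn LDA model.
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        top = topic.argsort()[:-n_top_words - 1:-1]   # indices of the largest weights, descending
        print("Topic #%d: %s" % (topic_idx, " ".join(feature_names[i] for i in top)))
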
