from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn
-
- # Use every word as a feature
- def bag_of_words(words):
-     return dict([(word, True) for word in words])
-
- # Use bigrams as features
- def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
-     bigram_finder = BigramCollocationFinder.from_words(words)  # build bigram collocations from the text
-     bigrams = bigram_finder.nbest(score_fn, n)  # keep the top 1000 bigrams by chi-squared score
-     return bag_of_words(bigrams)
-
- # Use all words together with bigrams as features
- def bigram_words(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
-     bigram_finder = BigramCollocationFinder.from_words(words)
-     bigrams = bigram_finder.nbest(score_fn, n)
-     return bag_of_words(words + bigrams)  # all words plus the (most informative) bigrams as features
+ from gensim.models import word2vec
+ import jieba.analyse

def pre_process(filename='data/train.xlsx'):
    df = pd.read_excel(filename)
    # Drop rows whose theme is blank, i.e. entries that contain nothing at all
    NONE_VIN = (df["theme-主题"].isnull()) | (df["theme-主题"].apply(lambda x: str(x).strip("NULL;").isspace()))
-     df_null = df[NONE_VIN]
    df_not_null = df[~NONE_VIN]
-     # Drop rows where both theme and sentiment keyword are NULL, since they are useless
-     #dd = df_not_null[~(df_not_null['theme-主题'].apply(lambda x: str(x).strip("NULL;").strip()==''))]
+     # Add the content_cutted column: segment the comment text and strip stop words
+     df_not_null["content_cutted"] = df_not_null['content-评论内容'].apply(seg_sentence)
+
    return df_not_null
+
+
+
def chinese_word_cut(mytext):
+     # Stop-word removal (disabled)
+     #jieba.analyse.set_stop_words(stpwrdlst)
+     #a = jieba.analyse.extract_tags(mytext, topK=20, withWeight=False, allowPOS=())
+     # Word segmentation
+     #print(jieba.cut(mytext))
    return " ".join(jieba.cut(mytext))

- def word_cut(df):
-     nwordall = []
-     for t in df['content-评论内容']:
-         words = pseg.cut(t)
-         nword = ['']
-         for w in words:
-             if ((w.flag == 'n' or w.flag == 'v' or w.flag == 'a') and len(w.word) > 1):
-                 nword.append(w.word)
-         nwordall.append(nword)
-     print(nwordall)
+ # Build the stop-word list
+ def stopwordslist(filepath):
+     stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
+     return stopwords
+
+ # Segment a sentence with jieba and strip its stop words
+ def seg_sentence(sentence):
+     sentence_seged = jieba.cut(sentence)
+     stopwords = stopwordslist('data/stopwords.txt')  # path to the stop-word file
+     outstr = ''
+     for word in sentence_seged:
+         if word not in stopwords:
+             if word != '\t':
+                 outstr += word
+                 outstr += " "
+     return outstr
+
+ # Main processing: TF-IDF vectorization and LDA
def deal(df):
+     stopwords = stopwordslist('data/stopwords.txt')
    # Extract the 1000 most important feature keywords from the text, then stop
    n_features = 1000
    # Keyword extraction and vectorization
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
-                                     stop_words='english',
+                                     stop_words=stopwords,
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(df["content_cutted"])
@@ -78,6 +82,9 @@ def deal(df):
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)
    #pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
+     #model = word2vec.Word2Vec(df["content_cutted"], min_count=5, size=50)
+     #model.save('word2vec_model')
+

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
@@ -88,6 +95,5 @@ def print_top_words(model, feature_names, n_top_words):


df = pre_process()
- df["content_cutted"] = df['content-评论内容'].apply(chinese_word_cut)
deal(df)
-
+ #new_model = word2vec.Word2Vec.load('word2vec_model')
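
A minimal sketch (not part of the commit) of how the commented-out pyLDAvis and word2vec steps could be wired up. It assumes gensim 3.x (hence the size parameter taken from the comment above), that the first two lines run at the end of deal() where lda, tf and tf_vectorizer are in scope, and the output file name 'lda_vis.html' is made up for illustration.

vis = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)   # interactive topic visualization
pyLDAvis.save_html(vis, 'lda_vis.html')                  # output file name is an assumption

# Word2Vec expects lists of tokens, so split the space-joined content_cutted strings first
sentences = [text.split() for text in df["content_cutted"]]
model = word2vec.Word2Vec(sentences, min_count=5, size=50)   # size= is the gensim 3.x parameter name
model.save('word2vec_model')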