Emails_classify_Proprocessing.py
import re
import os
import argparse

import pandas as pd
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
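
# NOTE (assumption): the NLTK resources below must be downloaded once before
# running this script; exact resource names can vary with the NLTK version.
#   import nltk
#   nltk.download('stopwords')                    # for stopwords.words
#   nltk.download('punkt')                        # for word_tokenize
#   nltk.download('averaged_perceptron_tagger')   # for pos_tag
#   nltk.download('wordnet')                      # for WordNetLemmatizer
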
def del_punctuation(content):
    """
    Remove punctuation and collapse whitespace so that words are
    separated by exactly one space.
    :param content: file content
    :return: cleaned file content
    """
    content = re.sub("[\\\'\t\n\.\!\/_,$%^*()\[\]+\"<>\-:]+|[|+——!,。??、~@#¥%……&*()]+", " ", content)
    content = ' '.join(content.split())
    return content
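
# A minimal sketch of the behaviour (hypothetical input):
#   del_punctuation("Hello,\tworld!!  again.") -> "Hello world again"
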
def stemming(content):
    """
    Reduce each word to its stem (Snowball stemmer).
    :param content: file content
    :return: stemmed file content
    """
    stemmer = SnowballStemmer("english")  # target language: English
    all_words = content.split(' ')
    new_content = []
    for word in all_words:
        new_word = stemmer.stem(word.lower())  # stem the word and lowercase it
        if new_word:  # skip empty tokens
            new_content.append(new_word)
    return " ".join(new_content)
def get_wordnet_pos(tag):
    """
    Map a Penn Treebank POS tag to the corresponding WordNet POS.
    :param tag: Penn Treebank POS tag (e.g. 'NN', 'VBD')
    :return: WordNet POS constant, or None if there is no mapping
    """
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
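
# Examples with Penn Treebank tags (as produced by nltk.pos_tag):
#   get_wordnet_pos('VBD') -> wordnet.VERB
#   get_wordnet_pos('DT')  -> None  (the caller falls back to wordnet.NOUN)
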
def lemmatization(content):
    """
    Lemmatize each word using its POS tag.
    :param content: file content
    :return: lemmatized file content
    """
    all_words = word_tokenize(content)  # tokenize
    tagged_sent = pos_tag(all_words)  # POS-tag each token
    wnl = WordNetLemmatizer()
    new_content = []
    for word, tag in tagged_sent:
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN  # default to noun
        new_content.append(wnl.lemmatize(word, pos=wordnet_pos))  # lemmatize
    return " ".join(new_content)
def remove_stop_words(content):
    """
    Remove English stop words.
    :param content: file content
    :return: file content without stop words
    """
    stopwords_set = set(stopwords.words('english'))  # set for O(1) membership tests
    all_words = content.split(' ')
    new_content = []
    for word in all_words:
        if word not in stopwords_set:
            new_content.append(word)
    return " ".join(new_content)
def tfidf(file_list, label):
    """
    Feature extraction with TF-IDF.
    :param file_list: list of preprocessed documents
    :param label: class label for each document
    :return: DataFrame of TF-IDF features plus a label column
    """
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(file_list)
    # get_feature_names() was removed in scikit-learn 1.2; use the current API
    feature_names = vectorizer.get_feature_names_out()
    df = pd.DataFrame(vectors.toarray(), columns=feature_names)
    df['@label'] = label
    print(df)
    return df
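
# A minimal usage sketch (hypothetical toy corpus): two documents and two
# labels yield a DataFrame with one TF-IDF column per term plus '@label':
#   tfidf(["ball game", "ice hockey"], [1, -1])
#   -> columns: ball, game, hockey, ice, @label
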
if __name__ == '__main__':
    # configurable choice of normalization method
    parser = argparse.ArgumentParser(description='Choose the method (stemming or lemmatization)')
    parser.add_argument('--method', '-m',
                        help='normalization method: "stemming" or "lemmatization" (optional; defaults to stemming)',
                        default='stemming')
    args = parser.parse_args()
    method = args.method
    file_r_list = []  # contents of all files
    label = []  # class labels: 'baseball': 1; 'hockey': -1
    for type_name in ['baseball', 'hockey']:
        url = 'dataset/Case1-classification/' + type_name
        for file_name in os.listdir(url):
            try:
                with open(url + '/' + file_name, 'r', encoding='latin-1') as file:
                    lines = file.readlines()
            except Exception as e:
                print(url + '/' + file_name + ' could not be opened')
                print(e)
                continue
            # read each email from the fourth line to the end into content
            content = ''
            for i in range(3, len(lines)):
                content += lines[i]
            # preprocessing: 1. remove punctuation; 2. stemming / lemmatization; 3. remove stop words
            content = del_punctuation(content)
            if method == 'stemming':
                content = stemming(content)
            elif method == 'lemmatization':
                content = lemmatization(content)
            content = remove_stop_words(content)
            file_r_list.append(content)
            # label the data
            if type_name == 'baseball':
                label.append(1)
            elif type_name == 'hockey':
                label.append(-1)
    preed_data = tfidf(file_r_list, label)
    preed_data.to_csv(method + '_preed_data.csv', index=False)
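
# Expected layout (from the paths above): dataset/Case1-classification/baseball
# and dataset/Case1-classification/hockey, one email per file.
# Example invocation:
#   python Emails_classify_Proprocessing.py --method lemmatization
# which writes lemmatization_preed_data.csv to the working directory.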