-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfidfval.py
91 lines (69 loc) · 2.42 KB
/
tfidfval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import re
import numpy as np
from collections import Counter
import inflection #软件工程
from nltk import PorterStemmer
porter = PorterStemmer()
from nltk.corpus import stopwords
# Contraction handling.
# Ordered (pattern, replacement) rules applied by abbrev(). Order matters: the
# "<pronoun>'s" rule must run before the generic possessive rule, otherwise
# "it's" would lose its "'s" before it can be expanded to "it is".
_CONTRACTION_RULES = (
    # it's / he's / that's ... -> "<word> is" (\b keeps "the's" from matching "he's")
    (re.compile(r"\b(it|he|she|that|this|there|here)'s", re.I), r"\1 is"),
    # possessive 's after any other word -> dropped
    (re.compile(r"(?<=[a-zA-Z])'s"), ""),
    # plural possessive (s') -> dropped
    (re.compile(r"(?<=s)'s?"), ""),
    # n't -> " not"
    (re.compile(r"(?<=[a-zA-Z])n't"), " not"),
    # 'd -> " would"
    (re.compile(r"(?<=[a-zA-Z])'d"), " would"),
    # 'll -> " will"
    (re.compile(r"(?<=[a-zA-Z])'ll"), " will"),
    # I'm -> "I am"
    (re.compile(r"(?<=[Ii])'m"), " am"),
    # 're -> " are"
    (re.compile(r"(?<=[a-zA-Z])'re"), " are"),
    # 've -> " have"
    (re.compile(r"(?<=[a-zA-Z])'ve"), " have"),
)


def abbrev(line):
    """Expand common English contractions and strip possessives from *line*.

    The rules in ``_CONTRACTION_RULES`` are applied in order; afterwards any
    apostrophe that is still present is replaced by a single space.

    Args:
        line: The text to normalise.

    Returns:
        The text with contractions expanded (e.g. ``"don't"`` -> ``"do not"``)
        and remaining apostrophes turned into spaces.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        line = pattern.sub(replacement, line)
    # Whatever apostrophes survive the rules carry no meaning for tokenising.
    return line.replace("'", " ")
# Tokenisation for program-like text.
def tokenize(text):
    """Split *text* into stemmed tokens with English stop words removed.

    Steps, in order: expand contractions with ``abbrev``, pass the result
    through ``inflection.underscore``, split it into word-like runs and single
    punctuation characters, Porter-stem every token, then drop tokens found in
    NLTK's English stop-word list.

    Args:
        text: The raw text to tokenise.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    expanded = abbrev(text)
    underscored = inflection.underscore(expanded)
    # Build the stop-word set once per call instead of calling
    # stopwords.words() for every token; set membership is also O(1).
    stop_words = set(stopwords.words('english'))
    stems = [
        porter.stem(token)
        for token in re.findall(r"[\w'-]+|[^\s\w]", underscored)
    ]
    # NOTE(review): filtering happens after stemming, so a stop word whose stem
    # differs from its surface form is not removed - confirm this is intended.
    return [stem for stem in stems if stem not in stop_words]
def tf(word, count):
    """Return the term frequency of *word* within one document.

    Args:
        word: The term to look up.
        count: Mapping of term -> occurrence count for the document
            (a ``collections.Counter`` or a plain dict).

    Returns:
        Occurrences of *word* divided by the total number of counted
        occurrences, or ``0.0`` when the document has no terms at all.
    """
    total = sum(count.values())
    if not total:
        # Empty document: avoid ZeroDivisionError, no term has any frequency.
        return 0.0
    return count.get(word, 0) / total
def n_containing(word, count_list):
    """Count how many documents in *count_list* contain *word*.

    Args:
        word: The term to look for.
        count_list: Iterable of per-document term containers; membership is
            tested with ``in``.

    Returns:
        The number of documents containing *word*, as returned by ``np.sum``.
    """
    hits = []
    for doc_counts in count_list:
        if word in doc_counts:
            hits.append(1)
    return np.sum(hits)
def idf(word, count_list):
    """Return the inverse document frequency of *word* over *count_list*.

    Computed as ``log(N / (1 + df)) + 1`` where ``N`` is the number of
    documents and ``df`` is the number of documents containing *word*.

    Args:
        word: The term to score.
        count_list: Sequence of per-document term counters.

    Returns:
        The inverse document frequency as a NumPy float.
    """
    total_docs = len(count_list)
    docs_with_word = n_containing(word, count_list)
    # The +1 in the denominator keeps it non-zero for unseen words.
    ratio = total_docs / (1 + docs_with_word)
    return np.log(ratio) + 1
def tfidf(word, count, count_list):
    """Return the TF-IDF weight of *word* for one document.

    Args:
        word: The term to score.
        count: Term counter of the document being scored.
        count_list: Term counters of every document in the corpus.

    Returns:
        ``tf(word, count)`` multiplied by ``idf(word, count_list)``.
    """
    term_frequency = tf(word, count)
    inverse_doc_frequency = idf(word, count_list)
    return term_frequency * inverse_doc_frequency
def count(corpus_list):
    """Score every document in *corpus_list* by its summed, L2-normalised TF-IDF.

    Each document is tokenised, the TF-IDF weight of each of its distinct
    terms is computed against the whole corpus, the weight vector is scaled to
    unit Euclidean length, and its components are summed into one score. The
    scores are then min-max normalised across the corpus.

    Args:
        corpus_list: Iterable of raw text documents.

    Returns:
        A 1-D float ``numpy.ndarray`` with one value per document. Values lie
        in ``[0, 1]``. An empty corpus yields an empty array; if every
        document has the same score (e.g. a single document) all values are
        ``0.0`` instead of NaN.
    """
    words_list = [tokenize(line) for line in corpus_list]
    # One term counter per document.
    count_list = [Counter(words) for words in words_list]

    scores = []
    for doc_counts in count_list:
        # Compute every term weight once and reuse it for the norm and the sum.
        weights = np.array(
            [tfidf(word, doc_counts, count_list) for word in doc_counts],
            dtype=float,
        )
        norm = np.sqrt(np.sum(np.square(weights)))
        # A document with no tokens has no weights; score it as 0.
        scores.append(np.sum(weights / norm) if norm > 0 else 0.0)
    scores = np.asarray(scores, dtype=float)

    # Min-max normalisation, guarding the two degenerate cases.
    if scores.size == 0:
        return scores
    lowest = scores.min()
    spread = scores.max() - lowest
    if spread == 0:
        # All documents scored the same: avoid 0/0 -> NaN.
        return np.zeros_like(scores)
    return (scores - lowest) / spread