-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfidf.py
119 lines (89 loc) · 3.62 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from nltk.text import sent_tokenize
import math
class TfIdf():
    """Holder object for the TF-IDF summarisation helpers in this module.

    The instance only records the ``parent`` object it was created with.
    """

    def __init__(self, parent=None):
        """Create the object and remember ``parent``.

        :param parent: optional owner object, stored unchanged as
            ``self.parentWindow``.
        """
        # This class has no base other than ``object``, and
        # ``object.__init__`` rejects extra positional arguments. Forwarding
        # ``parent`` made every ``TfIdf(...)`` call raise TypeError.
        super(TfIdf, self).__init__()
        self.parentWindow = parent
        # NOTE(review): these two values are computed from an empty string
        # and never used afterwards; they look like placeholders for the
        # real input text -- confirm intent before wiring them up.
        sentences = sent_tokenize("")  # NLTK function
        total_documents = len(sentences)
# def _create_frequency_matrix(sentences):
# frequency_matrix = {}
# stopWords = set(stopwords.words("english"))
# ps = PorterStemmer()
#
# for sent in sentences:
# freq_table = {}
# words = word_tokenize(sent)
# for word in words:
# word = word.lower()
# word = ps.stem(word)
# if word in stopWords:
# continue
#
# if word in freq_table:
# freq_table[word] += 1
# else:
# freq_table[word] = 1
#
# frequency_matrix[sent[:15]] = freq_table
#
# return frequency_matrix
def _create_tf_matrix(freq_matrix):
tf_matrix = {}
for sent, f_table in freq_matrix.items():
tf_table = {}
count_words_in_sentence = len(f_table)
for word, count in f_table.items():
tf_table[word] = count / count_words_in_sentence
tf_matrix[sent] = tf_table
return tf_matrix
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
idf_matrix = {}
for sent, f_table in freq_matrix.items():
idf_table = {}
for word in f_table.keys():
idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))
idf_matrix[sent] = idf_table
return idf_matrix
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
tf_idf_matrix = {}
for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):
tf_idf_table = {}
for (word1, value1), (word2, value2) in zip(f_table1.items(),
f_table2.items()): # here, keys are the same in both the table
tf_idf_table[word1] = float(value1 * value2)
tf_idf_matrix[sent1] = tf_idf_table
return tf_idf_matrix
def _score_sentences(tf_idf_matrix) -> dict:
"""
score a sentence by its word's TF
Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
:rtype: dict
"""
sentenceValue = {}
for sent, f_table in tf_idf_matrix.items():
total_score_per_sentence = 0
count_words_in_sentence = len(f_table)
for word, score in f_table.items():
total_score_per_sentence += score
sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence
return sentenceValue
def _find_average_score(sentenceValue) -> int:
"""
Find the average score from the sentence value dictionary
:rtype: int
"""
sumValues = 0
for entry in sentenceValue:
sumValues += sentenceValue[entry]
# Average value of a sentence from original summary_text
average = (sumValues / len(sentenceValue))
return average
def _generate_summary(sentences, sentenceValue, threshold):
sentence_count = 0
summary = ''
for sentence in sentences:
if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
summary += " " + sentence
sentence_count += 1
return summary