Skip to content

Commit f88e0ac

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent fdaf4bf commit f88e0ac

File tree

1 file changed

+44
-59
lines changed

1 file changed

+44
-59
lines changed
Lines changed: 44 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,74 +1,69 @@
11
import numpy as np
22
import re
# Tokenize a raw string: lowercase, drop punctuation, split on whitespace.
def decompose(text):
    """Return the lowercase alphanumeric tokens of ``text`` as a list.

    Non [a-z0-9] / non-whitespace characters are removed after
    lowercasing, runs of whitespace are collapsed, and the result is
    split into words. An empty or punctuation-only string yields [].
    """
    cleaned = re.sub(r"[^a-z0-9\s]", "", text.lower())
    return re.sub(r"\s+", " ", cleaned).split()
1212

# TF-IDF vectorizer implemented from scratch on top of numpy.
class TfIdfVectorizer:
    """Minimal TF-IDF vectorizer.

    ``compute_tf`` both fits (learns vocabulary and IDF weights) and
    transforms the training corpus; ``encode`` transforms new documents
    using the fitted state.
    """

    def __init__(self):
        # Learned state: vocabulary (list, first-seen order) and per-word IDF.
        self.vocab = None
        self.idf = None

    # Fit on `data` and return its TF-IDF matrix (fit_transform semantics).
    def compute_tf(self, data):
        """Learn vocab/IDF from ``data`` and return its TF-IDF matrix.

        Parameters
        ----------
        data : sequence of str
            Raw documents; each is tokenized with ``decompose``.

        Returns
        -------
        numpy.ndarray of shape (len(data), vocab_size)
        """
        tf = []         # one {word: normalized frequency} dict per document
        doc_words = []  # vocabulary, in first-seen order

        for document in data:
            words = decompose(document)
            freq = {}  # raw count of each unique word in this document
            for word in words:
                freq[word] = freq.get(word, 0) + 1
                if word not in doc_words:
                    doc_words.append(word)
            # Normalize counts by document length to get term frequency.
            for word in freq:
                freq[word] /= len(words)
            tf.append(freq)

        # Smoothed IDF: log((n + 1) / (1 + df)) + 1.
        idf = {}
        n = len(data)
        for word in doc_words:
            df = sum(1 for doc in tf if word in doc)
            idf[word] = np.log((n + 1) / (1 + df)) + 1
        # Original assigned self.idf twice; once is enough.
        self.idf = idf

        # TF-IDF: align every document vector to the vocabulary order.
        tfidf = [
            [doc_tf.get(word, 0) * idf[word] for word in doc_words]
            for doc_tf in tf
        ]
        self.vocab = doc_words
        return np.array(tfidf, dtype=float)

    def encode(self, data):
        """Transform ``data`` into TF-IDF vectors using the fitted state.

        Parameters
        ----------
        data : sequence of str
            Raw documents to encode against the learned vocabulary.

        Returns
        -------
        numpy.ndarray of shape (len(data), vocab_size)

        Raises
        ------
        ValueError
            If called before ``compute_tf`` has fitted the model.
        """
        if self.vocab is None or self.idf is None:
            raise ValueError("You should fit the model first")

        in_vocab = set(self.vocab)  # O(1) membership vs O(n) list scan
        tfidf_matrix = []
        for doc in data:
            words = decompose(doc)
            freq = {}
            # Count term frequencies for words that exist in the vocabulary
            for word in words:
                if word in in_vocab:
                    freq[word] = freq.get(word, 0) + 1
            # Normalize TF by document length
            for word in freq:
                freq[word] /= len(words)
            # Align vector according to vocab and multiply by IDF
            vector = [freq.get(word, 0) * self.idf[word] for word in self.vocab]
            tfidf_matrix.append(vector)

        return np.array(tfidf_matrix, dtype=float)
def _demo():
    """Fit the vectorizer on a tiny corpus and print the result."""
    docs = ["the cat sat on the mat", "the dog chased the cat"]
    vectorizer = TfIdfVectorizer()
    matrix = vectorizer.compute_tf(docs)
    print("Vocabulary:", vectorizer.vocab)
    print("TF-IDF Matrix:\n", matrix)


if __name__ == "__main__":
    _demo()

0 commit comments

Comments
 (0)