Skip to content

Commit fdaf4bf

Browse files
committed
Implementing TF-IDF using just Python and NumPy under feature extraction
1 parent 68473af commit fdaf4bf

File tree

1 file changed

+112
-0
lines changed

1 file changed

+112
-0
lines changed
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import numpy as np
2+
import re
3+
# Split the text into words and normalize them
4+
5+
def decompose(text):
    """Return the normalized tokens of *text*.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    whitespace is deleted, and the remainder is split on whitespace.

    Characters are deleted rather than replaced by a space, so ``"don't"``
    becomes the single token ``"dont"``.
    """
    # Drop punctuation and any other character outside the allowed set.
    stripped = re.sub(r'[^a-z0-9\s]', '', text.lower())
    # Collapse whitespace runs to single spaces before splitting.
    collapsed = re.sub(r'\s+', ' ', stripped)
    return collapsed.split()
11+
12+
13+
# TF-IDF vectorizer class
14+
class TfIdfVectorizer:
    """TF-IDF vectorizer built on plain Python dictionaries and NumPy.

    ``compute_tf`` learns the vocabulary and the IDF weights from a corpus
    and returns its TF-IDF matrix; ``encode`` projects further documents onto
    the learned vocabulary.

    Term frequency is ``count / number of tokens in the document`` and the
    inverse document frequency is the smoothed form
    ``ln((n + 1) / (1 + df)) + 1``.  No vector normalization is applied.

    Attributes:
        vocab: List of known words in order of first appearance, or ``None``
            before ``compute_tf`` has been called.
        idf: Dict mapping each vocabulary word to its IDF weight, or ``None``
            before ``compute_tf`` has been called.
    """

    def __init__(self):
        self.vocab = None
        self.idf = None

    @staticmethod
    def _term_frequencies(document):
        """Return ``{word: relative frequency}`` for one document."""
        words = decompose(document)
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        total = len(words)
        # An empty document yields an empty dict, so no division by zero.
        return {word: count / total for word, count in counts.items()}

    def _vectorize(self, term_freqs):
        """Build the (documents x vocabulary) TF-IDF matrix from TF dicts."""
        # A word -> column map makes each lookup O(1) instead of scanning
        # the vocabulary list.
        column_of = {word: col for col, word in enumerate(self.vocab)}
        # Allocating the full shape up front keeps the result 2-D even when
        # there are no documents or no vocabulary.
        matrix = np.zeros((len(term_freqs), len(column_of)), dtype=float)
        for row, doc_tf in enumerate(term_freqs):
            for word, tf in doc_tf.items():
                col = column_of.get(word)
                if col is not None:  # words outside the vocabulary are ignored
                    matrix[row, col] = tf * self.idf[word]
        return matrix

    def compute_tf(self, data):
        """Fit the vocabulary and IDF weights on *data* and return its TF-IDF matrix.

        Args:
            data: Iterable of document strings.

        Returns:
            A float ``numpy.ndarray`` of shape ``(n_documents, n_vocab)``
            whose columns follow ``self.vocab``.
        """
        term_freqs = [self._term_frequencies(document) for document in data]

        # Count, in one pass, how many documents contain each word.  Dict
        # insertion order gives the vocabulary in first-appearance order.
        doc_freq = {}
        for doc_tf in term_freqs:
            for word in doc_tf:
                doc_freq[word] = doc_freq.get(word, 0) + 1

        # Using the list length instead of len(data) lets any iterable work.
        n_docs = len(term_freqs)
        self.idf = {
            word: np.log((n_docs + 1) / (1 + df)) + 1
            for word, df in doc_freq.items()
        }
        self.vocab = list(doc_freq)

        return self._vectorize(term_freqs)

    def encode(self, data):
        """Return the TF-IDF matrix of *data* using the fitted vocabulary.

        Words that are not in the vocabulary are ignored, but they still count
        towards the document length used to normalize term frequencies.

        Args:
            data: Iterable of document strings.

        Returns:
            A float ``numpy.ndarray`` of shape ``(n_documents, n_vocab)``.

        Raises:
            ValueError: If ``compute_tf`` has not been called yet.
        """
        if self.vocab is None or self.idf is None:
            raise ValueError("You should fit the model first")

        return self._vectorize([self._term_frequencies(doc) for doc in data])
96+
97+
98+
if __name__ == "__main__":
    # Small demonstration: fit on a two-sentence corpus and show the result.
    corpus = ["the cat sat on the mat", "the dog chased the cat"]
    model = TfIdfVectorizer()
    weights = model.compute_tf(corpus)
    print("Vocabulary:", model.vocab)
    print("TF-IDF Matrix:\n", weights)
104+
105+
106+
107+
108+
109+
110+
111+
112+

0 commit comments

Comments
 (0)