import re
from collections import Counter

import numpy as np
# Separate a text into words and normalize them.
4+
45
def decompose(text):
    """Split ``text`` into normalized word tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is removed (removed, not replaced, so ``"don't"`` becomes
    ``"dont"``), and the remainder is split on runs of whitespace.

    Args:
        text: Input string.

    Returns:
        List of tokens; empty when nothing survives the cleaning.
    """
    # The pattern must not start with a space: punctuation has to be stripped
    # wherever it occurs, including directly after a letter ("mat," -> "mat").
    cleaned = re.sub(r"[^a-z0-9\s]", "", text.lower())
    # str.split() without arguments already treats any run of whitespace as a
    # single separator, so no separate collapsing pass is required.
    return cleaned.split()
11-
1212
# TF-IDF vectorizer class.
class TfIdfVectorizer:
    """Small TF-IDF vectorizer backed by dictionaries and NumPy.

    ``compute_tf`` learns the vocabulary and the IDF weights from a corpus and
    returns that corpus' TF-IDF matrix; ``encode`` projects further documents
    onto the learned vocabulary.
    """

    def __init__(self):
        # Words in order of first appearance; column j of a matrix is vocab[j].
        self.vocab = None
        # Maps each vocabulary word to its smoothed inverse document frequency.
        self.idf = None

    @staticmethod
    def _relative_frequencies(counts):
        """Turn raw word counts of one document into count / document length."""
        total = sum(counts.values())
        # An empty document yields an empty dict, so no division by zero occurs.
        return {word: count / total for word, count in counts.items()}

    def _to_matrix(self, tf_rows):
        """Weight per-document TF dicts by IDF and align them to ``self.vocab``."""
        rows = [
            [doc_tf.get(word, 0) * self.idf[word] for word in self.vocab]
            for doc_tf in tf_rows
        ]
        # The reshape is a no-op for non-empty input; it only guarantees a 2-D
        # result when there are no documents at all.
        return np.array(rows, dtype=float).reshape(len(rows), len(self.vocab))

    def compute_tf(self, data):
        """Fit the vectorizer on ``data`` and return its TF-IDF matrix.

        Despite the name, this computes the full TF-IDF weighting, not only
        the term frequencies.

        Args:
            data: Iterable of document strings.

        Returns:
            ``numpy.ndarray`` of floats with one row per document and one
            column per vocabulary word. Term frequency is count / document
            length and IDF is ``log((n + 1) / (1 + df)) + 1``, where ``n`` is
            the number of documents and ``df`` the number of documents that
            contain the word.

        Side effects:
            Overwrites ``self.vocab`` and ``self.idf``.
        """
        tf = [
            self._relative_frequencies(Counter(decompose(document)))
            for document in data
        ]

        # dict.fromkeys de-duplicates while keeping first-appearance order,
        # without the linear scan a list membership test would need.
        vocab = list(dict.fromkeys(word for doc_tf in tf for word in doc_tf))

        # Each per-document dict lists a word once, so counting its keys over
        # all documents gives the document frequency in a single pass.
        doc_freq = Counter(word for doc_tf in tf for word in doc_tf)

        n_docs = len(tf)
        self.idf = {
            word: np.log((n_docs + 1) / (1 + doc_freq[word])) + 1
            for word in vocab
        }
        self.vocab = vocab

        return self._to_matrix(tf)

    def encode(self, data):
        """Return the TF-IDF matrix of ``data`` using the fitted vocabulary.

        Words that are not in the vocabulary get no column, but they still
        count towards the document length used to normalize term frequency.

        Args:
            data: Iterable of document strings.

        Returns:
            ``numpy.ndarray`` of floats with one row per document and one
            column per vocabulary word.

        Raises:
            ValueError: If ``self.vocab`` or ``self.idf`` is still ``None``.
        """
        if self.vocab is None or self.idf is None:
            raise ValueError("You should fit the model first")

        # Build the lookup set once; self.vocab is a list.
        known_words = set(self.vocab)

        tf_rows = []
        for doc in data:
            words = decompose(doc)
            counts = Counter(word for word in words if word in known_words)
            # Normalize by the full document length, unknown words included.
            tf_rows.append(
                {word: count / len(words) for word, count in counts.items()}
            )

        return self._to_matrix(tf_rows)
90+
91+
if __name__ == "__main__":
    # Demo: fit the vectorizer on two short sentences and show the result.
    corpus = ["the cat sat on the mat", "the dog chased the cat"]
    model = TfIdfVectorizer()
    matrix = model.compute_tf(corpus)
    print("Vocabulary:", model.vocab)
    print("TF-IDF Matrix:\n ", matrix)
0 commit comments