1+ import numpy as np
2+ import re
# Split text into words and normalize them.
4+
def decompose(text):
    """Normalize *text* and split it into a list of word tokens.

    The text is lower-cased, every character that is not ``a-z``, ``0-9``
    or whitespace is removed (so ``"don't"`` becomes ``"dont"``), runs of
    whitespace are collapsed to a single space, and the result is split on
    whitespace. An empty or punctuation-only string yields ``[]``.
    """
    lowered = text.lower()
    # Strip punctuation/symbols first so they never glue onto or split tokens.
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    tokens = single_spaced.split()
    return tokens
11+
12+
# TF-IDF vectorizer class
class TfIdfVectorizer:
    """Minimal TF-IDF vectorizer built on plain dictionaries.

    Term frequency (TF) is the number of occurrences of a word divided by the
    number of words in its document. Inverse document frequency (IDF) is
    smoothed: ``log((n + 1) / (1 + df)) + 1`` where ``n`` is the number of
    fitted documents and ``df`` the number of documents containing the word.

    Call :meth:`compute_tf` to fit on a corpus (it also returns the TF-IDF
    matrix of that corpus), then :meth:`encode` to vectorize other documents
    with the fitted vocabulary and IDF weights.
    """

    def __init__(self):
        # Vocabulary words in order of first appearance in the fitted corpus;
        # column j of every returned matrix corresponds to vocab[j].
        # None until compute_tf() has been called.
        self.vocab = None
        # Mapping word -> smoothed IDF weight. None until compute_tf().
        self.idf = None

    @staticmethod
    def _term_frequencies(words, allowed=None):
        """Return ``{word: count / len(words)}`` for one tokenized document.

        When *allowed* is given, words not contained in it are not counted,
        but they still contribute to the document length used as divisor.
        """
        counts = {}
        for word in words:
            if allowed is None or word in allowed:
                counts[word] = counts.get(word, 0) + 1
        # counts is empty whenever words is empty, so no division by zero.
        total = len(words)
        return {word: count / total for word, count in counts.items()}

    def _to_matrix(self, rows):
        """Stack row vectors into a 2-D float array, even when *rows* is empty."""
        # reshape is a no-op for non-empty input; for zero documents it turns
        # the 1-D empty array into a proper (0, len(vocab)) matrix.
        return np.array(rows, dtype=float).reshape(len(rows), len(self.vocab))

    def compute_tf(self, data):
        """Fit the vocabulary and IDF weights on *data* and return its TF-IDF.

        Args:
            data: Iterable of document strings.

        Returns:
            A float ``numpy`` array of shape ``(n_documents, len(self.vocab))``.

        Side effects:
            Sets ``self.vocab`` (list of words, first-appearance order) and
            ``self.idf`` (dict word -> IDF weight).
        """
        # Materialize once so generators work and the length is known.
        documents = list(data)

        tf = []
        # word -> number of documents containing it. Dicts keep insertion
        # order, so the keys double as the vocabulary in first-appearance
        # order while giving O(1) membership.
        doc_freq = {}
        for document in documents:
            doc_tf = self._term_frequencies(decompose(document))
            tf.append(doc_tf)
            for word in doc_tf:
                doc_freq[word] = doc_freq.get(word, 0) + 1

        n = len(documents)
        idf = {
            word: np.log((n + 1) / (1 + df)) + 1
            for word, df in doc_freq.items()
        }
        vocab = list(doc_freq)

        self.vocab = vocab
        self.idf = idf

        rows = [
            [doc_tf.get(word, 0) * idf[word] for word in vocab]
            for doc_tf in tf
        ]
        return self._to_matrix(rows)

    def encode(self, data):
        """Vectorize *data* with the already fitted vocabulary and IDF weights.

        Words that are not in the fitted vocabulary are ignored, but they
        still count towards the document length used to normalize TF.

        Args:
            data: Iterable of document strings.

        Returns:
            A float ``numpy`` array of shape ``(n_documents, len(self.vocab))``.

        Raises:
            ValueError: If :meth:`compute_tf` has not been called yet.
        """
        if self.vocab is None or self.idf is None:
            raise ValueError("You should fit the model first")

        vocab = self.vocab
        idf = self.idf

        rows = []
        for doc in data:
            # The idf dict has the vocabulary as keys, so it serves as an
            # O(1) membership filter instead of scanning the vocab list.
            doc_tf = self._term_frequencies(decompose(doc), allowed=idf)
            rows.append([doc_tf.get(word, 0) * idf[word] for word in vocab])

        return self._to_matrix(rows)
96+
97+
if __name__ == "__main__":
    # Small demo: fit the vectorizer on a two-sentence corpus and print the
    # learned vocabulary together with the resulting TF-IDF matrix.
    corpus = [
        "the cat sat on the mat",
        "the dog chased the cat",
    ]
    model = TfIdfVectorizer()
    weights = model.compute_tf(corpus)
    print("Vocabulary:", model.vocab)
    print("TF-IDF Matrix:\n ", weights)
104+
105+
106+
107+
108+
109+
110+
111+
112+
0 commit comments