[pre-commit.ci] auto fixes from pre-commit.com hooks

pre-commit-ci[bot] · pre-commit-ci[bot] · commit f88e0acd5cfb · 2026-03-15T16:29:08.000Z
for more information, see https://pre-commit.ci
diff --git a/machine_learning/feature_extraction/tf-idf.py b/machine_learning/feature_extraction/tf-idf.py
@@ -1,74 +1,69 @@
 import numpy as np
 import re
-#to seprate words and normlize it
+# to seprate words and normlize it
+
 
 def decompose(text):
     text = text.lower()
-    text = re.sub(r'[^a-z0-9\s]', '', text)
-    text = re.sub(r'\s+', ' ', text)
-    
+    text = re.sub(r"[^a-z0-9\s]", "", text)
+    text = re.sub(r"\s+", " ", text)
+
     return text.split()
-    
 
-#creating tfidf class 
+
+# creating tfidf class
 class TfIdfVectorizer:
-    
     def __init__(self):
-        self.vocab=None
-        self.idf=None
-        
-    #these method to compute the tf for each word in given data
-    def compute_tf(self,data):
-        
+        self.vocab = None
+        self.idf = None
+
+    # these method to compute the tf for each word in given data
+    def compute_tf(self, data):
         tf = []
         doc_words = []
-        
+
         for document in data:
-            
             words = decompose(document)
-            
-            freq = {} #these dictionerie have for each unique words it number of apprition in one sentence
-            
+
+            freq = {}  # these dictionerie have for each unique words it number of apprition in one sentence
+
             for word in words:
-                
-                freq[word] = freq.get(word , 0)+1 
-                
+                freq[word] = freq.get(word, 0) + 1
+
                 if word not in doc_words:
                     doc_words.append(word)
-            
-            #calculating tf
-            
-            for word in freq :
+
+            # calculating tf
+
+            for word in freq:
                 freq[word] /= len(words)
-            
+
             tf.append(freq)
-                
-             
-        #computing idf    
+
+        # computing idf
         idf = {}
-        
+
         n = len(data)
-        
+
         for word in doc_words:
             df = sum(1 for doc in tf if word in doc)
             idf[word] = np.log((n + 1) / (1 + df)) + 1
-        
-        self.idf=idf    
+
+        self.idf = idf
         tfidf = []
-        
+
         self.idf = idf
-        
-        #computing tfidf for each word
-        
-        
+
+        # computing tfidf for each word
+
         for doc_tf in tf:
             vector = [doc_tf.get(word, 0) * idf[word] for word in doc_words]
             tfidf.append(vector)
-        
+
         self.vocab = doc_words
-        
-        return np.array(tfidf,dtype=float) 
-    
+
+        return np.array(tfidf, dtype=float)
+
     def encode(self, data):
         if self.vocab is None or self.idf is None:
             raise ValueError("You should fit the model first")
@@ -77,36 +72,26 @@ def encode(self, data):
         for doc in data:
             words = decompose(doc)
             freq = {}
-            
+
             # Count term frequencies for words that exist in the vocabulary
             for word in words:
-                
                 if word in self.vocab:
                     freq[word] = freq.get(word, 0) + 1
-            
+
             # Normalize TF by document length
             for word in freq:
                 freq[word] /= len(words)
-            
+
             # Align vector according to vocab and multiply by IDF
             vector = [freq.get(word, 0) * self.idf[word] for word in self.vocab]
             tfidf_matrix.append(vector)
 
-        return np.array(tfidf_matrix, dtype=float)       
-                
-                        
+        return np.array(tfidf_matrix, dtype=float)
+
+
 if __name__ == "__main__":
     documents = ["the cat sat on the mat", "the dog chased the cat"]
     vectorizer = TfIdfVectorizer()
     tfidf_matrix = vectorizer.compute_tf(documents)
     print("Vocabulary:", vectorizer.vocab)
-    print("TF-IDF Matrix:\n", tfidf_matrix)                                  
-            
-        
-                    
-            
-
-
-    
-    
-    
+    print("TF-IDF Matrix:\n", tfidf_matrix)