
Commit

Merge pull request #967 from serengil/feat-task-2301-vgg-normalization-layer

vgg normalization layer bug for gpu users
serengil authored Jan 23, 2024
2 parents 3265be2 + 5ffa7bf commit 88814e6
Showing 5 changed files with 84 additions and 141 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -4,7 +4,7 @@

[![PyPI Downloads](https://static.pepy.tech/personalized-badge/deepface?period=total&units=international_system&left_color=grey&right_color=blue&left_text=pypi%20downloads)](https://pepy.tech/project/deepface)
[![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/deepface?color=green&label=conda%20downloads)](https://anaconda.org/conda-forge/deepface)
[![Stars](https://img.shields.io/github/stars/serengil/deepface?color=yellow&style=flat)](https://github.com/serengil/deepface/stargazers)
[![Stars](https://img.shields.io/github/stars/serengil/deepface?color=yellow&style=flat&label=%E2%AD%90%20stars)](https://github.com/serengil/deepface/stargazers)
[![License](http://img.shields.io/:license-MIT-green.svg?style=flat)](https://github.com/serengil/deepface/blob/master/LICENSE)
[![Tests](https://github.com/serengil/deepface/actions/workflows/tests.yml/badge.svg)](https://github.com/serengil/deepface/actions/workflows/tests.yml)

188 changes: 64 additions & 124 deletions deepface/DeepFace.py
@@ -45,7 +45,7 @@ def build_model(model_name: str) -> Any:
VGG-Face, Facenet, OpenFace, DeepFace, DeepID for face recognition
Age, Gender, Emotion, Race for facial attributes
Returns:
built model with corresponding class
built_model
"""
return modeling.build_model(model_name=model_name)
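
A minimal usage sketch of build_model with a model name from the options listed above:

    from deepface import DeepFace

    # build the model once and reuse it across calls
    model = DeepFace.build_model(model_name="VGG-Face")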

@@ -62,57 +62,37 @@ def verify(
) -> Dict[str, Any]:
"""
Verify if an image pair represents the same person or different persons.
The verification function converts facial images to vectors and calculates the similarity
between those vectors. Vectors of images of the same person should exhibit higher similarity
(or lower distance) than vectors of images of different persons.
Args:
img1_path (str or np.ndarray): Path to the first image. Accepts exact image path
as a string, numpy array (BGR), or base64 encoded images.
img2_path (str or np.ndarray): Path to the second image. Accepts exact image path
as a string, numpy array (BGR), or base64 encoded images.
model_name (str): Model for face recognition. Options: VGG-Face, Facenet, Facenet512,
OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace (default is VGG-Face).
detector_backend (string): face detector backend. Options: 'opencv', 'retinaface',
'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8' (default is opencv)
distance_metric (string): Metric for measuring similarity. Options: 'cosine',
'euclidean', 'euclidean_l2' (default is cosine).
enforce_detection (boolean): If no face is detected in an image, raise an exception.
Set to False to avoid the exception for low-resolution images (default is True).
align (bool): Flag to enable face alignment (default is True).
normalization (string): Normalize the input image before feeding it to the model.
Options: base, raw, Facenet, Facenet2018, VGGFace, VGGFace2, ArcFace (default is base)
Returns:
result (dict): A dictionary containing verification results.
result (dict): A dictionary containing verification results with the following keys.
- 'verified' (bool): Indicates whether the images represent the same person (True)
or different persons (False).
- 'distance' (float): The distance measure between the face vectors.
A lower distance indicates higher similarity.
- 'max_threshold_to_verify' (float): The maximum threshold used for verification.
If the distance is below this threshold, the images are considered a match.
- 'model' (str): The chosen face recognition model.
- 'similarity_metric' (str): The chosen similarity metric for measuring distances.
- 'facial_areas' (dict): Rectangular regions of interest for faces in both images.
- 'img1': {'x': int, 'y': int, 'w': int, 'h': int}
Region of interest for the first image.
- 'img2': {'x': int, 'y': int, 'w': int, 'h': int}
Region of interest for the second image.
- 'time' (float): Time taken for the verification process in seconds.
"""

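A usage sketch of verify with the documented defaults; the image paths are hypothetical placeholders:

    from deepface import DeepFace

    result = DeepFace.verify(
        img1_path="img1.jpg",  # hypothetical sample image
        img2_path="img2.jpg",  # hypothetical sample image
        model_name="VGG-Face",
        distance_metric="cosine",
    )
    # 'verified' and 'distance' are among the documented result keys
    print(result["verified"], result["distance"])
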
@@ -138,77 +118,59 @@ def analyze(
) -> List[Dict[str, Any]]:
"""
Analyze facial attributes such as age, gender, emotion, and race in the provided image.
Args:
img_path (str or np.ndarray): The exact path to the image, a numpy array in BGR format,
or a base64 encoded image. If the source image contains multiple faces, the result will
include information for each detected face.
actions (tuple): Attributes to analyze. The default is ('age', 'gender', 'emotion', 'race').
You can exclude some of these attributes from the analysis if needed.
enforce_detection (boolean): If no face is detected in an image, raise an exception.
Set to False to avoid the exception for low-resolution images (default is True).
detector_backend (string): face detector backend. Options: 'opencv', 'retinaface',
'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8' (default is opencv).
distance_metric (string): Metric for measuring similarity. Options: 'cosine',
'euclidean', 'euclidean_l2' (default is cosine).
align (boolean): Perform alignment based on the eye positions (default is True).
silent (boolean): Suppress or allow some log messages for a quieter analysis process
(default is False).
Returns:
results (List[Dict[str, Any]]): A list of dictionaries, where each dictionary represents
the analysis results for a detected face.
Each dictionary in the list contains the following keys:
- 'region' (dict): Represents the rectangular region of the detected face in the image.
- 'x': x-coordinate of the top-left corner of the face.
- 'y': y-coordinate of the top-left corner of the face.
- 'w': Width of the detected face region.
- 'h': Height of the detected face region.
- 'age' (float): Estimated age of the detected face.
- 'face_confidence' (float): Confidence score for the detected face.
Indicates the reliability of the face detection.
- 'dominant_gender' (str): The dominant gender in the detected face.
Either "Man" or "Woman."
- 'gender' (dict): Confidence scores for each gender category.
- 'Man': Confidence score for the male gender.
- 'Woman': Confidence score for the female gender.
- 'dominant_emotion' (str): The dominant emotion in the detected face.
Possible values include "sad," "angry," "surprise," "fear," "happy,"
"disgust," and "neutral."
- 'emotion' (dict): Confidence scores for each emotion category.
- 'sad': Confidence score for sadness.
- 'angry': Confidence score for anger.
- 'surprise': Confidence score for surprise.
- 'fear': Confidence score for fear.
- 'happy': Confidence score for happiness.
- 'disgust': Confidence score for disgust.
- 'neutral': Confidence score for neutrality.
- 'dominant_race' (str): The dominant race in the detected face.
Possible values include "indian," "asian," "latino hispanic,"
"black," "middle eastern," and "white."
- 'race' (dict): Confidence scores for each race category.
- 'indian': Confidence score for Indian ethnicity.
- 'asian': Confidence score for Asian ethnicity.
- 'latino hispanic': Confidence score for Latino/Hispanic ethnicity.
- 'black': Confidence score for Black ethnicity.
- 'middle eastern': Confidence score for Middle Eastern ethnicity.
- 'white': Confidence score for White ethnicity.
the analysis results for a detected face. Each dictionary in the list contains the
following keys:
- 'region' (dict): Represents the rectangular region of the detected face in the image.
- 'x': x-coordinate of the top-left corner of the face.
- 'y': y-coordinate of the top-left corner of the face.
- 'w': Width of the detected face region.
- 'h': Height of the detected face region.
- 'age' (float): Estimated age of the detected face.
- 'face_confidence' (float): Confidence score for the detected face.
Indicates the reliability of the face detection.
- 'dominant_gender' (str): The dominant gender in the detected face.
Either "Man" or "Woman."
- 'gender' (dict): Confidence scores for each gender category.
- 'Man': Confidence score for the male gender.
- 'Woman': Confidence score for the female gender.
- 'dominant_emotion' (str): The dominant emotion in the detected face.
Possible values include "sad," "angry," "surprise," "fear," "happy,"
"disgust," and "neutral."
- 'emotion' (dict): Confidence scores for each emotion category.
- 'sad': Confidence score for sadness.
- 'angry': Confidence score for anger.
- 'surprise': Confidence score for surprise.
- 'fear': Confidence score for fear.
- 'happy': Confidence score for happiness.
- 'disgust': Confidence score for disgust.
- 'neutral': Confidence score for neutrality.
- 'dominant_race' (str): The dominant race in the detected face.
Possible values include "indian," "asian," "latino hispanic,"
"black," "middle eastern," and "white."
- 'race' (dict): Confidence scores for each race category.
- 'indian': Confidence score for Indian ethnicity.
- 'asian': Confidence score for Asian ethnicity.
- 'latino hispanic': Confidence score for Latino/Hispanic ethnicity.
- 'black': Confidence score for Black ethnicity.
- 'middle eastern': Confidence score for Middle Eastern ethnicity.
- 'white': Confidence score for White ethnicity.
"""
return demography.analyze(
img_path=img_path,
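
A usage sketch of analyze with the default actions; the image path is a hypothetical placeholder:

    from deepface import DeepFace

    results = DeepFace.analyze(
        img_path="img.jpg",  # hypothetical sample image
        actions=("age", "gender", "emotion", "race"),
    )
    for face in results:  # one dictionary per detected face
        print(face["region"], face["age"], face["dominant_emotion"])
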
@@ -233,46 +195,36 @@ def find(
) -> List[pd.DataFrame]:
"""
Identify individuals in a database
Args:
img_path (str or np.ndarray): The exact path to the image, a numpy array in BGR format,
or a base64 encoded image. If the source image contains multiple faces, the result will
include information for each detected face.
db_path (string): Path to the folder containing image files. All detected faces
in the database will be considered in the decision-making process.
model_name (str): Model for face recognition. Options: VGG-Face, Facenet, Facenet512,
OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace
OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace (default is VGG-Face).
distance_metric (string): Metric for measuring similarity. Options: 'cosine',
'euclidean', 'euclidean_l2'.
'euclidean', 'euclidean_l2' (default is cosine).
enforce_detection (boolean): If no face is detected in an image, raise an exception.
Default is True. Set to False to avoid the exception for low-resolution images.
Set to False to avoid the exception for low-resolution images (default is True).
detector_backend (string): face detector backend. Options: 'opencv', 'retinaface',
'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8'.
align (boolean): Perform alignment based on the eye positions.
'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8' (default is opencv).
align (boolean): Perform alignment based on the eye positions (default is True).
normalization (string): Normalize the input image before feeding it to the model.
Default is base. Options: base, raw, Facenet, Facenet2018, VGGFace, VGGFace2, ArcFace
silent (boolean): Suppress or allow some log messages for a quieter analysis process.
Options: base, raw, Facenet, Facenet2018, VGGFace, VGGFace2, ArcFace (default is base).
silent (boolean): Suppress or allow some log messages for a quieter analysis process
(default is False).
Returns:
results (List[pd.DataFrame]): A list of pandas dataframes. Each dataframe corresponds
to the identity information for an individual detected in the source image.
The DataFrame columns include:
- 'identity': Identity label of the detected individual.
- 'target_x', 'target_y', 'target_w', 'target_h': Bounding box coordinates of the
target face in the database.
- 'source_x', 'source_y', 'source_w', 'source_h': Bounding box coordinates of the
detected face in the source image.
- '{model_name}_{distance_metric}': Similarity score between the faces based on the
specified model and distance metric
- 'identity': Identity label of the detected individual.
- 'target_x', 'target_y', 'target_w', 'target_h': Bounding box coordinates of the
target face in the database.
- 'source_x', 'source_y', 'source_w', 'source_h': Bounding box coordinates of the
detected face in the source image.
- '{model_name}_{distance_metric}': Similarity score between the faces based on the
specified model and distance metric
"""
return recognition.find(
img_path=img_path,
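
A usage sketch of find; the query image and database folder are hypothetical placeholders:

    from deepface import DeepFace

    dfs = DeepFace.find(
        img_path="img.jpg",   # hypothetical query image
        db_path="facial_db",  # hypothetical folder of identity images
        model_name="VGG-Face",
        distance_metric="cosine",
    )
    for df in dfs:  # one dataframe per face detected in the query image
        # column name follows the documented '{model_name}_{distance_metric}' pattern
        print(df[["identity", "VGG-Face_cosine"]].head())
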
@@ -302,25 +254,20 @@ def represent(
img_path (str or np.ndarray): The exact path to the image, a numpy array in BGR format,
or a base64 encoded image. If the source image contains multiple faces, the result will
include information for each detected face.
model_name (str): Model for face recognition. Options: VGG-Face, Facenet, Facenet512,
OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace
OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace (default is VGG-Face).
enforce_detection (boolean): If no face is detected in an image, raise an exception.
Default is True. Set to False to avoid the exception for low-resolution images.
Set to False to avoid the exception for low-resolution images
(default is True).
detector_backend (string): face detector backend. Options: 'opencv', 'retinaface',
'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8'.
align (boolean): Perform alignment based on the eye positions.
'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8' (default is opencv).
align (boolean): Perform alignment based on the eye positions (default is True).
normalization (string): Normalize the input image before feeding it to the model.
Default is base. Options: base, raw, Facenet, Facenet2018, VGGFace, VGGFace2, ArcFace
(default is base).
Returns:
results (List[Dict[str, Any]]): A list of dictionaries, each containing the
following fields:
- embedding (np.array): Multidimensional vector representing facial features.
The number of dimensions varies based on the reference model
(e.g., FaceNet returns 128 dimensions, VGG-Face returns 4096 dimensions).
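
A usage sketch of represent; the image path is a hypothetical placeholder:

    from deepface import DeepFace

    embedding_objs = DeepFace.represent(
        img_path="img.jpg",  # hypothetical sample image
        model_name="Facenet",
    )
    # per the docstring above, FaceNet embeddings have 128 dimensions
    print(len(embedding_objs[0]["embedding"]))
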
@@ -359,13 +306,13 @@ def stream(
in the database will be considered in the decision-making process.
model_name (str): Model for face recognition. Options: VGG-Face, Facenet, Facenet512,
OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace
OpenFace, DeepFace, DeepID, Dlib, ArcFace and SFace (default is VGG-Face).
detector_backend (string): face detector backend. Options: 'opencv', 'retinaface',
'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8'.
'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8' (default is opencv).
distance_metric (string): Metric for measuring similarity. Options: 'cosine',
'euclidean', 'euclidean_l2'.
'euclidean', 'euclidean_l2' (default is cosine).
enable_face_analysis (bool): Flag to enable face analysis (default is True).
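
A usage sketch of stream, assuming the default webcam as the video source; the database folder is a hypothetical placeholder:

    from deepface import DeepFace

    # runs real-time recognition against the folder of identity images
    DeepFace.stream(
        db_path="facial_db",  # hypothetical folder of identity images
        model_name="VGG-Face",
        enable_face_analysis=True,
    )
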
@@ -408,22 +355,15 @@ def extract_faces(
Args:
img_path (str or np.ndarray): Path to the image. Accepts exact image path
as a string, numpy array (BGR), or base64 encoded images.
target_size (tuple): final shape of facial image. Black pixels will be
added to resize the image.
added to resize the image (default is (224, 224)).
detector_backend (string): face detector backend. Options: 'opencv', 'retinaface',
'mtcnn', 'ssd', 'dlib', 'mediapipe', 'yolov8' (default is opencv)
enforce_detection (boolean): If no face is detected in an image, raise an exception.
Default is True. Set to False to avoid the exception for low-resolution images.
Set to False to avoid the exception for low-resolution images (default is True).
align (bool): Flag to enable face alignment (default is True).
grayscale (boolean): Flag to convert the image to grayscale before
processing (default is False).
Returns:
results (List[Dict[str, Any]]): A list of dictionaries, where each dictionary contains:
- "face" (np.ndarray): The detected face as a NumPy array.
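
A usage sketch of extract_faces; the image path is a hypothetical placeholder:

    from deepface import DeepFace

    face_objs = DeepFace.extract_faces(
        img_path="img.jpg",  # hypothetical sample image
        target_size=(224, 224),
        detector_backend="opencv",
    )
    for face_obj in face_objs:
        print(face_obj["face"].shape)  # the documented "face" key holds the array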
21 changes: 11 additions & 10 deletions deepface/basemodels/VGGFace.py
@@ -2,9 +2,9 @@
import os
import gdown
import numpy as np
from deepface.commons import functions
from deepface.commons.logger import Logger
from deepface.commons import functions, distance
from deepface.models.FacialRecognition import FacialRecognition
from deepface.commons.logger import Logger

logger = Logger(module="basemodels.VGGFace")

@@ -20,9 +20,7 @@
Flatten,
Dropout,
Activation,
Lambda,
)
from keras import backend as K
else:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import (
@@ -32,9 +30,7 @@
Flatten,
Dropout,
Activation,
Lambda,
)
from tensorflow.keras import backend as K

# ---------------------------------------

@@ -58,7 +54,11 @@ def find_embeddings(self, img: np.ndarray) -> List[float]:
"""
# model.predict causes a memory issue when called in a for loop
# embedding = model.predict(img, verbose=0)[0].tolist()
return self.model(img, training=False).numpy()[0].tolist()
# having a normalization layer in the descriptor causes trouble for some GPU users (e.g. issues 957, 966)
# instead we now calculate it the traditional way, not with the Keras backend
embedding = self.model(img, training=False).numpy()[0].tolist()
embedding = distance.l2_normalize(embedding)
return embedding.tolist()
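
The commons helper itself is not shown in this diff; a numerically equivalent sketch of the post-hoc normalization, assuming distance.l2_normalize follows the conventional definition:

    import numpy as np

    def l2_normalize(x):
        # scale the vector to unit Euclidean length
        x = np.asarray(x)
        return x / np.sqrt(np.sum(np.multiply(x, x)))

    print(l2_normalize([3.0, 4.0]))  # [0.6 0.8], unit norm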


def base_model() -> Sequential:
@@ -144,9 +144,10 @@ def load_model(
# as described here: https://github.com/serengil/deepface/issues/944
base_model_output = Sequential()
base_model_output = Flatten()(model.layers[-5].output)
base_model_output = Lambda(lambda x: K.l2_normalize(x, axis=1), name="norm_layer")(
base_model_output
)
# the Keras backend's l2 normalization layer causes trouble for some GPU users (e.g. issues 957, 966)
# base_model_output = Lambda(lambda x: K.l2_normalize(x, axis=1), name="norm_layer")(
# base_model_output
# )
vgg_face_descriptor = Model(inputs=model.input, outputs=base_model_output)

return vgg_face_descriptor
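
As a sanity check on this design choice, the post-hoc normalization matches what the removed Lambda layer computed; a sketch, assuming a TF/Keras version where keras.backend.l2_normalize is still available:

    import numpy as np
    import tensorflow as tf

    x = np.random.rand(1, 4096).astype("float32")  # stand-in descriptor output
    via_keras = tf.keras.backend.l2_normalize(x, axis=1).numpy()
    via_numpy = x / np.sqrt(np.sum(x * x, axis=1, keepdims=True))
    print(np.allclose(via_keras, via_numpy))  # expected: True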
