Feat task 0702 find enhancements #1072
Changes from all commits
07a2d5b
2f9f976
d7c2998
6eced68
b3c98e3
259add4
@@ -1,7 +1,7 @@ | ||
# built-in dependencies | ||
import os | ||
import pickle | ||
from typing import List, Union, Optional | ||
from typing import List, Union, Optional, Dict, Any | ||
import time | ||
|
||
# 3rd party dependencies | ||
|
@@ -11,6 +11,7 @@ | |
|
||
# project dependencies | ||
from deepface.commons.logger import Logger | ||
from deepface.commons import package_utils | ||
from deepface.modules import representation, detection, modeling, verification | ||
from deepface.models.FacialRecognition import FacialRecognition | ||
|
||
|
@@ -97,14 +98,16 @@ def find( | |
|
||
# --------------------------------------- | ||
|
||
file_name = f"representations_{model_name}.pkl" | ||
file_name = file_name.replace("-", "_").lower() | ||
file_name = f"ds_{model_name}_{detector_backend}_v2.pkl" | ||
file_name = file_name.replace("-", "").lower() | ||
datastore_path = os.path.join(db_path, file_name) | ||
representations = [] | ||
|
||
# required columns for representations | ||
df_cols = [ | ||
"identity", | ||
f"{model_name}_representation", | ||
"hash", | ||
"embedding", | ||
"target_x", | ||
"target_y", | ||
"target_w", | ||
|
@@ -120,35 +123,59 @@ def find( | |
with open(datastore_path, "rb") as f: | ||
representations = pickle.load(f) | ||
|
||
# Check if the representations are out-of-date | ||
if len(representations) > 0: | ||
if len(representations[0]) != len(df_cols): | ||
# check each item of representations list has required keys | ||
for i, current_representation in enumerate(representations): | ||
missing_keys = list(set(df_cols) - set(current_representation.keys())) | ||
if len(missing_keys) > 0: | ||
raise ValueError( | ||
f"Seems existing {datastore_path} is out-of-the-date." | ||
"Please delete it and re-run." | ||
f"{i}-th item does not have some required keys - {missing_keys}." | ||
f"Consider to delete {datastore_path}" | ||
) | ||
pickled_images = [representation[0] for representation in representations] | ||
else: | ||
pickled_images = [] | ||
|
||
# embedded images | ||
pickled_images = [representation["identity"] for representation in representations] | ||
|
||
# Get the list of images on storage | ||
storage_images = __list_images(path=db_path) | ||
|
||
if len(storage_images) == 0: | ||
raise ValueError(f"No item found in {db_path}") | ||
|
||
# Enforce data consistency amongst on disk images and pickle file | ||
must_save_pickle = False | ||
new_images = list(set(storage_images) - set(pickled_images)) # images added to storage | ||
old_images = list(set(pickled_images) - set(storage_images)) # images removed from storage | ||
new_images = list(set(storage_images) - set(pickled_images)) # images added to storage | ||
old_images = list(set(pickled_images) - set(storage_images)) # images removed from storage | ||
|
||
# detect replaced images | ||
replaced_images = [] | ||
for current_representation in representations: | ||
identity = current_representation["identity"] | ||
if identity in old_images: | ||
continue | ||
alpha_hash = current_representation["hash"] | ||
beta_hash = package_utils.find_hash_of_file(identity) | ||
if alpha_hash != beta_hash: | ||
logger.debug(f"Even though {identity} represented before, it's replaced later.") | ||
replaced_images.append(identity) | ||
Comment on lines +155 to +159

This is a bit expensive.

I suggest, instead of hashing the whole content of the file, hashing only its most immediate (and lighter) properties: name, creation timestamp, last modification timestamp, and size.

Liked that approach and convinced. Will do it in another PR soon.
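If that lands in a follow-up PR, it might look roughly like the sketch below (the helper name is hypothetical, not something this PR implements):

```python
# Hypothetical sketch: hash cheap file metadata instead of the full content.
import hashlib
import os


def find_hash_of_file_properties(file_path: str) -> str:
    # name, creation time, last modification time and size are usually enough
    # to detect that a file was replaced, without reading its bytes
    stats = os.stat(file_path)
    properties = f"{file_path}_{stats.st_ctime}_{stats.st_mtime}_{stats.st_size}"
    return hashlib.sha1(properties.encode("utf-8")).hexdigest()
```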
||
|
||
if not silent and (len(new_images) > 0 or len(old_images) > 0 or len(replaced_images) > 0): | ||
logger.info( | ||
f"Found {len(new_images)} newly added image(s)" | ||
f", {len(old_images)} removed image(s)" | ||
f", {len(replaced_images)} replaced image(s)." | ||
) | ||
|
||
if not silent and (len(new_images) > 0 or len(old_images) > 0): | ||
logger.info(f"Found {len(new_images)} new images and {len(old_images)} removed images") | ||
# append replaced images into both old and new images. these will be dropped and re-added. | ||
new_images = new_images + replaced_images | ||
old_images = old_images + replaced_images | ||
|
||
# remove old images first | ||
if len(old_images)>0: | ||
representations = [rep for rep in representations if rep[0] not in old_images] | ||
if len(old_images) > 0: | ||
representations = [rep for rep in representations if rep["identity"] not in old_images] | ||
must_save_pickle = True | ||
|
||
# find representations for new images | ||
if len(new_images)>0: | ||
if len(new_images) > 0: | ||
representations += __find_bulk_embeddings( | ||
employees=new_images, | ||
model_name=model_name, | ||
|
@@ -158,7 +185,7 @@ def find( | |
align=align, | ||
normalization=normalization, | ||
silent=silent, | ||
) # add new images | ||
) # add new images | ||
must_save_pickle = True | ||
|
||
if must_save_pickle: | ||
|
@@ -176,10 +203,10 @@ def find( | |
|
||
# ---------------------------- | ||
# now, we got representations for facial database | ||
df = pd.DataFrame( | ||
representations, | ||
columns=df_cols, | ||
) | ||
df = pd.DataFrame(representations) | ||
|
||
if silent is False: | ||
logger.info(f"Searching {img_path} in {df.shape[0]} length datastore") | ||
|
||
# img path might have more than once face | ||
source_objs = detection.extract_faces( | ||
|
@@ -216,9 +243,9 @@ def find( | |
|
||
distances = [] | ||
for _, instance in df.iterrows(): | ||
source_representation = instance[f"{model_name}_representation"] | ||
source_representation = instance["embedding"] | ||
if source_representation is None: | ||
distances.append(float("inf")) # no representation for this image | ||
distances.append(float("inf")) # no representation for this image | ||
continue | ||
|
||
target_dims = len(list(target_representation)) | ||
|
@@ -230,21 +257,9 @@ def find( | |
+ " after pickle created. Delete the {file_name} and re-run." | ||
) | ||
|
||
if distance_metric == "cosine": | ||
distance = verification.find_cosine_distance( | ||
source_representation, target_representation | ||
) | ||
elif distance_metric == "euclidean": | ||
distance = verification.find_euclidean_distance( | ||
source_representation, target_representation | ||
) | ||
elif distance_metric == "euclidean_l2": | ||
distance = verification.find_euclidean_distance( | ||
verification.l2_normalize(source_representation), | ||
verification.l2_normalize(target_representation), | ||
) | ||
else: | ||
raise ValueError(f"invalid distance metric passes - {distance_metric}") | ||
distance = verification.find_distance( | ||
source_representation, target_representation, distance_metric | ||
) | ||
|
||
distances.append(distance) | ||
|
||
|
@@ -254,7 +269,7 @@ def find( | |
result_df["threshold"] = target_threshold | ||
result_df["distance"] = distances | ||
|
||
result_df = result_df.drop(columns=[f"{model_name}_representation"]) | ||
result_df = result_df.drop(columns=["embedding"]) | ||
# pylint: disable=unsubscriptable-object | ||
result_df = result_df[result_df["distance"] <= target_threshold] | ||
result_df = result_df.sort_values(by=["distance"], ascending=True).reset_index(drop=True) | ||
|
@@ -297,7 +312,7 @@ def __find_bulk_embeddings( | |
expand_percentage: int = 0, | ||
normalization: str = "base", | ||
silent: bool = False, | ||
): | ||
) -> List[Dict["str", Any]]: | ||
""" | ||
Find embeddings of a list of images | ||
|
||
|
@@ -323,15 +338,17 @@ def __find_bulk_embeddings( | |
|
||
silent (bool): enable or disable informative logging | ||
Returns: | ||
representations (list): pivot list of embeddings with | ||
image name and detected face area's coordinates | ||
representations (list): pivot list of dict with | ||
image name, hash, embedding and detected face area's coordinates | ||
""" | ||
representations = [] | ||
for employee in tqdm( | ||
employees, | ||
desc="Finding representations", | ||
disable=silent, | ||
): | ||
file_hash = package_utils.find_hash_of_file(employee) | ||
|
||
try: | ||
img_objs = detection.extract_faces( | ||
img_path=employee, | ||
|
@@ -342,15 +359,23 @@ def __find_bulk_embeddings( | |
align=align, | ||
expand_percentage=expand_percentage, | ||
) | ||
|
||
except ValueError as err: | ||
logger.error( | ||
f"Exception while extracting faces from {employee}: {str(err)}" | ||
) | ||
logger.error(f"Exception while extracting faces from {employee}: {str(err)}") | ||
img_objs = [] | ||
|
||
if len(img_objs) == 0: | ||
logger.warn(f"No face detected in {employee}. It will be skipped in detection.") | ||
representations.append((employee, None, 0, 0, 0, 0)) | ||
representations.append( | ||
{ | ||
"identity": employee, | ||
"hash": file_hash, | ||
"embedding": None, | ||
"target_x": 0, | ||
"target_y": 0, | ||
"target_w": 0, | ||
"target_h": 0, | ||
} | ||
) | ||
else: | ||
for img_obj in img_objs: | ||
img_content = img_obj["face"] | ||
|
@@ -365,13 +390,16 @@ def __find_bulk_embeddings( | |
) | ||
|
||
img_representation = embedding_obj[0]["embedding"] | ||
representations.append(( | ||
employee, | ||
img_representation, | ||
img_region["x"], | ||
img_region["y"], | ||
img_region["w"], | ||
img_region["h"] | ||
)) | ||
representations.append( | ||
{ | ||
"identity": employee, | ||
"hash": file_hash, | ||
"embedding": img_representation, | ||
"target_x": img_region["x"], | ||
"target_y": img_region["y"], | ||
"target_w": img_region["w"], | ||
"target_h": img_region["h"], | ||
} | ||
) | ||
|
||
return representations |
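For context, a minimal usage sketch of the find() flow touched by this PR (paths and model choices are illustrative; find() returns a list of pandas DataFrames):

```python
from deepface import DeepFace

# the first call builds the ds_<model>_<detector>_v2.pkl datastore inside db_path;
# later calls reuse it and only re-embed images whose file hash has changed
results = DeepFace.find(
    img_path="query.jpg",   # illustrative path
    db_path="facial_db",    # illustrative path
    model_name="Facenet",
    detector_backend="opencv",
    distance_metric="cosine",
)
print(results[0][["identity", "distance", "threshold"]].head())
```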
Imo this is quite dangerous.
It reads a whole image file into memory and eventually passes the entire content to the hashing function. With (maybe malicious) extremely large image files this could cause OOM, depending on available memory.
Maybe it is better to read and hash the file in chunks.
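A minimal sketch of that chunked approach (helper name and chunk size are just placeholders):

```python
# Hypothetical sketch: hash the file in fixed-size chunks so memory use stays bounded.
import hashlib


def find_hash_of_file_chunked(file_path: str, chunk_size: int = 64 * 1024) -> str:
    hasher = hashlib.sha1()
    with open(file_path, "rb") as f:
        # read and feed the hasher piece by piece instead of loading the whole file
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
```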
Do you recommend doing something like this: https://stackoverflow.com/a/64994148/7846405
Exactly.
As a bonus for safety, I would include a configurable filter in the dataset traversal to exclude files over a certain size threshold: other parts of deepface already load the whole file into memory with cv.imread() anyway.
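That size filter could be sketched like this (the threshold value and function name are illustrative assumptions):

```python
# Hypothetical sketch: skip files above a configurable size while traversing db_path.
import os


def iter_images_under_limit(db_path: str, max_bytes: int = 10 * 1024 * 1024):
    for root, _dirs, files in os.walk(db_path):
        for name in files:
            path = os.path.join(root, name)
            if os.path.getsize(path) > max_bytes:
                continue  # never even open suspiciously large files
            yield path
```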
Yeah, having a file-size threshold is wise. But I will hash file properties instead of the file content.