Only one model when aggregating (#17)

ternaus · Nov 25, 2023 · 991a143 · 991a143
1 parent 1bb9e95
commit 991a143
Show file tree

Hide file tree

Showing 7 changed files with 47 additions and 51 deletions.
diff --git a/README.md b/README.md
@@ -2,53 +2,50 @@
 
 ![](https://habrastorage.org/webt/gy/-1/xd/gy-1xdtfz3_i7xxt-nqzl4mfhuw.jpeg)
 
-A Python wrapper for the YoloV5Face model, providing easy-to-use functionalities for face detection in images.
+A user-friendly Python wrapper for the YoloV5Face model, designed to simplify face detection in images. This wrapper offers straightforward functionalities for quick integration into Python projects, along with customization options for handling various face detection scenarios.
 
 ## Installation
 
-Install the YoloV5Face wrapper using pip:
+Install the YoloV5Face wrapper using pip to easily incorporate it into your projects:
 
 ```bash
 pip install -U yolo5face
 ```
 
-## Inference
+## Face Detection: Standard and Enhanced
 
-Use the wrapper to quickly deploy face detection in your projects:
+The YoloV5Face wrapper supports both standard and enhanced face detection. The standard detection is suitable for most use cases, while the enhanced detection, which aggregates results over multiple target sizes, is ideal for images with faces of varying sizes.
+
+### Getting Started
+
+To detect faces in an image:
 
 ```python
 from yolo5face.get_model import get_model
 import cv2
 
-model = get_model("yolov5n", device=-1, target_size=512, min_face=24)
+# Initialize the model
+model = get_model("yolov5n", device=-1, min_face=24)
 
+# Load your image
 image = cv2.imread(<IMAGE_PATH>)
 image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 
-boxes, key_points, scores = model(image)
-```
-
-* **device**: Specify device `cpu`, `cuda`, `mps` or integer for the number of cuda device.
-* **target_size**: The minimum size of the target image for detection.
-* **min_face**: The minimum face size in pixels. Faces smaller than this value will be ignored.
-
-## Enhanced Detection with Aggregated Target Sizes
-
-In addition to standard detection, this wrapper supports enhanced detection capabilities by aggregating results over multiple target sizes. This feature is especially useful in scenarios where face sizes vary significantly within the same image.
+# Standard Detection
+boxes, key_points, scores = model(image, target_size=512)
 
-To use this feature:
-
-```bash
-from yolo5face.get_model import get_model
+# Enhanced Detection (aggregating over multiple target sizes)
+enhanced_boxes, enhanced_key_points, enhanced_scores = model(image, target_size=[320, 640, 1280])
+```
 
-model = get_model("yolov5n", device=-1, target_size=[320, 640, 1280], min_face=24)
+Parameters:
 
-# Aggregate detections over the specified target sizes
-boxes, key_points, scores = model(image)
-```
+* **device**: Set the processing device (cpu, cuda, mps, or CUDA device number).
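+* **target_size**: Target length of the image's shorter side, passed when calling the model. Images whose shorter side exceeds this value are downscaled to it before detection. Pass a single integer for standard detection, or a list of integers (e.g. `[320, 640, 1280]`) for enhanced detection, which runs the detector once per size.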
+* **min_face**: Minimum size of faces to detect in pixels. Smaller faces will be ignored.
 
-This approach leverages multiple detections at different scales, followed by Non-Maximum Suppression, to provide a more comprehensive set of detections.
+Enhanced detection runs the detector once for each of the given target sizes and then applies Non-Maximum Suppression to the combined results to merge overlapping detections. Standard detection uses a single target size and skips this aggregation step.
 
 ## License
 
-This YoloV5Face wrapper is released under the [MIT License](LICENSE)
+This YoloV5Face wrapper is released under the [MIT License](LICENSE).
diff --git a/tests/aggregate_test.py b/tests/aggregate_test.py
@@ -4,7 +4,7 @@
 from tests.conftest import test_images as images
 from yolo5face.get_model import get_model
 
-model = get_model("yolov5n", device="cpu", target_size=[512, 1024])
+model = get_model("yolov5n", device="cpu")
 
 
 @mark.parametrize(
@@ -15,7 +15,7 @@
     ],
 )
 def test_face_detection(image, face):
-    boxes, points = model(image)[:2]
+    boxes, points = model(image, target_size=[512, 1024])[:2]
 
     for box_id, box in enumerate(boxes):
         assert len(DeepDiff(box, face[box_id]["box"])) == 0

diff --git a/tests/model_test.py b/tests/model_test.py
@@ -4,7 +4,7 @@
 from tests.conftest import test_images as images
 from yolo5face.get_model import get_model
 
-model = get_model("yolov5n", device="cpu", target_size=512)
+model = get_model("yolov5n", device="cpu")
 
 
 @mark.parametrize(
@@ -15,7 +15,7 @@
     ],
 )
 def test_face_detection(image, face):
-    boxes, points = model(image)[:2]
+    boxes, points = model(image, target_size=512)[:2]
 
     for box_id, box in enumerate(boxes):
         assert len(DeepDiff(box, face[box_id]["box"])) == 0

diff --git a/yolo5face/__init__.py b/yolo5face/__init__.py
@@ -1 +1 @@
-__version__ = "0.0.8"
+__version__ = "0.0.9"
diff --git a/yolo5face/get_model.py b/yolo5face/get_model.py
@@ -28,7 +28,6 @@ def get_file_name(url: str) -> str:
 def get_model(
     model_name: str,
     device: str,
-    target_size: int,
     min_face: int = 24,
     weights_path: str = "~/.torch/models",
 ) -> YoloDetectorAggregator:
@@ -57,7 +56,6 @@ def get_model(
         device = torch.device("cpu")
 
     return YoloDetectorAggregator(
-        target_sizes=target_size,
         min_face=min_face,
         device=device,
         weights_name=weight_file_path,

diff --git a/yolo5face/yoloface/YoloDetectorAggregator.py b/yolo5face/yoloface/YoloDetectorAggregator.py
@@ -8,27 +8,32 @@
 
 
 class YoloDetectorAggregator:
-    def __init__(self, target_sizes: int | list[int], **yolo_args: Any) -> None:
+    def __init__(self, **yolo_args: Any) -> None:
         self.yolo_args = yolo_args
-        self.target_sizes = target_sizes if isinstance(target_sizes, list) else [target_sizes]
+        self.detector = YoloDetector(**self.yolo_args)
 
     def nms(self, boxes: torch.Tensor, scores: torch.Tensor, iou_threshold: float = 0.5) -> torch.Tensor:
         """Applies Non-Maximum Suppression (NMS) to filter boxes."""
         return torch.ops.torchvision.nms(boxes.type(torch.float), scores.type(torch.float), iou_threshold)
 
-    def __call__(self, image: np.ndarray) -> tuple[list[BoxType], list[KeypointType], list[float]]:
+    def __call__(
+        self,
+        image: np.ndarray,
+        target_size: int | list[int],
+    ) -> tuple[list[BoxType], list[KeypointType], list[float]]:
         all_boxes, all_keypoints, all_scores = [], [], []
 
-        for size in self.target_sizes:
-            detector = YoloDetector(target_size=size, **self.yolo_args)
+        if isinstance(target_size, int):
+            target_size = [target_size]
 
-            boxes, keypoints, scores = detector(image)
+        for size in target_size:
+            boxes, keypoints, scores = self.detector(image, size)
 
             all_boxes.extend(boxes)
             all_keypoints.extend(keypoints)
             all_scores.extend(scores)
 
-        if len(self.target_sizes) > 1:
+        if len(target_size) > 1:
             # Perform aggregation with NMS if multiple target sizes are used
             return self.aggregate_predictions(all_boxes, all_keypoints, all_scores)
 

diff --git a/yolo5face/yoloface/face_detector.py b/yolo5face/yoloface/face_detector.py
@@ -24,20 +24,15 @@ def __init__(
         config_name: str,
         device: torch.device,
         min_face: int,
-        target_size: int | None = None,
     ):
         """
         weights_name: name of file with network weights in weights/ folder.
         config_name: name of .yaml config with network configuration from models/ folder.
         gpu : gpu number (int) or -1 or string for cpu.
         min_face : minimal face size in pixels.
-        target_size : target size of smaller image axis (choose lower for faster work). e.g. 480, 720, 1080.
-                    None for original resolution.
-
         """
         self._class_path = Path(__file__).parent.absolute()
         self.device = device
-        self.target_size = target_size
         self.min_face = min_face
 
         self.detector = self.init_detector(weights_name, config_name)
@@ -48,17 +43,16 @@ def init_detector(self, weights_name: str, config_name: str) -> nn.Module:
         detector.load_state_dict(state_dict)
         return detector.to(self.device).float().eval()
 
-    def _preprocess(self, imgs: list[np.ndarray]) -> torch.Tensor:
+    def _preprocess(self, imgs: list[np.ndarray], target_size: int) -> torch.Tensor:
         """
         Preprocessing image before passing through the network. Resize and conversion to torch tensor.
         """
         pp_imgs = []
         for img in imgs:
             h0, w0 = img.shape[:2]  # orig hw
-            if self.target_size:
-                r = self.target_size / min(h0, w0)  # resize image to img_size
-                if r < 1:
-                    img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=cv2.INTER_AREA)
+            r = target_size / min(h0, w0)  # resize image to img_size
+            if r < 1:
+                img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=cv2.INTER_AREA)
 
             imgsz = check_img_size(max(img.shape[:2]), s=int(self.detector.stride.max()))  # check img_size
             img = letterbox(img, new_shape=imgsz)[0]
@@ -120,6 +114,7 @@ def _postprocess(
     def predict(
         self,
         imgs: np.ndarray | list[np.ndarray],
+        target_size: int,
         conf_thres: float = 0.7,
         iou_thres: float = 0.5,
     ) -> tuple[list[BoxType], list[KeypointType], list[float]]:
@@ -138,7 +133,7 @@ def predict(
 
         origimgs = copy.deepcopy(images)
 
-        images = self._preprocess(images)
+        images = self._preprocess(images, target_size)
 
         with torch.inference_mode():  # change this with torch.no_grad() for pytorch <1.8 compatibility
             pred = self.detector(images)[0]
@@ -148,7 +143,8 @@ def predict(
     def __call__(
         self,
         imgs: np.ndarray | list[np.ndarray],
+        target_size: int,
         conf_thres: float = 0.7,
         iou_thres: float = 0.5,
     ) -> tuple[list[BoxType], list[KeypointType], list[float]]:
-        return self.predict(imgs, conf_thres, iou_thres)
+        return self.predict(imgs, target_size, conf_thres, iou_thres)