generated from Hazel/python-project
feat: cleaner detection
@@ -1,11 +1,15 @@
+from __future__ import annotations
 from pathlib import Path
 import urllib.request
 from typing import Dict, List
 import json
+from dataclasses import dataclass
 
 from ultralytics import YOLO
 import cv2
 import numpy as np
+from scipy.optimize import minimize
+from scipy.spatial.transform import Rotation as R
 
 
 MODEL_PATH = Path("assets", "models")
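Note: the new imports pull scipy into the module's third-party dependencies. Assuming the usual PyPI package names for these imports, a minimal environment would be:

    pip install ultralytics opencv-python numpy scipy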
@@ -38,133 +42,402 @@ def require_net(name: str):
         )
 
 
-def detect_human_parts(human: dict):
-    parts = human["parts"]
+# Thresholds for face keypoint distances (these might need adjustment);
+# currently referenced only by the disabled ratio-validation block further down
+EYE_RATIO_THRESHOLD = 0.25
+NOSE_EYE_RATIO_THRESHOLD = 0.2
+EAR_NOSE_RATIO_THRESHOLD = 1.2
+
+
+@dataclass
+class Keypoint:
+    x: float
+    y: float
+    name: str
+    confidence: float = 0
+
+    @property
+    def point(self):
+        return (int(self.x), int(self.y))
+
+    def get_distance(self, other: Keypoint) -> float:
+        return np.sqrt((self.x - other.x) ** 2 + (self.y - other.y) ** 2)
+
+
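A quick sketch of how the new Keypoint helper is meant to be used (the values here are invented):

    nose = Keypoint(x=120.0, y=80.0, name="nose", confidence=0.9)
    left_eye = Keypoint(x=135.0, y=70.0, name="left_eye", confidence=0.8)
    nose.point                   # (120, 80), ready for cv2 drawing calls
    nose.get_distance(left_eye)  # sqrt(15**2 + 10**2), about 18.03 px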
+def detect_human_parts(human: dict, face_padding: int = 20):
+    parts = human["parts"]
+
     to_detect = human["crop"]["file"]
     _p = Path(to_detect)
     detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
     boxes_file = str(_p.with_name(_p.stem + "_boxes.json"))
     print(f"detecting human parts: {to_detect} => {detected}")
 
 
+    def apply_rotation(rot_matrix, points):
+        # Apply the 2x2 rotation to an (N, 2) array of points
+        return np.dot(rot_matrix, points.T).T
+
+    def linearize_pairwise_distances(points, target_distances):
+        # Pairwise distances between the points, normalized so they sum to 1,
+        # then compared against the target matrix as a relative error.
+        num_points = len(points)
+        pairwise_distances = np.zeros((num_points, num_points))
+        for i in range(num_points):
+            for j in range(i, num_points):
+                pairwise_distances[i, j] = np.linalg.norm(points[i] - points[j])
+                pairwise_distances[j, i] = pairwise_distances[i, j]  # symmetric matrix
+
+        total_distance = np.sum(pairwise_distances)
+        normed_distances = pairwise_distances / total_distance
+
+        # The diagonal is 0 / 0 = nan; callers ignore it via np.nansum
+        return np.abs(normed_distances - target_distances) / target_distances
+
+    def objective(params, original_points, target_distances):
+        # Interpret params as an axis-angle rotation vector; the top-left 2x2
+        # block of the 3D rotation matrix projects the rotation onto the
+        # image plane (out-of-plane components foreshorten distances)
+        rot = R.from_rotvec(params)
+        rotation_matrix = rot.as_matrix()[:2, :2]
+
+        # Apply the rotation to the original points
+        rotated_points = apply_rotation(rotation_matrix, original_points)
+
+        # Total relative divergence from the expected face geometry
+        divergence = linearize_pairwise_distances(rotated_points, target_distances)
+        return np.nansum(divergence)
+
+    def optimize_rotation(original_points, relative_face_matrix):
+        # Initial guess: zero rotation vector (no rotation)
+        initial_params = np.zeros(3)
+
+        # Minimize the divergence between observed and expected geometry
+        result = minimize(objective, initial_params, args=(original_points, relative_face_matrix), method='BFGS')
+
+        return result.x  # Rotation vector (axis-angle)
+
+    def apply_optimized_rotation(rotation_vector, original_points):
+        # Same image-plane projection as in objective()
+        rot = R.from_rotvec(rotation_vector)
+        rotation_matrix = rot.as_matrix()[:2, :2]
+
+        # Apply the rotation to the points
+        return apply_rotation(rotation_matrix, original_points)
+
+
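Why optimize over a 3D rotation vector when the keypoints are 2D? Taking the top-left 2x2 block of the 3D rotation matrix projects the rotated face onto the image plane: a pure in-plane (z-axis) rotation preserves every pairwise distance, while yaw or pitch components foreshorten them, which is what lets the objective detect a turned head. A self-contained sketch of that effect (not part of the commit):

    import numpy as np
    from scipy.spatial.transform import Rotation as R

    pts = np.array([[0.0, 0.0], [30.0, 0.0], [15.0, 20.0]])

    def total(p):
        # Sum of all pairwise distances in the point set
        return sum(np.linalg.norm(a - b) for a in p for b in p)

    roll = R.from_rotvec([0, 0, 0.5]).as_matrix()[:2, :2]  # in-plane rotation
    yaw = R.from_rotvec([0, 0.5, 0]).as_matrix()[:2, :2]   # out-of-plane rotation
    print(total(pts))           # baseline
    print(total(pts @ roll.T))  # identical: an in-plane rotation is an isometry
    print(total(pts @ yaw.T))   # smaller: x distances shrink by cos(0.5)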
+    # Expected pairwise distances between the five facial keypoints
+    # (nose, left eye, right eye, left ear, right ear), normalized so the
+    # whole matrix sums to 1
+    relative_face_matrix = np.array([
+        [0.        , 0.02243309, 0.02243309, 0.05016191, 0.05016191],
+        [0.02243309, 0.        , 0.04012953, 0.04486618, 0.07234453],
+        [0.02243309, 0.04012953, 0.        , 0.07234453, 0.04486618],
+        [0.05016191, 0.04486618, 0.07234453, 0.        , 0.08025906],
+        [0.05016191, 0.07234453, 0.04486618, 0.08025906, 0.        ]
+    ])
+
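If the matrix ever needs re-deriving, it can be computed from any reference set of the five keypoints with the same normalization linearize_pairwise_distances applies; a sketch with a hypothetical reference layout:

    import numpy as np

    # Hypothetical reference layout: nose, left eye, right eye, left ear, right ear
    ref = np.array([[0.0, 0.0], [-15.0, -10.0], [15.0, -10.0], [-35.0, 0.0], [35.0, 0.0]])
    dist = np.linalg.norm(ref[:, None, :] - ref[None, :, :], axis=-1)
    relative = dist / dist.sum()  # normalized like relative_face_matrix above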
     model = YOLO('yolov8n-pose.pt')  # You can also try 'yolov8s-pose.pt' for better accuracy
 
     results = model(to_detect)[0]
 
     image = cv2.imread(to_detect)
 
-    did_detect = False
     for person in results.keypoints.data:
         keypoints = person.cpu().numpy()
 
-        # Common keypoints: 0=nose, 5=left_shoulder, 11=left_hip, 15=left_foot
-        head = tuple(map(int, keypoints[0][:2]))
-        foot = tuple(map(int, keypoints[15][:2]))
-
-        cv2.circle(image, head, 5, (255, 0, 0), -1)   # Head in blue
-        cv2.circle(image, foot, 5, (0, 0, 255), -1)   # Foot in red
-        did_detect = True
-
-    if did_detect:
-        cv2.imwrite(detected, image)
+        print("#" * 50)
+
+        # First five COCO keypoints: nose, left eye, right eye, left ear, right ear
+        original_points = np.array([[k[0], k[1]] for k in keypoints[:5]])
+
+        # Skip this person if every face keypoint is missing (all zeros)
+        if not np.any(original_points):
+            continue
 
+        rotation_vector = optimize_rotation(original_points, relative_face_matrix)
+        optimized_points = apply_optimized_rotation(rotation_vector, original_points)
+        optimized_distances = linearize_pairwise_distances(optimized_points, relative_face_matrix)
+
+        # Indices of the points that seem likely to be correct: a point is
+        # trusted when its relative error stays below 100% against at least
+        # three of the other four points (the nan diagonal never counts)
+        success_points = []
+        for i in range(5):
+            s_count = 0
+            for j in range(5):
+                d = np.abs(optimized_distances[i][j])
+                if d < 1:
+                    s_count += 1
+
+            if s_count > 2:
+                success_points.append(i)
+
+        # Draw the raw keypoints in red (BGR) for debugging
+        for point in original_points:
+            cv2.circle(image, (int(point[0]), int(point[1])), 4, (0, 0, 255), -1)
+
+        if len(success_points) < 1:
+            continue
+        valid_face = len(success_points) >= 3
+
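The d < 1 test accepts an entry when the rotated distance deviates from the expected one by less than 100% of the expected value; the nan diagonal never passes because nan < 1 is False. Each row has four comparable entries, so s_count > 2 demands agreement with at least three of the other four points. A toy row (values invented) to illustrate:

    import numpy as np

    row = np.array([np.nan, 0.3, 0.8, 2.5, 0.6])  # relative errors vs. the other points
    s_count = int(np.sum(np.abs(row) < 1))        # nan and 2.5 fail, so 3
    print(s_count > 2)                            # True: this point would be trusted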
+        clean_points = []
+
+        # Reconstruct disregarded points by averaging estimates stepped off
+        # each trusted point along the expected direction and distance.
+        # relative_face_matrix holds *normalized* distances (the matrix sums
+        # to 1), so they must be rescaled to pixel units first; the scale is
+        # estimated from the trusted points.
+        scale = 1.0
+        if len(success_points) >= 2:
+            pixel_sum = sum(
+                np.linalg.norm(original_points[a] - original_points[b])
+                for a in success_points for b in success_points
+            )
+            relative_sum = sum(
+                relative_face_matrix[a][b]
+                for a in success_points for b in success_points
+            )
+            if relative_sum > 0:
+                scale = pixel_sum / relative_sum
+
+        for i in range(5):
+            if i not in success_points:
+                weighted_sum = np.zeros(2)
+                total_weight = 0.0
+                for j in success_points:
+                    direction = original_points[j] - original_points[i]
+                    norm = np.linalg.norm(direction)
+                    if norm > 0:
+                        direction = direction / norm
+                    estimated_distance = relative_face_matrix[i][j] * scale
+                    estimate = original_points[j] - direction * estimated_distance
+                    weighted_sum += estimate
+                    total_weight += 1
+                if total_weight > 0:
+                    clean_points.append(weighted_sum / total_weight)
+            else:
+                clean_points.append(original_points[i])
+
+        clean_points = np.array(clean_points)
+
+        # Calculate a face bounding box from clean_points
+        realistic_aspect_ratio = 2 / 3  # width / height
+
+        x_coords = clean_points[:, 0]
+        y_coords = clean_points[:, 1]
+
+        min_x = np.min(x_coords)
+        max_x = np.max(x_coords)
+        min_y = np.min(y_coords)
+        max_y = np.max(y_coords)
+
+        # Face-like padding: more space top & bottom than at the sides
+        width = max_x - min_x
+        height = max_y - min_y
+
+        # Grow the keypoint extent to a box with the target aspect ratio
+        normalized_bounding_size = max(width, height * realistic_aspect_ratio)
+        real_width = normalized_bounding_size
+        real_height = normalized_bounding_size / realistic_aspect_ratio
+
+        padding_x = width * 0.7 + (real_width - width) / 2
+        padding_y_top = height * 2 + (real_height - height) / 2
+        padding_y_bottom = height * 1.7 + (real_height - height) / 2
+
+        face_box_x1 = int(min_x - padding_x)
+        face_box_y1 = int(min_y - padding_y_top)
+        face_box_x2 = int(max_x + padding_x)
+        face_box_y2 = int(max_y + padding_y_bottom)
+
+        face_bounding_box = (face_box_x1, face_box_y1, face_box_x2, face_box_y2)
+
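Worked through with round numbers to show the intent of the padding: the keypoint extent is first grown to the 2:3 target aspect, then padded asymmetrically so the box covers the whole head rather than just the eye/nose/ear span. A sketch using the tuned constants (0.7, 2, 1.7) from the code above:

    width, height = 40.0, 30.0                         # keypoint extent in px
    size = max(width, height * (2 / 3))                # -> 40.0
    real_w, real_h = size, size / (2 / 3)              # -> 40.0, 60.0
    pad_x = width * 0.7 + (real_w - width) / 2         # -> 28.0
    pad_top = height * 2 + (real_h - height) / 2       # -> 75.0
    pad_bottom = height * 1.7 + (real_h - height) / 2  # -> 66.0
    # final box: 96 x 171 px around the 40 x 30 keypoint extent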
+        # Cyan box (BGR) for a tentative face, green for a validated one
+        color = (255, 255, 0)
+        if valid_face:
+            color = (0, 255, 0)
+
+        cv2.rectangle(image, (face_box_x1, face_box_y1), (face_box_x2, face_box_y2), color, 2)
+        for point in clean_points:
+            cv2.circle(image, (int(point[0]), int(point[1])), 4, color, -1)
+
+        print("\nOriginal points:")
+        print(original_points)
+        print("\nOriginal pairwise distances:")
+        print(linearize_pairwise_distances(original_points, relative_face_matrix))
+        print(f"Optimized rotation vector (axis-angle): {rotation_vector}")
+        print("\nOptimized points after rotation:")
+        print(optimized_points)
+        print("\nOptimized pairwise distances:")
+        print(optimized_distances)
+        print(success_points)
+        print(clean_points)
+
+        # Earlier ratio-based validation, kept here (disabled) for reference:
+        """
+        for idx in face_indices:
+            x, y, conf = keypoints[idx]
+            name = keypoint_names[idx]
+            if conf > 0.3:
+                face_points.append((x, y))
+
+                point = (int(x), int(y))
+                name = keypoint_names[idx]
+                cv2.circle(image, point, 4, (0, 255, 0), -1)
+                cv2.putText(image, name, (point[0] + 5, point[1] + 5),
+                cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), 1)
+        """
+
+        """
+        nose, left_eye, right_eye, left_ear, right_ear = face_points
+        print(face_points)
+
+        # Calculate pairwise distances
+        nose_to_left_eye = euclidean_distance(nose, left_eye)
+        nose_to_right_eye = euclidean_distance(nose, right_eye)
+        eyes_distance = euclidean_distance(left_eye, right_eye)
+        left_ear_to_nose = euclidean_distance(left_ear, nose)
+        right_ear_to_nose = euclidean_distance(right_ear, nose)
+
+        # Relative distances
+        eye_to_eye_ratio = eyes_distance / (left_ear_to_nose + right_ear_to_nose)  # Eyes vs. nose-to-ears
+        nose_to_eye_ratio = (nose_to_left_eye + nose_to_right_eye) / (left_ear_to_nose + right_ear_to_nose)  # Nose-to-eye vs. ear-to-nose
+        ear_to_nose_ratio = (left_ear_to_nose + right_ear_to_nose) / 2  # Ear-to-nose proportionality
+
+        # Validate using relative distances
+        if not (EYE_RATIO_THRESHOLD < eye_to_eye_ratio < 0.5):  # Arbitrary ratio threshold
+            print("⚠️ Rejected due to unrealistic eye-to-eye ratio:", eye_to_eye_ratio)
+            has_valid_face = False
+
+        if not (NOSE_EYE_RATIO_THRESHOLD < nose_to_eye_ratio < 0.4):  # Arbitrary ratio threshold
+            print("⚠️ Rejected due to unrealistic nose-to-eye ratio:", nose_to_eye_ratio)
+            has_valid_face = False
+
+        if not (0.5 < ear_to_nose_ratio < EAR_NOSE_RATIO_THRESHOLD):
+            print("⚠️ Rejected due to unrealistic ear-to-nose ratio:", ear_to_nose_ratio)
+            has_valid_face = False
+
+        # If all checks pass, calculate the bounding box
+        xs, ys, _ = zip(*face_points)
+        x_min, x_max = int(min(xs)), int(max(xs))
+        y_min, y_max = int(min(ys)), int(max(ys))
+
+        x_min = max(x_min - face_padding, 0)
+        y_min = max(y_min - face_padding, 0)
+        x_max = min(x_max + face_padding, image.shape[1])
+        y_max = min(y_max + face_padding, image.shape[0])
+
+        # Compute box size
+        box_w = x_max - x_min
+        box_h = y_max - y_min
+
+        if has_valid_face:
+            cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
+        else:
+            cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)
+
+        for i, (x, y, conf) in enumerate(keypoints):
+            point = (int(x), int(y))
+            name = keypoint_names[i]
+            # cv2.circle(image, point, 4, (0, 255, 0), -1)
+            # cv2.putText(image, name, (point[0] + 5, point[1] - 5),
+            # cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), 1)
+
+        # cv2.circle(image, head, 5, (255, 0, 0), -1)   # Head in blue
+        # cv2.circle(image, foot, 5, (0, 0, 255), -1)   # Foot in red
+        """
+
+    cv2.imwrite(detected, image)
+
+
-def detect_humans(to_detect: str, crop_padding: int = 20):
+def detect_humans(to_detect: str, crop_padding: int = 20, skip_detection_if_present: bool = True):
     _p = Path(to_detect)
     detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
     boxes_file = str(_p.with_name(_p.stem + "_boxes.json"))
     print(f"detecting humans: {to_detect} => {detected}")
 
-    require_net("yolov3")
-
-    # Load YOLO
-    net = cv2.dnn.readNet(str(MODEL_PATH / 'yolov3.weights'), str(MODEL_PATH / 'yolov3.cfg'))
-    layer_names = net.getLayerNames()
-    indices = net.getUnconnectedOutLayers()
-    output_layers = [layer_names[int(i) - 1] for i in indices]
-
-    # Load image
-    image = cv2.imread(to_detect)
-    original_image = cv2.imread(to_detect)
-    height, width, channels = image.shape
-
-    # Create blob and do forward pass
-    blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
-    net.setInput(blob)
-    outs = net.forward(output_layers)
-
-    boxes = []
-    confidences = []
-
-    # Information for each object detected
-    for out in outs:
-        for detection in out:
-            scores = detection[5:]
-            class_id = np.argmax(scores)
-            confidence = scores[class_id]
-            if confidence > 0.5 and class_id == 0:  # Class ID 0 is human
-                center_x = int(detection[0] * width)
-                center_y = int(detection[1] * height)
-                w = int(detection[2] * width)
-                h = int(detection[3] * height)
-                x = int(center_x - w / 2)
-                y = int(center_y - h / 2)
-
-                boxes.append([x, y, w, h])
-                confidences.append(float(confidence))
-
-    # Apply Non-Maximum Suppression
-    indices = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.4)
-
     boxes_structures = {}
     human_boxes = boxes_structures["humans"] = []
 
-    human_part_folder = _p.with_name(_p.stem + "_parts")
-    human_part_folder.mkdir(exist_ok=True)
-
-    for i in indices:
-        i = i[0] if isinstance(i, (list, np.ndarray)) else i  # Flatten index if needed
-        x, y, w, h = boxes[i]
-
-        human_part_image_path = human_part_folder / (_p.stem + "_" + str(i) + _p.suffix)
-
-        image_height, image_width = image.shape[:2]
-
-        # Compute safe crop coordinates with padding
-        x1 = max(x - crop_padding, 0)
-        y1 = max(y - crop_padding, 0)
-        x2 = min(x + w + crop_padding, image_width)
-        y2 = min(y + h + crop_padding, image_height)
-        human_crop = original_image[y1:y2, x1:x2]
-
-        cv2.imwrite(str(human_part_image_path), human_crop)
-
-        print(f"\tfound human at {x}/{y} with the size of {w} x {h}")
-        human_boxes.append({
-            "x": x,
-            "y": y,
-            "w": w,
-            "h": h,
-            "crop": {
-                "file": str(human_part_image_path),
-                "x": x1,
-                "y": y,
-                "w": x2 - x1,
-                "h": y2 - y1,
-            },
-            "parts": {},
-        })
-
-        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 255), 2)
-
-    # Save the result
-    with open(boxes_file, "w") as f:
-        json.dump(boxes_structures, f)
-    cv2.imwrite(detected, image)
+    if not (Path(boxes_file).exists() and skip_detection_if_present):
+        require_net("yolov3")
+
+        # Load YOLO
+        net = cv2.dnn.readNet(str(MODEL_PATH / 'yolov3.weights'), str(MODEL_PATH / 'yolov3.cfg'))
+        layer_names = net.getLayerNames()
+        indices = net.getUnconnectedOutLayers()
+        output_layers = [layer_names[int(i) - 1] for i in indices]
+
+        # Load image
+        image = cv2.imread(to_detect)
+        original_image = cv2.imread(to_detect)
+        height, width, channels = image.shape
+
+        # Create blob and do forward pass
+        blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
+        net.setInput(blob)
+        outs = net.forward(output_layers)
+
+        boxes = []
+        confidences = []
+
+        # Information for each object detected
+        for out in outs:
+            for detection in out:
+                scores = detection[5:]
+                class_id = np.argmax(scores)
+                confidence = scores[class_id]
+                if confidence > 0.5 and class_id == 0:  # Class ID 0 is human
+                    center_x = int(detection[0] * width)
+                    center_y = int(detection[1] * height)
+                    w = int(detection[2] * width)
+                    h = int(detection[3] * height)
+                    x = int(center_x - w / 2)
+                    y = int(center_y - h / 2)
+
+                    boxes.append([x, y, w, h])
+                    confidences.append(float(confidence))
+
+        # Apply Non-Maximum Suppression
+        indices = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.4)
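cv2.dnn.NMSBoxes expects [x, y, w, h] boxes and returns the indices of the ones that survive suppression; depending on the OpenCV version the result is a flat array or an (N, 1) array, which is why the loop that follows flattens each index. A standalone sketch with toy boxes:

    import cv2

    boxes = [[10, 10, 100, 200], [12, 8, 100, 205], [300, 40, 80, 160]]
    scores = [0.9, 0.6, 0.8]
    keep = cv2.dnn.NMSBoxes(boxes, scores, score_threshold=0.5, nms_threshold=0.4)
    # the two heavily overlapping boxes collapse into one: keep is roughly [0, 2]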
+
+        human_part_folder = _p.with_name(_p.stem + "_parts")
+        human_part_folder.mkdir(exist_ok=True)
+
+        for i in indices:
+            i = i[0] if isinstance(i, (list, np.ndarray)) else i  # Flatten index if needed
+            x, y, w, h = boxes[i]
+
+            human_part_image_path = human_part_folder / (_p.stem + "_" + str(i) + _p.suffix)
+
+            image_height, image_width = image.shape[:2]
+
+            # Compute safe crop coordinates with padding
+            x1 = max(x - crop_padding, 0)
+            y1 = max(y - crop_padding, 0)
+            x2 = min(x + w + crop_padding, image_width)
+            y2 = min(y + h + crop_padding, image_height)
+            human_crop = original_image[y1:y2, x1:x2]
+
+            cv2.imwrite(str(human_part_image_path), human_crop)
+
+            print(f"\tfound human at {x}/{y} with the size of {w} x {h}")
+            human_boxes.append({
+                "x": x,
+                "y": y,
+                "w": w,
+                "h": h,
+                "crop": {
+                    "file": str(human_part_image_path),
+                    "x": x1,
+                    "y": y1,  # padded crop origin, matching "x": x1
+                    "w": x2 - x1,
+                    "h": y2 - y1,
+                },
+                "parts": {},
+            })
+
+            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 255), 2)
+
+        # Save the result
+        with open(boxes_file, "w") as f:
+            json.dump(boxes_structures, f)
+        cv2.imwrite(detected, image)
+
+    else:
+        # Reuse the cached boxes from a previous run
+        with open(boxes_file, "r") as f:
+            boxes_structures = json.load(f)
+            human_boxes = boxes_structures["humans"]
+
 
     for human in human_boxes:
-        detect_human_parts(human["crop"]["file"])
+        detect_human_parts(human)
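End-to-end, the entry point stays detect_humans; with the new caching flag a second run reuses the saved _boxes.json instead of re-running YOLOv3, and only the pose-based part detection repeats. Typical use (the image path is hypothetical):

    detect_humans("shots/frame_0042.png")   # full detection, writes *_boxes.json and *_parts/
    detect_humans("shots/frame_0042.png")   # reuses the cached boxes
    detect_humans("shots/frame_0042.png", skip_detection_if_present=False)  # force re-detection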