generated from Hazel/python-project
feat: cleaner detection
This commit is contained in:
parent bcfc90acdf
commit ad8f3b8e66
@@ -1,11 +1,15 @@
from __future__ import annotations

from pathlib import Path
import urllib.request
from typing import Dict, List
import json
from dataclasses import dataclass

from ultralytics import YOLO
import cv2
import numpy as np
from scipy.optimize import minimize
from scipy.spatial.transform import Rotation as R


MODEL_PATH = Path("assets", "models")
@@ -38,133 +42,402 @@ def require_net(name: str):
    )
@dataclass
class Keypoint:
    x: float
    y: float
    name: str
    confidence: float = 0

    @property
    def point(self):
        return (int(self.x), int(self.y))

    def get_distance(self, other: Keypoint) -> float:
        return np.sqrt((self.x - other.x) ** 2 + (self.y - other.y) ** 2)

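# Illustrative usage of the Keypoint helper (hypothetical values, not part of
# the diff):
#
#   nose = Keypoint(x=120.0, y=80.0, name="nose", confidence=0.9)
#   left_eye = Keypoint(x=135.0, y=68.0, name="left_eye", confidence=0.8)
#   nose.point                    # -> (120, 80)
#   nose.get_distance(left_eye)   # -> Euclidean distance, ~19.2
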
def detect_human_parts(human: dict, face_padding: int = 20):
    parts = human["parts"]

    to_detect = human["crop"]["file"]
    _p = Path(to_detect)
    detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
    boxes_file = str(_p.with_name(_p.stem + "_boxes.json"))
    print(f"detecting human parts: {to_detect} => {detected}")
    def apply_rotation(rot_matrix, points):
        # Apply the rotation to the points, assuming points are 2D coordinates (flattened)
        return np.dot(rot_matrix, points.T).T

    def linearize_pairwise_distances(points, target_distances):
        # Calculate pairwise distances between the points
        num_points = len(points)
        pairwise_distances = np.zeros((num_points, num_points))
        for i in range(num_points):
            for j in range(i, num_points):
                pairwise_distances[i, j] = np.linalg.norm(points[i] - points[j])
                pairwise_distances[j, i] = pairwise_distances[i, j]  # symmetric matrix

        total_distance = np.sum(pairwise_distances)
        normed_distances = pairwise_distances / total_distance

        # Relative deviation of each normalized pair distance from the target
        return np.abs(normed_distances - target_distances) / target_distances

    def objective(params, original_points, target_distances):
        # Convert params to an axis-angle representation (rotation vector)
        rot = R.from_rotvec(params)
        rotation_matrix = rot.as_matrix()[:2, :2]  # 2D rotation matrix (2x2)

        # Apply the rotation to the original points
        rotated_points = apply_rotation(rotation_matrix, original_points)

        # Compute the pairwise distances for the rotated points
        divergence = linearize_pairwise_distances(rotated_points, target_distances)
        return np.nansum(divergence)

    def optimize_rotation(original_points, relative_face_matrix):
        # Initial guess: rotation vector (zero rotation)
        initial_params = np.zeros(3)

        # Perform the optimization to minimize the divergence
        result = minimize(objective, initial_params, args=(original_points, relative_face_matrix), method='BFGS')

        return result.x  # Rotation vector (axis-angle)

    def apply_optimized_rotation(rotation_vector, original_points):
        # Convert the rotation vector to a rotation matrix (2D)
        rot = R.from_rotvec(rotation_vector)
        rotation_matrix = rot.as_matrix()[:2, :2]  # 2D rotation matrix (2x2)

        # Apply the rotation to the points
        return apply_rotation(rotation_matrix, original_points)
    relative_face_matrix = np.array([
        [0.        , 0.02243309, 0.02243309, 0.05016191, 0.05016191],
        [0.02243309, 0.        , 0.04012953, 0.04486618, 0.07234453],
        [0.02243309, 0.04012953, 0.        , 0.07234453, 0.04486618],
        [0.05016191, 0.04486618, 0.07234453, 0.        , 0.08025906],
        [0.05016191, 0.07234453, 0.04486618, 0.08025906, 0.        ]
    ])
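    # The matrix above encodes the expected normalized pairwise distances
    # between the five COCO face keypoints, ordered [nose, left_eye,
    # right_eye, left_ear, right_ear]; its entries sum to ~1, matching the
    # normalization in linearize_pairwise_distances. Illustrative check
    # (hypothetical points, not part of the diff):
    #
    #   pts = np.array([[100., 50.], [110., 40.], [90., 40.],
    #                   [120., 45.], [80., 45.]])
    #   err = linearize_pairwise_distances(pts, relative_face_matrix)
    #
    # err[i][j] is the relative deviation of each pair from the template;
    # small values mean the pair is plausibly face-shaped.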
    model = YOLO('yolov8n-pose.pt')  # You can also try 'yolov8s-pose.pt' for better accuracy

    results = model(to_detect)[0]

    image = cv2.imread(to_detect)

    did_detect = False
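    # Each entry of results.keypoints.data is one detected person: a (17, 3)
    # array of COCO-ordered keypoints given as (x, y, confidence) rows
    # (assuming the default 17-keypoint pose model).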
    for person in results.keypoints.data:
        keypoints = person.cpu().numpy()

        # Common keypoints: 0=nose, 5=left_shoulder, 11=left_hip, 15=left_ankle (used as the foot)
        head = tuple(map(int, keypoints[0][:2]))
        foot = tuple(map(int, keypoints[15][:2]))
        print("#" * 50)

        cv2.circle(image, head, 5, (255, 0, 0), -1)  # Head in blue
        cv2.circle(image, foot, 5, (0, 0, 255), -1)  # Foot in red
        did_detect = True
        original_points = np.array([[k[0], k[1]] for k in keypoints[:5]])
        is_not_zero = False
        for x, y in original_points:
            if x != 0 or y != 0:
                is_not_zero = True
                break

        if did_detect:
            cv2.imwrite(detected, image)
        if not is_not_zero:
            continue
        rotation_vector = optimize_rotation(original_points, relative_face_matrix)
        optimized_points = apply_optimized_rotation(rotation_vector, original_points)
        optimized_distances = linearize_pairwise_distances(optimized_points, relative_face_matrix)

        # Indices of the points that seem likely to be correct
        success_points = []
        for i in range(5):
            s_count = 0
            for j in range(5):
                d = np.abs(optimized_distances[i][j])
                if d < 1:
                    s_count += 1

            if s_count > 2:
                success_points.append(i)
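        # The `d < 1` test accepts a pair whose normalized distance deviates
        # from the expected face geometry by less than 100%; the diagonal
        # compares 0 against 0 and comes out as nan, which never passes, so
        # `s_count > 2` effectively requires agreement with at least three of
        # the four other keypoints.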
        for point in original_points:
            cv2.circle(image, (int(point[0]), int(point[1])), 4, (0, 0, 255), -1)

        if len(success_points) < 1:
            continue
        valid_face = len(success_points) >= 3
        clean_points = []

        # Reconstruct disregarded points using a weighted average of relative positions
        for i in range(5):
            if i not in success_points:
                weighted_sum = np.zeros(2)
                total_weight = 0.0
                for j in success_points:
                    if not np.isnan(relative_face_matrix[i][j]):
                        direction = original_points[j] - original_points[i]
                        norm = np.linalg.norm(direction)
                        if norm > 0:
                            direction = direction / norm
                        estimated_distance = relative_face_matrix[i][j]
                        estimate = original_points[j] - direction * estimated_distance
                        weighted_sum += estimate
                        total_weight += 1
                if total_weight > 0:
                    clean_points.append(weighted_sum / total_weight)
                else:
                    clean_points.append(original_points[i])
            else:
                # Trusted points are kept as-is so the bounding box covers them too
                clean_points.append(original_points[i])

        clean_points = np.array(clean_points)
        # Calculate bounding box from clean_points
        realistic_aspect_ratio = 2 / 3  # width / height

        x_coords = clean_points[:, 0]
        y_coords = clean_points[:, 1]

        min_x = np.min(x_coords)
        max_x = np.max(x_coords)
        min_y = np.min(y_coords)
        max_y = np.max(y_coords)

        # Face-like padding: more space top & bottom than sides
        width = max_x - min_x
        height = max_y - min_y
        normalized_bounding_size = max(width, height * realistic_aspect_ratio)
        real_width = normalized_bounding_size
        real_height = normalized_bounding_size / realistic_aspect_ratio

        padding_x = width * 0.7 + (real_width - width) / 2
        padding_y_top = height * 2 + (real_height - height) / 2
        padding_y_bottom = height * 1.7 + (real_height - height) / 2

        face_box_x1 = int(min_x - padding_x)
        face_box_y1 = int(min_y - padding_y_top)
        face_box_x2 = int(max_x + padding_x)
        face_box_y2 = int(max_y + padding_y_bottom)

        face_bounding_box = (face_box_x1, face_box_y1, face_box_x2, face_box_y2)
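        # Worked example of the padding math (illustrative numbers): for a
        # keypoint cluster 30 px wide and 20 px tall, normalized_bounding_size
        # = max(30, 20 * 2/3) = 30, so real_width = 30 and real_height = 45.
        # Then padding_x = 21, padding_y_top = 52.5, padding_y_bottom = 46.5,
        # giving a roughly face-proportioned box of about 72 x 119 px.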
        color = (255, 255, 0)
        if valid_face:
            color = (0, 255, 0)

        cv2.rectangle(image, (face_box_x1, face_box_y1), (face_box_x2, face_box_y2), color, 2)
        for point in clean_points:
            cv2.circle(image, (int(point[0]), int(point[1])), 4, color, -1)
        print("\nOriginal points:")
        print(original_points)
        print("\nOriginal pairwise distances:")
        print(linearize_pairwise_distances(original_points, relative_face_matrix))
        print(f"Optimized rotation vector (axis-angle): {rotation_vector}")
        print("\nOptimized points after rotation:")
        print(optimized_points)
        print("\nOptimized pairwise distances:")
        print(optimized_distances)
        print(success_points)
        print(clean_points)
"""
|
||||
for idx in face_indices:
|
||||
x, y, conf = keypoints[idx]
|
||||
name = keypoint_names[idx]
|
||||
if conf > 0.3:
|
||||
face_points.append((x, y))
|
||||
|
||||
point = (int(x), int(y))
|
||||
name = keypoint_names[idx]
|
||||
cv2.circle(image, point, 4, (0, 255, 0), -1)
|
||||
cv2.putText(image, name, (point[0] + 5, point[1] + 5),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), 1)
|
||||
"""
|
||||
|
||||
|
||||
"""
|
||||
nose, left_eye, right_eye, left_ear, right_ear = face_points
|
||||
print(face_points)
|
||||
|
||||
# Calculate pairwise distances
|
||||
nose_to_left_eye = euclidean_distance(nose, left_eye)
|
||||
nose_to_right_eye = euclidean_distance(nose, right_eye)
|
||||
eyes_distance = euclidean_distance(left_eye, right_eye)
|
||||
left_ear_to_nose = euclidean_distance(left_ear, nose)
|
||||
right_ear_to_nose = euclidean_distance(right_ear, nose)
|
||||
|
||||
# Relative distances
|
||||
eye_to_eye_ratio = eyes_distance / (left_ear_to_nose + right_ear_to_nose) # Eyes vs. nose-to-ears
|
||||
nose_to_eye_ratio = (nose_to_left_eye + nose_to_right_eye) / (left_ear_to_nose + right_ear_to_nose) # Nose-to-eye vs. ear-to-nose
|
||||
ear_to_nose_ratio = (left_ear_to_nose + right_ear_to_nose) / 2 # Ear-to-nose proportionality
|
||||
|
||||
# Validate using relative distances
|
||||
if not (EYE_RATIO_THRESHOLD < eye_to_eye_ratio < 0.5): # Arbitrary ratio threshold
|
||||
print("⚠️ Rejected due to unrealistic eye-to-eye ratio:", eye_to_eye_ratio)
|
||||
has_valid_face = False
|
||||
|
||||
if not (NOSE_EYE_RATIO_THRESHOLD < nose_to_eye_ratio < 0.4): # Arbitrary ratio threshold
|
||||
print("⚠️ Rejected due to unrealistic nose-to-eye ratio:", nose_to_eye_ratio)
|
||||
has_valid_face = False
|
||||
|
||||
if not (0.5 < ear_to_nose_ratio < EAR_NOSE_RATIO_THRESHOLD):
|
||||
print("⚠️ Rejected due to unrealistic ear-to-nose ratio:", ear_to_nose_ratio)
|
||||
has_valid_face = False
|
||||
|
||||
|
||||
# If all checks pass, calculate the bounding box
|
||||
xs, ys, _ = zip(*face_points)
|
||||
x_min, x_max = int(min(xs)), int(max(xs))
|
||||
y_min, y_max = int(min(ys)), int(max(ys))
|
||||
|
||||
x_min = max(x_min - face_padding, 0)
|
||||
y_min = max(y_min - face_padding, 0)
|
||||
x_max = min(x_max + face_padding, image.shape[1])
|
||||
y_max = min(y_max + face_padding, image.shape[0])
|
||||
|
||||
# Compute box size
|
||||
box_w = x_max - x_min
|
||||
box_h = y_max - y_min
|
||||
|
||||
if has_valid_face:
|
||||
cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
|
||||
else:
|
||||
cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)
|
||||
|
||||
|
||||
|
||||
|
||||
for i, (x, y, conf) in enumerate(keypoints):
|
||||
point = (int(x), int(y))
|
||||
name = keypoint_names[i]
|
||||
# cv2.circle(image, point, 4, (0, 255, 0), -1)
|
||||
# cv2.putText(image, name, (point[0] + 5, point[1] - 5),
|
||||
# cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), 1)
|
||||
|
||||
# cv2.circle(image, head, 5, (255, 0, 0), -1) # Head in blue
|
||||
# cv2.circle(image, foot, 5, (0, 0, 255), -1) # Foot in red
|
||||
"""
|
||||
|
||||
cv2.imwrite(detected, image)
|
||||
|
||||
|
||||
def detect_humans(to_detect: str, crop_padding: int = 20, skip_detection_if_present: bool = True):
    _p = Path(to_detect)
    detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
    boxes_file = str(_p.with_name(_p.stem + "_boxes.json"))
    print(f"detecting humans: {to_detect} => {detected}")
    boxes_structures = {}
    human_boxes = boxes_structures["humans"] = []

    human_part_folder = _p.with_name(_p.stem + "_parts")
    human_part_folder.mkdir(exist_ok=True)

    if not (Path(boxes_file).exists() and skip_detection_if_present):
        require_net("yolov3")
        # Load YOLO
        net = cv2.dnn.readNet(str(MODEL_PATH / 'yolov3.weights'), str(MODEL_PATH / 'yolov3.cfg'))
        layer_names = net.getLayerNames()
        indices = net.getUnconnectedOutLayers()
        output_layers = [layer_names[int(i) - 1] for i in indices]
        # Load image
        image = cv2.imread(to_detect)
        original_image = cv2.imread(to_detect)
        height, width, channels = image.shape
        # Create blob and do forward pass
        blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
        net.setInput(blob)
        outs = net.forward(output_layers)
        boxes = []
        confidences = []
print(f"\tfound human at {x}/{y} with the size of {w} x {h}")
|
||||
human_boxes.append({
|
||||
"x": x,
|
||||
"y": y,
|
||||
"w": w,
|
||||
"h": h,
|
||||
"crop": {
|
||||
"file": str(human_part_image_path),
|
||||
"x": x1,
|
||||
        # Information for each object detected
        for out in outs:
            for detection in out:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                if confidence > 0.5 and class_id == 0:  # Class ID 0 is human
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)

                    boxes.append([x, y, w, h])
                    confidences.append(float(confidence))

        # Apply Non-Maximum Suppression
        indices = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.4)
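        # cv2.dnn.NMSBoxes returns the indices of the boxes kept after NMS;
        # the element shape differs across OpenCV versions (flat [0, 2, ...]
        # vs nested [[0], [2], ...]), which is why the loop below flattens
        # each index defensively.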
        for i in indices:
            i = i[0] if isinstance(i, (list, np.ndarray)) else i  # Flatten index if needed
            x, y, w, h = boxes[i]

            human_part_image_path = human_part_folder / (_p.stem + "_" + str(i) + _p.suffix)

            image_height, image_width = image.shape[:2]

            # Compute safe crop coordinates with padding
            x1 = max(x - crop_padding, 0)
            y1 = max(y - crop_padding, 0)
            x2 = min(x + w + crop_padding, image_width)
            y2 = min(y + h + crop_padding, image_height)
            human_crop = original_image[y1:y2, x1:x2]

            cv2.imwrite(str(human_part_image_path), human_crop)

            print(f"\tfound human at {x}/{y} with the size of {w} x {h}")
            human_boxes.append({
                "x": x,
                "y": y,
                "w": w,
                "h": h,
                "crop": {
                    "file": str(human_part_image_path),
                    "x": x1,
                    "y": y1,  # top-left corner of the padded crop
                    "w": x2 - x1,
                    "h": y2 - y1,
                },
                "parts": {},
            })
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 255), 2)
        # Save the result
        with open(boxes_file, "w") as f:
            json.dump(boxes_structures, f)
        cv2.imwrite(detected, image)
    else:
        with open(boxes_file, "r") as f:
            boxes_structures = json.load(f)
            human_boxes = boxes_structures["humans"]
    for human in human_boxes:
        detect_human_parts(human)
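# Illustrative usage (assumed entry point, not part of the diff): running the
# detector on a single image writes photo_detected.jpg, photo_boxes.json and
# per-person crops under photo_parts/, then runs face detection on each crop.
#
#   detect_humans("photo.jpg", crop_padding=20, skip_detection_if_present=True)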