diff --git a/secure_pixelation/detect_humans.py b/secure_pixelation/detect_humans.py
index ff74155..f029e92 100644
--- a/secure_pixelation/detect_humans.py
+++ b/secure_pixelation/detect_humans.py
@@ -1,11 +1,15 @@
+from __future__ import annotations
 from pathlib import Path
 import urllib.request
 from typing import Dict, List
 import json
+from dataclasses import dataclass
 
 from ultralytics import YOLO
 import cv2
 import numpy as np
+from scipy.optimize import minimize
+from scipy.spatial.transform import Rotation as R
 
 
 MODEL_PATH = Path("assets", "models")
@@ -38,133 +42,402 @@ def require_net(name: str):
     )
 
 
-def detect_human_parts(human: dict):
-    parts = human["parts"]
+# Thresholds for face keypoint distances (these might need adjustment)
+EYE_RATIO_THRESHOLD = 0.25
+NOSE_EYE_RATIO_THRESHOLD = 0.2
+EAR_NOSE_RATIO_THRESHOLD = 1.2
+
+
+@dataclass
+class Keypoint:
+    x: float
+    y: float
+    name: str
+    confidence: float = 0
+
+    @property
+    def point(self):
+        return (int(self.x), int(self.y))
+
+    def get_distance(self, other: Keypoint) -> float:
+        return np.sqrt((self.x - other.x) ** 2 + (self.y - other.y) ** 2)
+
+
+def detect_human_parts(human: dict, face_padding: int = 20):
+    parts = human["parts"]
+
     to_detect = human["crop"]["file"]
     _p = Path(to_detect)
     detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
     boxes_file = str(_p.with_name(_p.stem + "_boxes.json"))
     print(f"detecting human parts: {to_detect} => {detected}")
 
+    def apply_rotation(rot_matrix, points):
+        # Apply the rotation to the points, assuming points are 2D coordinates (flattened)
+        return np.dot(rot_matrix, points.T).T
+
+    def linearize_pairwise_distances(points, target_distances):
+        # Calculate pairwise distances between the points
+        num_points = len(points)
+        pairwise_distances = np.zeros((num_points, num_points))
+        for i in range(num_points):
+            for j in range(i, num_points):
+                pairwise_distances[i, j] = np.linalg.norm(points[i] - points[j])
+                pairwise_distances[j, i] = pairwise_distances[i, j]  # symmetric matrix
+
+        total_distance = np.sum(pairwise_distances)
+        normed_distances = pairwise_distances / total_distance
+
+        return np.abs(normed_distances - target_distances) / target_distances
+
+    def objective(params, original_points, target_distances):
+        # Convert params to an axis-angle representation (rotation vector)
+        rot = R.from_rotvec(params)
+        rotation_matrix = rot.as_matrix()[:2, :2]  # 2D rotation matrix (2x2)
+
+        # Apply the rotation to the original points
+        rotated_points = apply_rotation(rotation_matrix, original_points)
+
+        # Compute the pairwise distances for the rotated points
+        divergence = linearize_pairwise_distances(rotated_points, target_distances)
+        return np.nansum(divergence)
+
+    def optimize_rotation(original_points, relative_face_matrix):
+        # Compute the pairwise distances of the original points
+        original_distances = linearize_pairwise_distances(original_points, relative_face_matrix)
+
+        # Initial guess: rotation vector (zero rotation)
+        initial_params = np.zeros(3)  # Initial guess for the rotation vector (no rotation)
+
+        # Perform the optimization to minimize the divergence
+        result = minimize(objective, initial_params, args=(original_points, relative_face_matrix), method='BFGS')
+
+        return result.x  # Rotation vector (axis-angle)
+
+    def apply_optimized_rotation(rotation_vector, original_points):
+        # Convert the rotation vector to a rotation matrix (2D)
+        rot = R.from_rotvec(rotation_vector)
+        rotation_matrix = rot.as_matrix()[:2, :2]  # 2D rotation matrix (2x2)
+
+        # Apply the rotation to the points
+        return apply_rotation(rotation_matrix, original_points)
+
+    relative_face_matrix = np.array([
+        [0.        , 0.02243309, 0.02243309, 0.05016191, 0.05016191],
+        [0.02243309, 0.        , 0.04012953, 0.04486618, 0.07234453],
+        [0.02243309, 0.04012953, 0.        , 0.07234453, 0.04486618],
+        [0.05016191, 0.04486618, 0.07234453, 0.        , 0.08025906],
+        [0.05016191, 0.07234453, 0.04486618, 0.08025906, 0.        ]
+    ])
+
     # model = YOLO('yolov8n-pose.pt')  # You can also try 'yolov8s-pose.pt' for better accuracy
     results = model(to_detect)[0]
     image = cv2.imread(to_detect)
 
-    did_detect = False
     for person in results.keypoints.data:
         keypoints = person.cpu().numpy()
-        # Common keypoints: 0=nose, 5=left_shoulder, 11=left_hip, 15=left_foot
-        head = tuple(map(int, keypoints[0][:2]))
-        foot = tuple(map(int, keypoints[15][:2]))
+        print("#" * 50)
 
-        cv2.circle(image, head, 5, (255, 0, 0), -1)  # Head in blue
-        cv2.circle(image, foot, 5, (0, 0, 255), -1)  # Foot in red
-        did_detect = True
+        original_points = np.array([[k[0], k[1]] for k in keypoints[:5]])
+        is_not_zero = False
+        for x, y in original_points:
+            if x != 0 or y != 0:
+                is_not_zero = True
+                break
 
-    if did_detect:
-        cv2.imwrite(detected, image)
+        if not is_not_zero:
+            continue
+
+        rotation_vector = optimize_rotation(original_points, relative_face_matrix)
+        optimized_points = apply_optimized_rotation(rotation_vector, original_points)
+        optimized_distances = linearize_pairwise_distances(optimized_points, relative_face_matrix)
+
+        # indices of the points that seem to be likely correct
+        success_points = []
+        for i in range(5):
+            s_count = 0
+            for j in range(5):
+                d = np.abs(optimized_distances[i][j])
+                if d < 1:
+                    s_count += 1
+
+            if s_count > 2:
+                success_points.append(i)
+
+        for point in original_points:
+            cv2.circle(image, (int(point[0]), int(point[1])), 4, (0, 0, 255), -1)
+
+        if len(success_points) < 1:
+            continue
+        valid_face = len(success_points) >= 3
+
+        clean_points = []
+
+        # Reconstruct disregarded points using weighted average of relative positions
+        for i in range(5):
+            if i not in success_points:
+                weighted_sum = np.zeros(2)
+                total_weight = 0.0
+                for j in success_points:
+                    if not np.isnan(relative_face_matrix[i][j]):
+                        direction = original_points[j] - original_points[i]
+                        norm = np.linalg.norm(direction)
+                        if norm > 0:
+                            direction = direction / norm
+                        estimated_distance = relative_face_matrix[i][j]
+                        estimate = original_points[j] - direction * estimated_distance
+                        weighted_sum += estimate
+                        total_weight += 1
+                if total_weight > 0:
+                    clean_points.append(weighted_sum / total_weight)
+                else:
+                    clean_points.append(original_points[i])
+            else:
+                # keypoint already consistent with the face model: keep it unchanged
+                clean_points.append(original_points[i])
+
+        clean_points = np.array(clean_points)
+
+        # Calculate bounding box from clean_points
+        realistic_aspect_ratio = 2/3  # width / height
+
+        x_coords = clean_points[:, 0]
+        y_coords = clean_points[:, 1]
+
+        min_x = np.min(x_coords)
+        max_x = np.max(x_coords)
+        min_y = np.min(y_coords)
+        max_y = np.max(y_coords)
+
+        # Face-like padding: more space top & bottom than sides
+        width = max_x - min_x
+        height = max_y - min_y
 
-def detect_humans(to_detect: str, crop_padding: int = 20):
+        normalized_bounding_size = max(width, height * realistic_aspect_ratio)
+        real_width = normalized_bounding_size
+        real_height = normalized_bounding_size / realistic_aspect_ratio
+
+        padding_x = width * 0.7 + (real_width - width) / 2
+        padding_y_top = height * 2 + (real_height - height) / 2
+        padding_y_bottom = height * 1.7 + (real_height - height) / 2
+
+        face_box_x1 = int(min_x - padding_x)
+        face_box_y1 = int(min_y - padding_y_top)
+        face_box_x2 = int(max_x + padding_x)
+        face_box_y2 = int(max_y + padding_y_bottom)
+
+        face_bounding_box = (face_box_x1, face_box_y1, face_box_x2, face_box_y2)
+
+        color = (255, 255, 0)
+        if valid_face:
+            color = (0, 255, 0)
+
+        cv2.rectangle(image, (face_box_x1, face_box_y1), (face_box_x2, face_box_y2), color, 2)
+        for point in clean_points:
+            cv2.circle(image, (int(point[0]), int(point[1])), 4, color, -1)
+
+        print("\nOriginal points:")
+        print(original_points)
+        print("\nOriginal pairwise distances:")
+        print(linearize_pairwise_distances(original_points, relative_face_matrix))
+        print(f"Optimized rotation vector (axis-angle): {rotation_vector}")
+        print("\nOptimized points after rotation:")
+        print(optimized_points)
+        print("\nOptimized pairwise distances:")
+        print(optimized_distances)
+        print(success_points)
+        print(clean_points)
+
+        """
+        for idx in face_indices:
+            x, y, conf = keypoints[idx]
+            name = keypoint_names[idx]
+            if conf > 0.3:
+                face_points.append((x, y))
+
+                point = (int(x), int(y))
+                name = keypoint_names[idx]
+                cv2.circle(image, point, 4, (0, 255, 0), -1)
+                cv2.putText(image, name, (point[0] + 5, point[1] + 5),
+                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), 1)
+        """
+
+        """
+        nose, left_eye, right_eye, left_ear, right_ear = face_points
+        print(face_points)
+
+        # Calculate pairwise distances
+        nose_to_left_eye = euclidean_distance(nose, left_eye)
+        nose_to_right_eye = euclidean_distance(nose, right_eye)
+        eyes_distance = euclidean_distance(left_eye, right_eye)
+        left_ear_to_nose = euclidean_distance(left_ear, nose)
+        right_ear_to_nose = euclidean_distance(right_ear, nose)
+
+        # Relative distances
+        eye_to_eye_ratio = eyes_distance / (left_ear_to_nose + right_ear_to_nose)  # Eyes vs. nose-to-ears
+        nose_to_eye_ratio = (nose_to_left_eye + nose_to_right_eye) / (left_ear_to_nose + right_ear_to_nose)  # Nose-to-eye vs. ear-to-nose
+        ear_to_nose_ratio = (left_ear_to_nose + right_ear_to_nose) / 2  # Ear-to-nose proportionality
+
+        # Validate using relative distances
+        if not (EYE_RATIO_THRESHOLD < eye_to_eye_ratio < 0.5):  # Arbitrary ratio threshold
+            print("⚠️ Rejected due to unrealistic eye-to-eye ratio:", eye_to_eye_ratio)
+            has_valid_face = False
+
+        if not (NOSE_EYE_RATIO_THRESHOLD < nose_to_eye_ratio < 0.4):  # Arbitrary ratio threshold
+            print("⚠️ Rejected due to unrealistic nose-to-eye ratio:", nose_to_eye_ratio)
+            has_valid_face = False
+
+        if not (0.5 < ear_to_nose_ratio < EAR_NOSE_RATIO_THRESHOLD):
+            print("⚠️ Rejected due to unrealistic ear-to-nose ratio:", ear_to_nose_ratio)
+            has_valid_face = False
+
+        # If all checks pass, calculate the bounding box
+        xs, ys, _ = zip(*face_points)
+        x_min, x_max = int(min(xs)), int(max(xs))
+        y_min, y_max = int(min(ys)), int(max(ys))
+
+        x_min = max(x_min - face_padding, 0)
+        y_min = max(y_min - face_padding, 0)
+        x_max = min(x_max + face_padding, image.shape[1])
+        y_max = min(y_max + face_padding, image.shape[0])
+
+        # Compute box size
+        box_w = x_max - x_min
+        box_h = y_max - y_min
+
+        if has_valid_face:
+            cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
+        else:
+            cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)
+
+        for i, (x, y, conf) in enumerate(keypoints):
+            point = (int(x), int(y))
+            name = keypoint_names[i]
+            # cv2.circle(image, point, 4, (0, 255, 0), -1)
+            # cv2.putText(image, name, (point[0] + 5, point[1] - 5),
+            #             cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), 1)
+
+        # cv2.circle(image, head, 5, (255, 0, 0), -1)  # Head in blue
+        # cv2.circle(image, foot, 5, (0, 0, 255), -1)  # Foot in red
+        """
+
+    cv2.imwrite(detected, image)
+
+
+def detect_humans(to_detect: str, crop_padding: int = 20, skip_detection_if_present: bool = True):
     _p = Path(to_detect)
     detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
     boxes_file = str(_p.with_name(_p.stem + "_boxes.json"))
     print(f"detecting humans: {to_detect} => {detected}")
 
-    require_net("yolov3")
-
-    # Load YOLO
-    net = cv2.dnn.readNet(str(MODEL_PATH / 'yolov3.weights'), str(MODEL_PATH / 'yolov3.cfg'))
-    layer_names = net.getLayerNames()
-    indices = net.getUnconnectedOutLayers()
-    output_layers = [layer_names[int(i) - 1] for i in indices]
-
-
-    # Load image
-    image = cv2.imread(to_detect)
-    original_image = cv2.imread(to_detect)
-    height, width, channels = image.shape
-
-    # Create blob and do forward pass
-    blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
-    net.setInput(blob)
-    outs = net.forward(output_layers)
-
-    boxes = []
-    confidences = []
-
-    # Information for each object detected
-    for out in outs:
-        for detection in out:
-            scores = detection[5:]
-            class_id = np.argmax(scores)
-            confidence = scores[class_id]
-            if confidence > 0.5 and class_id == 0:  # Class ID 0 is human
-                center_x = int(detection[0] * width)
-                center_y = int(detection[1] * height)
-                w = int(detection[2] * width)
-                h = int(detection[3] * height)
-                x = int(center_x - w / 2)
-                y = int(center_y - h / 2)
-
-                boxes.append([x, y, w, h])
-                confidences.append(float(confidence))
-
-    # Apply Non-Maximum Suppression
-    indices = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.4)
-
     boxes_structures = {}
     human_boxes = boxes_structures["humans"] = []
 
-    human_part_folder = _p.with_name(_p.stem + "_parts")
-    human_part_folder.mkdir(exist_ok=True)
+    if not (Path(boxes_file).exists() and skip_detection_if_present):
+        require_net("yolov3")
 
-    for i in indices:
-        i = i[0] if isinstance(i, (list, np.ndarray)) else i  # Flatten index if needed
-        x, y, w, h = boxes[i]
+        # Load YOLO
+        net = cv2.dnn.readNet(str(MODEL_PATH / 'yolov3.weights'), str(MODEL_PATH / 'yolov3.cfg'))
+        layer_names = net.getLayerNames()
+        indices = net.getUnconnectedOutLayers()
+        output_layers = [layer_names[int(i) - 1] for i in indices]
 
-        human_part_image_path = human_part_folder / (_p.stem + "_" + str(i) + _p.suffix)
-        image_height, image_width = image.shape[:2]
+        # Load image
+        image = cv2.imread(to_detect)
+        original_image = cv2.imread(to_detect)
+        height, width, channels = image.shape
 
-        # Compute safe crop coordinates with padding
-        x1 = max(x - crop_padding, 0)
-        y1 = max(y - crop_padding, 0)
-        x2 = min(x + w + crop_padding, image_width)
-        y2 = min(y + h + crop_padding, image_height)
-        human_crop = original_image[y1:y2, x1:x2]
+        # Create blob and do forward pass
+        blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
+        net.setInput(blob)
+        outs = net.forward(output_layers)
 
-        cv2.imwrite(str(human_part_image_path), human_crop)
+        boxes = []
+        confidences = []
 
-        print(f"\tfound human at {x}/{y} with the size of {w} x {h}")
-        human_boxes.append({
-            "x": x,
-            "y": y,
-            "w": w,
-            "h": h,
-            "crop": {
-                "file": str(human_part_image_path),
-                "x": x1,
+        # Information for each object detected
+        for out in outs:
+            for detection in out:
+                scores = detection[5:]
+                class_id = np.argmax(scores)
+                confidence = scores[class_id]
+                if confidence > 0.5 and class_id == 0:  # Class ID 0 is human
+                    center_x = int(detection[0] * width)
+                    center_y = int(detection[1] * height)
+                    w = int(detection[2] * width)
+                    h = int(detection[3] * height)
+                    x = int(center_x - w / 2)
+                    y = int(center_y - h / 2)
+
+                    boxes.append([x, y, w, h])
+                    confidences.append(float(confidence))
+
+        # Apply Non-Maximum Suppression
+        indices = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.4)
+
+        human_part_folder = _p.with_name(_p.stem + "_parts")
+        human_part_folder.mkdir(exist_ok=True)
+
+        for i in indices:
+            i = i[0] if isinstance(i, (list, np.ndarray)) else i  # Flatten index if needed
+            x, y, w, h = boxes[i]
+
+            human_part_image_path = human_part_folder / (_p.stem + "_" + str(i) + _p.suffix)
+
+            image_height, image_width = image.shape[:2]
+
+            # Compute safe crop coordinates with padding
+            x1 = max(x - crop_padding, 0)
+            y1 = max(y - crop_padding, 0)
+            x2 = min(x + w + crop_padding, image_width)
+            y2 = min(y + h + crop_padding, image_height)
+            human_crop = original_image[y1:y2, x1:x2]
+
+            cv2.imwrite(str(human_part_image_path), human_crop)
+
+            print(f"\tfound human at {x}/{y} with the size of {w} x {h}")
+            human_boxes.append({
+                "x": x,
                 "y": y,
-                "w": x2 - x1,
-                "h": y2 - y1,
-            },
-            "parts": {},
-        })
+                "w": w,
+                "h": h,
+                "crop": {
+                    "file": str(human_part_image_path),
+                    "x": x1,
+                    "y": y,
+                    "w": x2 - x1,
+                    "h": y2 - y1,
+                },
+                "parts": {},
+            })
 
-        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 255), 2)
+            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 255), 2)
 
-    # Save the result
-    with open(boxes_file, "w") as f:
-        json.dump(boxes_structures, f)
-    cv2.imwrite(detected, image)
+        # Save the result
+        with open(boxes_file, "w") as f:
+            json.dump(boxes_structures, f)
+        cv2.imwrite(detected, image)
+
+    else:
+        with open(boxes_file, "r") as f:
+            boxes_structures = json.load(f)
+        human_boxes = boxes_structures["humans"]
 
     for human in human_boxes:
-        detect_human_parts(human["crop"]["file"])
+        detect_human_parts(human)
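Note on the face-consistency check introduced above: `relative_face_matrix` holds the expected pairwise distances between the five face keypoints (nose, eyes, ears), normalized by the sum of all pairwise distances so the comparison in `linearize_pairwise_distances` is scale-invariant. A minimal standalone sketch of that normalization step, using invented example coordinates rather than project data:

```python
import numpy as np

def normalized_pairwise_distances(points: np.ndarray) -> np.ndarray:
    # Pairwise Euclidean distances over the full symmetric matrix,
    # normalized by their sum (mirrors what linearize_pairwise_distances
    # computes before comparing against the target matrix).
    diffs = points[:, None, :] - points[None, :, :]
    distances = np.linalg.norm(diffs, axis=-1)
    return distances / distances.sum()

# nose, left eye, right eye, left ear, right ear (invented pixel coordinates)
face = np.array([
    [100.0, 120.0],
    [ 90.0, 105.0],
    [110.0, 105.0],
    [ 75.0, 110.0],
    [125.0, 110.0],
])

print(normalized_pairwise_distances(face))  # comparable in shape and scale to relative_face_matrix
```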
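For reference, `detect_humans` persists its results in the `*_boxes.json` file written next to the input image; when that file already exists and `skip_detection_if_present` is true, it is reloaded instead of re-running YOLOv3. A hypothetical example of the stored structure (all numbers and file names below are invented):

```python
boxes_structures = {
    "humans": [
        {
            "x": 412, "y": 133, "w": 180, "h": 420,  # YOLO box in the full image
            "crop": {
                "file": "photo_parts/photo_0.jpg",   # padded crop written to disk
                "x": 392,                            # crop origin after padding
                "y": 133,
                "w": 220,                            # padded crop size
                "h": 460,
            },
            "parts": {},                             # filled by later pipeline stages
        },
    ],
}
```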
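A hedged usage sketch of the updated entry point. The import path follows the file location in this diff; the example image path and the assumption that the package is importable as `secure_pixelation` are mine:

```python
from secure_pixelation.detect_humans import detect_humans

# First run: YOLOv3 finds people, padded crops go to photo_parts/ and
# photo_boxes.json is written. Later runs with skip_detection_if_present=True
# reuse that JSON and go straight to the pose-based face check in
# detect_human_parts().
detect_humans("assets/demo_images/photo.jpg", crop_padding=20, skip_detection_if_present=True)
```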