generated from Hazel/python-project
	feat: cleaner detection
This commit is contained in:
		| @@ -1,11 +1,15 @@ | |||||||
|  | from __future__ import annotations | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
| import urllib.request | import urllib.request | ||||||
| from typing import Dict, List | from typing import Dict, List | ||||||
| import json | import json | ||||||
|  | from dataclasses import dataclass | ||||||
|  |  | ||||||
| from ultralytics import YOLO | from ultralytics import YOLO | ||||||
| import cv2 | import cv2 | ||||||
| import numpy as np | import numpy as np | ||||||
|  | from scipy.optimize import minimize | ||||||
|  | from scipy.spatial.transform import Rotation as R | ||||||
|  |  | ||||||
|  |  | ||||||
| MODEL_PATH = Path("assets", "models") | MODEL_PATH = Path("assets", "models") | ||||||
| @@ -38,44 +42,309 @@ def require_net(name: str): | |||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |  | ||||||
# Thresholds for face keypoint distance ratios (these might need adjustment).
# NOTE(review): no live code in the visible part of this file reads these
# constants; confirm whether they are still needed.
EYE_RATIO_THRESHOLD = 0.25  # presumably the lower bound for the eye-to-eye ratio
NOSE_EYE_RATIO_THRESHOLD = 0.2  # presumably the lower bound for the nose-to-eye ratio
EAR_NOSE_RATIO_THRESHOLD = 1.2  # presumably the upper bound for the ear-to-nose ratio
|  |  | ||||||
|  |  | ||||||
@dataclass
class Keypoint:
    """A named 2D keypoint with an optional detection confidence."""

    x: float
    y: float
    name: str
    confidence: float = 0

    @property
    def point(self) -> tuple:
        """Return the coordinates truncated to an integer ``(x, y)`` pair."""
        px = int(self.x)
        py = int(self.y)
        return (px, py)

    def get_distance(self, other: "Keypoint") -> float:
        """Return the Euclidean distance between this keypoint and *other*."""
        dx = self.x - other.x
        dy = self.y - other.y
        return np.sqrt(dx ** 2 + dy ** 2)
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
# Number of leading pose keypoints that are treated as face keypoints.
_FACE_KEYPOINT_COUNT = 5

# Reference pairwise distances between the face keypoints, normalised so that
# all entries of the (symmetric) matrix sum to 1.
# NOTE(review): row/column order is presumably nose, left eye, right eye,
# left ear, right ear (the order of pose keypoints 0-4) -- confirm.
_RELATIVE_FACE_DISTANCES = np.array([
    [0.0, 0.02243309, 0.02243309, 0.05016191, 0.05016191],
    [0.02243309, 0.0, 0.04012953, 0.04486618, 0.07234453],
    [0.02243309, 0.04012953, 0.0, 0.07234453, 0.04486618],
    [0.05016191, 0.04486618, 0.07234453, 0.0, 0.08025906],
    [0.05016191, 0.07234453, 0.04486618, 0.08025906, 0.0],
])


def _pairwise_divergence(points, target_distances):
    """Return the relative error of the normalised pairwise distances of *points*.

    The pairwise distance matrix is divided by its own sum, then compared
    element-wise with *target_distances*. Entries whose target is zero (the
    diagonal) come out as NaN; callers rely on that (``nansum``, ``< 1``).
    """
    points = np.asarray(points, dtype=float)
    deltas = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    pairwise = np.linalg.norm(deltas, axis=-1)
    # 0/0 on the diagonal is expected; do not spam RuntimeWarnings for it.
    with np.errstate(divide="ignore", invalid="ignore"):
        normed = pairwise / np.sum(pairwise)
        return np.abs(normed - target_distances) / target_distances


def _project_rotation(rotation_vector, points):
    """Apply the top-left 2x2 part of the 3D rotation *rotation_vector* to 2D *points*."""
    rotation_matrix = R.from_rotvec(rotation_vector).as_matrix()[:2, :2]
    return np.dot(rotation_matrix, points.T).T


def _fit_rotation(points, target_distances):
    """Return the rotation vector minimising the divergence from *target_distances*."""

    def cost(params):
        rotated = _project_rotation(params, points)
        return np.nansum(_pairwise_divergence(rotated, target_distances))

    # Start from "no rotation".
    return minimize(cost, np.zeros(3), method="BFGS").x


def detect_human_parts(human: dict, face_padding: int = 20):
    """Detect face keypoints in a human crop and write an annotated image.

    Runs the YOLO pose model on ``human["crop"]["file"]``, checks the first
    five keypoints of every detected person against a reference face layout,
    draws the keypoints and an estimated face box, and writes the result next
    to the crop as ``<stem>_detected<suffix>``.

    Args:
        human: Mapping that provides the crop image path at
            ``human["crop"]["file"]``.
        face_padding: Currently unused; kept so existing callers keep working.

    Raises:
        FileNotFoundError: If OpenCV cannot read the crop image.
    """
    to_detect = human["crop"]["file"]
    _p = Path(to_detect)
    detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
    print(f"detecting human parts: {to_detect} => {detected}")

    model = YOLO('yolov8n-pose.pt')  # You can also try 'yolov8s-pose.pt' for better accuracy
    results = model(to_detect)[0]

    image = cv2.imread(to_detect)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"could not read image: {to_detect}")

    for person in results.keypoints.data:
        keypoints = person.cpu().numpy()

        print("#" * 50)

        original_points = np.array(keypoints[:_FACE_KEYPOINT_COUNT, :2])
        # Skip persons for whom every face keypoint is exactly (0, 0).
        if not np.any(original_points):
            continue

        rotation_vector = _fit_rotation(original_points, _RELATIVE_FACE_DISTANCES)
        optimized_points = _project_rotation(rotation_vector, original_points)
        optimized_distances = _pairwise_divergence(optimized_points, _RELATIVE_FACE_DISTANCES)

        # A point is trusted when more than two of its pairwise distances are
        # within 100% relative error of the reference (NaN compares False).
        plausible = np.abs(optimized_distances) < 1
        success_points = [
            i for i in range(_FACE_KEYPOINT_COUNT)
            if np.count_nonzero(plausible[i]) > 2
        ]

        # Raw keypoints are always drawn in red.
        for point in original_points:
            cv2.circle(image, (int(point[0]), int(point[1])), 4, (0, 0, 255), -1)

        if not success_points:
            continue
        valid_face = len(success_points) >= 3

        # Rebuild untrusted points from the trusted ones.
        # NOTE(review): the reference distance is a normalised fraction but is
        # used here as a pixel offset, so each rebuilt point lands almost
        # exactly on the mean of the trusted points -- confirm whether a pixel
        # scale factor is missing.
        clean_points = []
        for i in range(_FACE_KEYPOINT_COUNT):
            if i in success_points:
                clean_points.append(original_points[i])
                continue

            estimate_sum = np.zeros(2)
            estimate_count = 0
            for j in success_points:
                estimated_distance = _RELATIVE_FACE_DISTANCES[i][j]
                if np.isnan(estimated_distance):
                    continue
                direction = original_points[j] - original_points[i]
                norm = np.linalg.norm(direction)
                if norm > 0:
                    direction = direction / norm
                estimate_sum += original_points[j] - direction * estimated_distance
                estimate_count += 1
            if estimate_count > 0:
                clean_points.append(estimate_sum / estimate_count)

        clean_points = np.array(clean_points)

        # Bounding box around the cleaned points.
        realistic_aspect_ratio = 2 / 3  # width / height

        min_x, min_y = np.min(clean_points, axis=0)
        max_x, max_y = np.max(clean_points, axis=0)
        width = max_x - min_x
        height = max_y - min_y

        # Grow the keypoint extent to the target aspect ratio, then add
        # face-like padding: more space top & bottom than on the sides.
        real_width = max(width, height * realistic_aspect_ratio)
        real_height = real_width / realistic_aspect_ratio

        padding_x = width * 0.7 + (real_width - width) / 2
        padding_y_top = height * 2 + (real_height - height) / 2
        padding_y_bottom = height * 1.7 + (real_height - height) / 2

        face_box_x1 = int(min_x - padding_x)
        face_box_y1 = int(min_y - padding_y_top)
        face_box_x2 = int(max_x + padding_x)
        face_box_y2 = int(max_y + padding_y_bottom)

        # A second colour marks faces with at least three trusted points.
        color = (0, 255, 0) if valid_face else (255, 255, 0)

        cv2.rectangle(image, (face_box_x1, face_box_y1), (face_box_x2, face_box_y2), color, 2)
        for point in clean_points:
            cv2.circle(image, (int(point[0]), int(point[1])), 4, color, -1)

        print("\nOriginal points:")
        print(original_points)
        print("\nOriginal pairwise distances:")
        print(_pairwise_divergence(original_points, _RELATIVE_FACE_DISTANCES))
        print(f"Optimized rotation vector (axis-angle): {rotation_vector}")
        print("\nOptimized points after rotation:")
        print(optimized_points)
        print("\nOptimized pairwise distances:")
        print(optimized_distances)
        print(success_points)
        print(clean_points)

    cv2.imwrite(detected, image)
|  |  | ||||||
|  |  | ||||||
| def detect_humans(to_detect: str, crop_padding: int = 20): | def detect_humans(to_detect: str, crop_padding: int = 20, skip_detection_if_present: bool = True): | ||||||
|     _p = Path(to_detect) |     _p = Path(to_detect) | ||||||
|     detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix)) |     detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix)) | ||||||
|     boxes_file = str(_p.with_name(_p.stem + "_boxes.json")) |     boxes_file = str(_p.with_name(_p.stem + "_boxes.json")) | ||||||
|     print(f"detecting humans: {to_detect} => {detected}") |     print(f"detecting humans: {to_detect} => {detected}") | ||||||
|  |  | ||||||
|  |     boxes_structures = {} | ||||||
|  |     human_boxes = boxes_structures["humans"] = [] | ||||||
|  |  | ||||||
|  |     if not (Path(boxes_file).exists() and skip_detection_if_present): | ||||||
|         require_net("yolov3") |         require_net("yolov3") | ||||||
|  |  | ||||||
|         # Load YOLO |         # Load YOLO | ||||||
| @@ -118,9 +387,6 @@ def detect_humans(to_detect: str, crop_padding: int = 20): | |||||||
|         # Apply Non-Maximum Suppression |         # Apply Non-Maximum Suppression | ||||||
|         indices = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.4) |         indices = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.4) | ||||||
|  |  | ||||||
|     boxes_structures = {} |  | ||||||
|     human_boxes = boxes_structures["humans"] = [] |  | ||||||
|  |  | ||||||
|         human_part_folder = _p.with_name(_p.stem + "_parts") |         human_part_folder = _p.with_name(_p.stem + "_parts") | ||||||
|         human_part_folder.mkdir(exist_ok=True) |         human_part_folder.mkdir(exist_ok=True) | ||||||
|  |  | ||||||
| @@ -166,5 +432,12 @@ def detect_humans(to_detect: str, crop_padding: int = 20): | |||||||
|             json.dump(boxes_structures, f) |             json.dump(boxes_structures, f) | ||||||
|         cv2.imwrite(detected, image) |         cv2.imwrite(detected, image) | ||||||
|  |  | ||||||
|  |     else: | ||||||
|  |  | ||||||
|  |         with open(boxes_file, "r") as f: | ||||||
|  |             boxes_structures = json.load(f) | ||||||
|  |             human_boxes = boxes_structures["humans"] | ||||||
|  |  | ||||||
|  |  | ||||||
|     for human in human_boxes: |     for human in human_boxes: | ||||||
|         detect_human_parts(human["crop"]["file"]) |         detect_human_parts(human) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user