feat: cleaner detection

Hazel 2025-04-23 16:56:09 +02:00
parent bcfc90acdf
commit ad8f3b8e66


@@ -1,11 +1,15 @@
from __future__ import annotations
from pathlib import Path
import urllib.request
from typing import Dict, List
import json
from dataclasses import dataclass

from ultralytics import YOLO
import cv2
import numpy as np
from scipy.optimize import minimize
from scipy.spatial.transform import Rotation as R

MODEL_PATH = Path("assets", "models")
@@ -38,133 +42,402 @@ def require_net(name: str):
    )
# Thresholds for face keypoint distances (these might need adjustment)
EYE_RATIO_THRESHOLD = 0.25
NOSE_EYE_RATIO_THRESHOLD = 0.2
EAR_NOSE_RATIO_THRESHOLD = 1.2


@dataclass
class Keypoint:
    x: float
    y: float
    name: str
    confidence: float = 0

    @property
    def point(self):
        return (int(self.x), int(self.y))

    def get_distance(self, other: Keypoint) -> float:
        return np.sqrt((self.x - other.x) ** 2 + (self.y - other.y) ** 2)
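
# Illustrative use of Keypoint (assuming the COCO keypoint ordering produced
# by YOLOv8 pose: 0=nose, 1=left eye, 2=right eye, 3=left ear, 4=right ear):
#
#   nose = Keypoint(100.0, 120.0, "nose", confidence=0.9)
#   left_eye = Keypoint(90.0, 100.0, "left_eye", confidence=0.8)
#   left_eye.get_distance(nose)  # ~22.36 (pixels)
#   left_eye.point               # (90, 100), ready for cv2 drawing calls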

def detect_human_parts(human: dict, face_padding: int = 20):
    parts = human["parts"]
    to_detect = human["crop"]["file"]
    _p = Path(to_detect)
    detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
    boxes_file = str(_p.with_name(_p.stem + "_boxes.json"))
    print(f"detecting human parts: {to_detect} => {detected}")
    def apply_rotation(rot_matrix, points):
        # Apply the rotation to the points, assuming points are 2D coordinates (flattened)
        return np.dot(rot_matrix, points.T).T

    def linearize_pairwise_distances(points, target_distances):
        # Calculate pairwise distances between the points
        num_points = len(points)
        pairwise_distances = np.zeros((num_points, num_points))
        for i in range(num_points):
            for j in range(i, num_points):
                pairwise_distances[i, j] = np.linalg.norm(points[i] - points[j])
                pairwise_distances[j, i] = pairwise_distances[i, j]  # symmetric matrix
        total_distance = np.sum(pairwise_distances)
        normed_distances = pairwise_distances / total_distance
        return np.abs(normed_distances - target_distances) / target_distances

    def objective(params, original_points, target_distances):
        # Convert params to an axis-angle representation (rotation vector)
        rot = R.from_rotvec(params)
        rotation_matrix = rot.as_matrix()[:2, :2]  # 2D rotation matrix (2x2)
        # Apply the rotation to the original points
        rotated_points = apply_rotation(rotation_matrix, original_points)
        # Compute the pairwise distances for the rotated points
        divergence = linearize_pairwise_distances(rotated_points, target_distances)
        return np.nansum(divergence)

    def optimize_rotation(original_points, relative_face_matrix):
        # Compute the pairwise distances of the original points
        original_distances = linearize_pairwise_distances(original_points, relative_face_matrix)
        # Initial guess: rotation vector (zero rotation)
        initial_params = np.zeros(3)
        # Perform the optimization to minimize the divergence
        result = minimize(objective, initial_params, args=(original_points, relative_face_matrix), method='BFGS')
        return result.x  # Rotation vector (axis-angle)

    def apply_optimized_rotation(rotation_vector, original_points):
        # Convert the rotation vector to a rotation matrix (2D)
        rot = R.from_rotvec(rotation_vector)
        rotation_matrix = rot.as_matrix()[:2, :2]  # 2D rotation matrix (2x2)
        # Apply the rotation to the points
        return apply_rotation(rotation_matrix, original_points)
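
    # Note (descriptive, not part of the pipeline): the pairwise matrix is
    # normalized by its total, so the divergence is scale-invariant, and a
    # pure in-plane rotation (about z) leaves it unchanged entirely. What the
    # optimizer actually varies is out-of-plane tilt: as_matrix()[:2, :2] of
    # a tilted pose is a non-orthogonal 2x2 map that compresses the projected
    # points and thereby changes their relative distances. The diagonal of the
    # returned matrix is 0/0 = nan, which np.nansum() in objective() ignores.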

    relative_face_matrix = np.array([
        [0.        , 0.02243309, 0.02243309, 0.05016191, 0.05016191],
        [0.02243309, 0.        , 0.04012953, 0.04486618, 0.07234453],
        [0.02243309, 0.04012953, 0.        , 0.07234453, 0.04486618],
        [0.05016191, 0.04486618, 0.07234453, 0.        , 0.08025906],
        [0.05016191, 0.07234453, 0.04486618, 0.08025906, 0.        ]
    ])
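
    # relative_face_matrix is the normalized pairwise-distance matrix of a
    # reference frontal face in the order nose, left eye, right eye, left ear,
    # right ear. A matrix of this shape can be rebuilt from any canonical
    # layout (illustrative coordinates, not the ones used above):
    #
    #   canonical = np.array([[0.0, 0.0],   # nose
    #                         [-1.0, 1.0],  # left eye
    #                         [1.0, 1.0],   # right eye
    #                         [-2.0, 0.5],  # left ear
    #                         [2.0, 0.5]])  # right ear
    #   d = np.linalg.norm(canonical[:, None] - canonical[None, :], axis=-1)
    #   d / d.sum()  # same construction, different values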

    model = YOLO('yolov8n-pose.pt')  # You can also try 'yolov8s-pose.pt' for better accuracy
    results = model(to_detect)[0]
    image = cv2.imread(to_detect)

    for person in results.keypoints.data:
        keypoints = person.cpu().numpy()
        print("#" * 50)

        original_points = np.array([[k[0], k[1]] for k in keypoints[:5]])
        is_not_zero = False
        for x, y in original_points:
            if x != 0 or y != 0:
                is_not_zero = True
                break
        if not is_not_zero:
            continue

        rotation_vector = optimize_rotation(original_points, relative_face_matrix)
        optimized_points = apply_optimized_rotation(rotation_vector, original_points)
        optimized_distances = linearize_pairwise_distances(optimized_points, relative_face_matrix)

        # indices of the points that seem to be likely correct
        success_points = []
        for i in range(5):
            s_count = 0
            for j in range(5):
                d = np.abs(optimized_distances[i][j])
                if d < 1:
                    s_count += 1
            if s_count > 2:
                success_points.append(i)

        for point in original_points:
            cv2.circle(image, (int(point[0]), int(point[1])), 4, (0, 0, 255), -1)

        if len(success_points) < 1:
            continue

        valid_face = len(success_points) >= 3
        clean_points = []

        # Reconstruct disregarded points using weighted average of relative positions
        for i in range(5):
            if i not in success_points:
                weighted_sum = np.zeros(2)
                total_weight = 0.0
                for j in success_points:
                    if not np.isnan(relative_face_matrix[i][j]):
                        direction = original_points[j] - original_points[i]
                        norm = np.linalg.norm(direction)
                        if norm > 0:
                            direction = direction / norm
                        estimated_distance = relative_face_matrix[i][j]
                        estimate = original_points[j] - direction * estimated_distance
                        weighted_sum += estimate
                        total_weight += 1
                if total_weight > 0:
                    clean_points.append(weighted_sum / total_weight)
                else:
                    clean_points.append(original_points[i])
            else:
                # trusted points are kept as-is
                clean_points.append(original_points[i])

        clean_points = np.array(clean_points)
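
        # Caveat: relative_face_matrix stores normalized (unitless) distances,
        # so direction * estimated_distance shifts each estimate by well under
        # a pixel and a reconstructed point lands almost exactly on one of the
        # trusted points; treat the reconstruction as a rough fallback rather
        # than a metric fix.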

        # Calculate bounding box from clean_points
        realistic_aspect_ratio = 2 / 3  # width / height
        x_coords = clean_points[:, 0]
        y_coords = clean_points[:, 1]
        min_x = np.min(x_coords)
        max_x = np.max(x_coords)
        min_y = np.min(y_coords)
        max_y = np.max(y_coords)

        # Face-like padding: more space top & bottom than sides
        width = max_x - min_x
        height = max_y - min_y
        normalized_bounding_size = max(width, height * realistic_aspect_ratio)
        real_width = normalized_bounding_size
        real_height = normalized_bounding_size / realistic_aspect_ratio
        padding_x = width * 0.7 + (real_width - width) / 2
        padding_y_top = height * 2 + (real_height - height) / 2
        padding_y_bottom = height * 1.7 + (real_height - height) / 2

        face_box_x1 = int(min_x - padding_x)
        face_box_y1 = int(min_y - padding_y_top)
        face_box_x2 = int(max_x + padding_x)
        face_box_y2 = int(max_y + padding_y_bottom)
        face_bounding_box = (face_box_x1, face_box_y1, face_box_x2, face_box_y2)

        color = (255, 255, 0)
        if valid_face:
            color = (0, 255, 0)
        cv2.rectangle(image, (face_box_x1, face_box_y1), (face_box_x2, face_box_y2), color, 2)

        for point in clean_points:
            cv2.circle(image, (int(point[0]), int(point[1])), 4, color, -1)
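
        # Worked example of the padding (illustrative numbers): for a keypoint
        # extent of width=40, height=30 and aspect ratio 2/3,
        # normalized_bounding_size = max(40, 20) = 40, real_width = 40 and
        # real_height = 60, giving padding_x = 28, padding_y_top = 75 and
        # padding_y_bottom = 66; the box grows from the bare eye/ear/nose
        # extent to roughly head size.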
print("\nOriginal points:")
print(original_points)
print("\nOriginal pairwise distances:")
print(linearize_pairwise_distances(original_points, relative_face_matrix))
print(f"Optimized rotation vector (axis-angle): {rotation_vector}")
print("\nOptimized points after rotation:")
print(optimized_points)
print("\nOptimized pairwise distances:")
print(optimized_distances)
print(success_points)
print(clean_points)
"""
for idx in face_indices:
x, y, conf = keypoints[idx]
name = keypoint_names[idx]
if conf > 0.3:
face_points.append((x, y))
point = (int(x), int(y))
name = keypoint_names[idx]
cv2.circle(image, point, 4, (0, 255, 0), -1)
cv2.putText(image, name, (point[0] + 5, point[1] + 5),
cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), 1)
"""
"""
nose, left_eye, right_eye, left_ear, right_ear = face_points
print(face_points)
# Calculate pairwise distances
nose_to_left_eye = euclidean_distance(nose, left_eye)
nose_to_right_eye = euclidean_distance(nose, right_eye)
eyes_distance = euclidean_distance(left_eye, right_eye)
left_ear_to_nose = euclidean_distance(left_ear, nose)
right_ear_to_nose = euclidean_distance(right_ear, nose)
# Relative distances
eye_to_eye_ratio = eyes_distance / (left_ear_to_nose + right_ear_to_nose) # Eyes vs. nose-to-ears
nose_to_eye_ratio = (nose_to_left_eye + nose_to_right_eye) / (left_ear_to_nose + right_ear_to_nose) # Nose-to-eye vs. ear-to-nose
ear_to_nose_ratio = (left_ear_to_nose + right_ear_to_nose) / 2 # Ear-to-nose proportionality
# Validate using relative distances
if not (EYE_RATIO_THRESHOLD < eye_to_eye_ratio < 0.5): # Arbitrary ratio threshold
print("⚠️ Rejected due to unrealistic eye-to-eye ratio:", eye_to_eye_ratio)
has_valid_face = False
if not (NOSE_EYE_RATIO_THRESHOLD < nose_to_eye_ratio < 0.4): # Arbitrary ratio threshold
print("⚠️ Rejected due to unrealistic nose-to-eye ratio:", nose_to_eye_ratio)
has_valid_face = False
if not (0.5 < ear_to_nose_ratio < EAR_NOSE_RATIO_THRESHOLD):
print("⚠️ Rejected due to unrealistic ear-to-nose ratio:", ear_to_nose_ratio)
has_valid_face = False
# If all checks pass, calculate the bounding box
xs, ys, _ = zip(*face_points)
x_min, x_max = int(min(xs)), int(max(xs))
y_min, y_max = int(min(ys)), int(max(ys))
x_min = max(x_min - face_padding, 0)
y_min = max(y_min - face_padding, 0)
x_max = min(x_max + face_padding, image.shape[1])
y_max = min(y_max + face_padding, image.shape[0])
# Compute box size
box_w = x_max - x_min
box_h = y_max - y_min
if has_valid_face:
cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
else:
cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)
for i, (x, y, conf) in enumerate(keypoints):
point = (int(x), int(y))
name = keypoint_names[i]
# cv2.circle(image, point, 4, (0, 255, 0), -1)
# cv2.putText(image, name, (point[0] + 5, point[1] - 5),
# cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), 1)
# cv2.circle(image, head, 5, (255, 0, 0), -1) # Head in blue
# cv2.circle(image, foot, 5, (0, 0, 255), -1) # Foot in red
"""
cv2.imwrite(detected, image)

def detect_humans(to_detect: str, crop_padding: int = 20, skip_detection_if_present: bool = True):
    _p = Path(to_detect)
    detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
    boxes_file = str(_p.with_name(_p.stem + "_boxes.json"))
    print(f"detecting humans: {to_detect} => {detected}")

    boxes_structures = {}
    human_boxes = boxes_structures["humans"] = []

    if not (Path(boxes_file).exists() and skip_detection_if_present):
        require_net("yolov3")

        # Load YOLO
        net = cv2.dnn.readNet(str(MODEL_PATH / 'yolov3.weights'), str(MODEL_PATH / 'yolov3.cfg'))
        layer_names = net.getLayerNames()
        indices = net.getUnconnectedOutLayers()
        output_layers = [layer_names[int(i) - 1] for i in indices]

        # Load image
        image = cv2.imread(to_detect)
        original_image = cv2.imread(to_detect)
        height, width, channels = image.shape

        # Create blob and do forward pass
        blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
        net.setInput(blob)
        outs = net.forward(output_layers)

        boxes = []
        confidences = []

        # Information for each object detected
        for out in outs:
            for detection in out:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                if confidence > 0.5 and class_id == 0:  # Class ID 0 is human
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)
                    boxes.append([x, y, w, h])
                    confidences.append(float(confidence))

        # Apply Non-Maximum Suppression
        indices = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.4)

        human_part_folder = _p.with_name(_p.stem + "_parts")
        human_part_folder.mkdir(exist_ok=True)

        for i in indices:
            i = i[0] if isinstance(i, (list, np.ndarray)) else i  # Flatten index if needed
            x, y, w, h = boxes[i]

            human_part_image_path = human_part_folder / (_p.stem + "_" + str(i) + _p.suffix)
            image_height, image_width = image.shape[:2]

            # Compute safe crop coordinates with padding
            x1 = max(x - crop_padding, 0)
            y1 = max(y - crop_padding, 0)
            x2 = min(x + w + crop_padding, image_width)
            y2 = min(y + h + crop_padding, image_height)
            human_crop = original_image[y1:y2, x1:x2]
            cv2.imwrite(str(human_part_image_path), human_crop)

            print(f"\tfound human at {x}/{y} with the size of {w} x {h}")
            human_boxes.append({
                "x": x,
                "y": y,
                "w": w,
                "h": h,
                "crop": {
                    "file": str(human_part_image_path),
                    "x": x1,
                    "y": y1,  # padded crop origin, matching the slice above
                    "w": x2 - x1,
                    "h": y2 - y1,
                },
                "parts": {},
            })
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 255), 2)

        # Save the result
        with open(boxes_file, "w") as f:
            json.dump(boxes_structures, f)
        cv2.imwrite(detected, image)
    else:
        with open(boxes_file, "r") as f:
            boxes_structures = json.load(f)
            human_boxes = boxes_structures["humans"]

    for human in human_boxes:
        detect_human_parts(human)
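
A minimal way to exercise the pipeline end to end (a sketch; the path and the __main__ guard are illustrative, not part of the commit):

if __name__ == "__main__":
    detect_humans("assets/images/example.jpg")

Per the code above, this writes example_detected.jpg and example_boxes.json next to the input, saves padded person crops into example_parts/, and runs detect_human_parts on every crop. A second run finds the boxes JSON and, with skip_detection_if_present left at its default of True, skips the YOLOv3 pass and goes straight to face detection on the cached crops.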