generated from Hazel/python-project
feat: cleaner detection
This commit is contained in:
parent bcfc90acdf
commit ad8f3b8e66
@@ -1,11 +1,15 @@
from __future__ import annotations

from pathlib import Path
import urllib.request
from typing import Dict, List
import json
from dataclasses import dataclass

from ultralytics import YOLO
import cv2
import numpy as np
from scipy.optimize import minimize
from scipy.spatial.transform import Rotation as R


MODEL_PATH = Path("assets", "models")
@@ -38,133 +42,402 @@ def require_net(name: str):
    )
@dataclass
class Keypoint:
    x: float
    y: float
    name: str
    confidence: float = 0

    @property
    def point(self):
        return (int(self.x), int(self.y))

    def get_distance(self, other: Keypoint) -> float:
        return np.sqrt((self.x - other.x) ** 2 + (self.y - other.y) ** 2)

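# Illustrative usage of the Keypoint helper (hypothetical values, not part of
# the diff):
#
#   nose = Keypoint(x=120.0, y=80.0, name="nose", confidence=0.9)
#   left_eye = Keypoint(x=135.0, y=68.0, name="left_eye", confidence=0.8)
#   nose.point                    # -> (120, 80)
#   nose.get_distance(left_eye)   # -> Euclidean distance, ~19.2
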
def detect_human_parts(human: dict, face_padding: int = 20):
    parts = human["parts"]

    to_detect = human["crop"]["file"]
    _p = Path(to_detect)
    detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
    boxes_file = str(_p.with_name(_p.stem + "_boxes.json"))
    print(f"detecting human parts: {to_detect} => {detected}")
    def apply_rotation(rot_matrix, points):
        # Apply the rotation to the points, assuming points are 2D coordinates (flattened)
        return np.dot(rot_matrix, points.T).T

    def linearize_pairwise_distances(points, target_distances):
        # Calculate pairwise distances between the points
        num_points = len(points)
        pairwise_distances = np.zeros((num_points, num_points))
        for i in range(num_points):
            for j in range(i, num_points):
                pairwise_distances[i, j] = np.linalg.norm(points[i] - points[j])
                pairwise_distances[j, i] = pairwise_distances[i, j]  # symmetric matrix

        total_distance = np.sum(pairwise_distances)
        normed_distances = pairwise_distances / total_distance

        # Relative deviation of each normalized pair distance from the target
        return np.abs(normed_distances - target_distances) / target_distances

    def objective(params, original_points, target_distances):
        # Convert params to an axis-angle representation (rotation vector)
        rot = R.from_rotvec(params)
        rotation_matrix = rot.as_matrix()[:2, :2]  # 2D rotation matrix (2x2)

        # Apply the rotation to the original points
        rotated_points = apply_rotation(rotation_matrix, original_points)

        # Compute the pairwise distances for the rotated points
        divergence = linearize_pairwise_distances(rotated_points, target_distances)
        return np.nansum(divergence)

    def optimize_rotation(original_points, relative_face_matrix):
        # Initial guess: rotation vector (zero rotation)
        initial_params = np.zeros(3)

        # Perform the optimization to minimize the divergence
        result = minimize(objective, initial_params, args=(original_points, relative_face_matrix), method='BFGS')

        return result.x  # Rotation vector (axis-angle)

    def apply_optimized_rotation(rotation_vector, original_points):
        # Convert the rotation vector to a rotation matrix (2D)
        rot = R.from_rotvec(rotation_vector)
        rotation_matrix = rot.as_matrix()[:2, :2]  # 2D rotation matrix (2x2)

        # Apply the rotation to the points
        return apply_rotation(rotation_matrix, original_points)
    relative_face_matrix = np.array([
        [0.        , 0.02243309, 0.02243309, 0.05016191, 0.05016191],
        [0.02243309, 0.        , 0.04012953, 0.04486618, 0.07234453],
        [0.02243309, 0.04012953, 0.        , 0.07234453, 0.04486618],
        [0.05016191, 0.04486618, 0.07234453, 0.        , 0.08025906],
        [0.05016191, 0.07234453, 0.04486618, 0.08025906, 0.        ]
    ])
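    # The matrix above encodes the expected normalized pairwise distances
    # between the five COCO face keypoints, ordered [nose, left_eye,
    # right_eye, left_ear, right_ear]; its entries sum to ~1, matching the
    # normalization in linearize_pairwise_distances. Illustrative check
    # (hypothetical points, not part of the diff):
    #
    #   pts = np.array([[100., 50.], [110., 40.], [90., 40.],
    #                   [120., 45.], [80., 45.]])
    #   err = linearize_pairwise_distances(pts, relative_face_matrix)
    #
    # err[i][j] is the relative deviation of each pair from the template;
    # small values mean the pair is plausibly face-shaped.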
    model = YOLO('yolov8n-pose.pt')  # You can also try 'yolov8s-pose.pt' for better accuracy

    results = model(to_detect)[0]

    image = cv2.imread(to_detect)

    did_detect = False
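    # Each entry of results.keypoints.data is one detected person: a (17, 3)
    # array of COCO-ordered keypoints given as (x, y, confidence) rows
    # (assuming the default 17-keypoint pose model).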
    for person in results.keypoints.data:
        keypoints = person.cpu().numpy()

        # Common keypoints: 0=nose, 5=left_shoulder, 11=left_hip, 15=left_ankle (used as the foot)
        head = tuple(map(int, keypoints[0][:2]))
        foot = tuple(map(int, keypoints[15][:2]))
        print("#" * 50)

        cv2.circle(image, head, 5, (255, 0, 0), -1)  # Head in blue
        cv2.circle(image, foot, 5, (0, 0, 255), -1)  # Foot in red
        did_detect = True
        original_points = np.array([[k[0], k[1]] for k in keypoints[:5]])
        is_not_zero = False
        for x, y in original_points:
            if x != 0 or y != 0:
                is_not_zero = True
                break

        if did_detect:
            cv2.imwrite(detected, image)
        if not is_not_zero:
            continue
        rotation_vector = optimize_rotation(original_points, relative_face_matrix)
        optimized_points = apply_optimized_rotation(rotation_vector, original_points)
        optimized_distances = linearize_pairwise_distances(optimized_points, relative_face_matrix)

        # Indices of the points that seem likely to be correct
        success_points = []
        for i in range(5):
            s_count = 0
            for j in range(5):
                d = np.abs(optimized_distances[i][j])
                if d < 1:
                    s_count += 1

            if s_count > 2:
                success_points.append(i)
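        # The `d < 1` test accepts a pair whose normalized distance deviates
        # from the expected face geometry by less than 100%; the diagonal
        # compares 0 against 0 and comes out as nan, which never passes, so
        # `s_count > 2` effectively requires agreement with at least three of
        # the four other keypoints.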
        for point in original_points:
            cv2.circle(image, (int(point[0]), int(point[1])), 4, (0, 0, 255), -1)

        if len(success_points) < 1:
            continue
        valid_face = len(success_points) >= 3
        clean_points = []

        # Reconstruct disregarded points using a weighted average of relative positions
        for i in range(5):
            if i not in success_points:
                weighted_sum = np.zeros(2)
                total_weight = 0.0
                for j in success_points:
                    if not np.isnan(relative_face_matrix[i][j]):
                        direction = original_points[j] - original_points[i]
                        norm = np.linalg.norm(direction)
                        if norm > 0:
                            direction = direction / norm
                        estimated_distance = relative_face_matrix[i][j]
                        estimate = original_points[j] - direction * estimated_distance
                        weighted_sum += estimate
                        total_weight += 1
                if total_weight > 0:
                    clean_points.append(weighted_sum / total_weight)
                else:
                    clean_points.append(original_points[i])
            else:
                # Trusted points are kept as-is so the bounding box covers them too
                clean_points.append(original_points[i])

        clean_points = np.array(clean_points)
        # Calculate bounding box from clean_points
        realistic_aspect_ratio = 2 / 3  # width / height

        x_coords = clean_points[:, 0]
        y_coords = clean_points[:, 1]

        min_x = np.min(x_coords)
        max_x = np.max(x_coords)
        min_y = np.min(y_coords)
        max_y = np.max(y_coords)

        # Face-like padding: more space top & bottom than sides
        width = max_x - min_x
        height = max_y - min_y
        normalized_bounding_size = max(width, height * realistic_aspect_ratio)
        real_width = normalized_bounding_size
        real_height = normalized_bounding_size / realistic_aspect_ratio

        padding_x = width * 0.7 + (real_width - width) / 2
        padding_y_top = height * 2 + (real_height - height) / 2
        padding_y_bottom = height * 1.7 + (real_height - height) / 2

        face_box_x1 = int(min_x - padding_x)
        face_box_y1 = int(min_y - padding_y_top)
        face_box_x2 = int(max_x + padding_x)
        face_box_y2 = int(max_y + padding_y_bottom)

        face_bounding_box = (face_box_x1, face_box_y1, face_box_x2, face_box_y2)
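        # Worked example of the padding math (illustrative numbers): for a
        # keypoint cluster 30 px wide and 20 px tall, normalized_bounding_size
        # = max(30, 20 * 2/3) = 30, so real_width = 30 and real_height = 45.
        # Then padding_x = 21, padding_y_top = 52.5, padding_y_bottom = 46.5,
        # giving a roughly face-proportioned box of about 72 x 119 px.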
        color = (255, 255, 0)
        if valid_face:
            color = (0, 255, 0)

        cv2.rectangle(image, (face_box_x1, face_box_y1), (face_box_x2, face_box_y2), color, 2)
        for point in clean_points:
            cv2.circle(image, (int(point[0]), int(point[1])), 4, color, -1)
        print("\nOriginal points:")
        print(original_points)
        print("\nOriginal pairwise distances:")
        print(linearize_pairwise_distances(original_points, relative_face_matrix))
        print(f"Optimized rotation vector (axis-angle): {rotation_vector}")
        print("\nOptimized points after rotation:")
        print(optimized_points)
        print("\nOptimized pairwise distances:")
        print(optimized_distances)
        print(success_points)
        print(clean_points)
"""
|
||||
for idx in face_indices:
|
||||
x, y, conf = keypoints[idx]
|
||||
name = keypoint_names[idx]
|
||||
if conf > 0.3:
|
||||
face_points.append((x, y))
|
||||
|
||||
point = (int(x), int(y))
|
||||
name = keypoint_names[idx]
|
||||
cv2.circle(image, point, 4, (0, 255, 0), -1)
|
||||
cv2.putText(image, name, (point[0] + 5, point[1] + 5),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), 1)
|
||||
"""
|
||||
|
||||
|
||||
"""
|
||||
nose, left_eye, right_eye, left_ear, right_ear = face_points
|
||||
print(face_points)
|
||||
|
||||
# Calculate pairwise distances
|
||||
nose_to_left_eye = euclidean_distance(nose, left_eye)
|
||||
nose_to_right_eye = euclidean_distance(nose, right_eye)
|
||||
eyes_distance = euclidean_distance(left_eye, right_eye)
|
||||
left_ear_to_nose = euclidean_distance(left_ear, nose)
|
||||
right_ear_to_nose = euclidean_distance(right_ear, nose)
|
||||
|
||||
# Relative distances
|
||||
eye_to_eye_ratio = eyes_distance / (left_ear_to_nose + right_ear_to_nose) # Eyes vs. nose-to-ears
|
||||
nose_to_eye_ratio = (nose_to_left_eye + nose_to_right_eye) / (left_ear_to_nose + right_ear_to_nose) # Nose-to-eye vs. ear-to-nose
|
||||
ear_to_nose_ratio = (left_ear_to_nose + right_ear_to_nose) / 2 # Ear-to-nose proportionality
|
||||
|
||||
# Validate using relative distances
|
||||
if not (EYE_RATIO_THRESHOLD < eye_to_eye_ratio < 0.5): # Arbitrary ratio threshold
|
||||
print("⚠️ Rejected due to unrealistic eye-to-eye ratio:", eye_to_eye_ratio)
|
||||
has_valid_face = False
|
||||
|
||||
if not (NOSE_EYE_RATIO_THRESHOLD < nose_to_eye_ratio < 0.4): # Arbitrary ratio threshold
|
||||
print("⚠️ Rejected due to unrealistic nose-to-eye ratio:", nose_to_eye_ratio)
|
||||
has_valid_face = False
|
||||
|
||||
if not (0.5 < ear_to_nose_ratio < EAR_NOSE_RATIO_THRESHOLD):
|
||||
print("⚠️ Rejected due to unrealistic ear-to-nose ratio:", ear_to_nose_ratio)
|
||||
has_valid_face = False
|
||||
|
||||
|
||||
# If all checks pass, calculate the bounding box
|
||||
xs, ys, _ = zip(*face_points)
|
||||
x_min, x_max = int(min(xs)), int(max(xs))
|
||||
y_min, y_max = int(min(ys)), int(max(ys))
|
||||
|
||||
x_min = max(x_min - face_padding, 0)
|
||||
y_min = max(y_min - face_padding, 0)
|
||||
x_max = min(x_max + face_padding, image.shape[1])
|
||||
y_max = min(y_max + face_padding, image.shape[0])
|
||||
|
||||
# Compute box size
|
||||
box_w = x_max - x_min
|
||||
box_h = y_max - y_min
|
||||
|
||||
if has_valid_face:
|
||||
cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
|
||||
else:
|
||||
cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (255, 0, 0), 2)
|
||||
|
||||
|
||||
|
||||
|
||||
for i, (x, y, conf) in enumerate(keypoints):
|
||||
point = (int(x), int(y))
|
||||
name = keypoint_names[i]
|
||||
# cv2.circle(image, point, 4, (0, 255, 0), -1)
|
||||
# cv2.putText(image, name, (point[0] + 5, point[1] - 5),
|
||||
# cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0), 1)
|
||||
|
||||
# cv2.circle(image, head, 5, (255, 0, 0), -1) # Head in blue
|
||||
# cv2.circle(image, foot, 5, (0, 0, 255), -1) # Foot in red
|
||||
"""
|
||||
|
||||
cv2.imwrite(detected, image)
|
||||
|
||||
|
||||
def detect_humans(to_detect: str, crop_padding: int = 20, skip_detection_if_present: bool = True):
    _p = Path(to_detect)
    detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
    boxes_file = str(_p.with_name(_p.stem + "_boxes.json"))
    print(f"detecting humans: {to_detect} => {detected}")
    boxes_structures = {}
    human_boxes = boxes_structures["humans"] = []

    human_part_folder = _p.with_name(_p.stem + "_parts")
    human_part_folder.mkdir(exist_ok=True)

    if not (Path(boxes_file).exists() and skip_detection_if_present):
        require_net("yolov3")
        # Load YOLO
        net = cv2.dnn.readNet(str(MODEL_PATH / 'yolov3.weights'), str(MODEL_PATH / 'yolov3.cfg'))
        layer_names = net.getLayerNames()
        indices = net.getUnconnectedOutLayers()
        output_layers = [layer_names[int(i) - 1] for i in indices]
        # Load image
        image = cv2.imread(to_detect)
        original_image = cv2.imread(to_detect)
        height, width, channels = image.shape
        # Create blob and do forward pass
        blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
        net.setInput(blob)
        outs = net.forward(output_layers)
        boxes = []
        confidences = []
print(f"\tfound human at {x}/{y} with the size of {w} x {h}")
|
||||
human_boxes.append({
|
||||
"x": x,
|
||||
"y": y,
|
||||
"w": w,
|
||||
"h": h,
|
||||
"crop": {
|
||||
"file": str(human_part_image_path),
|
||||
"x": x1,
|
||||
        # Information for each object detected
        for out in outs:
            for detection in out:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                if confidence > 0.5 and class_id == 0:  # Class ID 0 is human
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)

                    boxes.append([x, y, w, h])
                    confidences.append(float(confidence))

        # Apply Non-Maximum Suppression
        indices = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.4)
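        # cv2.dnn.NMSBoxes returns the indices of the boxes kept after NMS;
        # the element shape differs across OpenCV versions (flat [0, 2, ...]
        # vs nested [[0], [2], ...]), which is why the loop below flattens
        # each index defensively.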
        for i in indices:
            i = i[0] if isinstance(i, (list, np.ndarray)) else i  # Flatten index if needed
            x, y, w, h = boxes[i]

            human_part_image_path = human_part_folder / (_p.stem + "_" + str(i) + _p.suffix)

            image_height, image_width = image.shape[:2]

            # Compute safe crop coordinates with padding
            x1 = max(x - crop_padding, 0)
            y1 = max(y - crop_padding, 0)
            x2 = min(x + w + crop_padding, image_width)
            y2 = min(y + h + crop_padding, image_height)
            human_crop = original_image[y1:y2, x1:x2]

            cv2.imwrite(str(human_part_image_path), human_crop)

            print(f"\tfound human at {x}/{y} with the size of {w} x {h}")
            human_boxes.append({
                "x": x,
                "y": y,
                "w": w,
                "h": h,
                "crop": {
                    "file": str(human_part_image_path),
                    "x": x1,
                    "y": y1,  # top-left corner of the padded crop
                    "w": x2 - x1,
                    "h": y2 - y1,
                },
                "parts": {},
            })
            cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 255), 2)
        # Save the result
        with open(boxes_file, "w") as f:
            json.dump(boxes_structures, f)
        cv2.imwrite(detected, image)
    else:
        with open(boxes_file, "r") as f:
            boxes_structures = json.load(f)
            human_boxes = boxes_structures["humans"]
    for human in human_boxes:
        detect_human_parts(human)
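# Illustrative usage (assumed entry point, not part of the diff): running the
# detector on a single image writes photo_detected.jpg, photo_boxes.json and
# per-person crops under photo_parts/, then runs face detection on each crop.
#
#   detect_humans("photo.jpg", crop_padding=20, skip_detection_if_present=True)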