From f13878d8bc298b0273e02051d894084f9175c61a Mon Sep 17 00:00:00 2001
From: Lars Noack
Date: Wed, 23 Apr 2025 12:42:19 +0200
Subject: [PATCH] feat: improved human detection

---
 pyproject.toml                     |  2 +-
 secure_pixelation/__main__.py      |  2 +-
 secure_pixelation/detect_humans.py | 83 +++++++++++++++++++++++++++++++++----
 3 files changed, 81 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e1ea76e..31f66c1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ name = "secure_pixelation"
 version = "0.0.0"
 dependencies = [
     "opencv_python~=4.11.0.86",
-    "imutils~=0.5.4",
+    "mediapipe~=0.10",  # assumed pin; required by the new pose-based detector
 ]
 authors = []
 description = "Hiding faces with Mosaic has proven incredibly unsafe especially with videos, because the algorithm isn't destructive. However, if you black out the selected area, repopulate it with generative ai, and then pixelate it, it should look authentic, but be 100% destructive, thus safe."

diff --git a/secure_pixelation/__main__.py b/secure_pixelation/__main__.py
index 538b932..cc4b6cd 100644
--- a/secure_pixelation/__main__.py
+++ b/secure_pixelation/__main__.py
@@ -4,4 +4,4 @@ from .detect_humans import detect_humans
 
 def cli():
     print(f"Running secure_pixelation")
-    detect_humans("assets/humans.png")
+    detect_humans("assets/human_detection/humans.png")

diff --git a/secure_pixelation/detect_humans.py b/secure_pixelation/detect_humans.py
index 67aab6e..75afa76 100644
--- a/secure_pixelation/detect_humans.py
+++ b/secure_pixelation/detect_humans.py
@@ -1,10 +1,11 @@
 from pathlib import Path
 import urllib.request
 from typing import Dict, List
+import json
 
 import cv2
-import imutils
 import numpy as np
+import mediapipe as mp
 
 
 
@@ -38,13 +39,62 @@ def require_net(name: str):
         )
 
 
+def detect_humans_mediapipe(to_detect: str):
+    _p = Path(to_detect)
+    detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
+    print(f"detecting humans: {to_detect} => {detected}")
 
-#        print(f"\tfound human at {x}/{y} with the size of {w} x {h}")
+    # Initialize MediaPipe Pose in static_image_mode, since single images are
+    # processed here rather than a video stream
+    mp_pose = mp.solutions.pose
+    pose = mp_pose.Pose(static_image_mode=True, min_detection_confidence=0.5)
+    mp_drawing = mp.solutions.drawing_utils
+
+    # Load the image
+    image = cv2.imread(to_detect)
+
+    # Convert the BGR image to RGB (MediaPipe expects RGB)
+    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+    # Perform pose detection
+    results = pose.process(image_rgb)
+
+    # Without this guard the landmark lookups below would raise an
+    # AttributeError whenever no person is found in the image
+    if results.pose_landmarks is None:
+        print("\tno human pose found")
+        return
+
+    # Draw the full skeleton onto the image (optional)
+    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
+
+    # Extract specific keypoints (head, feet, etc.)
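+    # (For reference: MediaPipe Pose numbers 33 landmarks, e.g. 0 = nose,
+    # 15/16 = wrists, 27/28 = ankles, 31/32 = foot tips; see mp_pose.PoseLandmark)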
+    # Example: 0 = nose, 31 = left foot tip, 32 = right foot tip
+    head = results.pose_landmarks.landmark[mp_pose.PoseLandmark.NOSE]
+    left_foot = results.pose_landmarks.landmark[mp_pose.PoseLandmark.LEFT_FOOT_INDEX]
+    right_foot = results.pose_landmarks.landmark[mp_pose.PoseLandmark.RIGHT_FOOT_INDEX]
+
+    # Convert the normalized landmark positions to pixel coordinates
+    h, w, _ = image.shape
+    head_coords = int(head.x * w), int(head.y * h)
+    left_foot_coords = int(left_foot.x * w), int(left_foot.y * h)
+    right_foot_coords = int(right_foot.x * w), int(right_foot.y * h)
+
+    # Draw head and feet positions onto the image
+    cv2.circle(image, head_coords, 5, (255, 0, 0), -1)        # head in blue
+    cv2.circle(image, left_foot_coords, 5, (0, 0, 255), -1)   # left foot in red
+    cv2.circle(image, right_foot_coords, 5, (0, 0, 255), -1)  # right foot in red
+
+    # Save the result
+    cv2.imwrite(detected, image)
 
 
 def detect_humans(to_detect: str):
     _p = Path(to_detect)
-    detected = str(_p.with_name(_p.stem + ".detected" + _p.suffix))
+    detected = str(_p.with_name(_p.stem + "_detected" + _p.suffix))
+    boxes_file = str(_p.with_name(_p.stem + "_boxes.json"))
     print(f"detecting humans: {to_detect} => {detected}")
 
     require_net("yolov3")
@@ -58,6 +108,7 @@ def detect_humans(to_detect: str):
 
     # Load image
     image = cv2.imread(to_detect)
+    original_image = image.copy()  # untouched copy to crop from; boxes get drawn onto image
     height, width, channels = image.shape
 
     # Create blob and do forward pass
@@ -88,14 +139,38 @@ def detect_humans(to_detect: str):
 
     # Apply Non-Maximum Suppression
     indices = cv2.dnn.NMSBoxes(boxes, confidences, score_threshold=0.5, nms_threshold=0.4)
 
+    boxes_structures = {}
+    human_boxes = boxes_structures["humans"] = []
+
+    human_part_folder = _p.with_name(_p.stem + "_parts")
+    human_part_folder.mkdir(exist_ok=True)
+
     for i in indices:
         i = i[0] if isinstance(i, (list, np.ndarray)) else i  # Flatten index if needed
         x, y, w, h = boxes[i]
+
+        # Clamp to the image: YOLO can return slightly negative coordinates,
+        # and negative slice indices would silently wrap around in numpy
+        x, y = max(x, 0), max(y, 0)
+
+        human_part_image_path = human_part_folder / (_p.stem + "_" + str(i) + _p.suffix)
+        human_crop = original_image[y:y+h, x:x+w]
+        cv2.imwrite(str(human_part_image_path), human_crop)
+
         print(f"\tfound human at {x}/{y} with the size of {w} x {h}")
+        human_boxes.append({
+            "x": x,
+            "y": y,
+            "w": w,
+            "h": h,
+            "cropped": str(human_part_image_path)
+        })
+
         cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 255), 2)
 
     # Save the result
+    with open(boxes_file, "w") as f:
+        json.dump(boxes_structures, f)
     cv2.imwrite(detected, image)
 
-