Hey CanMV Community,
I’m new to hardware programming and currently working on face detection using the K230 (hardware v1.1) with CanMV Micropython. Below are the functions I’ve added for post-processing to improve the shape of detected faces. However, I’m facing performance issues when using them.
Steps to Reproduce
- Set up a face detection pipeline on the K230 with CanMV Micropython, using a YuNet model for detection.
- Add the following functions (see code below) to post-process and align detected faces based on eye landmarks.
- Run the pipeline with
`rgb888p_size = [1920, 1080]` and `display_size = [1920, 1080]` on an HDMI display, processing live video frame by frame. - Observe the system behavior during continuous face detection.
Expected Results and Actual Results
- Expected Results: Smooth live face detection at a reasonable frame rate (e.g., 10-15 FPS) with properly aligned faces for further processing (e.g., recognition).
- Actual Results: When using these alignment functions, the K230 lags significantly, often freezing or stuttering during live detection. Without these functions, detection runs smoother (around 10-12 FPS), with occasional minor stutters (once per hour), but it’s much less laggy compared to when the alignment code is included.
Software and Hardware Version Information
- Hardware: K230 development board, version 1.1.
- Software: CanMV Micropython (latest version as of March 17, 2025), YuNet model (
k230_face_detection_yunet.kmodel).
Error Log
- No specific error messages appear in the console.
- The system slows down or freezes intermittently when the alignment functions are active, especially during live frame processing. Timing logs (e.g., via
ScopedTiming) aren’t captured yet, but the lag is visually noticeable.
Solution Attempts
- Removed the alignment code: Detection runs smoothly without
`align_img_wrt_eyes` and related functions, though minor stutters still occur occasionally (once per hour). - Adjusted resolution: Tested with
rgb888p_size = [640, 320], which reduces lag but compromises detection accuracy, so I reverted to 1920x1080. - Considered optimization: Thought about simplifying the
`M = np.array` transformation or offloading it to hardware, but I’m unsure how to proceed with the K230’s capabilities.
Supplementary Materials
Here are the functions I’m using to post-process and align faces after detection:
import ulab.numpy as np
def align_img_wrt_eyes(img, left_eye, right_eye):
    """Rotate `img` about its centre so the eye line becomes horizontal.

    Args:
        img: ndarray in planar RGB (3, H, W), interleaved RGB (H, W, 3)
            or grayscale (H, W) layout.
        left_eye: (x, y) of one eye in `img` coordinates, or None/falsy.
        right_eye: (x, y) of the other eye, or None/falsy.

    Returns:
        (aligned_img, angle) where angle is in degrees.  When either eye
        is missing, `img` is returned unchanged with angle 0.

    Raises:
        ValueError: for any other array shape.
    """
    # Without both eye landmarks there is nothing to align against.
    if not left_eye or not right_eye:
        return img, 0

    # Roll angle of the eye line.  arctan2 gives radians; keep the radian
    # value for the trig below and convert to degrees only for the caller
    # (the original round-tripped degrees -> radians twice per call).
    theta = float(np.arctan2(left_eye[1] - right_eye[1],
                             left_eye[0] - right_eye[0]))
    angle = theta * 180.0 / np.pi

    # Classify the memory layout ONCE (the original dispatched on the
    # shape a second time before the warp loops).
    shape = img.shape
    is_chw = len(shape) == 3 and shape[0] == 3   # planar (C, H, W)
    is_hwc = len(shape) == 3 and shape[2] == 3   # interleaved (H, W, C)
    if is_chw:
        channels, h, w = shape
    elif is_hwc:
        h, w, channels = shape
    elif len(shape) == 2:                        # grayscale
        h, w = shape
        channels = 1
    else:
        raise ValueError("Unsupported image shape: {}".format(shape))

    cx, cy = w // 2, h // 2
    c = float(np.cos(theta))
    s = float(np.sin(theta))

    # Inverse-mapping affine coefficients (destination -> source), kept as
    # plain Python floats.  The original stored them in an ndarray with
    # dtype=np.float (an alias removed from modern NumPy) and indexed
    # M[r, c] once per pixel — ndarray scalar indexing is very expensive
    # on MicroPython and dominated the per-frame cost.
    m00, m01, m02 = c, s, (1.0 - c) * cx - s * cy
    m10, m11, m12 = -s, c, s * cx + (1.0 - c) * cy

    # Nearest-neighbour warp (int() truncation, as before).  The
    # row-invariant terms are hoisted out of the inner loop.
    # NOTE(review): the remaining O(H*W) Python loop is still the hot
    # spot on-device; running it on a small face crop rather than the
    # full 1080p frame is what keeps it tractable.
    if is_chw:
        aligned_img = np.zeros((channels, h, w), dtype=np.uint8)
        for i in range(h):
            row_x = m01 * i + m02
            row_y = m11 * i + m12
            for j in range(w):
                src_x = int(m00 * j + row_x)
                src_y = int(m10 * j + row_y)
                if 0 <= src_x < w and 0 <= src_y < h:
                    for k in range(channels):
                        aligned_img[k, i, j] = img[k, src_y, src_x]
    elif is_hwc:
        aligned_img = np.zeros((h, w, channels), dtype=np.uint8)
        for i in range(h):
            row_x = m01 * i + m02
            row_y = m11 * i + m12
            for j in range(w):
                src_x = int(m00 * j + row_x)
                src_y = int(m10 * j + row_y)
                if 0 <= src_x < w and 0 <= src_y < h:
                    for k in range(channels):
                        aligned_img[i, j, k] = img[src_y, src_x, k]
    else:
        aligned_img = np.zeros((h, w), dtype=np.uint8)
        for i in range(h):
            row_x = m01 * i + m02
            row_y = m11 * i + m12
            for j in range(w):
                src_x = int(m00 * j + row_x)
                src_y = int(m10 * j + row_y)
                if 0 <= src_x < w and 0 <= src_y < h:
                    aligned_img[i, j] = img[src_y, src_x]

    return aligned_img, angle
def project_facial_area(facial_area, angle, size):
    """Project an axis-aligned box through the rotation used by the
    alignment step and return its axis-aligned bounding box.

    Args:
        facial_area: (x1, y1, x2, y2) box in pre-rotation coordinates.
        angle: rotation angle in degrees (as returned by the aligner).
        size: (h, w) of the rotated image.

    Returns:
        (x1, y1, x2, y2) floats, clamped to [0, w] x [0, h] so callers
        can use them directly as slice bounds.
    """
    x1, y1, x2, y2 = facial_area
    h, w = size
    cx, cy = w // 2, h // 2
    cos_val = float(np.cos(np.radians(angle)))
    sin_val = float(np.sin(np.radians(angle)))

    def rotate_point(x, y):
        # Rotate (x, y) about the image centre, same convention as the
        # alignment warp.
        x -= cx
        y -= cy
        return (x * cos_val + y * sin_val + cx,
                -x * sin_val + y * cos_val + cy)

    # Fix: rotate all FOUR corners.  The original comment promised this
    # but only two opposite corners were rotated, which underestimates
    # the bounding box of a rotated rectangle.
    corners = (rotate_point(x1, y1), rotate_point(x2, y1),
               rotate_point(x1, y2), rotate_point(x2, y2))
    xs = [p[0] for p in corners]
    ys = [p[1] for p in corners]

    # Clamp to the image: negative values would otherwise wrap around
    # when used as ndarray slice starts downstream.
    return (max(0.0, min(xs)), max(0.0, min(ys)),
            min(float(w), max(xs)), min(float(h), max(ys)))
def expand_and_align_face(img, x, y, w, h, landmarks, align=True, expand_percentage=0):
    """Crop a detected face from `img`, optionally rotating it so the eyes
    are level, and package the region metadata.

    Args:
        img: full frame, indexed as img[y:y+h, x:x+w] (row-major layout).
        x, y, w, h: detection box.
        landmarks: flat sequence [x0, y0, x1, y1, ...] — eye-left, eye-right,
            nose, mouth-left, mouth-right; shorter sequences are allowed.
        align: rotate the crop when both eye landmarks are present.
        expand_percentage: optional symmetric box growth, in percent.

    Returns:
        (facial_area_dict, face_crop); the crop is None for a degenerate box.
    """
    def landmark_pair(idx):
        # Return the idx-th (x, y) pair from the flat landmark list, or
        # None when it is absent.
        if len(landmarks) >= 2 * idx + 2:
            return (landmarks[2 * idx], landmarks[2 * idx + 1])
        return None

    left_eye = landmark_pair(0)
    right_eye = landmark_pair(1)

    # Grow the box symmetrically, keeping it inside the frame.
    if expand_percentage > 0:
        grown_w = w + int(w * expand_percentage / 100)
        grown_h = h + int(h * expand_percentage / 100)
        x = max(0, x - int((grown_w - w) / 2))
        y = max(0, y - int((grown_h - h) / 2))
        w = min(img.shape[1] - x, grown_w)
        h = min(img.shape[0] - y, grown_h)

    # Degenerate region: report an empty area and no face crop.
    if w <= 0 or h <= 0:
        empty_area = {"x": x, "y": y, "w": 0, "h": 0, "left_eye": None,
                      "right_eye": None, "nose": None, "mouth_left": None,
                      "mouth_right": None}
        return empty_area, None

    detected_face = img[y:y+h, x:x+w]

    if align and left_eye and right_eye:
        # Rotate the crop, then re-crop to the rotated box so the output
        # stays tight around the face.
        aligned_face, angle = align_img_wrt_eyes(detected_face, left_eye, right_eye)
        bx1, by1, bx2, by2 = project_facial_area((0, 0, w, h), angle, (h, w))
        cropped_face = aligned_face[int(by1):int(by2), int(bx1):int(bx2)]
    else:
        cropped_face = detected_face

    facial_area = {
        "x": x, "y": y, "w": w, "h": h,
        "left_eye": left_eye, "right_eye": right_eye,
        "nose": landmark_pair(2),
        "mouth_left": landmark_pair(3),
        "mouth_right": landmark_pair(4),
    }
    return facial_area, cropped_face