Source code for multimodal_fin.processing.multimodal.video.face_detector

from dataclasses import dataclass, field
from typing import List, Optional
import numpy as np
from PIL import Image
from facenet_pytorch import MTCNN
import torch
import logging

logger = logging.getLogger(__name__)



[docs]
@dataclass
class FaceDetector:
    """
    Detects and crops faces from input images or video frames using MTCNN.
    """
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    mtcnn: MTCNN = field(init=False)

    def __post_init__(self):
        self.mtcnn = MTCNN(keep_all=False, post_process=True, device=self.device)


[docs]
    def detect_faces(self, image: Image.Image) -> Optional[Image.Image]:
        """
        Detects a single face in the given PIL image.

        Args:
            image (Image.Image): Input PIL image.

        Returns:
            Optional[Image.Image]: Cropped face image or None.
        """
        boxes, _ = self.mtcnn.detect(image)
        if boxes is not None:
            x1, y1, x2, y2 = map(int, boxes[0])
            logger.debug(f"Detected face at: {(x1, y1, x2, y2)}")
            return image.crop((x1, y1, x2, y2))
        return None



[docs]
    def recognize_faces(self, frame: np.ndarray) -> List[np.ndarray]:
        """
        Detects multiple faces in a video frame.

        Args:
            frame (np.ndarray): Input frame (BGR or RGB format).

        Returns:
            List[np.ndarray]: List of cropped face arrays.
        """
        boxes, probs = self.mtcnn.detect(frame)
        if boxes is None or probs is None:
            return []

        selected = boxes[probs > 0.9]
        faces = []
        for box in selected:
            x1, y1, x2, y2 = map(int, box)
            face = frame[y1:y2, x1:x2]
            faces.append(face)

        logger.debug(f"{len(faces)} faces detected in frame.")
        return faces