# Source code for multimodal_fin.processing.multimodal.video.video_emotion_analyzer

from dataclasses import dataclass
import pandas as pd
import torch
import logging

from multimodal_fin.processing.multimodal.video.analyzer import EmotionVideoAnalyzer
from multimodal_fin.processing.multimodal.video.face_detector import FaceDetector
from multimodal_fin.processing.multimodal.video.processor import VideoProcessor
from multimodal_fin.processing.multimodal.video.recognizers.vit import VITRecognizer
from multimodal_fin.processing.multimodal.video.recognizers.fer import FERRecognizer
from multimodal_fin.processing.multimodal.video.recognizers.emotieff import EmotiEffRecognizer

from multimodal_fin.utils.logging import get_logger

logger = get_logger(__name__)



[docs]
@dataclass
class VideoEmotionAnalyzer:
    """
    High-level video emotion classification pipeline.

    This class orchestrates the process of detecting faces,
    recognizing emotions per frame, and aggregating predictions
    to produce a single dominant emotion for the full video.
    """

    mode: str
    """Recognition model type ('vit', 'fer', 'emotieff')."""

    skips: float = 0.1
    """Fraction of frames to process."""

    method: str = "mode"
    """Aggregation strategy ('mode', 'mean', 'abs')."""

    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    """Device to use ('cuda' or 'cpu')."""

    emotieff_model: str = "enet_b0_8_best_afew"
    """Model name for EmotiEffRecognizer."""

    def __post_init__(self):
        logger.info(f"Initializing video analysis with mode={self.mode}, device={self.device}")

        self.face_detector = FaceDetector(device=self.device)
        self.video_processor = VideoProcessor(skips=self.skips)

        match self.mode:
            case "vit":
                self.recognizer = VITRecognizer(device=self.device)
            case "fer":
                self.recognizer = FERRecognizer(device=self.device)
            case "emotieff":
                self.recognizer = EmotiEffRecognizer(device=self.device, model=self.emotieff_model)
            case _:
                raise ValueError("Unsupported mode. Choose from: 'vit', 'fer', 'emotieff'.")

        self.analyzer = EmotionVideoAnalyzer(
            recognizer=self.recognizer,
            face_detector=self.face_detector,
            processor=self.video_processor
        )


[docs]
    def analyze_video(self, video_path: str) -> str:
        """
        Runs emotion recognition on a full video.

        Args:
            video_path (str): Path to the video file.

        Returns:
            str: Predicted dominant emotion.
        """
        df = self.analyzer.analyze_video(video_path)

        if df.empty:
            logger.warning("No predictions were made from the video.")
            return 'unknown'

        prediction = self.get_aggregated_prediction(df)
        if self.mode != "emotieff":
            prediction = self.swap_disgust_fear(prediction)

        logger.info(f"Predicted emotion: {prediction}")
        return prediction



[docs]
    def get_aggregated_prediction(self, df: pd.DataFrame) -> str:
        """
        Aggregates frame-level predictions using the selected strategy.

        Args:
            df (pd.DataFrame): DataFrame of frame-wise emotion probabilities.

        Returns:
            str: Final predicted emotion.
        """
        match self.method:
            case "mode":
                return df.idxmax(axis=1).mode()[0]
            case "mean":
                return df.mean().idxmax()
            case "abs":
                return df.max().idxmax()
            case _:
                raise ValueError(f"Unsupported aggregation method: {self.method}")



[docs]
    def swap_disgust_fear(self, emotion: str) -> str:
        """
        Optionally swaps 'disgust' and 'fear' to align with common misclassifications.

        Args:
            emotion (str): The predicted emotion.

        Returns:
            str: Possibly corrected emotion.
        """
        if emotion == "disgust":
            return "fear"
        elif emotion == "fear":
            return "disgust"
        return emotion



[docs]
    def classify_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Applies video emotion classification to each path in a DataFrame.

        Args:
            df (pd.DataFrame): Must contain a 'Path' column with video paths.

        Returns:
            pd.DataFrame: Same DataFrame with an added 'classification' column.
        """
        if "Path" not in df.columns:
            raise ValueError("DataFrame must contain a 'Path' column.")

        logger.info(f"Classifying {len(df)} videos...")
        df["classification"] = df["Path"].apply(self.analyze_video)
        return df



[docs]
    def get_embeddings(self, video_path: str):
        """
        Placeholder for extracting emotion embeddings from video.

        Args:
            video_path (str): Path to video file.

        Returns:
            torch.Tensor: Emotion embedding (future implementation).
        """
        raise NotImplementedError("Embedding extraction is not yet implemented.")