# Source code for multimodal_fin.processing.multimodal.embeddings_extractor

import os
import logging
from dataclasses import dataclass
from typing import Optional

import pandas as pd

from multimodal_fin.processing.multimodal.audio.audio_emotion_analyzer import AudioEmotionAnalyzer
from multimodal_fin.processing.multimodal.video.video_emotion_analyzer import VideoEmotionAnalyzer
from multimodal_fin.processing.multimodal.text.text_emotion_analyzer import TextEmotionAnalyzer

from multimodal_fin.processing.multimodal.multimodal_embeddings import MultimodalEmbeddings

from multimodal_fin.utils.logging import get_logger

logger = get_logger(__name__)



# [docs]
@dataclass
class EmbeddingsExtractor:
    """
    Extracts multimodal emotion embeddings from a CSV of conference interventions.

    The extractor supports:
      - Audio emotion embeddings via `AudioEmotionAnalyzer`.
      - Text emotion embeddings via `TextEmotionAnalyzer`.
      - Video emotion embeddings via `VideoEmotionAnalyzer`.

    A modality is enabled only when its ``*_model_name`` field is truthy;
    otherwise the matching analyzer attribute is set to ``None``.

    Attributes created outside the dataclass fields:
        audio_emotion: `AudioEmotionAnalyzer` or ``None`` (set in ``__post_init__``).
        text_emotion: `TextEmotionAnalyzer` or ``None`` (set in ``__post_init__``).
        video_emotion: `VideoEmotionAnalyzer` or ``None`` (set in ``__post_init__``).
        multimodal: `MultimodalEmbeddings` built by the latest ``extract`` call.
    """

    audio_model_name: Optional[str] = None
    """Name of the model used for audio emotion recognition."""

    text_model_name: Optional[str] = None
    """Name of the model used for text emotion recognition."""

    video_model_name: Optional[str] = None
    """Name of the model used for video emotion recognition."""

    device: str = "cpu"
    """Computation device (e.g., 'cpu', 'cuda')."""

    verbose: int = 1
    """Verbosity level for logging."""

    def __post_init__(self):
        """
        Initializes emotion analyzers based on selected modalities.

        An analyzer is instantiated only for modalities whose model name is
        truthy; ``None`` or an empty string leaves that modality disabled.
        """
        self.audio_emotion = (
            AudioEmotionAnalyzer(model_name=self.audio_model_name, device=self.device)
            if self.audio_model_name else None
        )
        self.text_emotion = (
            TextEmotionAnalyzer(model_name=self.text_model_name, device=self.device)
            if self.text_model_name else None
        )
        # NOTE(review): the video analyzer is given `mode=` while the other two
        # analyzers are given `model_name=` -- confirm against its signature.
        self.video_emotion = (
            VideoEmotionAnalyzer(mode=self.video_model_name, device=self.device)
            if self.video_model_name else None
        )

        # The initialization message is only emitted at verbosity 2 or higher.
        if self.verbose >= 2:
            logger.debug(f"Initialized EmbeddingsExtractor with device='{self.device}'")

    def extract(self, csv_path: str, original_dir: str) -> pd.DataFrame:
        """
        Loads the classified interventions CSV and computes multimodal embeddings.

        Builds a `MultimodalEmbeddings` instance (stored on ``self.multimodal``,
        replacing any previous one), runs its ``generar_embeddings`` pipeline
        and returns its ``sentences_df``.

        Args:
            csv_path: Path to the CSV file containing interventions.
            original_dir: Directory with associated media files and metadata (LEVEL_3.json, audio.mp3).

        Returns:
            A pandas DataFrame with added columns for each modality's embeddings.
        """
        logger.info(f"Starting embedding extraction for: {csv_path}")
        logger.debug(f"Original directory: {original_dir}")

        # Both companion files are looked up by fixed name inside original_dir.
        path_json = os.path.join(original_dir, "LEVEL_3.json")
        path_audio = os.path.join(original_dir, "audio.mp3")

        # Analyzers for disabled modalities are passed through as None.
        # NOTE(review): the keyword `video_emmotion_analyzer` (double "m") is
        # kept as-is; presumably it mirrors the MultimodalEmbeddings
        # signature -- confirm before renaming.
        self.multimodal = MultimodalEmbeddings(
            path_csv=csv_path,
            path_json=path_json,
            audio_file_path=path_audio,
            audio_emotion_analyzer=self.audio_emotion,
            text_emotion_analyzer=self.text_emotion,
            video_emmotion_analyzer=self.video_emotion
        )

        if self.verbose:
            logger.info("Generating multimodal embeddings...")

        # Run the pipeline
        self.multimodal.generar_embeddings()

        logger.info("Embedding extraction complete.")
        return self.multimodal.sentences_df