# Source code for multimodal_fin.processing.multimodal.embeddings_extractor
import os
import logging
from dataclasses import dataclass
from typing import Optional
import pandas as pd
from multimodal_fin.processing.multimodal.audio.audio_emotion_analyzer import AudioEmotionAnalyzer
from multimodal_fin.processing.multimodal.video.video_emotion_analyzer import VideoEmotionAnalyzer
from multimodal_fin.processing.multimodal.text.text_emotion_analyzer import TextEmotionAnalyzer
from multimodal_fin.processing.multimodal.multimodal_embeddings import MultimodalEmbeddings
from multimodal_fin.utils.logging import get_logger
logger = get_logger(__name__)
# [docs]
@dataclass
class EmbeddingsExtractor:
    """
    Compute multimodal emotion embeddings for a CSV of conference interventions.

    One analyzer is created per modality whose model name is set (truthy):

    - audio -> `AudioEmotionAnalyzer`
    - text  -> `TextEmotionAnalyzer`
    - video -> `VideoEmotionAnalyzer`

    A modality whose model name is left unset gets `None` instead of an
    analyzer, and that `None` is handed on to `MultimodalEmbeddings`.
    """

    audio_model_name: Optional[str] = None
    """Audio emotion model name; falsy disables the audio analyzer."""

    text_model_name: Optional[str] = None
    """Text emotion model name; falsy disables the text analyzer."""

    video_model_name: Optional[str] = None
    """Video emotion model name; falsy disables the video analyzer."""

    device: str = "cpu"
    """Device string forwarded to every analyzer (e.g., 'cpu', 'cuda')."""

    verbose: int = 1
    """Logging verbosity: >= 2 logs initialization, non-zero logs progress."""

    def __post_init__(self):
        """Create the per-modality analyzers, or `None` where no model is configured."""
        if self.audio_model_name:
            self.audio_emotion = AudioEmotionAnalyzer(
                model_name=self.audio_model_name,
                device=self.device,
            )
        else:
            self.audio_emotion = None

        if self.text_model_name:
            self.text_emotion = TextEmotionAnalyzer(
                model_name=self.text_model_name,
                device=self.device,
            )
        else:
            self.text_emotion = None

        if self.video_model_name:
            # NOTE(review): the video analyzer receives the name as `mode=`,
            # unlike the `model_name=` used for audio/text -- confirm against
            # VideoEmotionAnalyzer's signature.
            self.video_emotion = VideoEmotionAnalyzer(
                mode=self.video_model_name,
                device=self.device,
            )
        else:
            self.video_emotion = None

        if self.verbose >= 2:
            logger.debug(f"Initialized EmbeddingsExtractor with device='{self.device}'")

    def extract(self, csv_path: str, original_dir: str) -> pd.DataFrame:
        """
        Run the multimodal embedding pipeline over a classified interventions CSV.

        Args:
            csv_path: Path to the CSV file containing interventions.
            original_dir: Directory in which `LEVEL_3.json` and `audio.mp3`
                are looked up.

        Returns:
            The `sentences_df` attribute of the `MultimodalEmbeddings`
            instance after `generar_embeddings()` has run (annotated as a
            pandas DataFrame).
        """
        logger.info(f"Starting embedding extraction for: {csv_path}")
        logger.debug(f"Original directory: {original_dir}")

        # Both companion files are resolved relative to the original directory.
        level3_json_path = os.path.join(original_dir, "LEVEL_3.json")
        audio_path = os.path.join(original_dir, "audio.mp3")

        # NOTE(review): the `video_emmotion_analyzer` spelling is kept as-is;
        # it has to match MultimodalEmbeddings' parameter name -- confirm there.
        embedder = MultimodalEmbeddings(
            path_csv=csv_path,
            path_json=level3_json_path,
            audio_file_path=audio_path,
            audio_emotion_analyzer=self.audio_emotion,
            text_emotion_analyzer=self.text_emotion,
            video_emmotion_analyzer=self.video_emotion,
        )
        # Keep the embedder reachable on the instance once it is built.
        self.multimodal = embedder

        if self.verbose:
            logger.info("Generating multimodal embeddings...")

        embedder.generar_embeddings()

        logger.info("Embedding extraction complete.")
        return embedder.sentences_df