Source code for multimodal_fin.processing.multimodal.audio.audio_emotion_analyzer

from dataclasses import dataclass
from typing import Dict
import logging
import pandas as pd
import torch

from multimodal_fin.processing.multimodal.audio.recognizers.base import AudioEmotionRecognizer
from multimodal_fin.processing.multimodal.audio.recognizers.emotion2vec import Emotion2VecRecognizer

from multimodal_fin.utils.logging import get_logger

logger = get_logger(__name__)


@dataclass
class AudioEmotionAnalyzer:
    """
    Extracts emotion-based audio embeddings or emotion classifications using a specified recognizer.

    The recognizer is created in ``__post_init__`` from ``mode``, ``model_name`` and
    ``device`` and stored on ``self.recognizer``.
    """

    mode: str = "emotion2vec"
    """The name of the recognition model type. Currently, only 'emotion2vec' is supported."""

    # NOTE: this default is computed once, when the class body is executed.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    """The computation device to use ('cuda' or 'cpu')."""

    model_name: str = "iic/emotion2vec_plus_large"
    """Name or path of the model to be loaded."""

    def __post_init__(self):
        """
        Build the recognizer that matches ``self.mode``.

        Raises:
            ValueError: If ``self.mode`` is not 'emotion2vec'.
        """
        if self.mode != "emotion2vec":
            raise ValueError(f"Unsupported mode '{self.mode}'. Only 'emotion2vec' is supported.")

        self.recognizer: AudioEmotionRecognizer = Emotion2VecRecognizer(
            model_name=self.model_name,
            device=self.device,
        )

    def classify_audio(self, audio_path: str) -> str:
        """
        Returns the top predicted emotion for a given audio file.

        The recognizer's top label is passed through ``_swap_disgust_fear`` before
        being returned.

        Args:
            audio_path (str): Path to the audio file.

        Returns:
            str: The (possibly swapped) top emotion label.
        """
        emotion_dict = self.recognizer.predict_from_wav(audio_path)
        top_emotion = self.recognizer.get_top_emotion(emotion_dict)
        return self._swap_disgust_fear(top_emotion)

    def classify_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Adds a 'classification' column to a DataFrame by predicting emotions from audio file paths.

        The input DataFrame is modified in place; the returned object is the same
        DataFrame, not a copy.

        Args:
            df (pd.DataFrame): Must contain a 'Path' column with paths to audio files.

        Returns:
            pd.DataFrame: The same DataFrame with a new 'classification' column.

        Raises:
            ValueError: If the DataFrame has no 'Path' column.
        """
        if "Path" not in df.columns:
            raise ValueError("DataFrame must contain a 'Path' column with audio file paths.")

        df["classification"] = df["Path"].apply(self.classify_audio)
        return df

    def _swap_disgust_fear(self, emotion: str) -> str:
        """
        Swaps 'disgusted' and 'fearful' if mode is 'emotion2vec', to address known
        model misclassifications. Any other label is returned unchanged.
        """
        if self.mode == "emotion2vec":
            if emotion == "disgusted":
                return "fearful"
            if emotion == "fearful":
                return "disgusted"
        return emotion

    def get_embeddings(self, audio_path: str) -> torch.Tensor:
        """
        Returns a centered logits vector representing emotional content from the given audio file.

        The vector is ordered as:
        ['happy', 'neutral', 'surprise', 'disgust', 'anger', 'sadness', 'fear']

        Scores are floored at the smallest positive normal value of their dtype
        before the logarithm is taken, so a zero score yields a large negative but
        finite entry instead of turning the centered vector into inf/nan.

        Args:
            audio_path (str): Path to the audio file.

        Returns:
            torch.Tensor: Centered logits vector of emotion scores.

        Raises:
            ValueError: If the recognizer returns no predictions, or if one of the
                seven expected emotion labels is missing from its scores.
        """
        emotion_dict = self.recognizer.predict_from_wav(audio_path)

        # (output label, recognizer label) pairs, listed in the documented output
        # order. Recognizer labels outside this table (e.g. 'other') are ignored.
        # NOTE(review): unlike classify_audio, no disgust/fear swap is applied
        # here -- confirm that this is intended.
        ordered_labels = (
            ("happy", "happy"),
            ("neutral", "neutral"),
            ("surprise", "surprised"),
            ("disgust", "disgusted"),
            ("anger", "angry"),
            ("sadness", "sad"),
            ("fear", "fearful"),
        )

        # Only the first entry of the recognizer output is used; presumably it
        # holds one score mapping per input file -- verify against the recognizer.
        predictions = list(emotion_dict.values())
        if not predictions:
            raise ValueError(f"Recognizer returned no predictions for '{audio_path}'.")
        scores = predictions[0]

        try:
            ordered_probs = [scores[source_label] for _, source_label in ordered_labels]
        except KeyError as e:
            raise ValueError(f"Missing expected emotion in predictions: {e}") from e

        probs_tensor = torch.tensor(ordered_probs)
        if not probs_tensor.is_floating_point():
            # torch.log would promote integer input to the default float dtype
            # anyway; do it up front so the floor below can be applied.
            probs_tensor = probs_tensor.to(torch.get_default_dtype())

        # log(0) is -inf, which makes the mean -inf and the centered result
        # inf/nan. Flooring at the dtype's `tiny` leaves every normal positive
        # score untouched.
        floor = torch.finfo(probs_tensor.dtype).tiny
        logits = torch.log(probs_tensor.clamp(min=floor))
        return logits - logits.mean()