Source code for multimodal_fin.embeddings.builder.feature_extractor
import numpy as np
import torch
from typing import List, Optional, Tuple, Union
from multimodal_fin.utils.logging import get_logger
# Module-level logger for this module, created through the project's logging helper.
logger = get_logger(__name__)
[docs]
class FeatureExtractor:
    """
    Extracts multimodal (text, audio, video) and metadata features from a conference tree node.

    Converts data into tensors suitable for model input. Every modality is
    represented as ``EMBEDDING_DIM`` values per time step; the three modalities
    are concatenated column-wise into a single feature matrix.
    """

    # Width of one per-modality feature row. Three modalities are concatenated
    # in `extract`, giving 3 * EMBEDDING_DIM (= 21) columns per time step.
    EMBEDDING_DIM = 7

    def __init__(
        self,
        categories_10k: Optional[List[str]] = None,
        qa_categories: Optional[List[str]] = None,
        max_num_coherences: int = 5
    ):
        """
        Initializes the feature extractor with configuration.

        Args:
            categories_10k: List of 10-K section labels used for one-hot encoding.
                Falls back to a built-in list when ``None`` or empty.
            qa_categories: List of response types for QA classification.
                Falls back to a built-in list when ``None`` or empty.
            max_num_coherences: Maximum number of coherence entries to encode.
        """
        self.categories_10k = categories_10k or ["MD&A", "Risk Factors", "Business", "Other"]
        self.qa_categories = qa_categories or ["yes", "no", "partially"]
        self.max_num_coherences = max_num_coherences
        logger.info("✅ FeatureExtractor initialized")

    def to_onehot(self, value: str, options: List[str]) -> np.ndarray:
        """
        Converts a categorical value to a one-hot encoded vector.

        Args:
            value: The category to encode.
            options: Ordered list of known categories.

        Returns:
            A float32 vector of length ``len(options)``; all zeros when
            ``value`` is not one of ``options``.
        """
        vec = np.zeros(len(options), dtype=np.float32)
        if value in options:
            vec[options.index(value)] = 1.0
        return vec

    def to_onehot_bool(self, value: bool) -> np.ndarray:
        """Encodes a boolean as a float32 one-hot pair: ``[1, 0]`` for False, ``[0, 1]`` for True."""
        vec = np.zeros(2, dtype=np.float32)
        vec[int(bool(value))] = 1.0
        return vec

    def safe_len(self, emb: Union[List, dict]) -> int:
        """
        Safely computes the length of embeddings regardless of structure.

        Args:
            emb: A list (its length is returned), a dict (the length of its
                longest value is returned, 0 when empty) or anything else.

        Returns:
            The number of top-level entries, or 0 for unsupported types.
        """
        if isinstance(emb, list):
            return len(emb)
        if isinstance(emb, dict):
            return max((len(v) for v in emb.values()), default=0)
        return 0

    def get_array_from_embedding(self, emb_data: Union[List, dict], n_target: int) -> np.ndarray:
        """
        Converts raw embeddings into a padded NumPy array of shape [n_target, 7].

        Args:
            emb_data: List or dict of raw embeddings. For a dict, the first
                value that forms a 2-D array with 7 columns is used. Any other
                type yields an all-zero array.
            n_target: Desired number of time steps (padding/truncating applied).

        Returns:
            A float32 NumPy array of shape [n_target, 7]. Missing rows are
            zero-padded at the end; surplus rows are dropped.

        Raises:
            ValueError: If a flat (1-D) list has a length that is not a
                multiple of 7, or list data cannot be stacked with the padding.
        """
        dim = self.EMBEDDING_DIM

        if isinstance(emb_data, list):
            arr = np.array(emb_data)
        elif isinstance(emb_data, dict):
            # Pick the first dict value that already has the expected 2-D layout.
            arr = None
            for raw in emb_data.values():
                candidate = np.array(raw)
                if candidate.ndim == 2 and candidate.shape[1] == dim:
                    arr = candidate
                    break
            if arr is None:
                return np.zeros((n_target, dim), dtype=np.float32)
        else:
            return np.zeros((n_target, dim), dtype=np.float32)

        if arr.ndim == 1:
            # -1 lets NumPy infer the row count: an empty list becomes (0, 7)
            # and a flat 7-vector becomes a single row instead of raising.
            arr = arr.reshape(-1, dim)

        # Keep the dtype identical to the zero-filled fallbacks above.
        arr = arr.astype(np.float32, copy=False)

        missing = n_target - arr.shape[0]
        if missing > 0:
            arr = np.vstack([arr, np.zeros((missing, dim), dtype=np.float32)])
        return arr[:n_target]

    def extract(self, node) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray]:
        """
        Extracts a multimodal tensor and metadata vector from a tree node.

        Args:
            node: A `ConferenceNode` object containing multimodal data and metadata.
                The attributes read here are ``text_embeddings``,
                ``audio_embeddings``, ``video_embeddings``, ``metadata``
                (accessed with ``.get``) and ``name``.

        Returns:
            frases: Tensor of shape [1, n, 21] with concatenated features.
            mask: Boolean tensor of shape [1, n] indicating valid time steps.
            meta_vec: Array of metadata of shape [expected_size].
        """
        sources = (node.text_embeddings, node.audio_embeddings, node.video_embeddings)

        # Use the longest modality as the common length; at least one step is
        # always produced so downstream shapes are never empty.
        n = max(max(self.safe_len(src) for src in sources), 1)

        frases = np.concatenate(
            [self.get_array_from_embedding(src, n) for src in sources], axis=1
        )  # [n, 21]
        frases_tensor = torch.tensor(frases, dtype=torch.float32).unsqueeze(0)  # [1, n, 21]
        # Every position is flagged True, including rows that are zero padding
        # for a shorter modality.
        mask_tensor = torch.ones((1, n), dtype=torch.bool)  # [1, n]

        metadata = node.metadata
        meta = []

        # Classification metadata (10-K section). `or` guards against keys
        # that are present but explicitly set to None.
        cls = metadata.get("classification") or {}
        meta.append(float(cls.get("Confidence") or 0.0))
        meta.extend(self.to_onehot(cls.get("Predicted_category", "Other"), self.categories_10k))

        # QA response metadata. The prediction is lower-cased, so the
        # configured categories are lower-cased too; otherwise custom
        # mixed-case categories could never match.
        qa = metadata.get("qa_response") or {}
        meta.append(float(qa.get("Confidence") or 0.0))
        pred_cat = str(qa.get("Predicted_category", "")).lower()
        meta.extend(self.to_onehot(pred_cat, [cat.lower() for cat in self.qa_categories]))

        # Coherence metadata: one boolean one-hot pair per entry, capped at
        # max_num_coherences. Missing entries stay zero via the padding below.
        coherences = metadata.get("coherence") or []
        for coh in coherences[:self.max_num_coherences]:
            meta.extend(self.to_onehot_bool(coh.get("consistent", False)))

        expected_size = (
            1 + len(self.categories_10k)
            + 1 + len(self.qa_categories)
            + 2 * self.max_num_coherences
        )
        meta_vec = np.array(meta, dtype=np.float32)
        if len(meta_vec) < expected_size:
            meta_vec = np.pad(meta_vec, (0, expected_size - len(meta_vec)))
        elif len(meta_vec) > expected_size:
            meta_vec = meta_vec[:expected_size]

        logger.debug(f"Extracted features from node: {node.name}")
        return frases_tensor, mask_tensor, meta_vec