Source code for multimodal_fin.processing.preprocessing.transcript_preprocessor

from dataclasses import dataclass
from typing import Optional
import pandas as pd
import json
import os

from multimodal_fin.utils.logging import get_logger

logger = get_logger(__name__)


[docs] @dataclass class TranscriptPreprocessor: """ Preprocesses a conference transcript by identifying the beginning of the Q&A section and labeling each row as either 'prepared_remarks' or 'q_a'. """ section_col: str = "Conf_Section" """Name of the column to write the section labels.""" text_col: str = "text" """Column containing the transcript text.""" qna_key: str = "questions_and_answers" """Key used to extract the Q&A intro from the JSON metadata."""
[docs] def extract_qna_intro(self, json_path: str) -> Optional[str]: """ Extracts the first sentence of the Q&A section from the metadata JSON. Args: json_path: Path to the JSON metadata file. Returns: The first sentence of the Q&A intro, or None if not found or file is invalid. """ if not os.path.exists(json_path): logger.warning(f"JSON path does not exist: {json_path}") return None try: with open(json_path, 'r', encoding='utf-8') as f: content = f.read().strip() if not content: logger.warning(f"Empty JSON content at: {json_path}") return None data = json.loads(content) qna_text = data.get(self.qna_key) if isinstance(qna_text, str): intro = qna_text.split(".")[0].strip() logger.debug(f"Extracted Q&A intro: {intro}") return intro except Exception as e: logger.error(f"Error reading {json_path}: {e}", exc_info=True) return None
[docs] def preprocess(self, csv_path: str, json_path: str) -> pd.DataFrame: """ Labels each row in the transcript CSV as either 'prepared_remarks' or 'q_a' based on the location of the Q&A intro. Args: csv_path: Path to the transcript CSV file. json_path: Path to the metadata JSON file. Returns: A DataFrame with an added column (`section_col`) containing the section labels. """ logger.info(f"Preprocessing transcript from {csv_path} with metadata {json_path}") df = pd.read_csv(csv_path) qna_intro = self.extract_qna_intro(json_path) if qna_intro and self.text_col in df.columns: match = df[df[self.text_col].str.contains(qna_intro, case=False, na=False)] if not match.empty: qna_start_index = match.index[0] logger.info(f"Q&A section starts at index: {qna_start_index}") df[self.section_col] = [ 'prepared_remarks' if i < qna_start_index else 'q_a' for i in df.index ] else: logger.warning("Q&A intro not found in transcript. Defaulting all to 'prepared_remarks'.") df[self.section_col] = 'prepared_remarks' else: logger.warning("No valid Q&A intro found or missing text column. Defaulting all to 'prepared_remarks'.") df[self.section_col] = 'prepared_remarks' logger.info("Preprocessing completed.") return df