from dataclasses import dataclass
from typing import Optional
import pandas as pd
import json
import os
from multimodal_fin.utils.logging import get_logger
logger = get_logger(__name__)
[docs]
@dataclass
class TranscriptPreprocessor:
"""
Preprocesses a conference transcript by identifying the beginning of the Q&A section
and labeling each row as either 'prepared_remarks' or 'q_a'.
"""
section_col: str = "Conf_Section"
"""Name of the column to write the section labels."""
text_col: str = "text"
"""Column containing the transcript text."""
qna_key: str = "questions_and_answers"
"""Key used to extract the Q&A intro from the JSON metadata."""
[docs]
def preprocess(self, csv_path: str, json_path: str) -> pd.DataFrame:
    """
    Label each row of a transcript CSV as 'prepared_remarks' or 'q_a'.

    The CSV is loaded with ``pd.read_csv`` and the Q&A intro text is obtained
    from ``self.extract_qna_intro(json_path)``. The first row whose
    ``text_col`` contains that intro (case-insensitive, literal substring
    match) marks the start of the Q&A section: rows whose index label is
    lower than that row's label are tagged 'prepared_remarks', the rest 'q_a'.

    If no intro is returned, the text column is missing, or no row contains
    the intro, a warning is logged and every row is tagged 'prepared_remarks'.

    Args:
        csv_path: Path to the transcript CSV file.
        json_path: Path to the metadata JSON file.

    Returns:
        The loaded DataFrame with an added column (`section_col`) containing
        the section labels.
    """
    logger.info(f"Preprocessing transcript from {csv_path} with metadata {json_path}")
    df = pd.read_csv(csv_path)
    qna_intro = self.extract_qna_intro(json_path)

    if qna_intro and self.text_col in df.columns:
        # regex=False: the intro is plain transcript text, not a pattern.
        # With the default regex=True, characters such as '?', '.', '(' or '['
        # would be interpreted as regex syntax, silently preventing a literal
        # match or raising re.error on unbalanced brackets.
        is_intro_row = df[self.text_col].str.contains(
            qna_intro, case=False, na=False, regex=False
        )
        match = df[is_intro_row]
        if not match.empty:
            # The earliest matching row is taken as the start of the Q&A.
            qna_start_index = match.index[0]
            logger.info(f"Q&A section starts at index: {qna_start_index}")
            df[self.section_col] = [
                'prepared_remarks' if i < qna_start_index else 'q_a' for i in df.index
            ]
        else:
            logger.warning("Q&A intro not found in transcript. Defaulting all to 'prepared_remarks'.")
            df[self.section_col] = 'prepared_remarks'
    else:
        logger.warning("No valid Q&A intro found or missing text column. Defaulting all to 'prepared_remarks'.")
        df[self.section_col] = 'prepared_remarks'

    logger.info("Preprocessing completed.")
    return df