Source code for multimodal_fin.processing.metadata.qa_analyzer

from dataclasses import dataclass
from pydantic import BaseModel
from typing import List, Literal, Optional
import pandas as pd
import difflib
import json

from multimodal_fin.processing.basics import LLMClient, UncertaintyMixin
from multimodal_fin.processing.metadata.prompt_builder import PromptBuilder


class EvaluatedQA(BaseModel):
    """Represents a single question from an intervention and its evaluation."""

    # NOTE(review): the class docstring above is intentionally left verbatim.
    # This model is nested in InterventionAnalysis, whose model_json_schema()
    # is handed to the LLM by QAAnalyzer.analize_qa; pydantic may emit the
    # docstring as the schema "description" -- confirm before rewording it.

    # The question text this evaluation refers to.
    question: str
    # Answer status; validation restricts it to exactly these three strings.
    answered: Literal['yes', 'partially', 'no']
    # Optional; defaults to None. Presumably a short summary of the answer
    # given to this question -- confirm against the prompt in PromptBuilder.
    answer_summary: Optional[str] = None
    # Optional; defaults to None. Presumably a supporting quote taken from the
    # response -- confirm against the prompt in PromptBuilder.
    answer_quote: Optional[str] = None
class InterventionAnalysis(BaseModel):
    """Represents the full QA analysis for an intervention."""

    # NOTE(review): the class docstring above is intentionally left verbatim.
    # QAAnalyzer.analize_qa passes InterventionAnalysis.model_json_schema() to
    # the LLM client; pydantic may emit the docstring as the schema
    # "description" -- confirm before rewording it.

    # The intervention text (the side that contains the question(s)).
    intervention: str
    # The response text the questions are evaluated against.
    response: str
    # Per-question evaluations; each entry follows the EvaluatedQA model.
    evaluations: List[EvaluatedQA]
@dataclass
class QAAnalyzer(UncertaintyMixin):
    """Analyzes Q&A interactions by evaluating whether questions were answered in responses.

    The analyzer sends an intervention/response pair to an LLM, asks for output
    following the `InterventionAnalysis` schema, and reads the answer status
    ('yes', 'partially' or 'no') of the evaluation that best matches a question.

    `get_result_and_uncertainty` is not defined here; it is expected to be
    provided by `UncertaintyMixin`.
    """

    model_name: str = "llama3"
    """Name of the LLM to use."""

    NUM_EVALUATIONS: int = 5
    """Number of LLM passes to estimate uncertainty."""

    def __post_init__(self):
        """Initializes the LLM client and prompt builder."""
        self.prompt_builder = PromptBuilder()
        self.llm = LLMClient(self.model_name)

    def analize_qa(self, intervention: str, response: str) -> dict:
        """Runs the LLM to analyze whether each question in the intervention is answered.

        Args:
            intervention (str): The text containing one or more questions.
            response (str): The response from the speaker.

        Returns:
            dict: The parsed LLM response following `InterventionAnalysis` schema.

        Raises:
            json.JSONDecodeError: If the LLM reply is not valid JSON.
        """
        messages = self.prompt_builder.analize_qa(intervention, response)
        # Keep the raw reply in its own name instead of rebinding the
        # `response` parameter, so the speaker's text is never confused with
        # the model output.
        raw_reply = self.llm.chat(
            messages, schema=InterventionAnalysis.model_json_schema()
        )
        return json.loads(raw_reply)

    def get_pred(self, question: str, response: str) -> tuple:
        """Performs multiple evaluations to determine the QA label and uncertainty.

        Runs `NUM_EVALUATIONS` LLM passes and keeps only the passes that
        produced a usable label.

        Args:
            question (str): A single question to evaluate.
            response (str): The response to evaluate against.

        Returns:
            tuple: (predicted_label, confidence, extra_info_dict). When no pass
            yields a label the result is ``(None, 0.0, {})``; otherwise
            ``extra_info_dict`` holds the kept LLM outputs under "raw_outputs".
        """
        raw_outputs = []
        labels = []
        for _ in range(self.NUM_EVALUATIONS):
            result = self.analize_qa(question, response)
            label = self._extract_best_match_label(question, result)
            if label:
                # Labels and raw outputs are recorded together so both lists
                # always describe the same set of passes.
                labels.append(label)
                raw_outputs.append(result)

        if not labels:
            return None, 0.0, {}

        # The labels are already computed; they are replayed one per call.
        # NOTE(review): assumes get_result_and_uncertainty invokes the callable
        # at most `n_votes` times -- confirm against UncertaintyMixin.
        n_votes = len(labels)
        final_label, confidence = self.get_result_and_uncertainty(
            lambda _: labels.pop(0),
            question,
            n_votes,
        )
        return final_label, confidence, {"raw_outputs": raw_outputs}

    def _find_best_evaluation(self, question: str, evaluations: list) -> tuple:
        """Picks the evaluation whose question text is closest to `question`.

        Similarity is the case-insensitive `difflib.SequenceMatcher` ratio. On
        ties the earliest evaluation wins.

        Args:
            question (str): The original question.
            evaluations (list): Non-empty list of evaluation dicts.

        Returns:
            tuple: (best_evaluation, similarity_ratio).
        """
        target = question.lower()

        def similarity(evaluation: dict) -> float:
            # A missing or null "question" is compared as an empty string
            # rather than crashing on None.lower().
            candidate = evaluation.get("question") or ""
            return difflib.SequenceMatcher(None, target, candidate.lower()).ratio()

        scored = [(similarity(evaluation), evaluation) for evaluation in evaluations]
        best_score, best_evaluation = max(scored, key=lambda pair: pair[0])
        return best_evaluation, best_score

    def _extract_best_match_label(self, question: str, result: dict) -> Optional[str]:
        """Finds the most relevant question in the LLM evaluation and returns its answer status.

        Args:
            question (str): The original question.
            result (dict): The LLM output in dictionary form.

        Returns:
            Optional[str]: One of ['yes', 'partially', 'no'] or None if nothing is matched.
        """
        # `or []` also covers an explicit JSON null under "evaluations".
        evaluations = result.get("evaluations") or []
        if not evaluations:
            return None
        if len(evaluations) == 1:
            # .get keeps a malformed single entry from raising KeyError and
            # aborting every remaining pass in get_pred.
            return evaluations[0].get("answered")
        best_evaluation, _ = self._find_best_evaluation(question, evaluations)
        return best_evaluation.get("answered")

    def evaluate_qa_model(self, data: list) -> pd.DataFrame:
        """Evaluates a QA model on a dataset of interventions with ground truth.

        Args:
            data (list): A list of dicts, each containing a 'response' and list of
                'label' dicts with true Q&A labels ('question' and 'answered').

        Returns:
            pd.DataFrame: DataFrame with predicted and true labels per question.
            Questions whose prediction failed are omitted. The columns are
            always present, even when no row was produced.

        Raises:
            KeyError: If an example or label dict lacks an expected key.
        """
        columns = ["question", "response", "label", "classification"]
        rows = []
        for example in data:
            response = example["response"]
            for item in example["label"]:
                question = item["question"]
                true_label = item["answered"]
                pred_label = self.get_pred_question(question, response)
                if pred_label is None:
                    continue
                rows.append({
                    "question": question,
                    "response": response,
                    "label": true_label,
                    "classification": pred_label,
                })
        # Explicit columns keep the frame's shape stable when `rows` is empty.
        return pd.DataFrame(rows, columns=columns)

    def get_pred_question(self, question: str, response: str) -> Optional[str]:
        """Returns the answer status for a given question-response pair using a single pass.

        Args:
            question (str): The question to check.
            response (str): The response to evaluate.

        Returns:
            Optional[str]: One of ['yes', 'partially', 'no'] or None if failed.
        """
        # Deliberately best-effort: any failure in one pair (LLM call, JSON
        # parsing, unexpected payload shape) is reported and turned into None
        # so a dataset-wide evaluation can continue.
        try:
            result = self.analize_qa(question, response)
            evaluations = result.get("evaluations") or []
            if not evaluations:
                print("⚠️ No evaluations returned")
                return None
            if len(evaluations) == 1:
                return evaluations[0].get("answered")

            best_evaluation, similarity = self._find_best_evaluation(
                question, evaluations
            )
            # .get: a missing "question" key must not turn a logging line into
            # a lost prediction.
            matched_question = best_evaluation.get("question", "")
            print(f"🔍 Best match similarity: {similarity:.2f} -> '{matched_question}'")
            return best_evaluation.get("answered")
        except Exception as e:
            print(f"❌ Error processing question: {question[:30]}... -> {e}")
            return None