Source code for multimodal_fin.processing.metadata.qa_analyzer

from dataclasses import dataclass
from pydantic import BaseModel
from typing import List, Literal, Optional
import pandas as pd
import difflib
import json

from multimodal_fin.processing.basics import LLMClient, UncertaintyMixin
from multimodal_fin.processing.metadata.prompt_builder import PromptBuilder



[docs]
class EvaluatedQA(BaseModel):
    """Represents a single question from an intervention and its evaluation."""
    # NOTE: this model is nested in `InterventionAnalysis.evaluations`, and the
    # JSON schema of that model is handed to the LLM in `QAAnalyzer.analize_qa`.
    # pydantic emits the class docstring above as the schema `description`, so
    # rewording it changes what the LLM is shown -- treat it as runtime text.

    # Text of the question being evaluated.
    question: str
    # Answer status; validation restricts it to exactly these three values.
    answered: Literal['yes', 'partially', 'no']
    # Optional summary of the answer; defaults to None when omitted.
    answer_summary: Optional[str] = None
    # Optional quote supporting the verdict; defaults to None when omitted.
    # Presumably a verbatim excerpt of the response -- TODO confirm against
    # the prompt built by PromptBuilder.
    answer_quote: Optional[str] = None




[docs]
class InterventionAnalysis(BaseModel):
    """Represents the full QA analysis for an intervention."""
    # NOTE: `QAAnalyzer.analize_qa` passes `InterventionAnalysis.model_json_schema()`
    # to the LLM client as the output schema. pydantic emits the docstring above
    # as the schema `description`, so it is effectively part of the request --
    # keep its wording stable unless a prompt change is intended.

    # Text of the intervention (the turn containing one or more questions).
    intervention: str
    # Text of the response the questions are evaluated against.
    response: str
    # One EvaluatedQA entry per question reported by the LLM.
    evaluations: List[EvaluatedQA]




[docs]
@dataclass
class QAAnalyzer(UncertaintyMixin):
    """Analyzes Q&A interactions by evaluating whether questions were answered in responses.

    The analyzer sends an intervention/response pair to an LLM, asks for output
    that follows the ``InterventionAnalysis`` JSON schema, and reads the
    ``answered`` status of the evaluation whose question text is most similar
    to the question of interest.
    """

    model_name: str = "llama3"
    """Name of the LLM to use."""

    NUM_EVALUATIONS: int = 5
    """Number of LLM passes to estimate uncertainty."""

    def __post_init__(self):
        """Initializes the LLM client and prompt builder."""
        self.prompt_builder = PromptBuilder()
        self.llm = LLMClient(self.model_name)

    def analize_qa(self, intervention: str, response: str) -> dict:
        """Runs the LLM to analyze whether each question in the intervention is answered in the response.

        Args:
            intervention (str): The text containing one or more questions.
            response (str): The response from the speaker.

        Returns:
            dict: The parsed LLM response following `InterventionAnalysis` schema.

        Raises:
            json.JSONDecodeError: If the LLM reply is not valid JSON.
        """
        messages = self.prompt_builder.analize_qa(intervention, response)
        # Keep the LLM reply in its own name instead of rebinding the
        # `response` argument, so the input stays readable while debugging.
        raw_reply = self.llm.chat(messages, schema=InterventionAnalysis.model_json_schema())
        return json.loads(raw_reply)

    def get_pred(self, question: str, response: str) -> tuple:
        """Performs multiple evaluations to determine the QA label and uncertainty.

        Args:
            question (str): A single question to evaluate.
            response (str): The response to evaluate against.

        Returns:
            tuple: (predicted_label, confidence, extra_info_dict). When no pass
            yields a label the result is ``(None, 0.0, {})``.
        """
        raw_outputs = []
        labels = []

        for _ in range(self.NUM_EVALUATIONS):
            result = self.analize_qa(question, response)
            label = self._extract_best_match_label(question, result)
            # Passes without a usable label are dropped from both lists so that
            # `labels` and `raw_outputs` stay aligned.
            if label:
                labels.append(label)
                raw_outputs.append(result)

        if not labels:
            return None, 0.0, {}

        # The collected labels are replayed one per call rather than querying
        # the LLM again. NOTE(review): this assumes the mixin invokes the
        # callable at most `len(raw_outputs)` times -- confirm against
        # UncertaintyMixin.get_result_and_uncertainty.
        final_label, confidence = self.get_result_and_uncertainty(
            lambda _: labels.pop(0), question, len(raw_outputs)
        )

        return final_label, confidence, {
            "raw_outputs": raw_outputs
        }

    @staticmethod
    def _best_match(question: str, evaluations: list) -> tuple:
        """Returns the evaluation whose question text is closest to `question`.

        Args:
            question (str): The original question.
            evaluations (list): Non-empty list of evaluation dicts.

        Returns:
            tuple: (best_evaluation, similarity) where similarity is the
            `difflib.SequenceMatcher` ratio of the lower-cased texts. On ties
            the earliest evaluation wins.
        """
        target = question.lower()
        scored = [
            (
                difflib.SequenceMatcher(
                    # `or ""` also covers a present-but-null question value.
                    None, target, (ev.get("question") or "").lower()
                ).ratio(),
                ev,
            )
            for ev in evaluations
        ]
        # Key on the ratio only: comparing the dicts themselves would raise.
        similarity, best = max(scored, key=lambda pair: pair[0])
        return best, similarity

    def _extract_best_match_label(self, question: str, result: dict) -> Optional[str]:
        """Finds the most relevant question in the LLM evaluation and returns its answer status.

        Args:
            question (str): The original question.
            result (dict): The LLM output in dictionary form.

        Returns:
            Optional[str]: One of ['yes', 'partially', 'no'] or None if nothing is matched.
        """
        evaluations = result.get('evaluations', [])
        if not evaluations:
            return None

        if len(evaluations) == 1:
            # `.get` keeps this path consistent with the multi-evaluation path:
            # a missing status yields None instead of a KeyError.
            return evaluations[0].get('answered')

        best, _ = self._best_match(question, evaluations)
        return best.get("answered")

    def evaluate_qa_model(self, data: list) -> pd.DataFrame:
        """Evaluates a QA model on a dataset of interventions with ground truth.

        Args:
            data (list): A list of dicts, each containing a 'response' and list of 'label' dicts with true Q&A labels.

        Returns:
            pd.DataFrame: DataFrame with predicted and true labels per question.
            Questions for which no prediction could be obtained are omitted.
        """
        results = []

        for example in data:
            response = example['response']
            for q in example['label']:
                question = q['question']
                true_label = q['answered']
                pred_label = self.get_pred_question(question, response)

                if pred_label is not None:
                    results.append({
                        "question": question,
                        "response": response,
                        "label": true_label,
                        "classification": pred_label
                    })

        return pd.DataFrame(results)

    def get_pred_question(self, question: str, response: str) -> Optional[str]:
        """Returns the answer status for a given question-response pair using a single pass.

        Any exception raised while querying or parsing is reported on stdout
        and turned into a ``None`` result (best-effort behaviour).

        Args:
            question (str): The question to check.
            response (str): The response to evaluate.

        Returns:
            Optional[str]: One of ['yes', 'partially', 'no'] or None if failed.
        """
        try:
            result = self.analize_qa(question, response)
            evaluations = result.get('evaluations', [])

            if not evaluations:
                print("⚠️ No evaluations returned")
                return None

            if len(evaluations) == 1:
                return evaluations[0].get('answered')

            # The helper already returns the similarity, so the matcher is not
            # run a second time just for the log line.
            best, similarity = self._best_match(question, evaluations)
            # `.get` so that a missing question text cannot raise here and
            # discard an otherwise valid label.
            matched_text = best.get("question", "")
            print(f"🔍 Best match similarity: {similarity:.2f} -> '{matched_text}'")

            return best.get("answered")

        except Exception as e:
            print(f"❌ Error processing question: {question[:30]}... -> {e}")
            return None