# Source code for multimodal_fin.processing.metadata.sec10k_analyzer

from dataclasses import dataclass
from typing import Literal, Tuple, List

import pandas as pd
import json
from pydantic import BaseModel

from multimodal_fin.processing.basics import LLMClient, UncertaintyMixin
from multimodal_fin.processing.metadata.prompt_builder import PromptBuilder


class Category10K(BaseModel):
    """Pydantic schema used for validating LLM output when classifying SEC 10-K topics."""

    # NOTE(review): pydantic appears to copy the class docstring into the
    # `description` of `model_json_schema()`, which this module hands to the
    # LLM client -- confirm before rewording the docstring above.

    # The only field: the predicted label, restricted to the four allowed
    # category names.
    category: Literal[
        'Business',
        'Risk Factors',
        'MD&A',
        'Other',
    ]
@dataclass
class SEC10KAnalyzer(UncertaintyMixin):
    """Classify intervention text into SEC 10-K categories using an LLM.

    Every prediction is sampled ``NUM_EVALUATIONS`` times and the samples
    are handed to ``get_result_and_uncertainty`` (inherited from
    ``UncertaintyMixin``) to obtain the final result.
    """

    model: str = "llama3"
    """The name of the LLM model to use."""

    NUM_EVALUATIONS: int = 10
    """Number of times the classification is repeated to estimate uncertainty."""

    def __post_init__(self):
        """Create the LLM client for the configured ``model``."""
        self.llm = LLMClient(self.model)

    def classify_text(self, text: str) -> str:
        """Classify a given text into one of the 10-K categories.

        Args:
            text (str): Text to classify.

        Returns:
            str: One of ['Business', 'Risk Factors', 'MD&A', 'Other'].

        Raises:
            json.JSONDecodeError: If the LLM response is not valid JSON.
            KeyError: If the decoded response has no 'category' entry.
            pydantic.ValidationError: If the returned category is not one of
                the labels allowed by ``Category10K``.
        """
        messages = PromptBuilder.prompt_10k(text)
        response = self.llm.chat(messages, schema=Category10K.model_json_schema())
        category = json.loads(response)['category']
        # Nothing visible here guarantees the client enforces the schema, so
        # validate the label explicitly instead of letting an unexpected
        # value leak into the vote and the resulting DataFrame.
        return Category10K(category=category).category

    def explain_other_category(self, text: str) -> str:
        """Ask the LLM why a text was classified as 'Other'.

        Args:
            text (str): The text classified as 'Other'.

        Returns:
            str: The raw LLM reply to the explanation prompt.
        """
        messages = PromptBuilder.explain_why_other(text)
        return self.llm.chat(messages)

    def get_pred(self, text: str) -> Tuple[str, float, List[str]]:
        """Predict the category of a text using repeated sampling.

        Args:
            text (str): Text to classify.

        Returns:
            Tuple[str, float, List[str]]: Whatever ``get_result_and_uncertainty``
            returns; per the annotation, the most likely category, a
            confidence score, and the list of predictions.
        """
        # Draw all samples up front, then replay them in order through the
        # callable given to the mixin.
        predictions = [self.classify_text(text) for _ in range(self.NUM_EVALUATIONS)]
        # NOTE(review): presumably the mixin invokes the callable exactly
        # NUM_EVALUATIONS times; more calls would raise IndexError on the
        # exhausted list -- confirm against UncertaintyMixin.
        return self.get_result_and_uncertainty(
            lambda _: predictions.pop(0),
            text,
            self.NUM_EVALUATIONS,
        )

    def classify_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Classify every row of a DataFrame by applying ``get_pred`` to its text.

        Args:
            df (pd.DataFrame): DataFrame with a 'text' column.

        Returns:
            pd.DataFrame: A copy of ``df`` with an added 'classification'
            column holding the first element of each ``get_pred`` result.
            The input DataFrame itself is not modified.
        """
        df = df.copy()
        df['classification'] = df['text'].apply(lambda t: self.get_pred(t)[0])
        return df