# Source code for multimodal_fin.runners.downloads_runner

"""
Runner for downloading conference data (transcripts and audio) from earningscall.biz.

It uses a subset of S&P500 companies grouped by sector.
"""

import requests
import pandas as pd
from bs4 import BeautifulSoup
import earningscall

from multimodal_fin.config import DataAdquisitionSettings
from multimodal_fin.data_adquisition.Company import CompanyDataAcquisition
from multimodal_fin.runners.base import Runner
from multimodal_fin.utils.logging import get_logger

logger = get_logger(__name__)


class DataAdquisitionRunner(Runner):
    """Runner responsible for fetching earnings call transcripts and audio.

    The runner fetches the page at ``settings.url``, parses the first HTML
    table found on it into a DataFrame, keeps the first 8 rows of each
    ``Sector`` group, and asks ``CompanyDataAcquisition`` to download the
    transcripts and audio for every ``Symbol`` in that subset.
    """

    # Seconds to wait for the listing page before giving up. Without an
    # explicit timeout ``requests`` waits indefinitely, so a stalled server
    # would hang the whole run.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, settings: DataAdquisitionSettings):
        """Initialize the data acquisition runner.

        Args:
            settings (DataAdquisitionSettings): Configuration with API key,
                base path, and URL.
        """
        self.settings = settings

    def run(self, **kwargs) -> None:
        """Download data for S&P500 companies from earningscall.biz.

        This includes scraping the main page, parsing the company table, and
        triggering download for transcripts and audio files. A failure while
        downloading one company is logged and the loop moves on to the next
        company.

        Args:
            **kwargs: Accepted for interface compatibility; not used here.

        Raises:
            requests.RequestException: If the listing page cannot be fetched
                (including a timeout or a non-success HTTP status).
            ValueError: If the fetched page contains no HTML table.
        """
        logger.info("Starting data acquisition from earningscall.biz")
        earningscall.api_key = self.settings.api_key

        try:
            response = requests.get(
                self.settings.url,
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            logger.info(f"Successfully fetched data from URL: {self.settings.url}")
        except requests.RequestException as e:
            logger.error(f"Failed to fetch data from {self.settings.url}: {e}", exc_info=True)
            raise

        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')
        if not table:
            logger.error("No table found on the earningscall page.")
            raise ValueError("HTML table not found")

        headers = [header.text.strip() for header in table.find_all('th')]

        # The first <tr> is treated as the header row. Rows without any <td>
        # cells (extra header or spacer rows) carry no company data and are
        # skipped so they do not become empty entries in the DataFrame.
        rows = []
        for row in table.find_all('tr')[1:]:
            cells = [col.text.strip() for col in row.find_all('td')]
            if cells:
                rows.append(cells)

        df = pd.DataFrame(rows, columns=headers)
        logger.info(f"Parsed table with {len(df)} entries.")

        # Select top 8 companies per sector for demonstration purposes
        # (the first 8 rows of each sector, in table order).
        sp500_subset = df.groupby("Sector").head(8).reset_index(drop=True)
        logger.info(f"Selected {len(sp500_subset)} companies (top 8 per sector)")

        for code in sp500_subset['Symbol']:
            logger.info(f"Fetching data for company: {code}")
            try:
                company = CompanyDataAcquisition(code)
                company.get_and_save_all_transcripts_and_audio(self.settings.base_path)
                logger.info(f"Successfully downloaded data for {code}")
            except Exception as e:
                # Best-effort batch: one failing company must not abort the
                # remaining downloads, so log with traceback and continue.
                logger.error(f"Failed to download data for {code}: {e}", exc_info=True)