- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper) - Add cheapestoil/ package as a secondary market price scraper - Add app.py entry point for direct execution - Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data - Update models.py with latest schema fields - Update requirements.txt dependencies - Update Dockerfile and docker-compose.yml for new structure - Remove deprecated fuel_scraper module, test.py, and log file Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
112 lines
3.2 KiB
Python
112 lines
3.2 KiB
Python
"""
|
|
Parsers for CheapestOil API response data.
|
|
|
|
API returns arrays like:
|
|
[name, 150gal_price, 300gal_price, 500gal_price, service_area, updated, link, flag]
|
|
|
|
Price fields come as HTML strings like "$3.69<br />(Total $553.50*)"
|
|
"""
|
|
import re
|
|
import logging
|
|
|
|
# Common abbreviations that should stay uppercase after title-casing
|
|
_KEEP_UPPER = {"LLC", "INC", "LP", "HVAC", "II", "III", "IV", "USA"}
|
|
|
|
|
|
def _smart_title(name: str) -> str:
|
|
"""Convert a company name to title case, preserving common abbreviations."""
|
|
words = name.title().split()
|
|
return " ".join(w.upper() if w.upper() in _KEEP_UPPER else w for w in words)
|
|
|
|
|
|
def parse_price_150(price_html: str) -> float | None:
|
|
"""
|
|
Extract the per-gallon price from a CheapestOil price field.
|
|
|
|
Examples:
|
|
"$3.69<br />(Total $553.50*)" -> 3.69
|
|
"$4.199" -> 4.199
|
|
"" -> None
|
|
|
|
Args:
|
|
price_html: Raw price string from the API
|
|
|
|
Returns:
|
|
Float price or None if unparseable.
|
|
"""
|
|
if not price_html or not isinstance(price_html, str):
|
|
return None
|
|
# The per-gallon price is the first dollar amount before any <br> tag
|
|
match = re.search(r'\$(\d+\.\d+)', price_html)
|
|
if match:
|
|
try:
|
|
return float(match.group(1))
|
|
except ValueError:
|
|
pass
|
|
logging.warning(f"Could not parse price from: {price_html!r}")
|
|
return None
|
|
|
|
|
|
def parse_company_record(row: list, county_name: str | None) -> dict | None:
    """
    Convert an API row array to a structured dict.

    Expected row format:
        [0] name
        [1] 150gal price (HTML)
        [2] 300gal price (HTML)
        [3] 500gal price (HTML)
        [4] service area text
        [5] last updated date string
        [6] company link/slug
        [7] flag/badge

    Args:
        row: Raw array from the API
        county_name: County name this row came from (None for state-level)

    Returns:
        Dict with {name, price, service_area, county_name, date, url, slug}
        or None if the row is malformed or has no company name.
    """
    if not isinstance(row, list) or len(row) < 6:
        # Lazy %-formatting: row repr only built if the warning is emitted.
        logging.warning("Skipping malformed row: %r", row)
        return None

    name = str(row[0]).strip() if row[0] else ""
    if not name:
        return None

    # Apply title case normalization
    name = _smart_title(name)

    price = parse_price_150(str(row[1]) if row[1] else "")
    service_area = str(row[4]).strip() if row[4] else ""
    date_str = str(row[5]).strip() if row[5] else ""
    # DB column is VARCHAR(20), truncate to fit
    if len(date_str) > 20:
        date_str = date_str[:20]

    # Extract company URL from row[6] (link/slug).
    # Only accept as a URL if it looks like a real external URL;
    # otherwise treat it as a slug for the cheapestoil detail page.
    url = None
    slug = None
    if len(row) > 6 and row[6]:
        raw_link = str(row[6]).strip()
        if raw_link.startswith("http"):
            url = raw_link
        elif raw_link:
            slug = raw_link

    # NOTE: the original dict literal listed "slug" twice; the first
    # occurrence was silently discarded by Python. Kept once here.
    return {
        "name": name,
        "price": price,
        "service_area": service_area,
        "county_name": county_name,
        "date": date_str,
        "url": url,
        "slug": slug,  # returned so the scraper can fetch the detail page
    }
|