""" Parsers for CheapestOil API response data. API returns arrays like: [name, 150gal_price, 300gal_price, 500gal_price, service_area, updated, link, flag] Price fields come as HTML strings like "$3.69
(Total $553.50*)" """ import re import logging # Common abbreviations that should stay uppercase after title-casing _KEEP_UPPER = {"LLC", "INC", "LP", "HVAC", "II", "III", "IV", "USA"} def _smart_title(name: str) -> str: """Convert a company name to title case, preserving common abbreviations.""" words = name.title().split() return " ".join(w.upper() if w.upper() in _KEEP_UPPER else w for w in words) def parse_price_150(price_html: str) -> float | None: """ Extract the per-gallon price from a CheapestOil price field. Examples: "$3.69
(Total $553.50*)" -> 3.69 "$4.199" -> 4.199 "" -> None Args: price_html: Raw price string from the API Returns: Float price or None if unparseable. """ if not price_html or not isinstance(price_html, str): return None # The per-gallon price is the first dollar amount before any
tag match = re.search(r'\$(\d+\.\d+)', price_html) if match: try: return float(match.group(1)) except ValueError: pass logging.warning(f"Could not parse price from: {price_html!r}") return None def parse_company_record(row: list, county_name: str | None) -> dict | None: """ Convert an API row array to a structured dict. Expected row format: [0] name [1] 150gal price (HTML) [2] 300gal price (HTML) [3] 500gal price (HTML) [4] service area text [5] last updated date string [6] company link/slug [7] flag/badge Args: row: Raw array from the API county_name: County name this row came from (None for state-level) Returns: Dict with {name, price, service_area, county_name, date} or None. """ if not isinstance(row, list) or len(row) < 6: logging.warning(f"Skipping malformed row: {row!r}") return None name = str(row[0]).strip() if row[0] else "" if not name: return None # Apply title case normalization name = _smart_title(name) price = parse_price_150(str(row[1]) if row[1] else "") service_area = str(row[4]).strip() if row[4] else "" date_str = str(row[5]).strip() if row[5] else "" # DB column is VARCHAR(20), truncate to fit if len(date_str) > 20: date_str = date_str[:20] # Extract company URL from row[6] (link/slug) # Only accept if it looks like a real external URL, not a slug url = None slug = None if len(row) > 6 and row[6]: raw_link = str(row[6]).strip() if raw_link: if raw_link.startswith("http"): url = raw_link else: # It's a slug for the cheapestoil detail page slug = raw_link return { "slug": slug, # Return slug so scraper can use it to fetch details "name": name, "price": price, "service_area": service_area, "county_name": county_name, "date": date_str, "url": url, "slug": slug, }