refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers

- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter,
  --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 11:34:21 -05:00
parent 8f45f4c209
commit 1592e6d685
26 changed files with 3221 additions and 1468 deletions

111
cheapestoil/parsers.py Normal file
View File

@@ -0,0 +1,111 @@
"""
Parsers for CheapestOil API response data.
API returns arrays like:
[name, 150gal_price, 300gal_price, 500gal_price, service_area, updated, link, flag]
Price fields come as HTML strings like "$3.69<br />(Total $553.50*)"
"""
import re
import logging
# Common abbreviations that should stay uppercase after title-casing
_KEEP_UPPER = {"LLC", "INC", "LP", "HVAC", "II", "III", "IV", "USA"}
def _smart_title(name: str) -> str:
"""Convert a company name to title case, preserving common abbreviations."""
words = name.title().split()
return " ".join(w.upper() if w.upper() in _KEEP_UPPER else w for w in words)
def parse_price_150(price_html: str) -> float | None:
"""
Extract the per-gallon price from a CheapestOil price field.
Examples:
"$3.69<br />(Total $553.50*)" -> 3.69
"$4.199" -> 4.199
"" -> None
Args:
price_html: Raw price string from the API
Returns:
Float price or None if unparseable.
"""
if not price_html or not isinstance(price_html, str):
return None
# The per-gallon price is the first dollar amount before any <br> tag
match = re.search(r'\$(\d+\.\d+)', price_html)
if match:
try:
return float(match.group(1))
except ValueError:
pass
logging.warning(f"Could not parse price from: {price_html!r}")
return None
def parse_company_record(row: list, county_name: str | None) -> dict | None:
    """
    Convert an API row array to a structured dict.

    Expected row format:
        [0] name
        [1] 150gal price (HTML)
        [2] 300gal price (HTML)
        [3] 500gal price (HTML)
        [4] service area text
        [5] last updated date string
        [6] company link/slug
        [7] flag/badge

    Args:
        row: Raw array from the API
        county_name: County name this row came from (None for state-level)

    Returns:
        Dict with {name, price, service_area, county_name, date, url, slug}
        or None if the row is malformed or has no usable name.
    """
    if not isinstance(row, list) or len(row) < 6:
        # Lazy %-args so the repr is only built when the warning is emitted.
        logging.warning("Skipping malformed row: %r", row)
        return None
    name = str(row[0]).strip() if row[0] else ""
    if not name:
        return None
    # Apply title case normalization
    name = _smart_title(name)
    price = parse_price_150(str(row[1]) if row[1] else "")
    service_area = str(row[4]).strip() if row[4] else ""
    date_str = str(row[5]).strip() if row[5] else ""
    # DB column is VARCHAR(20), truncate to fit
    if len(date_str) > 20:
        date_str = date_str[:20]
    # Extract company URL from row[6] (link/slug).
    # Only accept it as a URL if it looks like a real external URL, not a slug.
    url = None
    slug = None
    if len(row) > 6 and row[6]:
        raw_link = str(row[6]).strip()
        if raw_link:
            if raw_link.startswith("http"):
                url = raw_link
            else:
                # It's a slug for the cheapestoil detail page; the scraper
                # can use it later to fetch company details.
                slug = raw_link
    # Fix: the original dict literal listed "slug" twice (the first entry was
    # silently overwritten by the second). Each key now appears exactly once.
    return {
        "name": name,
        "price": price,
        "service_area": service_area,
        "county_name": county_name,
        "date": date_str,
        "url": url,
        "slug": slug,
    }