refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers

- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-06 11:34:21 -05:00
parent 8f45f4c209
commit 1592e6d685
26 changed files with 3221 additions and 1468 deletions
--- a/cheapestoil/__init__.py
+++ b/cheapestoil/__init__.py
@@ -0,0 +1,4 @@
+# cheapestoil package: re-exports scrape_state from the scraper module as the
+# package's only public name.
+from .scraper import scrape_state
+
+__all__ = ["scrape_state"]
--- a/cheapestoil/api_client.py
+++ b/cheapestoil/api_client.py
@@ -0,0 +1,136 @@
+"""
+HTTP client for the CheapestOil JSON API.
+"""
+import logging
+import re
+
+import requests
+from bs4 import BeautifulSoup
+
+from .config import API_URL
+
+# Headers sent with every request made by this module; the User-Agent
+# identifies the client as a desktop Chrome browser.
+DEFAULT_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/91.0.4472.124 Safari/537.36"
+    )
+}
+
+# Passed as the `timeout` argument of every requests.get call below (seconds).
+REQUEST_TIMEOUT = 20
+
+
+def fetch_company_details(slug: str) -> dict:
+    """
+    Scrape a company's CheapestOil profile page for its website and phone.
+
+    Args:
+        slug: Profile path on cheapestoil.com (e.g. "Abc-Oil-Company"), or a
+            full URL starting with "http", which is requested as-is.
+
+    Returns:
+        Dict with keys "url" (str|None) and "phone" (str|None). Both values
+        are None when slug is empty or when any exception is raised while
+        fetching or parsing the page (the failure is logged as a warning).
+    """
+    if not slug:
+        return {"url": None, "phone": None}
+
+    # A slug may already be an absolute URL; otherwise it is a site-relative path.
+    page_url = slug if slug.startswith("http") else f"https://www.cheapestoil.com/{slug}"
+
+    try:
+        resp = requests.get(page_url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.content, 'html.parser')
+
+        # --- Website -------------------------------------------------------
+        # Preferred source: an anchor whose text says "Visit Website" or
+        # "Company Website" and that points away from cheapestoil.com.
+        website = None
+        anchor = soup.find('a', string=re.compile(r"Visit Website|Company Website", re.IGNORECASE))
+        if anchor and anchor.get('href'):
+            target = anchor.get('href')
+            if 'cheapestoil.com' not in target and target.startswith('http'):
+                website = target
+
+        # Heuristic fallback: first external link inside the "col-md-8" div.
+        if not website:
+            main_column = soup.find('div', class_='col-md-8')
+            if main_column:
+                website = next(
+                    (
+                        a['href']
+                        for a in main_column.find_all('a', href=True)
+                        if a['href'].startswith('http') and 'cheapestoil.com' not in a['href']
+                    ),
+                    None,
+                )
+
+        # --- Phone ---------------------------------------------------------
+        # Prefer a number that follows a "Phone"/"Tel"/"Call" label; otherwise
+        # take the first thing on the page shaped like a 10-digit number.
+        text = soup.get_text(" ", strip=True)
+        labelled = re.search(r'(?:Phone|Tel|Call).*?(\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', text, re.IGNORECASE)
+        if labelled:
+            raw_phone = labelled.group(1)
+        else:
+            unlabelled = re.search(r'(?:\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', text)
+            raw_phone = unlabelled.group(0) if unlabelled else None
+
+        phone = None
+        if raw_phone:
+            digits = re.sub(r'\D', '', raw_phone)
+            # Ten digits are rendered as "(NNN) NNN-NNNN"; anything else is kept as found.
+            phone = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}" if len(digits) == 10 else raw_phone
+
+        return {"url": website, "phone": phone}
+
+    except Exception as e:
+        logging.warning(f"Failed to fetch details for {slug}: {e}")
+        return {"url": None, "phone": None}
+
+
+
+def fetch_county_prices(state_api_name: str, county_name: str | None = None) -> list:
+    """
+    Query the CheapestOil price API for one state, optionally narrowed to a county.
+
+    Args:
+        state_api_name: State name as used by the API (e.g. "Massachusetts", "NewHampshire")
+        county_name: County name filter; None is sent as an empty county value
+
+    Returns:
+        The decoded JSON payload when it is a list. An empty list is returned
+        (and the problem logged) on a request error, an undecodable body, or
+        a payload that is not a list.
+    """
+    query = dict(sort=0, state=state_api_name, county=county_name or "", zip="")
+
+    try:
+        resp = requests.get(
+            API_URL, params=query, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT
+        )
+        resp.raise_for_status()
+        payload = resp.json()
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Error fetching CheapestOil API for {state_api_name}/{county_name}: {e}")
+        return []
+    except ValueError as e:
+        logging.error(f"Invalid JSON from CheapestOil API: {e}")
+        return []
+
+    if not isinstance(payload, list):
+        logging.warning(f"Unexpected response type from API: {type(payload)}")
+        return []
+    return payload
--- a/cheapestoil/company_matcher.py
+++ b/cheapestoil/company_matcher.py
@@ -0,0 +1,90 @@
+"""
+Company name normalization and matching for cross-source deduplication.
+
+Handles slight naming variations between NewEnglandOil and CheapestOil:
+    "Fireman's Fuel Co." == "Firemans Fuel" after normalization.
+"""
+import re
+import logging
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from sqlalchemy.orm import Session
+import models
+
+# Trailing business-entity words removed during normalization. Each entry is a
+# single word and is only stripped when it follows a space, so names that
+# merely end in the same letters (e.g. "Texaco") are left untouched.
+_STRIP_SUFFIXES = (
+    "enterprises", "company", "corp", "inc", "llc", "co",
+)
+
+
+def normalize_company_name(name: str) -> str:
+    """
+    Normalize a company name for fuzzy matching.
+
+    Steps:
+        1. Strip whitespace, lowercase
+        2. Replace '&' with ' and '
+        3. Remove punctuation (apostrophes, periods, commas, dollar signs)
+        4. Collapse multiple spaces
+        5. Repeatedly remove trailing suffix words listed in _STRIP_SUFFIXES
+
+    Examples:
+        "Fireman's Fuel Co." -> "firemans fuel"
+        "Firemans Fuel"      -> "firemans fuel"
+        "Acme Oil Co, LLC"   -> "acme oil"
+        "Texaco"             -> "texaco"
+
+    Args:
+        name: Raw company name; None or an empty string yields "".
+
+    Returns:
+        Normalized string for comparison.
+    """
+    if not name:
+        return ""
+
+    s = name.strip().lower()
+    # Pad the replacement so "A&B" and "A & B" normalize identically; the
+    # extra spaces are collapsed below.
+    s = s.replace("&", " and ")
+    s = re.sub(r"['.,$]", "", s)
+    s = re.sub(r"\s+", " ", s).strip()
+
+    # Strip whole trailing words only, and keep going so stacked suffixes
+    # ("... Co LLC") reduce to the same key as the bare name.
+    stripped = True
+    while stripped:
+        stripped = False
+        for suffix in _STRIP_SUFFIXES:
+            if s.endswith(" " + suffix):
+                s = s[: -len(suffix)].rstrip()
+                stripped = True
+                break
+    return s
+
+
+def find_existing_record(
+    db_session: Session,
+    raw_name: str,
+    state_abbr: str,
+    county_id: int | None,
+) -> "models.OilPrice | None":
+    """
+    Look up an oil_prices row whose normalized company name equals raw_name's.
+
+    The candidate set is every OilPrice row for the state, restricted to the
+    given county_id, or to zone == 0 when county_id is None. Names are
+    normalized and compared in Python, and the first match is returned.
+
+    Args:
+        db_session: SQLAlchemy session
+        raw_name: Raw company name from CheapestOil
+        state_abbr: Two-letter state abbreviation
+        county_id: County ID or None
+
+    Returns:
+        Matching OilPrice record, or None when raw_name normalizes to an
+        empty string or no candidate matches.
+    """
+    wanted = normalize_company_name(raw_name)
+    if not wanted:
+        return None
+
+    scope = (
+        models.OilPrice.county_id == county_id
+        if county_id is not None
+        else models.OilPrice.zone == 0
+    )
+    candidates = (
+        db_session.query(models.OilPrice)
+        .filter(models.OilPrice.state == state_abbr)
+        .filter(scope)
+        .all()
+    )
+
+    return next(
+        (row for row in candidates if normalize_company_name(row.name) == wanted),
+        None,
+    )
--- a/cheapestoil/config.py
+++ b/cheapestoil/config.py
@@ -0,0 +1,50 @@
+"""
+Configuration for the CheapestOil scraper.
+"""
+
+# Endpoint queried by the API client with sort/state/county/zip parameters.
+API_URL = "https://www.cheapestoil.com/heating-oil-prices/api"
+
+# Seconds between requests to be polite (slept between county fetches).
+SCRAPE_DELAY = 2
+
+# State abbreviation -> list of county names on cheapestoil.com.
+# NOTE(review): an earlier note here said "None means state-level only", but
+# every state currently maps to a non-empty list of names. Presumably a None
+# entry inside a list would request state-level results (the API client sends
+# an empty county for None) — confirm before relying on it.
+# NOTE(review): Dukes and Nantucket are not listed for MA — confirm whether
+# that omission is intentional.
+STATE_COUNTIES = {
+    "MA": [
+        "Barnstable", "Berkshire", "Bristol", "Essex", "Franklin",
+        "Hampden", "Hampshire", "Middlesex", "Norfolk", "Plymouth",
+        "Suffolk", "Worcester",
+    ],
+    "CT": [
+        "Fairfield", "Hartford", "Litchfield", "Middlesex",
+        "New Haven", "New London", "Tolland", "Windham",
+    ],
+    "ME": [
+        "Cumberland", "York", "Penobscot", "Kennebec", "Androscoggin",
+        "Aroostook", "Oxford", "Hancock", "Somerset", "Knox",
+        "Waldo", "Sagadahoc", "Lincoln", "Washington", "Franklin",
+        "Piscataquis",
+    ],
+    "NH": [
+        "Belknap", "Carroll", "Cheshire", "Coos", "Grafton",
+        "Hillsborough", "Merrimack", "Rockingham", "Strafford", "Sullivan",
+    ],
+    "RI": [
+        "Bristol", "Kent", "Newport", "Providence", "Washington",
+    ],
+    "VT": [
+        "Addison", "Bennington", "Caledonia", "Chittenden", "Essex",
+        "Franklin", "Grand Isle", "Lamoille", "Orange", "Orleans",
+        "Rutland", "Washington", "Windham", "Windsor",
+    ],
+}
+
+# State abbreviation -> API state name (as used in cheapestoil.com params).
+# Multi-word names are written without a space.
+STATE_API_NAMES = {
+    "MA": "Massachusetts",
+    "CT": "Connecticut",
+    "ME": "Maine",
+    "NH": "NewHampshire",
+    "RI": "RhodeIsland",
+    "VT": "Vermont",
+}
--- a/cheapestoil/parsers.py
+++ b/cheapestoil/parsers.py
@@ -0,0 +1,111 @@
+"""
+Parsers for CheapestOil API response data.
+
+API returns arrays like:
+  [name, 150gal_price, 300gal_price, 500gal_price, service_area, updated, link, flag]
+
+Price fields come as HTML strings like "$3.69<br />(Total $553.50*)"
+"""
+import re
+import logging
+
+# Common abbreviations that should stay uppercase after title-casing
+_KEEP_UPPER = {"LLC", "INC", "LP", "HVAC", "II", "III", "IV", "USA"}
+
+
+def _smart_title(name: str) -> str:
+    """
+    Convert a company name to title case, preserving common abbreviations.
+
+    Whitespace runs are collapsed to single spaces. Words found in
+    _KEEP_UPPER are uppercased even when followed by a period or comma
+    ("Inc." -> "INC."), and a possessive "'s" stays lowercase
+    ("FIREMAN'S" -> "Fireman's").
+
+    Args:
+        name: Company name in any casing.
+
+    Returns:
+        The title-cased name.
+    """
+    # str.title() treats an apostrophe as a word boundary and yields
+    # "Fireman'S". Undo that for the possessive only, so names such as
+    # "O'Brien" keep their capital.
+    titled = re.sub(r"(?<=[a-z])(['’])S\b", r"\1s", name.title())
+    return " ".join(
+        word.upper() if word.strip(".,").upper() in _KEEP_UPPER else word
+        for word in titled.split()
+    )
+
+
+def parse_price_150(price_html: str) -> float | None:
+    """
+    Pull the per-gallon price out of a CheapestOil price field.
+
+    The first "$<digits>.<digits>" amount in the string is taken, which is
+    the per-gallon figure that precedes any "(Total ...)" text.
+
+    Examples:
+        "$3.69<br />(Total $553.50*)" -> 3.69
+        "$4.199" -> 4.199
+        "" -> None
+
+    Args:
+        price_html: Raw price string from the API
+
+    Returns:
+        Float price, or None when the input is empty, not a string, or holds
+        no dollar amount (the last case is logged as a warning).
+    """
+    if not (price_html and isinstance(price_html, str)):
+        return None
+
+    price = None
+    found = re.search(r'\$(\d+\.\d+)', price_html)
+    if found:
+        try:
+            price = float(found.group(1))
+        except ValueError:
+            price = None
+
+    if price is None:
+        logging.warning(f"Could not parse price from: {price_html!r}")
+    return price
+
+
+def parse_company_record(row: list, county_name: str | None) -> dict | None:
+    """
+    Convert an API row array to a structured dict.
+
+    Expected row format:
+        [0] name
+        [1] 150gal price (HTML)
+        [2] 300gal price (HTML)
+        [3] 500gal price (HTML)
+        [4] service area text
+        [5] last updated date string
+        [6] company link/slug (optional)
+        [7] flag/badge
+
+    Args:
+        row: Raw array from the API
+        county_name: County name this row came from (None for state-level)
+
+    Returns:
+        Dict with keys {slug, name, price, service_area, county_name, date,
+        url}, or None when the row is not a list, has fewer than 6 elements,
+        or has an empty name. "price" is None when the 150gal field cannot be
+        parsed. At most one of "url" / "slug" is set: "url" when row[6]
+        starts with "http", otherwise "slug" (a cheapestoil detail-page path).
+    """
+    if not isinstance(row, list) or len(row) < 6:
+        logging.warning(f"Skipping malformed row: {row!r}")
+        return None
+
+    name = str(row[0]).strip() if row[0] else ""
+    if not name:
+        return None
+
+    # Apply title case normalization
+    name = _smart_title(name)
+
+    price = parse_price_150(str(row[1]) if row[1] else "")
+    service_area = str(row[4]).strip() if row[4] else ""
+    # DB column is VARCHAR(20), truncate to fit
+    date_str = (str(row[5]).strip() if row[5] else "")[:20]
+
+    # row[6] is either a real external URL or a slug for the cheapestoil
+    # detail page; the scraper uses the slug to fetch details later.
+    url = None
+    slug = None
+    raw_link = str(row[6]).strip() if len(row) > 6 and row[6] else ""
+    if raw_link.startswith("http"):
+        url = raw_link
+    elif raw_link:
+        slug = raw_link
+
+    return {
+        "slug": slug,
+        "name": name,
+        "price": price,
+        "service_area": service_area,
+        "county_name": county_name,
+        "date": date_str,
+        "url": url,
+    }
--- a/cheapestoil/scraper.py
+++ b/cheapestoil/scraper.py
@@ -0,0 +1,217 @@
+"""
+Main orchestrator for the CheapestOil scraper.
+"""
+import logging
+import time
+from datetime import datetime
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from sqlalchemy.orm import Session
+import models
+
+from .config import STATE_COUNTIES, STATE_API_NAMES, SCRAPE_DELAY
+from .api_client import fetch_company_details, fetch_county_prices
+from .parsers import parse_company_record
+from .company_matcher import find_existing_record
+from .town_lookup import resolve_county_from_service_area
+
+
+def _resolve_county_id(
+    county_name: str | None,
+    service_area: str,
+    state_abbr: str,
+    county_lookup: dict,
+) -> int | None:
+    """
+    Map a row to a county_id using its county name or, failing that, its service area.
+
+    When county_name is given it is looked up directly under
+    (state_abbr, county_name) and that result is returned even if it is None.
+    Otherwise the service area text is passed to
+    resolve_county_from_service_area and the county it names is looked up.
+    A warning is logged whenever a county name is missing from county_lookup.
+    """
+    # An explicit county name is authoritative: no fallback on a miss.
+    if county_name:
+        county_id = county_lookup.get((state_abbr, county_name))
+        if county_id is None:
+            logging.warning(f"County not in DB: ({state_abbr}, {county_name})")
+        return county_id
+
+    if not service_area:
+        return None
+
+    resolved = resolve_county_from_service_area(service_area, state_abbr)
+    if not resolved:
+        return None
+
+    county_id = county_lookup.get((state_abbr, resolved))
+    if county_id is None:
+        logging.warning(f"Resolved county '{resolved}' not in DB for {state_abbr}")
+        return None
+    return county_id
+
+
+def _get_company_details(slug: str, details_cache: dict) -> dict:
+    """Return {url, phone} for slug, fetching the detail page only on a cache miss."""
+    if slug not in details_cache:
+        details_cache[slug] = fetch_company_details(slug)
+        time.sleep(1.0)  # Polite delay between detail pages
+    return details_cache[slug]
+
+
+def scrape_state(state_abbr: str, db_session: Session, county_lookup: dict, refresh_metadata: bool = False) -> dict:
+    """
+    Scrape all CheapestOil data for a single state.
+
+    For each county configured for the state, the price rows are fetched and
+    parsed, then matched against existing oil_prices records by normalized
+    company name. Matches are updated in place (vendor-managed records, i.e.
+    those with a company_id, are left untouched); everything else is inserted
+    with zone=0. All changes are committed once, after the last county.
+
+    Args:
+        state_abbr: Two-letter state code (MA, CT, ME, NH, RI, VT)
+        db_session: SQLAlchemy session
+        county_lookup: Dict of (state_abbr, county_name) -> county_id
+        refresh_metadata: If True, force re-fetch details (phone/url) and overwrite DB.
+
+    Returns:
+        Summary dict with {state, counties_scraped, records_added, records_updated, records_skipped}
+
+    Raises:
+        ValueError: If state_abbr is not a key of STATE_API_NAMES.
+    """
+    state_abbr = state_abbr.upper()
+    if state_abbr not in STATE_API_NAMES:
+        raise ValueError(f"Unknown state: {state_abbr}. Must be one of {list(STATE_API_NAMES.keys())}")
+
+    api_name = STATE_API_NAMES[state_abbr]
+    counties = STATE_COUNTIES[state_abbr]
+
+    summary = {
+        "state": state_abbr,
+        "counties_scraped": 0,
+        "records_added": 0,
+        "records_updated": 0,
+        "records_skipped": 0,
+    }
+
+    details_cache = {}  # cache for detail pages: slug -> {url, phone}
+
+    for i, county_name in enumerate(counties):
+        if i > 0:
+            time.sleep(SCRAPE_DELAY)
+
+        label = county_name or "(state-level)"
+        logging.info(f"[CheapestOil] Fetching: {state_abbr} / {label}")
+
+        rows = fetch_county_prices(api_name, county_name)
+        if not rows:
+            logging.info(f"No results for {state_abbr} / {label}")
+            continue
+
+        logging.info(f"[CheapestOil] Processing {len(rows)} records from {state_abbr} / {label} (Size: {len(rows)})")
+
+        summary["counties_scraped"] += 1
+
+        for row in rows:
+            record = parse_company_record(row, county_name)
+            if not record or record["price"] is None:
+                summary["records_skipped"] += 1
+                continue
+
+            # Resolve county_id
+            county_id = _resolve_county_id(
+                record["county_name"],
+                record["service_area"],
+                state_abbr,
+                county_lookup,
+            )
+
+            # Check for existing record (cross-source dedup)
+            existing = find_existing_record(
+                db_session, record["name"], state_abbr, county_id
+            )
+
+            # Vendor-managed records are never modified, so skip them before
+            # spending an HTTP request (and its delay) on a detail page.
+            if existing and existing.company_id is not None:
+                logging.debug(f"Skipping vendor-managed: {record['name']}")
+                summary["records_skipped"] += 1
+                continue
+
+            slug = record.get("slug")
+            real_url = record.get("url")
+            phone = None
+
+            # Fetch the detail page when there is a slug and either a refresh
+            # was requested, the record is new, or the existing record lacks
+            # a url or phone.
+            should_fetch_details = bool(slug) and (
+                refresh_metadata
+                or not existing
+                or not existing.url
+                or not existing.phone
+            )
+            if should_fetch_details:
+                details = _get_company_details(slug, details_cache)
+                real_url = details["url"]
+                phone = details["phone"]
+
+            # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
+            # whether an aware datetime can be stored depends on the column
+            # type in models — confirm before switching.
+            now = datetime.utcnow()
+
+            if existing:
+                updated = False
+
+                # Backfill or Force Update url
+                if real_url:
+                    if not existing.url or (refresh_metadata and existing.url != real_url):
+                        existing.url = real_url
+                        updated = True
+                        logging.info(f"Updated/Backfilled URL for {record['name']}")
+
+                # Backfill or Force Update phone
+                if phone:
+                    if not existing.phone or (refresh_metadata and existing.phone != phone):
+                        existing.phone = phone
+                        updated = True
+                        logging.info(f"Updated/Backfilled Phone for {record['name']}")
+
+                # Backfill county_id if we have it now
+                if county_id is not None and existing.county_id != county_id:
+                    existing.county_id = county_id
+                    updated = True
+                    logging.info(f"Updated county_id for {record['name']}")
+
+                # The timestamp is touched on every sighting, changed or not.
+                existing.scrapetimestamp = now
+
+                if existing.price != record["price"]:
+                    # Remember the old price before overwriting it so the log
+                    # line shows the actual transition.
+                    old_price = existing.price
+                    existing.price = record["price"]
+                    existing.date = record["date"]
+                    summary["records_updated"] += 1
+                    old_label = f"${old_price:.2f}" if old_price is not None else "(none)"
+                    logging.info(f"Updated price: {record['name']} {old_label} → ${record['price']:.2f}")
+                elif updated:
+                    summary["records_updated"] += 1
+                else:
+                    summary["records_skipped"] += 1
+                    logging.debug(f"No changes for {record['name']} (${record['price']:.2f})")
+            else:
+                # Insert new record (zone=0 for cheapestoil)
+                oil_price = models.OilPrice(
+                    state=state_abbr,
+                    zone=0,
+                    name=record["name"],
+                    price=record["price"],
+                    date=record["date"],
+                    county_id=county_id,
+                    url=real_url,
+                    phone=phone,
+                    scrapetimestamp=now,
+                )
+                db_session.add(oil_price)
+                summary["records_added"] += 1
+                logging.info(f"Added: {record['name']} in {state_abbr} (county_id={county_id}, phone={phone})")
+
+    db_session.commit()
+    logging.info(
+        f"[CheapestOil] State {state_abbr} complete: "
+        f"{summary['records_added']} added, {summary['records_updated']} updated, "
+        f"{summary['records_skipped']} skipped (no changes)"
+    )
+    return summary
--- a/cheapestoil/town_lookup.py
+++ b/cheapestoil/town_lookup.py