refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers

- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter,
  --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 11:34:21 -05:00
parent 8f45f4c209
commit 1592e6d685
26 changed files with 3221 additions and 1468 deletions

136
cheapestoil/api_client.py Normal file
View File

@@ -0,0 +1,136 @@
"""
HTTP client for the CheapestOil JSON API.
"""
import logging
import re

import requests
from bs4 import BeautifulSoup

from .config import API_URL
# Browser-like headers: some sites serve different (or no) content to
# clients that don't present a mainstream browser User-Agent.
DEFAULT_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/91.0.4472.124 Safari/537.36"
)
}
# Seconds to wait before giving up on any single HTTP request.
REQUEST_TIMEOUT = 20
def fetch_company_details(slug: str) -> dict:
"""
Fetch company details (real URL, phone) from their CheapestOil profile page.
Args:
slug: The company slug/path (e.g. "Abc-Oil-Company")
Returns:
Dict with keys: "url" (str|None), "phone" (str|None)
"""
if not slug:
return {"url": None, "phone": None}
# Construct detail URL
# If slug is full URL, use it, else append to base
if slug.startswith("http"):
url = slug
else:
url = f"https://www.cheapestoil.com/{slug}"
try:
resp = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, 'html.parser')
real_url = None
phone = None
# 1. Extract Real URL
# Look for "Visit Website" link or similar anchor texts
# Usually contained in a link with text "Visit Website" or the company name
# We look for a link that is NOT internal (doesn't contain cheapestoil.com)
# and behaves like an external link.
# Common pattern: <a href="..." target="_blank">Visit Website</a>
visit_link = soup.find('a', string=re.compile(r"Visit Website|Company Website", re.IGNORECASE))
if visit_link and visit_link.get('href'):
href = visit_link.get('href')
if 'cheapestoil.com' not in href and href.startswith('http'):
real_url = href
# Fallback: look for any external link in the contact section if structured
if not real_url:
# Try to find the first external link in the main content area
# (This is heuristics-based, might need adjustment)
content_div = soup.find('div', class_='col-md-8') # Common bootstrap main col
if content_div:
links = content_div.find_all('a', href=True)
for a in links:
href = a['href']
if href.startswith('http') and 'cheapestoil.com' not in href:
real_url = href
break
# 2. Extract Phone
# Reuse robust regex pattern logic
page_text = soup.get_text(" ", strip=True)
# Look for "Phone:", "Tel:", etc.
# This is a bit simplified compared to the other scraper but likely sufficient
phone_match = re.search(r'(?:Phone|Tel|Call).*?(\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', page_text, re.IGNORECASE)
if phone_match:
phone_candidate = phone_match.group(1)
else:
# Fallback to just finding a phone pattern
phone_match = re.search(r'(?:\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', page_text)
phone_candidate = phone_match.group(0) if phone_match else None
if phone_candidate:
digits = re.sub(r'\D', '', phone_candidate)
if len(digits) == 10:
phone = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
else:
phone = phone_candidate
return {"url": real_url, "phone": phone}
except Exception as e:
logging.warning(f"Failed to fetch details for {slug}: {e}")
return {"url": None, "phone": None}
def fetch_county_prices(state_api_name: str, county_name: str | None = None) -> list:
    """
    Fetch price data from the CheapestOil API.

    Args:
        state_api_name: State name as used by the API (e.g. "Massachusetts", "NewHampshire")
        county_name: County name filter, or None for state-level results

    Returns:
        List of raw JSON arrays from the API, or empty list on failure.
    """
    params = {
        "sort": 0,
        "state": state_api_name,
        "county": county_name or "",  # API expects "" rather than a missing key
        "zip": "",
    }
    try:
        resp = requests.get(
            API_URL, params=params, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT
        )
        resp.raise_for_status()
        data = resp.json()
    except requests.exceptions.RequestException as e:
        # Network/HTTP failure: log and degrade to an empty result.
        # NOTE: original code raised NameError here — `logging` was never
        # imported; fixed by the module-level import.
        logging.error(
            "Error fetching CheapestOil API for %s/%s: %s",
            state_api_name, county_name, e,
        )
        return []
    except ValueError as e:
        # resp.json() raises ValueError (json.JSONDecodeError) on non-JSON bodies.
        logging.error("Invalid JSON from CheapestOil API: %s", e)
        return []
    if isinstance(data, list):
        return data
    logging.warning("Unexpected response type from API: %s", type(data))
    return []