refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers

- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-06 11:34:21 -05:00
parent 8f45f4c209
commit 1592e6d685
26 changed files with 3221 additions and 1468 deletions
--- a/newenglandoil/http_client.py
+++ b/newenglandoil/http_client.py
@@ -0,0 +1,111 @@
+"""
+HTTP client module for making web requests.
+"""
+import logging
+import re
+import time
+import requests
+from bs4 import BeautifulSoup
+
+# Default request headers: a desktop Chrome User-Agent so requests mimic a browser
+DEFAULT_HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+}
+
+REQUEST_TIMEOUT = 20  # seconds; passed as timeout= to requests.get
+PHONE_FETCH_DELAY = 1  # seconds slept before each phone page request
+
+
+def make_request(url: str) -> BeautifulSoup | None:
+    """
+    GET a page using the module's default headers and timeout, then parse it.
+
+    Args:
+        url: Address of the page to download
+
+    Returns:
+        Parsed BeautifulSoup document, or None if the request failed (logged)
+    """
+    try:
+        page = requests.get(url, timeout=REQUEST_TIMEOUT, headers=DEFAULT_HEADERS)
+        page.raise_for_status()
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Error fetching {url}: {e}")
+        return None
+    return BeautifulSoup(page.content, 'html.parser')
+
+
+# 3-3-4 US formats: (508) 555-1234, 508-555-1234, 508.555.1234, 5085551234.
+# The digit lookarounds keep a match from being sliced out of a longer digit
+# run (e.g. an 11-digit "15085551234" or a numeric ID).
+_PHONE_NUMBER = r'(?<!\d)\(?(\d{3})\)?[\s.\-]?(\d{3})[\s.\-]?(\d{4})(?!\d)'
+_PHONE_PATTERN = re.compile(_PHONE_NUMBER)
+# Same number, but only where a contact keyword precedes it on the same line.
+_KEYWORD_PHONE_PATTERN = re.compile(
+    r'\b(?:Phone|Tel|Call|Contact).*?' + _PHONE_NUMBER, re.IGNORECASE
+)
+
+
+def _build_phone_url(base_url: str, phone_page_path: str, state_slug: str) -> str:
+    """
+    Build the absolute URL of a phone page.
+
+    Slashes at the joins are trimmed so "base/" + "/path" cannot produce "//".
+    """
+    if phone_page_path.startswith('http'):
+        return phone_page_path
+    segments = [base_url.rstrip('/'), phone_page_path.lstrip('/')]
+    if state_slug:
+        segments.insert(1, state_slug.strip('/'))
+    return "/".join(segments)
+
+
+def _extract_phone_number(page_text: str) -> str | None:
+    """
+    Find a US phone number in page text.
+
+    A number that follows a contact keyword (Phone, Tel, Call, Contact) wins;
+    otherwise the first number anywhere in the text is used. Numbers with a
+    000 area code are skipped as placeholders in both cases.
+
+    Returns:
+        The number formatted like "(508) 555-1234", or None if there is none.
+    """
+    for pattern in (_KEYWORD_PHONE_PATTERN, _PHONE_PATTERN):
+        for match in pattern.finditer(page_text):
+            area_code, prefix, line_number = match.groups()
+            if area_code == '000':  # e.g. 000-000-0000
+                continue
+            return f"({area_code}) {prefix}-{line_number}"
+    return None
+
+
+def fetch_phone_number(base_url: str, phone_page_path: str, state_slug: str = "") -> str | None:
+    """
+    Fetch a phone number from a newenglandoil phones.asp page.
+
+    Args:
+        base_url: Site base URL (e.g. "https://www.newenglandoil.com")
+        phone_page_path: Relative path like "phones.asp?zone=1&ID=10&a=MA1";
+            a value starting with "http" is used as the full URL
+        state_slug: State slug for URL path (e.g. "massachusetts")
+
+    Returns:
+        Phone number formatted like "(508) 555-1234", or None if the path is
+        empty, the page could not be fetched, or no number was found.
+    """
+    if not phone_page_path:
+        return None  # nothing to fetch; skip the delay and the request
+
+    url = _build_phone_url(base_url, phone_page_path, state_slug)
+    # Throttle: pause before every phone page request
+    time.sleep(PHONE_FETCH_DELAY)
+
+    soup = make_request(url)
+    if not soup:
+        return None
+
+    phone = _extract_phone_number(soup.get_text(" ", strip=True))
+    if phone is None:
+        logging.debug("No phone number found on %s", url)
+    return phone