refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers

- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter,
  --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 11:34:21 -05:00
parent 8f45f4c209
commit 1592e6d685
26 changed files with 3221 additions and 1468 deletions

View File

@@ -0,0 +1,111 @@
"""
HTTP client module for making web requests.
"""
import logging
import re
import time
import requests
from bs4 import BeautifulSoup
# Default headers to mimic a browser
# NOTE(review): a desktop-Chrome User-Agent; some sites block the default
# python-requests UA, so this is sent with every request.
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Per-request timeout in seconds, passed to requests.get().
REQUEST_TIMEOUT = 20
PHONE_FETCH_DELAY = 1  # seconds between phone page requests (politeness throttle)
def make_request(url: str) -> BeautifulSoup | None:
    """
    Fetch a URL and return a BeautifulSoup object.

    Args:
        url: The URL to fetch

    Returns:
        BeautifulSoup object if successful, None otherwise (any request
        failure — connection error, timeout, or non-2xx status — is logged
        and swallowed so callers can treat "no soup" uniformly).
    """
    try:
        response = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
        # Turn 4xx/5xx responses into RequestException so they hit the same path.
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        # Lazy %-args: message is only formatted if the record is actually emitted.
        logging.error("Error fetching %s: %s", url, e)
        return None
# Matches common US phone formats: (508) 555-1234, 508-555-1234,
# 508.555.1234, 5085551234.  Capture groups: (area code, prefix, line number).
_PHONE_RE = re.compile(r'(?:\(?(\d{3})\)?[\s.\-]?(\d{3})[\s.\-]?(\d{4}))')
# A phone number appearing shortly after a contact keyword ("Phone: ...").
_KEYWORD_PHONE_RE = re.compile(
    r'(?:Phone|Tel|Call|Contact).*?(\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})',
    re.IGNORECASE,
)


def _build_phone_url(base_url: str, phone_page_path: str, state_slug: str) -> str:
    """Resolve the phone-page URL; phone_page_path may already be absolute."""
    if phone_page_path.startswith('http'):
        return phone_page_path
    if state_slug:
        return f"{base_url}/{state_slug}/{phone_page_path}"
    return f"{base_url}/{phone_page_path}"


def _extract_phone_candidate(page_text: str) -> str | None:
    """Return the best raw phone-number string found in page_text, or None.

    Prefers a number that appears right after a contact keyword, falling back
    to the first generic match.  Numbers with the impossible '000' area code
    are rejected on BOTH paths (previously only the generic scan filtered
    them, so a bogus keyword match could slip through).
    """
    keyword_match = _KEYWORD_PHONE_RE.search(page_text)
    if keyword_match:
        candidate = keyword_match.group(1)
        # Same sanity filter as the generic scan: 000 area code is bogus.
        if not re.sub(r'\D', '', candidate).startswith('000'):
            return candidate
    # The 3-3-4 structure means bare years like "2024" can't match on their
    # own, so the only filtering needed is the obviously-bad 000 area code.
    for area, prefix, line in _PHONE_RE.findall(page_text):
        if area == '000':
            continue
        return f"{area}-{prefix}-{line}"
    return None


def fetch_phone_number(base_url: str, phone_page_path: str, state_slug: str = "") -> str | None:
    """
    Fetch a phone number from a newenglandoil phones.asp page.

    Args:
        base_url: Site base URL (e.g. "https://www.newenglandoil.com")
        phone_page_path: Relative path like "phones.asp?zone=1&ID=10&a=MA1"
        state_slug: State slug for URL path (e.g. "massachusetts")

    Returns:
        The phone number formatted as "(AAA) PPP-NNNN" when it normalizes to
        exactly 10 digits, the raw candidate string otherwise, or None if no
        number was found or the page could not be fetched.
    """
    url = _build_phone_url(base_url, phone_page_path, state_slug)
    # Politeness throttle between successive phone-page requests.
    time.sleep(PHONE_FETCH_DELAY)
    soup = make_request(url)
    if not soup:
        return None
    candidate = _extract_phone_candidate(soup.get_text(" ", strip=True))
    if candidate is None:
        logging.debug("No phone number found on %s", url)
        return None
    digits = re.sub(r'\D', '', candidate)
    if len(digits) == 10:
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
    return candidate