refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers

- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter,
  --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 11:34:21 -05:00
parent 8f45f4c209
commit 1592e6d685
26 changed files with 3221 additions and 1468 deletions

136
cheapestoil/api_client.py Normal file
View File

@@ -0,0 +1,136 @@
"""
HTTP client for the CheapestOil JSON API.
"""
import logging
import re

import requests
from bs4 import BeautifulSoup

from .config import API_URL
# Browser-like headers: some sites serve different (or no) content to
# clients that don't present a mainstream browser User-Agent.
DEFAULT_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/91.0.4472.124 Safari/537.36"
)
}
# Seconds to wait before giving up on any single HTTP request.
REQUEST_TIMEOUT = 20
def fetch_company_details(slug: str) -> dict:
"""
Fetch company details (real URL, phone) from their CheapestOil profile page.
Args:
slug: The company slug/path (e.g. "Abc-Oil-Company")
Returns:
Dict with keys: "url" (str|None), "phone" (str|None)
"""
if not slug:
return {"url": None, "phone": None}
# Construct detail URL
# If slug is full URL, use it, else append to base
if slug.startswith("http"):
url = slug
else:
url = f"https://www.cheapestoil.com/{slug}"
try:
resp = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, 'html.parser')
real_url = None
phone = None
# 1. Extract Real URL
# Look for "Visit Website" link or similar anchor texts
# Usually contained in a link with text "Visit Website" or the company name
# We look for a link that is NOT internal (doesn't contain cheapestoil.com)
# and behaves like an external link.
# Common pattern: <a href="..." target="_blank">Visit Website</a>
visit_link = soup.find('a', string=re.compile(r"Visit Website|Company Website", re.IGNORECASE))
if visit_link and visit_link.get('href'):
href = visit_link.get('href')
if 'cheapestoil.com' not in href and href.startswith('http'):
real_url = href
# Fallback: look for any external link in the contact section if structured
if not real_url:
# Try to find the first external link in the main content area
# (This is heuristics-based, might need adjustment)
content_div = soup.find('div', class_='col-md-8') # Common bootstrap main col
if content_div:
links = content_div.find_all('a', href=True)
for a in links:
href = a['href']
if href.startswith('http') and 'cheapestoil.com' not in href:
real_url = href
break
# 2. Extract Phone
# Reuse robust regex pattern logic
page_text = soup.get_text(" ", strip=True)
# Look for "Phone:", "Tel:", etc.
# This is a bit simplified compared to the other scraper but likely sufficient
phone_match = re.search(r'(?:Phone|Tel|Call).*?(\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', page_text, re.IGNORECASE)
if phone_match:
phone_candidate = phone_match.group(1)
else:
# Fallback to just finding a phone pattern
phone_match = re.search(r'(?:\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', page_text)
phone_candidate = phone_match.group(0) if phone_match else None
if phone_candidate:
digits = re.sub(r'\D', '', phone_candidate)
if len(digits) == 10:
phone = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
else:
phone = phone_candidate
return {"url": real_url, "phone": phone}
except Exception as e:
logging.warning(f"Failed to fetch details for {slug}: {e}")
return {"url": None, "phone": None}
def fetch_county_prices(state_api_name: str, county_name: str | None = None) -> list:
    """
    Fetch price data from the CheapestOil API.

    Args:
        state_api_name: State name as used by the API (e.g. "Massachusetts", "NewHampshire")
        county_name: County name filter, or None for state-level results

    Returns:
        List of raw JSON arrays from the API, or empty list on failure.
    """
    params = {
        "sort": 0,
        "state": state_api_name,
        "county": county_name or "",  # API expects "" rather than a missing key
        "zip": "",
    }
    try:
        resp = requests.get(
            API_URL, params=params, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT
        )
        resp.raise_for_status()
        data = resp.json()
    except requests.exceptions.RequestException as e:
        # Network/HTTP failure: log and degrade to an empty result.
        # NOTE: original code raised NameError here — `logging` was never
        # imported; fixed by the module-level import.
        logging.error(
            "Error fetching CheapestOil API for %s/%s: %s",
            state_api_name, county_name, e,
        )
        return []
    except ValueError as e:
        # resp.json() raises ValueError (json.JSONDecodeError) on non-JSON bodies.
        logging.error("Invalid JSON from CheapestOil API: %s", e)
        return []
    if isinstance(data, list):
        return data
    logging.warning("Unexpected response type from API: %s", type(data))
    return []