refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers
- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
136
cheapestoil/api_client.py
Normal file
136
cheapestoil/api_client.py
Normal file
@@ -0,0 +1,136 @@
|
||||
"""
|
||||
HTTP client for the CheapestOil JSON API.
|
||||
"""
|
||||
import logging
import re

import requests
from bs4 import BeautifulSoup

from .config import API_URL
|
||||
|
||||
# Browser-like User-Agent so the site serves its normal HTML/JSON responses
# instead of blocking an obvious script client.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

# Seconds to wait before abandoning any single HTTP request.
REQUEST_TIMEOUT = 20
|
||||
|
||||
|
||||
def fetch_company_details(slug: str) -> dict:
    """
    Fetch company details (real URL, phone) from their CheapestOil profile page.

    Args:
        slug: The company slug/path (e.g. "Abc-Oil-Company"), or a full
            http(s) URL to the profile page.

    Returns:
        Dict with keys: "url" (str|None), "phone" (str|None). Both values are
        None when the slug is empty or the page cannot be fetched/parsed.
    """
    if not slug:
        return {"url": None, "phone": None}

    # Build the profile-page URL: a full URL is used verbatim, otherwise the
    # slug is appended to the site root.
    if slug.startswith("http"):
        url = slug
    else:
        url = f"https://www.cheapestoil.com/{slug}"

    try:
        resp = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, 'html.parser')

        real_url = None
        phone = None

        # 1. Extract the company's real website URL.
        # Primary pattern: an anchor labelled "Visit Website" (or similar)
        # pointing at an external, non-cheapestoil.com http(s) address.
        visit_link = soup.find('a', string=re.compile(r"Visit Website|Company Website", re.IGNORECASE))
        if visit_link:
            href = visit_link.get('href')
            if href and href.startswith('http') and 'cheapestoil.com' not in href:
                real_url = href

        # Fallback heuristic: first external link inside the main content
        # column (assumes a Bootstrap-style "col-md-8" layout — may need
        # adjustment if the site markup changes).
        if not real_url:
            content_div = soup.find('div', class_='col-md-8')
            if content_div:
                for a in content_div.find_all('a', href=True):
                    href = a['href']
                    if href.startswith('http') and 'cheapestoil.com' not in href:
                        real_url = href
                        break

        # 2. Extract a phone number from the flattened page text.
        page_text = soup.get_text(" ", strip=True)

        # Prefer a number that follows an explicit "Phone:"/"Tel:"/"Call" label.
        phone_match = re.search(
            r'(?:Phone|Tel|Call).*?(\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})',
            page_text,
            re.IGNORECASE,
        )
        if phone_match:
            phone_candidate = phone_match.group(1)
        else:
            # Fallback: any bare US-style phone pattern anywhere on the page.
            phone_match = re.search(r'\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}', page_text)
            phone_candidate = phone_match.group(0) if phone_match else None

        if phone_candidate:
            # Normalize 10-digit numbers to "(NNN) NNN-NNNN"; keep anything
            # else (e.g. numbers with extensions) exactly as scraped.
            digits = re.sub(r'\D', '', phone_candidate)
            if len(digits) == 10:
                phone = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
            else:
                phone = phone_candidate

        return {"url": real_url, "phone": phone}

    except Exception as e:
        # Best-effort scrape: any network/parse failure degrades to "unknown"
        # rather than aborting the caller's run. Requires the module-level
        # `import logging` (the original file used logging without importing it).
        logging.warning(f"Failed to fetch details for {slug}: {e}")
        return {"url": None, "phone": None}
|
||||
|
||||
|
||||
|
||||
def fetch_county_prices(state_api_name: str, county_name: str | None = None) -> list:
    """
    Fetch price data from the CheapestOil API.

    Args:
        state_api_name: State name as used by the API (e.g. "Massachusetts",
            "NewHampshire")
        county_name: County name filter, or None for state-level results

    Returns:
        List of raw JSON arrays from the API, or empty list on failure.
    """
    query = {
        "sort": 0,
        "state": state_api_name,
        "county": county_name or "",
        "zip": "",
    }

    try:
        response = requests.get(
            API_URL, params=query, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        payload = response.json()
    except requests.exceptions.RequestException as e:
        # Network / HTTP-status failures degrade to an empty result set.
        logging.error(f"Error fetching CheapestOil API for {state_api_name}/{county_name}: {e}")
        return []
    except ValueError as e:
        # response.json() raises a ValueError subclass on malformed JSON.
        logging.error(f"Invalid JSON from CheapestOil API: {e}")
        return []

    # The API is expected to return a JSON array; anything else is treated
    # as an empty result.
    if isinstance(payload, list):
        return payload
    logging.warning(f"Unexpected response type from API: {type(payload)}")
    return []
||||
Reference in New Issue
Block a user