- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper) - Add cheapestoil/ package as a secondary market price scraper - Add app.py entry point for direct execution - Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data - Update models.py with latest schema fields - Update requirements.txt dependencies - Update Dockerfile and docker-compose.yml for new structure - Remove deprecated fuel_scraper module, test.py, and log file Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
112 lines
3.6 KiB
Python
112 lines
3.6 KiB
Python
"""
|
|
HTTP client module for making web requests.
|
|
"""
|
|
import logging
|
|
import re
|
|
import time
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Timing knobs used by the request helpers in this module.
REQUEST_TIMEOUT = 20  # passed as ``timeout=`` to requests.get (seconds)
PHONE_FETCH_DELAY = 1  # seconds slept before each phone page request

# Headers attached to outgoing requests so the scraper presents itself as a
# desktop Chrome browser instead of the default python-requests client.
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}
|
|
|
|
|
|
def make_request(url: str) -> BeautifulSoup | None:
    """
    Download a page and parse it into a BeautifulSoup tree.

    The request is sent with DEFAULT_HEADERS and gives up after
    REQUEST_TIMEOUT seconds. Any requests-level failure, including an
    HTTP error status surfaced by ``raise_for_status()``, is logged and
    reported to the caller as ``None`` rather than raised.

    Args:
        url: Address of the page to download.

    Returns:
        The parsed document, or None when the request failed.
    """
    parsed = None
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT, headers=DEFAULT_HEADERS)
        # Turn 4xx/5xx responses into an exception so they share the
        # failure path below.
        page.raise_for_status()
        parsed = BeautifulSoup(page.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching {url}: {e}")
    return parsed
|
|
|
|
|
|
def fetch_phone_number(base_url: str, phone_page_path: str, state_slug: str = "") -> str | None:
|
|
"""
|
|
Fetch a phone number from a newenglandoil phones.asp page.
|
|
|
|
Args:
|
|
base_url: Site base URL (e.g. "https://www.newenglandoil.com")
|
|
phone_page_path: Relative path like "phones.asp?zone=1&ID=10&a=MA1"
|
|
state_slug: State slug for URL path (e.g. "massachusetts")
|
|
|
|
Returns:
|
|
Phone number string or None if not found.
|
|
"""
|
|
# Build full URL - phone_page_path may be relative
|
|
if phone_page_path.startswith('http'):
|
|
url = phone_page_path
|
|
elif state_slug:
|
|
url = f"{base_url}/{state_slug}/{phone_page_path}"
|
|
else:
|
|
url = f"{base_url}/{phone_page_path}"
|
|
|
|
time.sleep(PHONE_FETCH_DELAY)
|
|
|
|
soup = make_request(url)
|
|
if not soup:
|
|
return None
|
|
|
|
# Look for phone number patterns in the page text
|
|
page_text = soup.get_text(" ", strip=True)
|
|
|
|
# Common US phone formats: (508) 555-1234, 508-555-1234, 508.555.1234, 5085551234
|
|
# Captures:
|
|
# 1. Optional open paren
|
|
# 2. 3 digits (area code)
|
|
# 3. Optional close paren
|
|
# 4. Separator (space, dot, dash)
|
|
# 5. 3 digits (prefix)
|
|
# 6. Separator
|
|
# 7. 4 digits (line number)
|
|
phone_pattern = re.compile(
|
|
r'(?:\(?(\d{3})\)?[\s.\-]?(\d{3})[\s.\-]?(\d{4}))'
|
|
)
|
|
|
|
# Try to find a phone number near "Phone:" or "Tel:" first
|
|
keyword_pattern = re.compile(r'(?:Phone|Tel|Call|Contact).*?(\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', re.IGNORECASE)
|
|
keyword_match = keyword_pattern.search(page_text)
|
|
|
|
candidate = None
|
|
if keyword_match:
|
|
# If we found a number near a keyword, use that one.
|
|
candidate = keyword_match.group(1)
|
|
else:
|
|
# Otherwise, look for the first valid phone pattern
|
|
matches = phone_pattern.findall(page_text)
|
|
for m in matches:
|
|
# m is a tuple of groups: ('508', '555', '1234')
|
|
full_num = "".join(m)
|
|
|
|
# Simple heuristic to avoid dates like 2024, 2025 or common years if adjacent
|
|
# But the regex requires 3-3-4 structure so a simple "2024" won't match unless it's like 202-455-1234
|
|
# We can filter out obviously bad "numbers" if needed, e.g. 000-000-0000
|
|
if full_num.startswith('000'):
|
|
continue
|
|
|
|
candidate = f"{m[0]}-{m[1]}-{m[2]}"
|
|
break
|
|
|
|
if candidate:
|
|
digits = re.sub(r'\D', '', candidate)
|
|
if len(digits) == 10:
|
|
return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
|
|
return candidate
|
|
|
|
logging.debug(f"No phone number found on {url}")
|
|
return None
|