refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers
- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter,
  --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
111
newenglandoil/http_client.py
Normal file
111
newenglandoil/http_client.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
HTTP client module for making web requests.
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Headers passed by make_request() on every GET so the request mimics a
# desktop browser.
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

# Timeout, in seconds, handed to requests.get() by make_request().
REQUEST_TIMEOUT = 20

# Seconds fetch_phone_number() sleeps before each phone page request.
PHONE_FETCH_DELAY = 1
|
||||
|
||||
|
||||
def make_request(url: str) -> BeautifulSoup | None:
    """
    Fetch a URL and return a BeautifulSoup object.

    Issues a GET request with DEFAULT_HEADERS and the REQUEST_TIMEOUT
    timeout. Any requests-level failure (including an HTTP error status,
    via raise_for_status) is logged at ERROR level and reported to the
    caller as None rather than raised.

    Args:
        url: The URL to fetch

    Returns:
        BeautifulSoup object if successful, None otherwise
    """
    try:
        response = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        html = response.content
    except requests.exceptions.RequestException as e:
        logging.error("Error fetching %s: %s", url, e)
        return None

    # Parse the raw bytes (not response.text) so the parser does its own
    # encoding detection.
    return BeautifulSoup(html, 'html.parser')
|
||||
|
||||
|
||||
def fetch_phone_number(base_url: str, phone_page_path: str, state_slug: str = "") -> str | None:
|
||||
"""
|
||||
Fetch a phone number from a newenglandoil phones.asp page.
|
||||
|
||||
Args:
|
||||
base_url: Site base URL (e.g. "https://www.newenglandoil.com")
|
||||
phone_page_path: Relative path like "phones.asp?zone=1&ID=10&a=MA1"
|
||||
state_slug: State slug for URL path (e.g. "massachusetts")
|
||||
|
||||
Returns:
|
||||
Phone number string or None if not found.
|
||||
"""
|
||||
# Build full URL - phone_page_path may be relative
|
||||
if phone_page_path.startswith('http'):
|
||||
url = phone_page_path
|
||||
elif state_slug:
|
||||
url = f"{base_url}/{state_slug}/{phone_page_path}"
|
||||
else:
|
||||
url = f"{base_url}/{phone_page_path}"
|
||||
|
||||
time.sleep(PHONE_FETCH_DELAY)
|
||||
|
||||
soup = make_request(url)
|
||||
if not soup:
|
||||
return None
|
||||
|
||||
# Look for phone number patterns in the page text
|
||||
page_text = soup.get_text(" ", strip=True)
|
||||
|
||||
# Common US phone formats: (508) 555-1234, 508-555-1234, 508.555.1234, 5085551234
|
||||
# Captures:
|
||||
# 1. Optional open paren
|
||||
# 2. 3 digits (area code)
|
||||
# 3. Optional close paren
|
||||
# 4. Separator (space, dot, dash)
|
||||
# 5. 3 digits (prefix)
|
||||
# 6. Separator
|
||||
# 7. 4 digits (line number)
|
||||
phone_pattern = re.compile(
|
||||
r'(?:\(?(\d{3})\)?[\s.\-]?(\d{3})[\s.\-]?(\d{4}))'
|
||||
)
|
||||
|
||||
# Try to find a phone number near "Phone:" or "Tel:" first
|
||||
keyword_pattern = re.compile(r'(?:Phone|Tel|Call|Contact).*?(\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', re.IGNORECASE)
|
||||
keyword_match = keyword_pattern.search(page_text)
|
||||
|
||||
candidate = None
|
||||
if keyword_match:
|
||||
# If we found a number near a keyword, use that one.
|
||||
candidate = keyword_match.group(1)
|
||||
else:
|
||||
# Otherwise, look for the first valid phone pattern
|
||||
matches = phone_pattern.findall(page_text)
|
||||
for m in matches:
|
||||
# m is a tuple of groups: ('508', '555', '1234')
|
||||
full_num = "".join(m)
|
||||
|
||||
# Simple heuristic to avoid dates like 2024, 2025 or common years if adjacent
|
||||
# But the regex requires 3-3-4 structure so a simple "2024" won't match unless it's like 202-455-1234
|
||||
# We can filter out obviously bad "numbers" if needed, e.g. 000-000-0000
|
||||
if full_num.startswith('000'):
|
||||
continue
|
||||
|
||||
candidate = f"{m[0]}-{m[1]}-{m[2]}"
|
||||
break
|
||||
|
||||
if candidate:
|
||||
digits = re.sub(r'\D', '', candidate)
|
||||
if len(digits) == 10:
|
||||
return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
|
||||
return candidate
|
||||
|
||||
logging.debug(f"No phone number found on {url}")
|
||||
return None
|
||||
Reference in New Issue
Block a user