refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers
- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper) - Add cheapestoil/ package as a secondary market price scraper - Add app.py entry point for direct execution - Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data - Update models.py with latest schema fields - Update requirements.txt dependencies - Update Dockerfile and docker-compose.yml for new structure - Remove deprecated fuel_scraper module, test.py, and log file Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
4
newenglandoil/__init__.py
Normal file
4
newenglandoil/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# newenglandoil package
|
||||
from .scraper import main
|
||||
|
||||
__all__ = ["main"]
|
||||
125
newenglandoil/config.py
Normal file
125
newenglandoil/config.py
Normal file
@@ -0,0 +1,125 @@
|
||||
"""
|
||||
Configuration module for the fuel scraper.
|
||||
Contains site definitions, zone-to-county mapping, and logging setup.
|
||||
"""
|
||||
import logging
|
||||
|
||||
# --- SITES CONFIGURATION ---
# One entry per scraped site. "locations" maps a lowercase state key to the
# list of zone slugs that site publishes; zone slugs are 1-based and are
# resolved to counties via ZONE_COUNTY_MAP.
SITES_CONFIG = [
    {
        "site_name": "NewEnglandOil",
        "base_url": "https://www.newenglandoil.com",
        "url_template": "{base_url}/{state_slug}/{zone_slug}.asp?type={oil_type}",
        "oil_type": 0,
        "locations": {
            "connecticut": [f"zone{i}" for i in range(1, 11)],
            "massachusetts": [f"zone{i}" for i in range(1, 16)],
            "newhampshire": [f"zone{i}" for i in range(1, 7)],
            "rhodeisland": [f"zone{i}" for i in range(1, 5)],
        },
    },
    {
        "site_name": "MaineOil",
        "base_url": "https://www.maineoil.com",
        # MaineOil has no state segment in its zone URLs.
        "url_template": "{base_url}/{zone_slug}.asp?type={oil_type}",
        "oil_type": 0,
        "locations": {
            "maine": [f"zone{i}" for i in range(1, 8)],
        },
    },
]
|
||||
|
||||
# --- STATE ABBREVIATION MAP ---
# Maps lowercase state keys (as used in SITES_CONFIG locations) to 2-letter abbreviations
STATE_ABBREV_MAP = {
    "connecticut": "CT",
    "maine": "ME",
    "massachusetts": "MA",
    "newhampshire": "NH",
    "rhodeisland": "RI",
    "vermont": "VT",
}
|
||||
|
||||
# --- ZONE-TO-COUNTY MAPPING ---
# Maps (state_key, zone_number) -> (state_abbrev, county_name).
# state_key matches the keys in SITES_CONFIG locations (lowercase, no spaces);
# county_name must match county.name in the database exactly. Zones and
# counties are not 1:1 — several zones can share a county.
ZONE_COUNTY_MAP = {
    # Connecticut (10 zones -> 8 counties)
    ("connecticut", 1): ("CT", "New London"),    # Southeast CT
    ("connecticut", 2): ("CT", "Windham"),       # Northeast CT
    ("connecticut", 3): ("CT", "New Haven"),     # New Haven, Bridgeport
    ("connecticut", 4): ("CT", "Middlesex"),     # Southeast Central CT
    ("connecticut", 5): ("CT", "New Haven"),     # Southwest Central CT
    ("connecticut", 6): ("CT", "Hartford"),      # Greater Hartford
    ("connecticut", 7): ("CT", "Litchfield"),    # West CT
    ("connecticut", 8): ("CT", "Fairfield"),     # Southwest CT
    ("connecticut", 9): ("CT", "Tolland"),       # Northeast Central CT
    ("connecticut", 10): ("CT", "Litchfield"),   # Northwest CT

    # Massachusetts (15 zones -> 14 counties)
    ("massachusetts", 1): ("MA", "Suffolk"),     # South Boston
    ("massachusetts", 2): ("MA", "Middlesex"),   # North Boston
    ("massachusetts", 3): ("MA", "Norfolk"),     # Southwest of Boston
    ("massachusetts", 4): ("MA", "Plymouth"),    # South of Boston
    ("massachusetts", 5): ("MA", "Middlesex"),   # West of Boston
    ("massachusetts", 6): ("MA", "Bristol"),     # Southern Massachusetts
    ("massachusetts", 7): ("MA", "Barnstable"),  # Cape Cod & Islands
    ("massachusetts", 8): ("MA", "Essex"),       # Northwest of Boston
    ("massachusetts", 9): ("MA", "Essex"),       # North of Boston
    ("massachusetts", 10): ("MA", "Worcester"),  # Central Massachusetts
    ("massachusetts", 11): ("MA", "Worcester"),  # East Central Massachusetts
    ("massachusetts", 12): ("MA", "Hampshire"),  # West Central Massachusetts
    ("massachusetts", 13): ("MA", "Hampden"),    # Springfield Area
    ("massachusetts", 14): ("MA", "Franklin"),   # Northwestern Massachusetts
    ("massachusetts", 15): ("MA", "Berkshire"),  # Western Massachusetts

    # New Hampshire (6 zones -> 10 counties)
    ("newhampshire", 1): ("NH", "Coos"),         # Northern NH
    ("newhampshire", 2): ("NH", "Strafford"),    # Eastern NH
    ("newhampshire", 3): ("NH", "Merrimack"),    # Central NH
    ("newhampshire", 4): ("NH", "Grafton"),      # West Central NH
    ("newhampshire", 5): ("NH", "Cheshire"),     # Southwest NH
    ("newhampshire", 6): ("NH", "Hillsborough"), # South Central NH

    # Rhode Island (4 zones -> 5 counties)
    ("rhodeisland", 1): ("RI", "Newport"),       # Southeast RI
    ("rhodeisland", 2): ("RI", "Providence"),    # Northern RI
    ("rhodeisland", 3): ("RI", "Washington"),    # Southwest RI
    ("rhodeisland", 4): ("RI", "Kent"),          # Central RI

    # Maine (7 zones -> 16 counties, via MaineOil.com)
    ("maine", 1): ("ME", "Cumberland"),          # Greater Portland
    ("maine", 2): ("ME", "Kennebec"),            # Augusta/Waterville
    ("maine", 3): ("ME", "Androscoggin"),        # Auburn/Lewiston/Western
    ("maine", 4): ("ME", "York"),                # Southern Maine
    ("maine", 5): ("ME", "Knox"),                # Mid-Coast
    ("maine", 6): ("ME", "Penobscot"),           # Bangor West
    ("maine", 7): ("ME", "Washington"),          # Downeast
}
|
||||
|
||||
# --- LOGGING CONFIGURATION ---
LOG_FILE = "oil_scraper.log"


def setup_logging():
    """Configure the root logger to append INFO-level scraper output to LOG_FILE."""
    log_format = '%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format=log_format,
    )
|
||||
131
newenglandoil/db_operations.py
Normal file
131
newenglandoil/db_operations.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""
|
||||
Database operations module for oil price CRUD operations.
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
from datetime import datetime
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import func
|
||||
import models
|
||||
|
||||
|
||||
def upsert_oil_price(db_session: Session, item_dict: dict, force_update_metadata: bool = False) -> bool:
    """
    Insert or update an oil price record.

    Logic:
    - Match by (name, state, county_id) - case insensitive on name!
    - If county_id is None, fall back to (name, state, zone).
    - If match found:
        - If company_id is set: SKIP (vendor managed).
        - Update name to formatted version (e.g. "Leblanc Oil" vs "LEBLANC OIL").
        - Update phone/url if missing OR force_update_metadata is True.
        - Update price/date if changed.
    - If no match: INSERT.

    Args:
        db_session: SQLAlchemy session
        item_dict: Dictionary with state, zone, name, price, date, county_id
        force_update_metadata: If True, overwrite existing phone/url

    Returns:
        True if a row was inserted or updated, False otherwise.
    """
    county_id = item_dict.get("county_id")
    site_name = item_dict.get("site_name", "NewEnglandOil")
    name_clean = item_dict["name"].strip()

    # BUGFIX: the parser can emit price=None when a row's price cell is
    # unparsable (see parsers._parse_row). Previously that raised a TypeError
    # in the price comparison below and could insert NULL prices; skip such
    # rows instead so one bad row no longer aborts the whole run.
    if item_dict.get("price") is None:
        logging.warning(
            f"[{site_name}] Skipping {name_clean} in {item_dict['state']} zone {item_dict['zone']}: "
            "missing or unparsable price"
        )
        return False

    # Query for existing record - case insensitive on name.
    query = db_session.query(models.OilPrice).filter(
        func.lower(models.OilPrice.name) == name_clean.lower(),
        models.OilPrice.state == item_dict["state"]
    )
    if county_id is not None:
        query = query.filter(models.OilPrice.county_id == county_id)
    else:
        # No resolved county: fall back to matching on the raw zone number.
        query = query.filter(models.OilPrice.zone == item_dict["zone"])

    existing_record = query.first()

    new_phone = item_dict.get("phone")
    new_url = item_dict.get("url")

    if existing_record:
        # Vendor-managed rows (linked to a company account) are never touched.
        if existing_record.company_id is not None:
            logging.debug(
                f"[{site_name}] Skipping update for {name_clean} (ID={existing_record.id}) "
                "due to non-null company_id"
            )
            return False

        updated = False

        # 1. Update name casing if different; we trust the scraper's
        #    _smart_title() output is generally good.
        if existing_record.name != name_clean:
            existing_record.name = name_clean
            updated = True

        # 2. Backfill county_id if the scraper resolved one the DB lacks.
        if county_id is not None and existing_record.county_id != county_id:
            existing_record.county_id = county_id
            updated = True

        # 3. Backfill or (when forced) overwrite phone/url.
        if new_phone:
            if not existing_record.phone or (force_update_metadata and existing_record.phone != new_phone):
                existing_record.phone = new_phone
                updated = True

        if new_url:
            if not existing_record.url or (force_update_metadata and existing_record.url != new_url):
                existing_record.url = new_url
                updated = True

        # 4. Price change. A small tolerance avoids float-representation
        #    noise; prices are normally 2-decimal values. Also treat a NULL
        #    stored price as "changed" rather than crashing on arithmetic.
        if existing_record.price is None or abs(existing_record.price - item_dict["price"]) > 0.001:
            existing_record.price = item_dict["price"]
            existing_record.date = item_dict["date"]
            existing_record.scrapetimestamp = datetime.utcnow()
            logging.info(
                f"[{site_name}] Updated price for {name_clean} (ID={existing_record.id}) "
                f"to {item_dict['price']}"
            )
            return True
        elif updated:
            existing_record.scrapetimestamp = datetime.utcnow()
            logging.info(
                f"[{site_name}] Updated metadata for {name_clean} (ID={existing_record.id})"
            )
            return True
        else:
            # No meaningful change.
            logging.debug(
                f"[{site_name}] Price unchanged for {name_clean} in {item_dict['state']} zone {item_dict['zone']}"
            )
            return False
    else:
        # No match: insert a fresh row.
        oil_price_record = models.OilPrice(
            state=item_dict["state"],
            zone=item_dict["zone"],
            name=name_clean,
            price=item_dict["price"],
            date=item_dict["date"],
            county_id=county_id,
            phone=new_phone,
            url=new_url,
            scrapetimestamp=datetime.utcnow()
        )
        db_session.add(oil_price_record)
        logging.info(
            f"[{site_name}] Added new record for {name_clean} in {item_dict['state']} zone {item_dict['zone']} "
            f"(county_id={county_id})"
        )
        return True
||||
111
newenglandoil/http_client.py
Normal file
111
newenglandoil/http_client.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
HTTP client module for making web requests.
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Default headers to mimic a browser
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

REQUEST_TIMEOUT = 20
PHONE_FETCH_DELAY = 1  # seconds between phone page requests


def make_request(url: str) -> BeautifulSoup | None:
    """
    Fetch a URL and return a BeautifulSoup object.

    Args:
        url: The URL to fetch

    Returns:
        BeautifulSoup object if successful, None otherwise (any request
        error is logged and swallowed so callers can treat failures as
        "page unavailable").
    """
    try:
        response = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        logging.error(f"Error fetching {url}: {err}")
        return None
    return BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
|
||||
def fetch_phone_number(base_url: str, phone_page_path: str, state_slug: str = "") -> str | None:
    """
    Fetch a phone number from a newenglandoil phones.asp page.

    Args:
        base_url: Site base URL (e.g. "https://www.newenglandoil.com")
        phone_page_path: Relative path like "phones.asp?zone=1&ID=10&a=MA1"
        state_slug: State slug for URL path (e.g. "massachusetts")

    Returns:
        Phone number string (formatted "(XXX) XXX-XXXX" when 10 digits) or
        None if not found.
    """
    # Resolve the full URL: the path may already be absolute, and some sites
    # nest their phone pages under a state segment.
    if phone_page_path.startswith('http'):
        url = phone_page_path
    elif state_slug:
        url = f"{base_url}/{state_slug}/{phone_page_path}"
    else:
        url = f"{base_url}/{phone_page_path}"

    # Be polite: throttle phone-page requests.
    time.sleep(PHONE_FETCH_DELAY)

    soup = make_request(url)
    if not soup:
        return None

    page_text = soup.get_text(" ", strip=True)

    # 3-3-4 digit groups with optional parens/separators, matching common US
    # formats: (508) 555-1234, 508-555-1234, 508.555.1234, 5085551234.
    phone_pattern = re.compile(
        r'(?:\(?(\d{3})\)?[\s.\-]?(\d{3})[\s.\-]?(\d{4}))'
    )
    # Prefer a number that appears right after a contact-related keyword.
    keyword_pattern = re.compile(r'(?:Phone|Tel|Call|Contact).*?(\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', re.IGNORECASE)

    candidate = None
    keyword_match = keyword_pattern.search(page_text)
    if keyword_match:
        candidate = keyword_match.group(1)
    else:
        # Fall back to the first plausible phone pattern anywhere on the page.
        for area, exchange, line in phone_pattern.findall(page_text):
            # Discard obviously-bogus placeholder numbers like 000-000-0000.
            if (area + exchange + line).startswith('000'):
                continue
            candidate = f"{area}-{exchange}-{line}"
            break

    if candidate:
        digits = re.sub(r'\D', '', candidate)
        # Normalize 10-digit numbers to "(XXX) XXX-XXXX"; otherwise return
        # the raw candidate unchanged.
        if len(digits) == 10:
            return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
        return candidate

    logging.debug(f"No phone number found on {url}")
    return None
|
||||
289
newenglandoil/parsers.py
Normal file
289
newenglandoil/parsers.py
Normal file
@@ -0,0 +1,289 @@
|
||||
"""
|
||||
HTML parsing module for extracting oil price data from web pages.
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .config import STATE_ABBREV_MAP
|
||||
|
||||
|
||||
def parse_zone_slug_to_int(zone_slug_str: str) -> int | None:
|
||||
"""
|
||||
Extract the numeric part of a zone slug.
|
||||
|
||||
Examples:
|
||||
"zone1" -> 1
|
||||
"zonema5" -> 5
|
||||
|
||||
Args:
|
||||
zone_slug_str: Zone slug string like "zone1", "zonema5"
|
||||
|
||||
Returns:
|
||||
Integer zone number or None if parsing fails
|
||||
"""
|
||||
if not zone_slug_str:
|
||||
return None
|
||||
match = re.search(r'\d+$', zone_slug_str)
|
||||
if match:
|
||||
return int(match.group(0))
|
||||
logging.warning(f"Could not parse numeric zone from slug: '{zone_slug_str}'")
|
||||
return None
|
||||
|
||||
|
||||
def _find_price_table_columns(thead) -> dict | None:
    """
    Find column indices for company, price, and date in a table header.

    The company and date columns are matched by exact header text; the price
    column by substring (headers vary, e.g. "price per gallon").

    Args:
        thead: BeautifulSoup thead element

    Returns:
        {'company': i, 'price': j, 'date': k} or None if any required
        column is missing (i.e. this is not a price table).
    """
    header_texts = [th.get_text(strip=True).lower() for th in thead.find_all('th')]
    try:
        return {
            'company': header_texts.index('company name'),
            'price': next(i for i, text in enumerate(header_texts) if 'price' in text),
            'date': header_texts.index('date'),
        }
    except (ValueError, StopIteration):
        return None
|
||||
|
||||
|
||||
def _smart_title(name: str) -> str:
|
||||
"""
|
||||
Convert a company name to title case, preserving common abbreviations.
|
||||
|
||||
Handles: LLC, INC, CO, LP, HVAC, A1, etc.
|
||||
"""
|
||||
# Common abbreviations that should stay uppercase
|
||||
keep_upper = {"LLC", "INC", "LP", "HVAC", "II", "III", "IV", "USA", "CT", "MA", "NH", "ME", "RI", "VT"}
|
||||
words = name.title().split()
|
||||
result = []
|
||||
for word in words:
|
||||
if word.upper() in keep_upper:
|
||||
result.append(word.upper())
|
||||
else:
|
||||
result.append(word)
|
||||
return " ".join(result)
|
||||
|
||||
|
||||
def _extract_company_url(company_link) -> str | None:
|
||||
"""
|
||||
Extract the actual company URL from a link.
|
||||
|
||||
Handles:
|
||||
1. Redirects: click.asp?x=http://example.com&... -> http://example.com
|
||||
2. Direct links: http://example.com -> http://example.com
|
||||
"""
|
||||
if not company_link:
|
||||
return None
|
||||
|
||||
href = company_link.get('href', '')
|
||||
if not href:
|
||||
return None
|
||||
|
||||
url_candidate = None
|
||||
|
||||
if 'click.asp' in href:
|
||||
# Parse the x parameter which contains the actual URL
|
||||
try:
|
||||
parsed = urlparse(href)
|
||||
params = parse_qs(parsed.query)
|
||||
extracted = params.get('x', [None])[0]
|
||||
if extracted:
|
||||
url_candidate = extracted
|
||||
except Exception:
|
||||
pass
|
||||
elif href.startswith(('http://', 'https://')):
|
||||
# Direct link
|
||||
url_candidate = href
|
||||
|
||||
# Validate the candidate URL
|
||||
if url_candidate:
|
||||
try:
|
||||
# Basic validation
|
||||
if not url_candidate.startswith(('http://', 'https://')):
|
||||
return None
|
||||
|
||||
lower_url = url_candidate.lower()
|
||||
# Filter out internal or competitor site loops
|
||||
if 'newenglandoil.com' in lower_url or 'cheapestoil.com' in lower_url:
|
||||
return None
|
||||
|
||||
return url_candidate
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _extract_phone_link(cells: list) -> dict | None:
|
||||
"""
|
||||
Extract the phone page link info from a row's phone cell.
|
||||
|
||||
Phone link format: phones.asp?zone=1&ID=10&a=MA1
|
||||
Returns dict with {url, company_neo_id} or None.
|
||||
"""
|
||||
for cell in cells:
|
||||
link = cell.find('a', href=lambda h: h and 'phones.asp' in h)
|
||||
if link:
|
||||
href = link.get('href', '')
|
||||
try:
|
||||
parsed = urlparse(href)
|
||||
params = parse_qs(parsed.query)
|
||||
neo_id = params.get('ID', [None])[0]
|
||||
return {
|
||||
"phone_page_path": href,
|
||||
"neo_id": neo_id,
|
||||
}
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def _parse_row(cells: list, column_indices: dict, state_name: str, zone: int) -> dict | None:
    """
    Parse a single table row into a price record.

    Args:
        cells: List of td elements
        column_indices: Dictionary mapping column names to indices
        state_name: State name string (lowercase key like "connecticut")
        zone: Zone number

    Returns:
        Record dict, or None when the row has too few cells. Note that a
        price parse failure is logged and yields price=None in the record,
        so callers must tolerate missing prices.
    """
    if len(cells) <= max(column_indices.values()):
        return None

    # Company name: prefer the anchor text when the name is a link.
    company_cell = cells[column_indices['company']]
    company_link = company_cell.find('a')
    raw_name = company_link.get_text(strip=True) if company_link else company_cell.get_text(strip=True)
    company_name = _smart_title(raw_name)

    # Company website (from click.asp redirect) and phone page link.
    company_url = _extract_company_url(company_link)
    phone_info = _extract_phone_link(cells)

    # Price: strip everything but digits and the decimal point before parsing.
    price_str = cells[column_indices['price']].get_text(strip=True)
    price_float = None
    try:
        numeric_chars = ''.join(ch for ch in price_str if ch.isdigit() or ch == '.')
        if numeric_chars:
            price_float = float(numeric_chars)
    except ValueError:
        logging.warning(f"Could not parse price: '{price_str}' for {company_name} in {state_name}/zone{zone}.")
    except Exception as e:
        logging.error(f"Unexpected error parsing price: '{price_str}' for {company_name}. Error: {e}")

    date_posted_str = cells[column_indices['date']].get_text(strip=True)

    # Convert the state key to its 2-letter abbreviation, falling back to a
    # capitalized form for unknown keys.
    state_abbr = STATE_ABBREV_MAP.get(state_name.lower())
    if not state_abbr:
        logging.warning(f"Unknown state key: {state_name}, using capitalized form")
        state_abbr = state_name.capitalize()

    return {
        "state": state_abbr,
        "zone": zone,
        "name": company_name,
        "price": price_float,
        "date": date_posted_str,
        "url": company_url,
        "phone_info": phone_info,
    }
|
||||
|
||||
|
||||
def parse_price_table(soup: BeautifulSoup, state_name_key: str, zone_slug_str: str, site_name: str = "NewEnglandOil") -> list[dict]:
    """
    Parse price tables from a BeautifulSoup page.

    Scans every table on the page, keeps those whose headers match the
    expected company/price/date layout, and parses their body rows.

    Args:
        soup: BeautifulSoup object of the page
        state_name_key: State key like "connecticut", "maine"
        zone_slug_str: Zone slug like "zone1", "zonema5"
        site_name: Label used in log messages

    Returns:
        List of dictionaries containing price data (empty on any failure)
    """
    data_dicts = []
    all_tables = soup.find_all('table')
    logging.info(f"[{site_name}] Found {len(all_tables)} table(s) on page for {state_name_key} - {zone_slug_str}.")

    if not all_tables:
        logging.warning(f"[{site_name}] No HTML tables found at all for {state_name_key} - {zone_slug_str}.")
        return data_dicts

    # The zone number is needed for every record; bail out if unparsable.
    zone_int = parse_zone_slug_to_int(zone_slug_str)
    if zone_int is None:
        logging.error(f"[{site_name}] Cannot parse zone number for {state_name_key} - {zone_slug_str}. Skipping.")
        return data_dicts

    candidate_tables_found = 0

    for table_index, table in enumerate(all_tables):
        thead = table.find('thead')
        if not thead:
            logging.debug(f"Table {table_index} has no thead.")
            continue

        # Header check: is this one of the price tables?
        column_indices = _find_price_table_columns(thead)
        if column_indices is None:
            logging.debug(f"Table {table_index} headers do not contain all key columns.")
            continue

        logging.debug(f"Table {table_index} identified as price table. Indices: {column_indices}")
        candidate_tables_found += 1

        tbody = table.find('tbody')
        if not tbody:
            logging.warning(f"[{site_name}] Price table identified by headers has no tbody. Skipping. State: {state_name_key}, Zone: {zone_slug_str}")
            continue

        rows = tbody.find_all('tr')
        if not rows:
            logging.debug(f"No rows found in tbody for price table in {state_name_key}/{zone_slug_str}")
            continue

        for row_index, row in enumerate(rows):
            cells = row.find_all('td')
            record = _parse_row(cells, column_indices, state_name_key, zone_int)
            if record:
                data_dicts.append(record)
            elif cells:
                # _parse_row returned None: the row was too short.
                max_required = max(column_indices.values()) + 1
                logging.warning(
                    f"[{site_name}] Skipping row {row_index+1} with insufficient cells ({len(cells)}, need {max_required}) "
                    f"in {state_name_key}/{zone_slug_str}"
                )

    if candidate_tables_found == 0:
        logging.warning(f"[{site_name}] No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.")

    return data_dicts
|
||||
266
newenglandoil/scraper.py
Normal file
266
newenglandoil/scraper.py
Normal file
@@ -0,0 +1,266 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Main scraper orchestrator module.
|
||||
Coordinates fetching, parsing, and storing oil price data.
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from database import SessionLocal, init_db
|
||||
import models
|
||||
|
||||
from .config import SITES_CONFIG, ZONE_COUNTY_MAP, setup_logging, STATE_ABBREV_MAP
|
||||
from .http_client import make_request, fetch_phone_number
|
||||
from .parsers import parse_price_table, parse_zone_slug_to_int
|
||||
from .db_operations import upsert_oil_price
|
||||
|
||||
|
||||
def _build_county_lookup(db_session: Session) -> dict:
    """
    Build a {(state_abbrev, county_name): county_id} lookup by querying the
    county table. Counties with an empty/NULL name are skipped.
    """
    lookup = {
        (county.state, county.name.strip()): county.id
        for county in db_session.query(models.County).all()
        if county.name
    }
    logging.info(f"Built county lookup with {len(lookup)} entries")
    return lookup
|
||||
|
||||
|
||||
def _resolve_county_id(state_key: str, zone_number: int, county_lookup: dict) -> int | None:
    """
    Resolve a county_id from ZONE_COUNTY_MAP and the county lookup.

    Returns None (with a debug/warning log) when either the zone mapping or
    the corresponding DB county row is missing.
    """
    mapping = ZONE_COUNTY_MAP.get((state_key, zone_number))
    if mapping is None:
        logging.debug(f"No zone-to-county mapping for ({state_key}, {zone_number})")
        return None

    state_abbrev, county_name = mapping
    county_id = county_lookup.get((state_abbrev, county_name))
    if county_id is None:
        logging.warning(f"County not found in DB: ({state_abbrev}, {county_name}) for zone ({state_key}, {zone_number})")
    return county_id
|
||||
|
||||
|
||||
def _scrape_zone(
    db_session: Session,
    site_name: str,
    url_template: str,
    base_url: str,
    oil_type: int,
    state_key: str,
    zone_slug: str,
    county_lookup: dict,
    phone_cache: dict,
    refresh_metadata: bool = False,
) -> int:
    """
    Scrape a single zone page and upsert its price rows.

    Args:
        phone_cache: Dict mapping phone_page_path -> phone string, shared
            across zones so each company's phone page is fetched at most once
            per run (neo_id alone is only unique per zone, so the full path,
            e.g. "phones.asp?zone=1&ID=10&a=MA1", is the cache key).
        refresh_metadata: Passed through to the upsert so existing phone/url
            values in the DB are overwritten.

    Returns:
        Number of rows parsed from the page (0 on fetch/parse failure).
    """
    target_url = url_template.format(
        base_url=base_url,
        state_slug=state_key,
        zone_slug=zone_slug,
        oil_type=oil_type,
    )
    logging.info(f"[{site_name}] Scraping: {target_url} (State: {state_key}, Zone Slug: {zone_slug})")

    soup = make_request(target_url)
    if not soup:
        logging.warning(f"[{site_name}] Failed to retrieve or parse {target_url}. Skipping.")
        return 0

    parsed_items = parse_price_table(soup, state_key, zone_slug, site_name)
    if not parsed_items:
        logging.info(f"[{site_name}] No data extracted from {target_url}")
        return 0

    # Resolve the zone's county once; every row on the page shares it.
    zone_number = parse_zone_slug_to_int(zone_slug)
    county_id = _resolve_county_id(state_key, zone_number, county_lookup) if zone_number is not None else None

    records_processed = 0
    for item_dict in parsed_items:
        item_dict["county_id"] = county_id
        item_dict["site_name"] = site_name

        # Fetch the phone number unless this company's page was already
        # fetched earlier in this run (one fetch per run is enough even with
        # refresh_metadata; the flag only controls DB overwrites).
        phone_info = item_dict.pop("phone_info", None)
        if phone_info:
            phone_key = phone_info.get("phone_page_path")
            if phone_key:
                if phone_key in phone_cache:
                    item_dict["phone"] = phone_cache[phone_key]
                else:
                    # Only include state_slug in the phone URL if the site
                    # uses it in its URL template.
                    slug = state_key if "{state_slug}" in url_template else ""
                    phone = fetch_phone_number(base_url, phone_info["phone_page_path"], slug)
                    phone_cache[phone_key] = phone
                    item_dict["phone"] = phone
                    if phone:
                        logging.info(f"[{site_name}] Fetched phone for {item_dict['name']} (ID={phone_info.get('neo_id')}): {phone}")

        if upsert_oil_price(db_session, item_dict, force_update_metadata=refresh_metadata):
            records_processed += 1

    logging.info(
        f"[{site_name}] Processed {len(parsed_items)} records from {site_name} - {state_key}/{zone_slug} "
        f"({records_processed} inserted/updated, county_id={county_id}) (Size: {len(parsed_items)})"
    )
    return len(parsed_items)
|
||||
|
||||
|
||||
def _scrape_site(db_session: Session, site_config: dict, county_lookup: dict, refresh_metadata: bool = False) -> int:
    """
    Scrape every configured zone of a single site.

    Returns:
        Total number of records processed across all zones.
    """
    site_name = site_config["site_name"]
    logging.info(f"--- Processing site: {site_name} ---")

    total_records = 0
    # Shared across all zones of this site so each company's phone page is
    # fetched at most once per run.
    phone_cache = {}

    for state_key, zone_slugs in site_config["locations"].items():
        for zone_slug in zone_slugs:
            total_records += _scrape_zone(
                db_session=db_session,
                site_name=site_name,
                url_template=site_config["url_template"],
                base_url=site_config["base_url"],
                oil_type=site_config["oil_type"],
                state_key=state_key,
                zone_slug=zone_slug,
                county_lookup=county_lookup,
                phone_cache=phone_cache,
                refresh_metadata=refresh_metadata,
            )

    logging.info(f"Phone cache: fetched {len(phone_cache)} unique company phones for {site_name}")
    return total_records
|
||||
|
||||
|
||||
def main(refresh_metadata: bool = False, target_state_abbr: str | None = None):
    """
    Main entry point for the oil price scraper.

    Args:
        refresh_metadata: If True, force re-fetch details.
        target_state_abbr: If set (e.g. "MA"), only scrape that state.
    """
    setup_logging()

    state_msg = f" (State: {target_state_abbr})" if target_state_abbr else ""
    logging.info(f"Starting oil price scraper job.{state_msg} (Refresh Metadata: {refresh_metadata})")

    # Initialize database; abort the run on failure.
    try:
        init_db()
        logging.info("Database initialized/checked successfully.")
    except Exception as e:
        logging.error(f"Failed to initialize database: {e}", exc_info=True)
        return

    db_session: Session = SessionLocal()
    total_records = 0

    try:
        # Build the county lookup once at startup.
        county_lookup = _build_county_lookup(db_session)

        # Translate the 2-letter filter (e.g. "MA") back to a config state key.
        abbrev_to_state = {abbr: key for key, abbr in STATE_ABBREV_MAP.items()}
        target_state_key = abbrev_to_state.get(target_state_abbr.upper()) if target_state_abbr else None
        if target_state_abbr and not target_state_key:
            logging.error(f"Unknown state abbreviation: {target_state_abbr}")
            return

        for site_config in SITES_CONFIG:
            config_to_use = site_config
            if target_state_key:
                if target_state_key not in site_config["locations"]:
                    logging.info(f"Skipping {site_config['site_name']} (does not cover {target_state_abbr})")
                    continue
                # Shallow copy with only the requested state's zones.
                config_to_use = site_config.copy()
                config_to_use["locations"] = {
                    target_state_key: site_config["locations"][target_state_key]
                }

            total_records += _scrape_site(db_session, config_to_use, county_lookup, refresh_metadata=refresh_metadata)

        # Commit everything at once so a failed run leaves the DB untouched.
        if total_records > 0:
            db_session.commit()
            logging.info("Successfully committed records to the database.")
        else:
            logging.info("No new records were queued for database insertion in this run.")

    except Exception as e:
        logging.error(f"An error occurred during scraping or DB operation: {e}", exc_info=True)
        db_session.rollback()
        logging.info("Database transaction rolled back due to error.")
    finally:
        db_session.close()
        logging.info("Database session closed.")

    logging.info("Oil price scraper job finished.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user