feat(CRIT-010): add zone-to-county mapping and county_id to oil_prices

Add ZONE_COUNTY_MAP for all 5 scraped states (42 zone-to-county entries).
Scraper now resolves county_id at startup and assigns it to each record.
Upsert logic deduplicates by (name, state, county_id) to prevent duplicates
when multiple zones map to the same county. Also adds County model for
DB lookups and fixes Rhode Island zone count (4, not 5).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-09 18:31:46 -05:00
parent 6daa706e5a
commit 8f45f4c209
9 changed files with 746 additions and 34 deletions

View File

@@ -31,7 +31,7 @@ SITES_CONFIG = [
"zone1", "zone2", "zone3", "zone4", "zone5", "zone6" "zone1", "zone2", "zone3", "zone4", "zone5", "zone6"
], ],
"rhodeisland": [ "rhodeisland": [
"zone1", "zone2", "zone3", "zone4", "zone5" "zone1", "zone2", "zone3", "zone4"
], ],
@@ -40,23 +40,64 @@ SITES_CONFIG = [
{ {
"site_name": "MaineOil", "site_name": "MaineOil",
"base_url": "https://www.maineoil.com", "base_url": "https://www.maineoil.com",
# URL template for MaineOil using numeric zones like zone1.asp, zone2.asp
# {zone_slug} will be "zone1", "zone2", etc.
# No {state_slug} is needed in this part of the path for maineoil.com
"url_template": "{base_url}/{zone_slug}.asp?type={oil_type}", "url_template": "{base_url}/{zone_slug}.asp?type={oil_type}",
"oil_type": 0, "oil_type": 0,
"locations": { "locations": {
# "maine" is our internal key for the state.
# The zone_slugs are "zone1", "zone2", etc.
# YOU NEED TO VERIFY THE ACTUAL ZONE SLUGS AND COUNT FOR MAINEOIL.COM
"maine": [ "maine": [
"zone1", "zone2", "zone3", "zone4", "zone5", "zone1", "zone2", "zone3", "zone4", "zone5",
"zone6", "zone7" # Example: Add/remove based on actual zones on maineoil.com "zone6", "zone7"
] ]
} }
} }
] ]
# --- ZONE-TO-COUNTY MAPPING ---
# Maps (state_key, zone_number) -> (state_abbrev, county_name)
ZONE_COUNTY_MAP = {
("connecticut", 1): ("CT", "New London"),
("connecticut", 2): ("CT", "Windham"),
("connecticut", 3): ("CT", "New Haven"),
("connecticut", 4): ("CT", "Middlesex"),
("connecticut", 5): ("CT", "New Haven"),
("connecticut", 6): ("CT", "Hartford"),
("connecticut", 7): ("CT", "Litchfield"),
("connecticut", 8): ("CT", "Fairfield"),
("connecticut", 9): ("CT", "Tolland"),
("connecticut", 10): ("CT", "Litchfield"),
("massachusetts", 1): ("MA", "Suffolk"),
("massachusetts", 2): ("MA", "Middlesex"),
("massachusetts", 3): ("MA", "Norfolk"),
("massachusetts", 4): ("MA", "Plymouth"),
("massachusetts", 5): ("MA", "Middlesex"),
("massachusetts", 6): ("MA", "Bristol"),
("massachusetts", 7): ("MA", "Barnstable"),
("massachusetts", 8): ("MA", "Essex"),
("massachusetts", 9): ("MA", "Essex"),
("massachusetts", 10): ("MA", "Worcester"),
("massachusetts", 11): ("MA", "Worcester"),
("massachusetts", 12): ("MA", "Hampshire"),
("massachusetts", 13): ("MA", "Hampden"),
("massachusetts", 14): ("MA", "Franklin"),
("massachusetts", 15): ("MA", "Berkshire"),
("newhampshire", 1): ("NH", "Coos"),
("newhampshire", 2): ("NH", "Strafford"),
("newhampshire", 3): ("NH", "Merrimack"),
("newhampshire", 4): ("NH", "Grafton"),
("newhampshire", 5): ("NH", "Cheshire"),
("newhampshire", 6): ("NH", "Hillsborough"),
("rhodeisland", 1): ("RI", "Newport"),
("rhodeisland", 2): ("RI", "Providence"),
("rhodeisland", 3): ("RI", "Washington"),
("rhodeisland", 4): ("RI", "Kent"),
("maine", 1): ("ME", "Cumberland"),
("maine", 2): ("ME", "Kennebec"),
("maine", 3): ("ME", "Androscoggin"),
("maine", 4): ("ME", "York"),
("maine", 5): ("ME", "Knox"),
("maine", 6): ("ME", "Penobscot"),
("maine", 7): ("ME", "Washington"),
}
LOG_FILE = "oil_scraper.log" LOG_FILE = "oil_scraper.log"
logging.basicConfig( logging.basicConfig(
filename=LOG_FILE, filename=LOG_FILE,
@@ -125,7 +166,7 @@ def parse_price_table(soup, state_name_key, zone_slug_str):
if not is_price_table: if not is_price_table:
continue continue
candidate_tables_found += 1 candidate_tables_found += 1
tbody = table.find('tbody') tbody = table.find('tbody')
if not tbody: if not tbody:
@@ -139,7 +180,7 @@ def parse_price_table(soup, state_name_key, zone_slug_str):
for row_index, row in enumerate(rows): for row_index, row in enumerate(rows):
cells = row.find_all('td') cells = row.find_all('td')
max_required_index = max(actual_column_indices.values()) if actual_column_indices else -1 max_required_index = max(actual_column_indices.values()) if actual_column_indices else -1
if max_required_index == -1: if max_required_index == -1:
logging.error(f"Logic error: is_price_table true but no column indices for {state_name_key}/{zone_slug_str}") logging.error(f"Logic error: is_price_table true but no column indices for {state_name_key}/{zone_slug_str}")
continue continue
@@ -172,11 +213,31 @@ def parse_price_table(soup, state_name_key, zone_slug_str):
}) })
elif len(cells) > 0: elif len(cells) > 0:
logging.warning(f"Skipping row {row_index+1} with insufficient cells ({len(cells)}, need {max_required_index+1}) in {state_name_key}/{zone_slug_str}") logging.warning(f"Skipping row {row_index+1} with insufficient cells ({len(cells)}, need {max_required_index+1}) in {state_name_key}/{zone_slug_str}")
if candidate_tables_found == 0: if candidate_tables_found == 0:
logging.warning(f"No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.") logging.warning(f"No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.")
return data_dicts return data_dicts
# --- Helper: Build county lookup ---
def build_county_lookup(db_session):
"""Build (state_abbrev, county_name) -> county_id lookup from DB."""
counties = db_session.query(models.County).all()
lookup = {}
for c in counties:
lookup[(c.state, c.name)] = c.id
logging.info(f"Built county lookup with {len(lookup)} entries")
return lookup
def resolve_county_id(state_key, zone_number, county_lookup):
"""Resolve county_id from ZONE_COUNTY_MAP and county lookup."""
mapping = ZONE_COUNTY_MAP.get((state_key, zone_number))
if not mapping:
return None
state_abbrev, county_name = mapping
return county_lookup.get((state_abbrev, county_name))
# --- Main Script --- # --- Main Script ---
def main(): def main():
logging.info("Starting oil price scraper job.") logging.info("Starting oil price scraper job.")
@@ -191,22 +252,24 @@ def main():
total_records_added_this_run = 0 total_records_added_this_run = 0
try: try:
# Build county lookup at startup
county_lookup = build_county_lookup(db_session)
for site_config in SITES_CONFIG: for site_config in SITES_CONFIG:
site_name = site_config["site_name"] site_name = site_config["site_name"]
base_url = site_config["base_url"] base_url = site_config["base_url"]
url_template = site_config["url_template"] url_template = site_config["url_template"]
oil_type = site_config["oil_type"] oil_type = site_config["oil_type"]
logging.info(f"--- Processing site: {site_name} ---") logging.info(f"--- Processing site: {site_name} ---")
for state_key_in_config, zone_slugs_list in site_config["locations"].items(): for state_key_in_config, zone_slugs_list in site_config["locations"].items():
# state_key_in_config is "connecticut", "maine", etc.
for zone_slug_from_list in zone_slugs_list:
for zone_slug_from_list in zone_slugs_list: # e.g., "zone1", "zonema5"
format_params = { format_params = {
"base_url": base_url, "base_url": base_url,
"state_slug": state_key_in_config, # Used if {state_slug} in template "state_slug": state_key_in_config,
"zone_slug": zone_slug_from_list, # This is "zone1", "zonema5", etc. "zone_slug": zone_slug_from_list,
"oil_type": oil_type "oil_type": oil_type
} }
target_url = url_template.format(**format_params) target_url = url_template.format(**format_params)
@@ -215,44 +278,61 @@ def main():
soup = make_request(target_url) soup = make_request(target_url)
if soup: if soup:
# Pass state_key_in_config as state_name_key
# Pass zone_slug_from_list (e.g. "zone1") as zone_slug_str for parsing to int
parsed_items = parse_price_table(soup, state_key_in_config, zone_slug_from_list) parsed_items = parse_price_table(soup, state_key_in_config, zone_slug_from_list)
if parsed_items: if parsed_items:
for item_dict in parsed_items: # item_dict["zone"] will be an integer # Resolve county_id for this zone
# Check if a record with the same name, state, and zone already exists zone_int = parse_zone_slug_to_int(zone_slug_from_list)
existing_record = db_session.query(models.OilPrice).filter( county_id = None
models.OilPrice.name == item_dict["name"], if zone_int is not None:
models.OilPrice.state == item_dict["state"], county_id = resolve_county_id(state_key_in_config, zone_int, county_lookup)
models.OilPrice.zone == item_dict["zone"]
).first() for item_dict in parsed_items:
# Match by county_id when available to avoid duplicates
# when multiple zones map to the same county
if county_id is not None:
existing_record = db_session.query(models.OilPrice).filter(
models.OilPrice.name == item_dict["name"],
models.OilPrice.state == item_dict["state"],
models.OilPrice.county_id == county_id
).first()
else:
existing_record = db_session.query(models.OilPrice).filter(
models.OilPrice.name == item_dict["name"],
models.OilPrice.state == item_dict["state"],
models.OilPrice.zone == item_dict["zone"]
).first()
if existing_record: if existing_record:
# If record exists, check if company_id is not null
if existing_record.company_id is not None: if existing_record.company_id is not None:
logging.debug(f"Skipping update for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} due to non-null company_id") logging.debug(f"Skipping update for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} due to non-null company_id")
else: else:
# If company_id is null, check if price is different updated = False
if county_id is not None and existing_record.county_id != county_id:
existing_record.county_id = county_id
updated = True
if existing_record.price != item_dict["price"]: if existing_record.price != item_dict["price"]:
existing_record.price = item_dict["price"] existing_record.price = item_dict["price"]
existing_record.date = item_dict["date"] existing_record.date = item_dict["date"]
existing_record.scrapetimestamp = datetime.utcnow() existing_record.scrapetimestamp = datetime.utcnow()
logging.info(f"Updated price for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} to {item_dict['price']}") logging.info(f"Updated price for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} to {item_dict['price']}")
elif updated:
existing_record.scrapetimestamp = datetime.utcnow()
logging.info(f"Updated county_id for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} to {county_id}")
else: else:
logging.debug(f"Price unchanged for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}") logging.debug(f"Price unchanged for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}")
else: else:
# If no record exists, create a new one
oil_price_record = models.OilPrice( oil_price_record = models.OilPrice(
state=item_dict["state"], state=item_dict["state"],
zone=item_dict["zone"], zone=item_dict["zone"],
name=item_dict["name"], name=item_dict["name"],
price=item_dict["price"], price=item_dict["price"],
date=item_dict["date"], date=item_dict["date"],
county_id=county_id,
scrapetimestamp=datetime.utcnow() scrapetimestamp=datetime.utcnow()
) )
db_session.add(oil_price_record) db_session.add(oil_price_record)
logging.info(f"Added new record for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}") logging.info(f"Added new record for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} (county_id={county_id})")
total_records_added_this_run += len(parsed_items) total_records_added_this_run += len(parsed_items)
logging.info(f"Queued {len(parsed_items)} records from {site_name} - {state_key_in_config}/{zone_slug_from_list} for DB insertion.") logging.info(f"Queued {len(parsed_items)} records from {site_name} - {state_key_in_config}/{zone_slug_from_list} for DB insertion.")
else: else:

4
fuel_scraper/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
# fuel_scraper package
from .scraper import main
__all__ = ["main"]

114
fuel_scraper/config.py Normal file
View File

@@ -0,0 +1,114 @@
"""
Configuration module for the fuel scraper.
Contains site definitions, zone-to-county mapping, and logging setup.
"""
import logging
# --- SITES CONFIGURATION ---
# One entry per source site. Each url_template is filled with:
#   {base_url}   - the site's base URL
#   {state_slug} - the state key (only used by sites whose URLs include it)
#   {zone_slug}  - one of the zone slugs listed under "locations"
#   {oil_type}   - the "type" query parameter value below
# The "locations" keys ("connecticut", "maine", ...) are this project's
# internal state keys and must match the keys used in ZONE_COUNTY_MAP.
SITES_CONFIG = [
    {
        "site_name": "NewEnglandOil",
        "base_url": "https://www.newenglandoil.com",
        "url_template": "{base_url}/{state_slug}/{zone_slug}.asp?type={oil_type}",
        "oil_type": 0,
        "locations": {
            "connecticut": [
                "zone1", "zone2", "zone3", "zone4", "zone5", "zone6", "zone7",
                "zone8", "zone9", "zone10"
            ],
            "massachusetts": [
                "zone1", "zone2", "zone3", "zone4", "zone5", "zone6",
                "zone7", "zone8", "zone9", "zone10", "zone11", "zone12",
                "zone13", "zone14", "zone15"
            ],
            "newhampshire": [
                "zone1", "zone2", "zone3", "zone4", "zone5", "zone6"
            ],
            "rhodeisland": [
                "zone1", "zone2", "zone3", "zone4"
            ],
        }
    },
    {
        "site_name": "MaineOil",
        "base_url": "https://www.maineoil.com",
        # MaineOil URLs have no state segment; the zone slug alone selects the page.
        "url_template": "{base_url}/{zone_slug}.asp?type={oil_type}",
        "oil_type": 0,
        "locations": {
            "maine": [
                "zone1", "zone2", "zone3", "zone4", "zone5",
                "zone6", "zone7"
            ]
        }
    }
]
# --- ZONE-TO-COUNTY MAPPING ---
# Maps (state_key, zone_number) -> (state_abbrev, county_name).
# state_key matches the keys in SITES_CONFIG locations (lowercase, no spaces).
# county_name must match county.name in the database exactly.
# NOTE: the mapping is many-to-one — several zones can share a county — so
# upserts deduplicate on (name, state, county_id) rather than zone.
ZONE_COUNTY_MAP = {
    # Connecticut: 10 zones -> 8 distinct counties
    ("connecticut", 1): ("CT", "New London"),    # Southeast CT
    ("connecticut", 2): ("CT", "Windham"),       # Northeast CT
    ("connecticut", 3): ("CT", "New Haven"),     # New Haven, Bridgeport
    ("connecticut", 4): ("CT", "Middlesex"),     # Southeast Central CT
    ("connecticut", 5): ("CT", "New Haven"),     # Southwest Central CT
    ("connecticut", 6): ("CT", "Hartford"),      # Greater Hartford
    ("connecticut", 7): ("CT", "Litchfield"),    # West CT
    ("connecticut", 8): ("CT", "Fairfield"),     # Southwest CT
    ("connecticut", 9): ("CT", "Tolland"),       # Northeast Central CT
    ("connecticut", 10): ("CT", "Litchfield"),   # Northwest CT
    # Massachusetts: 15 zones -> 12 distinct counties
    ("massachusetts", 1): ("MA", "Suffolk"),     # South Boston
    ("massachusetts", 2): ("MA", "Middlesex"),   # North Boston
    ("massachusetts", 3): ("MA", "Norfolk"),     # Southwest of Boston
    ("massachusetts", 4): ("MA", "Plymouth"),    # South of Boston
    ("massachusetts", 5): ("MA", "Middlesex"),   # West of Boston
    ("massachusetts", 6): ("MA", "Bristol"),     # Southern Massachusetts
    ("massachusetts", 7): ("MA", "Barnstable"),  # Cape Cod & Islands
    ("massachusetts", 8): ("MA", "Essex"),       # Northwest of Boston
    ("massachusetts", 9): ("MA", "Essex"),       # North of Boston
    ("massachusetts", 10): ("MA", "Worcester"),  # Central Massachusetts
    ("massachusetts", 11): ("MA", "Worcester"),  # East Central Massachusetts
    ("massachusetts", 12): ("MA", "Hampshire"),  # West Central Massachusetts
    ("massachusetts", 13): ("MA", "Hampden"),    # Springfield Area
    ("massachusetts", 14): ("MA", "Franklin"),   # Northwestern Massachusetts
    ("massachusetts", 15): ("MA", "Berkshire"),  # Western Massachusetts
    # New Hampshire: 6 zones -> 6 distinct counties (not all NH counties are covered)
    ("newhampshire", 1): ("NH", "Coos"),          # Northern NH
    ("newhampshire", 2): ("NH", "Strafford"),     # Eastern NH
    ("newhampshire", 3): ("NH", "Merrimack"),     # Central NH
    ("newhampshire", 4): ("NH", "Grafton"),       # West Central NH
    ("newhampshire", 5): ("NH", "Cheshire"),      # Southwest NH
    ("newhampshire", 6): ("NH", "Hillsborough"),  # South Central NH
    # Rhode Island: 4 zones -> 4 distinct counties (no zone maps to the remaining county)
    ("rhodeisland", 1): ("RI", "Newport"),       # Southeast RI
    ("rhodeisland", 2): ("RI", "Providence"),    # Northern RI
    ("rhodeisland", 3): ("RI", "Washington"),    # Southwest RI
    ("rhodeisland", 4): ("RI", "Kent"),          # Central RI
    # Maine: 7 zones -> 7 distinct counties, via MaineOil.com (not all ME counties covered)
    ("maine", 1): ("ME", "Cumberland"),      # Greater Portland
    ("maine", 2): ("ME", "Kennebec"),        # Augusta/Waterville
    ("maine", 3): ("ME", "Androscoggin"),    # Auburn/Lewiston/Western
    ("maine", 4): ("ME", "York"),            # Southern Maine
    ("maine", 5): ("ME", "Knox"),            # Mid-Coast
    ("maine", 6): ("ME", "Penobscot"),       # Bangor West
    ("maine", 7): ("ME", "Washington"),      # Downeast
}
# --- LOGGING CONFIGURATION ---
LOG_FILE = "oil_scraper.log"


def setup_logging():
    """Configure root logging: INFO-and-above records written to LOG_FILE."""
    record_format = '%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format=record_format,
    )

View File

@@ -0,0 +1,105 @@
"""
Database operations module for oil price CRUD operations.
"""
import logging
from datetime import datetime
from sqlalchemy.orm import Session
import sys
import os
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import models
def upsert_oil_price(db_session: Session, item_dict: dict) -> bool:
    """
    Insert or update a single oil price record.

    Matching strategy: when a county_id is present the record is matched on
    (name, state, county_id) — this deduplicates entries when several zones
    map to the same county. Without a county_id it falls back to matching on
    (name, state, zone). Rows with a non-null company_id are vendor-managed
    and never modified by the scraper.

    Args:
        db_session: SQLAlchemy session
        item_dict: Dictionary with state, zone, name, price, date, county_id

    Returns:
        True if a record was inserted or updated, False otherwise
    """
    county_id = item_dict.get("county_id")
    name = item_dict["name"]
    state = item_dict["state"]
    zone = item_dict["zone"]

    # Common filter on (name, state); third key depends on county availability.
    base_query = db_session.query(models.OilPrice).filter(
        models.OilPrice.name == name,
        models.OilPrice.state == state,
    )
    if county_id is not None:
        existing_record = base_query.filter(
            models.OilPrice.county_id == county_id
        ).first()
    else:
        existing_record = base_query.filter(
            models.OilPrice.zone == zone
        ).first()

    if existing_record is None:
        # No match — insert a fresh row.
        db_session.add(models.OilPrice(
            state=state,
            zone=zone,
            name=name,
            price=item_dict["price"],
            date=item_dict["date"],
            county_id=county_id,
            scrapetimestamp=datetime.utcnow(),
        ))
        logging.info(
            f"Added new record for {name} in {state} zone {zone} "
            f"(county_id={county_id})"
        )
        return True

    # Vendor-managed rows are off-limits.
    if existing_record.company_id is not None:
        logging.debug(
            f"Skipping update for {name} in {state} zone {zone} "
            "due to non-null company_id"
        )
        return False

    # Backfill/correct county_id whenever we have a value and it differs.
    county_changed = county_id is not None and existing_record.county_id != county_id
    if county_changed:
        existing_record.county_id = county_id

    if existing_record.price != item_dict["price"]:
        existing_record.price = item_dict["price"]
        existing_record.date = item_dict["date"]
        existing_record.scrapetimestamp = datetime.utcnow()
        logging.info(
            f"Updated price for {name} in {state} zone {zone} "
            f"to {item_dict['price']}"
        )
        return True

    if county_changed:
        existing_record.scrapetimestamp = datetime.utcnow()
        logging.info(
            f"Updated county_id for {name} in {state} zone {zone} "
            f"to {county_id}"
        )
        return True

    logging.debug(f"Price unchanged for {name} in {state} zone {zone}")
    return False

View File

@@ -0,0 +1,32 @@
"""
HTTP client module for making web requests.
"""
import logging
import requests
from bs4 import BeautifulSoup
# Default headers to mimic a browser; some sites reject the default
# python-requests User-Agent.
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Per-request timeout in seconds passed to requests.get().
REQUEST_TIMEOUT = 20
def make_request(url: str) -> BeautifulSoup | None:
    """
    Fetch a URL and parse its body as HTML.

    Args:
        url: The URL to fetch

    Returns:
        BeautifulSoup object on success; None on any request failure
        (connection error, timeout, or non-2xx status).
    """
    try:
        response = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching {url}: {e}")
        return None
    return BeautifulSoup(response.content, 'html.parser')

177
fuel_scraper/parsers.py Normal file
View File

@@ -0,0 +1,177 @@
"""
HTML parsing module for extracting oil price data from web pages.
"""
import logging
import re
from bs4 import BeautifulSoup
def parse_zone_slug_to_int(zone_slug_str: str) -> int | None:
"""
Extract the numeric part of a zone slug.
Examples:
"zone1" -> 1
"zonema5" -> 5
Args:
zone_slug_str: Zone slug string like "zone1", "zonema5"
Returns:
Integer zone number or None if parsing fails
"""
if not zone_slug_str:
return None
match = re.search(r'\d+$', zone_slug_str)
if match:
return int(match.group(0))
logging.warning(f"Could not parse numeric zone from slug: '{zone_slug_str}'")
return None
def _find_price_table_columns(thead) -> dict | None:
"""
Find column indices for company, price, and date in a table header.
Args:
thead: BeautifulSoup thead element
Returns:
Dictionary with column indices or None if not a price table
"""
headers_lower = [th.get_text(strip=True).lower() for th in thead.find_all('th')]
column_indices = {}
try:
column_indices['company'] = headers_lower.index('company name')
price_col_name_part = 'price'
column_indices['price'] = next(
i for i, header in enumerate(headers_lower) if price_col_name_part in header
)
column_indices['date'] = headers_lower.index('date')
return column_indices
except (ValueError, StopIteration):
return None
def _parse_row(cells: list, column_indices: dict, state_name: str, zone: int) -> dict | None:
"""
Parse a single table row into a price record.
Args:
cells: List of td elements
column_indices: Dictionary mapping column names to indices
state_name: State name string
zone: Zone number
Returns:
Dictionary with parsed data or None if parsing fails
"""
max_required_index = max(column_indices.values())
if len(cells) <= max_required_index:
return None
# Extract company name (prefer link text if available)
company_cell = cells[column_indices['company']]
company_name = company_cell.get_text(strip=True)
company_link = company_cell.find('a')
if company_link:
company_name = company_link.get_text(strip=True)
# Extract and parse price
price_str = cells[column_indices['price']].get_text(strip=True)
price_float = None
try:
cleaned_price_str = ''.join(filter(lambda x: x.isdigit() or x == '.', price_str))
if cleaned_price_str:
price_float = float(cleaned_price_str)
except ValueError:
logging.warning(f"Could not parse price: '{price_str}' for {company_name} in {state_name}/zone{zone}.")
except Exception as e:
logging.error(f"Unexpected error parsing price: '{price_str}' for {company_name}. Error: {e}")
# Extract date
date_posted_str = cells[column_indices['date']].get_text(strip=True)
return {
"state": state_name.capitalize(),
"zone": zone,
"name": company_name,
"price": price_float,
"date": date_posted_str,
}
def parse_price_table(soup: BeautifulSoup, state_name_key: str, zone_slug_str: str) -> list[dict]:
    """
    Extract oil price records from every qualifying table on a page.

    Args:
        soup: BeautifulSoup object of the page
        state_name_key: State key like "connecticut", "maine"
        zone_slug_str: Zone slug like "zone1", "zonema5"

    Returns:
        List of record dictionaries (possibly empty).
    """
    records: list[dict] = []
    tables = soup.find_all('table')
    logging.info(f"Found {len(tables)} table(s) on page for {state_name_key} - {zone_slug_str}.")
    if not tables:
        logging.warning(f"No HTML tables found at all for {state_name_key} - {zone_slug_str}.")
        return records

    # The numeric zone is stored on every record; bail out if the slug is odd.
    zone_number = parse_zone_slug_to_int(zone_slug_str)
    if zone_number is None:
        logging.error(f"Cannot parse zone number for {state_name_key} - {zone_slug_str}. Skipping.")
        return records

    matched_tables = 0
    for idx, table in enumerate(tables):
        thead = table.find('thead')
        if not thead:
            logging.debug(f"Table {idx} has no thead.")
            continue

        header_map = _find_price_table_columns(thead)
        if header_map is None:
            logging.debug(f"Table {idx} headers do not contain all key columns.")
            continue
        logging.debug(f"Table {idx} identified as price table. Indices: {header_map}")
        matched_tables += 1

        tbody = table.find('tbody')
        if not tbody:
            logging.warning(f"Price table identified by headers has no tbody. Skipping. State: {state_name_key}, Zone: {zone_slug_str}")
            continue
        body_rows = tbody.find_all('tr')
        if not body_rows:
            logging.debug(f"No rows found in tbody for price table in {state_name_key}/{zone_slug_str}")
            continue

        for row_number, row in enumerate(body_rows):
            cells = row.find_all('td')
            parsed = _parse_row(cells, header_map, state_name_key, zone_number)
            if parsed is not None:
                records.append(parsed)
            elif cells:
                needed = max(header_map.values()) + 1
                logging.warning(
                    f"Skipping row {row_number+1} with insufficient cells ({len(cells)}, need {needed}) "
                    f"in {state_name_key}/{zone_slug_str}"
                )

    if matched_tables == 0:
        logging.warning(f"No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.")
    return records

191
fuel_scraper/scraper.py Normal file
View File

@@ -0,0 +1,191 @@
#!/usr/bin/env python3
"""
Main scraper orchestrator module.
Coordinates fetching, parsing, and storing oil price data.
"""
import logging
import sys
import os
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from sqlalchemy.orm import Session
from database import SessionLocal, init_db
import models
from .config import SITES_CONFIG, ZONE_COUNTY_MAP, setup_logging
from .http_client import make_request
from .parsers import parse_price_table, parse_zone_slug_to_int
from .db_operations import upsert_oil_price
def _build_county_lookup(db_session: Session) -> dict:
    """
    Build a (state_abbrev, county_name) -> county_id lookup from the
    county table, queried once at startup.
    """
    lookup = {
        (county.state, county.name): county.id
        for county in db_session.query(models.County).all()
    }
    logging.info(f"Built county lookup with {len(lookup)} entries")
    return lookup
def _resolve_county_id(state_key: str, zone_number: int, county_lookup: dict) -> int | None:
    """
    Resolve a county_id for a (state_key, zone_number) pair.

    Looks up the county name in ZONE_COUNTY_MAP, then the id in the DB-backed
    lookup. Returns None when either step has no match (both cases logged).
    """
    entry = ZONE_COUNTY_MAP.get((state_key, zone_number))
    if not entry:
        logging.debug(f"No zone-to-county mapping for ({state_key}, {zone_number})")
        return None
    abbrev, county = entry
    resolved_id = county_lookup.get((abbrev, county))
    if resolved_id is None:
        logging.warning(f"County not found in DB: ({abbrev}, {county}) for zone ({state_key}, {zone_number})")
    return resolved_id
def _scrape_zone(
    db_session: Session,
    site_name: str,
    url_template: str,
    base_url: str,
    oil_type: int,
    state_key: str,
    zone_slug: str,
    county_lookup: dict
) -> int:
    """
    Fetch, parse, and upsert one zone's price page.

    Returns:
        Number of records parsed from the page (regardless of how many
        actually changed the database).
    """
    target_url = url_template.format(
        base_url=base_url,
        state_slug=state_key,
        zone_slug=zone_slug,
        oil_type=oil_type,
    )
    logging.info(f"Scraping: {target_url} (State: {state_key}, Zone Slug: {zone_slug})")

    soup = make_request(target_url)
    if not soup:
        logging.warning(f"Failed to retrieve or parse {target_url}. Skipping.")
        return 0

    parsed_items = parse_price_table(soup, state_key, zone_slug)
    if not parsed_items:
        logging.info(f"No data extracted from {target_url}")
        return 0

    # One county per zone: resolve once and stamp every record from this page.
    zone_number = parse_zone_slug_to_int(zone_slug)
    county_id = None
    if zone_number is not None:
        county_id = _resolve_county_id(state_key, zone_number, county_lookup)

    changed = 0
    for record in parsed_items:
        record["county_id"] = county_id
        if upsert_oil_price(db_session, record):
            changed += 1

    logging.info(
        f"Processed {len(parsed_items)} records from {site_name} - {state_key}/{zone_slug} "
        f"({changed} inserted/updated, county_id={county_id})"
    )
    return len(parsed_items)
def _scrape_site(db_session: Session, site_config: dict, county_lookup: dict) -> int:
    """
    Scrape every configured zone for one site config entry.

    Returns:
        Total number of records processed across all of the site's zones.
    """
    site_name = site_config["site_name"]
    base_url = site_config["base_url"]
    url_template = site_config["url_template"]
    oil_type = site_config["oil_type"]
    logging.info(f"--- Processing site: {site_name} ---")

    site_total = 0
    for state_key, slugs in site_config["locations"].items():
        for slug in slugs:
            site_total += _scrape_zone(
                db_session=db_session,
                site_name=site_name,
                url_template=url_template,
                base_url=base_url,
                oil_type=oil_type,
                state_key=state_key,
                zone_slug=slug,
                county_lookup=county_lookup,
            )
    return site_total
def main():
    """
    Entry point for the oil price scraper.

    Initializes the database, scrapes every configured site/zone, and commits
    all queued changes in a single transaction (rolled back on any error).
    """
    setup_logging()
    logging.info("Starting oil price scraper job.")

    try:
        init_db()
        logging.info("Database initialized/checked successfully.")
    except Exception as e:
        logging.error(f"Failed to initialize database: {e}", exc_info=True)
        return

    session: Session = SessionLocal()
    grand_total = 0
    try:
        # County lookup is built once; zones reuse it for the whole run.
        lookup = _build_county_lookup(session)
        for site_config in SITES_CONFIG:
            grand_total += _scrape_site(session, site_config, lookup)

        if grand_total > 0:
            session.commit()
            logging.info("Successfully committed records to the database.")
        else:
            logging.info("No new records were queued for database insertion in this run.")
    except Exception as e:
        logging.error(f"An error occurred during scraping or DB operation: {e}", exc_info=True)
        session.rollback()
        logging.info("Database transaction rolled back due to error.")
    finally:
        session.close()
        logging.info("Database session closed.")

    logging.info("Oil price scraper job finished.")

View File

@@ -24,11 +24,20 @@ class OilPrice(Base):
# when a new record is created and this field is not explicitly set. # when a new record is created and this field is not explicitly set.
company_id = Column(Integer, ForeignKey("company.id"), nullable=True) company_id = Column(Integer, ForeignKey("company.id"), nullable=True)
county_id = Column(Integer, nullable=True)
def __repr__(self): def __repr__(self):
return (f"<OilPrice(id={self.id}, state='{self.state}', zone='{self.zone}', " return (f"<OilPrice(id={self.id}, state='{self.state}', zone='{self.zone}', "
f"name='{self.name}', price={self.price}, date='{self.date}', " f"name='{self.name}', price={self.price}, date='{self.date}', "
f"scraped_at='{self.scrapetimestamp}')>") # Added scraped_at to repr f"county_id={self.county_id}, scraped_at='{self.scrapetimestamp}')>")
# --- County Model (read-only, for lookups) ---
class County(Base):
__tablename__ = "county"
id = Column(Integer, primary_key=True)
name = Column(String(255))
state = Column(String(2))
# --- Company Model (remains the same) --- # --- Company Model (remains the same) ---
class Company(Base): class Company(Base):

2
run.py
View File

@@ -6,7 +6,7 @@ import logging
# The 'import models' is crucial for init_db to know about the tables # The 'import models' is crucial for init_db to know about the tables
import models import models
from database import init_db, SessionLocal from database import init_db, SessionLocal
from fuel_scraper import main as run_scraper_main # Assuming your scraper's main is 'main' from fuel_scraper import main as run_scraper_main # Import from modular package
# Configure basic logging for the run.py script itself if needed # Configure basic logging for the run.py script itself if needed
# Your other modules (fuel_scraper, database) will have their own logging # Your other modules (fuel_scraper, database) will have their own logging