feat(CRIT-010): add zone-to-county mapping and county_id to oil_prices

Add ZONE_COUNTY_MAP for all 5 scraped states (42 zone-to-county entries). Scraper now resolves county_id at startup and assigns it to each record. Upsert logic deduplicates by (name, state, county_id) to prevent duplicates when multiple zones map to the same county. Also adds County model for DB lookups and fixes Rhode Island zone count (4, not 5). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 18:31:46 -05:00
parent 6daa706e5a
commit 8f45f4c209
9 changed files with 746 additions and 34 deletions
@@ -0,0 +1,4 @@
+# fuel_scraper package: re-exports the scraper entry point so that
+# `main` is importable directly from the package.
+from .scraper import main
+
+__all__ = ["main"]
@@ -0,0 +1,114 @@
+"""
+Configuration module for the fuel scraper.
+Contains site definitions, zone-to-county mapping, and logging setup.
+"""
+import logging
+
+# --- SITES CONFIGURATION ---
+# One entry per scraped site.
+#   url_template: filled in with str.format() using the placeholders
+#                 base_url, state_slug, zone_slug and oil_type. A template
+#                 may ignore a placeholder (MaineOil has no state_slug).
+#   locations:    maps a state key to the list of zone slugs to scrape.
+#                 State keys are also the first element of the
+#                 ZONE_COUNTY_MAP keys.
+SITES_CONFIG = [
+    {
+        "site_name": "NewEnglandOil",
+        "base_url": "https://www.newenglandoil.com",
+        "url_template": "{base_url}/{state_slug}/{zone_slug}.asp?type={oil_type}",
+        "oil_type": 0,
+        "locations": {
+            "connecticut": [
+                "zone1", "zone2", "zone3", "zone4", "zone5", "zone6", "zone7",
+                "zone8", "zone9", "zone10"
+            ],
+            "massachusetts": [
+                "zone1", "zone2", "zone3", "zone4", "zone5", "zone6",
+                "zone7", "zone8", "zone9", "zone10", "zone11", "zone12",
+                "zone13", "zone14", "zone15"
+            ],
+            "newhampshire": [
+                "zone1", "zone2", "zone3", "zone4", "zone5", "zone6"
+            ],
+            "rhodeisland": [
+                "zone1", "zone2", "zone3", "zone4"
+            ],
+        }
+    },
+    {
+        "site_name": "MaineOil",
+        "base_url": "https://www.maineoil.com",
+        "url_template": "{base_url}/{zone_slug}.asp?type={oil_type}",
+        "oil_type": 0,
+        "locations": {
+            "maine": [
+                "zone1", "zone2", "zone3", "zone4", "zone5",
+                "zone6", "zone7"
+            ]
+        }
+    }
+]
+
+# --- ZONE-TO-COUNTY MAPPING ---
+# Maps (state_key, zone_number) -> (state_abbrev, county_name)
+# state_key matches the keys in SITES_CONFIG locations (lowercase, no spaces)
+# county_name must match the county.name in the database exactly
+# Each zone maps to exactly one county, and several zones may share a county,
+# so not every county of a state is necessarily represented. The per-state
+# headers below give "zones -> distinct counties mapped (of state total)".
+ZONE_COUNTY_MAP = {
+    # Connecticut (10 zones -> 8 of the state's 8 counties)
+    ("connecticut", 1): ("CT", "New London"),       # Southeast CT
+    ("connecticut", 2): ("CT", "Windham"),           # Northeast CT
+    ("connecticut", 3): ("CT", "New Haven"),         # New Haven, Bridgeport
+    ("connecticut", 4): ("CT", "Middlesex"),         # Southeast Central CT
+    ("connecticut", 5): ("CT", "New Haven"),         # Southwest Central CT
+    ("connecticut", 6): ("CT", "Hartford"),          # Greater Hartford
+    ("connecticut", 7): ("CT", "Litchfield"),        # West CT
+    ("connecticut", 8): ("CT", "Fairfield"),         # Southwest CT
+    ("connecticut", 9): ("CT", "Tolland"),           # Northeast Central CT
+    ("connecticut", 10): ("CT", "Litchfield"),       # Northwest CT
+
+    # Massachusetts (15 zones -> 12 of the state's 14 counties)
+    ("massachusetts", 1): ("MA", "Suffolk"),         # South Boston
+    ("massachusetts", 2): ("MA", "Middlesex"),       # North Boston
+    ("massachusetts", 3): ("MA", "Norfolk"),         # Southwest of Boston
+    ("massachusetts", 4): ("MA", "Plymouth"),        # South of Boston
+    ("massachusetts", 5): ("MA", "Middlesex"),       # West of Boston
+    ("massachusetts", 6): ("MA", "Bristol"),         # Southern Massachusetts
+    ("massachusetts", 7): ("MA", "Barnstable"),      # Cape Cod & Islands
+    ("massachusetts", 8): ("MA", "Essex"),           # Northwest of Boston
+    ("massachusetts", 9): ("MA", "Essex"),           # North of Boston
+    ("massachusetts", 10): ("MA", "Worcester"),      # Central Massachusetts
+    ("massachusetts", 11): ("MA", "Worcester"),      # East Central Massachusetts
+    ("massachusetts", 12): ("MA", "Hampshire"),      # West Central Massachusetts
+    ("massachusetts", 13): ("MA", "Hampden"),        # Springfield Area
+    ("massachusetts", 14): ("MA", "Franklin"),       # Northwestern Massachusetts
+    ("massachusetts", 15): ("MA", "Berkshire"),      # Western Massachusetts
+
+    # New Hampshire (6 zones -> 6 of the state's 10 counties)
+    ("newhampshire", 1): ("NH", "Coos"),             # Northern NH
+    ("newhampshire", 2): ("NH", "Strafford"),        # Eastern NH
+    ("newhampshire", 3): ("NH", "Merrimack"),        # Central NH
+    ("newhampshire", 4): ("NH", "Grafton"),          # West Central NH
+    ("newhampshire", 5): ("NH", "Cheshire"),         # Southwest NH
+    ("newhampshire", 6): ("NH", "Hillsborough"),     # South Central NH
+
+    # Rhode Island (4 zones -> 4 of the state's 5 counties)
+    ("rhodeisland", 1): ("RI", "Newport"),           # Southeast RI
+    ("rhodeisland", 2): ("RI", "Providence"),        # Northern RI
+    ("rhodeisland", 3): ("RI", "Washington"),        # Southwest RI
+    ("rhodeisland", 4): ("RI", "Kent"),              # Central RI
+
+    # Maine (7 zones -> 7 of the state's 16 counties, via MaineOil.com)
+    ("maine", 1): ("ME", "Cumberland"),              # Greater Portland
+    ("maine", 2): ("ME", "Kennebec"),                # Augusta/Waterville
+    ("maine", 3): ("ME", "Androscoggin"),            # Auburn/Lewiston/Western
+    ("maine", 4): ("ME", "York"),                    # Southern Maine
+    ("maine", 5): ("ME", "Knox"),                    # Mid-Coast
+    ("maine", 6): ("ME", "Penobscot"),               # Bangor West
+    ("maine", 7): ("ME", "Washington"),              # Downeast
+}
+
+# --- LOGGING CONFIGURATION ---
+LOG_FILE = "oil_scraper.log"
+
+
+def setup_logging():
+    """Point the root logger at LOG_FILE at INFO level.
+
+    Each line records the timestamp, level, source file and line number,
+    followed by the message.
+    """
+    line_format = '%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
+    logging.basicConfig(filename=LOG_FILE, level=logging.INFO, format=line_format)
@@ -0,0 +1,105 @@
+"""
+Database operations module for oil price CRUD operations.
+"""
+import logging
+from datetime import datetime
+from sqlalchemy.orm import Session
+
+import sys
+import os
+# Add parent directory to path for imports
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import models
+
+
+def upsert_oil_price(db_session: Session, item_dict: dict) -> bool:
+    """
+    Insert or update an oil price record.
+
+    Lookup order:
+    1. (name, state, county_id) when county_id is available, so that several
+       zones mapping to the same county share one record.
+    2. (name, state, zone) otherwise, and also as a fallback when step 1
+       finds nothing. The fallback lets rows stored without a county_id (or
+       with a different one) be updated in place instead of being duplicated.
+
+    Outcome:
+    - Record exists with non-null company_id: skip (vendor-managed price).
+    - Record exists with null company_id: set county_id if one was supplied
+      and it differs; update price/date if the price differs.
+    - No record exists: insert a new one.
+
+    Args:
+        db_session: SQLAlchemy session. Nothing is committed here; the caller
+            owns the transaction.
+        item_dict: Dictionary with state, zone, name, price, date and
+            (optionally) county_id.
+
+    Returns:
+        True if a record was inserted or updated, False otherwise
+    """
+    county_id = item_dict.get("county_id")
+    name = item_dict["name"]
+    state = item_dict["state"]
+    zone = item_dict["zone"]
+    price = item_dict["price"]
+
+    def _find(extra_criterion):
+        # All lookups share the (name, state) part and differ in one criterion.
+        return db_session.query(models.OilPrice).filter(
+            models.OilPrice.name == name,
+            models.OilPrice.state == state,
+            extra_criterion
+        ).first()
+
+    existing_record = None
+    if county_id is not None:
+        existing_record = _find(models.OilPrice.county_id == county_id)
+    if existing_record is None:
+        # Either no county_id was resolved, or no row carries it yet.
+        existing_record = _find(models.OilPrice.zone == zone)
+
+    if existing_record is None:
+        # No record exists - create new
+        oil_price_record = models.OilPrice(
+            state=state,
+            zone=zone,
+            name=name,
+            price=price,
+            date=item_dict["date"],
+            county_id=county_id,
+            scrapetimestamp=datetime.utcnow()
+        )
+        db_session.add(oil_price_record)
+        # Flush so later lookups in this same transaction can see the new row
+        # even if the session was created with autoflush disabled (the
+        # session factory is configured outside this module).
+        db_session.flush()
+        logging.info(
+            f"Added new record for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} "
+            f"(county_id={county_id})"
+        )
+        return True
+
+    # Record exists - vendor-managed rows are never touched by the scraper
+    if existing_record.company_id is not None:
+        logging.debug(
+            f"Skipping update for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} "
+            "due to non-null company_id"
+        )
+        return False
+
+    # Always update county_id if we have one and it differs
+    county_changed = county_id is not None and existing_record.county_id != county_id
+    if county_changed:
+        existing_record.county_id = county_id
+
+    if existing_record.price != price:
+        existing_record.price = price
+        existing_record.date = item_dict["date"]
+        existing_record.scrapetimestamp = datetime.utcnow()
+        logging.info(
+            f"Updated price for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} "
+            f"to {item_dict['price']}"
+        )
+        return True
+
+    if county_changed:
+        existing_record.scrapetimestamp = datetime.utcnow()
+        logging.info(
+            f"Updated county_id for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} "
+            f"to {county_id}"
+        )
+        return True
+
+    logging.debug(
+        f"Price unchanged for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}"
+    )
+    return False
@@ -0,0 +1,32 @@
+"""
+HTTP client module for making web requests.
+"""
+import logging
+import requests
+from bs4 import BeautifulSoup
+
+# Default headers to mimic a browser
+DEFAULT_HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+}
+
+REQUEST_TIMEOUT = 20
+
+
+def make_request(url: str) -> BeautifulSoup | None:
+    """
+    Download a page and hand it back parsed.
+
+    The request is sent with DEFAULT_HEADERS and gives up after
+    REQUEST_TIMEOUT seconds. HTTP error statuses are treated as failures.
+
+    Args:
+        url: The URL to fetch
+
+    Returns:
+        BeautifulSoup tree of the response body, or None when the request
+        fails (the failure is logged, not raised)
+    """
+    try:
+        resp = requests.get(url, timeout=REQUEST_TIMEOUT, headers=DEFAULT_HEADERS)
+        resp.raise_for_status()
+        page = BeautifulSoup(resp.content, 'html.parser')
+        return page
+    except requests.exceptions.RequestException as exc:
+        logging.error(f"Error fetching {url}: {exc}")
+        return None
@@ -0,0 +1,177 @@
+"""
+HTML parsing module for extracting oil price data from web pages.
+"""
+import logging
+import re
+from bs4 import BeautifulSoup
+
+
+def parse_zone_slug_to_int(zone_slug_str: str) -> int | None:
+    """
+    Return the number formed by the trailing digits of a zone slug.
+
+    Examples:
+        "zone1" -> 1
+        "zonema5" -> 5
+
+    Args:
+        zone_slug_str: Zone slug string like "zone1", "zonema5"
+
+    Returns:
+        The zone number, or None when the slug is empty or does not end in
+        digits (the latter case is logged as a warning)
+    """
+    if not zone_slug_str:
+        return None
+    trailing_digits = re.search(r'\d+$', zone_slug_str)
+    if trailing_digits is None:
+        logging.warning(f"Could not parse numeric zone from slug: '{zone_slug_str}'")
+        return None
+    return int(trailing_digits.group(0))
+
+
+def _find_price_table_columns(thead) -> dict | None:
+    """
+    Locate the company, price and date columns of a table header.
+
+    Header text is compared lower-cased and stripped. The company and date
+    columns must be titled exactly 'company name' and 'date'; the price
+    column is the first header containing 'price'.
+
+    Args:
+        thead: BeautifulSoup thead element
+
+    Returns:
+        Dictionary with keys 'company', 'price' and 'date' mapped to column
+        indices, or None if any of the three columns is missing
+    """
+    header_texts = [cell.get_text(strip=True).lower() for cell in thead.find_all('th')]
+
+    try:
+        company_idx = header_texts.index('company name')
+        price_idx = next(
+            pos for pos, text in enumerate(header_texts) if 'price' in text
+        )
+        date_idx = header_texts.index('date')
+    except (ValueError, StopIteration):
+        # A missing column means this is not a price table.
+        return None
+
+    return {'company': company_idx, 'price': price_idx, 'date': date_idx}
+
+
+def _parse_row(cells: list, column_indices: dict, state_name: str, zone: int) -> dict | None:
+    """
+    Turn one table row into a price record.
+
+    Args:
+        cells: List of td elements
+        column_indices: Dictionary mapping column names to indices
+        state_name: State name string
+        zone: Zone number
+
+    Returns:
+        Dictionary with state, zone, name, price and date, or None when the
+        row has too few cells. "price" is None when the price text contains
+        nothing that parses as a number.
+    """
+    if len(cells) <= max(column_indices.values()):
+        return None
+
+    # Company name: use the link text when the cell contains a link
+    name_cell = cells[column_indices['company']]
+    anchor = name_cell.find('a')
+    if anchor:
+        company_name = anchor.get_text(strip=True)
+    else:
+        company_name = name_cell.get_text(strip=True)
+
+    # Price: keep only digits and dots, then convert
+    price_str = cells[column_indices['price']].get_text(strip=True)
+    price_float = None
+    try:
+        numeric_text = ''.join(ch for ch in price_str if ch.isdigit() or ch == '.')
+        if numeric_text:
+            price_float = float(numeric_text)
+    except ValueError:
+        logging.warning(f"Could not parse price: '{price_str}' for {company_name} in {state_name}/zone{zone}.")
+    except Exception as e:
+        logging.error(f"Unexpected error parsing price: '{price_str}' for {company_name}. Error: {e}")
+
+    record = {
+        "state": state_name.capitalize(),
+        "zone": zone,
+        "name": company_name,
+        "price": price_float,
+        "date": cells[column_indices['date']].get_text(strip=True),
+    }
+    return record
+
+
+def parse_price_table(soup: BeautifulSoup, state_name_key: str, zone_slug_str: str) -> list[dict]:
+    """
+    Collect price records from every price table on a page.
+
+    A table counts as a price table when it has a thead whose headers
+    contain the company, price and date columns. Rows are read from its
+    tbody; rows with too few cells are skipped with a warning.
+
+    Args:
+        soup: BeautifulSoup object of the page
+        state_name_key: State key like "connecticut", "maine"
+        zone_slug_str: Zone slug like "zone1", "zonema5"
+
+    Returns:
+        List of dictionaries containing price data (empty when the page has
+        no tables, no price table, or the zone number cannot be parsed)
+    """
+    records = []
+    tables = soup.find_all('table')
+    logging.info(f"Found {len(tables)} table(s) on page for {state_name_key} - {zone_slug_str}.")
+
+    if not tables:
+        logging.warning(f"No HTML tables found at all for {state_name_key} - {zone_slug_str}.")
+        return records
+
+    # Every record on the page shares the same zone number
+    zone_number = parse_zone_slug_to_int(zone_slug_str)
+    if zone_number is None:
+        logging.error(f"Cannot parse zone number for {state_name_key} - {zone_slug_str}. Skipping.")
+        return records
+
+    matched_any_table = False
+
+    for table_pos, table in enumerate(tables):
+        header = table.find('thead')
+        if not header:
+            logging.debug(f"Table {table_pos} has no thead.")
+            continue
+
+        columns = _find_price_table_columns(header)
+        if columns is None:
+            logging.debug(f"Table {table_pos} headers do not contain all key columns.")
+            continue
+
+        logging.debug(f"Table {table_pos} identified as price table. Indices: {columns}")
+        matched_any_table = True
+
+        body = table.find('tbody')
+        if not body:
+            logging.warning(f"Price table identified by headers has no tbody. Skipping. State: {state_name_key}, Zone: {zone_slug_str}")
+            continue
+
+        body_rows = body.find_all('tr')
+        if not body_rows:
+            logging.debug(f"No rows found in tbody for price table in {state_name_key}/{zone_slug_str}")
+            continue
+
+        for row_number, row in enumerate(body_rows, start=1):
+            cells = row.find_all('td')
+            parsed = _parse_row(cells, columns, state_name_key, zone_number)
+
+            if parsed:
+                records.append(parsed)
+                continue
+
+            # Rows without any td cells are passed over silently
+            if len(cells) > 0:
+                needed = max(columns.values()) + 1
+                logging.warning(
+                    f"Skipping row {row_number} with insufficient cells ({len(cells)}, need {needed}) "
+                    f"in {state_name_key}/{zone_slug_str}"
+                )
+
+    if not matched_any_table:
+        logging.warning(f"No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.")
+
+    return records
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+"""
+Main scraper orchestrator module.
+Coordinates fetching, parsing, and storing oil price data.
+"""
+import logging
+import sys
+import os
+
+# Add parent directory to path for imports
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from sqlalchemy.orm import Session
+from database import SessionLocal, init_db
+import models
+
+from .config import SITES_CONFIG, ZONE_COUNTY_MAP, setup_logging
+from .http_client import make_request
+from .parsers import parse_price_table, parse_zone_slug_to_int
+from .db_operations import upsert_oil_price
+
+
+def _build_county_lookup(db_session: Session) -> dict:
+    """
+    Load every row of the county table into a dict keyed by
+    (state, name) with the county id as value.
+    """
+    lookup = {
+        (county.state, county.name): county.id
+        for county in db_session.query(models.County).all()
+    }
+    logging.info(f"Built county lookup with {len(lookup)} entries")
+    return lookup
+
+
+def _resolve_county_id(state_key: str, zone_number: int, county_lookup: dict) -> int | None:
+    """
+    Translate a (state_key, zone_number) pair into a county id.
+
+    The zone is first mapped to (state_abbrev, county_name) through
+    ZONE_COUNTY_MAP, then that pair is looked up in county_lookup.
+    Returns None if the zone has no mapping or the county is not in the
+    lookup (the latter is logged as a warning).
+    """
+    zone_key = (state_key, zone_number)
+    mapping = ZONE_COUNTY_MAP.get(zone_key)
+    if not mapping:
+        logging.debug(f"No zone-to-county mapping for ({state_key}, {zone_number})")
+        return None
+
+    state_abbrev, county_name = mapping
+    resolved = county_lookup.get((state_abbrev, county_name))
+    if resolved is None:
+        logging.warning(f"County not found in DB: ({state_abbrev}, {county_name}) for zone ({state_key}, {zone_number})")
+    return resolved
+
+
+def _scrape_zone(
+    db_session: Session,
+    site_name: str,
+    url_template: str,
+    base_url: str,
+    oil_type: int,
+    state_key: str,
+    zone_slug: str,
+    county_lookup: dict
+) -> int:
+    """
+    Fetch one zone page, parse it and upsert every record found.
+
+    Each parsed record is tagged with the county_id resolved for the zone
+    (None when it cannot be resolved) before being upserted.
+
+    Returns:
+        Number of records parsed from the page (0 when the page could not
+        be fetched or held no data), regardless of how many of them were
+        actually inserted or updated
+    """
+    target_url = url_template.format(
+        base_url=base_url,
+        state_slug=state_key,
+        zone_slug=zone_slug,
+        oil_type=oil_type,
+    )
+
+    logging.info(f"Scraping: {target_url} (State: {state_key}, Zone Slug: {zone_slug})")
+
+    soup = make_request(target_url)
+    if not soup:
+        logging.warning(f"Failed to retrieve or parse {target_url}. Skipping.")
+        return 0
+
+    parsed_items = parse_price_table(soup, state_key, zone_slug)
+    if not parsed_items:
+        logging.info(f"No data extracted from {target_url}")
+        return 0
+
+    # One county per zone, so resolve it once for the whole page
+    zone_number = parse_zone_slug_to_int(zone_slug)
+    if zone_number is None:
+        county_id = None
+    else:
+        county_id = _resolve_county_id(state_key, zone_number, county_lookup)
+
+    changed_count = 0
+    for record in parsed_items:
+        record["county_id"] = county_id
+        if upsert_oil_price(db_session, record):
+            changed_count += 1
+
+    parsed_count = len(parsed_items)
+    logging.info(
+        f"Processed {parsed_count} records from {site_name} - {state_key}/{zone_slug} "
+        f"({changed_count} inserted/updated, county_id={county_id})"
+    )
+
+    return parsed_count
+
+
+def _scrape_site(db_session: Session, site_config: dict, county_lookup: dict) -> int:
+    """
+    Scrape every zone of every state configured for one site.
+
+    Returns:
+        Sum of the per-zone counts returned by _scrape_zone
+    """
+    site_name = site_config["site_name"]
+    base_url = site_config["base_url"]
+    url_template = site_config["url_template"]
+    oil_type = site_config["oil_type"]
+
+    logging.info(f"--- Processing site: {site_name} ---")
+
+    # Zones are visited in configuration order, state by state
+    return sum(
+        _scrape_zone(
+            db_session=db_session,
+            site_name=site_name,
+            url_template=url_template,
+            base_url=base_url,
+            oil_type=oil_type,
+            state_key=state_key,
+            zone_slug=zone_slug,
+            county_lookup=county_lookup
+        )
+        for state_key, zone_slugs in site_config["locations"].items()
+        for zone_slug in zone_slugs
+    )
+
+
+def main():
+    """
+    Run one complete scraping job.
+
+    Sets up logging, initializes the database, scrapes every site in
+    SITES_CONFIG and commits once at the end. Any exception during
+    scraping or storage rolls the whole transaction back; errors are
+    logged rather than raised.
+    """
+    setup_logging()
+    logging.info("Starting oil price scraper job.")
+
+    # Without a usable database there is nothing to do
+    try:
+        init_db()
+        logging.info("Database initialized/checked successfully.")
+    except Exception as e:
+        logging.error(f"Failed to initialize database: {e}", exc_info=True)
+        return
+
+    db_session: Session = SessionLocal()
+
+    try:
+        # County ids are looked up once and shared by all zones
+        county_lookup = _build_county_lookup(db_session)
+
+        total_records = 0
+        for site_config in SITES_CONFIG:
+            total_records += _scrape_site(db_session, site_config, county_lookup)
+
+        # Single commit for the whole run
+        if total_records > 0:
+            db_session.commit()
+            logging.info("Successfully committed records to the database.")
+        else:
+            logging.info("No new records were queued for database insertion in this run.")
+
+    except Exception as e:
+        logging.error(f"An error occurred during scraping or DB operation: {e}", exc_info=True)
+        db_session.rollback()
+        logging.info("Database transaction rolled back due to error.")
+    finally:
+        db_session.close()
+        logging.info("Database session closed.")
+
+    logging.info("Oil price scraper job finished.")
+
+if __name__ == "__main__":
+    main()