feat(CRIT-010): add zone-to-county mapping and county_id to oil_prices

Add ZONE_COUNTY_MAP for all 5 scraped states (42 zone-to-county entries).
Scraper now resolves county_id at startup and assigns it to each record.
Upsert logic deduplicates by (name, state, county_id) to prevent duplicates
when multiple zones map to the same county. Also adds County model for
DB lookups and fixes Rhode Island zone count (4, not 5).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-09 18:31:46 -05:00
parent 6daa706e5a
commit 8f45f4c209
9 changed files with 746 additions and 34 deletions

View File

@@ -31,7 +31,7 @@ SITES_CONFIG = [
"zone1", "zone2", "zone3", "zone4", "zone5", "zone6" "zone1", "zone2", "zone3", "zone4", "zone5", "zone6"
], ],
"rhodeisland": [ "rhodeisland": [
"zone1", "zone2", "zone3", "zone4", "zone5" "zone1", "zone2", "zone3", "zone4"
], ],
@@ -40,23 +40,64 @@ SITES_CONFIG = [
{ {
"site_name": "MaineOil", "site_name": "MaineOil",
"base_url": "https://www.maineoil.com", "base_url": "https://www.maineoil.com",
# URL template for MaineOil using numeric zones like zone1.asp, zone2.asp
# {zone_slug} will be "zone1", "zone2", etc.
# No {state_slug} is needed in this part of the path for maineoil.com
"url_template": "{base_url}/{zone_slug}.asp?type={oil_type}", "url_template": "{base_url}/{zone_slug}.asp?type={oil_type}",
"oil_type": 0, "oil_type": 0,
"locations": { "locations": {
# "maine" is our internal key for the state.
# The zone_slugs are "zone1", "zone2", etc.
# YOU NEED TO VERIFY THE ACTUAL ZONE SLUGS AND COUNT FOR MAINEOIL.COM
"maine": [ "maine": [
"zone1", "zone2", "zone3", "zone4", "zone5", "zone1", "zone2", "zone3", "zone4", "zone5",
"zone6", "zone7" # Example: Add/remove based on actual zones on maineoil.com "zone6", "zone7"
] ]
} }
} }
] ]
# --- ZONE-TO-COUNTY MAPPING ---
# Maps (state_key, zone_number) -> (state_abbrev, county_name)
ZONE_COUNTY_MAP = {
("connecticut", 1): ("CT", "New London"),
("connecticut", 2): ("CT", "Windham"),
("connecticut", 3): ("CT", "New Haven"),
("connecticut", 4): ("CT", "Middlesex"),
("connecticut", 5): ("CT", "New Haven"),
("connecticut", 6): ("CT", "Hartford"),
("connecticut", 7): ("CT", "Litchfield"),
("connecticut", 8): ("CT", "Fairfield"),
("connecticut", 9): ("CT", "Tolland"),
("connecticut", 10): ("CT", "Litchfield"),
("massachusetts", 1): ("MA", "Suffolk"),
("massachusetts", 2): ("MA", "Middlesex"),
("massachusetts", 3): ("MA", "Norfolk"),
("massachusetts", 4): ("MA", "Plymouth"),
("massachusetts", 5): ("MA", "Middlesex"),
("massachusetts", 6): ("MA", "Bristol"),
("massachusetts", 7): ("MA", "Barnstable"),
("massachusetts", 8): ("MA", "Essex"),
("massachusetts", 9): ("MA", "Essex"),
("massachusetts", 10): ("MA", "Worcester"),
("massachusetts", 11): ("MA", "Worcester"),
("massachusetts", 12): ("MA", "Hampshire"),
("massachusetts", 13): ("MA", "Hampden"),
("massachusetts", 14): ("MA", "Franklin"),
("massachusetts", 15): ("MA", "Berkshire"),
("newhampshire", 1): ("NH", "Coos"),
("newhampshire", 2): ("NH", "Strafford"),
("newhampshire", 3): ("NH", "Merrimack"),
("newhampshire", 4): ("NH", "Grafton"),
("newhampshire", 5): ("NH", "Cheshire"),
("newhampshire", 6): ("NH", "Hillsborough"),
("rhodeisland", 1): ("RI", "Newport"),
("rhodeisland", 2): ("RI", "Providence"),
("rhodeisland", 3): ("RI", "Washington"),
("rhodeisland", 4): ("RI", "Kent"),
("maine", 1): ("ME", "Cumberland"),
("maine", 2): ("ME", "Kennebec"),
("maine", 3): ("ME", "Androscoggin"),
("maine", 4): ("ME", "York"),
("maine", 5): ("ME", "Knox"),
("maine", 6): ("ME", "Penobscot"),
("maine", 7): ("ME", "Washington"),
}
LOG_FILE = "oil_scraper.log" LOG_FILE = "oil_scraper.log"
logging.basicConfig( logging.basicConfig(
filename=LOG_FILE, filename=LOG_FILE,
@@ -125,7 +166,7 @@ def parse_price_table(soup, state_name_key, zone_slug_str):
if not is_price_table: if not is_price_table:
continue continue
candidate_tables_found += 1 candidate_tables_found += 1
tbody = table.find('tbody') tbody = table.find('tbody')
if not tbody: if not tbody:
@@ -139,7 +180,7 @@ def parse_price_table(soup, state_name_key, zone_slug_str):
for row_index, row in enumerate(rows): for row_index, row in enumerate(rows):
cells = row.find_all('td') cells = row.find_all('td')
max_required_index = max(actual_column_indices.values()) if actual_column_indices else -1 max_required_index = max(actual_column_indices.values()) if actual_column_indices else -1
if max_required_index == -1: if max_required_index == -1:
logging.error(f"Logic error: is_price_table true but no column indices for {state_name_key}/{zone_slug_str}") logging.error(f"Logic error: is_price_table true but no column indices for {state_name_key}/{zone_slug_str}")
continue continue
@@ -172,11 +213,31 @@ def parse_price_table(soup, state_name_key, zone_slug_str):
}) })
elif len(cells) > 0: elif len(cells) > 0:
logging.warning(f"Skipping row {row_index+1} with insufficient cells ({len(cells)}, need {max_required_index+1}) in {state_name_key}/{zone_slug_str}") logging.warning(f"Skipping row {row_index+1} with insufficient cells ({len(cells)}, need {max_required_index+1}) in {state_name_key}/{zone_slug_str}")
if candidate_tables_found == 0: if candidate_tables_found == 0:
logging.warning(f"No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.") logging.warning(f"No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.")
return data_dicts return data_dicts
# --- Helper: Build county lookup ---
def build_county_lookup(db_session):
"""Build (state_abbrev, county_name) -> county_id lookup from DB."""
counties = db_session.query(models.County).all()
lookup = {}
for c in counties:
lookup[(c.state, c.name)] = c.id
logging.info(f"Built county lookup with {len(lookup)} entries")
return lookup
def resolve_county_id(state_key, zone_number, county_lookup):
"""Resolve county_id from ZONE_COUNTY_MAP and county lookup."""
mapping = ZONE_COUNTY_MAP.get((state_key, zone_number))
if not mapping:
return None
state_abbrev, county_name = mapping
return county_lookup.get((state_abbrev, county_name))
# --- Main Script --- # --- Main Script ---
def main(): def main():
logging.info("Starting oil price scraper job.") logging.info("Starting oil price scraper job.")
@@ -191,22 +252,24 @@ def main():
total_records_added_this_run = 0 total_records_added_this_run = 0
try: try:
# Build county lookup at startup
county_lookup = build_county_lookup(db_session)
for site_config in SITES_CONFIG: for site_config in SITES_CONFIG:
site_name = site_config["site_name"] site_name = site_config["site_name"]
base_url = site_config["base_url"] base_url = site_config["base_url"]
url_template = site_config["url_template"] url_template = site_config["url_template"]
oil_type = site_config["oil_type"] oil_type = site_config["oil_type"]
logging.info(f"--- Processing site: {site_name} ---") logging.info(f"--- Processing site: {site_name} ---")
for state_key_in_config, zone_slugs_list in site_config["locations"].items(): for state_key_in_config, zone_slugs_list in site_config["locations"].items():
# state_key_in_config is "connecticut", "maine", etc.
for zone_slug_from_list in zone_slugs_list:
for zone_slug_from_list in zone_slugs_list: # e.g., "zone1", "zonema5"
format_params = { format_params = {
"base_url": base_url, "base_url": base_url,
"state_slug": state_key_in_config, # Used if {state_slug} in template "state_slug": state_key_in_config,
"zone_slug": zone_slug_from_list, # This is "zone1", "zonema5", etc. "zone_slug": zone_slug_from_list,
"oil_type": oil_type "oil_type": oil_type
} }
target_url = url_template.format(**format_params) target_url = url_template.format(**format_params)
@@ -215,44 +278,61 @@ def main():
soup = make_request(target_url) soup = make_request(target_url)
if soup: if soup:
# Pass state_key_in_config as state_name_key
# Pass zone_slug_from_list (e.g. "zone1") as zone_slug_str for parsing to int
parsed_items = parse_price_table(soup, state_key_in_config, zone_slug_from_list) parsed_items = parse_price_table(soup, state_key_in_config, zone_slug_from_list)
if parsed_items: if parsed_items:
for item_dict in parsed_items: # item_dict["zone"] will be an integer # Resolve county_id for this zone
# Check if a record with the same name, state, and zone already exists zone_int = parse_zone_slug_to_int(zone_slug_from_list)
existing_record = db_session.query(models.OilPrice).filter( county_id = None
models.OilPrice.name == item_dict["name"], if zone_int is not None:
models.OilPrice.state == item_dict["state"], county_id = resolve_county_id(state_key_in_config, zone_int, county_lookup)
models.OilPrice.zone == item_dict["zone"]
).first() for item_dict in parsed_items:
# Match by county_id when available to avoid duplicates
# when multiple zones map to the same county
if county_id is not None:
existing_record = db_session.query(models.OilPrice).filter(
models.OilPrice.name == item_dict["name"],
models.OilPrice.state == item_dict["state"],
models.OilPrice.county_id == county_id
).first()
else:
existing_record = db_session.query(models.OilPrice).filter(
models.OilPrice.name == item_dict["name"],
models.OilPrice.state == item_dict["state"],
models.OilPrice.zone == item_dict["zone"]
).first()
if existing_record: if existing_record:
# If record exists, check if company_id is not null
if existing_record.company_id is not None: if existing_record.company_id is not None:
logging.debug(f"Skipping update for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} due to non-null company_id") logging.debug(f"Skipping update for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} due to non-null company_id")
else: else:
# If company_id is null, check if price is different updated = False
if county_id is not None and existing_record.county_id != county_id:
existing_record.county_id = county_id
updated = True
if existing_record.price != item_dict["price"]: if existing_record.price != item_dict["price"]:
existing_record.price = item_dict["price"] existing_record.price = item_dict["price"]
existing_record.date = item_dict["date"] existing_record.date = item_dict["date"]
existing_record.scrapetimestamp = datetime.utcnow() existing_record.scrapetimestamp = datetime.utcnow()
logging.info(f"Updated price for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} to {item_dict['price']}") logging.info(f"Updated price for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} to {item_dict['price']}")
elif updated:
existing_record.scrapetimestamp = datetime.utcnow()
logging.info(f"Updated county_id for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} to {county_id}")
else: else:
logging.debug(f"Price unchanged for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}") logging.debug(f"Price unchanged for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}")
else: else:
# If no record exists, create a new one
oil_price_record = models.OilPrice( oil_price_record = models.OilPrice(
state=item_dict["state"], state=item_dict["state"],
zone=item_dict["zone"], zone=item_dict["zone"],
name=item_dict["name"], name=item_dict["name"],
price=item_dict["price"], price=item_dict["price"],
date=item_dict["date"], date=item_dict["date"],
county_id=county_id,
scrapetimestamp=datetime.utcnow() scrapetimestamp=datetime.utcnow()
) )
db_session.add(oil_price_record) db_session.add(oil_price_record)
logging.info(f"Added new record for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}") logging.info(f"Added new record for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} (county_id={county_id})")
total_records_added_this_run += len(parsed_items) total_records_added_this_run += len(parsed_items)
logging.info(f"Queued {len(parsed_items)} records from {site_name} - {state_key_in_config}/{zone_slug_from_list} for DB insertion.") logging.info(f"Queued {len(parsed_items)} records from {site_name} - {state_key_in_config}/{zone_slug_from_list} for DB insertion.")
else: else:

4
fuel_scraper/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
# fuel_scraper package
from .scraper import main
__all__ = ["main"]

114
fuel_scraper/config.py Normal file
View File

@@ -0,0 +1,114 @@
"""
Configuration module for the fuel scraper.
Contains site definitions, zone-to-county mapping, and logging setup.
"""
import logging
# --- SITES CONFIGURATION ---
# One entry per source site. Each url_template is filled with:
#   {base_url}   - the site's base URL
#   {state_slug} - the state key (only used by sites whose URLs include it)
#   {zone_slug}  - one of the zone slugs listed under "locations"
#   {oil_type}   - the "type" query parameter value below
# The "locations" keys ("connecticut", "maine", ...) are this project's
# internal state keys and must match the keys used in ZONE_COUNTY_MAP.
SITES_CONFIG = [
    {
        "site_name": "NewEnglandOil",
        "base_url": "https://www.newenglandoil.com",
        "url_template": "{base_url}/{state_slug}/{zone_slug}.asp?type={oil_type}",
        "oil_type": 0,
        "locations": {
            "connecticut": [
                "zone1", "zone2", "zone3", "zone4", "zone5", "zone6", "zone7",
                "zone8", "zone9", "zone10"
            ],
            "massachusetts": [
                "zone1", "zone2", "zone3", "zone4", "zone5", "zone6",
                "zone7", "zone8", "zone9", "zone10", "zone11", "zone12",
                "zone13", "zone14", "zone15"
            ],
            "newhampshire": [
                "zone1", "zone2", "zone3", "zone4", "zone5", "zone6"
            ],
            "rhodeisland": [
                "zone1", "zone2", "zone3", "zone4"
            ],
        }
    },
    {
        "site_name": "MaineOil",
        "base_url": "https://www.maineoil.com",
        # MaineOil URLs have no state segment; the zone slug alone selects the page.
        "url_template": "{base_url}/{zone_slug}.asp?type={oil_type}",
        "oil_type": 0,
        "locations": {
            "maine": [
                "zone1", "zone2", "zone3", "zone4", "zone5",
                "zone6", "zone7"
            ]
        }
    }
]
# --- ZONE-TO-COUNTY MAPPING ---
# Maps (state_key, zone_number) -> (state_abbrev, county_name).
# state_key matches the keys in SITES_CONFIG locations (lowercase, no spaces).
# county_name must match county.name in the database exactly.
# NOTE: the mapping is many-to-one — several zones can share a county — so
# upserts deduplicate on (name, state, county_id) rather than zone.
ZONE_COUNTY_MAP = {
    # Connecticut: 10 zones -> 8 distinct counties
    ("connecticut", 1): ("CT", "New London"),    # Southeast CT
    ("connecticut", 2): ("CT", "Windham"),       # Northeast CT
    ("connecticut", 3): ("CT", "New Haven"),     # New Haven, Bridgeport
    ("connecticut", 4): ("CT", "Middlesex"),     # Southeast Central CT
    ("connecticut", 5): ("CT", "New Haven"),     # Southwest Central CT
    ("connecticut", 6): ("CT", "Hartford"),      # Greater Hartford
    ("connecticut", 7): ("CT", "Litchfield"),    # West CT
    ("connecticut", 8): ("CT", "Fairfield"),     # Southwest CT
    ("connecticut", 9): ("CT", "Tolland"),       # Northeast Central CT
    ("connecticut", 10): ("CT", "Litchfield"),   # Northwest CT
    # Massachusetts: 15 zones -> 12 distinct counties
    ("massachusetts", 1): ("MA", "Suffolk"),     # South Boston
    ("massachusetts", 2): ("MA", "Middlesex"),   # North Boston
    ("massachusetts", 3): ("MA", "Norfolk"),     # Southwest of Boston
    ("massachusetts", 4): ("MA", "Plymouth"),    # South of Boston
    ("massachusetts", 5): ("MA", "Middlesex"),   # West of Boston
    ("massachusetts", 6): ("MA", "Bristol"),     # Southern Massachusetts
    ("massachusetts", 7): ("MA", "Barnstable"),  # Cape Cod & Islands
    ("massachusetts", 8): ("MA", "Essex"),       # Northwest of Boston
    ("massachusetts", 9): ("MA", "Essex"),       # North of Boston
    ("massachusetts", 10): ("MA", "Worcester"),  # Central Massachusetts
    ("massachusetts", 11): ("MA", "Worcester"),  # East Central Massachusetts
    ("massachusetts", 12): ("MA", "Hampshire"),  # West Central Massachusetts
    ("massachusetts", 13): ("MA", "Hampden"),    # Springfield Area
    ("massachusetts", 14): ("MA", "Franklin"),   # Northwestern Massachusetts
    ("massachusetts", 15): ("MA", "Berkshire"),  # Western Massachusetts
    # New Hampshire: 6 zones -> 6 distinct counties (not all NH counties are covered)
    ("newhampshire", 1): ("NH", "Coos"),          # Northern NH
    ("newhampshire", 2): ("NH", "Strafford"),     # Eastern NH
    ("newhampshire", 3): ("NH", "Merrimack"),     # Central NH
    ("newhampshire", 4): ("NH", "Grafton"),       # West Central NH
    ("newhampshire", 5): ("NH", "Cheshire"),      # Southwest NH
    ("newhampshire", 6): ("NH", "Hillsborough"),  # South Central NH
    # Rhode Island: 4 zones -> 4 distinct counties (no zone maps to the remaining county)
    ("rhodeisland", 1): ("RI", "Newport"),       # Southeast RI
    ("rhodeisland", 2): ("RI", "Providence"),    # Northern RI
    ("rhodeisland", 3): ("RI", "Washington"),    # Southwest RI
    ("rhodeisland", 4): ("RI", "Kent"),          # Central RI
    # Maine: 7 zones -> 7 distinct counties, via MaineOil.com (not all ME counties covered)
    ("maine", 1): ("ME", "Cumberland"),      # Greater Portland
    ("maine", 2): ("ME", "Kennebec"),        # Augusta/Waterville
    ("maine", 3): ("ME", "Androscoggin"),    # Auburn/Lewiston/Western
    ("maine", 4): ("ME", "York"),            # Southern Maine
    ("maine", 5): ("ME", "Knox"),            # Mid-Coast
    ("maine", 6): ("ME", "Penobscot"),       # Bangor West
    ("maine", 7): ("ME", "Washington"),      # Downeast
}
# --- LOGGING CONFIGURATION ---
LOG_FILE = "oil_scraper.log"


def setup_logging():
    """Configure root logging: INFO-and-above records written to LOG_FILE."""
    record_format = '%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format=record_format,
    )

View File

@@ -0,0 +1,105 @@
"""
Database operations module for oil price CRUD operations.
"""
import logging
from datetime import datetime
from sqlalchemy.orm import Session
import sys
import os
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import models
def upsert_oil_price(db_session: Session, item_dict: dict) -> bool:
    """
    Insert or update a single oil price record.

    Matching strategy: when a county_id is present the record is matched on
    (name, state, county_id) — this deduplicates entries when several zones
    map to the same county. Without a county_id it falls back to matching on
    (name, state, zone). Rows with a non-null company_id are vendor-managed
    and never modified by the scraper.

    Args:
        db_session: SQLAlchemy session
        item_dict: Dictionary with state, zone, name, price, date, county_id

    Returns:
        True if a record was inserted or updated, False otherwise
    """
    county_id = item_dict.get("county_id")
    name = item_dict["name"]
    state = item_dict["state"]
    zone = item_dict["zone"]

    # Common filter on (name, state); third key depends on county availability.
    base_query = db_session.query(models.OilPrice).filter(
        models.OilPrice.name == name,
        models.OilPrice.state == state,
    )
    if county_id is not None:
        existing_record = base_query.filter(
            models.OilPrice.county_id == county_id
        ).first()
    else:
        existing_record = base_query.filter(
            models.OilPrice.zone == zone
        ).first()

    if existing_record is None:
        # No match — insert a fresh row.
        db_session.add(models.OilPrice(
            state=state,
            zone=zone,
            name=name,
            price=item_dict["price"],
            date=item_dict["date"],
            county_id=county_id,
            scrapetimestamp=datetime.utcnow(),
        ))
        logging.info(
            f"Added new record for {name} in {state} zone {zone} "
            f"(county_id={county_id})"
        )
        return True

    # Vendor-managed rows are off-limits.
    if existing_record.company_id is not None:
        logging.debug(
            f"Skipping update for {name} in {state} zone {zone} "
            "due to non-null company_id"
        )
        return False

    # Backfill/correct county_id whenever we have a value and it differs.
    county_changed = county_id is not None and existing_record.county_id != county_id
    if county_changed:
        existing_record.county_id = county_id

    if existing_record.price != item_dict["price"]:
        existing_record.price = item_dict["price"]
        existing_record.date = item_dict["date"]
        existing_record.scrapetimestamp = datetime.utcnow()
        logging.info(
            f"Updated price for {name} in {state} zone {zone} "
            f"to {item_dict['price']}"
        )
        return True

    if county_changed:
        existing_record.scrapetimestamp = datetime.utcnow()
        logging.info(
            f"Updated county_id for {name} in {state} zone {zone} "
            f"to {county_id}"
        )
        return True

    logging.debug(f"Price unchanged for {name} in {state} zone {zone}")
    return False

View File

@@ -0,0 +1,32 @@
"""
HTTP client module for making web requests.
"""
import logging
import requests
from bs4 import BeautifulSoup
# Default headers to mimic a browser; some sites reject the default
# python-requests User-Agent.
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Per-request timeout in seconds passed to requests.get().
REQUEST_TIMEOUT = 20
def make_request(url: str) -> BeautifulSoup | None:
    """
    Fetch a URL and parse its body as HTML.

    Args:
        url: The URL to fetch

    Returns:
        BeautifulSoup object on success; None on any request failure
        (connection error, timeout, or non-2xx status).
    """
    try:
        response = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching {url}: {e}")
        return None
    return BeautifulSoup(response.content, 'html.parser')

177
fuel_scraper/parsers.py Normal file
View File

@@ -0,0 +1,177 @@
"""
HTML parsing module for extracting oil price data from web pages.
"""
import logging
import re
from bs4 import BeautifulSoup
def parse_zone_slug_to_int(zone_slug_str: str) -> int | None:
"""
Extract the numeric part of a zone slug.
Examples:
"zone1" -> 1
"zonema5" -> 5
Args:
zone_slug_str: Zone slug string like "zone1", "zonema5"
Returns:
Integer zone number or None if parsing fails
"""
if not zone_slug_str:
return None
match = re.search(r'\d+$', zone_slug_str)
if match:
return int(match.group(0))
logging.warning(f"Could not parse numeric zone from slug: '{zone_slug_str}'")
return None
def _find_price_table_columns(thead) -> dict | None:
"""
Find column indices for company, price, and date in a table header.
Args:
thead: BeautifulSoup thead element
Returns:
Dictionary with column indices or None if not a price table
"""
headers_lower = [th.get_text(strip=True).lower() for th in thead.find_all('th')]
column_indices = {}
try:
column_indices['company'] = headers_lower.index('company name')
price_col_name_part = 'price'
column_indices['price'] = next(
i for i, header in enumerate(headers_lower) if price_col_name_part in header
)
column_indices['date'] = headers_lower.index('date')
return column_indices
except (ValueError, StopIteration):
return None
def _parse_row(cells: list, column_indices: dict, state_name: str, zone: int) -> dict | None:
"""
Parse a single table row into a price record.
Args:
cells: List of td elements
column_indices: Dictionary mapping column names to indices
state_name: State name string
zone: Zone number
Returns:
Dictionary with parsed data or None if parsing fails
"""
max_required_index = max(column_indices.values())
if len(cells) <= max_required_index:
return None
# Extract company name (prefer link text if available)
company_cell = cells[column_indices['company']]
company_name = company_cell.get_text(strip=True)
company_link = company_cell.find('a')
if company_link:
company_name = company_link.get_text(strip=True)
# Extract and parse price
price_str = cells[column_indices['price']].get_text(strip=True)
price_float = None
try:
cleaned_price_str = ''.join(filter(lambda x: x.isdigit() or x == '.', price_str))
if cleaned_price_str:
price_float = float(cleaned_price_str)
except ValueError:
logging.warning(f"Could not parse price: '{price_str}' for {company_name} in {state_name}/zone{zone}.")
except Exception as e:
logging.error(f"Unexpected error parsing price: '{price_str}' for {company_name}. Error: {e}")
# Extract date
date_posted_str = cells[column_indices['date']].get_text(strip=True)
return {
"state": state_name.capitalize(),
"zone": zone,
"name": company_name,
"price": price_float,
"date": date_posted_str,
}
def parse_price_table(soup: BeautifulSoup, state_name_key: str, zone_slug_str: str) -> list[dict]:
    """
    Extract oil price records from every qualifying table on a page.

    Args:
        soup: BeautifulSoup object of the page
        state_name_key: State key like "connecticut", "maine"
        zone_slug_str: Zone slug like "zone1", "zonema5"

    Returns:
        List of record dictionaries (possibly empty).
    """
    records: list[dict] = []
    tables = soup.find_all('table')
    logging.info(f"Found {len(tables)} table(s) on page for {state_name_key} - {zone_slug_str}.")
    if not tables:
        logging.warning(f"No HTML tables found at all for {state_name_key} - {zone_slug_str}.")
        return records

    # The numeric zone is stored on every record; bail out if the slug is odd.
    zone_number = parse_zone_slug_to_int(zone_slug_str)
    if zone_number is None:
        logging.error(f"Cannot parse zone number for {state_name_key} - {zone_slug_str}. Skipping.")
        return records

    matched_tables = 0
    for idx, table in enumerate(tables):
        thead = table.find('thead')
        if not thead:
            logging.debug(f"Table {idx} has no thead.")
            continue

        header_map = _find_price_table_columns(thead)
        if header_map is None:
            logging.debug(f"Table {idx} headers do not contain all key columns.")
            continue
        logging.debug(f"Table {idx} identified as price table. Indices: {header_map}")
        matched_tables += 1

        tbody = table.find('tbody')
        if not tbody:
            logging.warning(f"Price table identified by headers has no tbody. Skipping. State: {state_name_key}, Zone: {zone_slug_str}")
            continue
        body_rows = tbody.find_all('tr')
        if not body_rows:
            logging.debug(f"No rows found in tbody for price table in {state_name_key}/{zone_slug_str}")
            continue

        for row_number, row in enumerate(body_rows):
            cells = row.find_all('td')
            parsed = _parse_row(cells, header_map, state_name_key, zone_number)
            if parsed is not None:
                records.append(parsed)
            elif cells:
                needed = max(header_map.values()) + 1
                logging.warning(
                    f"Skipping row {row_number+1} with insufficient cells ({len(cells)}, need {needed}) "
                    f"in {state_name_key}/{zone_slug_str}"
                )

    if matched_tables == 0:
        logging.warning(f"No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.")
    return records

191
fuel_scraper/scraper.py Normal file
View File

@@ -0,0 +1,191 @@
#!/usr/bin/env python3
"""
Main scraper orchestrator module.
Coordinates fetching, parsing, and storing oil price data.
"""
import logging
import sys
import os
# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from sqlalchemy.orm import Session
from database import SessionLocal, init_db
import models
from .config import SITES_CONFIG, ZONE_COUNTY_MAP, setup_logging
from .http_client import make_request
from .parsers import parse_price_table, parse_zone_slug_to_int
from .db_operations import upsert_oil_price
def _build_county_lookup(db_session: Session) -> dict:
    """
    Build a (state_abbrev, county_name) -> county_id lookup from the
    county table, queried once at startup.
    """
    lookup = {
        (county.state, county.name): county.id
        for county in db_session.query(models.County).all()
    }
    logging.info(f"Built county lookup with {len(lookup)} entries")
    return lookup
def _resolve_county_id(state_key: str, zone_number: int, county_lookup: dict) -> int | None:
    """
    Resolve a county_id for a (state_key, zone_number) pair.

    Looks up the county name in ZONE_COUNTY_MAP, then the id in the DB-backed
    lookup. Returns None when either step has no match (both cases logged).
    """
    entry = ZONE_COUNTY_MAP.get((state_key, zone_number))
    if not entry:
        logging.debug(f"No zone-to-county mapping for ({state_key}, {zone_number})")
        return None
    abbrev, county = entry
    resolved_id = county_lookup.get((abbrev, county))
    if resolved_id is None:
        logging.warning(f"County not found in DB: ({abbrev}, {county}) for zone ({state_key}, {zone_number})")
    return resolved_id
def _scrape_zone(
    db_session: Session,
    site_name: str,
    url_template: str,
    base_url: str,
    oil_type: int,
    state_key: str,
    zone_slug: str,
    county_lookup: dict
) -> int:
    """
    Fetch, parse, and upsert one zone's price page.

    Returns:
        Number of records parsed from the page (regardless of how many
        actually changed the database).
    """
    target_url = url_template.format(
        base_url=base_url,
        state_slug=state_key,
        zone_slug=zone_slug,
        oil_type=oil_type,
    )
    logging.info(f"Scraping: {target_url} (State: {state_key}, Zone Slug: {zone_slug})")

    soup = make_request(target_url)
    if not soup:
        logging.warning(f"Failed to retrieve or parse {target_url}. Skipping.")
        return 0

    parsed_items = parse_price_table(soup, state_key, zone_slug)
    if not parsed_items:
        logging.info(f"No data extracted from {target_url}")
        return 0

    # One county per zone: resolve once and stamp every record from this page.
    zone_number = parse_zone_slug_to_int(zone_slug)
    county_id = None
    if zone_number is not None:
        county_id = _resolve_county_id(state_key, zone_number, county_lookup)

    changed = 0
    for record in parsed_items:
        record["county_id"] = county_id
        if upsert_oil_price(db_session, record):
            changed += 1

    logging.info(
        f"Processed {len(parsed_items)} records from {site_name} - {state_key}/{zone_slug} "
        f"({changed} inserted/updated, county_id={county_id})"
    )
    return len(parsed_items)
def _scrape_site(db_session: Session, site_config: dict, county_lookup: dict) -> int:
    """
    Scrape every configured zone for one site config entry.

    Returns:
        Total number of records processed across all of the site's zones.
    """
    site_name = site_config["site_name"]
    base_url = site_config["base_url"]
    url_template = site_config["url_template"]
    oil_type = site_config["oil_type"]
    logging.info(f"--- Processing site: {site_name} ---")

    site_total = 0
    for state_key, slugs in site_config["locations"].items():
        for slug in slugs:
            site_total += _scrape_zone(
                db_session=db_session,
                site_name=site_name,
                url_template=url_template,
                base_url=base_url,
                oil_type=oil_type,
                state_key=state_key,
                zone_slug=slug,
                county_lookup=county_lookup,
            )
    return site_total
def main():
    """
    Entry point for the oil price scraper.

    Initializes the database, scrapes every configured site/zone, and commits
    all queued changes in a single transaction (rolled back on any error).
    """
    setup_logging()
    logging.info("Starting oil price scraper job.")

    try:
        init_db()
        logging.info("Database initialized/checked successfully.")
    except Exception as e:
        logging.error(f"Failed to initialize database: {e}", exc_info=True)
        return

    session: Session = SessionLocal()
    grand_total = 0
    try:
        # County lookup is built once; zones reuse it for the whole run.
        lookup = _build_county_lookup(session)
        for site_config in SITES_CONFIG:
            grand_total += _scrape_site(session, site_config, lookup)

        if grand_total > 0:
            session.commit()
            logging.info("Successfully committed records to the database.")
        else:
            logging.info("No new records were queued for database insertion in this run.")
    except Exception as e:
        logging.error(f"An error occurred during scraping or DB operation: {e}", exc_info=True)
        session.rollback()
        logging.info("Database transaction rolled back due to error.")
    finally:
        session.close()
        logging.info("Database session closed.")

    logging.info("Oil price scraper job finished.")

View File

@@ -24,11 +24,20 @@ class OilPrice(Base):
# when a new record is created and this field is not explicitly set. # when a new record is created and this field is not explicitly set.
company_id = Column(Integer, ForeignKey("company.id"), nullable=True) company_id = Column(Integer, ForeignKey("company.id"), nullable=True)
county_id = Column(Integer, nullable=True)
def __repr__(self): def __repr__(self):
return (f"<OilPrice(id={self.id}, state='{self.state}', zone='{self.zone}', " return (f"<OilPrice(id={self.id}, state='{self.state}', zone='{self.zone}', "
f"name='{self.name}', price={self.price}, date='{self.date}', " f"name='{self.name}', price={self.price}, date='{self.date}', "
f"scraped_at='{self.scrapetimestamp}')>") # Added scraped_at to repr f"county_id={self.county_id}, scraped_at='{self.scrapetimestamp}')>")
# --- County Model (read-only, for lookups) ---
class County(Base):
__tablename__ = "county"
id = Column(Integer, primary_key=True)
name = Column(String(255))
state = Column(String(2))
# --- Company Model (remains the same) --- # --- Company Model (remains the same) ---
class Company(Base): class Company(Base):

2
run.py
View File

@@ -6,7 +6,7 @@ import logging
# The 'import models' is crucial for init_db to know about the tables # The 'import models' is crucial for init_db to know about the tables
import models import models
from database import init_db, SessionLocal from database import init_db, SessionLocal
from fuel_scraper import main as run_scraper_main # Assuming your scraper's main is 'main' from fuel_scraper import main as run_scraper_main # Import from modular package
# Configure basic logging for the run.py script itself if needed # Configure basic logging for the run.py script itself if needed
# Your other modules (fuel_scraper, database) will have their own logging # Your other modules (fuel_scraper, database) will have their own logging