diff --git a/fuel_scraper.py b/fuel_scraper.py index d4dcd50..1193362 100644 --- a/fuel_scraper.py +++ b/fuel_scraper.py @@ -31,7 +31,7 @@ SITES_CONFIG = [ "zone1", "zone2", "zone3", "zone4", "zone5", "zone6" ], "rhodeisland": [ - "zone1", "zone2", "zone3", "zone4", "zone5" + "zone1", "zone2", "zone3", "zone4" ], @@ -40,23 +40,64 @@ SITES_CONFIG = [ { "site_name": "MaineOil", "base_url": "https://www.maineoil.com", - # URL template for MaineOil using numeric zones like zone1.asp, zone2.asp - # {zone_slug} will be "zone1", "zone2", etc. - # No {state_slug} is needed in this part of the path for maineoil.com "url_template": "{base_url}/{zone_slug}.asp?type={oil_type}", "oil_type": 0, "locations": { - # "maine" is our internal key for the state. - # The zone_slugs are "zone1", "zone2", etc. - # YOU NEED TO VERIFY THE ACTUAL ZONE SLUGS AND COUNT FOR MAINEOIL.COM "maine": [ "zone1", "zone2", "zone3", "zone4", "zone5", - "zone6", "zone7" # Example: Add/remove based on actual zones on maineoil.com + "zone6", "zone7" ] } } ] +# --- ZONE-TO-COUNTY MAPPING --- +# Maps (state_key, zone_number) -> (state_abbrev, county_name) +ZONE_COUNTY_MAP = { + ("connecticut", 1): ("CT", "New London"), + ("connecticut", 2): ("CT", "Windham"), + ("connecticut", 3): ("CT", "New Haven"), + ("connecticut", 4): ("CT", "Middlesex"), + ("connecticut", 5): ("CT", "New Haven"), + ("connecticut", 6): ("CT", "Hartford"), + ("connecticut", 7): ("CT", "Litchfield"), + ("connecticut", 8): ("CT", "Fairfield"), + ("connecticut", 9): ("CT", "Tolland"), + ("connecticut", 10): ("CT", "Litchfield"), + ("massachusetts", 1): ("MA", "Suffolk"), + ("massachusetts", 2): ("MA", "Middlesex"), + ("massachusetts", 3): ("MA", "Norfolk"), + ("massachusetts", 4): ("MA", "Plymouth"), + ("massachusetts", 5): ("MA", "Middlesex"), + ("massachusetts", 6): ("MA", "Bristol"), + ("massachusetts", 7): ("MA", "Barnstable"), + ("massachusetts", 8): ("MA", "Essex"), + ("massachusetts", 9): ("MA", "Essex"), + ("massachusetts", 10): 
("MA", "Worcester"), + ("massachusetts", 11): ("MA", "Worcester"), + ("massachusetts", 12): ("MA", "Hampshire"), + ("massachusetts", 13): ("MA", "Hampden"), + ("massachusetts", 14): ("MA", "Franklin"), + ("massachusetts", 15): ("MA", "Berkshire"), + ("newhampshire", 1): ("NH", "Coos"), + ("newhampshire", 2): ("NH", "Strafford"), + ("newhampshire", 3): ("NH", "Merrimack"), + ("newhampshire", 4): ("NH", "Grafton"), + ("newhampshire", 5): ("NH", "Cheshire"), + ("newhampshire", 6): ("NH", "Hillsborough"), + ("rhodeisland", 1): ("RI", "Newport"), + ("rhodeisland", 2): ("RI", "Providence"), + ("rhodeisland", 3): ("RI", "Washington"), + ("rhodeisland", 4): ("RI", "Kent"), + ("maine", 1): ("ME", "Cumberland"), + ("maine", 2): ("ME", "Kennebec"), + ("maine", 3): ("ME", "Androscoggin"), + ("maine", 4): ("ME", "York"), + ("maine", 5): ("ME", "Knox"), + ("maine", 6): ("ME", "Penobscot"), + ("maine", 7): ("ME", "Washington"), +} + LOG_FILE = "oil_scraper.log" logging.basicConfig( filename=LOG_FILE, @@ -125,7 +166,7 @@ def parse_price_table(soup, state_name_key, zone_slug_str): if not is_price_table: continue - + candidate_tables_found += 1 tbody = table.find('tbody') if not tbody: @@ -139,7 +180,7 @@ def parse_price_table(soup, state_name_key, zone_slug_str): for row_index, row in enumerate(rows): cells = row.find_all('td') max_required_index = max(actual_column_indices.values()) if actual_column_indices else -1 - + if max_required_index == -1: logging.error(f"Logic error: is_price_table true but no column indices for {state_name_key}/{zone_slug_str}") continue @@ -172,11 +213,31 @@ def parse_price_table(soup, state_name_key, zone_slug_str): }) elif len(cells) > 0: logging.warning(f"Skipping row {row_index+1} with insufficient cells ({len(cells)}, need {max_required_index+1}) in {state_name_key}/{zone_slug_str}") - + if candidate_tables_found == 0: logging.warning(f"No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.") return 
data_dicts +# --- Helper: Build county lookup --- +def build_county_lookup(db_session): + """Build (state_abbrev, county_name) -> county_id lookup from DB.""" + counties = db_session.query(models.County).all() + lookup = {} + for c in counties: + lookup[(c.state, c.name)] = c.id + logging.info(f"Built county lookup with {len(lookup)} entries") + return lookup + + +def resolve_county_id(state_key, zone_number, county_lookup): + """Resolve county_id from ZONE_COUNTY_MAP and county lookup.""" + mapping = ZONE_COUNTY_MAP.get((state_key, zone_number)) + if not mapping: + return None + state_abbrev, county_name = mapping + return county_lookup.get((state_abbrev, county_name)) + + # --- Main Script --- def main(): logging.info("Starting oil price scraper job.") @@ -191,22 +252,24 @@ def main(): total_records_added_this_run = 0 try: + # Build county lookup at startup + county_lookup = build_county_lookup(db_session) + for site_config in SITES_CONFIG: site_name = site_config["site_name"] base_url = site_config["base_url"] url_template = site_config["url_template"] oil_type = site_config["oil_type"] - + logging.info(f"--- Processing site: {site_name} ---") for state_key_in_config, zone_slugs_list in site_config["locations"].items(): - # state_key_in_config is "connecticut", "maine", etc. - - for zone_slug_from_list in zone_slugs_list: # e.g., "zone1", "zonema5" + + for zone_slug_from_list in zone_slugs_list: format_params = { "base_url": base_url, - "state_slug": state_key_in_config, # Used if {state_slug} in template - "zone_slug": zone_slug_from_list, # This is "zone1", "zonema5", etc. + "state_slug": state_key_in_config, + "zone_slug": zone_slug_from_list, "oil_type": oil_type } target_url = url_template.format(**format_params) @@ -215,44 +278,61 @@ def main(): soup = make_request(target_url) if soup: - # Pass state_key_in_config as state_name_key - # Pass zone_slug_from_list (e.g. 
"zone1") as zone_slug_str for parsing to int parsed_items = parse_price_table(soup, state_key_in_config, zone_slug_from_list) - + if parsed_items: - for item_dict in parsed_items: # item_dict["zone"] will be an integer - # Check if a record with the same name, state, and zone already exists - existing_record = db_session.query(models.OilPrice).filter( - models.OilPrice.name == item_dict["name"], - models.OilPrice.state == item_dict["state"], - models.OilPrice.zone == item_dict["zone"] - ).first() - + # Resolve county_id for this zone + zone_int = parse_zone_slug_to_int(zone_slug_from_list) + county_id = None + if zone_int is not None: + county_id = resolve_county_id(state_key_in_config, zone_int, county_lookup) + + for item_dict in parsed_items: + # Match by county_id when available to avoid duplicates + # when multiple zones map to the same county + if county_id is not None: + existing_record = db_session.query(models.OilPrice).filter( + models.OilPrice.name == item_dict["name"], + models.OilPrice.state == item_dict["state"], + models.OilPrice.county_id == county_id + ).first() + else: + existing_record = db_session.query(models.OilPrice).filter( + models.OilPrice.name == item_dict["name"], + models.OilPrice.state == item_dict["state"], + models.OilPrice.zone == item_dict["zone"] + ).first() + if existing_record: - # If record exists, check if company_id is not null if existing_record.company_id is not None: logging.debug(f"Skipping update for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} due to non-null company_id") else: - # If company_id is null, check if price is different + updated = False + if county_id is not None and existing_record.county_id != county_id: + existing_record.county_id = county_id + updated = True if existing_record.price != item_dict["price"]: existing_record.price = item_dict["price"] existing_record.date = item_dict["date"] existing_record.scrapetimestamp = datetime.utcnow() logging.info(f"Updated price for 
{item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} to {item_dict['price']}") + elif updated: + existing_record.scrapetimestamp = datetime.utcnow() + logging.info(f"Updated county_id for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} to {county_id}") else: logging.debug(f"Price unchanged for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}") else: - # If no record exists, create a new one oil_price_record = models.OilPrice( state=item_dict["state"], zone=item_dict["zone"], name=item_dict["name"], price=item_dict["price"], date=item_dict["date"], + county_id=county_id, scrapetimestamp=datetime.utcnow() ) db_session.add(oil_price_record) - logging.info(f"Added new record for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}") + logging.info(f"Added new record for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} (county_id={county_id})") total_records_added_this_run += len(parsed_items) logging.info(f"Queued {len(parsed_items)} records from {site_name} - {state_key_in_config}/{zone_slug_from_list} for DB insertion.") else: diff --git a/fuel_scraper/__init__.py b/fuel_scraper/__init__.py new file mode 100644 index 0000000..4bcddf1 --- /dev/null +++ b/fuel_scraper/__init__.py @@ -0,0 +1,4 @@ +# fuel_scraper package +from .scraper import main + +__all__ = ["main"] diff --git a/fuel_scraper/config.py b/fuel_scraper/config.py new file mode 100644 index 0000000..2087045 --- /dev/null +++ b/fuel_scraper/config.py @@ -0,0 +1,114 @@ +""" +Configuration module for the fuel scraper. +Contains site definitions, zone-to-county mapping, and logging setup. 
+""" +import logging + +# --- SITES CONFIGURATION --- +SITES_CONFIG = [ + { + "site_name": "NewEnglandOil", + "base_url": "https://www.newenglandoil.com", + "url_template": "{base_url}/{state_slug}/{zone_slug}.asp?type={oil_type}", + "oil_type": 0, + "locations": { + "connecticut": [ + "zone1", "zone2", "zone3", "zone4", "zone5", "zone6", "zone7", + "zone8", "zone9", "zone10" + ], + "massachusetts": [ + "zone1", "zone2", "zone3", "zone4", "zone5", "zone6", + "zone7", "zone8", "zone9", "zone10", "zone11", "zone12", + "zone13", "zone14", "zone15" + ], + "newhampshire": [ + "zone1", "zone2", "zone3", "zone4", "zone5", "zone6" + ], + "rhodeisland": [ + "zone1", "zone2", "zone3", "zone4" + ], + } + }, + { + "site_name": "MaineOil", + "base_url": "https://www.maineoil.com", + "url_template": "{base_url}/{zone_slug}.asp?type={oil_type}", + "oil_type": 0, + "locations": { + "maine": [ + "zone1", "zone2", "zone3", "zone4", "zone5", + "zone6", "zone7" + ] + } + } +] + +# --- ZONE-TO-COUNTY MAPPING --- +# Maps (state_key, zone_number) -> (state_abbrev, county_name) +# state_key matches the keys in SITES_CONFIG locations (lowercase, no spaces) +# county_name must match the county.name in the database exactly +ZONE_COUNTY_MAP = { + # Connecticut (10 zones -> 8 counties) + ("connecticut", 1): ("CT", "New London"), # Southeast CT + ("connecticut", 2): ("CT", "Windham"), # Northeast CT + ("connecticut", 3): ("CT", "New Haven"), # New Haven, Bridgeport + ("connecticut", 4): ("CT", "Middlesex"), # Southeast Central CT + ("connecticut", 5): ("CT", "New Haven"), # Southwest Central CT + ("connecticut", 6): ("CT", "Hartford"), # Greater Hartford + ("connecticut", 7): ("CT", "Litchfield"), # West CT + ("connecticut", 8): ("CT", "Fairfield"), # Southwest CT + ("connecticut", 9): ("CT", "Tolland"), # Northeast Central CT + ("connecticut", 10): ("CT", "Litchfield"), # Northwest CT + + # Massachusetts (15 zones -> 14 counties) + ("massachusetts", 1): ("MA", "Suffolk"), # South Boston + 
("massachusetts", 2): ("MA", "Middlesex"), # North Boston + ("massachusetts", 3): ("MA", "Norfolk"), # Southwest of Boston + ("massachusetts", 4): ("MA", "Plymouth"), # South of Boston + ("massachusetts", 5): ("MA", "Middlesex"), # West of Boston + ("massachusetts", 6): ("MA", "Bristol"), # Southern Massachusetts + ("massachusetts", 7): ("MA", "Barnstable"), # Cape Cod & Islands + ("massachusetts", 8): ("MA", "Essex"), # Northwest of Boston + ("massachusetts", 9): ("MA", "Essex"), # North of Boston + ("massachusetts", 10): ("MA", "Worcester"), # Central Massachusetts + ("massachusetts", 11): ("MA", "Worcester"), # East Central Massachusetts + ("massachusetts", 12): ("MA", "Hampshire"), # West Central Massachusetts + ("massachusetts", 13): ("MA", "Hampden"), # Springfield Area + ("massachusetts", 14): ("MA", "Franklin"), # Northwestern Massachusetts + ("massachusetts", 15): ("MA", "Berkshire"), # Western Massachusetts + + # New Hampshire (6 zones -> 10 counties) + ("newhampshire", 1): ("NH", "Coos"), # Northern NH + ("newhampshire", 2): ("NH", "Strafford"), # Eastern NH + ("newhampshire", 3): ("NH", "Merrimack"), # Central NH + ("newhampshire", 4): ("NH", "Grafton"), # West Central NH + ("newhampshire", 5): ("NH", "Cheshire"), # Southwest NH + ("newhampshire", 6): ("NH", "Hillsborough"), # South Central NH + + # Rhode Island (4 zones -> 5 counties) + ("rhodeisland", 1): ("RI", "Newport"), # Southeast RI + ("rhodeisland", 2): ("RI", "Providence"), # Northern RI + ("rhodeisland", 3): ("RI", "Washington"), # Southwest RI + ("rhodeisland", 4): ("RI", "Kent"), # Central RI + + # Maine (7 zones -> 16 counties, via MaineOil.com) + ("maine", 1): ("ME", "Cumberland"), # Greater Portland + ("maine", 2): ("ME", "Kennebec"), # Augusta/Waterville + ("maine", 3): ("ME", "Androscoggin"), # Auburn/Lewiston/Western + ("maine", 4): ("ME", "York"), # Southern Maine + ("maine", 5): ("ME", "Knox"), # Mid-Coast + ("maine", 6): ("ME", "Penobscot"), # Bangor West + ("maine", 7): ("ME", 
"Washington"), # Downeast +} + +# --- LOGGING CONFIGURATION --- +LOG_FILE = "oil_scraper.log" + + +def setup_logging(): + """Configure logging for the scraper.""" + logging.basicConfig( + filename=LOG_FILE, + level=logging.INFO, + format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s' + ) diff --git a/fuel_scraper/db_operations.py b/fuel_scraper/db_operations.py new file mode 100644 index 0000000..030dce0 --- /dev/null +++ b/fuel_scraper/db_operations.py @@ -0,0 +1,105 @@ +""" +Database operations module for oil price CRUD operations. +""" +import logging +from datetime import datetime +from sqlalchemy.orm import Session + +import sys +import os +# Add parent directory to path for imports +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import models + + +def upsert_oil_price(db_session: Session, item_dict: dict) -> bool: + """ + Insert or update an oil price record. + + Logic: + - Match by (name, state, county_id) when county_id is available to avoid + duplicates when multiple zones map to the same county. + - Fall back to (name, state, zone) when county_id is not available. 
+ - If record exists with non-null company_id: skip (vendor-managed price) + - If record exists with null company_id and different price: update + - If record exists with same price: skip (no change) + - If no record exists: insert new + + Args: + db_session: SQLAlchemy session + item_dict: Dictionary with state, zone, name, price, date, county_id + + Returns: + True if a record was inserted or updated, False otherwise + """ + county_id = item_dict.get("county_id") + + # Check if record already exists - prefer matching by county_id to avoid + # duplicates when multiple zones map to the same county + if county_id is not None: + existing_record = db_session.query(models.OilPrice).filter( + models.OilPrice.name == item_dict["name"], + models.OilPrice.state == item_dict["state"], + models.OilPrice.county_id == county_id + ).first() + else: + existing_record = db_session.query(models.OilPrice).filter( + models.OilPrice.name == item_dict["name"], + models.OilPrice.state == item_dict["state"], + models.OilPrice.zone == item_dict["zone"] + ).first() + + if existing_record: + # Record exists - check if we should update + if existing_record.company_id is not None: + logging.debug( + f"Skipping update for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} " + "due to non-null company_id" + ) + return False + + # Always update county_id if we have one and it differs + updated = False + if county_id is not None and existing_record.county_id != county_id: + existing_record.county_id = county_id + updated = True + + # Company ID is null - check if price changed + if existing_record.price != item_dict["price"]: + existing_record.price = item_dict["price"] + existing_record.date = item_dict["date"] + existing_record.scrapetimestamp = datetime.utcnow() + logging.info( + f"Updated price for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} " + f"to {item_dict['price']}" + ) + return True + elif updated: + existing_record.scrapetimestamp = 
datetime.utcnow() + logging.info( + f"Updated county_id for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} " + f"to {county_id}" + ) + return True + else: + logging.debug( + f"Price unchanged for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}" + ) + return False + else: + # No record exists - create new + oil_price_record = models.OilPrice( + state=item_dict["state"], + zone=item_dict["zone"], + name=item_dict["name"], + price=item_dict["price"], + date=item_dict["date"], + county_id=county_id, + scrapetimestamp=datetime.utcnow() + ) + db_session.add(oil_price_record) + logging.info( + f"Added new record for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} " + f"(county_id={county_id})" + ) + return True diff --git a/fuel_scraper/http_client.py b/fuel_scraper/http_client.py new file mode 100644 index 0000000..4658518 --- /dev/null +++ b/fuel_scraper/http_client.py @@ -0,0 +1,32 @@ +""" +HTTP client module for making web requests. +""" +import logging +import requests +from bs4 import BeautifulSoup + +# Default headers to mimic a browser +DEFAULT_HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' +} + +REQUEST_TIMEOUT = 20 + + +def make_request(url: str) -> BeautifulSoup | None: + """ + Fetch a URL and return a BeautifulSoup object. 
+ + Args: + url: The URL to fetch + + Returns: + BeautifulSoup object if successful, None otherwise + """ + try: + response = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT) + response.raise_for_status() + return BeautifulSoup(response.content, 'html.parser') + except requests.exceptions.RequestException as e: + logging.error(f"Error fetching {url}: {e}") + return None diff --git a/fuel_scraper/parsers.py b/fuel_scraper/parsers.py new file mode 100644 index 0000000..0fb4ccc --- /dev/null +++ b/fuel_scraper/parsers.py @@ -0,0 +1,177 @@ +""" +HTML parsing module for extracting oil price data from web pages. +""" +import logging +import re +from bs4 import BeautifulSoup + + +def parse_zone_slug_to_int(zone_slug_str: str) -> int | None: + """ + Extract the numeric part of a zone slug. + + Examples: + "zone1" -> 1 + "zonema5" -> 5 + + Args: + zone_slug_str: Zone slug string like "zone1", "zonema5" + + Returns: + Integer zone number or None if parsing fails + """ + if not zone_slug_str: + return None + match = re.search(r'\d+$', zone_slug_str) + if match: + return int(match.group(0)) + logging.warning(f"Could not parse numeric zone from slug: '{zone_slug_str}'") + return None + + +def _find_price_table_columns(thead) -> dict | None: + """ + Find column indices for company, price, and date in a table header. 
def _parse_row(cells: list, column_indices: dict, state_name: str, zone: int) -> dict | None:
    """
    Turn the cells of one table row into a price record.

    Args:
        cells: List of td elements
        column_indices: Dictionary mapping 'company', 'price' and 'date' to
            cell indices
        state_name: State name string
        zone: Zone number

    Returns:
        Dictionary with state, zone, name, price and date, or None when the
        row has too few cells. The price is None when the price cell holds
        no parseable number.
    """
    highest_index = max(column_indices.values())
    if not len(cells) > highest_index:
        return None

    # Company name: the anchor text wins over the full cell text.
    name_cell = cells[column_indices['company']]
    anchor = name_cell.find('a')
    if anchor:
        company_name = anchor.get_text(strip=True)
    else:
        company_name = name_cell.get_text(strip=True)

    # Price: keep only digits and dots from the cell text, then convert.
    price_str = cells[column_indices['price']].get_text(strip=True)
    price_value = None
    try:
        numeric_text = ''.join(ch for ch in price_str if ch.isdigit() or ch == '.')
        if numeric_text:
            price_value = float(numeric_text)
    except ValueError:
        logging.warning(f"Could not parse price: '{price_str}' for {company_name} in {state_name}/zone{zone}.")
    except Exception as e:
        logging.error(f"Unexpected error parsing price: '{price_str}' for {company_name}. Error: {e}")

    # Date is stored as the raw cell text.
    record = {
        "state": state_name.capitalize(),
        "zone": zone,
        "name": company_name,
        "price": price_value,
        "date": cells[column_indices['date']].get_text(strip=True),
    }
    return record
State: {state_name_key}, Zone: {zone_slug_str}") + continue + + rows = tbody.find_all('tr') + if not rows: + logging.debug(f"No rows found in tbody for price table in {state_name_key}/{zone_slug_str}") + continue + + # Parse each row + for row_index, row in enumerate(rows): + cells = row.find_all('td') + record = _parse_row(cells, column_indices, state_name_key, zone_int) + + if record: + data_dicts.append(record) + elif len(cells) > 0: + max_required = max(column_indices.values()) + 1 + logging.warning( + f"Skipping row {row_index+1} with insufficient cells ({len(cells)}, need {max_required}) " + f"in {state_name_key}/{zone_slug_str}" + ) + + if candidate_tables_found == 0: + logging.warning(f"No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.") + + return data_dicts diff --git a/fuel_scraper/scraper.py b/fuel_scraper/scraper.py new file mode 100644 index 0000000..08b939b --- /dev/null +++ b/fuel_scraper/scraper.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +""" +Main scraper orchestrator module. +Coordinates fetching, parsing, and storing oil price data. +""" +import logging +import sys +import os + +# Add parent directory to path for imports +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from sqlalchemy.orm import Session +from database import SessionLocal, init_db +import models + +from .config import SITES_CONFIG, ZONE_COUNTY_MAP, setup_logging +from .http_client import make_request +from .parsers import parse_price_table, parse_zone_slug_to_int +from .db_operations import upsert_oil_price + + +def _build_county_lookup(db_session: Session) -> dict: + """ + Build a lookup dict from (state_abbrev, county_name) -> county_id + by querying the county table. 
+ """ + counties = db_session.query(models.County).all() + lookup = {} + for c in counties: + lookup[(c.state, c.name)] = c.id + logging.info(f"Built county lookup with {len(lookup)} entries") + return lookup + + +def _resolve_county_id(state_key: str, zone_number: int, county_lookup: dict) -> int | None: + """ + Resolve a county_id from ZONE_COUNTY_MAP and the county lookup. + Returns None if no mapping exists. + """ + mapping = ZONE_COUNTY_MAP.get((state_key, zone_number)) + if not mapping: + logging.debug(f"No zone-to-county mapping for ({state_key}, {zone_number})") + return None + state_abbrev, county_name = mapping + county_id = county_lookup.get((state_abbrev, county_name)) + if county_id is None: + logging.warning(f"County not found in DB: ({state_abbrev}, {county_name}) for zone ({state_key}, {zone_number})") + return county_id + + +def _scrape_zone( + db_session: Session, + site_name: str, + url_template: str, + base_url: str, + oil_type: int, + state_key: str, + zone_slug: str, + county_lookup: dict +) -> int: + """ + Scrape a single zone and store records. + + Returns: + Number of records processed + """ + format_params = { + "base_url": base_url, + "state_slug": state_key, + "zone_slug": zone_slug, + "oil_type": oil_type + } + target_url = url_template.format(**format_params) + + logging.info(f"Scraping: {target_url} (State: {state_key}, Zone Slug: {zone_slug})") + + soup = make_request(target_url) + if not soup: + logging.warning(f"Failed to retrieve or parse {target_url}. 
def _scrape_site(db_session: Session, site_config: dict, county_lookup: dict) -> int:
    """
    Run _scrape_zone for every (state, zone) pair configured for one site.

    Returns:
        Sum of the values returned by _scrape_zone across all zones
    """
    site_name = site_config["site_name"]
    # Arguments that are identical for every zone of this site.
    per_site_kwargs = {
        "db_session": db_session,
        "site_name": site_name,
        "base_url": site_config["base_url"],
        "url_template": site_config["url_template"],
        "oil_type": site_config["oil_type"],
        "county_lookup": county_lookup,
    }

    logging.info(f"--- Processing site: {site_name} ---")

    return sum(
        _scrape_zone(state_key=state_key, zone_slug=zone_slug, **per_site_kwargs)
        for state_key, zone_slugs in site_config["locations"].items()
        for zone_slug in zone_slugs
    )
+ """ + setup_logging() + logging.info("Starting oil price scraper job.") + + # Initialize database + try: + init_db() + logging.info("Database initialized/checked successfully.") + except Exception as e: + logging.error(f"Failed to initialize database: {e}", exc_info=True) + return + + db_session: Session = SessionLocal() + total_records = 0 + + try: + # Build county lookup at startup + county_lookup = _build_county_lookup(db_session) + + # Process each configured site + for site_config in SITES_CONFIG: + records = _scrape_site(db_session, site_config, county_lookup) + total_records += records + + # Commit all changes + if total_records > 0: + db_session.commit() + logging.info(f"Successfully committed records to the database.") + else: + logging.info("No new records were queued for database insertion in this run.") + + except Exception as e: + logging.error(f"An error occurred during scraping or DB operation: {e}", exc_info=True) + db_session.rollback() + logging.info("Database transaction rolled back due to error.") + finally: + db_session.close() + logging.info("Database session closed.") + + logging.info("Oil price scraper job finished.") + + +if __name__ == "__main__": + main() diff --git a/models.py b/models.py index eebbaf6..4879848 100644 --- a/models.py +++ b/models.py @@ -24,11 +24,20 @@ class OilPrice(Base): # when a new record is created and this field is not explicitly set. 
# --- County Model (read-only, for lookups) ---
class County(Base):
    """
    ORM mapping of the "county" table.

    The scraper loads all rows once and builds a
    (state, name) -> id lookup from them; the resulting id is what gets
    stored in OilPrice.county_id.
    """
    __tablename__ = "county"

    id = Column(Integer, primary_key=True)  # value written to OilPrice.county_id
    name = Column(String(255))  # county name; must equal the names used in ZONE_COUNTY_MAP
    state = Column(String(2))  # presumably the postal abbreviation ("CT", "MA", ...) - confirm against DB data