#!/usr/bin/env python3
"""
Main scraper orchestrator module.
Coordinates fetching, parsing, and storing oil price data.
"""
import logging
import sys
import os

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from sqlalchemy.orm import Session
from database import SessionLocal, init_db
import models

from .config import SITES_CONFIG, ZONE_COUNTY_MAP, setup_logging, STATE_ABBREV_MAP
from .http_client import make_request, fetch_phone_number
from .parsers import parse_price_table, parse_zone_slug_to_int
from .db_operations import upsert_oil_price


def _build_county_lookup(db_session: Session) -> dict:
    """
    Build a lookup dict from (state_abbrev, county_name) -> county_id
    by querying the county table.

    Rows whose name is empty/None are skipped; names are stripped of
    surrounding whitespace before being used as a key.
    """
    counties = db_session.query(models.County).all()
    lookup = {
        (county.state, county.name.strip()): county.id
        for county in counties
        if county.name
    }
    logging.info(f"Built county lookup with {len(lookup)} entries")
    return lookup


def _resolve_county_id(state_key: str, zone_number: int, county_lookup: dict) -> int | None:
    """
    Resolve a county_id from ZONE_COUNTY_MAP and the county lookup.

    Args:
        state_key: State key used in ZONE_COUNTY_MAP.
        zone_number: Numeric zone within that state.
        county_lookup: Mapping as returned by _build_county_lookup.

    Returns:
        The county id, or None if the zone has no mapping or the mapped
        county is not present in the lookup.
    """
    mapping = ZONE_COUNTY_MAP.get((state_key, zone_number))
    if not mapping:
        logging.debug(f"No zone-to-county mapping for ({state_key}, {zone_number})")
        return None

    state_abbrev, county_name = mapping
    county_id = county_lookup.get((state_abbrev, county_name))
    if county_id is None:
        logging.warning(
            f"County not found in DB: ({state_abbrev}, {county_name}) for zone ({state_key}, {zone_number})"
        )
    return county_id


def _attach_phone(
    item_dict: dict,
    phone_info: dict,
    site_name: str,
    base_url: str,
    phone_state_slug: str,
    phone_cache: dict,
) -> None:
    """
    Set item_dict["phone"] from the cache, fetching the phone page on a miss.

    Does nothing when phone_info carries no "phone_page_path". The fetched
    value is cached even when it is falsy, so a page that yielded no phone is
    not requested again while the same cache is in use.
    """
    # phone_page_path is the cache key because neo_id is only unique per zone.
    # It typically looks like "phones.asp?zone=1&ID=10&a=MA1".
    phone_key = phone_info.get("phone_page_path")
    if not phone_key:
        return

    if phone_key in phone_cache:
        item_dict["phone"] = phone_cache[phone_key]
        return

    phone = fetch_phone_number(base_url, phone_key, phone_state_slug)
    phone_cache[phone_key] = phone
    item_dict["phone"] = phone
    if phone:
        neo_id = phone_info.get("neo_id")
        logging.info(f"[{site_name}] Fetched phone for {item_dict['name']} (ID={neo_id}): {phone}")


def _scrape_zone(
    db_session: Session,
    site_name: str,
    url_template: str,
    base_url: str,
    oil_type: int,
    state_key: str,
    zone_slug: str,
    county_lookup: dict,
    phone_cache: dict,
    refresh_metadata: bool = False,
) -> int:
    """
    Scrape a single zone and store records.

    Args:
        db_session: Session the records are upserted into (not committed here).
        site_name: Site label used in log messages and stored on each record.
        url_template: Format string filled with base_url, state_slug,
            zone_slug and oil_type to build the page URL.
        base_url: Site base URL.
        oil_type: Oil type value substituted into the URL.
        state_key: State key for this zone.
        zone_slug: Zone slug for this zone.
        county_lookup: Mapping as returned by _build_county_lookup.
        phone_cache: Dict mapping phone page path -> phone string. Shared
            across zones to avoid re-fetching the same company's phone page.
        refresh_metadata: Passed to upsert_oil_price as force_update_metadata.
            It does not bypass phone_cache; a cached phone is always reused.

    Returns:
        Number of records parsed from the page (0 when the page could not be
        retrieved or yielded no rows).
    """
    format_params = {
        "base_url": base_url,
        "state_slug": state_key,
        "zone_slug": zone_slug,
        "oil_type": oil_type,
    }
    target_url = url_template.format(**format_params)

    logging.info(f"[{site_name}] Scraping: {target_url} (State: {state_key}, Zone Slug: {zone_slug})")

    soup = make_request(target_url)
    if not soup:
        logging.warning(f"[{site_name}] Failed to retrieve or parse {target_url}. Skipping.")
        return 0

    parsed_items = parse_price_table(soup, state_key, zone_slug, site_name)
    if not parsed_items:
        logging.info(f"[{site_name}] No data extracted from {target_url}")
        return 0

    # Resolve county_id for this zone
    zone_number = parse_zone_slug_to_int(zone_slug)
    county_id = None
    if zone_number is not None:
        county_id = _resolve_county_id(state_key, zone_number, county_lookup)

    # Only include state_slug in phone URL if the site uses it in its URL template
    phone_state_slug = state_key if "{state_slug}" in url_template else ""

    records_processed = 0
    for item_dict in parsed_items:
        item_dict["county_id"] = county_id
        item_dict["site_name"] = site_name

        # phone_info is scraper-internal and must not reach the upsert payload.
        phone_info = item_dict.pop("phone_info", None)
        if phone_info:
            _attach_phone(item_dict, phone_info, site_name, base_url, phone_state_slug, phone_cache)

        if upsert_oil_price(db_session, item_dict, force_update_metadata=refresh_metadata):
            records_processed += 1

    logging.info(
        f"[{site_name}] Processed {len(parsed_items)} records from {site_name} - {state_key}/{zone_slug} "
        f"({records_processed} inserted/updated, county_id={county_id}) (Size: {len(parsed_items)})"
    )

    return len(parsed_items)


def _scrape_site(
    db_session: Session,
    site_config: dict,
    county_lookup: dict,
    refresh_metadata: bool = False,
) -> int:
    """
    Scrape all zones for a single site.

    Args:
        db_session: Session passed through to each zone scrape.
        site_config: Dict with "site_name", "base_url", "url_template",
            "oil_type" and "locations" (state key -> iterable of zone slugs).
        county_lookup: Mapping as returned by _build_county_lookup.
        refresh_metadata: Passed through to each zone scrape.

    Returns:
        Sum of the per-zone record counts.
    """
    site_name = site_config["site_name"]
    base_url = site_config["base_url"]
    url_template = site_config["url_template"]
    oil_type = site_config["oil_type"]

    logging.info(f"--- Processing site: {site_name} ---")

    total_records = 0
    # Shared phone cache across all zones for this site to avoid redundant fetches
    phone_cache = {}

    for state_key, zone_slugs in site_config["locations"].items():
        for zone_slug in zone_slugs:
            total_records += _scrape_zone(
                db_session=db_session,
                site_name=site_name,
                url_template=url_template,
                base_url=base_url,
                oil_type=oil_type,
                state_key=state_key,
                zone_slug=zone_slug,
                county_lookup=county_lookup,
                phone_cache=phone_cache,
                refresh_metadata=refresh_metadata,
            )

    logging.info(f"Phone cache: fetched {len(phone_cache)} unique company phones for {site_name}")

    return total_records


def main(refresh_metadata: bool = False, target_state_abbr: str | None = None):
    """
    Main entry point for the oil price scraper.

    Initializes the database, scrapes every site in SITES_CONFIG inside one
    session, and commits once at the end. Any exception during scraping rolls
    the whole session back.

    Args:
        refresh_metadata: Forwarded to every upsert as force_update_metadata.
        target_state_abbr: If set (e.g. "MA"), only scrape that state.
            Matched case-insensitively against STATE_ABBREV_MAP values.
    """
    setup_logging()

    state_msg = f" (State: {target_state_abbr})" if target_state_abbr else ""
    logging.info(f"Starting oil price scraper job.{state_msg} (Refresh Metadata: {refresh_metadata})")

    # Initialize database
    try:
        init_db()
        logging.info("Database initialized/checked successfully.")
    except Exception as e:
        logging.error(f"Failed to initialize database: {e}", exc_info=True)
        return

    db_session: Session = SessionLocal()
    total_records = 0

    try:
        # Build county lookup at startup
        county_lookup = _build_county_lookup(db_session)

        # Build reverse map for state filtering
        abbrev_to_state = {v: k for k, v in STATE_ABBREV_MAP.items()}
        target_state_key = abbrev_to_state.get(target_state_abbr.upper()) if target_state_abbr else None

        if target_state_abbr and not target_state_key:
            logging.error(f"Unknown state abbreviation: {target_state_abbr}")
            return

        # Process each configured site
        for site_config in SITES_CONFIG:
            config_to_use = site_config

            if target_state_key:
                if target_state_key not in site_config["locations"]:
                    logging.info(f"Skipping {site_config['site_name']} (does not cover {target_state_abbr})")
                    continue

                # Shallow copy so the shared SITES_CONFIG entry is not mutated
                # when "locations" is narrowed to the requested state.
                config_to_use = site_config.copy()
                config_to_use["locations"] = {
                    target_state_key: site_config["locations"][target_state_key]
                }

            total_records += _scrape_site(
                db_session, config_to_use, county_lookup, refresh_metadata=refresh_metadata
            )

        # Commit all changes
        if total_records > 0:
            db_session.commit()
            logging.info("Successfully committed records to the database.")
        else:
            logging.info("No new records were queued for database insertion in this run.")

    except Exception as e:
        logging.error(f"An error occurred during scraping or DB operation: {e}", exc_info=True)
        db_session.rollback()
        logging.info("Database transaction rolled back due to error.")
    finally:
        db_session.close()
        logging.info("Database session closed.")

    logging.info("Oil price scraper job finished.")


if __name__ == "__main__":
    main()