# Changelog note (from commit message):
# Add ZONE_COUNTY_MAP for all 5 scraped states (42 zone-to-county entries).
# The scraper now resolves county_id at startup and assigns it to each record.
# Upsert logic deduplicates by (name, state, county_id) to prevent duplicates
# when multiple zones map to the same county. Also adds a County model for DB
# lookups and fixes the Rhode Island zone count (4, not 5).
#!/usr/bin/env python3
|
|
"""
|
|
Main scraper orchestrator module.
|
|
Coordinates fetching, parsing, and storing oil price data.
|
|
"""
|
|
import logging
|
|
import sys
|
|
import os
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from sqlalchemy.orm import Session
|
|
from database import SessionLocal, init_db
|
|
import models
|
|
|
|
from .config import SITES_CONFIG, ZONE_COUNTY_MAP, setup_logging
|
|
from .http_client import make_request
|
|
from .parsers import parse_price_table, parse_zone_slug_to_int
|
|
from .db_operations import upsert_oil_price
|
|
|
|
|
|
def _build_county_lookup(db_session: Session) -> dict:
    """
    Build a lookup dict from (state_abbrev, county_name) -> county_id.

    Queries the county table once at startup so later zone resolution
    needs no further database round-trips.
    """
    lookup = {
        (county.state, county.name): county.id
        for county in db_session.query(models.County).all()
    }
    logging.info(f"Built county lookup with {len(lookup)} entries")
    return lookup
|
|
|
|
|
|
def _resolve_county_id(state_key: str, zone_number: int, county_lookup: dict) -> int | None:
    """
    Resolve a county_id from ZONE_COUNTY_MAP and the county lookup.

    The zone is first mapped to a (state_abbrev, county_name) pair, which is
    then resolved to a county_id via *county_lookup*.
    Returns None if no mapping exists.
    """
    pair = ZONE_COUNTY_MAP.get((state_key, zone_number))
    if not pair:
        logging.debug(f"No zone-to-county mapping for ({state_key}, {zone_number})")
        return None

    state_abbrev, county_name = pair
    resolved = county_lookup.get((state_abbrev, county_name))
    if resolved is None:
        # A mapping exists but the county row is missing from the DB — worth a warning.
        logging.warning(f"County not found in DB: ({state_abbrev}, {county_name}) for zone ({state_key}, {zone_number})")
    return resolved
|
|
|
|
|
|
def _scrape_zone(
    db_session: Session,
    site_name: str,
    url_template: str,
    base_url: str,
    oil_type: int,
    state_key: str,
    zone_slug: str,
    county_lookup: dict
) -> int:
    """
    Scrape a single zone and store records.

    Fetches the zone page, parses its price table, attaches the resolved
    county_id to every parsed record, and upserts each one.

    Returns:
        Number of records processed
    """
    target_url = url_template.format(
        base_url=base_url,
        state_slug=state_key,
        zone_slug=zone_slug,
        oil_type=oil_type,
    )

    logging.info(f"Scraping: {target_url} (State: {state_key}, Zone Slug: {zone_slug})")

    soup = make_request(target_url)
    if not soup:
        logging.warning(f"Failed to retrieve or parse {target_url}. Skipping.")
        return 0

    parsed_items = parse_price_table(soup, state_key, zone_slug)
    if not parsed_items:
        logging.info(f"No data extracted from {target_url}")
        return 0

    # Resolve county_id for this zone (None when the slug has no numeric
    # zone or no mapping exists).
    zone_number = parse_zone_slug_to_int(zone_slug)
    county_id = (
        _resolve_county_id(state_key, zone_number, county_lookup)
        if zone_number is not None
        else None
    )

    records_processed = 0
    for record in parsed_items:
        record["county_id"] = county_id
        if upsert_oil_price(db_session, record):
            records_processed += 1

    logging.info(
        f"Processed {len(parsed_items)} records from {site_name} - {state_key}/{zone_slug} "
        f"({records_processed} inserted/updated, county_id={county_id})"
    )

    return len(parsed_items)
|
|
|
|
|
|
def _scrape_site(db_session: Session, site_config: dict, county_lookup: dict) -> int:
    """
    Scrape all zones for a single site.

    Iterates the site's configured locations (state -> list of zone slugs)
    and delegates each zone to _scrape_zone.

    Returns:
        Total number of records processed
    """
    site_name = site_config["site_name"]
    base_url = site_config["base_url"]
    url_template = site_config["url_template"]
    oil_type = site_config["oil_type"]

    logging.info(f"--- Processing site: {site_name} ---")

    total_records = 0
    for state_key, zone_slugs in site_config["locations"].items():
        for zone_slug in zone_slugs:
            total_records += _scrape_zone(
                db_session=db_session,
                site_name=site_name,
                url_template=url_template,
                base_url=base_url,
                oil_type=oil_type,
                state_key=state_key,
                zone_slug=zone_slug,
                county_lookup=county_lookup,
            )

    return total_records
|
|
|
|
|
|
def main():
    """
    Main entry point for the oil price scraper.

    Initializes the database, builds the county lookup, iterates through all
    configured sites and zones, scrapes price data, and stores it in the
    database. The whole run is committed as a single transaction and rolled
    back on any error; the session is always closed.
    """
    setup_logging()
    logging.info("Starting oil price scraper job.")

    # Initialize database schema before opening a session.
    try:
        init_db()
        logging.info("Database initialized/checked successfully.")
    except Exception as e:
        logging.error(f"Failed to initialize database: {e}", exc_info=True)
        return

    db_session: Session = SessionLocal()
    total_records = 0

    try:
        # Build county lookup once at startup so each zone scrape resolves
        # its county without extra queries.
        county_lookup = _build_county_lookup(db_session)

        # Process each configured site
        for site_config in SITES_CONFIG:
            total_records += _scrape_site(db_session, site_config, county_lookup)

        # Commit all queued changes in one transaction.
        if total_records > 0:
            db_session.commit()
            # Fix: original message was an f-string with no placeholders;
            # include the actual record count in the log line.
            logging.info(f"Successfully committed {total_records} records to the database.")
        else:
            logging.info("No new records were queued for database insertion in this run.")

    except Exception as e:
        logging.error(f"An error occurred during scraping or DB operation: {e}", exc_info=True)
        db_session.rollback()
        logging.info("Database transaction rolled back due to error.")
    finally:
        db_session.close()
        logging.info("Database session closed.")

    logging.info("Oil price scraper job finished.")
|
|
|
|
|
|
# Allow running the scraper directly as a script.
if __name__ == "__main__":
    main()
|