refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers
- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
266
newenglandoil/scraper.py
Normal file
266
newenglandoil/scraper.py
Normal file
@@ -0,0 +1,266 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Main scraper orchestrator module.
|
||||
Coordinates fetching, parsing, and storing oil price data.
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from database import SessionLocal, init_db
|
||||
import models
|
||||
|
||||
from .config import SITES_CONFIG, ZONE_COUNTY_MAP, setup_logging, STATE_ABBREV_MAP
|
||||
from .http_client import make_request, fetch_phone_number
|
||||
from .parsers import parse_price_table, parse_zone_slug_to_int
|
||||
from .db_operations import upsert_oil_price
|
||||
|
||||
|
||||
def _build_county_lookup(db_session: Session) -> dict:
    """
    Build a lookup dict keyed by (state_abbrev, county_name) -> county_id.

    Queries every row of the county table once; counties with an empty
    name are skipped. County names are stripped of surrounding whitespace
    so they match the names used in ZONE_COUNTY_MAP.
    """
    lookup = {
        (county.state, county.name.strip()): county.id
        for county in db_session.query(models.County).all()
        if county.name
    }
    logging.info(f"Built county lookup with {len(lookup)} entries")
    return lookup
|
||||
|
||||
|
||||
def _resolve_county_id(state_key: str, zone_number: int, county_lookup: dict) -> int | None:
    """
    Resolve a county_id for a (state, zone) pair.

    Two-step resolution: ZONE_COUNTY_MAP turns the zone into a
    (state_abbrev, county_name) pair, then county_lookup turns that pair
    into a DB id. Returns None (with a debug/warning log) when either
    step has no entry.
    """
    entry = ZONE_COUNTY_MAP.get((state_key, zone_number))
    if not entry:
        logging.debug(f"No zone-to-county mapping for ({state_key}, {zone_number})")
        return None

    abbrev, name = entry
    resolved = county_lookup.get((abbrev, name))
    if resolved is None:
        logging.warning(f"County not found in DB: ({abbrev}, {name}) for zone ({state_key}, {zone_number})")
    return resolved
|
||||
|
||||
|
||||
def _scrape_zone(
    db_session: Session,
    site_name: str,
    url_template: str,
    base_url: str,
    oil_type: int,
    state_key: str,
    zone_slug: str,
    county_lookup: dict,
    phone_cache: dict,
    refresh_metadata: bool = False,
) -> int:
    """
    Scrape a single zone's price page and upsert its records.

    Args:
        db_session: Active SQLAlchemy session records are written through.
        site_name: Site identifier used in logs and stored on each record.
        url_template: Format string expanded with base_url / state_slug /
            zone_slug / oil_type to build the page URL.
        base_url: Root URL of the site; also the base for phone-page fetches.
        oil_type: Site-specific oil type code substituted into the URL.
        state_key: State slug for this zone.
        zone_slug: Zone slug within the state.
        county_lookup: (state_abbrev, county_name) -> county_id map built
            from the DB at startup.
        phone_cache: Dict mapping phone_page_path -> phone string, shared
            across zones so each company's phone page is fetched at most once
            per run. phone_page_path is the cache key (not neo_id) because
            neo_id is only unique per zone; the path (e.g.
            "phones.asp?zone=1&ID=10&a=MA1") is effectively unique.
        refresh_metadata: If True, existing phone/URL values in the DB are
            overwritten by the upsert. The in-memory phone_cache is still
            trusted within a run, so no extra network fetches occur.

    Returns:
        Number of records parsed from the page (0 on fetch/parse failure).
    """
    target_url = url_template.format(
        base_url=base_url,
        state_slug=state_key,
        zone_slug=zone_slug,
        oil_type=oil_type,
    )

    logging.info(f"[{site_name}] Scraping: {target_url} (State: {state_key}, Zone Slug: {zone_slug})")

    soup = make_request(target_url)
    if not soup:
        logging.warning(f"[{site_name}] Failed to retrieve or parse {target_url}. Skipping.")
        return 0

    parsed_items = parse_price_table(soup, state_key, zone_slug, site_name)
    if not parsed_items:
        logging.info(f"[{site_name}] No data extracted from {target_url}")
        return 0

    # Resolve county_id for this zone (stays None when no mapping exists).
    zone_number = parse_zone_slug_to_int(zone_slug)
    county_id = None
    if zone_number is not None:
        county_id = _resolve_county_id(state_key, zone_number, county_lookup)

    records_processed = 0
    for item_dict in parsed_items:
        item_dict["county_id"] = county_id
        item_dict["site_name"] = site_name

        # Attach the company's phone number. Cache hit -> reuse; miss -> fetch
        # the phone page once and remember the result for later zones.
        phone_info = item_dict.pop("phone_info", None)
        if phone_info:
            neo_id = phone_info.get("neo_id")
            phone_key = phone_info.get("phone_page_path")

            if phone_key:
                if phone_key in phone_cache:
                    item_dict["phone"] = phone_cache[phone_key]
                else:
                    # Only include state_slug in the phone URL if the site
                    # actually uses it in its URL template.
                    slug = state_key if "{state_slug}" in url_template else ""
                    phone = fetch_phone_number(base_url, phone_info["phone_page_path"], slug)
                    phone_cache[phone_key] = phone
                    item_dict["phone"] = phone
                    if phone:
                        logging.info(f"[{site_name}] Fetched phone for {item_dict['name']} (ID={neo_id}): {phone}")

        if upsert_oil_price(db_session, item_dict, force_update_metadata=refresh_metadata):
            records_processed += 1

    logging.info(
        f"[{site_name}] Processed {len(parsed_items)} records from {site_name} - {state_key}/{zone_slug} "
        f"({records_processed} inserted/updated, county_id={county_id}) (Size: {len(parsed_items)})"
    )

    return len(parsed_items)
|
||||
|
||||
|
||||
def _scrape_site(db_session: Session, site_config: dict, county_lookup: dict, refresh_metadata: bool = False) -> int:
    """
    Scrape every configured zone for one site.

    Iterates site_config["locations"] (state -> list of zone slugs) and
    delegates each zone to _scrape_zone.

    Returns:
        Total number of records processed across all of the site's zones.
    """
    site_name = site_config["site_name"]

    logging.info(f"--- Processing site: {site_name} ---")

    # One phone cache per site: zones share it so each company's phone page
    # is fetched at most once per run.
    phone_cache: dict = {}
    total = 0

    for state_key, zone_slugs in site_config["locations"].items():
        for zone_slug in zone_slugs:
            total += _scrape_zone(
                db_session=db_session,
                site_name=site_name,
                url_template=site_config["url_template"],
                base_url=site_config["base_url"],
                oil_type=site_config["oil_type"],
                state_key=state_key,
                zone_slug=zone_slug,
                county_lookup=county_lookup,
                phone_cache=phone_cache,
                refresh_metadata=refresh_metadata,
            )

    logging.info(f"Phone cache: fetched {len(phone_cache)} unique company phones for {site_name}")
    return total
|
||||
|
||||
|
||||
def main(refresh_metadata: bool = False, target_state_abbr: str | None = None):
    """
    Main entry point for the oil price scraper.

    Sets up logging, initializes the database, optionally narrows the
    configured sites to a single state, scrapes each site, and commits all
    changes in a single transaction (rolled back on any error).

    Args:
        refresh_metadata: If True, force overwriting of existing phone/URL
            metadata in the DB during upserts.
        target_state_abbr: If set (e.g. "MA"), only scrape that state.
    """
    setup_logging()

    state_msg = f" (State: {target_state_abbr})" if target_state_abbr else ""
    logging.info(f"Starting oil price scraper job.{state_msg} (Refresh Metadata: {refresh_metadata})")

    # Initialize database schema; abort the run entirely if this fails.
    try:
        init_db()
        logging.info("Database initialized/checked successfully.")
    except Exception as e:
        logging.error(f"Failed to initialize database: {e}", exc_info=True)
        return

    db_session: Session = SessionLocal()
    total_records = 0

    try:
        # Build the county lookup once at startup; reused for every zone.
        county_lookup = _build_county_lookup(db_session)

        # Reverse map (abbrev -> state key) to translate the --state filter.
        abbrev_to_state = {v: k for k, v in STATE_ABBREV_MAP.items()}
        target_state_key = abbrev_to_state.get(target_state_abbr.upper()) if target_state_abbr else None

        if target_state_abbr and not target_state_key:
            logging.error(f"Unknown state abbreviation: {target_state_abbr}")
            return

        # Process each configured site.
        for site_config in SITES_CONFIG:
            # When filtering by state, use a shallow copy of the config with
            # only the target state's locations; skip sites that don't cover it.
            config_to_use = site_config
            if target_state_key:
                if target_state_key in site_config["locations"]:
                    config_to_use = site_config.copy()
                    config_to_use["locations"] = {
                        target_state_key: site_config["locations"][target_state_key]
                    }
                else:
                    logging.info(f"Skipping {site_config['site_name']} (does not cover {target_state_abbr})")
                    continue

            records = _scrape_site(db_session, config_to_use, county_lookup, refresh_metadata=refresh_metadata)
            total_records += records

        # Commit everything at once; a zero count means nothing was queued.
        if total_records > 0:
            db_session.commit()
            logging.info("Successfully committed records to the database.")
        else:
            logging.info("No new records were queued for database insertion in this run.")

    except Exception as e:
        logging.error(f"An error occurred during scraping or DB operation: {e}", exc_info=True)
        db_session.rollback()
        logging.info("Database transaction rolled back due to error.")
    finally:
        db_session.close()
        logging.info("Database session closed.")

    logging.info("Oil price scraper job finished.")
|
||||
|
||||
|
||||
# Allow running the scraper directly (e.g. `python newenglandoil/scraper.py`).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user