- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper) - Add cheapestoil/ package as a secondary market price scraper - Add app.py entry point for direct execution - Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data - Update models.py with latest schema fields - Update requirements.txt dependencies - Update Dockerfile and docker-compose.yml for new structure - Remove deprecated fuel_scraper module, test.py, and log file Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
66 lines
2.3 KiB
Python
66 lines
2.3 KiB
Python
"""
|
|
FastAPI web server for the crawler.
|
|
Provides HTTP endpoints to trigger scrapes on demand.
|
|
"""
|
|
import logging
from typing import Optional

from fastapi import FastAPI, HTTPException

import models
from database import SessionLocal
from cheapestoil import scrape_state
from cheapestoil.config import STATE_API_NAMES
from newenglandoil.scraper import main as run_newenglandoil_scraper
|
|
|
# Timestamp, severity, and file:line in every record so scrape runs are
# traceable in container stdout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s",
)

# Single application instance; endpoints below are registered on it.
app = FastAPI(title="Crawler API", version="1.0.0")
|
|
|
|
|
|
def _build_county_lookup(db_session):
    """Build a (state_abbr, county_name) -> county_id lookup from the DB.

    State and county names are whitespace-stripped so the keys match
    even when the stored values carry padding.
    """
    lookup = {}
    for county in db_session.query(models.County).all():
        lookup[(county.state.strip(), county.name.strip())] = county.id
    return lookup
|
|
|
|
|
|
@app.get("/health")
|
|
def health():
|
|
return {"status": "ok"}
|
|
|
|
|
|
@app.get("/scrape/{state_abbr}")
|
|
def scrape_endpoint(state_abbr: str, refresh_metadata: bool = False):
|
|
"""Trigger a CheapestOil scrape for a single state."""
|
|
state_abbr = state_abbr.upper()
|
|
if state_abbr not in STATE_API_NAMES:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Unknown state: {state_abbr}. Valid: {list(STATE_API_NAMES.keys())}",
|
|
)
|
|
|
|
db_session = SessionLocal()
|
|
try:
|
|
county_lookup = _build_county_lookup(db_session)
|
|
result = scrape_state(state_abbr, db_session, county_lookup, refresh_metadata=refresh_metadata)
|
|
return result
|
|
except Exception as e:
|
|
db_session.rollback()
|
|
logging.error(f"Scrape failed for {state_abbr}: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
finally:
|
|
db_session.close()
|
|
|
|
|
|
@app.get("/scrape-newenglandoil")
|
|
def scrape_newenglandoil_endpoint(state: str = None, refresh_metadata: bool = False):
|
|
"""Trigger the NewEnglandOil scraper (runs synchronously)."""
|
|
try:
|
|
# This will run the scraper and log to stdout (inherited from app's logging setup)
|
|
run_newenglandoil_scraper(refresh_metadata=refresh_metadata, target_state_abbr=state)
|
|
return {"status": "ok", "message": "NewEnglandOil scrape completed"}
|
|
except Exception as e:
|
|
logging.error(f"NewEnglandOil scrape failed: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail=str(e))
|