- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper) - Add cheapestoil/ package as a secondary market price scraper - Add app.py entry point for direct execution - Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data - Update models.py with latest schema fields - Update requirements.txt dependencies - Update Dockerfile and docker-compose.yml for new structure - Remove deprecated fuel_scraper module, test.py, and log file Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
66 lines
2.3 KiB
Python
66 lines
2.3 KiB
Python
"""
|
|
FastAPI web server for the crawler.
|
|
Provides HTTP endpoints to trigger scrapes on demand.
|
|
"""
|
|
import logging
from typing import Optional

from fastapi import FastAPI, HTTPException

import models
from database import SessionLocal
from cheapestoil import scrape_state
from cheapestoil.config import STATE_API_NAMES
from newenglandoil.scraper import main as run_newenglandoil_scraper
|
|
|
# Timestamp, severity, and file:line in every record so scrape runs are
# traceable in container stdout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s",
)

# Single application instance; endpoints below are registered on it.
app = FastAPI(title="Crawler API", version="1.0.0")
|
|
|
|
|
|
def _build_county_lookup(db_session):
    """Build a (state_abbr, county_name) -> county_id lookup from the DB.

    State and county names are whitespace-stripped so the keys match
    even when the stored values carry padding.
    """
    lookup = {}
    for county in db_session.query(models.County).all():
        lookup[(county.state.strip(), county.name.strip())] = county.id
    return lookup
|
|
|
|
|
|
@app.get("/health")
|
|
def health():
|
|
return {"status": "ok"}
|
|
|
|
|
|
@app.get("/scrape/{state_abbr}")
|
|
def scrape_endpoint(state_abbr: str, refresh_metadata: bool = False):
|
|
"""Trigger a CheapestOil scrape for a single state."""
|
|
state_abbr = state_abbr.upper()
|
|
if state_abbr not in STATE_API_NAMES:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Unknown state: {state_abbr}. Valid: {list(STATE_API_NAMES.keys())}",
|
|
)
|
|
|
|
db_session = SessionLocal()
|
|
try:
|
|
county_lookup = _build_county_lookup(db_session)
|
|
result = scrape_state(state_abbr, db_session, county_lookup, refresh_metadata=refresh_metadata)
|
|
return result
|
|
except Exception as e:
|
|
db_session.rollback()
|
|
logging.error(f"Scrape failed for {state_abbr}: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
finally:
|
|
db_session.close()
|
|
|
|
|
|
@app.get("/scrape-newenglandoil")
|
|
def scrape_newenglandoil_endpoint(state: str = None, refresh_metadata: bool = False):
|
|
"""Trigger the NewEnglandOil scraper (runs synchronously)."""
|
|
try:
|
|
# This will run the scraper and log to stdout (inherited from app's logging setup)
|
|
run_newenglandoil_scraper(refresh_metadata=refresh_metadata, target_state_abbr=state)
|
|
return {"status": "ok", "message": "NewEnglandOil scrape completed"}
|
|
except Exception as e:
|
|
logging.error(f"NewEnglandOil scrape failed: {e}", exc_info=True)
|
|
raise HTTPException(status_code=500, detail=str(e))
|