refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers

- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter,
  --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 11:34:21 -05:00
parent 8f45f4c209
commit 1592e6d685
26 changed files with 3221 additions and 1468 deletions

65
app.py Normal file
View File

@@ -0,0 +1,65 @@
"""
FastAPI web server for the crawler.
Provides HTTP endpoints to trigger scrapes on demand.
"""
import logging
from typing import Optional

from fastapi import FastAPI, HTTPException

import models
from database import SessionLocal
from cheapestoil import scrape_state
from cheapestoil.config import STATE_API_NAMES
from newenglandoil.scraper import main as run_newenglandoil_scraper
# Configure root logging once at import time; all endpoint handlers below
# (and the scraper modules they call) inherit this handler/format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s",
)
# ASGI application object served by the process entry point (e.g. uvicorn).
app = FastAPI(title="Crawler API", version="1.0.0")
def _build_county_lookup(db_session):
    """Build a (state_abbr, county_name) -> county_id lookup from the DB.

    Both components of the key are whitespace-stripped so lookups are
    tolerant of padding in the stored rows.
    """
    lookup = {}
    for county in db_session.query(models.County).all():
        key = (county.state.strip(), county.name.strip())
        lookup[key] = county.id
    return lookup
@app.get("/health")
def health():
    """Liveness probe: always reports the service as up."""
    return dict(status="ok")
@app.get("/scrape/{state_abbr}")
def scrape_endpoint(state_abbr: str, refresh_metadata: bool = False):
    """Trigger a CheapestOil scrape for a single state.

    Args:
        state_abbr: Two-letter state abbreviation (case-insensitive).
        refresh_metadata: When True, existing phone/URL metadata is
            overwritten during the scrape.

    Returns:
        Whatever ``scrape_state`` reports for the run.

    Raises:
        HTTPException: 400 for an unknown state, 500 if the scrape fails.
    """
    state_abbr = state_abbr.upper()
    if state_abbr not in STATE_API_NAMES:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown state: {state_abbr}. Valid: {list(STATE_API_NAMES.keys())}",
        )
    db_session = SessionLocal()
    try:
        county_lookup = _build_county_lookup(db_session)
        return scrape_state(
            state_abbr, db_session, county_lookup, refresh_metadata=refresh_metadata
        )
    except Exception as e:
        # Discard any partial writes before surfacing the failure.
        db_session.rollback()
        # Lazy %-args: the message is only formatted when actually emitted.
        logging.error("Scrape failed for %s: %s", state_abbr, e, exc_info=True)
        # Chain the cause so the original traceback is preserved.
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        db_session.close()
@app.get("/scrape-newenglandoil")
def scrape_newenglandoil_endpoint(
    state: Optional[str] = None, refresh_metadata: bool = False
):
    """Trigger the NewEnglandOil scraper (runs synchronously).

    Args:
        state: Optional two-letter state filter; ``None`` scrapes all states.
            Annotated ``Optional[str]`` (was ``str = None``) so FastAPI/
            Pydantic v2 accepts the parameter being omitted.
        refresh_metadata: When True, existing phone/URL data is overwritten.

    Raises:
        HTTPException: 500 if the scraper raises.
    """
    try:
        # This will run the scraper and log to stdout (inherited from app's logging setup)
        run_newenglandoil_scraper(refresh_metadata=refresh_metadata, target_state_abbr=state)
        return {"status": "ok", "message": "NewEnglandOil scrape completed"}
    except Exception as e:
        logging.error("NewEnglandOil scrape failed: %s", e, exc_info=True)
        # Chain the cause so the original traceback is preserved.
        raise HTTPException(status_code=500, detail=str(e)) from e