refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers

- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter,
  --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 11:34:21 -05:00
parent 8f45f4c209
commit 1592e6d685
26 changed files with 3221 additions and 1468 deletions

65
app.py Normal file
View File

@@ -0,0 +1,65 @@
"""
FastAPI web server for the crawler.
Provides HTTP endpoints to trigger scrapes on demand.
"""
import logging
from typing import Optional

from fastapi import FastAPI, HTTPException

import models
from database import SessionLocal
from cheapestoil import scrape_state
from cheapestoil.config import STATE_API_NAMES
from newenglandoil.scraper import main as run_newenglandoil_scraper
# Configure root logging once at import time; all endpoint handlers below
# (and the scraper modules they call) inherit this handler/format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s",
)
# ASGI application object served by the process entry point (e.g. uvicorn).
app = FastAPI(title="Crawler API", version="1.0.0")
def _build_county_lookup(db_session):
    """Build a (state_abbr, county_name) -> county_id lookup from the DB.

    Both components of the key are whitespace-stripped so lookups are
    tolerant of padding in the stored rows.
    """
    lookup = {}
    for county in db_session.query(models.County).all():
        key = (county.state.strip(), county.name.strip())
        lookup[key] = county.id
    return lookup
@app.get("/health")
def health():
    """Liveness probe: always reports the service as up."""
    return dict(status="ok")
@app.get("/scrape/{state_abbr}")
def scrape_endpoint(state_abbr: str, refresh_metadata: bool = False):
    """Trigger a CheapestOil scrape for a single state.

    Args:
        state_abbr: Two-letter state abbreviation (case-insensitive).
        refresh_metadata: When True, existing phone/URL metadata is
            overwritten during the scrape.

    Returns:
        Whatever ``scrape_state`` reports for the run.

    Raises:
        HTTPException: 400 for an unknown state, 500 if the scrape fails.
    """
    state_abbr = state_abbr.upper()
    if state_abbr not in STATE_API_NAMES:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown state: {state_abbr}. Valid: {list(STATE_API_NAMES.keys())}",
        )
    db_session = SessionLocal()
    try:
        county_lookup = _build_county_lookup(db_session)
        return scrape_state(
            state_abbr, db_session, county_lookup, refresh_metadata=refresh_metadata
        )
    except Exception as e:
        # Discard any partial writes before surfacing the failure.
        db_session.rollback()
        # Lazy %-args: the message is only formatted when actually emitted.
        logging.error("Scrape failed for %s: %s", state_abbr, e, exc_info=True)
        # Chain the cause so the original traceback is preserved.
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        db_session.close()
@app.get("/scrape-newenglandoil")
def scrape_newenglandoil_endpoint(
    state: Optional[str] = None, refresh_metadata: bool = False
):
    """Trigger the NewEnglandOil scraper (runs synchronously).

    Args:
        state: Optional two-letter state filter; ``None`` scrapes all states.
            Annotated ``Optional[str]`` (was ``str = None``) so FastAPI/
            Pydantic v2 accepts the parameter being omitted.
        refresh_metadata: When True, existing phone/URL data is overwritten.

    Raises:
        HTTPException: 500 if the scraper raises.
    """
    try:
        # This will run the scraper and log to stdout (inherited from app's logging setup)
        run_newenglandoil_scraper(refresh_metadata=refresh_metadata, target_state_abbr=state)
        return {"status": "ok", "message": "NewEnglandOil scrape completed"}
    except Exception as e:
        logging.error("NewEnglandOil scrape failed: %s", e, exc_info=True)
        # Chain the cause so the original traceback is preserved.
        raise HTTPException(status_code=500, detail=str(e)) from e