# run.py — fuel price scraper control script.
#
# Changelog:
# - Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
# - Add cheapestoil/ package as a secondary market price scraper
# - Add app.py entry point for direct execution
# - Update run.py: new scrape_cheapest(), migrate command, --state filter,
#   --refresh-metadata flag for overwriting existing phone/URL data
# - Update models.py with latest schema fields
# - Update requirements.txt dependencies
# - Update Dockerfile and docker-compose.yml for new structure
# - Remove deprecated fuel_scraper module, test.py, and log file
# run.py
|
|
import argparse
|
|
import logging
|
|
|
|
import models
|
|
from database import init_db, SessionLocal
|
|
from newenglandoil import main as run_scraper_main
|
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def initialize_database():
|
|
"""Initializes the database by creating tables based on models."""
|
|
logger.info("Attempting to initialize database...")
|
|
try:
|
|
init_db()
|
|
logger.info("Database initialization process completed.")
|
|
except Exception as e:
|
|
logger.error(f"Error during database initialization: {e}", exc_info=True)
|
|
|
|
|
|
def scrape_data(state_abbr: str | None = None, refresh_metadata: bool = False):
|
|
"""Runs the NewEnglandOil scraper."""
|
|
logger.info("Starting the NewEnglandOil scraper...")
|
|
if refresh_metadata:
|
|
logger.info("Metadata refresh enabled: Existing phone/URL data may be overwritten.")
|
|
if state_abbr:
|
|
logger.info(f"Scraping restricted to state: {state_abbr}")
|
|
|
|
try:
|
|
run_scraper_main(refresh_metadata=refresh_metadata, target_state_abbr=state_abbr)
|
|
logger.info("NewEnglandOil scraper finished.")
|
|
except Exception as e:
|
|
logger.error(f"Error during scraping process: {e}", exc_info=True)
|
|
|
|
|
|
def scrape_cheapest(state_abbr: str, refresh_metadata: bool = False):
|
|
"""Runs the CheapestOil scraper for a single state."""
|
|
from cheapestoil import scrape_state
|
|
|
|
logger.info(f"Starting CheapestOil scrape for {state_abbr}...")
|
|
if refresh_metadata:
|
|
logger.info("Metadata refresh enabled: Existing phone/URL data may be overwritten.")
|
|
|
|
db_session = SessionLocal()
|
|
try:
|
|
counties = db_session.query(models.County).all()
|
|
county_lookup = {(c.state.strip(), c.name.strip()): c.id for c in counties}
|
|
result = scrape_state(state_abbr, db_session, county_lookup, refresh_metadata=refresh_metadata)
|
|
logger.info(f"CheapestOil result: {result}")
|
|
except Exception as e:
|
|
db_session.rollback()
|
|
logger.error(f"Error during CheapestOil scrape: {e}", exc_info=True)
|
|
finally:
|
|
db_session.close()
|
|
|
|
|
|
def run_migration():
|
|
"""Runs the data normalization migration."""
|
|
from migrate_normalize import main as migrate_main
|
|
logger.info("Running data normalization migration...")
|
|
try:
|
|
migrate_main()
|
|
logger.info("Migration completed.")
|
|
except Exception as e:
|
|
logger.error(f"Error during migration: {e}", exc_info=True)
|
|
|
|
|
|
def start_server():
|
|
"""Starts the FastAPI server."""
|
|
import uvicorn
|
|
logger.info("Starting FastAPI crawler server on port 9553...")
|
|
uvicorn.run("app:app", host="0.0.0.0", port=9553)
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description="Fuel Price Scraper Control Script")
|
|
parser.add_argument(
|
|
"action",
|
|
choices=["initdb", "scrape", "scrape-cheapest", "migrate", "server"],
|
|
help=(
|
|
"'initdb' to initialize the database, "
|
|
"'scrape' to run NewEnglandOil scraper, "
|
|
"'scrape-cheapest' to run CheapestOil scraper, "
|
|
"'migrate' to run data normalization migration, "
|
|
"'server' to start the FastAPI server."
|
|
),
|
|
)
|
|
parser.add_argument(
|
|
"--state",
|
|
default=None,
|
|
help="State abbreviation (MA, CT, ME, NH, RI, VT).",
|
|
)
|
|
parser.add_argument(
|
|
"--refresh-metadata",
|
|
action="store_true",
|
|
help="Force refresh phone numbers and URLs, overwriting existing data.",
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
if args.action == "initdb":
|
|
initialize_database()
|
|
elif args.action == "scrape":
|
|
scrape_data(state_abbr=args.state, refresh_metadata=args.refresh_metadata)
|
|
elif args.action == "scrape-cheapest":
|
|
if not args.state:
|
|
logger.error("--state is required for scrape-cheapest action")
|
|
parser.print_help()
|
|
return
|
|
scrape_cheapest(args.state.upper(), refresh_metadata=args.refresh_metadata)
|
|
elif args.action == "migrate":
|
|
run_migration()
|
|
elif args.action == "server":
|
|
start_server()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|