refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers

- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter,
  --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 11:34:21 -05:00
parent 8f45f4c209
commit 1592e6d685
26 changed files with 3221 additions and 1468 deletions

106
run.py
View File

@@ -2,44 +2,100 @@
import argparse
import logging
# Import necessary functions/modules from your project
# The 'import models' is crucial for init_db to know about the tables
import models
from database import init_db, SessionLocal
from fuel_scraper import main as run_scraper_main # Import from modular package
from newenglandoil import main as run_scraper_main
# Basic logging for this control script only. The scraper packages
# (newenglandoil, cheapestoil) and the database module configure their
# own, more detailed logging.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def initialize_database():
    """Create the database tables defined on the SQLAlchemy models.

    Relies on ``models`` having been imported at module level so that
    ``Base.metadata`` is populated before ``init_db()`` runs.
    """
    logger.info("Attempting to initialize database...")
    try:
        # Imported from database.py; creates tables from Base.metadata.
        # (The stripped diff left this call duplicated; it must run once.)
        init_db()
        logger.info("Database initialization process completed.")
    except Exception as e:
        # Top-level CLI boundary: log with traceback instead of crashing.
        logger.error(f"Error during database initialization: {e}", exc_info=True)
def scrape_data(state_abbr: str | None = None, refresh_metadata: bool = False):
    """Run the NewEnglandOil scraper.

    Args:
        state_abbr: Optional state abbreviation filter (e.g. "MA"); when
            None, all states are scraped.
        refresh_metadata: When True, existing phone/URL metadata may be
            overwritten with freshly scraped values.
    """
    logger.info("Starting the NewEnglandOil scraper...")
    if refresh_metadata:
        logger.info("Metadata refresh enabled: Existing phone/URL data may be overwritten.")
    if state_abbr:
        logger.info(f"Scraping restricted to state: {state_abbr}")
    try:
        # Single invocation of the package entry point; the interleaved
        # pre-refactor call has been dropped.
        run_scraper_main(refresh_metadata=refresh_metadata, target_state_abbr=state_abbr)
        logger.info("NewEnglandOil scraper finished.")
    except Exception as e:
        logger.error(f"Error during scraping process: {e}", exc_info=True)
def scrape_cheapest(state_abbr: str, refresh_metadata: bool = False):
    """Scrape CheapestOil market prices for one state and persist them.

    Args:
        state_abbr: Two-letter state abbreviation to scrape.
        refresh_metadata: When True, existing phone/URL metadata may be
            overwritten with freshly scraped values.
    """
    # Imported lazily so the heavier scraper package loads only when used.
    from cheapestoil import scrape_state

    logger.info(f"Starting CheapestOil scrape for {state_abbr}...")
    if refresh_metadata:
        logger.info("Metadata refresh enabled: Existing phone/URL data may be overwritten.")

    session = SessionLocal()
    try:
        # Map (state, county name) -> primary key so the scraper can
        # resolve counties without issuing per-row queries.
        lookup = {
            (county.state.strip(), county.name.strip()): county.id
            for county in session.query(models.County).all()
        }
        outcome = scrape_state(state_abbr, session, lookup, refresh_metadata=refresh_metadata)
        logger.info(f"CheapestOil result: {outcome}")
    except Exception as e:
        session.rollback()
        logger.error(f"Error during CheapestOil scrape: {e}", exc_info=True)
    finally:
        session.close()
def run_migration():
    """Execute the one-off data normalization migration script."""
    # Lazy import: the migration module is only needed for this action.
    from migrate_normalize import main as migrate_main

    logger.info("Running data normalization migration...")
    try:
        migrate_main()
    except Exception as e:
        logger.error(f"Error during migration: {e}", exc_info=True)
    else:
        logger.info("Migration completed.")
def start_server():
    """Launch the FastAPI crawler app with uvicorn, listening on 9553."""
    # Imported here so CLI-only actions don't require uvicorn to be installed.
    import uvicorn

    logger.info("Starting FastAPI crawler server on port 9553...")
    # "app:app" -> module app.py, ASGI object `app`; bind all interfaces.
    uvicorn.run("app:app", port=9553, host="0.0.0.0")
def main():
    """Parse CLI arguments and dispatch to the selected action.

    Actions: initdb | scrape | scrape-cheapest | migrate | server.
    The stale pre-refactor choices list, the embedded diff hunk header,
    and the dead `else` branch (argparse already rejects unknown actions)
    have been removed.
    """
    parser = argparse.ArgumentParser(description="Fuel Price Scraper Control Script")
    parser.add_argument(
        "action",
        choices=["initdb", "scrape", "scrape-cheapest", "migrate", "server"],
        help=(
            "'initdb' to initialize the database, "
            "'scrape' to run NewEnglandOil scraper, "
            "'scrape-cheapest' to run CheapestOil scraper, "
            "'migrate' to run data normalization migration, "
            "'server' to start the FastAPI server."
        ),
    )
    parser.add_argument(
        "--state",
        default=None,
        help="State abbreviation (MA, CT, ME, NH, RI, VT).",
    )
    parser.add_argument(
        "--refresh-metadata",
        action="store_true",
        help="Force refresh phone numbers and URLs, overwriting existing data.",
    )
    args = parser.parse_args()

    if args.action == "initdb":
        initialize_database()
    elif args.action == "scrape":
        scrape_data(state_abbr=args.state, refresh_metadata=args.refresh_metadata)
    elif args.action == "scrape-cheapest":
        # CheapestOil pages are per-state, so the state filter is mandatory.
        if not args.state:
            logger.error("--state is required for scrape-cheapest action")
            parser.print_help()
            return
        scrape_cheapest(args.state.upper(), refresh_metadata=args.refresh_metadata)
    elif args.action == "migrate":
        run_migration()
    elif args.action == "server":
        start_server()
if __name__ == "__main__":
    # Single entry-point call; the stripped diff had duplicated it.
    main()