refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers
- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
106
run.py
106
run.py
@@ -2,44 +2,100 @@
|
||||
import argparse
|
||||
import logging
|
||||
|
||||
# Import necessary functions/modules from your project
|
||||
# The 'import models' is crucial for init_db to know about the tables
|
||||
import models
|
||||
from database import init_db, SessionLocal
|
||||
from fuel_scraper import main as run_scraper_main # Import from modular package
|
||||
from newenglandoil import main as run_scraper_main
|
||||
|
||||
# Configure basic logging for the run.py script itself if needed
|
||||
# Your other modules (fuel_scraper, database) will have their own logging
|
||||
# or you might centralize logging configuration further.
|
||||
# For simplicity, we'll let fuel_scraper handle its detailed logging.
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def initialize_database():
    """Initialize the database by creating tables based on the ORM models.

    Relies on ``models`` being imported at module level so that
    ``Base.metadata`` is populated before ``init_db()`` creates tables.
    Errors are logged rather than raised so the CLI exits gracefully.
    """
    logger.info("Attempting to initialize database...")
    try:
        # It relies on models being imported so Base.metadata is populated
        init_db()
        logger.info("Database initialization process completed.")
    except Exception as e:
        logger.error(f"Error during database initialization: {e}", exc_info=True)
||||
def scrape_data(state_abbr: str | None = None, refresh_metadata: bool = False):
    """Run the NewEnglandOil scraper.

    Args:
        state_abbr: Optional state abbreviation; when given, scraping is
            restricted to that state.
        refresh_metadata: When True, existing phone/URL data may be
            overwritten with freshly scraped values.
    """
    logger.info("Starting the NewEnglandOil scraper...")
    if refresh_metadata:
        logger.info("Metadata refresh enabled: Existing phone/URL data may be overwritten.")
    if state_abbr:
        logger.info(f"Scraping restricted to state: {state_abbr}")

    try:
        # run_scraper_main is the package entry point imported from newenglandoil
        run_scraper_main(refresh_metadata=refresh_metadata, target_state_abbr=state_abbr)
        logger.info("NewEnglandOil scraper finished.")
    except Exception as e:
        logger.error(f"Error during scraping process: {e}", exc_info=True)
||||
def scrape_cheapest(state_abbr: str, refresh_metadata: bool = False):
    """Run the CheapestOil scraper for a single state.

    Args:
        state_abbr: State abbreviation to scrape (required).
        refresh_metadata: When True, existing phone/URL data may be
            overwritten with freshly scraped values.

    Opens its own database session and always closes it; on any error the
    session is rolled back and the error is logged rather than raised.
    """
    # Imported lazily so the CheapestOil dependency is only needed for this action.
    from cheapestoil import scrape_state

    logger.info(f"Starting CheapestOil scrape for {state_abbr}...")
    if refresh_metadata:
        logger.info("Metadata refresh enabled: Existing phone/URL data may be overwritten.")

    db_session = SessionLocal()
    try:
        # Build a (state, county-name) -> county-id lookup for the scraper.
        counties = db_session.query(models.County).all()
        county_lookup = {(c.state.strip(), c.name.strip()): c.id for c in counties}
        result = scrape_state(state_abbr, db_session, county_lookup, refresh_metadata=refresh_metadata)
        logger.info(f"CheapestOil result: {result}")
    except Exception as e:
        db_session.rollback()
        logger.error(f"Error during CheapestOil scrape: {e}", exc_info=True)
    finally:
        db_session.close()
|
||||
def run_migration():
    """Run the data normalization migration.

    Delegates to ``migrate_normalize.main``; the import is deferred so the
    migration module is only loaded when this action is requested. Errors
    are logged rather than raised.
    """
    from migrate_normalize import main as migrate_main

    logger.info("Running data normalization migration...")
    try:
        migrate_main()
        logger.info("Migration completed.")
    except Exception as e:
        logger.error(f"Error during migration: {e}", exc_info=True)
|
||||
def start_server():
    """Start the FastAPI crawler server on port 9553.

    Imports uvicorn lazily so the server dependency is only required for
    the 'server' action; serves the ASGI app defined in ``app.py``.
    """
    import uvicorn

    logger.info("Starting FastAPI crawler server on port 9553...")
    uvicorn.run("app:app", host="0.0.0.0", port=9553)
|
||||
def main():
    """Parse CLI arguments and dispatch to the requested action.

    Supported actions: 'initdb', 'scrape', 'scrape-cheapest' (requires
    --state), 'migrate', and 'server'. Shared flags: --state restricts
    scraping to one state; --refresh-metadata forces overwriting existing
    phone/URL data.
    """
    parser = argparse.ArgumentParser(description="Fuel Price Scraper Control Script")
    parser.add_argument(
        "action",
        choices=["initdb", "scrape", "scrape-cheapest", "migrate", "server"],
        help=(
            "'initdb' to initialize the database, "
            "'scrape' to run NewEnglandOil scraper, "
            "'scrape-cheapest' to run CheapestOil scraper, "
            "'migrate' to run data normalization migration, "
            "'server' to start the FastAPI server."
        ),
    )
    parser.add_argument(
        "--state",
        default=None,
        help="State abbreviation (MA, CT, ME, NH, RI, VT).",
    )
    parser.add_argument(
        "--refresh-metadata",
        action="store_true",
        help="Force refresh phone numbers and URLs, overwriting existing data.",
    )

    args = parser.parse_args()

    if args.action == "initdb":
        initialize_database()
    elif args.action == "scrape":
        scrape_data(state_abbr=args.state, refresh_metadata=args.refresh_metadata)
    elif args.action == "scrape-cheapest":
        # CheapestOil scrapes one state at a time, so --state is mandatory here.
        if not args.state:
            logger.error("--state is required for scrape-cheapest action")
            parser.print_help()
            return
        scrape_cheapest(args.state.upper(), refresh_metadata=args.refresh_metadata)
    elif args.action == "migrate":
        run_migration()
    elif args.action == "server":
        start_server()
||||
# Script entry point: only run the CLI when executed directly.
if __name__ == "__main__":
    main()
||||
|
||||
Reference in New Issue
Block a user