refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers

- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter,
  --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-06 11:34:21 -05:00
parent 8f45f4c209
commit 1592e6d685
26 changed files with 3221 additions and 1468 deletions

View File

@@ -0,0 +1,90 @@
"""
Company name normalization and matching for cross-source deduplication.
Handles slight naming variations between NewEnglandOil and CheapestOil:
"Fireman's Fuel Co." == "Firemans Fuel" after normalization.
"""
import re
import logging
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from sqlalchemy.orm import Session
import models
# Trailing corporate designators removed during normalization. Each entry is
# matched as a whole trailing word, and at most one is removed per name.
# Compound entries such as "oil co" / "fuel co" are deliberately absent: they
# would also remove the descriptive word ("fuel"), so "Fireman's Fuel Co."
# would no longer match "Firemans Fuel".
_STRIP_SUFFIXES = [
    "enterprises", "company", "corp", "inc", "llc", "co",
]


def normalize_company_name(name: str) -> str:
    """
    Normalize a company name for fuzzy matching.

    Steps:
    1. Strip whitespace, lowercase
    2. Replace '&' with ' and ' (so "A&B" and "A & B" agree)
    3. Remove punctuation (straight and typographic apostrophes, periods,
       commas, dollar signs)
    4. Collapse multiple spaces
    5. Remove one trailing corporate suffix, only when it is a separate word

    Args:
        name: Raw company name. None or an empty string yields "".

    Returns:
        Normalized string for comparison.
    """
    if not name:
        return ""

    s = name.strip().lower()
    # Pad the replacement so "a&b" does not collapse into "aandb".
    s = s.replace("&", " and ")
    s = re.sub(r"['.,$\u2018\u2019]", "", s)
    # Collapse whitespace before the suffix check so the word-boundary test
    # below only ever has to look for a single space.
    s = re.sub(r"\s+", " ", s).strip()

    for suffix in _STRIP_SUFFIXES:
        # Require a preceding space: a bare endswith(suffix) would mangle
        # names like "costco" -> "cost" or "zinc" -> "z".
        if s.endswith(" " + suffix):
            s = s[: -len(suffix)].rstrip()
            break

    return s
def find_existing_record(
    db_session: Session,
    raw_name: str,
    state_abbr: str,
    county_id: int | None,
) -> "models.OilPrice | None":
    """
    Look up an oil_prices row whose normalized company name equals that of
    ``raw_name``.

    Candidate rows are selected by state, then narrowed by ``county_id`` when
    one is given, or by ``zone == 0`` when it is None. Name comparison is done
    in Python on the normalized form of each candidate's name.

    Args:
        db_session: SQLAlchemy session
        raw_name: Raw company name to match
        state_abbr: State abbreviation compared against OilPrice.state
        county_id: County ID, or None to fall back to zone 0

    Returns:
        The first candidate whose normalized name matches, or None when
        ``raw_name`` normalizes to an empty string or nothing matches.
    """
    wanted = normalize_company_name(raw_name)
    if not wanted:
        # Nothing meaningful to compare against; skip the database round trip.
        return None

    # Secondary scope: the county when known, otherwise zone 0.
    if county_id is None:
        scope = models.OilPrice.zone == 0
    else:
        scope = models.OilPrice.county_id == county_id

    candidates = (
        db_session.query(models.OilPrice)
        .filter(models.OilPrice.state == state_abbr)
        .filter(scope)
        .all()
    )

    return next(
        (row for row in candidates if normalize_company_name(row.name) == wanted),
        None,
    )