refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers

- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-06 11:34:21 -05:00
parent 8f45f4c209
commit 1592e6d685
26 changed files with 3221 additions and 1468 deletions
--- a/cheapestoil/company_matcher.py
+++ b/cheapestoil/company_matcher.py
@@ -0,0 +1,90 @@
+"""
+Company name normalization and matching for cross-source deduplication.
+
+Handles slight naming variations between NewEnglandOil and CheapestOil:
+    "Fireman's Fuel, Inc." == "Firemans Fuel LLC" after normalization.
+"""
+import re
+import logging
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from sqlalchemy.orm import Session
+import models
+
+# Trailing business-designator suffixes removed during normalization.
+# Order matters: only the first matching entry is removed, so the two-word
+# entries ("oil co", "fuel co") must precede the bare "co" they end with.
+# NOTE(review): "fuel co"/"oil co" are removed as a unit, so "Firemans Fuel Co"
+# normalizes to "firemans" while "Firemans Fuel" stays "firemans fuel" --
+# confirm that this is the intended matching behavior.
+_STRIP_SUFFIXES = [
+    "enterprises", "company", "oil co", "fuel co", "corp", "inc", "llc", "co",
+]
+
+
+def normalize_company_name(name: str | None) -> str:
+    """
+    Normalize a company name for fuzzy matching.
+
+    Steps:
+        1. Strip whitespace, lowercase
+        2. Replace '&' with ' and ' so "A&B" and "A & B" agree
+        3. Remove punctuation (apostrophes, periods, commas, dollar signs)
+        4. Collapse runs of whitespace to a single space
+        5. Remove one trailing suffix from _STRIP_SUFFIXES, but only when it
+           is a separate word ("Acme Inc" -> "acme"; "Zinc" stays "zinc")
+
+    Args:
+        name: Raw company name. None or an empty string yields "".
+
+    Returns:
+        Normalized string for comparison.
+    """
+    if not name:
+        # Missing names (None / "") normalize to "" instead of raising.
+        return ""
+
+    s = name.strip().lower()
+    # Pad the replacement so "A&B" and "A & B" both become "a and b" once
+    # whitespace is collapsed below.
+    s = s.replace("&", " and ")
+    # Inside a character class `$` is a literal dollar sign, not an anchor,
+    # so dollar signs are dropped together with apostrophes/periods/commas.
+    s = re.sub(r"['.,$]", "", s)
+    # Collapse whitespace before suffix matching so the single-space checks
+    # below also work for inputs such as "Smith  Oil   Co".
+    s = re.sub(r"\s+", " ", s).strip()
+
+    for suffix in _STRIP_SUFFIXES:
+        # Require a preceding space so a suffix is removed only as a whole
+        # trailing word; a bare endswith() would turn "Texaco" into "texa".
+        if s.endswith(" " + suffix):
+            s = s[: -len(suffix)].rstrip()
+            break
+    return s
+
+
+def find_existing_record(
+    db_session: Session,
+    raw_name: str,
+    state_abbr: str,
+    county_id: int | None,
+) -> "models.OilPrice | None":
+    """
+    Look up an oil_prices row whose normalized company name equals that of
+    ``raw_name``.
+
+    Candidate rows are selected in SQL by state plus either the given
+    county_id or, when county_id is None, zone == 0. The name comparison
+    itself is done in Python on the normalized strings.
+
+    Args:
+        db_session: SQLAlchemy session
+        raw_name: Raw company name from CheapestOil
+        state_abbr: Two-letter state abbreviation
+        county_id: County ID or None
+
+    Returns:
+        The first candidate OilPrice whose normalized name matches, or None
+        when nothing matches or raw_name normalizes to an empty string.
+    """
+    wanted = normalize_company_name(raw_name)
+    if not wanted:
+        # Nothing meaningful to compare against.
+        return None
+
+    # Pick the second filter: an explicit county when one is supplied,
+    # otherwise fall back to zone 0.
+    if county_id is None:
+        scope_filter = models.OilPrice.zone == 0
+    else:
+        scope_filter = models.OilPrice.county_id == county_id
+
+    candidates = (
+        db_session.query(models.OilPrice)
+        .filter(models.OilPrice.state == state_abbr)
+        .filter(scope_filter)
+        .all()
+    )
+
+    # First candidate whose normalized name equals the target, else None.
+    return next(
+        (row for row in candidates if normalize_company_name(row.name) == wanted),
+        None,
+    )