- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper)
- Add cheapestoil/ package as a secondary market price scraper
- Add app.py entry point for direct execution
- Update run.py: new scrape_cheapest(), migrate command, --state filter, and --refresh-metadata flag for overwriting existing phone/URL data
- Update models.py with latest schema fields
- Update requirements.txt dependencies
- Update Dockerfile and docker-compose.yml for new structure
- Remove deprecated fuel_scraper module, test.py, and log file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
91 lines
2.4 KiB
Python
91 lines
2.4 KiB
Python
"""
|
|
Company name normalization and matching for cross-source deduplication.
|
|
|
|
Handles slight naming variations between NewEnglandOil and CheapestOil:
|
|
"Fireman's Fuel Co." == "Firemans Fuel" after normalization.
|
|
"""
|
|
import re
|
|
import logging
|
|
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from sqlalchemy.orm import Session
|
|
import models
|
|
|
|
# Legal-form suffixes to strip during normalization (order matters: longer first).
# Product words are deliberately not part of any entry: stripping "fuel co" as a
# unit would turn "Firemans Fuel Co" into "firemans" while "Firemans Fuel" stays
# "firemans fuel", so the two spellings of the same dealer would never match.
_STRIP_SUFFIXES = [
    "enterprises", "company", "corp", "inc", "llc", "co",
]

# Compiled once; normalize_company_name() is called per candidate row.
_PUNCTUATION_RE = re.compile(r"['.,$]")
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_company_name(name: str) -> str:
    """
    Normalize a company name for fuzzy matching.

    Steps:
      1. Strip whitespace, lowercase
      2. Replace '&' with ' and ' (so "A&B" and "A & B" normalize alike)
      3. Remove punctuation (apostrophes, periods, commas, dollar signs)
      4. Collapse multiple spaces
      5. Remove one trailing legal-form suffix, matched as a whole word

    Args:
        name: Raw company name. None or an empty string yields "".

    Returns:
        Normalized string for comparison.
    """
    if not name:
        return ""

    s = name.strip().lower()
    # Pad the replacement so "smith&sons" does not collapse into "smithandsons".
    s = s.replace("&", " and ")
    s = _PUNCTUATION_RE.sub("", s)
    # Collapse whitespace before the suffix test so a single space is a
    # reliable word boundary.
    s = _WHITESPACE_RE.sub(" ", s).strip()

    # Remove at most one suffix, and only when it is a whole trailing word
    # (or the entire name) -- "texaco" must not lose its final "co".
    for suffix in _STRIP_SUFFIXES:
        if s == suffix or s.endswith(" " + suffix):
            s = s[: -len(suffix)].rstrip()
            break
    return s
|
|
|
|
|
|
def find_existing_record(
    db_session: Session,
    raw_name: str,
    state_abbr: str,
    county_id: int | None,
) -> "models.OilPrice | None":
    """
    Look up an oil_prices row whose normalized company name equals that of raw_name.

    Candidate rows are fetched for the given state, narrowed to county_id when
    one is supplied and to rows with zone == 0 otherwise. Names are then
    compared in Python after running both sides through
    normalize_company_name().

    Args:
        db_session: SQLAlchemy session
        raw_name: Raw company name from CheapestOil
        state_abbr: Two-letter state abbreviation
        county_id: County ID or None

    Returns:
        The first matching OilPrice record, or None when the normalized name
        is empty or no candidate matches.
    """
    wanted = normalize_company_name(raw_name)
    if not wanted:
        # Nothing meaningful left to compare against.
        return None

    # Pick the secondary filter up front: county when known, zone 0 otherwise.
    if county_id is None:
        scope = models.OilPrice.zone == 0
    else:
        scope = models.OilPrice.county_id == county_id

    candidates = (
        db_session.query(models.OilPrice)
        .filter(models.OilPrice.state == state_abbr)
        .filter(scope)
        .all()
    )

    return next(
        (row for row in candidates if normalize_company_name(row.name) == wanted),
        None,
    )
|