"""
Company name normalization and matching for cross-source deduplication.

Handles slight naming variations between NewEnglandOil and CheapestOil:
"Fireman's Fuel, Inc." == "Firemans Fuel" after normalization.
"""

import re
import logging
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from sqlalchemy.orm import Session

import models

# Suffixes to strip during normalization. Only the FIRST matching entry is
# removed, so multi-word suffixes ("oil co", "fuel co") must come before the
# shorter suffix they end with ("co").
_STRIP_SUFFIXES = [
    "enterprises",
    "company",
    "oil co",
    "fuel co",
    "corp",
    "inc",
    "llc",
    "co",
]

# Characters deleted outright: apostrophe, period, comma, dollar sign.
_PUNCTUATION_RE = re.compile(r"['.,$]")
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_company_name(name: str) -> str:
    """
    Normalize a company name for fuzzy matching.

    Steps:
      1. Strip whitespace, lowercase
      2. Replace '&' with 'and'
      3. Remove punctuation (apostrophes, periods, commas, dollar signs)
      4. Collapse runs of whitespace to a single space
      5. Remove at most one trailing suffix from ``_STRIP_SUFFIXES``

    A suffix is removed only when it is a separate trailing word (preceded
    by a space), so "Costco" stays "costco" instead of becoming "cost".
    A name that consists solely of a suffix (e.g. "Inc.") is kept as-is
    rather than normalized to an empty string.

    Note that the compound suffixes are removed whole: "Smith Oil Co."
    normalizes to "smith", while "Smith Oil" stays "smith oil".

    Args:
        name: Raw company name. ``None`` or an empty string yields "".

    Returns:
        Normalized string for comparison.
    """
    if not name:
        return ""

    s = name.strip().lower()
    s = s.replace("&", "and")
    s = _PUNCTUATION_RE.sub("", s)

    # Collapse whitespace BEFORE suffix matching so that irregular spacing
    # ("smith  oil  co") cannot hide a multi-word suffix.
    s = _WHITESPACE_RE.sub(" ", s).strip()

    for suffix in _STRIP_SUFFIXES:
        # Require a word boundary: the suffix must follow a space.
        if s.endswith(" " + suffix):
            s = s[: -len(suffix)].rstrip()
            break

    return s


def find_existing_record(
    db_session: Session,
    raw_name: str,
    state_abbr: str,
    county_id: int | None,
) -> "models.OilPrice | None":
    """
    Find an existing oil_prices record that matches by normalized company name.

    Queries all records for the given state+county_id (or state+zone=0
    if no county), then compares normalized names in Python.

    Args:
        db_session: SQLAlchemy session
        raw_name: Raw company name from CheapestOil
        state_abbr: Two-letter state abbreviation
        county_id: County ID or None

    Returns:
        The first OilPrice record whose normalized name equals the
        normalized ``raw_name``, or None if ``raw_name`` normalizes to an
        empty string or nothing matches.
    """
    target = normalize_company_name(raw_name)
    if not target:
        return None

    query = db_session.query(models.OilPrice).filter(
        models.OilPrice.state == state_abbr,
    )
    if county_id is not None:
        query = query.filter(models.OilPrice.county_id == county_id)
    else:
        # No county available: restrict to zone 0 within the state.
        query = query.filter(models.OilPrice.zone == 0)

    # Normalization is done in Python, so every candidate row is loaded and
    # compared one by one; the first match wins.
    for record in query.all():
        if normalize_company_name(record.name) == target:
            return record
    return None