# crawler/cheapestoil/company_matcher.py

"""
Company name normalization and matching for cross-source deduplication.

Handles slight naming variations between NewEnglandOil and CheapestOil:
    "Fireman's Fuel Co." == "Firemans Fuel" after normalization.
"""
import re
import logging

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from sqlalchemy.orm import Session
import models

# Trailing business-entity words to strip during normalization.
#
# An entry is removed only when it is a whole trailing word (i.e. preceded by
# a space), so names such as "Costco" or "Texaco" are left intact.
#
# Product words are deliberately not part of any suffix: stripping "fuel co"
# or "oil co" as a unit turns "Fireman's Fuel Co." into "firemans", which no
# longer equals the normalized form of "Firemans Fuel".
#
# Only the first matching entry is stripped; if multi-word entries are ever
# added, list them before any shorter entry they end with.
_STRIP_SUFFIXES = [
    "enterprises", "company", "corp", "inc", "llc", "co",
]


def normalize_company_name(name: str) -> str:
    """
    Normalize a company name for fuzzy matching.

    Steps:
        1. Strip whitespace, lowercase
        2. Replace '&' with the separate word 'and'
        3. Remove punctuation (straight and typographic apostrophes,
           periods, commas, dollar signs)
        4. Collapse runs of whitespace to a single space
        5. Remove one trailing business suffix from _STRIP_SUFFIXES, only
           when it is a whole word following the rest of the name

    Example:
        "Fireman's Fuel Co." and "Firemans Fuel" both become "firemans fuel".

    Args:
        name: Raw company name. None or an empty string is tolerated.

    Returns:
        Normalized string for comparison; "" when name is None or empty.
    """
    if not name:
        return ""

    s = name.strip().lower()
    # Pad with spaces so "A&B" and "A & B" normalize identically; the extra
    # whitespace is collapsed below.
    s = s.replace("&", " and ")
    s = re.sub(r"['.,$\u2018\u2019]", "", s)
    # Collapse whitespace BEFORE the suffix check so the word-boundary test
    # below only ever has to look for a single space.
    s = re.sub(r"\s+", " ", s).strip()

    for suffix in _STRIP_SUFFIXES:
        # Require a preceding space: the suffix must be its own word and must
        # not be the entire name (a company literally called "Co" stays "co").
        if s.endswith(" " + suffix):
            s = s[: -len(suffix)].rstrip()
            break
    return s


def find_existing_record(
    db_session: Session,
    raw_name: str,
    state_abbr: str,
    county_id: int | None,
) -> "models.OilPrice | None":
    """
    Look up an oil_prices row whose normalized company name equals raw_name's.

    Candidate rows are selected by state and then narrowed either by
    county_id (when one is given) or by zone == 0 (when county_id is None).
    The name comparison happens in Python via normalize_company_name, not in
    SQL.

    Args:
        db_session: SQLAlchemy session used to run the query
        raw_name: Raw company name to look for
        state_abbr: Value compared against OilPrice.state
        county_id: County ID to filter on, or None to fall back to zone == 0

    Returns:
        The first candidate (in the order the query returns rows) whose
        normalized name matches, or None if raw_name normalizes to an empty
        string or nothing matches.
    """
    wanted = normalize_company_name(raw_name)
    if not wanted:
        # Nothing meaningful to compare against; skip the query entirely.
        return None

    # Pick the location filter: explicit county when known, else zone 0.
    if county_id is None:
        location_filter = models.OilPrice.zone == 0
    else:
        location_filter = models.OilPrice.county_id == county_id

    candidates = (
        db_session.query(models.OilPrice)
        .filter(models.OilPrice.state == state_abbr)
        .filter(location_filter)
        .all()
    )

    return next(
        (row for row in candidates if normalize_company_name(row.name) == wanted),
        None,
    )