refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers
- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper) - Add cheapestoil/ package as a secondary market price scraper - Add app.py entry point for direct execution - Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data - Update models.py with latest schema fields - Update requirements.txt dependencies - Update Dockerfile and docker-compose.yml for new structure - Remove deprecated fuel_scraper module, test.py, and log file Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
90
cheapestoil/company_matcher.py
Normal file
90
cheapestoil/company_matcher.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
Company name normalization and matching for cross-source deduplication.
|
||||
|
||||
Handles slight naming variations between NewEnglandOil and CheapestOil:
|
||||
"Fireman's Fuel Co." == "Firemans Fuel" after normalization.
|
||||
"""
|
||||
import re
|
||||
import logging
|
||||
|
||||
import sys
|
||||
import os
|
||||
# Prepend the parent of this file's directory to sys.path before the
# `import models` below, so the import machinery searches that directory first.
# NOTE(review): presumably that parent is the repository root where models.py
# lives -- confirm against the project layout.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
import models
|
||||
|
||||
# Legal-entity suffix words dropped from the END of a name during
# normalization. Every entry is a single lowercase word on purpose: a
# multi-word entry such as "fuel co" would also remove the trade word
# ("fuel"), so "Fireman's Fuel Co." could never match "Firemans Fuel".
_STRIP_SUFFIXES = ["enterprises", "company", "corp", "inc", "llc", "co"]

# Set view of the suffix words for O(1) whole-word membership tests.
_SUFFIX_WORDS = frozenset(_STRIP_SUFFIXES)

# Characters deleted outright: straight and typographic apostrophes,
# periods, commas and dollar signs.
_PUNCTUATION_RE = re.compile(r"['\u2019.,$]")


def normalize_company_name(name: str) -> str:
    """
    Normalize a company name for fuzzy matching.

    Steps:
        1. Strip whitespace, lowercase
        2. Replace '&' with the separate word 'and' (so "A&B" == "A & B")
        3. Remove punctuation (apostrophes, periods, commas, dollar signs)
        4. Remove trailing legal-entity suffix words (see _STRIP_SUFFIXES)
        5. Collapse multiple spaces

    Suffixes are matched as whole trailing words only, so names that merely
    end with the same letters ("Texaco", "Zinc") are left intact. Stacked
    suffixes ("Co., Inc.") are all removed. At least one word is always kept,
    so a name consisting solely of suffix words does not collapse to "".

    Args:
        name: Raw company name. None or an empty string yields "".

    Returns:
        Normalized string for comparison, e.g. both "Fireman's Fuel Co."
        and "Firemans Fuel" become "firemans fuel".
    """
    if not name:
        return ""

    s = name.strip().lower()
    # Pad the replacement so "a&b" and "a & b" tokenize identically.
    s = s.replace("&", " and ")
    s = _PUNCTUATION_RE.sub("", s)

    # Splitting on whitespace both tokenizes and collapses repeated spaces.
    words = s.split()
    # Pop suffix words from the end, but never the last remaining word.
    while len(words) > 1 and words[-1] in _SUFFIX_WORDS:
        words.pop()

    return " ".join(words)
|
||||
|
||||
|
||||
def find_existing_record(
    db_session: Session,
    raw_name: str,
    state_abbr: str,
    county_id: int | None,
) -> "models.OilPrice | None":
    """
    Look up an already-stored oil_prices row for the same company.

    Candidate rows are fetched for the given state, narrowed to the given
    county_id when one is supplied or to zone 0 when it is None. Names are
    then compared in Python after passing both sides through
    normalize_company_name().

    Args:
        db_session: SQLAlchemy session used to run the query
        raw_name: Raw company name from CheapestOil
        state_abbr: Two-letter state abbreviation
        county_id: County ID, or None to fall back to zone 0

    Returns:
        The first candidate whose normalized name equals the normalized
        raw_name, or None when the name normalizes to an empty string or
        no candidate matches.
    """
    wanted = normalize_company_name(raw_name)
    if not wanted:
        # Nothing meaningful to compare against.
        return None

    candidates = db_session.query(models.OilPrice).filter(
        models.OilPrice.state == state_abbr,
    )
    # Without a county the lookup is scoped to zone 0 instead.
    if county_id is None:
        candidates = candidates.filter(models.OilPrice.zone == 0)
    else:
        candidates = candidates.filter(models.OilPrice.county_id == county_id)

    return next(
        (
            row
            for row in candidates.all()
            if normalize_company_name(row.name) == wanted
        ),
        None,
    )
|
||||
Reference in New Issue
Block a user