refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers
- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper) - Add cheapestoil/ package as a secondary market price scraper - Add app.py entry point for direct execution - Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data - Update models.py with latest schema fields - Update requirements.txt dependencies - Update Dockerfile and docker-compose.yml for new structure - Remove deprecated fuel_scraper module, test.py, and log file Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
111
cheapestoil/parsers.py
Normal file
111
cheapestoil/parsers.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
Parsers for CheapestOil API response data.
|
||||
|
||||
API returns arrays like:
|
||||
[name, 150gal_price, 300gal_price, 500gal_price, service_area, updated, link, flag]
|
||||
|
||||
Price fields come as HTML strings like "$3.69<br />(Total $553.50*)"
|
||||
"""
|
||||
import re
|
||||
import logging
|
||||
|
||||
# Common abbreviations that should stay uppercase after title-casing
|
||||
_KEEP_UPPER = {"LLC", "INC", "LP", "HVAC", "II", "III", "IV", "USA"}
|
||||
|
||||
|
||||
def _smart_title(name: str) -> str:
|
||||
"""Convert a company name to title case, preserving common abbreviations."""
|
||||
words = name.title().split()
|
||||
return " ".join(w.upper() if w.upper() in _KEEP_UPPER else w for w in words)
|
||||
|
||||
|
||||
def parse_price_150(price_html: str) -> float | None:
|
||||
"""
|
||||
Extract the per-gallon price from a CheapestOil price field.
|
||||
|
||||
Examples:
|
||||
"$3.69<br />(Total $553.50*)" -> 3.69
|
||||
"$4.199" -> 4.199
|
||||
"" -> None
|
||||
|
||||
Args:
|
||||
price_html: Raw price string from the API
|
||||
|
||||
Returns:
|
||||
Float price or None if unparseable.
|
||||
"""
|
||||
if not price_html or not isinstance(price_html, str):
|
||||
return None
|
||||
# The per-gallon price is the first dollar amount before any <br> tag
|
||||
match = re.search(r'\$(\d+\.\d+)', price_html)
|
||||
if match:
|
||||
try:
|
||||
return float(match.group(1))
|
||||
except ValueError:
|
||||
pass
|
||||
logging.warning(f"Could not parse price from: {price_html!r}")
|
||||
return None
|
||||
|
||||
|
||||
def parse_company_record(row: list, county_name: str | None) -> dict | None:
    """
    Convert an API row array to a structured dict.

    Expected row format:
        [0] name
        [1] 150gal price (HTML)
        [2] 300gal price (HTML)
        [3] 500gal price (HTML)
        [4] service area text
        [5] last updated date string
        [6] company link/slug
        [7] flag/badge

    Args:
        row: Raw array from the API
        county_name: County name this row came from (None for state-level)

    Returns:
        Dict with {slug, name, price, service_area, county_name, date, url},
        or None when the row is not a list, has fewer than 6 entries, or has
        an empty name. ``price`` is None when the 150gal field cannot be
        parsed. At most one of ``url`` / ``slug`` is set: ``url`` when row[6]
        is an absolute http(s) link, ``slug`` for any other non-empty value.
    """
    if not isinstance(row, list) or len(row) < 6:
        logging.warning("Skipping malformed row: %r", row)
        return None

    name = str(row[0]).strip() if row[0] else ""
    if not name:
        return None

    # Apply title case normalization
    name = _smart_title(name)

    price = parse_price_150(str(row[1]) if row[1] else "")
    service_area = str(row[4]).strip() if row[4] else ""

    # DB column is VARCHAR(20), truncate to fit
    date_str = (str(row[5]).strip() if row[5] else "")[:20]

    # row[6] carries either a real external URL or a slug for the
    # cheapestoil detail page. Require a full scheme prefix so a slug that
    # merely begins with "http" is not mistaken for a URL.
    url = None
    slug = None
    raw_link = str(row[6]).strip() if len(row) > 6 and row[6] else ""
    if raw_link:
        if raw_link.lower().startswith(("http://", "https://")):
            url = raw_link
        else:
            slug = raw_link

    return {
        "slug": slug,  # Return slug so scraper can use it to fetch details
        "name": name,
        "price": price,
        "service_area": service_area,
        "county_name": county_name,
        "date": date_str,
        "url": url,
    }
|
||||
Reference in New Issue
Block a user