refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers
- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper) - Add cheapestoil/ package as a secondary market price scraper - Add app.py entry point for direct execution - Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data - Update models.py with latest schema fields - Update requirements.txt dependencies - Update Dockerfile and docker-compose.yml for new structure - Remove deprecated fuel_scraper module, test.py, and log file Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
4
cheapestoil/__init__.py
Normal file
4
cheapestoil/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# cheapestoil package
# Public entry point: scrape_state runs a full CheapestOil scrape for one state.
from .scraper import scrape_state

__all__ = ["scrape_state"]
|
||||
136
cheapestoil/api_client.py
Normal file
136
cheapestoil/api_client.py
Normal file
@@ -0,0 +1,136 @@
|
||||
"""
|
||||
HTTP client for the CheapestOil JSON API.
|
||||
"""
|
||||
import logging
import re

import requests
from bs4 import BeautifulSoup

from .config import API_URL
|
||||
|
||||
# Browser-like User-Agent so the site serves the same markup it gives real
# browsers (default python-requests agents are often blocked or served less).
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

# Per-request timeout (seconds) applied to every HTTP call in this module.
REQUEST_TIMEOUT = 20
|
||||
|
||||
|
||||
def fetch_company_details(slug: str) -> dict:
    """
    Fetch company details (real URL, phone) from their CheapestOil profile page.

    Args:
        slug: The company slug/path (e.g. "Abc-Oil-Company")

    Returns:
        Dict with keys: "url" (str|None), "phone" (str|None)
    """
    if not slug:
        return {"url": None, "phone": None}

    # Absolute links are used verbatim; bare slugs resolve against the site root.
    page_url = slug if slug.startswith("http") else f"https://www.cheapestoil.com/{slug}"

    try:
        response = requests.get(page_url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        real_url = None
        phone = None

        # --- Real website URL ---
        # Preferred signal: an explicit "Visit Website"-style anchor whose
        # target is off-site (i.e. not another cheapestoil.com page).
        anchor = soup.find('a', string=re.compile(r"Visit Website|Company Website", re.IGNORECASE))
        if anchor:
            href = anchor.get('href')
            if href and 'cheapestoil.com' not in href and href.startswith('http'):
                real_url = href

        # Fallback heuristic: first external link inside the main content column.
        if real_url is None:
            main_col = soup.find('div', class_='col-md-8')  # Common bootstrap main col
            if main_col:
                externals = (
                    a['href']
                    for a in main_col.find_all('a', href=True)
                    if a['href'].startswith('http') and 'cheapestoil.com' not in a['href']
                )
                real_url = next(externals, None)

        # --- Phone number ---
        page_text = soup.get_text(" ", strip=True)

        # Prefer a number introduced by "Phone:"/"Tel:"/"Call"; fall back to
        # any bare US 10-digit phone pattern anywhere on the page.
        labeled = re.search(r'(?:Phone|Tel|Call).*?(\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', page_text, re.IGNORECASE)
        if labeled:
            candidate = labeled.group(1)
        else:
            bare = re.search(r'(?:\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', page_text)
            candidate = bare.group(0) if bare else None

        if candidate:
            digits = re.sub(r'\D', '', candidate)
            # Normalize clean 10-digit numbers to "(###) ###-####"; keep the
            # raw capture otherwise (extensions, country codes, etc.).
            if len(digits) == 10:
                phone = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
            else:
                phone = candidate

        return {"url": real_url, "phone": phone}

    except Exception as e:
        # Best-effort boundary: detail pages vary wildly, so any failure just
        # yields empty metadata rather than aborting the scrape.
        logging.warning(f"Failed to fetch details for {slug}: {e}")
        return {"url": None, "phone": None}
|
||||
|
||||
|
||||
|
||||
def fetch_county_prices(state_api_name: str, county_name: str | None = None) -> list:
    """
    Fetch price data from the CheapestOil API.

    Args:
        state_api_name: State name as used by the API (e.g. "Massachusetts", "NewHampshire")
        county_name: County name filter, or None for state-level results

    Returns:
        List of raw JSON arrays from the API, or empty list on failure.
    """
    query = {
        "sort": 0,
        "state": state_api_name,
        "county": county_name if county_name else "",
        "zip": "",
    }
    try:
        response = requests.get(
            API_URL, params=query, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        payload = response.json()
    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching CheapestOil API for {state_api_name}/{county_name}: {e}")
        return []
    except ValueError as e:
        logging.error(f"Invalid JSON from CheapestOil API: {e}")
        return []

    # The endpoint is expected to return a JSON array of row arrays; anything
    # else (error object, HTML) is treated as "no data".
    if not isinstance(payload, list):
        logging.warning(f"Unexpected response type from API: {type(payload)}")
        return []
    return payload
|
||||
90
cheapestoil/company_matcher.py
Normal file
90
cheapestoil/company_matcher.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
Company name normalization and matching for cross-source deduplication.
|
||||
|
||||
Handles slight naming variations between NewEnglandOil and CheapestOil:
|
||||
"Fireman's Fuel Co." == "Firemans Fuel" after normalization.
|
||||
"""
|
||||
import re
|
||||
import logging
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
import models
|
||||
|
||||
# Corporate designator words stripped from the end of a name during
# normalization. Stripping repeats, so "Oil Co Inc" sheds both "inc" and
# "co" while meaningful words ("oil") survive. Only whole trailing words
# are removed — plain endswith() would mangle e.g. "Sunoco" -> "suno".
_STRIP_SUFFIXES = [
    "enterprises", "company", "corp", "inc", "llc", "co",
]


def normalize_company_name(name: str) -> str:
    """
    Normalize a company name for fuzzy matching.

    Steps:
        1. Strip whitespace, lowercase
        2. Replace '&' with 'and'
        3. Remove punctuation (apostrophes, periods, commas)
        4. Repeatedly remove trailing corporate-suffix words
        5. Collapse multiple spaces

    Examples:
        "Fireman's Fuel Co."  -> "firemans fuel"
        "Firemans Fuel"       -> "firemans fuel"
        "Sunoco"              -> "sunoco"   (suffixes match whole words only)

    Args:
        name: Raw company name

    Returns:
        Normalized string for comparison.
    """
    s = name.strip().lower()
    s = s.replace("&", "and")
    s = re.sub(r"['.,$]", "", s)
    s = re.sub(r"\s+", " ", s).strip()

    # Peel designator words off the end one at a time. Requiring a leading
    # space (or an exact match) keeps real words intact: the old raw
    # endswith() check turned "sunoco" into "suno" and stripped the word
    # "fuel" from "firemans fuel co", breaking the documented example above.
    stripped = True
    while stripped:
        stripped = False
        for suffix in _STRIP_SUFFIXES:
            if s == suffix:
                # Name consists only of designators; nothing meaningful left.
                return ""
            if s.endswith(" " + suffix):
                s = s[: -(len(suffix) + 1)].rstrip()
                stripped = True
                break
    return s
|
||||
|
||||
|
||||
def find_existing_record(
    db_session: Session,
    raw_name: str,
    state_abbr: str,
    county_id: int | None,
) -> "models.OilPrice | None":
    """
    Find an existing oil_prices record that matches by normalized company name.

    Queries all records for the given state+county_id (or state+zone=0 if no county),
    then compares normalized names in Python.

    Args:
        db_session: SQLAlchemy session
        raw_name: Raw company name from CheapestOil
        state_abbr: Two-letter state abbreviation
        county_id: County ID or None

    Returns:
        Matching OilPrice record or None.
    """
    wanted = normalize_company_name(raw_name)
    if not wanted:
        return None

    # Narrow the candidate set in SQL first: same state, then either the same
    # county or — when no county is known — zone 0.
    candidates = db_session.query(models.OilPrice).filter(
        models.OilPrice.state == state_abbr,
    )
    if county_id is None:
        candidates = candidates.filter(models.OilPrice.zone == 0)
    else:
        candidates = candidates.filter(models.OilPrice.county_id == county_id)

    # Name normalization can't be expressed portably in SQL, so the fuzzy
    # comparison happens in Python over the narrowed set.
    matches = (
        rec for rec in candidates.all()
        if normalize_company_name(rec.name) == wanted
    )
    return next(matches, None)
|
||||
50
cheapestoil/config.py
Normal file
50
cheapestoil/config.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""
|
||||
Configuration for the CheapestOil scraper.
|
||||
"""
|
||||
|
||||
API_URL = "https://www.cheapestoil.com/heating-oil-prices/api"
|
||||
|
||||
# Seconds between requests to be polite
|
||||
SCRAPE_DELAY = 2
|
||||
|
||||
# State abbreviation -> list of county names on cheapestoil.com
|
||||
# None means state-level only (no county filter)
|
||||
STATE_COUNTIES = {
|
||||
"MA": [
|
||||
"Barnstable", "Berkshire", "Bristol", "Essex", "Franklin",
|
||||
"Hampden", "Hampshire", "Middlesex", "Norfolk", "Plymouth",
|
||||
"Suffolk", "Worcester",
|
||||
],
|
||||
"CT": [
|
||||
"Fairfield", "Hartford", "Litchfield", "Middlesex",
|
||||
"New Haven", "New London", "Tolland", "Windham",
|
||||
],
|
||||
"ME": [
|
||||
"Cumberland", "York", "Penobscot", "Kennebec", "Androscoggin",
|
||||
"Aroostook", "Oxford", "Hancock", "Somerset", "Knox",
|
||||
"Waldo", "Sagadahoc", "Lincoln", "Washington", "Franklin",
|
||||
"Piscataquis",
|
||||
],
|
||||
"NH": [
|
||||
"Belknap", "Carroll", "Cheshire", "Coos", "Grafton",
|
||||
"Hillsborough", "Merrimack", "Rockingham", "Strafford", "Sullivan",
|
||||
],
|
||||
"RI": [
|
||||
"Bristol", "Kent", "Newport", "Providence", "Washington",
|
||||
],
|
||||
"VT": [
|
||||
"Addison", "Bennington", "Caledonia", "Chittenden", "Essex",
|
||||
"Franklin", "Grand Isle", "Lamoille", "Orange", "Orleans",
|
||||
"Rutland", "Washington", "Windham", "Windsor",
|
||||
],
|
||||
}
|
||||
|
||||
# State abbreviation -> API state name (as used in cheapestoil.com params)
|
||||
STATE_API_NAMES = {
|
||||
"MA": "Massachusetts",
|
||||
"CT": "Connecticut",
|
||||
"ME": "Maine",
|
||||
"NH": "NewHampshire",
|
||||
"RI": "RhodeIsland",
|
||||
"VT": "Vermont",
|
||||
}
|
||||
111
cheapestoil/parsers.py
Normal file
111
cheapestoil/parsers.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
Parsers for CheapestOil API response data.
|
||||
|
||||
API returns arrays like:
|
||||
[name, 150gal_price, 300gal_price, 500gal_price, service_area, updated, link, flag]
|
||||
|
||||
Price fields come as HTML strings like "$3.69<br />(Total $553.50*)"
|
||||
"""
|
||||
import re
|
||||
import logging
|
||||
|
||||
# Common abbreviations that should stay uppercase after title-casing
|
||||
_KEEP_UPPER = {"LLC", "INC", "LP", "HVAC", "II", "III", "IV", "USA"}
|
||||
|
||||
|
||||
def _smart_title(name: str) -> str:
|
||||
"""Convert a company name to title case, preserving common abbreviations."""
|
||||
words = name.title().split()
|
||||
return " ".join(w.upper() if w.upper() in _KEEP_UPPER else w for w in words)
|
||||
|
||||
|
||||
def parse_price_150(price_html: str) -> float | None:
|
||||
"""
|
||||
Extract the per-gallon price from a CheapestOil price field.
|
||||
|
||||
Examples:
|
||||
"$3.69<br />(Total $553.50*)" -> 3.69
|
||||
"$4.199" -> 4.199
|
||||
"" -> None
|
||||
|
||||
Args:
|
||||
price_html: Raw price string from the API
|
||||
|
||||
Returns:
|
||||
Float price or None if unparseable.
|
||||
"""
|
||||
if not price_html or not isinstance(price_html, str):
|
||||
return None
|
||||
# The per-gallon price is the first dollar amount before any <br> tag
|
||||
match = re.search(r'\$(\d+\.\d+)', price_html)
|
||||
if match:
|
||||
try:
|
||||
return float(match.group(1))
|
||||
except ValueError:
|
||||
pass
|
||||
logging.warning(f"Could not parse price from: {price_html!r}")
|
||||
return None
|
||||
|
||||
|
||||
def parse_company_record(row: list, county_name: str | None) -> dict | None:
    """
    Convert an API row array to a structured dict.

    Expected row format:
        [0] name
        [1] 150gal price (HTML)
        [2] 300gal price (HTML)
        [3] 500gal price (HTML)
        [4] service area text
        [5] last updated date string
        [6] company link/slug
        [7] flag/badge

    Args:
        row: Raw array from the API
        county_name: County name this row came from (None for state-level)

    Returns:
        Dict with {slug, name, price, service_area, county_name, date, url},
        or None when the row is malformed or has no company name.
    """
    if not isinstance(row, list) or len(row) < 6:
        logging.warning(f"Skipping malformed row: {row!r}")
        return None

    name = str(row[0]).strip() if row[0] else ""
    if not name:
        return None

    # Apply title case normalization
    name = _smart_title(name)

    price = parse_price_150(str(row[1]) if row[1] else "")
    service_area = str(row[4]).strip() if row[4] else ""
    date_str = str(row[5]).strip() if row[5] else ""
    # DB column is VARCHAR(20), truncate to fit
    if len(date_str) > 20:
        date_str = date_str[:20]

    # Classify row[6]: a full http(s) link is the company's real website,
    # anything else is a slug pointing at the cheapestoil.com detail page.
    url = None
    slug = None
    if len(row) > 6 and row[6]:
        raw_link = str(row[6]).strip()
        if raw_link:
            if raw_link.startswith("http"):
                url = raw_link
            else:
                # It's a slug for the cheapestoil detail page
                slug = raw_link

    # NOTE: the original dict literal listed "slug" twice; keep a single key.
    return {
        "slug": slug,  # Return slug so scraper can use it to fetch details
        "name": name,
        "price": price,
        "service_area": service_area,
        "county_name": county_name,
        "date": date_str,
        "url": url,
    }
|
||||
217
cheapestoil/scraper.py
Normal file
217
cheapestoil/scraper.py
Normal file
@@ -0,0 +1,217 @@
|
||||
"""
|
||||
Main orchestrator for the CheapestOil scraper.
|
||||
"""
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
import models
|
||||
|
||||
from .config import STATE_COUNTIES, STATE_API_NAMES, SCRAPE_DELAY
|
||||
from .api_client import fetch_company_details, fetch_county_prices
|
||||
from .parsers import parse_company_record
|
||||
from .company_matcher import find_existing_record
|
||||
from .town_lookup import resolve_county_from_service_area
|
||||
|
||||
|
||||
def _resolve_county_id(
|
||||
county_name: str | None,
|
||||
service_area: str,
|
||||
state_abbr: str,
|
||||
county_lookup: dict,
|
||||
) -> int | None:
|
||||
"""
|
||||
Resolve a county_id from either a direct county name or service area text.
|
||||
|
||||
For MA/CT/ME: county_name comes directly from the API query parameter.
|
||||
For NH/RI/VT: parse service_area text to find a town -> county mapping.
|
||||
"""
|
||||
# Direct county match (MA/CT/ME)
|
||||
if county_name:
|
||||
county_id = county_lookup.get((state_abbr, county_name))
|
||||
if county_id is None:
|
||||
logging.warning(f"County not in DB: ({state_abbr}, {county_name})")
|
||||
return county_id
|
||||
|
||||
# Service area parsing (NH/RI/VT)
|
||||
if service_area:
|
||||
resolved = resolve_county_from_service_area(service_area, state_abbr)
|
||||
if resolved:
|
||||
county_id = county_lookup.get((state_abbr, resolved))
|
||||
if county_id is not None:
|
||||
return county_id
|
||||
logging.warning(f"Resolved county '{resolved}' not in DB for {state_abbr}")
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def scrape_state(state_abbr: str, db_session: Session, county_lookup: dict, refresh_metadata: bool = False) -> dict:
    """
    Scrape all CheapestOil data for a single state.

    Args:
        state_abbr: Two-letter state code (MA, CT, ME, NH, RI, VT)
        db_session: SQLAlchemy session
        county_lookup: Dict of (state_abbr, county_name) -> county_id
        refresh_metadata: If True, force re-fetch details (phone/url) and overwrite DB.

    Returns:
        Summary dict with {state, counties_scraped, records_added, records_updated, records_skipped}

    Raises:
        ValueError: If state_abbr is not a supported state.
    """
    state_abbr = state_abbr.upper()
    if state_abbr not in STATE_API_NAMES:
        raise ValueError(f"Unknown state: {state_abbr}. Must be one of {list(STATE_API_NAMES.keys())}")

    api_name = STATE_API_NAMES[state_abbr]
    counties = STATE_COUNTIES[state_abbr]

    summary = {
        "state": state_abbr,
        "counties_scraped": 0,
        "records_added": 0,
        "records_updated": 0,
        "records_skipped": 0,
    }

    # The same company appears in multiple county listings, so detail-page
    # results are cached per slug to avoid duplicate HTTP requests.
    details_cache = {}  # slug -> {"url": ..., "phone": ...}

    for i, county_name in enumerate(counties):
        if i > 0:
            time.sleep(SCRAPE_DELAY)  # polite delay between county queries

        label = county_name or "(state-level)"
        logging.info(f"[CheapestOil] Fetching: {state_abbr} / {label}")

        rows = fetch_county_prices(api_name, county_name)
        if not rows:
            logging.info(f"No results for {state_abbr} / {label}")
            continue

        # (The old message appended a redundant "(Size: N)" repeating len(rows).)
        logging.info(f"[CheapestOil] Processing {len(rows)} records from {state_abbr} / {label}")

        summary["counties_scraped"] += 1

        for row in rows:
            record = parse_company_record(row, county_name)
            if not record or record["price"] is None:
                summary["records_skipped"] += 1
                continue

            # Resolve county_id (direct name for MA/CT/ME; service-area text otherwise)
            county_id = _resolve_county_id(
                record["county_name"],
                record["service_area"],
                state_abbr,
                county_lookup,
            )

            # Check for existing record (cross-source dedup)
            existing = find_existing_record(
                db_session, record["name"], state_abbr, county_id
            )

            slug = record.get("slug")
            real_url = record.get("url")
            phone = None

            # Fetch the detail page (phone / real URL) only when needed:
            # always when refresh_metadata is set, for brand-new records, and
            # for existing records missing either field.
            should_fetch_details = False
            if slug:
                if refresh_metadata:
                    should_fetch_details = True
                elif existing:
                    if not existing.url or not existing.phone:
                        should_fetch_details = True
                else:
                    # New record, always fetch
                    should_fetch_details = True

            if should_fetch_details:
                if slug in details_cache:
                    cached = details_cache[slug]
                    real_url = cached["url"]
                    phone = cached["phone"]
                else:
                    details = fetch_company_details(slug)
                    details_cache[slug] = details
                    real_url = details["url"]
                    phone = details["phone"]
                    time.sleep(1.0)  # Polite delay between detail pages

            if existing:
                # Skip vendor-managed records (company_id set means a vendor
                # account owns this row; the scraper must not touch it).
                if existing.company_id is not None:
                    logging.debug(f"Skipping vendor-managed: {record['name']}")
                    summary["records_skipped"] += 1
                    continue

                updated = False

                # Backfill or Force Update url
                if real_url:
                    if not existing.url or (refresh_metadata and existing.url != real_url):
                        existing.url = real_url
                        updated = True
                        logging.info(f"Updated/Backfilled URL for {record['name']}")

                # Backfill or Force Update phone
                if phone:
                    if not existing.phone or (refresh_metadata and existing.phone != phone):
                        existing.phone = phone
                        updated = True
                        logging.info(f"Updated/Backfilled Phone for {record['name']}")

                # Backfill county_id if we have it now
                if county_id is not None and existing.county_id != county_id:
                    existing.county_id = county_id
                    updated = True
                    logging.info(f"Updated county_id for {record['name']}")

                # Update if price changed, otherwise just touch timestamp
                if existing.price != record["price"]:
                    # Capture the pre-update price BEFORE assigning, so the log
                    # shows the real change. (The old code assigned first and
                    # then formatted existing.price, printing the new price on
                    # both sides of the arrow.)
                    old_price = existing.price
                    existing.price = record["price"]
                    existing.date = record["date"]
                    existing.scrapetimestamp = datetime.utcnow()
                    summary["records_updated"] += 1
                    logging.info(f"Updated price: {record['name']} ${old_price:.2f} → ${record['price']:.2f}")
                elif updated:
                    existing.scrapetimestamp = datetime.utcnow()
                    summary["records_updated"] += 1
                else:
                    # Nothing changed; still touch the timestamp so staleness
                    # checks know the listing was seen this run.
                    existing.scrapetimestamp = datetime.utcnow()
                    summary["records_skipped"] += 1
                    logging.debug(f"No changes for {record['name']} (${record['price']:.2f})")
            else:
                # Insert new record (zone=0 for cheapestoil)
                oil_price = models.OilPrice(
                    state=state_abbr,
                    zone=0,
                    name=record["name"],
                    price=record["price"],
                    date=record["date"],
                    county_id=county_id,
                    url=real_url,
                    phone=phone,
                    scrapetimestamp=datetime.utcnow(),
                )
                db_session.add(oil_price)
                summary["records_added"] += 1
                logging.info(f"Added: {record['name']} in {state_abbr} (county_id={county_id}, phone={phone})")

    # Single commit per state keeps the run atomic-ish: either the whole
    # state's batch of inserts/updates lands, or none of it does.
    db_session.commit()
    logging.info(
        f"[CheapestOil] State {state_abbr} complete: "
        f"{summary['records_added']} added, {summary['records_updated']} updated, "
        f"{summary['records_skipped']} skipped (no changes)"
    )
    return summary
|
||||
1586
cheapestoil/town_lookup.py
Normal file
1586
cheapestoil/town_lookup.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user