refactor: replace fuel_scraper with newenglandoil + cheapestoil scrapers
- Add newenglandoil/ package as the primary scraper (replaces fuel_scraper) - Add cheapestoil/ package as a secondary market price scraper - Add app.py entry point for direct execution - Update run.py: new scrape_cheapest(), migrate command, --state filter, --refresh-metadata flag for overwriting existing phone/URL data - Update models.py with latest schema fields - Update requirements.txt dependencies - Update Dockerfile and docker-compose.yml for new structure - Remove deprecated fuel_scraper module, test.py, and log file Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
4
newenglandoil/__init__.py
Normal file
4
newenglandoil/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# newenglandoil package
|
||||
from .scraper import main
|
||||
|
||||
__all__ = ["main"]
|
||||
125
newenglandoil/config.py
Normal file
125
newenglandoil/config.py
Normal file
@@ -0,0 +1,125 @@
|
||||
"""
|
||||
Configuration module for the fuel scraper.
|
||||
Contains site definitions, zone-to-county mapping, and logging setup.
|
||||
"""
|
||||
import logging
|
||||
|
||||
# --- SITES CONFIGURATION ---
# One entry per scraped site. "locations" maps a lowercase state key to the
# list of zone slugs that site publishes; zone slugs are 1-based and are
# resolved to counties via ZONE_COUNTY_MAP.
SITES_CONFIG = [
    {
        "site_name": "NewEnglandOil",
        "base_url": "https://www.newenglandoil.com",
        "url_template": "{base_url}/{state_slug}/{zone_slug}.asp?type={oil_type}",
        "oil_type": 0,
        "locations": {
            "connecticut": [f"zone{i}" for i in range(1, 11)],
            "massachusetts": [f"zone{i}" for i in range(1, 16)],
            "newhampshire": [f"zone{i}" for i in range(1, 7)],
            "rhodeisland": [f"zone{i}" for i in range(1, 5)],
        },
    },
    {
        "site_name": "MaineOil",
        "base_url": "https://www.maineoil.com",
        # MaineOil has no state segment in its zone URLs.
        "url_template": "{base_url}/{zone_slug}.asp?type={oil_type}",
        "oil_type": 0,
        "locations": {
            "maine": [f"zone{i}" for i in range(1, 8)],
        },
    },
]
|
||||
|
||||
# --- STATE ABBREVIATION MAP ---
# Maps lowercase state keys (as used in SITES_CONFIG locations) to 2-letter abbreviations
STATE_ABBREV_MAP = {
    "connecticut": "CT",
    "maine": "ME",
    "massachusetts": "MA",
    "newhampshire": "NH",
    "rhodeisland": "RI",
    "vermont": "VT",
}
|
||||
|
||||
# --- ZONE-TO-COUNTY MAPPING ---
# Maps (state_key, zone_number) -> (state_abbrev, county_name).
# state_key matches the keys in SITES_CONFIG locations (lowercase, no spaces);
# county_name must match county.name in the database exactly. Zones and
# counties are not 1:1 — several zones can share a county.
ZONE_COUNTY_MAP = {
    # Connecticut (10 zones -> 8 counties)
    ("connecticut", 1): ("CT", "New London"),    # Southeast CT
    ("connecticut", 2): ("CT", "Windham"),       # Northeast CT
    ("connecticut", 3): ("CT", "New Haven"),     # New Haven, Bridgeport
    ("connecticut", 4): ("CT", "Middlesex"),     # Southeast Central CT
    ("connecticut", 5): ("CT", "New Haven"),     # Southwest Central CT
    ("connecticut", 6): ("CT", "Hartford"),      # Greater Hartford
    ("connecticut", 7): ("CT", "Litchfield"),    # West CT
    ("connecticut", 8): ("CT", "Fairfield"),     # Southwest CT
    ("connecticut", 9): ("CT", "Tolland"),       # Northeast Central CT
    ("connecticut", 10): ("CT", "Litchfield"),   # Northwest CT

    # Massachusetts (15 zones -> 14 counties)
    ("massachusetts", 1): ("MA", "Suffolk"),     # South Boston
    ("massachusetts", 2): ("MA", "Middlesex"),   # North Boston
    ("massachusetts", 3): ("MA", "Norfolk"),     # Southwest of Boston
    ("massachusetts", 4): ("MA", "Plymouth"),    # South of Boston
    ("massachusetts", 5): ("MA", "Middlesex"),   # West of Boston
    ("massachusetts", 6): ("MA", "Bristol"),     # Southern Massachusetts
    ("massachusetts", 7): ("MA", "Barnstable"),  # Cape Cod & Islands
    ("massachusetts", 8): ("MA", "Essex"),       # Northwest of Boston
    ("massachusetts", 9): ("MA", "Essex"),       # North of Boston
    ("massachusetts", 10): ("MA", "Worcester"),  # Central Massachusetts
    ("massachusetts", 11): ("MA", "Worcester"),  # East Central Massachusetts
    ("massachusetts", 12): ("MA", "Hampshire"),  # West Central Massachusetts
    ("massachusetts", 13): ("MA", "Hampden"),    # Springfield Area
    ("massachusetts", 14): ("MA", "Franklin"),   # Northwestern Massachusetts
    ("massachusetts", 15): ("MA", "Berkshire"),  # Western Massachusetts

    # New Hampshire (6 zones -> 10 counties)
    ("newhampshire", 1): ("NH", "Coos"),         # Northern NH
    ("newhampshire", 2): ("NH", "Strafford"),    # Eastern NH
    ("newhampshire", 3): ("NH", "Merrimack"),    # Central NH
    ("newhampshire", 4): ("NH", "Grafton"),      # West Central NH
    ("newhampshire", 5): ("NH", "Cheshire"),     # Southwest NH
    ("newhampshire", 6): ("NH", "Hillsborough"), # South Central NH

    # Rhode Island (4 zones -> 5 counties)
    ("rhodeisland", 1): ("RI", "Newport"),       # Southeast RI
    ("rhodeisland", 2): ("RI", "Providence"),    # Northern RI
    ("rhodeisland", 3): ("RI", "Washington"),    # Southwest RI
    ("rhodeisland", 4): ("RI", "Kent"),          # Central RI

    # Maine (7 zones -> 16 counties, via MaineOil.com)
    ("maine", 1): ("ME", "Cumberland"),          # Greater Portland
    ("maine", 2): ("ME", "Kennebec"),            # Augusta/Waterville
    ("maine", 3): ("ME", "Androscoggin"),        # Auburn/Lewiston/Western
    ("maine", 4): ("ME", "York"),                # Southern Maine
    ("maine", 5): ("ME", "Knox"),                # Mid-Coast
    ("maine", 6): ("ME", "Penobscot"),           # Bangor West
    ("maine", 7): ("ME", "Washington"),          # Downeast
}
|
||||
|
||||
# --- LOGGING CONFIGURATION ---
LOG_FILE = "oil_scraper.log"


def setup_logging():
    """Configure the root logger to append INFO-level scraper output to LOG_FILE."""
    log_format = '%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s'
    logging.basicConfig(
        filename=LOG_FILE,
        level=logging.INFO,
        format=log_format,
    )
|
||||
131
newenglandoil/db_operations.py
Normal file
131
newenglandoil/db_operations.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""
|
||||
Database operations module for oil price CRUD operations.
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
from datetime import datetime
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import func
|
||||
import models
|
||||
|
||||
|
||||
def upsert_oil_price(db_session: Session, item_dict: dict, force_update_metadata: bool = False) -> bool:
    """
    Insert or update an oil price record.

    Logic:
    - Match by (name, state, county_id) - case insensitive on name!
    - If county_id is None, fall back to (name, state, zone).
    - If match found:
        - If company_id is set: SKIP (vendor managed).
        - Update name to formatted version (e.g. "Leblanc Oil" vs "LEBLANC OIL").
        - Update phone/url if missing OR force_update_metadata is True.
        - Update price/date if changed.
    - If no match: INSERT.

    Args:
        db_session: SQLAlchemy session
        item_dict: Dictionary with state, zone, name, price, date, county_id
        force_update_metadata: If True, overwrite existing phone/url

    Returns:
        True if a row was inserted or updated, False otherwise.
    """
    county_id = item_dict.get("county_id")
    site_name = item_dict.get("site_name", "NewEnglandOil")
    name_clean = item_dict["name"].strip()

    # BUGFIX: the parser can emit price=None when a row's price cell is
    # unparsable (see parsers._parse_row). Previously that raised a TypeError
    # in the price comparison below and could insert NULL prices; skip such
    # rows instead so one bad row no longer aborts the whole run.
    if item_dict.get("price") is None:
        logging.warning(
            f"[{site_name}] Skipping {name_clean} in {item_dict['state']} zone {item_dict['zone']}: "
            "missing or unparsable price"
        )
        return False

    # Query for existing record - case insensitive on name.
    query = db_session.query(models.OilPrice).filter(
        func.lower(models.OilPrice.name) == name_clean.lower(),
        models.OilPrice.state == item_dict["state"]
    )
    if county_id is not None:
        query = query.filter(models.OilPrice.county_id == county_id)
    else:
        # No resolved county: fall back to matching on the raw zone number.
        query = query.filter(models.OilPrice.zone == item_dict["zone"])

    existing_record = query.first()

    new_phone = item_dict.get("phone")
    new_url = item_dict.get("url")

    if existing_record:
        # Vendor-managed rows (linked to a company account) are never touched.
        if existing_record.company_id is not None:
            logging.debug(
                f"[{site_name}] Skipping update for {name_clean} (ID={existing_record.id}) "
                "due to non-null company_id"
            )
            return False

        updated = False

        # 1. Update name casing if different; we trust the scraper's
        #    _smart_title() output is generally good.
        if existing_record.name != name_clean:
            existing_record.name = name_clean
            updated = True

        # 2. Backfill county_id if the scraper resolved one the DB lacks.
        if county_id is not None and existing_record.county_id != county_id:
            existing_record.county_id = county_id
            updated = True

        # 3. Backfill or (when forced) overwrite phone/url.
        if new_phone:
            if not existing_record.phone or (force_update_metadata and existing_record.phone != new_phone):
                existing_record.phone = new_phone
                updated = True

        if new_url:
            if not existing_record.url or (force_update_metadata and existing_record.url != new_url):
                existing_record.url = new_url
                updated = True

        # 4. Price change. A small tolerance avoids float-representation
        #    noise; prices are normally 2-decimal values. Also treat a NULL
        #    stored price as "changed" rather than crashing on arithmetic.
        if existing_record.price is None or abs(existing_record.price - item_dict["price"]) > 0.001:
            existing_record.price = item_dict["price"]
            existing_record.date = item_dict["date"]
            existing_record.scrapetimestamp = datetime.utcnow()
            logging.info(
                f"[{site_name}] Updated price for {name_clean} (ID={existing_record.id}) "
                f"to {item_dict['price']}"
            )
            return True
        elif updated:
            existing_record.scrapetimestamp = datetime.utcnow()
            logging.info(
                f"[{site_name}] Updated metadata for {name_clean} (ID={existing_record.id})"
            )
            return True
        else:
            # No meaningful change.
            logging.debug(
                f"[{site_name}] Price unchanged for {name_clean} in {item_dict['state']} zone {item_dict['zone']}"
            )
            return False
    else:
        # No match: insert a fresh row.
        oil_price_record = models.OilPrice(
            state=item_dict["state"],
            zone=item_dict["zone"],
            name=name_clean,
            price=item_dict["price"],
            date=item_dict["date"],
            county_id=county_id,
            phone=new_phone,
            url=new_url,
            scrapetimestamp=datetime.utcnow()
        )
        db_session.add(oil_price_record)
        logging.info(
            f"[{site_name}] Added new record for {name_clean} in {item_dict['state']} zone {item_dict['zone']} "
            f"(county_id={county_id})"
        )
        return True
||||
111
newenglandoil/http_client.py
Normal file
111
newenglandoil/http_client.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
HTTP client module for making web requests.
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Default headers to mimic a browser
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

REQUEST_TIMEOUT = 20
PHONE_FETCH_DELAY = 1  # seconds between phone page requests


def make_request(url: str) -> BeautifulSoup | None:
    """
    Fetch a URL and return a BeautifulSoup object.

    Args:
        url: The URL to fetch

    Returns:
        BeautifulSoup object if successful, None otherwise (any request
        error is logged and swallowed so callers can treat failures as
        "page unavailable").
    """
    try:
        response = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        logging.error(f"Error fetching {url}: {err}")
        return None
    return BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
|
||||
def fetch_phone_number(base_url: str, phone_page_path: str, state_slug: str = "") -> str | None:
    """
    Fetch a phone number from a newenglandoil phones.asp page.

    Args:
        base_url: Site base URL (e.g. "https://www.newenglandoil.com")
        phone_page_path: Relative path like "phones.asp?zone=1&ID=10&a=MA1"
        state_slug: State slug for URL path (e.g. "massachusetts")

    Returns:
        Phone number string (formatted "(XXX) XXX-XXXX" when 10 digits) or
        None if not found.
    """
    # Resolve the full URL: the path may already be absolute, and some sites
    # nest their phone pages under a state segment.
    if phone_page_path.startswith('http'):
        url = phone_page_path
    elif state_slug:
        url = f"{base_url}/{state_slug}/{phone_page_path}"
    else:
        url = f"{base_url}/{phone_page_path}"

    # Be polite: throttle phone-page requests.
    time.sleep(PHONE_FETCH_DELAY)

    soup = make_request(url)
    if not soup:
        return None

    page_text = soup.get_text(" ", strip=True)

    # 3-3-4 digit groups with optional parens/separators, matching common US
    # formats: (508) 555-1234, 508-555-1234, 508.555.1234, 5085551234.
    phone_pattern = re.compile(
        r'(?:\(?(\d{3})\)?[\s.\-]?(\d{3})[\s.\-]?(\d{4}))'
    )
    # Prefer a number that appears right after a contact-related keyword.
    keyword_pattern = re.compile(r'(?:Phone|Tel|Call|Contact).*?(\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', re.IGNORECASE)

    candidate = None
    keyword_match = keyword_pattern.search(page_text)
    if keyword_match:
        candidate = keyword_match.group(1)
    else:
        # Fall back to the first plausible phone pattern anywhere on the page.
        for area, exchange, line in phone_pattern.findall(page_text):
            # Discard obviously-bogus placeholder numbers like 000-000-0000.
            if (area + exchange + line).startswith('000'):
                continue
            candidate = f"{area}-{exchange}-{line}"
            break

    if candidate:
        digits = re.sub(r'\D', '', candidate)
        # Normalize 10-digit numbers to "(XXX) XXX-XXXX"; otherwise return
        # the raw candidate unchanged.
        if len(digits) == 10:
            return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
        return candidate

    logging.debug(f"No phone number found on {url}")
    return None
|
||||
289
newenglandoil/parsers.py
Normal file
289
newenglandoil/parsers.py
Normal file
@@ -0,0 +1,289 @@
|
||||
"""
|
||||
HTML parsing module for extracting oil price data from web pages.
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .config import STATE_ABBREV_MAP
|
||||
|
||||
|
||||
def parse_zone_slug_to_int(zone_slug_str: str) -> int | None:
|
||||
"""
|
||||
Extract the numeric part of a zone slug.
|
||||
|
||||
Examples:
|
||||
"zone1" -> 1
|
||||
"zonema5" -> 5
|
||||
|
||||
Args:
|
||||
zone_slug_str: Zone slug string like "zone1", "zonema5"
|
||||
|
||||
Returns:
|
||||
Integer zone number or None if parsing fails
|
||||
"""
|
||||
if not zone_slug_str:
|
||||
return None
|
||||
match = re.search(r'\d+$', zone_slug_str)
|
||||
if match:
|
||||
return int(match.group(0))
|
||||
logging.warning(f"Could not parse numeric zone from slug: '{zone_slug_str}'")
|
||||
return None
|
||||
|
||||
|
||||
def _find_price_table_columns(thead) -> dict | None:
    """
    Find column indices for company, price, and date in a table header.

    The company and date columns are matched by exact header text; the price
    column by substring (headers vary, e.g. "price per gallon").

    Args:
        thead: BeautifulSoup thead element

    Returns:
        {'company': i, 'price': j, 'date': k} or None if any required
        column is missing (i.e. this is not a price table).
    """
    header_texts = [th.get_text(strip=True).lower() for th in thead.find_all('th')]
    try:
        return {
            'company': header_texts.index('company name'),
            'price': next(i for i, text in enumerate(header_texts) if 'price' in text),
            'date': header_texts.index('date'),
        }
    except (ValueError, StopIteration):
        return None
|
||||
|
||||
|
||||
def _smart_title(name: str) -> str:
|
||||
"""
|
||||
Convert a company name to title case, preserving common abbreviations.
|
||||
|
||||
Handles: LLC, INC, CO, LP, HVAC, A1, etc.
|
||||
"""
|
||||
# Common abbreviations that should stay uppercase
|
||||
keep_upper = {"LLC", "INC", "LP", "HVAC", "II", "III", "IV", "USA", "CT", "MA", "NH", "ME", "RI", "VT"}
|
||||
words = name.title().split()
|
||||
result = []
|
||||
for word in words:
|
||||
if word.upper() in keep_upper:
|
||||
result.append(word.upper())
|
||||
else:
|
||||
result.append(word)
|
||||
return " ".join(result)
|
||||
|
||||
|
||||
def _extract_company_url(company_link) -> str | None:
|
||||
"""
|
||||
Extract the actual company URL from a link.
|
||||
|
||||
Handles:
|
||||
1. Redirects: click.asp?x=http://example.com&... -> http://example.com
|
||||
2. Direct links: http://example.com -> http://example.com
|
||||
"""
|
||||
if not company_link:
|
||||
return None
|
||||
|
||||
href = company_link.get('href', '')
|
||||
if not href:
|
||||
return None
|
||||
|
||||
url_candidate = None
|
||||
|
||||
if 'click.asp' in href:
|
||||
# Parse the x parameter which contains the actual URL
|
||||
try:
|
||||
parsed = urlparse(href)
|
||||
params = parse_qs(parsed.query)
|
||||
extracted = params.get('x', [None])[0]
|
||||
if extracted:
|
||||
url_candidate = extracted
|
||||
except Exception:
|
||||
pass
|
||||
elif href.startswith(('http://', 'https://')):
|
||||
# Direct link
|
||||
url_candidate = href
|
||||
|
||||
# Validate the candidate URL
|
||||
if url_candidate:
|
||||
try:
|
||||
# Basic validation
|
||||
if not url_candidate.startswith(('http://', 'https://')):
|
||||
return None
|
||||
|
||||
lower_url = url_candidate.lower()
|
||||
# Filter out internal or competitor site loops
|
||||
if 'newenglandoil.com' in lower_url or 'cheapestoil.com' in lower_url:
|
||||
return None
|
||||
|
||||
return url_candidate
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _extract_phone_link(cells: list) -> dict | None:
|
||||
"""
|
||||
Extract the phone page link info from a row's phone cell.
|
||||
|
||||
Phone link format: phones.asp?zone=1&ID=10&a=MA1
|
||||
Returns dict with {url, company_neo_id} or None.
|
||||
"""
|
||||
for cell in cells:
|
||||
link = cell.find('a', href=lambda h: h and 'phones.asp' in h)
|
||||
if link:
|
||||
href = link.get('href', '')
|
||||
try:
|
||||
parsed = urlparse(href)
|
||||
params = parse_qs(parsed.query)
|
||||
neo_id = params.get('ID', [None])[0]
|
||||
return {
|
||||
"phone_page_path": href,
|
||||
"neo_id": neo_id,
|
||||
}
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def _parse_row(cells: list, column_indices: dict, state_name: str, zone: int) -> dict | None:
    """
    Parse a single table row into a price record.

    Args:
        cells: List of td elements
        column_indices: Dictionary mapping column names to indices
        state_name: State name string (lowercase key like "connecticut")
        zone: Zone number

    Returns:
        Record dict, or None when the row has too few cells. Note that a
        price parse failure is logged and yields price=None in the record,
        so callers must tolerate missing prices.
    """
    if len(cells) <= max(column_indices.values()):
        return None

    # Company name: prefer the anchor text when the name is a link.
    company_cell = cells[column_indices['company']]
    company_link = company_cell.find('a')
    raw_name = company_link.get_text(strip=True) if company_link else company_cell.get_text(strip=True)
    company_name = _smart_title(raw_name)

    # Company website (from click.asp redirect) and phone page link.
    company_url = _extract_company_url(company_link)
    phone_info = _extract_phone_link(cells)

    # Price: strip everything but digits and the decimal point before parsing.
    price_str = cells[column_indices['price']].get_text(strip=True)
    price_float = None
    try:
        numeric_chars = ''.join(ch for ch in price_str if ch.isdigit() or ch == '.')
        if numeric_chars:
            price_float = float(numeric_chars)
    except ValueError:
        logging.warning(f"Could not parse price: '{price_str}' for {company_name} in {state_name}/zone{zone}.")
    except Exception as e:
        logging.error(f"Unexpected error parsing price: '{price_str}' for {company_name}. Error: {e}")

    date_posted_str = cells[column_indices['date']].get_text(strip=True)

    # Convert the state key to its 2-letter abbreviation, falling back to a
    # capitalized form for unknown keys.
    state_abbr = STATE_ABBREV_MAP.get(state_name.lower())
    if not state_abbr:
        logging.warning(f"Unknown state key: {state_name}, using capitalized form")
        state_abbr = state_name.capitalize()

    return {
        "state": state_abbr,
        "zone": zone,
        "name": company_name,
        "price": price_float,
        "date": date_posted_str,
        "url": company_url,
        "phone_info": phone_info,
    }
|
||||
|
||||
|
||||
def parse_price_table(soup: BeautifulSoup, state_name_key: str, zone_slug_str: str, site_name: str = "NewEnglandOil") -> list[dict]:
    """
    Parse price tables from a BeautifulSoup page.

    Scans every table on the page, keeps those whose headers match the
    expected company/price/date layout, and parses their body rows.

    Args:
        soup: BeautifulSoup object of the page
        state_name_key: State key like "connecticut", "maine"
        zone_slug_str: Zone slug like "zone1", "zonema5"
        site_name: Label used in log messages

    Returns:
        List of dictionaries containing price data (empty on any failure)
    """
    data_dicts = []
    all_tables = soup.find_all('table')
    logging.info(f"[{site_name}] Found {len(all_tables)} table(s) on page for {state_name_key} - {zone_slug_str}.")

    if not all_tables:
        logging.warning(f"[{site_name}] No HTML tables found at all for {state_name_key} - {zone_slug_str}.")
        return data_dicts

    # The zone number is needed for every record; bail out if unparsable.
    zone_int = parse_zone_slug_to_int(zone_slug_str)
    if zone_int is None:
        logging.error(f"[{site_name}] Cannot parse zone number for {state_name_key} - {zone_slug_str}. Skipping.")
        return data_dicts

    candidate_tables_found = 0

    for table_index, table in enumerate(all_tables):
        thead = table.find('thead')
        if not thead:
            logging.debug(f"Table {table_index} has no thead.")
            continue

        # Header check: is this one of the price tables?
        column_indices = _find_price_table_columns(thead)
        if column_indices is None:
            logging.debug(f"Table {table_index} headers do not contain all key columns.")
            continue

        logging.debug(f"Table {table_index} identified as price table. Indices: {column_indices}")
        candidate_tables_found += 1

        tbody = table.find('tbody')
        if not tbody:
            logging.warning(f"[{site_name}] Price table identified by headers has no tbody. Skipping. State: {state_name_key}, Zone: {zone_slug_str}")
            continue

        rows = tbody.find_all('tr')
        if not rows:
            logging.debug(f"No rows found in tbody for price table in {state_name_key}/{zone_slug_str}")
            continue

        for row_index, row in enumerate(rows):
            cells = row.find_all('td')
            record = _parse_row(cells, column_indices, state_name_key, zone_int)
            if record:
                data_dicts.append(record)
            elif cells:
                # _parse_row returned None: the row was too short.
                max_required = max(column_indices.values()) + 1
                logging.warning(
                    f"[{site_name}] Skipping row {row_index+1} with insufficient cells ({len(cells)}, need {max_required}) "
                    f"in {state_name_key}/{zone_slug_str}"
                )

    if candidate_tables_found == 0:
        logging.warning(f"[{site_name}] No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.")

    return data_dicts
|
||||
266
newenglandoil/scraper.py
Normal file
266
newenglandoil/scraper.py
Normal file
@@ -0,0 +1,266 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Main scraper orchestrator module.
|
||||
Coordinates fetching, parsing, and storing oil price data.
|
||||
"""
|
||||
import logging
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from database import SessionLocal, init_db
|
||||
import models
|
||||
|
||||
from .config import SITES_CONFIG, ZONE_COUNTY_MAP, setup_logging, STATE_ABBREV_MAP
|
||||
from .http_client import make_request, fetch_phone_number
|
||||
from .parsers import parse_price_table, parse_zone_slug_to_int
|
||||
from .db_operations import upsert_oil_price
|
||||
|
||||
|
||||
def _build_county_lookup(db_session: Session) -> dict:
    """
    Build a {(state_abbrev, county_name): county_id} lookup by querying the
    county table. Counties with an empty/NULL name are skipped.
    """
    lookup = {
        (county.state, county.name.strip()): county.id
        for county in db_session.query(models.County).all()
        if county.name
    }
    logging.info(f"Built county lookup with {len(lookup)} entries")
    return lookup
|
||||
|
||||
|
||||
def _resolve_county_id(state_key: str, zone_number: int, county_lookup: dict) -> int | None:
    """
    Resolve a county_id from ZONE_COUNTY_MAP and the county lookup.

    Returns None (with a debug/warning log) when either the zone mapping or
    the corresponding DB county row is missing.
    """
    mapping = ZONE_COUNTY_MAP.get((state_key, zone_number))
    if mapping is None:
        logging.debug(f"No zone-to-county mapping for ({state_key}, {zone_number})")
        return None

    state_abbrev, county_name = mapping
    county_id = county_lookup.get((state_abbrev, county_name))
    if county_id is None:
        logging.warning(f"County not found in DB: ({state_abbrev}, {county_name}) for zone ({state_key}, {zone_number})")
    return county_id
|
||||
|
||||
|
||||
def _scrape_zone(
    db_session: Session,
    site_name: str,
    url_template: str,
    base_url: str,
    oil_type: int,
    state_key: str,
    zone_slug: str,
    county_lookup: dict,
    phone_cache: dict,
    refresh_metadata: bool = False,
) -> int:
    """
    Scrape a single zone page and upsert its price rows.

    Args:
        phone_cache: Dict mapping phone_page_path -> phone string, shared
            across zones so each company's phone page is fetched at most once
            per run (neo_id alone is only unique per zone, so the full path,
            e.g. "phones.asp?zone=1&ID=10&a=MA1", is the cache key).
        refresh_metadata: Passed through to the upsert so existing phone/url
            values in the DB are overwritten.

    Returns:
        Number of rows parsed from the page (0 on fetch/parse failure).
    """
    target_url = url_template.format(
        base_url=base_url,
        state_slug=state_key,
        zone_slug=zone_slug,
        oil_type=oil_type,
    )
    logging.info(f"[{site_name}] Scraping: {target_url} (State: {state_key}, Zone Slug: {zone_slug})")

    soup = make_request(target_url)
    if not soup:
        logging.warning(f"[{site_name}] Failed to retrieve or parse {target_url}. Skipping.")
        return 0

    parsed_items = parse_price_table(soup, state_key, zone_slug, site_name)
    if not parsed_items:
        logging.info(f"[{site_name}] No data extracted from {target_url}")
        return 0

    # Resolve the zone's county once; every row on the page shares it.
    zone_number = parse_zone_slug_to_int(zone_slug)
    county_id = _resolve_county_id(state_key, zone_number, county_lookup) if zone_number is not None else None

    records_processed = 0
    for item_dict in parsed_items:
        item_dict["county_id"] = county_id
        item_dict["site_name"] = site_name

        # Fetch the phone number unless this company's page was already
        # fetched earlier in this run (one fetch per run is enough even with
        # refresh_metadata; the flag only controls DB overwrites).
        phone_info = item_dict.pop("phone_info", None)
        if phone_info:
            phone_key = phone_info.get("phone_page_path")
            if phone_key:
                if phone_key in phone_cache:
                    item_dict["phone"] = phone_cache[phone_key]
                else:
                    # Only include state_slug in the phone URL if the site
                    # uses it in its URL template.
                    slug = state_key if "{state_slug}" in url_template else ""
                    phone = fetch_phone_number(base_url, phone_info["phone_page_path"], slug)
                    phone_cache[phone_key] = phone
                    item_dict["phone"] = phone
                    if phone:
                        logging.info(f"[{site_name}] Fetched phone for {item_dict['name']} (ID={phone_info.get('neo_id')}): {phone}")

        if upsert_oil_price(db_session, item_dict, force_update_metadata=refresh_metadata):
            records_processed += 1

    logging.info(
        f"[{site_name}] Processed {len(parsed_items)} records from {site_name} - {state_key}/{zone_slug} "
        f"({records_processed} inserted/updated, county_id={county_id}) (Size: {len(parsed_items)})"
    )
    return len(parsed_items)
|
||||
|
||||
|
||||
def _scrape_site(db_session: Session, site_config: dict, county_lookup: dict, refresh_metadata: bool = False) -> int:
    """
    Scrape every configured zone of a single site.

    Returns:
        Total number of records processed across all zones.
    """
    site_name = site_config["site_name"]
    logging.info(f"--- Processing site: {site_name} ---")

    total_records = 0
    # Shared across all zones of this site so each company's phone page is
    # fetched at most once per run.
    phone_cache = {}

    for state_key, zone_slugs in site_config["locations"].items():
        for zone_slug in zone_slugs:
            total_records += _scrape_zone(
                db_session=db_session,
                site_name=site_name,
                url_template=site_config["url_template"],
                base_url=site_config["base_url"],
                oil_type=site_config["oil_type"],
                state_key=state_key,
                zone_slug=zone_slug,
                county_lookup=county_lookup,
                phone_cache=phone_cache,
                refresh_metadata=refresh_metadata,
            )

    logging.info(f"Phone cache: fetched {len(phone_cache)} unique company phones for {site_name}")
    return total_records
|
||||
|
||||
|
||||
def main(refresh_metadata: bool = False, target_state_abbr: str | None = None):
    """
    Main entry point for the oil price scraper.

    Args:
        refresh_metadata: If True, force re-fetch details.
        target_state_abbr: If set (e.g. "MA"), only scrape that state.
    """
    setup_logging()

    state_msg = f" (State: {target_state_abbr})" if target_state_abbr else ""
    logging.info(f"Starting oil price scraper job.{state_msg} (Refresh Metadata: {refresh_metadata})")

    # Initialize database; abort the run on failure.
    try:
        init_db()
        logging.info("Database initialized/checked successfully.")
    except Exception as e:
        logging.error(f"Failed to initialize database: {e}", exc_info=True)
        return

    db_session: Session = SessionLocal()
    total_records = 0

    try:
        # Build the county lookup once at startup.
        county_lookup = _build_county_lookup(db_session)

        # Translate the 2-letter filter (e.g. "MA") back to a config state key.
        abbrev_to_state = {abbr: key for key, abbr in STATE_ABBREV_MAP.items()}
        target_state_key = abbrev_to_state.get(target_state_abbr.upper()) if target_state_abbr else None
        if target_state_abbr and not target_state_key:
            logging.error(f"Unknown state abbreviation: {target_state_abbr}")
            return

        for site_config in SITES_CONFIG:
            config_to_use = site_config
            if target_state_key:
                if target_state_key not in site_config["locations"]:
                    logging.info(f"Skipping {site_config['site_name']} (does not cover {target_state_abbr})")
                    continue
                # Shallow copy with only the requested state's zones.
                config_to_use = site_config.copy()
                config_to_use["locations"] = {
                    target_state_key: site_config["locations"][target_state_key]
                }

            total_records += _scrape_site(db_session, config_to_use, county_lookup, refresh_metadata=refresh_metadata)

        # Commit everything at once so a failed run leaves the DB untouched.
        if total_records > 0:
            db_session.commit()
            logging.info("Successfully committed records to the database.")
        else:
            logging.info("No new records were queued for database insertion in this run.")

    except Exception as e:
        logging.error(f"An error occurred during scraping or DB operation: {e}", exc_info=True)
        db_session.rollback()
        logging.info("Database transaction rolled back due to error.")
    finally:
        db_session.close()
        logging.info("Database session closed.")

    logging.info("Oil price scraper job finished.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user