# crawler/cheapestoil/api_client.py

"""
HTTP client for the CheapestOil JSON API.
"""
import logging
import re

import requests
from bs4 import BeautifulSoup

from .config import API_URL

# Timeout value handed to every `requests.get` call in this module
# (requests interprets it in seconds).
REQUEST_TIMEOUT = 20

# Headers sent with every outgoing request: a desktop-Chrome-style
# User-Agent string, assembled from its three space-separated components.
DEFAULT_HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/91.0.4472.124 Safari/537.36",
        )
    ),
}


def fetch_company_details(slug: str) -> dict:
    """
    Fetch company details (real URL, phone) from their CheapestOil profile page.

    Args:
        slug: The company slug/path (e.g. "Abc-Oil-Company")

    Returns:
        Dict with keys: "url" (str|None), "phone" (str|None)
    """
    if not slug:
        return {"url": None, "phone": None}

    # Construct detail URL
    # If slug is full URL, use it, else append to base
    if slug.startswith("http"):
        url = slug
    else:
        url = f"https://www.cheapestoil.com/{slug}"

    try:
        resp = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, 'html.parser')

        real_url = None
        phone = None

        # 1. Extract Real URL
        # Look for "Visit Website" link or similar anchor texts
        # Usually contained in a link with text "Visit Website" or the company name
        # We look for a link that is NOT internal (doesn't contain cheapestoil.com)
        # and behaves like an external link.

        # Common pattern: <a href="..." target="_blank">Visit Website</a>
        visit_link = soup.find('a', string=re.compile(r"Visit Website|Company Website", re.IGNORECASE))
        if visit_link and visit_link.get('href'):
            href = visit_link.get('href')
            if 'cheapestoil.com' not in href and href.startswith('http'):
                real_url = href

        # Fallback: look for any external link in the contact section if structured
        if not real_url:
            # Try to find the first external link in the main content area
            # (This is heuristics-based, might need adjustment)
            content_div = soup.find('div', class_='col-md-8') # Common bootstrap main col
            if content_div:
                links = content_div.find_all('a', href=True)
                for a in links:
                    href = a['href']
                    if href.startswith('http') and 'cheapestoil.com' not in href:
                        real_url = href
                        break

        # 2. Extract Phone
        # Reuse robust regex pattern logic
        page_text = soup.get_text(" ", strip=True)

        # Look for "Phone:", "Tel:", etc.
        # This is a bit simplified compared to the other scraper but likely sufficient
        phone_match = re.search(r'(?:Phone|Tel|Call).*?(\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', page_text, re.IGNORECASE)
        if phone_match:
            phone_candidate = phone_match.group(1)
        else:
             # Fallback to just finding a phone pattern
            phone_match = re.search(r'(?:\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', page_text)
            phone_candidate = phone_match.group(0) if phone_match else None

        if phone_candidate:
            digits = re.sub(r'\D', '', phone_candidate)
            if len(digits) == 10:
                phone = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
            else:
                phone = phone_candidate

        return {"url": real_url, "phone": phone}

    except Exception as e:
        logging.warning(f"Failed to fetch details for {slug}: {e}")
        return {"url": None, "phone": None}


def fetch_county_prices(state_api_name: str, county_name: str | None = None) -> list:
    """
    Fetch price data from the CheapestOil API.

    Args:
        state_api_name: State name as used by the API (e.g. "Massachusetts", "NewHampshire")
        county_name: County name filter, or None for state-level results

    Returns:
        The decoded top-level JSON list returned by the API, or an empty list
        on any failure (network/HTTP error, invalid JSON, or a payload that is
        not a list). Failures are logged and never raised.
    """
    params = {
        "sort": 0,
        "state": state_api_name,
        "county": county_name or "",
        "zip": "",
    }

    # Step 1: transport. Only connection/HTTP-status problems are handled here.
    try:
        resp = requests.get(
            API_URL, params=params, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT
        )
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        logging.error(
            "Error fetching CheapestOil API for %s/%s: %s",
            state_api_name, county_name, e,
        )
        return []

    # Step 2: decoding, kept in its own try block. Recent requests versions
    # raise a JSON error that is *both* a RequestException and a ValueError,
    # so a shared try block would report bad JSON as a fetch error.
    try:
        data = resp.json()
    except ValueError as e:
        logging.error("Invalid JSON from CheapestOil API: %s", e)
        return []

    if isinstance(data, list):
        return data
    logging.warning("Unexpected response type from API: %s", type(data))
    return []