"""
HTTP client for the CheapestOil JSON API.
"""
import logging
import re

import requests
from bs4 import BeautifulSoup

from .config import API_URL

DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

# Timeout passed to every requests.get() call in this module.
REQUEST_TIMEOUT = 20

# Site root used to build profile-page URLs from a bare slug.
_SITE_BASE_URL = "https://www.cheapestoil.com"
# Substring that marks a link as internal (pointing back at CheapestOil).
_SITE_DOMAIN = "cheapestoil.com"

# Anchor text that marks the company's own website link on a profile page.
_VISIT_LINK_RE = re.compile(r"Visit Website|Company Website", re.IGNORECASE)

# Phone number shape: optional parentheses around the area code and
# optional space/dot/dash separators, e.g. "(555) 123-4567", "555.123.4567".
_PHONE_PATTERN = r"\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}"
# Preferred match: a number that follows a "Phone" / "Tel" / "Call" label.
_LABELLED_PHONE_RE = re.compile(
    r"(?:Phone|Tel|Call).*?(" + _PHONE_PATTERN + r")", re.IGNORECASE
)
# Fallback match: the first phone-shaped number anywhere in the text.
_BARE_PHONE_RE = re.compile(_PHONE_PATTERN)


def _is_external_url(href: str) -> bool:
    """Return True if href is an http(s) link that does not point at CheapestOil."""
    return href.startswith("http") and _SITE_DOMAIN not in href


def _extract_website(soup) -> str | None:
    """Return the company's external website URL from a parsed profile page, or None."""
    # Preferred source: an anchor whose text is "Visit Website" / "Company Website".
    visit_link = soup.find("a", string=_VISIT_LINK_RE)
    if visit_link is not None:
        href = visit_link.get("href")
        if href and _is_external_url(href):
            return href

    # Fallback (heuristic, may need adjustment): the first external link
    # inside the "col-md-8" div, taken to be the main content column.
    content_div = soup.find("div", class_="col-md-8")
    if content_div is not None:
        for anchor in content_div.find_all("a", href=True):
            href = anchor["href"]
            if _is_external_url(href):
                return href

    return None


def _extract_phone(page_text: str) -> str | None:
    """Return the first phone number found in page_text as "(XXX) XXX-XXXX", or None."""
    match = _LABELLED_PHONE_RE.search(page_text)
    if match:
        candidate = match.group(1)
    else:
        match = _BARE_PHONE_RE.search(page_text)
        if not match:
            return None
        candidate = match.group(0)

    digits = re.sub(r"\D", "", candidate)
    if len(digits) == 10:
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
    # Defensive fallback: hand back the raw match if it cannot be normalized.
    return candidate


def fetch_company_details(slug: str) -> dict[str, str | None]:
    """
    Fetch company details (real URL, phone) from their CheapestOil profile page.

    This is a best-effort scrape: any failure (network error, HTTP error
    status, parsing problem) is logged as a warning and reported as missing
    data rather than raised.

    Args:
        slug: The company slug/path (e.g. "Abc-Oil-Company"). A value that
            starts with "http" is used as the full URL unchanged.

    Returns:
        Dict with keys: "url" (str|None), "phone" (str|None). Both values
        are None when slug is empty or the page could not be fetched.
    """
    if not slug:
        return {"url": None, "phone": None}

    if slug.startswith("http"):
        url = slug
    else:
        # A leading slash on the slug would otherwise produce a double
        # slash in the request path.
        url = f"{_SITE_BASE_URL}/{slug.lstrip('/')}"

    try:
        resp = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")

        real_url = _extract_website(soup)
        phone = _extract_phone(soup.get_text(" ", strip=True))
        return {"url": real_url, "phone": phone}
    except Exception as e:
        # Deliberately broad: callers rely on getting the empty result
        # instead of an exception when a profile page cannot be scraped.
        logging.warning("Failed to fetch details for %s: %s", slug, e)
        return {"url": None, "phone": None}


def fetch_county_prices(state_api_name: str, county_name: str | None = None) -> list:
    """
    Fetch price data from the CheapestOil API.

    Args:
        state_api_name: State name as used by the API
            (e.g. "Massachusetts", "NewHampshire")
        county_name: County name filter, or None for state-level results

    Returns:
        List of raw JSON arrays from the API, or empty list on failure
        (request error, invalid JSON, or a JSON payload that is not a list).
    """
    params = {
        "sort": 0,
        "state": state_api_name,
        # The API receives an empty string when no county filter is given.
        "county": county_name or "",
        "zip": "",
    }

    try:
        resp = requests.get(
            API_URL,
            params=params,
            headers=DEFAULT_HEADERS,
            timeout=REQUEST_TIMEOUT,
        )
        resp.raise_for_status()
        data = resp.json()
    except requests.exceptions.RequestException as e:
        logging.error(
            "Error fetching CheapestOil API for %s/%s: %s",
            state_api_name,
            county_name,
            e,
        )
        return []
    except ValueError as e:
        # resp.json() raises a ValueError subclass when the body is not valid JSON.
        logging.error("Invalid JSON from CheapestOil API: %s", e)
        return []

    if isinstance(data, list):
        return data

    logging.warning("Unexpected response type from API: %s", type(data))
    return []