""" HTTP client module for making web requests. """ import logging import re import time import requests from bs4 import BeautifulSoup # Default headers to mimic a browser DEFAULT_HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } REQUEST_TIMEOUT = 20 PHONE_FETCH_DELAY = 1 # seconds between phone page requests def make_request(url: str) -> BeautifulSoup | None: """ Fetch a URL and return a BeautifulSoup object. Args: url: The URL to fetch Returns: BeautifulSoup object if successful, None otherwise """ try: response = requests.get(url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT) response.raise_for_status() return BeautifulSoup(response.content, 'html.parser') except requests.exceptions.RequestException as e: logging.error(f"Error fetching {url}: {e}") return None def fetch_phone_number(base_url: str, phone_page_path: str, state_slug: str = "") -> str | None: """ Fetch a phone number from a newenglandoil phones.asp page. Args: base_url: Site base URL (e.g. "https://www.newenglandoil.com") phone_page_path: Relative path like "phones.asp?zone=1&ID=10&a=MA1" state_slug: State slug for URL path (e.g. "massachusetts") Returns: Phone number string or None if not found. """ # Build full URL - phone_page_path may be relative if phone_page_path.startswith('http'): url = phone_page_path elif state_slug: url = f"{base_url}/{state_slug}/{phone_page_path}" else: url = f"{base_url}/{phone_page_path}" time.sleep(PHONE_FETCH_DELAY) soup = make_request(url) if not soup: return None # Look for phone number patterns in the page text page_text = soup.get_text(" ", strip=True) # Common US phone formats: (508) 555-1234, 508-555-1234, 508.555.1234, 5085551234 # Captures: # 1. Optional open paren # 2. 3 digits (area code) # 3. Optional close paren # 4. Separator (space, dot, dash) # 5. 3 digits (prefix) # 6. Separator # 7. 4 digits (line number) phone_pattern = re.compile( r'(?:\(?(\d{3})\)?[\s.\-]?(\d{3})[\s.\-]?(\d{4}))' ) # Try to find a phone number near "Phone:" or "Tel:" first keyword_pattern = re.compile(r'(?:Phone|Tel|Call|Contact).*?(\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4})', re.IGNORECASE) keyword_match = keyword_pattern.search(page_text) candidate = None if keyword_match: # If we found a number near a keyword, use that one. candidate = keyword_match.group(1) else: # Otherwise, look for the first valid phone pattern matches = phone_pattern.findall(page_text) for m in matches: # m is a tuple of groups: ('508', '555', '1234') full_num = "".join(m) # Simple heuristic to avoid dates like 2024, 2025 or common years if adjacent # But the regex requires 3-3-4 structure so a simple "2024" won't match unless it's like 202-455-1234 # We can filter out obviously bad "numbers" if needed, e.g. 000-000-0000 if full_num.startswith('000'): continue candidate = f"{m[0]}-{m[1]}-{m[2]}" break if candidate: digits = re.sub(r'\D', '', candidate) if len(digits) == 10: return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}" return candidate logging.debug(f"No phone number found on {url}") return None