feat: initial commit for oil price scraper service
FastAPI-based scraper for commodity ticker prices (HO, CL, RB futures) and competitor oil pricing from NewEnglandOil. Includes cron-driven scraping, PostgreSQL storage, and REST endpoints for price retrieval. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
170
app/newenglandoil/scraper.py
Normal file
170
app/newenglandoil/scraper.py
Normal file
@@ -0,0 +1,170 @@
|
||||
"""
|
||||
Web scraping module for New England Oil prices.
|
||||
|
||||
This module handles scraping oil price data from the New England Oil website
|
||||
for Zone 10 (Central Massachusetts).
|
||||
"""
|
||||
|
||||
import logging
import time
from datetime import date
from decimal import Decimal
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup

from app.config import (
    NEWENGLAND_OIL_ZONE10_URL,
    SCRAPER_USER_AGENT,
    SCRAPER_TIMEOUT,
    SCRAPER_DELAY_SECONDS,
)
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ScraperError(Exception):
    """Raised when fetching or parsing New England Oil price data fails."""
def scrape_newengland_oil() -> List[Dict[str, Any]]:
    """
    Scrape oil prices from the New England Oil Zone 10 page.

    Fetches the page, parses every HTML table, and extracts company names,
    towns, and prices from rows that contain a company link and a plausible
    heating-oil price.

    Returns:
        List of dictionaries with keys: company_name, town, price_decimal,
        scrape_date, zone.

    Raises:
        ScraperError: If the HTTP request fails, no tables are found, or no
            price rows can be extracted.
    """
    logger.info("Starting scrape of %s", NEWENGLAND_OIL_ZONE10_URL)

    headers = {
        "User-Agent": SCRAPER_USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }

    try:
        response = requests.get(
            NEWENGLAND_OIL_ZONE10_URL,
            headers=headers,
            timeout=SCRAPER_TIMEOUT,
        )
        response.raise_for_status()
        logger.info("Successfully fetched page (status: %s)", response.status_code)

        soup = BeautifulSoup(response.content, 'lxml')

        prices: List[Dict[str, Any]] = []
        today = date.today()

        tables = soup.find_all('table')
        if not tables:
            logger.warning("No tables found on page")
            # Debug aid: persist the raw HTML so the markup can be inspected.
            with open("debug_page.html", "wb") as f:
                f.write(response.content)
            raise ScraperError("No price table found on page")

        for table in tables:
            for row in table.find_all('tr'):
                cells = row.find_all(['td', 'th'])
                if len(cells) < 3:  # expect at least company, town, price
                    continue

                # Company name is carried in an anchor tag; rows without one
                # are headers or decoration.
                company_link = row.find('a')
                if not company_link:
                    continue
                company_name = company_link.get_text(strip=True)

                cell_texts = [cell.get_text(strip=True) for cell in cells]

                price_value: Optional[Decimal] = None
                town_value: Optional[str] = None

                for text in cell_texts:
                    text_clean = text.replace('$', '').replace(',', '').strip()
                    # Prices look like "$2.599" / "2.599"; require a decimal
                    # point so bare integers (phone numbers, years) are skipped.
                    if text_clean and '.' in text_clean:
                        try:
                            potential_price = Decimal(text_clean)
                        except (ValueError, ArithmeticError):
                            pass  # not numeric; may still be a town name below
                        else:
                            # Reasonable price range for heating oil.
                            if Decimal('0.50') <= potential_price <= Decimal('10.00'):
                                price_value = potential_price
                                break
                            continue  # numeric but implausible; not a town either
                    # BUGFIX: the original only recorded a town when Decimal()
                    # raised, so town cells without a '.' were never captured.
                    # Take the first plausible non-price text as the town.
                    if (town_value is None and text
                            and not text.startswith('$') and len(text) > 2):
                        town_value = text

                if price_value is not None:
                    prices.append({
                        "company_name": company_name,
                        "town": town_value,
                        "price_decimal": price_value,
                        "scrape_date": today,
                        "zone": "zone10",
                    })
                    logger.debug("Found: %s - %s - $%s",
                                 company_name, town_value, price_value)

        if not prices:
            logger.warning("No prices extracted from page")
            raise ScraperError("Failed to extract any price data from page")

        logger.info("Successfully scraped %s price records", len(prices))
        return prices

    except requests.RequestException as e:
        logger.error("Request failed: %s", e)
        raise ScraperError(f"Failed to fetch page: {str(e)}") from e
    except ScraperError:
        # BUGFIX: don't let the broad handler below re-wrap our own errors
        # (e.g. "No price table found on page") as generic parse failures.
        raise
    except Exception as e:
        logger.error("Scraping failed: %s", e, exc_info=True)
        raise ScraperError(f"Failed to parse page: {str(e)}") from e
def scrape_and_delay() -> List[Dict[str, Any]]:
    """
    Scrape prices and apply a rate-limiting delay.

    Convenience wrapper around scrape_newengland_oil() that sleeps for
    SCRAPER_DELAY_SECONDS afterwards so back-to-back calls respect the
    site's rate limits.

    Returns:
        List of price dictionaries (see scrape_newengland_oil).

    Raises:
        ScraperError: Propagated from scrape_newengland_oil().
    """
    prices = scrape_newengland_oil()

    # Sleep only when a positive delay is configured.
    if SCRAPER_DELAY_SECONDS > 0:
        logger.debug(f"Sleeping {SCRAPER_DELAY_SECONDS}s for rate limiting")
        time.sleep(SCRAPER_DELAY_SECONDS)

    return prices
Reference in New Issue
Block a user