feat: initial commit for oil price scraper service

FastAPI-based scraper for commodity ticker prices (HO, CL, RB futures)
and competitor oil pricing from NewEnglandOil. Includes cron-driven
scraping, PostgreSQL storage, and REST endpoints for price retrieval.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-08 17:57:44 -05:00
commit af9c2f99e7
25 changed files with 1566 additions and 0 deletions

View File

@@ -0,0 +1,9 @@
"""
New England Oil Scraper Module.
This package contains code specific to scraping prices from the New England Oil website.
"""
from app.newenglandoil.scraper import scrape_newengland_oil, ScraperError
__all__ = ['scrape_newengland_oil', 'ScraperError']

140
app/newenglandoil/router.py Normal file
View File

@@ -0,0 +1,140 @@
import logging
from datetime import datetime
from typing import List
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy.orm import Session
from sqlalchemy.exc import SQLAlchemyError
from app.models import CompanyPrice
from app.schemas import PriceRecord
from app.newenglandoil.schemas import LatestPriceResponse
from app.newenglandoil.scraper import scrape_newengland_oil, ScraperError
from app.database import get_db
logger = logging.getLogger(__name__)
router = APIRouter(
prefix="/scraper/newenglandoil",
tags=["New England Oil Scraper"],
)
@router.get(
    "/latestprice",
    response_model=LatestPriceResponse,
)
async def get_latest_prices(db: Session = Depends(get_db)):
    """
    Scrape latest oil prices from New England Oil Zone 10.

    This endpoint:
    1. Scrapes the New England Oil website for current prices
    2. Stores all prices in the database (historical tracking)
    3. Returns the scraped prices

    Designed to be called from cron for automated price tracking.

    Example:
        curl http://localhost:8000/scraper/newenglandoil/latestprice

    Args:
        db: SQLAlchemy session injected via FastAPI dependency.

    Returns:
        LatestPriceResponse with scraped prices and storage statistics.

    Raises:
        HTTPException: 500 on scraper failure, database failure, or any
            unexpected error (session is rolled back first).
    """
    # Local import keeps this fix self-contained: datetime.utcnow() is
    # deprecated since Python 3.12; use a timezone-aware UTC timestamp.
    from datetime import timezone

    logger.info("=" * 60)
    logger.info("SCRAPER ENDPOINT CALLED - New England Oil Zone 10")
    logger.info("=" * 60)
    try:
        # Scrape the website
        scraped_prices = scrape_newengland_oil()
        if not scraped_prices:
            logger.warning("No prices were scraped")
            return LatestPriceResponse(
                status="warning",
                message="No prices found on website",
                prices_scraped=0,
                prices_stored=0,
                scrape_timestamp=datetime.now(timezone.utc).isoformat(),
                prices=[]
            )

        # Store prices in database
        stored_count = 0
        price_records = []
        for price_data in scraped_prices:
            zone = price_data.get("zone", "zone10")
            # Echo every scraped price back to the caller, whether or not
            # it turns out to be a duplicate of a stored row.
            price_records.append(PriceRecord(
                company_name=price_data["company_name"],
                town=price_data.get("town"),
                price_decimal=float(price_data["price_decimal"]),
                scrape_date=price_data["scrape_date"].isoformat(),
                zone=zone,
            ))
            try:
                # One row per (company, date, zone): skip duplicates so
                # repeated cron runs on the same day don't inflate history.
                existing_record = db.query(CompanyPrice).filter(
                    CompanyPrice.company_name == price_data["company_name"],
                    CompanyPrice.scrape_date == price_data["scrape_date"],
                    CompanyPrice.zone == zone,
                ).first()
                if existing_record:
                    logger.debug(f"Skipping duplicate record for {price_data['company_name']} on {price_data['scrape_date']}")
                    continue
                db.add(CompanyPrice(
                    company_name=price_data["company_name"],
                    town=price_data.get("town"),
                    price_decimal=price_data["price_decimal"],
                    scrape_date=price_data["scrape_date"],
                    zone=zone,
                ))
                stored_count += 1
            except SQLAlchemyError as e:
                # Best-effort per record: one bad row must not abort the batch.
                logger.error(f"Failed to store price for {price_data.get('company_name')}: {e}")

        # Commit all new records in one transaction
        if stored_count > 0:
            db.commit()
            logger.info(f"Successfully stored {stored_count} new price records")
        else:
            logger.info("No new price records to store (all duplicates)")

        return LatestPriceResponse(
            status="success",
            message=f"Successfully scraped {len(scraped_prices)} prices, stored {stored_count} new records",
            prices_scraped=len(scraped_prices),
            prices_stored=stored_count,
            scrape_timestamp=datetime.now(timezone.utc).isoformat(),
            prices=price_records,
        )
    except ScraperError as e:
        logger.error(f"Scraper error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Scraping failed: {str(e)}"
        )
    except SQLAlchemyError as e:
        db.rollback()
        logger.error(f"Database error during price storage: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Database error: {str(e)}"
        )
    except Exception as e:
        db.rollback()
        logger.error(f"Unexpected error during scraping: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Scraping failed: {str(e)}"
        )

View File

@@ -0,0 +1,12 @@
from typing import List
from pydantic import BaseModel
from app.schemas import PriceRecord
class LatestPriceResponse(BaseModel):
    """Latest price scrape response schema.

    Returned by GET /scraper/newenglandoil/latestprice.
    """
    # "success" or "warning" (no prices found); failures surface as
    # HTTPException in the router, not through this schema.
    status: str
    # Human-readable summary of the scrape/store outcome.
    message: str
    # Total prices parsed from the website on this run.
    prices_scraped: int
    # New (non-duplicate) rows written to the database.
    prices_stored: int
    # ISO-8601 UTC timestamp of when the response was built.
    scrape_timestamp: str
    # Every scraped price, including ones already stored on a prior run.
    prices: List[PriceRecord]

View File

@@ -0,0 +1,170 @@
"""
Web scraping module for New England Oil prices.
This module handles scraping oil price data from the New England Oil website
for Zone 10 (Central Massachusetts).
"""
import logging
import time
from datetime import date
from decimal import Decimal
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup

from app.config import (
    NEWENGLAND_OIL_ZONE10_URL,
    SCRAPER_USER_AGENT,
    SCRAPER_TIMEOUT,
    SCRAPER_DELAY_SECONDS,
)
logger = logging.getLogger(__name__)
class ScraperError(Exception):
    """Raised when fetching or parsing the New England Oil page fails."""
def scrape_newengland_oil() -> List[Dict[str, Any]]:
    """
    Scrape oil prices from New England Oil Zone 10 page.

    Fetches the page, parses the HTML tables, and extracts company names,
    towns, and prices.

    Returns:
        List of dictionaries with keys: company_name, town, price_decimal,
        scrape_date, zone.

    Raises:
        ScraperError: If the request fails, no table is found, no price
            rows could be extracted, or parsing raises unexpectedly.
    """
    logger.info(f"Starting scrape of {NEWENGLAND_OIL_ZONE10_URL}")
    # Browser-like headers: the site may reject obviously scripted clients.
    headers = {
        "User-Agent": SCRAPER_USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    try:
        # Make the request
        response = requests.get(
            NEWENGLAND_OIL_ZONE10_URL,
            headers=headers,
            timeout=SCRAPER_TIMEOUT
        )
        response.raise_for_status()
        logger.info(f"Successfully fetched page (status: {response.status_code})")

        # Parse HTML
        soup = BeautifulSoup(response.content, 'lxml')
        prices = []
        today = date.today()

        tables = soup.find_all('table')
        if not tables:
            logger.warning("No tables found on page")
            # Debug aid: keep a copy of the unexpected HTML for inspection.
            with open("debug_page.html", "wb") as f:
                f.write(response.content)
            raise ScraperError("No price table found on page")

        # Scan every table row; a usable price row has a company link plus
        # at least a town cell and a price cell.
        for table in tables:
            for row in table.find_all('tr'):
                cells = row.find_all(['td', 'th'])
                if len(cells) < 3:  # expect at least company, town, price
                    continue
                company_link = row.find('a')
                if not company_link:
                    continue
                company_name = company_link.get_text(strip=True)
                cell_texts = [cell.get_text(strip=True) for cell in cells]

                price_value = None
                town_value = None
                for text in cell_texts:
                    text_clean = text.replace('$', '').replace(',', '').strip()
                    # Prices always carry a decimal point (e.g. "$2.599").
                    if text_clean and '.' in text_clean:
                        try:
                            potential_price = Decimal(text_clean)
                            # Reasonable price range for heating oil.
                            if Decimal('0.50') <= potential_price <= Decimal('10.00'):
                                price_value = potential_price
                                break
                        except (ValueError, ArithmeticError):
                            pass  # not numeric; fall through to town check
                    # First non-price, non-company text is taken as the town.
                    # Fix: the original only considered a cell as a town when
                    # Decimal() raised, so plain town names were dropped.
                    if (town_value is None and text and text != company_name
                            and not text.startswith('$') and len(text) > 2):
                        town_value = text

                if price_value:
                    prices.append({
                        "company_name": company_name,
                        "town": town_value,
                        "price_decimal": price_value,
                        "scrape_date": today,
                        "zone": "zone10"
                    })
                    logger.debug(f"Found: {company_name} - {town_value} - ${price_value}")

        if not prices:
            logger.warning("No prices extracted from page")
            raise ScraperError("Failed to extract any price data from page")

        logger.info(f"Successfully scraped {len(prices)} price records")
        return prices
    except requests.RequestException as e:
        logger.error(f"Request failed: {e}")
        raise ScraperError(f"Failed to fetch page: {str(e)}")
    except ScraperError:
        # Fix: re-raise our own errors unchanged; previously they fell into
        # the generic handler and were re-wrapped as "Failed to parse page".
        raise
    except Exception as e:
        logger.error(f"Scraping failed: {e}", exc_info=True)
        raise ScraperError(f"Failed to parse page: {str(e)}")
def scrape_and_delay() -> List[Dict[str, Any]]:
    """
    Scrape prices and then apply the configured rate-limiting delay.

    Convenience wrapper around scrape_newengland_oil() for callers that
    perform repeated scrapes and must not hammer the site.

    Returns:
        List of price dictionaries (see scrape_newengland_oil).

    Raises:
        ScraperError: Propagated from scrape_newengland_oil().
    """
    prices = scrape_newengland_oil()
    # Sleep after the request so back-to-back calls are spaced out.
    if SCRAPER_DELAY_SECONDS > 0:
        logger.debug(f"Sleeping {SCRAPER_DELAY_SECONDS}s for rate limiting")
        time.sleep(SCRAPER_DELAY_SECONDS)
    return prices