feat: initial commit for oil price scraper service

FastAPI-based scraper for commodity ticker prices (HO, CL, RB futures)
and competitor oil pricing from NewEnglandOil. Includes cron-driven
scraping, PostgreSQL storage, and REST endpoints for price retrieval.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-08 17:57:44 -05:00
commit af9c2f99e7
25 changed files with 1566 additions and 0 deletions

23
.env.example Normal file
View File

@@ -0,0 +1,23 @@
# Environment variable template for eamco_scraper
# Copy to .env.local or .env.prod and fill in actual values (see README)
# Application mode
MODE=PRODUCTION
CURRENT_SETTINGS=PRODUCTION
# Database configuration
POSTGRES_USERNAME=postgres
POSTGRES_PW=your_password_here
POSTGRES_SERVER=192.168.1.204
POSTGRES_PORT=5432
POSTGRES_DBNAME=auburnoil
# Logging
LOG_LEVEL=INFO
# Scraper configuration
SCRAPER_DELAY=2.0
SCRAPER_TIMEOUT=10
# CORS (optional - comma-separated)
# CORS_ORIGINS=https://oil.edwineames.com,https://edwineames.com

132
.gitignore vendored Normal file
View File

@@ -0,0 +1,132 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having sub-dependencies with platform-specific binaries, it is better to ignore the Pipfile.lock.
# Pipfile.lock
# PEP 582; __pypackages__
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyderworkspace
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# Local environment files (contain credentials; never commit)
.env.local
.env.prod

18
Dockerfile.dev Normal file
View File

@@ -0,0 +1,18 @@
# eamco_scraper - DEVELOPMENT Dockerfile
# Minimal development setup
FROM python:3.11-slim
WORKDIR /app
# Build deps for psycopg2 (libpq-dev, gcc); apt lists removed to keep the layer small
RUN apt-get update && apt-get install -y --no-install-recommends \
libpq-dev gcc && rm -rf /var/lib/apt/lists/*
# requirements.txt is copied alone so the pip layer is cached across code edits
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 8000
# --reload restarts uvicorn on source changes (development only)
CMD ["uvicorn", "app.main:app", "--reload", "--host", "0.0.0.0", "--port", "8000"]

37
Dockerfile.local Normal file
View File

@@ -0,0 +1,37 @@
# eamco_scraper - LOCAL Dockerfile
# Used by docker-compose.local.yml
# Features: Volume mounts for development, hot reload
FROM python:3.11-slim
# Set environment variables:
#   PYTHONDONTWRITEBYTECODE - no .pyc files in the bind-mounted tree
#   PYTHONUNBUFFERED        - logs stream immediately to docker logs
#   PYTHONPATH              - make 'app' importable as a package root
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app
# Set working directory
WORKDIR /app
# Install system dependencies (libpq-dev + gcc needed to build psycopg2)
RUN apt-get update && apt-get install -y --no-install-recommends \
libpq-dev \
gcc \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements first so the pip layer is cached across code changes
COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy environment file for local; config.py's load_dotenv() reads .env
COPY .env.local .env
# Copy application code (will be overridden by volume mount)
COPY app/ ./app/
# Expose port
EXPOSE 8000
# Development: Run with reload
CMD ["uvicorn", "app.main:app", "--reload", "--host", "0.0.0.0", "--port", "8000"]

49
Dockerfile.prod Normal file
View File

@@ -0,0 +1,49 @@
# eamco_scraper - PRODUCTION Dockerfile
# Used by docker-compose.prod.yml
# Features: Optimized for production, non-root user, health checks
FROM python:3.11-slim
# Set environment variables:
#   PYTHONDONTWRITEBYTECODE / PYTHONUNBUFFERED - no .pyc files, unbuffered logs
#   PIP_NO_CACHE_DIR / PIP_DISABLE_PIP_VERSION_CHECK - smaller, quieter pip
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1
# Set working directory
WORKDIR /app
# Install system dependencies (psycopg2 requirements)
# NOTE(review): gcc/libpq-dev stay in the final image; a multi-stage build or
# psycopg2-binary would slim this down — TODO consider
RUN apt-get update && apt-get install -y --no-install-recommends \
libpq-dev \
gcc \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
# Copy requirements first (for better layer caching)
COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy environment file for production; config.py's load_dotenv() reads .env
COPY .env.prod .env
# Copy application code
COPY app/ ./app/
# Create non-root user for security
RUN useradd --create-home --shell /bin/bash appuser && \
chown -R appuser:appuser /app
USER appuser
# Expose port
EXPOSE 8000
# Health check: hits the app's /health endpoint using only the stdlib
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Production: Run without reload, with workers
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]

155
README.md Normal file
View File

@@ -0,0 +1,155 @@
# eamco_scraper
FastAPI microservice for scraping heating oil prices from New England Oil and storing historical pricing data.
## Overview
This service scrapes oil company pricing data from the New England Oil website (Zone 10 - Central Massachusetts) and stores it in a PostgreSQL database for historical tracking and trend analysis.
## Features
- **Web Scraping**: Automated scraping of oil prices using BeautifulSoup4
- **Historical Tracking**: Stores all price records (no updates, only inserts) for trend analysis
- **Cron-Friendly**: Single GET request triggers scrape and storage
- **Health Checks**: Built-in health endpoint for monitoring
- **Docker Ready**: Production and development Docker configurations
## API Endpoints
### `GET /health`
Health check endpoint with database connectivity status.
**Response:**
```json
{
"status": "healthy",
"db_connected": true
}
```
### `GET /scraper/newenglandoil/latestprice`
Trigger scrape of New England Oil Zone 10 prices, store in database, and return results.
**Response:**
```json
{
"status": "success",
"message": "Successfully scraped and stored 30 prices",
"prices_scraped": 30,
"prices_stored": 30,
"scrape_timestamp": "2026-02-07T22:00:00",
"prices": [
{
"company_name": "AUBURN OIL",
"town": "Auburn",
"price_decimal": 2.599,
"scrape_date": "2026-02-07",
"zone": "zone10"
}
]
}
```
## Database Schema
### `company_prices` Table
| Column | Type | Description |
|--------|------|-------------|
| id | SERIAL | Primary key |
| company_name | VARCHAR(255) | Oil company name |
| town | VARCHAR(100) | Town/city |
| price_decimal | DECIMAL(6,3) | Price per gallon |
| scrape_date | DATE | Date price was listed |
| zone | VARCHAR(50) | Geographic zone (default: zone10) |
| created_at | TIMESTAMP | Record creation timestamp |
**Indexes:**
- `idx_company_prices_company` on `company_name`
- `idx_company_prices_scrape_date` on `scrape_date`
- `idx_company_prices_zone` on `zone`
- `idx_company_prices_company_date` on `(company_name, scrape_date)`
- `idx_company_prices_zone_date` on `(zone, scrape_date)`
## Development
### Local Setup
```bash
# Create virtual environment
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
# Install dependencies
pip install -r requirements.txt
# Copy environment file
cp .env.example .env.local
# Edit .env.local with your database credentials
# Run the application
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
```
### Docker Local
```bash
cd /mnt/code/oil/eamco/eamco_deploy
docker-compose -f docker-compose.local.yml up scraper_local
```
Access at: http://localhost:9619
## Production
### Docker Production
```bash
cd /mnt/code/oil/eamco/eamco_deploy
docker-compose -f docker-compose.prod.yml up -d scraper_prod
```
Access at: http://192.168.1.204:9519
## Cron Integration
Add to Unraid cron or system crontab:
```bash
# Scrape prices daily at 6 AM
0 6 * * * curl -s http://192.168.1.204:9519/scraper/newenglandoil/latestprice > /dev/null 2>&1
```
## Environment Variables
| Variable | Description | Default |
|----------|-------------|---------|
| MODE | Application mode (LOCAL/PRODUCTION) | LOCAL |
| POSTGRES_USERNAME | Database username | postgres |
| POSTGRES_PW | Database password | password |
| POSTGRES_SERVER | Database server | 192.168.1.204 |
| POSTGRES_PORT | Database port | 5432 |
| POSTGRES_DBNAME | Database name | eamco |
| LOG_LEVEL | Logging level | INFO |
| SCRAPER_DELAY | Delay between requests (seconds) | 2.0 |
| SCRAPER_TIMEOUT | Request timeout (seconds) | 10 |
## Architecture
- **Framework**: FastAPI 0.109+
- **Database**: PostgreSQL 15+ with SQLAlchemy 2.0
- **Scraping**: BeautifulSoup4 + lxml + requests
- **Server**: Uvicorn with 2 workers (production)
## Ports
- **Local Development**: 9619
- **Production**: 9519
## Future Enhancements
- Frontend display on Home page (table or cards)
- Price change alerts/notifications
- Support for additional zones
- Price trend graphs and analytics

1
app/__init__.py Normal file
View File

@@ -0,0 +1 @@
# eamco_scraper app package

111
app/config.py Normal file
View File

@@ -0,0 +1,111 @@
"""
Configuration settings for eamco_scraper.
This module provides configuration with environment-based switching:
- LOCAL: Uses 'eamco' database, localhost CORS origins
- PRODUCTION: Uses 'auburnoil' database, production domain CORS origins
Environment variables are loaded from .env.local or .env.prod depending
on the Docker compose file used.
"""
import logging
import os
from typing import List
from dotenv import load_dotenv
# Load environment variables from .env file if present
load_dotenv()
# =============================================================================
# ENVIRONMENT MODE
# =============================================================================
MODE = os.getenv("MODE", "LOCAL")
CURRENT_SETTINGS = os.getenv("CURRENT_SETTINGS", "DEVELOPMENT")
# Log configuration mode (logger setup happens after config is loaded)
_config_mode_msg = f"Using {'PRODUCTION' if CURRENT_SETTINGS == 'PRODUCTION' else 'DEVELOPMENT'} configuration"
# =============================================================================
# DATABASE CONFIGURATION
# =============================================================================
# Database connection components (can be overridden individually)
POSTGRES_USERNAME = os.getenv("POSTGRES_USERNAME", "postgres")
POSTGRES_PW = os.getenv("POSTGRES_PW", "password")
POSTGRES_SERVER = os.getenv("POSTGRES_SERVER", "192.168.1.204")
POSTGRES_PORT = os.getenv("POSTGRES_PORT", "5432")
# Database name differs by environment
if CURRENT_SETTINGS == "PRODUCTION":
POSTGRES_DBNAME = os.getenv("POSTGRES_DBNAME", "auburnoil")
else:
POSTGRES_DBNAME = os.getenv("POSTGRES_DBNAME", "eamco")
# Build connection URI from components (fallback)
_DEFAULT_DATABASE_URI = "postgresql+psycopg2://{}:{}@{}:{}/{}".format(
POSTGRES_USERNAME,
POSTGRES_PW,
POSTGRES_SERVER,
POSTGRES_PORT,
POSTGRES_DBNAME
)
# Allow full DATABASE_URL override
DATABASE_URL: str = os.getenv("DATABASE_URL", _DEFAULT_DATABASE_URI)
# SQLAlchemy binds (for compatibility)
SQLALCHEMY_DATABASE_URI = DATABASE_URL
SQLALCHEMY_BINDS = {POSTGRES_DBNAME: SQLALCHEMY_DATABASE_URI}
# =============================================================================
# CORS CONFIGURATION
# =============================================================================
# Parse CORS origins from environment (comma-separated) or use defaults
_cors_env = os.getenv("CORS_ORIGINS", "")
if _cors_env:
CORS_ORIGINS: List[str] = [origin.strip() for origin in _cors_env.split(",")]
elif CURRENT_SETTINGS == "PRODUCTION":
# Production CORS origins
CORS_ORIGINS = [
"https://oil.edwineames.com",
"https://edwineames.com",
]
else:
# Development CORS origins
CORS_ORIGINS = [
"http://localhost:9000",
"https://localhost:9513",
"http://localhost:9514",
"http://localhost:9512",
"http://localhost:9511",
"http://localhost:5173", # Frontend port
"http://localhost:9616", # Authorize service port
]
# =============================================================================
# SCRAPER CONFIGURATION
# =============================================================================
# User agent for web scraping (identifies your application)
SCRAPER_USER_AGENT: str = "Unraid-EamcoScraper/1.0 (eeames214@gmail.com)"
# Rate limiting: Sleep between requests (be respectful to target servers)
SCRAPER_DELAY_SECONDS: float = float(os.getenv("SCRAPER_DELAY", "2.0"))
# Request timeout in seconds
SCRAPER_TIMEOUT: int = int(os.getenv("SCRAPER_TIMEOUT", "10"))
# Target URL for New England Oil Zone 10
NEWENGLAND_OIL_ZONE10_URL: str = "https://www.newenglandoil.com/massachusetts/zone10.asp?x=0"
# =============================================================================
# LOGGING CONFIGURATION
# =============================================================================
LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

88
app/database.py Normal file
View File

@@ -0,0 +1,88 @@
import logging
import sys
from contextlib import contextmanager
from typing import Generator

from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker, Session
from sqlalchemy.exc import SQLAlchemyError

from app.config import DATABASE_URL, LOG_LEVEL, LOG_FORMAT

# --- Logging -----------------------------------------------------------------
# Write to stdout so container log collection picks everything up.
logging.basicConfig(
    level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
    format=LOG_FORMAT,
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# --- Engine & session factory ------------------------------------------------
# pool_pre_ping validates pooled connections before handing them out, which
# guards against stale connections after a database restart.
engine = create_engine(
    DATABASE_URL,
    pool_pre_ping=True,
    pool_size=5,
    max_overflow=10,
    echo=False,  # flip to True to log emitted SQL while debugging
)

# Factory for request-scoped sessions; commits are always explicit.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
def get_db() -> Generator[Session, None, None]:
    """
    FastAPI dependency that provides a database session.

    Yields a SQLAlchemy session and guarantees it is closed once the
    request handler finishes, even on error.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
@contextmanager
def get_db_session() -> Generator[Session, None, None]:
    """
    Context-manager variant of get_db for non-dependency (script/internal) use.

    The session is always closed on exit of the `with` block.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
def check_db_connection() -> bool:
    """
    Test database connectivity with a trivial round-trip query.

    Returns:
        True if database is reachable, False otherwise (the failure is
        logged, never raised — callers use this for health reporting).
    """
    try:
        with get_db_session() as session:
            session.execute(text("SELECT 1"))
            return True
    except SQLAlchemyError as exc:
        logger.error(f"Database connection failed: {exc}")
        return False

195
app/main.py Normal file
View File

@@ -0,0 +1,195 @@
"""
eamco_scraper - FastAPI Oil Price Scraping Microservice.
This microservice provides endpoints for scraping oil prices from New England Oil
and storing them in the database for historical tracking.
Endpoints:
GET /health - Health check with database connectivity status
GET /scraper/newenglandoil/latestprice - Trigger scrape and return latest prices
Usage:
# Development
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
# Production (Docker)
docker run -p 8000:8000 eamco_scraper
# Trigger from cron
curl http://localhost:8000/scraper/newenglandoil/latestprice
"""
import logging
import os
import sys
from typing import List
from datetime import datetime
from fastapi import FastAPI, Depends, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from sqlalchemy.orm import Session
from app.config import (
DATABASE_URL,
CORS_ORIGINS,
LOG_LEVEL,
LOG_FORMAT,
)
from app.models import Base, CompanyPrice
from app.database import engine, get_db, check_db_connection
from app.schemas import HealthResponse, PriceRecord
from app.newenglandoil.router import router as newenglandoil_router
from app.priceticker.router import router as priceticker_router
# =============================================================================
# LOGGING CONFIGURATION
# =============================================================================
logging.basicConfig(
level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
format=LOG_FORMAT,
handlers=[
logging.StreamHandler(sys.stdout),
]
)
logger = logging.getLogger(__name__)
# =============================================================================
# FASTAPI APPLICATION
# =============================================================================
app = FastAPI(
title="eamco_scraper",
description="Oil price scraping microservice for New England Oil",
version="1.0.0",
docs_url="/docs",
redoc_url="/redoc",
)
# =============================================================================
# CORS MIDDLEWARE
# =============================================================================
app.add_middleware(
CORSMiddleware,
allow_origins=CORS_ORIGINS,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# =============================================================================
# ROUTERS
# =============================================================================
app.include_router(newenglandoil_router)
app.include_router(priceticker_router)
# =============================================================================
# ENDPOINTS
# =============================================================================
@app.get("/", include_in_schema=False)
async def root():
"""Root endpoint - redirect to docs."""
return {
"service": "eamco_scraper",
"version": "1.0.0",
"docs": "/docs",
}
@app.get("/health", response_model=HealthResponse, tags=["Health"])
async def health_check():
"""
Health check endpoint.
Returns service status and database connectivity.
Use this endpoint for container health checks and monitoring.
Returns:
HealthResponse with status and db_connected flag
"""
db_connected = check_db_connection()
return HealthResponse(
status="healthy" if db_connected else "degraded",
db_connected=db_connected,
)
@app.get(
    "/scraper/prices",
    response_model=List[PriceRecord],
    tags=["Prices"],
)
async def get_stored_prices(
    date: str | None = None,
    db: Session = Depends(get_db)
):
    """
    Get stored oil prices from the database (read-only; does NOT scrape).

    Args:
        date: ISO date (YYYY-MM-DD) to filter on; defaults to today (UTC).
        db: Injected SQLAlchemy session.

    Returns:
        List of PriceRecord for the requested date (possibly empty).

    Raises:
        HTTPException 400: if `date` is not a valid YYYY-MM-DD string.
        HTTPException 500: on unexpected query errors.
    """
    if not date:
        date = datetime.utcnow().date().isoformat()
    else:
        # Validate up front so a malformed date yields a clear 400 instead of
        # surfacing as a database error / opaque 500.
        try:
            datetime.strptime(date, "%Y-%m-%d")
        except ValueError:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid date {date!r}; expected YYYY-MM-DD",
            )
    try:
        # Query prices for the specific date
        rows = db.query(CompanyPrice).filter(
            CompanyPrice.scrape_date == date
        ).all()
        return [
            PriceRecord(
                company_name=row.company_name,
                town=row.town,
                price_decimal=float(row.price_decimal),
                scrape_date=str(row.scrape_date),
                zone=row.zone,
            )
            for row in rows
        ]
    except Exception as e:
        logger.error(f"Error fetching prices: {e}")
        raise HTTPException(status_code=500, detail=str(e))
# --- Startup / shutdown events -----------------------------------------------
@app.on_event("startup")
async def startup_event():
    """Log configuration at boot, probe the DB, and ensure tables exist."""
    logger.info("🚀 eamco_scraper STARTING")
    mode = os.environ.get('MODE', 'DEVELOPMENT').upper()
    if mode in ('DEVELOPMENT', 'DEV', 'LOCAL'):
        logger.info("🤖🤖🤖🤖🤖 Mode: Development 🤖🤖🤖🤖🤖")
    elif mode in ('PRODUCTION', 'PROD'):
        logger.info("💀💀💀💀💀💀💀💀💀💀 ⚠️ WARNING PRODUCTION 💀💀💀💀💀💀💀💀💀💀")
    # Only a prefix of the URL is logged so credentials stay out of the logs.
    logger.info(f"DB: {DATABASE_URL[:30]}...")
    logger.info(f"CORS: {len(CORS_ORIGINS)} origins configured")
    if check_db_connection():
        logger.info("DB Connection: ✅ OK")
    else:
        logger.warning("DB Connection: ❌ FAILED")
    # create_all is idempotent: it only creates tables that are missing.
    try:
        Base.metadata.create_all(bind=engine)
        logger.info("Database tables verified/created")
    except Exception as e:
        logger.error(f"Failed to create tables: {e}")
@app.on_event("shutdown")
async def shutdown_event():
"""Application shutdown - cleanup."""
logger.info("🛑 eamco_scraper SHUTTING DOWN")

99
app/models.py Normal file
View File

@@ -0,0 +1,99 @@
"""
SQLAlchemy models for eamco_scraper.
This module defines the database models for storing scraped oil price data.
"""
from datetime import datetime
from sqlalchemy import Column, Integer, String, Numeric, Date, DateTime, Index
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
class CompanyPrice(Base):
    """
    Historical oil-company pricing data.

    Every scrape inserts new rows (no updates) so that price trends can be
    reconstructed from the full history.

    Attributes:
        id: Primary key.
        company_name: Name of the oil company.
        town: Town where the company operates (may be absent).
        price_decimal: Price per gallon, e.g. 2.599.
        scrape_date: Date the price was listed on the website.
        zone: Geographic zone (default 'zone10' = Central Massachusetts).
        created_at: Timestamp when the row was inserted.
    """
    __tablename__ = "company_prices"

    id = Column(Integer, primary_key=True, autoincrement=True)
    company_name = Column(String(255), nullable=False, index=True)
    town = Column(String(100), nullable=True)
    price_decimal = Column(Numeric(6, 3), nullable=False)
    scrape_date = Column(Date, nullable=False, index=True)
    zone = Column(String(50), nullable=False, default="zone10", index=True)
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)

    def __repr__(self):
        return f"<CompanyPrice(company='{self.company_name}', price={self.price_decimal}, date={self.scrape_date})>"

    def to_dict(self):
        """Serialize to a JSON-friendly dict (Decimal -> float, dates -> ISO)."""
        price = float(self.price_decimal) if self.price_decimal else None
        return {
            "id": self.id,
            "company_name": self.company_name,
            "town": self.town,
            "price_decimal": price,
            "scrape_date": self.scrape_date.isoformat() if self.scrape_date else None,
            "zone": self.zone,
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }


# Composite indexes for the most common lookup patterns.
Index('idx_company_prices_company_date', CompanyPrice.company_name, CompanyPrice.scrape_date)
Index('idx_company_prices_zone_date', CompanyPrice.zone, CompanyPrice.scrape_date)
class TickerPrice(Base):
    """
    Snapshot of a stock/commodity ticker price at scrape time.

    Attributes:
        id: Primary key.
        symbol: Ticker symbol (e.g., HO=F, CL=F, RB=F).
        price_decimal: Current price.
        currency: Currency code (e.g., USD).
        change_decimal: Price change amount.
        percent_change_decimal: Price change percentage.
        timestamp: Time of scrape.
    """
    __tablename__ = "ticker_prices"

    id = Column(Integer, primary_key=True, autoincrement=True)
    symbol = Column(String(20), nullable=False, index=True)
    price_decimal = Column(Numeric(10, 4), nullable=False)
    currency = Column(String(10), nullable=True)
    change_decimal = Column(Numeric(10, 4), nullable=True)
    percent_change_decimal = Column(Numeric(10, 4), nullable=True)
    timestamp = Column(DateTime, nullable=False, default=datetime.utcnow, index=True)

    def __repr__(self):
        return f"<TickerPrice(symbol='{self.symbol}', price={self.price_decimal}, time={self.timestamp})>"

    def to_dict(self):
        """Serialize to a JSON-friendly dict (None-safe numeric conversion)."""
        def _num(value):
            # Numeric columns come back as Decimal; JSON wants float.
            return float(value) if value is not None else None

        return {
            "id": self.id,
            "symbol": self.symbol,
            "price": _num(self.price_decimal),
            "currency": self.currency,
            "change": _num(self.change_decimal),
            "percent_change": _num(self.percent_change_decimal),
            "timestamp": self.timestamp.isoformat() if self.timestamp else None,
        }

View File

@@ -0,0 +1,9 @@
"""
New England Oil Scraper Module.
This package contains code specific to scraping prices from the New England Oil website.
"""
from app.newenglandoil.scraper import scrape_newengland_oil, ScraperError
__all__ = ['scrape_newengland_oil', 'ScraperError']

140
app/newenglandoil/router.py Normal file
View File

@@ -0,0 +1,140 @@
import logging
from datetime import datetime
from typing import List
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy.orm import Session
from sqlalchemy.exc import SQLAlchemyError
from app.models import CompanyPrice
from app.schemas import PriceRecord
from app.newenglandoil.schemas import LatestPriceResponse
from app.newenglandoil.scraper import scrape_newengland_oil, ScraperError
from app.database import get_db
logger = logging.getLogger(__name__)
router = APIRouter(
prefix="/scraper/newenglandoil",
tags=["New England Oil Scraper"],
)
@router.get(
    "/latestprice",
    response_model=LatestPriceResponse,
)
async def get_latest_prices(db: Session = Depends(get_db)):
    """
    Scrape latest oil prices from New England Oil Zone 10.

    This endpoint:
    1. Scrapes the New England Oil website for current prices
    2. Stores prices not already present (one row per company/date/zone,
       so repeated runs on the same day are idempotent)
    3. Returns every scraped price plus storage statistics

    Designed to be called from cron for automated price tracking.

    Example:
        curl http://localhost:8000/scraper/newenglandoil/latestprice

    Returns:
        LatestPriceResponse with scraped prices and storage statistics

    Raises:
        HTTPException 500: when scraping or storage fails.
    """
    logger.info("=" * 60)
    logger.info("SCRAPER ENDPOINT CALLED - New England Oil Zone 10")
    logger.info("=" * 60)
    try:
        # Scrape the website
        scraped_prices = scrape_newengland_oil()
        if not scraped_prices:
            logger.warning("No prices were scraped")
            return LatestPriceResponse(
                status="warning",
                message="No prices found on website",
                prices_scraped=0,
                prices_stored=0,
                scrape_timestamp=datetime.utcnow().isoformat(),
                prices=[]
            )

        # Fetch all already-stored (company, date, zone) keys in ONE query
        # instead of one lookup per scraped row (avoids N+1 round-trips).
        scrape_dates = {p["scrape_date"] for p in scraped_prices}
        existing_keys = {
            (row.company_name, row.scrape_date, row.zone)
            for row in db.query(
                CompanyPrice.company_name,
                CompanyPrice.scrape_date,
                CompanyPrice.zone,
            ).filter(CompanyPrice.scrape_date.in_(scrape_dates))
        }

        stored_count = 0
        price_records = []
        for price_data in scraped_prices:
            zone = price_data.get("zone", "zone10")
            # Echo every scraped price back to the caller, new or duplicate.
            price_records.append(PriceRecord(
                company_name=price_data["company_name"],
                town=price_data.get("town"),
                price_decimal=float(price_data["price_decimal"]),
                scrape_date=price_data["scrape_date"].isoformat(),
                zone=zone,
            ))
            key = (price_data["company_name"], price_data["scrape_date"], zone)
            if key in existing_keys:
                logger.debug(f"Skipping duplicate record for {price_data['company_name']} on {price_data['scrape_date']}")
                continue
            try:
                db.add(CompanyPrice(
                    company_name=price_data["company_name"],
                    town=price_data.get("town"),
                    price_decimal=price_data["price_decimal"],
                    scrape_date=price_data["scrape_date"],
                    zone=zone,
                ))
                # Also guards against duplicates within a single scrape batch.
                existing_keys.add(key)
                stored_count += 1
            except Exception as e:
                logger.error(f"Failed to store price for {price_data.get('company_name')}: {e}")

        # Commit all new records in one transaction.
        if stored_count > 0:
            db.commit()
            logger.info(f"Successfully stored {stored_count} new price records")
        else:
            logger.info("No new price records to store (all duplicates)")

        return LatestPriceResponse(
            status="success",
            message=f"Successfully scraped {len(scraped_prices)} prices, stored {stored_count} new records",
            prices_scraped=len(scraped_prices),
            prices_stored=stored_count,
            scrape_timestamp=datetime.utcnow().isoformat(),
            prices=price_records,
        )
    except ScraperError as e:
        logger.error(f"Scraper error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Scraping failed: {str(e)}"
        )
    except SQLAlchemyError as e:
        db.rollback()
        logger.error(f"Database error during price storage: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Database error: {str(e)}"
        )
    except Exception as e:
        db.rollback()
        logger.error(f"Unexpected error during scraping: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Scraping failed: {str(e)}"
        )

View File

@@ -0,0 +1,12 @@
from typing import List

from pydantic import BaseModel

from app.schemas import PriceRecord


class LatestPriceResponse(BaseModel):
    """Response payload for the latest-price scrape endpoint."""
    status: str                # "success" or "warning"
    message: str               # human-readable summary of the scrape
    prices_scraped: int        # rows parsed from the website
    prices_stored: int         # rows newly inserted (duplicates skipped)
    scrape_timestamp: str      # ISO timestamp of when the scrape ran
    prices: List[PriceRecord]  # every scraped price, new or duplicate

View File

@@ -0,0 +1,170 @@
"""
Web scraping module for New England Oil prices.
This module handles scraping oil price data from the New England Oil website
for Zone 10 (Central Massachusetts).
"""
import logging
import time
from datetime import date
from typing import List, Dict, Optional
from decimal import Decimal
import requests
from bs4 import BeautifulSoup
from app.config import (
NEWENGLAND_OIL_ZONE10_URL,
SCRAPER_USER_AGENT,
SCRAPER_TIMEOUT,
SCRAPER_DELAY_SECONDS,
)
logger = logging.getLogger(__name__)
class ScraperError(Exception):
    """Raised when the price page cannot be fetched or parsed."""
    pass
def _extract_price_and_town(cell_texts: List[str], company_name: str):
    """
    Pick a plausible price and town out of one table row's cell texts.

    A cell counts as a price when, after stripping '$' and ',', it parses as
    a Decimal containing a dot and falls in the plausible heating-oil range
    (0.50-10.00 $/gal). The first remaining cell longer than 2 characters
    that is not the company name itself is taken as the town.

    Returns:
        (price, town) tuple — either element may be None.
    """
    price_value: Optional[Decimal] = None
    town_value: Optional[str] = None
    for text in cell_texts:
        cleaned = text.replace('$', '').replace(',', '').strip()
        parsed_numeric = False
        if cleaned and '.' in cleaned:
            try:
                candidate = Decimal(cleaned)
                parsed_numeric = True
                if Decimal('0.50') <= candidate <= Decimal('10.00'):
                    price_value = candidate
                    break
            except ArithmeticError:  # Decimal raises InvalidOperation (an ArithmeticError)
                pass
        # BUGFIX: town detection previously only ran when a dotted cell FAILED
        # to parse as Decimal, so dotless town names were never captured.
        if (town_value is None and not parsed_numeric and text
                and not text.startswith('$') and len(text) > 2
                and text != company_name):
            town_value = text
    return price_value, town_value


def scrape_newengland_oil() -> List[Dict[str, any]]:
    """
    Scrape oil prices from New England Oil Zone 10 page.

    Fetches the page, walks every table row containing a company link, and
    extracts company name, town, and price.

    Returns:
        List of dictionaries with keys: company_name, town, price_decimal,
        scrape_date, zone

    Raises:
        ScraperError: If the request fails, no table is found, or no price
            rows could be extracted
    """
    logger.info(f"Starting scrape of {NEWENGLAND_OIL_ZONE10_URL}")
    headers = {
        "User-Agent": SCRAPER_USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    try:
        # Make the request
        response = requests.get(
            NEWENGLAND_OIL_ZONE10_URL,
            headers=headers,
            timeout=SCRAPER_TIMEOUT
        )
        response.raise_for_status()
        logger.info(f"Successfully fetched page (status: {response.status_code})")

        # Parse HTML
        soup = BeautifulSoup(response.content, 'lxml')
        prices = []
        today = date.today()

        tables = soup.find_all('table')
        if not tables:
            logger.warning("No tables found on page")
            # Best-effort debug dump: the production container runs as a
            # non-root user that may not be able to write to the workdir,
            # so a failed dump must not mask the real error.
            try:
                with open("debug_page.html", "wb") as f:
                    f.write(response.content)
            except OSError as dump_err:
                logger.debug(f"Could not write debug_page.html: {dump_err}")
            raise ScraperError("No price table found on page")

        # Rows containing a link are assumed to be company rows; extract a
        # price and a town from the remaining cells of each such row.
        for table in tables:
            for row in table.find_all('tr'):
                cells = row.find_all(['td', 'th'])
                if len(cells) < 3:  # expect at least company, town, price
                    continue
                company_link = row.find('a')
                if not company_link:
                    continue
                company_name = company_link.get_text(strip=True)
                cell_texts = [cell.get_text(strip=True) for cell in cells]
                price_value, town_value = _extract_price_and_town(cell_texts, company_name)
                if price_value:
                    prices.append({
                        "company_name": company_name,
                        "town": town_value,
                        "price_decimal": price_value,
                        "scrape_date": today,
                        "zone": "zone10"
                    })
                    logger.debug(f"Found: {company_name} - {town_value} - ${price_value}")

        if not prices:
            logger.warning("No prices extracted from page")
            raise ScraperError("Failed to extract any price data from page")
        logger.info(f"Successfully scraped {len(prices)} price records")
        return prices
    except ScraperError:
        # BUGFIX: previously caught by the generic handler below and
        # re-wrapped as "Failed to parse page: ..."; re-raise unchanged.
        raise
    except requests.RequestException as e:
        logger.error(f"Request failed: {e}")
        raise ScraperError(f"Failed to fetch page: {str(e)}")
    except Exception as e:
        logger.error(f"Scraping failed: {e}", exc_info=True)
        raise ScraperError(f"Failed to parse page: {str(e)}")
def scrape_and_delay() -> List[Dict[str, any]]:
    """
    Scrape prices and apply rate limiting delay.

    Convenience wrapper around scrape_newengland_oil() that sleeps after
    the scrape so back-to-back invocations respect the configured limit.

    Returns:
        List of price dictionaries
    """
    results = scrape_newengland_oil()
    # No configured delay -> nothing more to do.
    if SCRAPER_DELAY_SECONDS <= 0:
        return results
    logger.debug(f"Sleeping {SCRAPER_DELAY_SECONDS}s for rate limiting")
    time.sleep(SCRAPER_DELAY_SECONDS)
    return results

View File

72
app/priceticker/router.py Normal file
View File

@@ -0,0 +1,72 @@
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy.orm import Session
from typing import List
import logging
from app.database import get_db
from app.models import TickerPrice
from app.priceticker.scraper import fetch_ticker_data
logger = logging.getLogger(__name__)
# All ticker endpoints in this module are mounted under /scraper/priceticker
# and grouped under the "Price Ticker" tag in the OpenAPI docs.
router = APIRouter(
    prefix="/scraper/priceticker",
    tags=["Price Ticker"]
)
@router.post("/update")
def update_prices(db: Session = Depends(get_db)):
    """
    Trigger an immediate update of stock/commodity prices.

    Fetches the latest ticker quotes via fetch_ticker_data() and persists
    one TickerPrice row per symbol in a single transaction.

    Returns:
        dict: {"status": "success", "updated": <number of rows saved>}

    Raises:
        HTTPException(500): if no ticker data could be fetched, or the
            database write fails (transaction is rolled back first).
    """
    # NOTE(review): declared as plain `def` (was `async def`) so FastAPI runs
    # this handler in its threadpool — fetch_ticker_data() does blocking
    # network I/O and the SQLAlchemy Session is synchronous; running them on
    # the event loop would stall every other request.
    logger.info("Triggering ticker update...")
    data = fetch_ticker_data()
    if not data:
        raise HTTPException(status_code=500, detail="Failed to fetch ticker data")
    try:
        saved_records = []
        for item in data:
            record = TickerPrice(
                symbol=item["symbol"],
                price_decimal=item["price"],
                currency=item["currency"],
                change_decimal=item["change"],
                percent_change_decimal=item["percent_change"],
                timestamp=item["timestamp"]
            )
            db.add(record)
            saved_records.append(record)
        # Single commit for all symbols: either every row lands or none do.
        db.commit()
        return {"status": "success", "updated": len(saved_records)}
    except Exception as e:
        db.rollback()
        logger.error(f"Database error saving tickers: {e}")
        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
@router.get("/latest")
def get_latest_prices(db: Session = Depends(get_db)):
    """
    Get the most recent price for each ticker symbol.

    Returns:
        list: the latest stored record (as a dict via to_dict()) for each
        known symbol; symbols with no stored rows are simply omitted.
    """
    # NOTE(review): plain `def` (was `async def`) — the synchronous SQLAlchemy
    # queries below would otherwise block the event loop; FastAPI runs sync
    # handlers in a threadpool.
    #
    # A "latest row per group" query needs a window function or correlated
    # subquery; with only three fixed symbols, one small indexed query per
    # symbol is simpler and fast enough.
    TARGET_SYMBOLS = ["HO=F", "CL=F", "RB=F"]
    results = []
    for symbol in TARGET_SYMBOLS:
        latest = db.query(TickerPrice).filter(
            TickerPrice.symbol == symbol
        ).order_by(TickerPrice.timestamp.desc()).first()
        if latest:
            results.append(latest.to_dict())
    return results

View File

@@ -0,0 +1,65 @@
import logging
import yfinance as yf
from decimal import Decimal
from datetime import datetime
from typing import List, Dict, Any
logger = logging.getLogger(__name__)
# Yahoo Finance futures symbols tracked by this service:
# HO=F : Heating Oil
# CL=F : Crude Oil
# RB=F : RBOB Gasoline
TICKERS = ["HO=F", "CL=F", "RB=F"]
def fetch_ticker_data() -> List[Dict[str, Any]]:
    """
    Fetch current data for oil tickers from Yahoo Finance.

    Returns:
        List of dicts with keys: symbol, price, currency, change,
        percent_change, timestamp. Symbols that fail to resolve are
        skipped, so the list may be shorter than TICKERS (or empty
        if the whole batch request fails). Never raises.
    """
    results = []
    try:
        # One batched request for all symbols is cheaper than per-symbol calls.
        tickers_str = " ".join(TICKERS)
        data = yf.Tickers(tickers_str)
        for symbol in TICKERS:
            try:
                ticker = data.tickers[symbol]
                # fast_info carries the latest quote without a full history fetch.
                info = ticker.fast_info
                last_price = info.last_price
                previous_close = info.previous_close
                if last_price is None:
                    logger.warning(f"No price found for {symbol}")
                    continue
                # Guard against a missing previous close: report zero change
                # rather than raising TypeError on the subtraction (which
                # previously dropped the symbol entirely).
                if previous_close is None:
                    change = 0
                    percent_change = 0
                else:
                    change = last_price - previous_close
                    percent_change = (change / previous_close) * 100 if previous_close else 0
                results.append({
                    "symbol": symbol,
                    # Route floats through str() so Decimal captures the printed
                    # value instead of the binary-float artifacts.
                    "price": Decimal(str(last_price)),
                    "currency": info.currency,
                    "change": Decimal(str(change)),
                    "percent_change": Decimal(str(percent_change)),
                    # NOTE(review): naive UTC timestamp; datetime.utcnow() is
                    # deprecated since Python 3.12 — switch to
                    # datetime.now(timezone.utc) once the DB column is tz-aware.
                    "timestamp": datetime.utcnow()
                })
                logger.info(f"Fetched {symbol}: {last_price} ({percent_change:.2f}%)")
            except Exception as e:
                # Per-symbol failures are logged and skipped so one bad symbol
                # does not sink the whole batch.
                logger.error(f"Error processing {symbol}: {e}")
    except Exception as e:
        logger.error(f"Error fetching ticker data: {e}")
    return results

16
app/schemas.py Normal file
View File

@@ -0,0 +1,16 @@
from typing import Optional
from pydantic import BaseModel
class HealthResponse(BaseModel):
    """Health check response schema."""
    # Overall service status string — exact values are set by the /health
    # handler (not visible here); presumably "ok"/"error" — confirm.
    status: str
    # True when the database connectivity check succeeded.
    db_connected: bool
class PriceRecord(BaseModel):
    """Single price record schema."""
    # Name of the oil company as scraped from the listing page.
    company_name: str
    # Town is optional — it may not be parseable from every scraped row.
    town: Optional[str] = None
    # Price per gallon in dollars. NOTE(review): the DB column is
    # DECIMAL(6,3); float here loses exactness at the API boundary.
    price_decimal: float
    # Date of the scrape, serialized as a string (not datetime.date).
    scrape_date: str
    # Geographic zone identifier, e.g. "zone10".
    zone: str

1
cronjob.txt Normal file
View File

@@ -0,0 +1 @@
# Every 5 minutes: poke the scraper endpoint. Cron runs commands with /bin/sh,
# where bash's "&>" is NOT a redirect (it backgrounds the command and then
# redirects an empty command) — use POSIX ">/dev/null 2>&1" instead.
*/5 * * * * /usr/bin/curl -m 120 -s http://localhost:9519/scraper/newenglandoil/latestprice >/dev/null 2>&1

26
debug_scraper.py Normal file
View File

@@ -0,0 +1,26 @@
"""Debug helper: fetch the NewEnglandOil zone10 page, save it locally,
and report how many <table> elements it contains."""
import requests
from bs4 import BeautifulSoup

PAGE_URL = "https://www.newenglandoil.com/massachusetts/zone10.asp?x=0"
# Browser-style UA string, since the site may reject obvious bots.
BROWSER_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

try:
    resp = requests.get(PAGE_URL, headers=BROWSER_HEADERS, timeout=15)
    print(f"Status Code: {resp.status_code}")
    # Keep a copy of the raw HTML for offline inspection.
    with open("debug_page.html", "w", encoding="utf-8") as fh:
        fh.write(resp.text)
    parsed = BeautifulSoup(resp.content, 'lxml')
    found_tables = parsed.find_all('table')
    print(f"Tables found: {len(found_tables)}")
    if not found_tables:
        print("Preview of content:")
        print(resp.text[:500])
except Exception as exc:
    print(f"Error: {exc}")

25
debug_scraper_ua.py Normal file
View File

@@ -0,0 +1,25 @@
"""Debug helper: probe the NewEnglandOil zone10 page using the service's
own User-Agent (from config.py) to see whether that UA gets the table."""
import requests
from bs4 import BeautifulSoup

PAGE_URL = "https://www.newenglandoil.com/massachusetts/zone10.asp?x=0"
# Use the UA from config.py
SERVICE_HEADERS = {
    "User-Agent": "Unraid-EamcoScraper/1.0 (eeames214@gmail.com)"
}

try:
    print(f"Testing with UA: {SERVICE_HEADERS['User-Agent']}")
    resp = requests.get(PAGE_URL, headers=SERVICE_HEADERS, timeout=15)
    print(f"Status Code: {resp.status_code}")
    parsed = BeautifulSoup(resp.content, 'lxml')
    found_tables = parsed.find_all('table')
    print(f"Tables found: {len(found_tables)}")
    if not found_tables:
        print("No tables found. Dumping start of response:")
        print(resp.text[:500])
except Exception as exc:
    print(f"Error: {exc}")

29
migration.sql Normal file
View File

@@ -0,0 +1,29 @@
-- Database migration for eamco_scraper
-- Creates company_prices table for storing historical oil price data
-- Create company_prices table
CREATE TABLE IF NOT EXISTS company_prices (
id SERIAL PRIMARY KEY,
company_name VARCHAR(255) NOT NULL,
town VARCHAR(100),
price_decimal DECIMAL(6,3) NOT NULL,
scrape_date DATE NOT NULL,
zone VARCHAR(50) NOT NULL DEFAULT 'zone10',
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);
-- Create indexes for efficient querying
CREATE INDEX IF NOT EXISTS idx_company_prices_company ON company_prices(company_name);
CREATE INDEX IF NOT EXISTS idx_company_prices_scrape_date ON company_prices(scrape_date);
CREATE INDEX IF NOT EXISTS idx_company_prices_zone ON company_prices(zone);
CREATE INDEX IF NOT EXISTS idx_company_prices_company_date ON company_prices(company_name, scrape_date);
CREATE INDEX IF NOT EXISTS idx_company_prices_zone_date ON company_prices(zone, scrape_date);
-- Add comment to table
COMMENT ON TABLE company_prices IS 'Historical oil price data scraped from New England Oil website';
COMMENT ON COLUMN company_prices.company_name IS 'Name of the oil company';
COMMENT ON COLUMN company_prices.town IS 'Town where the company operates';
COMMENT ON COLUMN company_prices.price_decimal IS 'Price per gallon in dollars';
COMMENT ON COLUMN company_prices.scrape_date IS 'Date when the price was listed on the website';
COMMENT ON COLUMN company_prices.zone IS 'Geographic zone (e.g., zone10 for Central Massachusetts)';
COMMENT ON COLUMN company_prices.created_at IS 'Timestamp when the record was inserted into database';

18
requirements.txt Normal file
View File

@@ -0,0 +1,18 @@
# eamco_scraper dependencies
# FastAPI web framework and server
fastapi>=0.109.0,<1.0.0
uvicorn[standard]>=0.27.0,<1.0.0
pydantic>=2.5.0,<3.0.0
# Database
sqlalchemy>=2.0.0,<3.0.0
psycopg2-binary>=2.9.9,<3.0.0
# Web scraping
beautifulsoup4>=4.12.0,<5.0.0
requests>=2.31.0,<3.0.0
lxml>=5.0.0,<6.0.0
# Configuration
python-dotenv>=1.0.0,<2.0.0
# Market data (commodity ticker quotes)
yfinance>=0.2.36

75
test.sh Executable file
View File

@@ -0,0 +1,75 @@
#!/bin/bash
# Test script for eamco_scraper service
echo "=== eamco_scraper Test Script ==="
echo ""
# Check if service directory exists
if [ ! -d "/mnt/code/oil/eamco/eamco_scraper" ]; then
echo "❌ Service directory not found"
exit 1
fi
echo "✅ Service directory exists"
echo ""
# Test 1: Build Docker image
echo "Test 1: Building Docker image..."
cd /mnt/code/oil/eamco/eamco_deploy
docker-compose -f docker-compose.local.yml build scraper_local
if [ $? -eq 0 ]; then
echo "✅ Docker build successful"
else
echo "❌ Docker build failed"
exit 1
fi
echo ""
# Test 2: Start the service
echo "Test 2: Starting service..."
docker-compose -f docker-compose.local.yml up -d scraper_local
if [ $? -eq 0 ]; then
echo "✅ Service started"
else
echo "❌ Service failed to start"
exit 1
fi
echo ""
echo "Waiting 5 seconds for service to initialize..."
sleep 5
# Test 3: Health check
echo "Test 3: Testing health endpoint..."
HEALTH_RESPONSE=$(curl -s http://localhost:9619/health)
if echo "$HEALTH_RESPONSE" | grep -q "status"; then
echo "✅ Health endpoint responding"
echo "Response: $HEALTH_RESPONSE"
else
echo "❌ Health endpoint not responding"
fi
echo ""
# Test 4: Scraper endpoint
echo "Test 4: Testing scraper endpoint..."
echo "This will scrape live data from New England Oil..."
SCRAPER_RESPONSE=$(curl -s http://localhost:9619/scraper/newenglandoil/latestprice)
if echo "$SCRAPER_RESPONSE" | grep -q "status"; then
echo "✅ Scraper endpoint responding"
echo "Response preview:"
echo "$SCRAPER_RESPONSE" | head -n 20
else
echo "❌ Scraper endpoint not responding"
fi
echo ""
echo "=== Test Complete ==="
echo ""
echo "To view logs: docker-compose -f docker-compose.local.yml logs scraper_local"
echo "To stop service: docker-compose -f docker-compose.local.yml down scraper_local"