commit af9c2f99e7513b7f0f38fb7bf621f7550be8bbfe Author: Edwin Eames Date: Sun Feb 8 17:57:44 2026 -0500 feat: initial commit for oil price scraper service FastAPI-based scraper for commodity ticker prices (HO, CL, RB futures) and competitor oil pricing from NewEnglandOil. Includes cron-driven scraping, PostgreSQL storage, and REST endpoints for price retrieval. Co-Authored-By: Claude Opus 4.6 diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..357ae98 --- /dev/null +++ b/.env.example @@ -0,0 +1,23 @@ +# Environment variables for eamco_scraper - PRODUCTION +# Copy this to .env.prod and fill in actual values + +# Application mode +MODE=PRODUCTION +CURRENT_SETTINGS=PRODUCTION + +# Database configuration +POSTGRES_USERNAME=postgres +POSTGRES_PW=your_password_here +POSTGRES_SERVER=192.168.1.204 +POSTGRES_PORT=5432 +POSTGRES_DBNAME=auburnoil + +# Logging +LOG_LEVEL=INFO + +# Scraper configuration +SCRAPER_DELAY=2.0 +SCRAPER_TIMEOUT=10 + +# CORS (optional - comma-separated) +# CORS_ORIGINS=https://oil.edwineames.com,https://edwineames.com diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..70c12d9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,132 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having sub-dependencies with platform-specific binaries, it is better to ignore the Pipfile.lock. +# Pipfile.lock + +# PEP 582; __pypackages__ +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyderworkspace + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Environments +.env.local +.env.prod \ No newline at end of file diff --git a/Dockerfile.dev b/Dockerfile.dev new file mode 100644 index 0000000..9fa7537 --- /dev/null +++ b/Dockerfile.dev @@ -0,0 +1,18 @@ +# eamco_scraper - DEVELOPMENT Dockerfile +# Minimal development setup + +FROM python:3.11-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libpq-dev gcc && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . 
+ +EXPOSE 8000 + +CMD ["uvicorn", "app.main:app", "--reload", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Dockerfile.local b/Dockerfile.local new file mode 100644 index 0000000..f9d43e3 --- /dev/null +++ b/Dockerfile.local @@ -0,0 +1,37 @@ +# eamco_scraper - LOCAL Dockerfile +# Used by docker-compose.local.yml +# Features: Volume mounts for development, hot reload + +FROM python:3.11-slim + +# Set environment variables +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONPATH=/app + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + libpq-dev \ + gcc \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy environment file for local +COPY .env.local .env + +# Copy application code (will be overridden by volume mount) +COPY app/ ./app/ + +# Expose port +EXPOSE 8000 + +# Development: Run with reload +CMD ["uvicorn", "app.main:app", "--reload", "--host", "0.0.0.0", "--port", "8000"] diff --git a/Dockerfile.prod b/Dockerfile.prod new file mode 100644 index 0000000..7b63a64 --- /dev/null +++ b/Dockerfile.prod @@ -0,0 +1,49 @@ +# eamco_scraper - PRODUCTION Dockerfile +# Used by docker-compose.prod.yml +# Features: Optimized for production, non-root user, health checks + +FROM python:3.11-slim + +# Set environment variables +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONPATH=/app \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +# Set working directory +WORKDIR /app + +# Install system dependencies (psycopg2 requirements) +RUN apt-get update && apt-get install -y --no-install-recommends \ + libpq-dev \ + gcc \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Copy requirements first (for better layer caching) +COPY requirements.txt . 
+ +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy environment file for production +COPY .env.prod .env + +# Copy application code +COPY app/ ./app/ + +# Create non-root user for security +RUN useradd --create-home --shell /bin/bash appuser && \ + chown -R appuser:appuser /app +USER appuser + +# Expose port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1 + +# Production: Run without reload, with workers +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..481bcfa --- /dev/null +++ b/README.md @@ -0,0 +1,155 @@ +# eamco_scraper + +FastAPI microservice for scraping heating oil prices from New England Oil and storing historical pricing data. + +## Overview + +This service scrapes oil company pricing data from the New England Oil website (Zone 10 - Central Massachusetts) and stores it in a PostgreSQL database for historical tracking and trend analysis. + +## Features + +- **Web Scraping**: Automated scraping of oil prices using BeautifulSoup4 +- **Historical Tracking**: Stores all price records (no updates, only inserts) for trend analysis +- **Cron-Friendly**: Single GET request triggers scrape and storage +- **Health Checks**: Built-in health endpoint for monitoring +- **Docker Ready**: Production and development Docker configurations + +## API Endpoints + +### `GET /health` +Health check endpoint with database connectivity status. + +**Response:** +```json +{ + "status": "healthy", + "db_connected": true +} +``` + +### `GET /scraper/newenglandoil/latestprice` +Trigger scrape of New England Oil Zone 10 prices, store in database, and return results. 
+ +**Response:** +```json +{ + "status": "success", + "message": "Successfully scraped and stored 30 prices", + "prices_scraped": 30, + "prices_stored": 30, + "scrape_timestamp": "2026-02-07T22:00:00", + "prices": [ + { + "company_name": "AUBURN OIL", + "town": "Auburn", + "price_decimal": 2.599, + "scrape_date": "2026-02-07", + "zone": "zone10" + } + ] +} +``` + +## Database Schema + +### `company_prices` Table + +| Column | Type | Description | +|--------|------|-------------| +| id | SERIAL | Primary key | +| company_name | VARCHAR(255) | Oil company name | +| town | VARCHAR(100) | Town/city | +| price_decimal | DECIMAL(6,3) | Price per gallon | +| scrape_date | DATE | Date price was listed | +| zone | VARCHAR(50) | Geographic zone (default: zone10) | +| created_at | TIMESTAMP | Record creation timestamp | + +**Indexes:** +- `idx_company_prices_company` on `company_name` +- `idx_company_prices_scrape_date` on `scrape_date` +- `idx_company_prices_zone` on `zone` +- `idx_company_prices_company_date` on `(company_name, scrape_date)` +- `idx_company_prices_zone_date` on `(zone, scrape_date)` + +## Development + +### Local Setup + +```bash +# Create virtual environment +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# Install dependencies +pip install -r requirements.txt + +# Copy environment file +cp .env.example .env.local + +# Edit .env.local with your database credentials + +# Run the application +uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 +``` + +### Docker Local + +```bash +cd /mnt/code/oil/eamco/eamco_deploy +docker-compose -f docker-compose.local.yml up scraper_local +``` + +Access at: http://localhost:9619 + +## Production + +### Docker Production + +```bash +cd /mnt/code/oil/eamco/eamco_deploy +docker-compose -f docker-compose.prod.yml up -d scraper_prod +``` + +Access at: http://192.168.1.204:9519 + +## Cron Integration + +Add to Unraid cron or system crontab: + +```bash +# Scrape prices daily at 6 AM +0 
6 * * * curl -s http://192.168.1.204:9619/scraper/newenglandoil/latestprice > /dev/null 2>&1 +``` + +## Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| MODE | Application mode (LOCAL/PRODUCTION) | LOCAL | +| POSTGRES_USERNAME | Database username | postgres | +| POSTGRES_PW | Database password | password | +| POSTGRES_SERVER | Database server | 192.168.1.204 | +| POSTGRES_PORT | Database port | 5432 | +| POSTGRES_DBNAME | Database name | eamco | +| LOG_LEVEL | Logging level | INFO | +| SCRAPER_DELAY | Delay between requests (seconds) | 2.0 | +| SCRAPER_TIMEOUT | Request timeout (seconds) | 10 | + +## Architecture + +- **Framework**: FastAPI 0.109+ +- **Database**: PostgreSQL 15+ with SQLAlchemy 2.0 +- **Scraping**: BeautifulSoup4 + lxml + requests +- **Server**: Uvicorn with 2 workers (production) + +## Ports + +- **Local Development**: 9619 +- **Production**: 9519 + +## Future Enhancements + +- Frontend display on Home page (table or cards) +- Price change alerts/notifications +- Support for additional zones +- Price trend graphs and analytics diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..635cbfa --- /dev/null +++ b/app/__init__.py @@ -0,0 +1 @@ +# eamco_scraper app package diff --git a/app/config.py b/app/config.py new file mode 100644 index 0000000..a454a8f --- /dev/null +++ b/app/config.py @@ -0,0 +1,111 @@ +""" +Configuration settings for eamco_scraper. + +This module provides configuration with environment-based switching: +- LOCAL: Uses 'eamco' database, localhost CORS origins +- PRODUCTION: Uses 'auburnoil' database, production domain CORS origins + +Environment variables are loaded from .env.local or .env.prod depending +on the Docker compose file used. 
+""" + +import logging +import os +from typing import List + +from dotenv import load_dotenv + +# Load environment variables from .env file if present +load_dotenv() + +# ============================================================================= +# ENVIRONMENT MODE +# ============================================================================= + +MODE = os.getenv("MODE", "LOCAL") +CURRENT_SETTINGS = os.getenv("CURRENT_SETTINGS", "DEVELOPMENT") + +# Log configuration mode (logger setup happens after config is loaded) +_config_mode_msg = f"Using {'PRODUCTION' if CURRENT_SETTINGS == 'PRODUCTION' else 'DEVELOPMENT'} configuration" + +# ============================================================================= +# DATABASE CONFIGURATION +# ============================================================================= + +# Database connection components (can be overridden individually) +POSTGRES_USERNAME = os.getenv("POSTGRES_USERNAME", "postgres") +POSTGRES_PW = os.getenv("POSTGRES_PW", "password") +POSTGRES_SERVER = os.getenv("POSTGRES_SERVER", "192.168.1.204") +POSTGRES_PORT = os.getenv("POSTGRES_PORT", "5432") + +# Database name differs by environment +if CURRENT_SETTINGS == "PRODUCTION": + POSTGRES_DBNAME = os.getenv("POSTGRES_DBNAME", "auburnoil") +else: + POSTGRES_DBNAME = os.getenv("POSTGRES_DBNAME", "eamco") + +# Build connection URI from components (fallback) +_DEFAULT_DATABASE_URI = "postgresql+psycopg2://{}:{}@{}:{}/{}".format( + POSTGRES_USERNAME, + POSTGRES_PW, + POSTGRES_SERVER, + POSTGRES_PORT, + POSTGRES_DBNAME +) + +# Allow full DATABASE_URL override +DATABASE_URL: str = os.getenv("DATABASE_URL", _DEFAULT_DATABASE_URI) + +# SQLAlchemy binds (for compatibility) +SQLALCHEMY_DATABASE_URI = DATABASE_URL +SQLALCHEMY_BINDS = {POSTGRES_DBNAME: SQLALCHEMY_DATABASE_URI} + +# ============================================================================= +# CORS CONFIGURATION +# ============================================================================= + +# 
Parse CORS origins from environment (comma-separated) or use defaults +_cors_env = os.getenv("CORS_ORIGINS", "") + +if _cors_env: + CORS_ORIGINS: List[str] = [origin.strip() for origin in _cors_env.split(",")] +elif CURRENT_SETTINGS == "PRODUCTION": + # Production CORS origins + CORS_ORIGINS = [ + "https://oil.edwineames.com", + "https://edwineames.com", + ] +else: + # Development CORS origins + CORS_ORIGINS = [ + "http://localhost:9000", + "https://localhost:9513", + "http://localhost:9514", + "http://localhost:9512", + "http://localhost:9511", + "http://localhost:5173", # Frontend port + "http://localhost:9616", # Authorize service port + ] + +# ============================================================================= +# SCRAPER CONFIGURATION +# ============================================================================= + +# User agent for web scraping (identifies your application) +SCRAPER_USER_AGENT: str = "Unraid-EamcoScraper/1.0 (eeames214@gmail.com)" + +# Rate limiting: Sleep between requests (be respectful to target servers) +SCRAPER_DELAY_SECONDS: float = float(os.getenv("SCRAPER_DELAY", "2.0")) + +# Request timeout in seconds +SCRAPER_TIMEOUT: int = int(os.getenv("SCRAPER_TIMEOUT", "10")) + +# Target URL for New England Oil Zone 10 +NEWENGLAND_OIL_ZONE10_URL: str = "https://www.newenglandoil.com/massachusetts/zone10.asp?x=0" + +# ============================================================================= +# LOGGING CONFIGURATION +# ============================================================================= + +LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO") +LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/app/database.py b/app/database.py new file mode 100644 index 0000000..7fab21f --- /dev/null +++ b/app/database.py @@ -0,0 +1,88 @@ +import logging +import sys +from contextlib import contextmanager +from typing import Generator + +from sqlalchemy import create_engine, text +from sqlalchemy.orm import 
sessionmaker, Session +from sqlalchemy.exc import SQLAlchemyError + +from app.config import ( + DATABASE_URL, + LOG_LEVEL, + LOG_FORMAT, +) + +# ============================================================================= +# LOGGING CONFIGURATION +# ============================================================================= + +logging.basicConfig( + level=getattr(logging, LOG_LEVEL.upper(), logging.INFO), + format=LOG_FORMAT, + handlers=[ + logging.StreamHandler(sys.stdout), + ] +) +logger = logging.getLogger(__name__) + +# ============================================================================= +# DATABASE SETUP +# ============================================================================= + +# Create SQLAlchemy engine with connection pooling +engine = create_engine( + DATABASE_URL, + pool_pre_ping=True, # Verify connections before use + pool_size=5, + max_overflow=10, + echo=False, # Set to True for SQL debugging +) + +# Session factory +SessionLocal = sessionmaker( + autocommit=False, + autoflush=False, + bind=engine, +) + + +def get_db() -> Generator[Session, None, None]: + """ + Dependency that provides a database session. + + Yields a SQLAlchemy session and ensures proper cleanup. + """ + db = SessionLocal() + try: + yield db + finally: + db.close() + + +@contextmanager +def get_db_session() -> Generator[Session, None, None]: + """ + Context manager for database sessions (non-dependency use). + """ + db = SessionLocal() + try: + yield db + finally: + db.close() + + +def check_db_connection() -> bool: + """ + Test database connectivity. 
+ + Returns: + True if database is reachable, False otherwise + """ + try: + with get_db_session() as db: + db.execute(text("SELECT 1")) + return True + except SQLAlchemyError as e: + logger.error(f"Database connection failed: {e}") + return False diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..6bed657 --- /dev/null +++ b/app/main.py @@ -0,0 +1,195 @@ +""" +eamco_scraper - FastAPI Oil Price Scraping Microservice. + +This microservice provides endpoints for scraping oil prices from New England Oil +and storing them in the database for historical tracking. + +Endpoints: + GET /health - Health check with database connectivity status + GET /scraper/newenglandoil/latestprice - Trigger scrape and return latest prices + +Usage: + # Development + uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 + + # Production (Docker) + docker run -p 8000:8000 eamco_scraper + + # Trigger from cron + curl http://localhost:8000/scraper/newenglandoil/latestprice +""" + +import logging +import os +import sys +from typing import List +from datetime import datetime + +from fastapi import FastAPI, Depends, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from sqlalchemy.orm import Session + +from app.config import ( + DATABASE_URL, + CORS_ORIGINS, + LOG_LEVEL, + LOG_FORMAT, +) +from app.models import Base, CompanyPrice +from app.database import engine, get_db, check_db_connection +from app.schemas import HealthResponse, PriceRecord +from app.newenglandoil.router import router as newenglandoil_router +from app.priceticker.router import router as priceticker_router + + +# ============================================================================= +# LOGGING CONFIGURATION +# ============================================================================= + +logging.basicConfig( + level=getattr(logging, LOG_LEVEL.upper(), logging.INFO), + format=LOG_FORMAT, + handlers=[ + logging.StreamHandler(sys.stdout), + ] +) +logger = 
logging.getLogger(__name__) + +# ============================================================================= +# FASTAPI APPLICATION +# ============================================================================= + +app = FastAPI( + title="eamco_scraper", + description="Oil price scraping microservice for New England Oil", + version="1.0.0", + docs_url="/docs", + redoc_url="/redoc", +) + +# ============================================================================= +# CORS MIDDLEWARE +# ============================================================================= + +app.add_middleware( + CORSMiddleware, + allow_origins=CORS_ORIGINS, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# ============================================================================= +# ROUTERS +# ============================================================================= + +app.include_router(newenglandoil_router) +app.include_router(priceticker_router) + +# ============================================================================= +# ENDPOINTS +# ============================================================================= + + +@app.get("/", include_in_schema=False) +async def root(): + """Root endpoint - redirect to docs.""" + return { + "service": "eamco_scraper", + "version": "1.0.0", + "docs": "/docs", + } + + +@app.get("/health", response_model=HealthResponse, tags=["Health"]) +async def health_check(): + """ + Health check endpoint. + + Returns service status and database connectivity. + Use this endpoint for container health checks and monitoring. 
+ + Returns: + HealthResponse with status and db_connected flag + """ + db_connected = check_db_connection() + + return HealthResponse( + status="healthy" if db_connected else "degraded", + db_connected=db_connected, + ) + + +@app.get( + "/scraper/prices", + response_model=List[PriceRecord], + tags=["Prices"], +) +async def get_stored_prices( + date: str | None = None, + db: Session = Depends(get_db) +): + """ + Get stored oil prices from the database. + + If no date is provided, returns prices for the current date (UTC). + Does NOT trigger a scrape. + """ + if not date: + date = datetime.utcnow().date().isoformat() + + try: + # Query prices for the specific date + prices = db.query(CompanyPrice).filter( + CompanyPrice.scrape_date == date + ).all() + + return [ + PriceRecord( + company_name=p.company_name, + town=p.town, + price_decimal=float(p.price_decimal), + scrape_date=str(p.scrape_date), + zone=p.zone + ) for p in prices + ] + except Exception as e: + logger.error(f"Error fetching prices: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +# ============================================================================= +# STARTUP/SHUTDOWN EVENTS +# ============================================================================= + + +@app.on_event("startup") +async def startup_event(): + """Application startup - log configuration and test DB connection.""" + logger.info("🚀 eamco_scraper STARTING") + mode = os.environ.get('MODE', 'DEVELOPMENT').upper() + if mode in ['DEVELOPMENT', 'DEV', 'LOCAL']: + logger.info("🤖🤖🤖🤖🤖 Mode: Development 🤖🤖🤖🤖🤖") + elif mode in ['PRODUCTION', 'PROD']: + logger.info("💀💀💀💀💀💀💀💀💀💀 ⚠️ WARNING PRODUCTION 💀💀💀💀💀💀💀💀💀💀") + logger.info(f"DB: {DATABASE_URL[:30]}...") + logger.info(f"CORS: {len(CORS_ORIGINS)} origins configured") + + # Test database connection + if check_db_connection(): + logger.info("DB Connection: ✅ OK") + else: + logger.warning("DB Connection: ❌ FAILED") + + # Create tables if they don't exist + try: + 
Base.metadata.create_all(bind=engine) + logger.info("Database tables verified/created") + except Exception as e: + logger.error(f"Failed to create tables: {e}") + + +@app.on_event("shutdown") +async def shutdown_event(): + """Application shutdown - cleanup.""" + logger.info("🛑 eamco_scraper SHUTTING DOWN") diff --git a/app/models.py b/app/models.py new file mode 100644 index 0000000..de99e6c --- /dev/null +++ b/app/models.py @@ -0,0 +1,99 @@ +""" +SQLAlchemy models for eamco_scraper. + +This module defines the database models for storing scraped oil price data. +""" + +from datetime import datetime +from sqlalchemy import Column, Integer, String, Numeric, Date, DateTime, Index +from sqlalchemy.ext.declarative import declarative_base + +Base = declarative_base() + + +class CompanyPrice(Base): + """ + Model for storing oil company pricing data. + + This table stores historical pricing data from oil companies. + Each scrape creates new records (no updates) to enable price trend analysis. + + Attributes: + id: Primary key + company_name: Name of the oil company + town: Town where the company operates + price_decimal: Price per gallon (e.g., 2.599) + scrape_date: Date when the price was listed on the website + zone: Geographic zone (default: 'zone10' for Central Massachusetts) + created_at: Timestamp when the record was inserted into database + """ + + __tablename__ = "company_prices" + + id = Column(Integer, primary_key=True, autoincrement=True) + company_name = Column(String(255), nullable=False, index=True) + town = Column(String(100), nullable=True) + price_decimal = Column(Numeric(6, 3), nullable=False) + scrape_date = Column(Date, nullable=False, index=True) + zone = Column(String(50), nullable=False, default="zone10", index=True) + created_at = Column(DateTime, nullable=False, default=datetime.utcnow) + + def __repr__(self): + return f"" + + def to_dict(self): + """Convert model instance to dictionary for JSON serialization.""" + return { + "id": self.id, + 
"company_name": self.company_name, + "town": self.town, + "price_decimal": float(self.price_decimal) if self.price_decimal else None, + "scrape_date": self.scrape_date.isoformat() if self.scrape_date else None, + "zone": self.zone, + "created_at": self.created_at.isoformat() if self.created_at else None, + } + + +# Create composite indexes for common queries +Index('idx_company_prices_company_date', CompanyPrice.company_name, CompanyPrice.scrape_date) +Index('idx_company_prices_zone_date', CompanyPrice.zone, CompanyPrice.scrape_date) + + +class TickerPrice(Base): + """ + Model for storing ticker prices (Stocks/Commodities). + + Attributes: + id: Primary key + symbol: Ticker symbol (e.g., HO=F, CL=F, RB=F) + price_decimal: Current price + currency: Currency code (e.g., USD) + change_decimal: Price change amount + percent_change_decimal: Price change percentage + timestamp: Time of scrape + """ + + __tablename__ = "ticker_prices" + + id = Column(Integer, primary_key=True, autoincrement=True) + symbol = Column(String(20), nullable=False, index=True) + price_decimal = Column(Numeric(10, 4), nullable=False) + currency = Column(String(10), nullable=True) + change_decimal = Column(Numeric(10, 4), nullable=True) + percent_change_decimal = Column(Numeric(10, 4), nullable=True) + timestamp = Column(DateTime, nullable=False, default=datetime.utcnow, index=True) + + def __repr__(self): + return f"" + + def to_dict(self): + return { + "id": self.id, + "symbol": self.symbol, + "price": float(self.price_decimal) if self.price_decimal is not None else None, + "currency": self.currency, + "change": float(self.change_decimal) if self.change_decimal is not None else None, + "percent_change": float(self.percent_change_decimal) if self.percent_change_decimal is not None else None, + "timestamp": self.timestamp.isoformat() if self.timestamp else None, + } + diff --git a/app/newenglandoil/__init__.py b/app/newenglandoil/__init__.py new file mode 100644 index 0000000..8c51914 --- 
/dev/null +++ b/app/newenglandoil/__init__.py @@ -0,0 +1,9 @@ +""" +New England Oil Scraper Module. + +This package contains code specific to scraping prices from the New England Oil website. +""" + +from app.newenglandoil.scraper import scrape_newengland_oil, ScraperError + +__all__ = ['scrape_newengland_oil', 'ScraperError'] diff --git a/app/newenglandoil/router.py b/app/newenglandoil/router.py new file mode 100644 index 0000000..edb9873 --- /dev/null +++ b/app/newenglandoil/router.py @@ -0,0 +1,140 @@ +import logging +from datetime import datetime +from typing import List + +from fastapi import APIRouter, Depends, HTTPException +from sqlalchemy.orm import Session +from sqlalchemy.exc import SQLAlchemyError + +from app.models import CompanyPrice +from app.schemas import PriceRecord +from app.newenglandoil.schemas import LatestPriceResponse +from app.newenglandoil.scraper import scrape_newengland_oil, ScraperError +from app.database import get_db + +logger = logging.getLogger(__name__) + +router = APIRouter( + prefix="/scraper/newenglandoil", + tags=["New England Oil Scraper"], +) + +@router.get( + "/latestprice", + response_model=LatestPriceResponse, +) +async def get_latest_prices(db: Session = Depends(get_db)): + """ + Scrape latest oil prices from New England Oil Zone 10. + + This endpoint: + 1. Scrapes the New England Oil website for current prices + 2. Stores all prices in the database (historical tracking) + 3. Returns the scraped prices + + Designed to be called from cron for automated price tracking. 
+ + Example: + curl http://localhost:8000/scraper/newenglandoil/latestprice + + Returns: + LatestPriceResponse with scraped prices and storage statistics + """ + logger.info("=" * 60) + logger.info("SCRAPER ENDPOINT CALLED - New England Oil Zone 10") + logger.info("=" * 60) + + try: + # Scrape the website + scraped_prices = scrape_newengland_oil() + + if not scraped_prices: + logger.warning("No prices were scraped") + return LatestPriceResponse( + status="warning", + message="No prices found on website", + prices_scraped=0, + prices_stored=0, + scrape_timestamp=datetime.utcnow().isoformat(), + prices=[] + ) + + # Store prices in database + stored_count = 0 + price_records = [] + + for price_data in scraped_prices: + # Add to response list (regardless of whether it's new or existing) + price_records.append(PriceRecord( + company_name=price_data["company_name"], + town=price_data.get("town"), + price_decimal=float(price_data["price_decimal"]), + scrape_date=price_data["scrape_date"].isoformat(), + zone=price_data.get("zone", "zone10"), + )) + + try: + # Check for existing record to prevent duplicates + existing_record = db.query(CompanyPrice).filter( + CompanyPrice.company_name == price_data["company_name"], + CompanyPrice.scrape_date == price_data["scrape_date"], + CompanyPrice.zone == price_data.get("zone", "zone10") + ).first() + + if existing_record: + logger.debug(f"Skipping duplicate record for {price_data['company_name']} on {price_data['scrape_date']}") + continue + + # Create database record + price_record = CompanyPrice( + company_name=price_data["company_name"], + town=price_data.get("town"), + price_decimal=price_data["price_decimal"], + scrape_date=price_data["scrape_date"], + zone=price_data.get("zone", "zone10"), + ) + + db.add(price_record) + stored_count += 1 + + except Exception as e: + logger.error(f"Failed to store price for {price_data.get('company_name')}: {e}") + + # Commit all new records + if stored_count > 0: + db.commit() + 
logger.info(f"Successfully stored {stored_count} new price records") + else: + logger.info("No new price records to store (all duplicates)") + + return LatestPriceResponse( + status="success", + message=f"Successfully scraped {len(scraped_prices)} prices, stored {stored_count} new records", + prices_scraped=len(scraped_prices), + prices_stored=stored_count, + scrape_timestamp=datetime.utcnow().isoformat(), + prices=price_records, + ) + + except ScraperError as e: + logger.error(f"Scraper error: {e}") + raise HTTPException( + status_code=500, + detail=f"Scraping failed: {str(e)}" + ) + + except SQLAlchemyError as e: + db.rollback() + logger.error(f"Database error during price storage: {e}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"Database error: {str(e)}" + ) + + except Exception as e: + db.rollback() + logger.error(f"Unexpected error during scraping: {e}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"Scraping failed: {str(e)}" + ) diff --git a/app/newenglandoil/schemas.py b/app/newenglandoil/schemas.py new file mode 100644 index 0000000..cd31785 --- /dev/null +++ b/app/newenglandoil/schemas.py @@ -0,0 +1,12 @@ +from typing import List +from pydantic import BaseModel +from app.schemas import PriceRecord + +class LatestPriceResponse(BaseModel): + """Latest price scrape response schema.""" + status: str + message: str + prices_scraped: int + prices_stored: int + scrape_timestamp: str + prices: List[PriceRecord] diff --git a/app/newenglandoil/scraper.py b/app/newenglandoil/scraper.py new file mode 100644 index 0000000..d028105 --- /dev/null +++ b/app/newenglandoil/scraper.py @@ -0,0 +1,170 @@ +""" +Web scraping module for New England Oil prices. + +This module handles scraping oil price data from the New England Oil website +for Zone 10 (Central Massachusetts). 
+"""
+
+import logging
+import time
+from datetime import date
+from typing import List, Dict, Optional, Any
+from decimal import Decimal
+
+import requests
+from bs4 import BeautifulSoup
+
+from app.config import (
+    NEWENGLAND_OIL_ZONE10_URL,
+    SCRAPER_USER_AGENT,
+    SCRAPER_TIMEOUT,
+    SCRAPER_DELAY_SECONDS,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class ScraperError(Exception):
+    """Custom exception for scraper errors."""
+    pass
+
+
+def scrape_newengland_oil() -> List[Dict[str, Any]]:
+    """
+    Scrape oil prices from New England Oil Zone 10 page.
+
+    Fetches the page, parses the HTML table, and extracts company names,
+    towns, and prices.
+
+    Returns:
+        List of dictionaries with keys: company_name, town, price_decimal, scrape_date, zone
+
+    Raises:
+        ScraperError: If the request fails or parsing fails
+    """
+    logger.info(f"Starting scrape of {NEWENGLAND_OIL_ZONE10_URL}")
+
+    # Browser-like headers; the configured User-Agent comes from app.config.
+    headers = {
+        "User-Agent": SCRAPER_USER_AGENT,
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Accept-Encoding": "gzip, deflate",
+        "Connection": "keep-alive",
+    }
+
+    try:
+        # Make the request
+        response = requests.get(
+            NEWENGLAND_OIL_ZONE10_URL,
+            headers=headers,
+            timeout=SCRAPER_TIMEOUT
+        )
+        response.raise_for_status()
+
+        logger.info(f"Successfully fetched page (status: {response.status_code})")
+
+        # Parse HTML
+        soup = BeautifulSoup(response.content, 'lxml')
+
+        # Find the price table
+        # The table typically has company names, towns, and prices
+        # We need to inspect the actual HTML structure
+        prices = []
+        today = date.today()
+
+        # Look for table rows with price data
+        # The structure appears to be: company links followed by town and price info
+        # We'll look for patterns in the HTML
+
+        # Find all table rows
+        tables = soup.find_all('table')
+
+        if not tables:
+            logger.warning("No tables found on page")
+            # Debug: Save HTML to file
+            with open("debug_page.html", "wb") as f:
+                f.write(response.content)
+
+            raise ScraperError("No price table found on page")
+
+        # The main price table is usually the largest one or contains specific markers
+        # Let's find rows that contain price information
+        for table in tables:
+            rows = table.find_all('tr')
+
+            for row in rows:
+                cells = row.find_all(['td', 'th'])
+
+                if len(cells) >= 3:  # Expect at least company, town, price
+                    # Try to extract company name (usually in a link)
+                    company_link = row.find('a')
+                    if company_link:
+                        company_name = company_link.get_text(strip=True)
+
+                        # Extract text from all cells
+                        cell_texts = [cell.get_text(strip=True) for cell in cells]
+
+                        # Look for price pattern (e.g., "$2.599" or "2.599")
+                        price_value = None
+                        town_value = None
+
+                        for text in cell_texts:
+                            # Check if this looks like a price
+                            text_clean = text.replace('$', '').replace(',', '').strip()
+                            town_candidate = False
+                            try:
+                                # Try to parse as decimal
+                                if text_clean and '.' in text_clean:
+                                    potential_price = Decimal(text_clean)
+                                    # Reasonable price range for heating oil (0.50 to 10.00)
+                                    if Decimal('0.50') <= potential_price <= Decimal('10.00'):
+                                        price_value = potential_price
+                                        break
+                                else:
+                                    # BUG FIX: cells without a '.' (e.g. plain town names) never
+                                    # raised in Decimal(), so towns were silently dropped before.
+                                    town_candidate = True
+                            except (ValueError, ArithmeticError):
+                                # Not a valid price, might be town name
+                                town_candidate = True
+
+                            # Take first non-price text as town
+                            if town_candidate and not town_value:
+                                if text and not text.startswith('$') and len(text) > 2:
+                                    town_value = text
+
+                        if price_value:
+                            prices.append({
+                                "company_name": company_name,
+                                "town": town_value,
+                                "price_decimal": price_value,
+                                "scrape_date": today,
+                                "zone": "zone10"
+                            })
+                            logger.debug(f"Found: {company_name} - {town_value} - ${price_value}")
+
+        if not prices:
+            logger.warning("No prices extracted from page")
+            raise ScraperError("Failed to extract any price data from page")
+
+        logger.info(f"Successfully scraped {len(prices)} price records")
+        return prices
+
+    except requests.RequestException as e:
+        logger.error(f"Request failed: {e}")
+        raise ScraperError(f"Failed to fetch page: {str(e)}")
+
+    except ScraperError:
+        # BUG FIX: ScraperErrors raised above were previously swallowed by the
+        # generic handler and re-wrapped as "Failed to parse page: ..." - re-raise as-is.
+        raise
+
+    except Exception as e:
+        logger.error(f"Scraping failed: {e}", exc_info=True)
+        raise ScraperError(f"Failed to parse page: {str(e)}")
+
+
+def scrape_and_delay() -> List[Dict[str, Any]]:
+    """
+    Scrape prices and apply rate limiting delay.
+
+    This is a convenience function that scrapes and then sleeps
+    to respect rate limits.
+
+    Returns:
+        List of price dictionaries
+    """
+    prices = scrape_newengland_oil()
+
+    # Apply rate limiting delay
+    if SCRAPER_DELAY_SECONDS > 0:
+        logger.debug(f"Sleeping {SCRAPER_DELAY_SECONDS}s for rate limiting")
+        time.sleep(SCRAPER_DELAY_SECONDS)
+
+    return prices
diff --git a/app/priceticker/__init__.py b/app/priceticker/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/priceticker/router.py b/app/priceticker/router.py
new file mode 100644
index 0000000..a6ccba5
--- /dev/null
+++ b/app/priceticker/router.py
@@ -0,0 +1,72 @@
+from fastapi import APIRouter, Depends, HTTPException
+from sqlalchemy.orm import Session
+from typing import List
+import logging
+
+from app.database import get_db
+from app.models import TickerPrice
+from app.priceticker.scraper import fetch_ticker_data
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(
+    prefix="/scraper/priceticker",
+    tags=["Price Ticker"]
+)
+
+@router.post("/update")
+async def update_prices(db: Session = Depends(get_db)):
+    """
+    Trigger an immediate update of stock/commodity prices.
+    """
+    logger.info("Triggering ticker update...")
+    data = fetch_ticker_data()  # synchronous network call inside async handler - blocks the event loop; consider run_in_threadpool
+
+    if not data:
+        raise HTTPException(status_code=500, detail="Failed to fetch ticker data")
+
+    try:
+        saved_records = []
+        for item in data:
+            record = TickerPrice(
+                symbol=item["symbol"],
+                price_decimal=item["price"],
+                currency=item["currency"],
+                change_decimal=item["change"],
+                percent_change_decimal=item["percent_change"],
+                timestamp=item["timestamp"]
+            )
+            db.add(record)
+            saved_records.append(record)
+
+        db.commit()  # single commit so a partial batch is never persisted
+        return {"status": "success", "updated": len(saved_records)}
+
+    except Exception as e:
+        db.rollback()
+        logger.error(f"Database error saving tickers: {e}")
+        raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
+
+@router.get("/latest")
+async def get_latest_prices(db: Session = Depends(get_db)):
+    """
+    Get the most recent price for each ticker symbol.
+    """
+    # Subquery to find the latest timestamp for each symbol
+    # This is a bit complex in pure ORM, so we might do it simply for now:
+    # 1. Get list of distinct symbols we care about
+    # 2. Query latest for each
+
+    results = []
+    # We know the symbols we care about: HO=F, CL=F, RB=F
+    TARGET_SYMBOLS = ["HO=F", "CL=F", "RB=F"]  # duplicated from app.priceticker.scraper.TICKERS - keep in sync
+
+    for symbol in TARGET_SYMBOLS:  # one query per symbol (3 total) - acceptable at this scale
+        latest = db.query(TickerPrice).filter(
+            TickerPrice.symbol == symbol
+        ).order_by(TickerPrice.timestamp.desc()).first()
+
+        if latest:
+            results.append(latest.to_dict())
+
+    return results
diff --git a/app/priceticker/scraper.py b/app/priceticker/scraper.py
new file mode 100644
index 0000000..a10c6dc
--- /dev/null
+++ b/app/priceticker/scraper.py
@@ -0,0 +1,65 @@
+import logging
+import yfinance as yf
+from decimal import Decimal
+from datetime import datetime
+from typing import List, Dict, Any
+
+logger = logging.getLogger(__name__)
+
+# Ticker mapping
+# HO=F : Heating Oil
+# CL=F : Crude Oil
+# RB=F : RBOB Gasoline
+TICKERS = ["HO=F", "CL=F", "RB=F"]
+
+def fetch_ticker_data() -> List[Dict[str, Any]]:
+    """
+    Fetch current data for oil tickers from Yahoo Finance.
+
+    Returns:
+        List of dictionaries containing ticker data.
+    """
+    results = []
+
+    try:
+        # Fetch data for all tickers at once
+        tickers_str = " ".join(TICKERS)
+        data = yf.Tickers(tickers_str)
+
+        for symbol in TICKERS:
+            try:
+                ticker = data.tickers[symbol]
+                # Fast info usually contains the latest price
+                info = ticker.fast_info
+
+                # Fallback to history if fast_info is missing crucial data (simplified here)
+                # But fast_info is generally faster and sufficient for last_price
+
+                last_price = info.last_price
+                previous_close = info.previous_close
+
+                if last_price is None:
+                    logger.warning(f"No price found for {symbol}")
+                    continue
+
+                change = last_price - previous_close
+                percent_change = (change / previous_close) * 100 if previous_close else 0
+
+                results.append({
+                    "symbol": symbol,
+                    "price": Decimal(str(last_price)),  # route through str() to avoid float artifacts
+                    "currency": info.currency,
+                    "change": Decimal(str(change)),
+                    "percent_change": Decimal(str(percent_change)),
+                    "timestamp": datetime.utcnow()  # naive UTC; utcnow() is deprecated in 3.12 - consider datetime.now(timezone.utc)
+                })
+
+                logger.info(f"Fetched {symbol}: {last_price} ({percent_change:.2f}%)")
+
+            except Exception as e:
+                logger.error(f"Error processing {symbol}: {e}")  # per-symbol failure does not abort the batch
+
+    except Exception as e:
+        logger.error(f"Error fetching ticker data: {e}")
+
+    return results
diff --git a/app/schemas.py b/app/schemas.py
new file mode 100644
index 0000000..b27b9e7
--- /dev/null
+++ b/app/schemas.py
@@ -0,0 +1,16 @@
+from typing import Optional
+from pydantic import BaseModel
+
+class HealthResponse(BaseModel):
+    """Health check response schema."""
+    status: str
+    db_connected: bool
+
+
+class PriceRecord(BaseModel):
+    """Single price record schema."""
+    company_name: str
+    town: Optional[str] = None  # town can be missing when the scraper cannot identify it
+    price_decimal: float
+    scrape_date: str  # ISO date string; scraper produces datetime.date - serialized upstream
+    zone: str
diff --git a/cronjob.txt b/cronjob.txt
new file mode 100644
index 0000000..faa7761
--- /dev/null
+++ b/cronjob.txt
@@ -0,0 +1 @@
+*/5 * * * * /usr/bin/curl -m 120 -s http://localhost:9519/scraper/newenglandoil/latestprice >/dev/null 2>&1
diff --git a/debug_scraper.py b/debug_scraper.py
new file mode 100644
index 0000000..21a3379
---
/dev/null
+++ b/debug_scraper.py
@@ -0,0 +1,26 @@
+
+import requests
+from bs4 import BeautifulSoup
+
+url = "https://www.newenglandoil.com/massachusetts/zone10.asp?x=0"
+headers = {  # browser-style UA, for comparison with debug_scraper_ua.py
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+}
+
+try:
+    response = requests.get(url, headers=headers, timeout=15)
+    print(f"Status Code: {response.status_code}")
+
+    with open("debug_page.html", "w", encoding="utf-8") as f:  # dump raw HTML for offline inspection
+        f.write(response.text)
+
+    soup = BeautifulSoup(response.content, 'lxml')
+    tables = soup.find_all('table')
+    print(f"Tables found: {len(tables)}")
+
+    if not tables:
+        print("Preview of content:")
+        print(response.text[:500])
+
+except Exception as e:
+    print(f"Error: {e}")
diff --git a/debug_scraper_ua.py b/debug_scraper_ua.py
new file mode 100644
index 0000000..2b413fe
--- /dev/null
+++ b/debug_scraper_ua.py
@@ -0,0 +1,25 @@
+
+import requests
+from bs4 import BeautifulSoup
+
+url = "https://www.newenglandoil.com/massachusetts/zone10.asp?x=0"
+# Use the UA from config.py
+headers = {  # honest/bot UA - compare result against debug_scraper.py's browser UA
+    "User-Agent": "Unraid-EamcoScraper/1.0 (eeames214@gmail.com)"
+}
+
+try:
+    print(f"Testing with UA: {headers['User-Agent']}")
+    response = requests.get(url, headers=headers, timeout=15)
+    print(f"Status Code: {response.status_code}")
+
+    soup = BeautifulSoup(response.content, 'lxml')
+    tables = soup.find_all('table')
+    print(f"Tables found: {len(tables)}")
+
+    if not tables:
+        print("No tables found. Dumping start of response:")
+        print(response.text[:500])
+
+except Exception as e:
+    print(f"Error: {e}")
diff --git a/migration.sql b/migration.sql
new file mode 100644
index 0000000..80628d7
--- /dev/null
+++ b/migration.sql
@@ -0,0 +1,29 @@
+-- Database migration for eamco_scraper
+-- Creates company_prices table for storing historical oil price data
+
+-- Create company_prices table
+CREATE TABLE IF NOT EXISTS company_prices (
+    id SERIAL PRIMARY KEY,
+    company_name VARCHAR(255) NOT NULL,
+    town VARCHAR(100),
+    price_decimal DECIMAL(6,3) NOT NULL, -- up to 999.999; 3 decimals matches $X.XXX/gal listings
+    scrape_date DATE NOT NULL,
+    zone VARCHAR(50) NOT NULL DEFAULT 'zone10',
+    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
+-- Create indexes for efficient querying
+CREATE INDEX IF NOT EXISTS idx_company_prices_company ON company_prices(company_name);
+CREATE INDEX IF NOT EXISTS idx_company_prices_scrape_date ON company_prices(scrape_date);
+CREATE INDEX IF NOT EXISTS idx_company_prices_zone ON company_prices(zone);
+CREATE INDEX IF NOT EXISTS idx_company_prices_company_date ON company_prices(company_name, scrape_date);
+CREATE INDEX IF NOT EXISTS idx_company_prices_zone_date ON company_prices(zone, scrape_date);
+
+-- Add comment to table
+COMMENT ON TABLE company_prices IS 'Historical oil price data scraped from New England Oil website';
+COMMENT ON COLUMN company_prices.company_name IS 'Name of the oil company';
+COMMENT ON COLUMN company_prices.town IS 'Town where the company operates';
+COMMENT ON COLUMN company_prices.price_decimal IS 'Price per gallon in dollars';
+COMMENT ON COLUMN company_prices.scrape_date IS 'Date when the price was listed on the website';
+COMMENT ON COLUMN company_prices.zone IS 'Geographic zone (e.g., zone10 for Central Massachusetts)';
+COMMENT ON COLUMN company_prices.created_at IS 'Timestamp when the record was inserted into database';
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d3fc7d3
--- /dev/null
+++ b/requirements.txt
@@
-0,0 +1,18 @@
+# eamco_scraper dependencies
+# FastAPI web framework and server
+fastapi>=0.109.0,<1.0.0
+uvicorn[standard]>=0.27.0,<1.0.0
+pydantic>=2.5.0,<3.0.0
+
+# Database
+sqlalchemy>=2.0.0,<3.0.0
+psycopg2-binary>=2.9.9,<3.0.0
+
+# Web scraping
+beautifulsoup4>=4.12.0,<5.0.0
+requests>=2.31.0,<3.0.0
+lxml>=5.0.0,<6.0.0
+
+# Configuration
+python-dotenv>=1.0.0,<2.0.0
+yfinance>=0.2.36
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..90e4c14
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Test script for eamco_scraper service
+
+echo "=== eamco_scraper Test Script ==="
+echo ""
+
+# Check if service directory exists
+if [ ! -d "/mnt/code/oil/eamco/eamco_scraper" ]; then
+    echo "❌ Service directory not found"
+    exit 1
+fi
+
+echo "✅ Service directory exists"
+echo ""
+
+# Test 1: Build Docker image
+echo "Test 1: Building Docker image..."
+cd /mnt/code/oil/eamco/eamco_deploy || exit 1  # FIX: bail out if the deploy dir is missing instead of building from the wrong cwd
+docker-compose -f docker-compose.local.yml build scraper_local
+
+if [ $? -eq 0 ]; then
+    echo "✅ Docker build successful"
+else
+    echo "❌ Docker build failed"
+    exit 1
+fi
+
+echo ""
+
+# Test 2: Start the service
+echo "Test 2: Starting service..."
+docker-compose -f docker-compose.local.yml up -d scraper_local
+
+if [ $? -eq 0 ]; then
+    echo "✅ Service started"
+else
+    echo "❌ Service failed to start"
+    exit 1
+fi
+
+echo ""
+echo "Waiting 5 seconds for service to initialize..."
+sleep 5
+
+# Test 3: Health check
+echo "Test 3: Testing health endpoint..."
+HEALTH_RESPONSE=$(curl -s http://localhost:9619/health)  # NOTE(review): cronjob.txt targets port 9519 - confirm which port is correct
+
+if echo "$HEALTH_RESPONSE" | grep -q "status"; then
+    echo "✅ Health endpoint responding"
+    echo "Response: $HEALTH_RESPONSE"
+else
+    echo "❌ Health endpoint not responding"
+fi
+
+echo ""
+
+# Test 4: Scraper endpoint
+echo "Test 4: Testing scraper endpoint..."
+echo "This will scrape live data from New England Oil..."
+SCRAPER_RESPONSE=$(curl -s http://localhost:9619/scraper/newenglandoil/latestprice)
+
+if echo "$SCRAPER_RESPONSE" | grep -q "status"; then
+    echo "✅ Scraper endpoint responding"
+    echo "Response preview:"
+    echo "$SCRAPER_RESPONSE" | head -n 20
+else
+    echo "❌ Scraper endpoint not responding"
+fi
+
+echo ""
+echo "=== Test Complete ==="
+echo ""
+echo "To view logs: docker-compose -f docker-compose.local.yml logs scraper_local"
+echo "To stop service: docker-compose -f docker-compose.local.yml stop scraper_local"