feat: initial commit for oil price scraper service
FastAPI-based scraper for commodity ticker prices (HO, CL, RB futures) and competitor oil pricing from NewEnglandOil. Includes cron-driven scraping, PostgreSQL storage, and REST endpoints for price retrieval. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
app/__init__.py
Normal file
1
app/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# eamco_scraper app package
|
||||
111
app/config.py
Normal file
111
app/config.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
Configuration settings for eamco_scraper.
|
||||
|
||||
This module provides configuration with environment-based switching:
|
||||
- LOCAL: Uses 'eamco' database, localhost CORS origins
|
||||
- PRODUCTION: Uses 'auburnoil' database, production domain CORS origins
|
||||
|
||||
Environment variables are loaded from .env.local or .env.prod depending
|
||||
on the Docker compose file used.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables from .env file if present
|
||||
load_dotenv()
|
||||
|
||||
# =============================================================================
|
||||
# ENVIRONMENT MODE
|
||||
# =============================================================================
|
||||
|
||||
MODE = os.getenv("MODE", "LOCAL")
|
||||
CURRENT_SETTINGS = os.getenv("CURRENT_SETTINGS", "DEVELOPMENT")
|
||||
|
||||
# Log configuration mode (logger setup happens after config is loaded)
|
||||
_config_mode_msg = f"Using {'PRODUCTION' if CURRENT_SETTINGS == 'PRODUCTION' else 'DEVELOPMENT'} configuration"
|
||||
|
||||
# =============================================================================
|
||||
# DATABASE CONFIGURATION
|
||||
# =============================================================================
|
||||
|
||||
# Database connection components (can be overridden individually)
|
||||
POSTGRES_USERNAME = os.getenv("POSTGRES_USERNAME", "postgres")
|
||||
POSTGRES_PW = os.getenv("POSTGRES_PW", "password")
|
||||
POSTGRES_SERVER = os.getenv("POSTGRES_SERVER", "192.168.1.204")
|
||||
POSTGRES_PORT = os.getenv("POSTGRES_PORT", "5432")
|
||||
|
||||
# Database name differs by environment
|
||||
if CURRENT_SETTINGS == "PRODUCTION":
|
||||
POSTGRES_DBNAME = os.getenv("POSTGRES_DBNAME", "auburnoil")
|
||||
else:
|
||||
POSTGRES_DBNAME = os.getenv("POSTGRES_DBNAME", "eamco")
|
||||
|
||||
# Build connection URI from components (fallback)
|
||||
_DEFAULT_DATABASE_URI = "postgresql+psycopg2://{}:{}@{}:{}/{}".format(
|
||||
POSTGRES_USERNAME,
|
||||
POSTGRES_PW,
|
||||
POSTGRES_SERVER,
|
||||
POSTGRES_PORT,
|
||||
POSTGRES_DBNAME
|
||||
)
|
||||
|
||||
# Allow full DATABASE_URL override
|
||||
DATABASE_URL: str = os.getenv("DATABASE_URL", _DEFAULT_DATABASE_URI)
|
||||
|
||||
# SQLAlchemy binds (for compatibility)
|
||||
SQLALCHEMY_DATABASE_URI = DATABASE_URL
|
||||
SQLALCHEMY_BINDS = {POSTGRES_DBNAME: SQLALCHEMY_DATABASE_URI}
|
||||
|
||||
# =============================================================================
|
||||
# CORS CONFIGURATION
|
||||
# =============================================================================
|
||||
|
||||
# Parse CORS origins from environment (comma-separated) or use defaults
|
||||
_cors_env = os.getenv("CORS_ORIGINS", "")
|
||||
|
||||
if _cors_env:
|
||||
CORS_ORIGINS: List[str] = [origin.strip() for origin in _cors_env.split(",")]
|
||||
elif CURRENT_SETTINGS == "PRODUCTION":
|
||||
# Production CORS origins
|
||||
CORS_ORIGINS = [
|
||||
"https://oil.edwineames.com",
|
||||
"https://edwineames.com",
|
||||
]
|
||||
else:
|
||||
# Development CORS origins
|
||||
CORS_ORIGINS = [
|
||||
"http://localhost:9000",
|
||||
"https://localhost:9513",
|
||||
"http://localhost:9514",
|
||||
"http://localhost:9512",
|
||||
"http://localhost:9511",
|
||||
"http://localhost:5173", # Frontend port
|
||||
"http://localhost:9616", # Authorize service port
|
||||
]
|
||||
|
||||
# =============================================================================
|
||||
# SCRAPER CONFIGURATION
|
||||
# =============================================================================
|
||||
|
||||
# User agent for web scraping (identifies your application)
|
||||
SCRAPER_USER_AGENT: str = "Unraid-EamcoScraper/1.0 (eeames214@gmail.com)"
|
||||
|
||||
# Rate limiting: Sleep between requests (be respectful to target servers)
|
||||
SCRAPER_DELAY_SECONDS: float = float(os.getenv("SCRAPER_DELAY", "2.0"))
|
||||
|
||||
# Request timeout in seconds
|
||||
SCRAPER_TIMEOUT: int = int(os.getenv("SCRAPER_TIMEOUT", "10"))
|
||||
|
||||
# Target URL for New England Oil Zone 10
|
||||
NEWENGLAND_OIL_ZONE10_URL: str = "https://www.newenglandoil.com/massachusetts/zone10.asp?x=0"
|
||||
|
||||
# =============================================================================
|
||||
# LOGGING CONFIGURATION
|
||||
# =============================================================================
|
||||
|
||||
LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
|
||||
LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
88
app/database.py
Normal file
88
app/database.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import logging
import sys
from contextlib import contextmanager
from typing import Generator

from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker, Session
from sqlalchemy.exc import SQLAlchemyError

from app.config import (
    DATABASE_URL,
    LOG_LEVEL,
    LOG_FORMAT,
)

# =============================================================================
# LOGGING CONFIGURATION
# =============================================================================

# Configure root logging once at import time; falls back to INFO when
# LOG_LEVEL is not a valid level name.
logging.basicConfig(
    level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
    format=LOG_FORMAT,
    handlers=[
        logging.StreamHandler(sys.stdout),
    ]
)
logger = logging.getLogger(__name__)

# =============================================================================
# DATABASE SETUP
# =============================================================================

# Create SQLAlchemy engine with connection pooling
engine = create_engine(
    DATABASE_URL,
    pool_pre_ping=True,  # Verify connections before use
    pool_size=5,
    max_overflow=10,
    echo=False,  # Set to True for SQL debugging
)

# Session factory; sessions require explicit commit (autocommit=False).
SessionLocal = sessionmaker(
    autocommit=False,
    autoflush=False,
    bind=engine,
)
|
||||
|
||||
|
||||
def get_db() -> Generator[Session, None, None]:
    """FastAPI dependency yielding a database session.

    The session is always closed after the request, even on error.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||
|
||||
|
||||
@contextmanager
def get_db_session() -> Generator[Session, None, None]:
    """Context manager yielding a database session for non-dependency callers.

    Guarantees the session is closed when the ``with`` block exits.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||
|
||||
|
||||
def check_db_connection() -> bool:
    """Probe database connectivity with a trivial query.

    Returns:
        True if database is reachable, False otherwise
    """
    try:
        with get_db_session() as session:
            session.execute(text("SELECT 1"))
    except SQLAlchemyError as e:
        logger.error(f"Database connection failed: {e}")
        return False
    return True
|
||||
195
app/main.py
Normal file
195
app/main.py
Normal file
@@ -0,0 +1,195 @@
|
||||
"""
|
||||
eamco_scraper - FastAPI Oil Price Scraping Microservice.
|
||||
|
||||
This microservice provides endpoints for scraping oil prices from New England Oil
|
||||
and storing them in the database for historical tracking.
|
||||
|
||||
Endpoints:
|
||||
GET /health - Health check with database connectivity status
|
||||
GET /scraper/newenglandoil/latestprice - Trigger scrape and return latest prices
|
||||
|
||||
Usage:
|
||||
# Development
|
||||
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
|
||||
|
||||
# Production (Docker)
|
||||
docker run -p 8000:8000 eamco_scraper
|
||||
|
||||
# Trigger from cron
|
||||
curl http://localhost:8000/scraper/newenglandoil/latestprice
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from typing import List
|
||||
from datetime import datetime
|
||||
|
||||
from fastapi import FastAPI, Depends, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.config import (
|
||||
DATABASE_URL,
|
||||
CORS_ORIGINS,
|
||||
LOG_LEVEL,
|
||||
LOG_FORMAT,
|
||||
)
|
||||
from app.models import Base, CompanyPrice
|
||||
from app.database import engine, get_db, check_db_connection
|
||||
from app.schemas import HealthResponse, PriceRecord
|
||||
from app.newenglandoil.router import router as newenglandoil_router
|
||||
from app.priceticker.router import router as priceticker_router
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LOGGING CONFIGURATION
|
||||
# =============================================================================
|
||||
|
||||
logging.basicConfig(
|
||||
level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
|
||||
format=LOG_FORMAT,
|
||||
handlers=[
|
||||
logging.StreamHandler(sys.stdout),
|
||||
]
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# =============================================================================
|
||||
# FASTAPI APPLICATION
|
||||
# =============================================================================
|
||||
|
||||
app = FastAPI(
|
||||
title="eamco_scraper",
|
||||
description="Oil price scraping microservice for New England Oil",
|
||||
version="1.0.0",
|
||||
docs_url="/docs",
|
||||
redoc_url="/redoc",
|
||||
)
|
||||
|
||||
# =============================================================================
|
||||
# CORS MIDDLEWARE
|
||||
# =============================================================================
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=CORS_ORIGINS,
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# =============================================================================
|
||||
# ROUTERS
|
||||
# =============================================================================
|
||||
|
||||
app.include_router(newenglandoil_router)
|
||||
app.include_router(priceticker_router)
|
||||
|
||||
# =============================================================================
|
||||
# ENDPOINTS
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@app.get("/", include_in_schema=False)
|
||||
async def root():
|
||||
"""Root endpoint - redirect to docs."""
|
||||
return {
|
||||
"service": "eamco_scraper",
|
||||
"version": "1.0.0",
|
||||
"docs": "/docs",
|
||||
}
|
||||
|
||||
|
||||
@app.get("/health", response_model=HealthResponse, tags=["Health"])
|
||||
async def health_check():
|
||||
"""
|
||||
Health check endpoint.
|
||||
|
||||
Returns service status and database connectivity.
|
||||
Use this endpoint for container health checks and monitoring.
|
||||
|
||||
Returns:
|
||||
HealthResponse with status and db_connected flag
|
||||
"""
|
||||
db_connected = check_db_connection()
|
||||
|
||||
return HealthResponse(
|
||||
status="healthy" if db_connected else "degraded",
|
||||
db_connected=db_connected,
|
||||
)
|
||||
|
||||
|
||||
@app.get(
    "/scraper/prices",
    response_model=List[PriceRecord],
    tags=["Prices"],
)
async def get_stored_prices(
    date: str | None = None,
    db: Session = Depends(get_db)
):
    """
    Get stored oil prices from the database.

    If no date is provided, returns prices for the current date (UTC).
    Does NOT trigger a scrape.

    Args:
        date: ISO date string (YYYY-MM-DD) to filter on; defaults to today.
        db: injected SQLAlchemy session.

    Raises:
        HTTPException: 422 for a malformed date, 500 on database errors.
    """
    if not date:
        date = datetime.utcnow().date().isoformat()
    else:
        # Validate early so a malformed date yields a clear 422 instead of
        # bubbling up from the database layer as an opaque 500.
        try:
            datetime.strptime(date, "%Y-%m-%d")
        except ValueError:
            raise HTTPException(
                status_code=422,
                detail=f"Invalid date {date!r}; expected YYYY-MM-DD"
            )

    try:
        # Query prices for the specific date
        prices = db.query(CompanyPrice).filter(
            CompanyPrice.scrape_date == date
        ).all()

        return [
            PriceRecord(
                company_name=p.company_name,
                town=p.town,
                price_decimal=float(p.price_decimal),
                scrape_date=str(p.scrape_date),
                zone=p.zone
            ) for p in prices
        ]
    except Exception as e:
        logger.error(f"Error fetching prices: {e}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# STARTUP/SHUTDOWN EVENTS
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event():
|
||||
"""Application startup - log configuration and test DB connection."""
|
||||
logger.info("🚀 eamco_scraper STARTING")
|
||||
mode = os.environ.get('MODE', 'DEVELOPMENT').upper()
|
||||
if mode in ['DEVELOPMENT', 'DEV', 'LOCAL']:
|
||||
logger.info("🤖🤖🤖🤖🤖 Mode: Development 🤖🤖🤖🤖🤖")
|
||||
elif mode in ['PRODUCTION', 'PROD']:
|
||||
logger.info("💀💀💀💀💀💀💀💀💀💀 ⚠️ WARNING PRODUCTION 💀💀💀💀💀💀💀💀💀💀")
|
||||
logger.info(f"DB: {DATABASE_URL[:30]}...")
|
||||
logger.info(f"CORS: {len(CORS_ORIGINS)} origins configured")
|
||||
|
||||
# Test database connection
|
||||
if check_db_connection():
|
||||
logger.info("DB Connection: ✅ OK")
|
||||
else:
|
||||
logger.warning("DB Connection: ❌ FAILED")
|
||||
|
||||
# Create tables if they don't exist
|
||||
try:
|
||||
Base.metadata.create_all(bind=engine)
|
||||
logger.info("Database tables verified/created")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to create tables: {e}")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event():
|
||||
"""Application shutdown - cleanup."""
|
||||
logger.info("🛑 eamco_scraper SHUTTING DOWN")
|
||||
99
app/models.py
Normal file
99
app/models.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""
|
||||
SQLAlchemy models for eamco_scraper.
|
||||
|
||||
This module defines the database models for storing scraped oil price data.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from sqlalchemy import Column, Integer, String, Numeric, Date, DateTime, Index
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
|
||||
class CompanyPrice(Base):
    """
    Model for storing oil company pricing data.

    This table stores historical pricing data from oil companies.
    Each scrape creates new records (no updates) to enable price trend analysis.

    Attributes:
        id: Primary key
        company_name: Name of the oil company
        town: Town where the company operates
        price_decimal: Price per gallon (e.g., 2.599)
        scrape_date: Date when the price was listed on the website
        zone: Geographic zone (default: 'zone10' for Central Massachusetts)
        created_at: Timestamp when the record was inserted into database
    """

    __tablename__ = "company_prices"

    id = Column(Integer, primary_key=True, autoincrement=True)
    company_name = Column(String(255), nullable=False, index=True)
    town = Column(String(100), nullable=True)
    price_decimal = Column(Numeric(6, 3), nullable=False)
    scrape_date = Column(Date, nullable=False, index=True)
    zone = Column(String(50), nullable=False, default="zone10", index=True)
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)

    def __repr__(self):
        return f"<CompanyPrice(company='{self.company_name}', price={self.price_decimal}, date={self.scrape_date})>"

    def to_dict(self):
        """Convert model instance to dictionary for JSON serialization."""
        return {
            "id": self.id,
            "company_name": self.company_name,
            "town": self.town,
            # Use explicit None checks: a truthiness test would serialize a
            # legitimate 0 price as None.
            "price_decimal": float(self.price_decimal) if self.price_decimal is not None else None,
            "scrape_date": self.scrape_date.isoformat() if self.scrape_date else None,
            "zone": self.zone,
            "created_at": self.created_at.isoformat() if self.created_at else None,
        }
|
||||
|
||||
|
||||
# Create composite indexes for common queries
|
||||
Index('idx_company_prices_company_date', CompanyPrice.company_name, CompanyPrice.scrape_date)
|
||||
Index('idx_company_prices_zone_date', CompanyPrice.zone, CompanyPrice.scrape_date)
|
||||
|
||||
|
||||
class TickerPrice(Base):
    """
    Model for storing ticker prices (Stocks/Commodities).

    Append-only: each scrape inserts a new row, and readers take the row with
    the latest timestamp per symbol.

    Attributes:
        id: Primary key
        symbol: Ticker symbol (e.g., HO=F, CL=F, RB=F)
        price_decimal: Current price
        currency: Currency code (e.g., USD)
        change_decimal: Price change amount
        percent_change_decimal: Price change percentage
        timestamp: Time of scrape
    """

    __tablename__ = "ticker_prices"

    id = Column(Integer, primary_key=True, autoincrement=True)
    symbol = Column(String(20), nullable=False, index=True)
    price_decimal = Column(Numeric(10, 4), nullable=False)
    currency = Column(String(10), nullable=True)
    change_decimal = Column(Numeric(10, 4), nullable=True)
    percent_change_decimal = Column(Numeric(10, 4), nullable=True)
    # Indexed so "latest per symbol" ORDER BY timestamp DESC queries are cheap.
    timestamp = Column(DateTime, nullable=False, default=datetime.utcnow, index=True)

    def __repr__(self):
        return f"<TickerPrice(symbol='{self.symbol}', price={self.price_decimal}, time={self.timestamp})>"

    def to_dict(self):
        """Convert model instance to a JSON-serializable dictionary."""
        return {
            "id": self.id,
            "symbol": self.symbol,
            "price": float(self.price_decimal) if self.price_decimal is not None else None,
            "currency": self.currency,
            "change": float(self.change_decimal) if self.change_decimal is not None else None,
            "percent_change": float(self.percent_change_decimal) if self.percent_change_decimal is not None else None,
            "timestamp": self.timestamp.isoformat() if self.timestamp else None,
        }
|
||||
|
||||
9
app/newenglandoil/__init__.py
Normal file
9
app/newenglandoil/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
||||
"""
|
||||
New England Oil Scraper Module.
|
||||
|
||||
This package contains code specific to scraping prices from the New England Oil website.
|
||||
"""
|
||||
|
||||
from app.newenglandoil.scraper import scrape_newengland_oil, ScraperError
|
||||
|
||||
__all__ = ['scrape_newengland_oil', 'ScraperError']
|
||||
140
app/newenglandoil/router.py
Normal file
140
app/newenglandoil/router.py
Normal file
@@ -0,0 +1,140 @@
|
||||
import logging
from datetime import datetime
from typing import List

from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy.orm import Session
from sqlalchemy.exc import SQLAlchemyError

from app.models import CompanyPrice
from app.schemas import PriceRecord
from app.newenglandoil.schemas import LatestPriceResponse
from app.newenglandoil.scraper import scrape_newengland_oil, ScraperError
from app.database import get_db

logger = logging.getLogger(__name__)

# All endpoints in this module are mounted under /scraper/newenglandoil.
router = APIRouter(
    prefix="/scraper/newenglandoil",
    tags=["New England Oil Scraper"],
)
|
||||
|
||||
@router.get(
    "/latestprice",
    response_model=LatestPriceResponse,
)
async def get_latest_prices(db: Session = Depends(get_db)):
    """
    Scrape latest oil prices from New England Oil Zone 10.

    This endpoint:
    1. Scrapes the New England Oil website for current prices
    2. Stores all prices in the database (historical tracking)
    3. Returns the scraped prices

    Designed to be called from cron for automated price tracking.

    Example:
        curl http://localhost:8000/scraper/newenglandoil/latestprice

    Returns:
        LatestPriceResponse with scraped prices and storage statistics

    Raises:
        HTTPException: 500 on scraper, database, or unexpected errors.
    """
    logger.info("=" * 60)
    logger.info("SCRAPER ENDPOINT CALLED - New England Oil Zone 10")
    logger.info("=" * 60)

    try:
        # Scrape the website; raises ScraperError on fetch/parse failure.
        scraped_prices = scrape_newengland_oil()

        if not scraped_prices:
            logger.warning("No prices were scraped")
            return LatestPriceResponse(
                status="warning",
                message="No prices found on website",
                prices_scraped=0,
                prices_stored=0,
                scrape_timestamp=datetime.utcnow().isoformat(),
                prices=[]
            )

        # Store prices in database
        stored_count = 0
        price_records = []

        for price_data in scraped_prices:
            # Add to response list (regardless of whether it's new or existing)
            price_records.append(PriceRecord(
                company_name=price_data["company_name"],
                town=price_data.get("town"),
                price_decimal=float(price_data["price_decimal"]),
                scrape_date=price_data["scrape_date"].isoformat(),
                zone=price_data.get("zone", "zone10"),
            ))

            try:
                # Check for existing record to prevent duplicates.
                # NOTE(review): one SELECT per scraped row — acceptable for
                # the small row counts a single zone page produces.
                existing_record = db.query(CompanyPrice).filter(
                    CompanyPrice.company_name == price_data["company_name"],
                    CompanyPrice.scrape_date == price_data["scrape_date"],
                    CompanyPrice.zone == price_data.get("zone", "zone10")
                ).first()

                if existing_record:
                    logger.debug(f"Skipping duplicate record for {price_data['company_name']} on {price_data['scrape_date']}")
                    continue

                # Create database record (added to the session; committed below)
                price_record = CompanyPrice(
                    company_name=price_data["company_name"],
                    town=price_data.get("town"),
                    price_decimal=price_data["price_decimal"],
                    scrape_date=price_data["scrape_date"],
                    zone=price_data.get("zone", "zone10"),
                )

                db.add(price_record)
                stored_count += 1

            except Exception as e:
                # Best-effort per record: a failed row is logged and skipped,
                # but it still appears in the response list built above.
                logger.error(f"Failed to store price for {price_data.get('company_name')}: {e}")

        # Commit all new records in one transaction.
        if stored_count > 0:
            db.commit()
            logger.info(f"Successfully stored {stored_count} new price records")
        else:
            logger.info("No new price records to store (all duplicates)")

        return LatestPriceResponse(
            status="success",
            message=f"Successfully scraped {len(scraped_prices)} prices, stored {stored_count} new records",
            prices_scraped=len(scraped_prices),
            prices_stored=stored_count,
            scrape_timestamp=datetime.utcnow().isoformat(),
            prices=price_records,
        )

    except ScraperError as e:
        logger.error(f"Scraper error: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Scraping failed: {str(e)}"
        )

    except SQLAlchemyError as e:
        db.rollback()
        logger.error(f"Database error during price storage: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Database error: {str(e)}"
        )

    except Exception as e:
        db.rollback()
        logger.error(f"Unexpected error during scraping: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Scraping failed: {str(e)}"
        )
|
||||
12
app/newenglandoil/schemas.py
Normal file
12
app/newenglandoil/schemas.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from typing import List
|
||||
from pydantic import BaseModel
|
||||
from app.schemas import PriceRecord
|
||||
|
||||
class LatestPriceResponse(BaseModel):
    """Latest price scrape response schema.

    Returned by GET /scraper/newenglandoil/latestprice; `prices` lists every
    scraped record, while `prices_stored` counts only newly inserted rows.
    """
    status: str            # "success" or "warning"
    message: str           # human-readable summary
    prices_scraped: int    # total records found on the page
    prices_stored: int     # new (non-duplicate) rows inserted
    scrape_timestamp: str  # ISO-8601 UTC timestamp of the scrape
    prices: List[PriceRecord]
|
||||
170
app/newenglandoil/scraper.py
Normal file
170
app/newenglandoil/scraper.py
Normal file
@@ -0,0 +1,170 @@
|
||||
"""
|
||||
Web scraping module for New England Oil prices.
|
||||
|
||||
This module handles scraping oil price data from the New England Oil website
|
||||
for Zone 10 (Central Massachusetts).
|
||||
"""
|
||||
|
||||
import logging
import time
from datetime import date
from decimal import Decimal
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
|
||||
|
||||
from app.config import (
|
||||
NEWENGLAND_OIL_ZONE10_URL,
|
||||
SCRAPER_USER_AGENT,
|
||||
SCRAPER_TIMEOUT,
|
||||
SCRAPER_DELAY_SECONDS,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ScraperError(Exception):
    """Raised when fetching or parsing the New England Oil page fails."""
|
||||
|
||||
|
||||
def scrape_newengland_oil() -> List[Dict[str, Any]]:
    """
    Scrape oil prices from New England Oil Zone 10 page.

    Fetches the page, parses the HTML table, and extracts company names,
    towns, and prices.

    Returns:
        List of dictionaries with keys: company_name, town, price_decimal, scrape_date, zone

    Raises:
        ScraperError: If the request fails or parsing fails
    """
    logger.info(f"Starting scrape of {NEWENGLAND_OIL_ZONE10_URL}")

    headers = {
        "User-Agent": SCRAPER_USER_AGENT,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }

    try:
        # Make the request
        response = requests.get(
            NEWENGLAND_OIL_ZONE10_URL,
            headers=headers,
            timeout=SCRAPER_TIMEOUT
        )
        response.raise_for_status()

        logger.info(f"Successfully fetched page (status: {response.status_code})")

        # Parse HTML
        soup = BeautifulSoup(response.content, 'lxml')

        prices = []
        today = date.today()

        # Find all tables; the price data lives in one of them.
        tables = soup.find_all('table')

        if not tables:
            logger.warning("No tables found on page")
            # Debug: save the raw HTML so the layout change can be inspected
            with open("debug_page.html", "wb") as f:
                f.write(response.content)
            raise ScraperError("No price table found on page")

        # Scan every table row for a company link plus a plausible price cell.
        for table in tables:
            rows = table.find_all('tr')

            for row in rows:
                cells = row.find_all(['td', 'th'])

                if len(cells) >= 3:  # Expect at least company, town, price
                    # Company name is carried in the row's anchor tag.
                    company_link = row.find('a')
                    if company_link:
                        company_name = company_link.get_text(strip=True)

                        cell_texts = [cell.get_text(strip=True) for cell in cells]

                        # Look for price pattern (e.g., "$2.599" or "2.599")
                        price_value = None
                        town_value = None

                        for text in cell_texts:
                            text_clean = text.replace('$', '').replace(',', '').strip()
                            try:
                                if text_clean and '.' in text_clean:
                                    potential_price = Decimal(text_clean)
                                    # Reasonable price range for heating oil (0.50 to 10.00)
                                    if Decimal('0.50') <= potential_price <= Decimal('10.00'):
                                        price_value = potential_price
                                        break
                            except (ValueError, ArithmeticError):
                                # Not a valid price, might be town name
                                if text and not text.startswith('$') and len(text) > 2:
                                    if not town_value:  # Take first non-price text as town
                                        town_value = text

                        if price_value:
                            prices.append({
                                "company_name": company_name,
                                "town": town_value,
                                "price_decimal": price_value,
                                "scrape_date": today,
                                "zone": "zone10"
                            })
                            logger.debug(f"Found: {company_name} - {town_value} - ${price_value}")

        if not prices:
            logger.warning("No prices extracted from page")
            raise ScraperError("Failed to extract any price data from page")

        logger.info(f"Successfully scraped {len(prices)} price records")
        return prices

    except ScraperError:
        # Re-raise our own errors untouched; previously they fell through to
        # the generic handler below and were double-wrapped as
        # "Failed to parse page: ...".
        raise

    except requests.RequestException as e:
        logger.error(f"Request failed: {e}")
        raise ScraperError(f"Failed to fetch page: {str(e)}")

    except Exception as e:
        logger.error(f"Scraping failed: {e}", exc_info=True)
        raise ScraperError(f"Failed to parse page: {str(e)}")
|
||||
|
||||
|
||||
def scrape_and_delay() -> List[Dict[str, Any]]:
    """
    Scrape prices and apply rate limiting delay.

    This is a convenience function that scrapes and then sleeps
    to respect rate limits.

    Returns:
        List of price dictionaries

    Raises:
        ScraperError: propagated from scrape_newengland_oil.
    """
    # Fixed annotation: `any` is the builtin function, not a type; the correct
    # spelling is typing.Any.
    prices = scrape_newengland_oil()

    # Apply rate limiting delay
    if SCRAPER_DELAY_SECONDS > 0:
        logger.debug(f"Sleeping {SCRAPER_DELAY_SECONDS}s for rate limiting")
        time.sleep(SCRAPER_DELAY_SECONDS)

    return prices
|
||||
0
app/priceticker/__init__.py
Normal file
0
app/priceticker/__init__.py
Normal file
72
app/priceticker/router.py
Normal file
72
app/priceticker/router.py
Normal file
@@ -0,0 +1,72 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy.orm import Session
from typing import List
import logging

from app.database import get_db
from app.models import TickerPrice
from app.priceticker.scraper import fetch_ticker_data

logger = logging.getLogger(__name__)

# All endpoints in this module are mounted under /scraper/priceticker.
router = APIRouter(
    prefix="/scraper/priceticker",
    tags=["Price Ticker"]
)
|
||||
|
||||
@router.post("/update")
|
||||
async def update_prices(db: Session = Depends(get_db)):
|
||||
"""
|
||||
Trigger an immediate update of stock/commodity prices.
|
||||
"""
|
||||
logger.info("Triggering ticker update...")
|
||||
data = fetch_ticker_data()
|
||||
|
||||
if not data:
|
||||
raise HTTPException(status_code=500, detail="Failed to fetch ticker data")
|
||||
|
||||
try:
|
||||
saved_records = []
|
||||
for item in data:
|
||||
record = TickerPrice(
|
||||
symbol=item["symbol"],
|
||||
price_decimal=item["price"],
|
||||
currency=item["currency"],
|
||||
change_decimal=item["change"],
|
||||
percent_change_decimal=item["percent_change"],
|
||||
timestamp=item["timestamp"]
|
||||
)
|
||||
db.add(record)
|
||||
saved_records.append(record)
|
||||
|
||||
db.commit()
|
||||
return {"status": "success", "updated": len(saved_records)}
|
||||
|
||||
except Exception as e:
|
||||
db.rollback()
|
||||
logger.error(f"Database error saving tickers: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Database error: {str(e)}")
|
||||
|
||||
@router.get("/latest")
|
||||
async def get_latest_prices(db: Session = Depends(get_db)):
|
||||
"""
|
||||
Get the most recent price for each ticker symbol.
|
||||
"""
|
||||
# Subquery to find the latest timestamp for each symbol
|
||||
# This is a bit complex in pure ORM, so we might do it simply for now:
|
||||
# 1. Get list of distinct symbols we care about
|
||||
# 2. Query latest for each
|
||||
|
||||
results = []
|
||||
# We know the symbols we care about: HO=F, CL=F, RB=F
|
||||
TARGET_SYMBOLS = ["HO=F", "CL=F", "RB=F"]
|
||||
|
||||
for symbol in TARGET_SYMBOLS:
|
||||
latest = db.query(TickerPrice).filter(
|
||||
TickerPrice.symbol == symbol
|
||||
).order_by(TickerPrice.timestamp.desc()).first()
|
||||
|
||||
if latest:
|
||||
results.append(latest.to_dict())
|
||||
|
||||
return results
|
||||
65
app/priceticker/scraper.py
Normal file
65
app/priceticker/scraper.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import logging
import yfinance as yf
from decimal import Decimal
from datetime import datetime
from typing import List, Dict, Any

logger = logging.getLogger(__name__)

# Ticker mapping (Yahoo Finance futures symbols)
# HO=F : Heating Oil
# CL=F : Crude Oil
# RB=F : RBOB Gasoline
TICKERS = ["HO=F", "CL=F", "RB=F"]
|
||||
|
||||
def fetch_ticker_data() -> List[Dict[str, Any]]:
    """
    Fetch current data for oil tickers from Yahoo Finance.

    Returns:
        List of dictionaries containing ticker data with keys: symbol, price,
        currency, change, percent_change, timestamp. `change` and
        `percent_change` are None when Yahoo reports no previous close.
        Symbols that fail to fetch are logged and omitted; on a total failure
        an empty list is returned.
    """
    results = []

    try:
        # Fetch data for all tickers at once
        tickers_str = " ".join(TICKERS)
        data = yf.Tickers(tickers_str)

        for symbol in TICKERS:
            try:
                ticker = data.tickers[symbol]
                # fast_info is generally sufficient for last_price and cheaper
                # than a full history call.
                info = ticker.fast_info

                last_price = info.last_price
                previous_close = info.previous_close

                if last_price is None:
                    logger.warning(f"No price found for {symbol}")
                    continue

                # Guard: previously a missing previous_close raised a
                # TypeError and silently dropped the symbol.
                if previous_close is not None:
                    change = last_price - previous_close
                    percent_change = (change / previous_close) * 100 if previous_close else 0
                else:
                    change = None
                    percent_change = None

                results.append({
                    "symbol": symbol,
                    "price": Decimal(str(last_price)),
                    "currency": info.currency,
                    "change": Decimal(str(change)) if change is not None else None,
                    "percent_change": Decimal(str(percent_change)) if percent_change is not None else None,
                    "timestamp": datetime.utcnow()
                })

                if percent_change is not None:
                    logger.info(f"Fetched {symbol}: {last_price} ({percent_change:.2f}%)")
                else:
                    logger.info(f"Fetched {symbol}: {last_price}")

            except Exception as e:
                logger.error(f"Error processing {symbol}: {e}")

    except Exception as e:
        logger.error(f"Error fetching ticker data: {e}")

    return results
|
||||
16
app/schemas.py
Normal file
16
app/schemas.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
class HealthResponse(BaseModel):
    """Health check response schema.

    Returned by GET /health.
    """
    status: str         # "healthy" or "degraded"
    db_connected: bool  # result of the SELECT 1 connectivity probe
|
||||
|
||||
|
||||
class PriceRecord(BaseModel):
    """Single price record schema.

    Serialized form of one CompanyPrice row / one scraped price entry.
    """
    company_name: str
    town: Optional[str] = None
    price_decimal: float      # price per gallon
    scrape_date: str          # ISO date string (YYYY-MM-DD)
    zone: str                 # geographic zone, e.g. "zone10"
|
||||
Reference in New Issue
Block a user