first commit

2026-01-18 17:53:26 -05:00
commit 0b9c0915a1
15 changed files with 2692 additions and 0 deletions

39
.env.example Normal file

@@ -0,0 +1,39 @@
# =============================================================================
# eamco_address_checker Environment Configuration
# =============================================================================
# Copy this file to .env and adjust values as needed.
# All values have sensible defaults; only override what you need.
# =============================================================================
# DATABASE
# =============================================================================
# Override the default PostgreSQL connection string
# Default: postgresql+psycopg2://postgres:password@192.168.1.204/eamco
# DATABASE_URL=postgresql+psycopg2://user:pass@host:5432/database
# =============================================================================
# BATCH PROCESSING
# =============================================================================
# Maximum records to process per batch run (default: 150)
# BATCH_SIZE=150
# Number of records to process before committing to database (default: 20)
# COMMIT_BATCH_SIZE=20
# =============================================================================
# RATE LIMITING (Nominatim)
# =============================================================================
# Minimum sleep between geocoding requests in seconds (default: 1.2)
# MIN_SLEEP=1.2
# Maximum sleep between geocoding requests in seconds (default: 1.8)
# MAX_SLEEP=1.8
# Geocoding request timeout in seconds (default: 10)
# GEOCODE_TIMEOUT=10
# =============================================================================
# LOGGING
# =============================================================================
# Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL (default: INFO)
# LOG_LEVEL=INFO
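# =============================================================================
# EXAMPLE OVERRIDES (illustrative values only - adjust for your environment)
# =============================================================================
# DATABASE_URL=postgresql+psycopg2://postgres:password@localhost:5432/eamco
# BATCH_SIZE=50
# LOG_LEVEL=DEBUG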

132
.gitignore vendored Normal file

@@ -0,0 +1,132 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having sub-dependencies with platform-specific binaries, it is better to ignore the Pipfile.lock.
# Pipfile.lock
# PEP 582; __pypackages__
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyderworkspace
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# Additional environment files
.env.local
.env.prod

45
Dockerfile Normal file

@@ -0,0 +1,45 @@
# eamco_address_checker Dockerfile
# Lightweight Python 3.11 image for Unraid Docker deployment
FROM python:3.11-slim
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1
# Set working directory
WORKDIR /app
# Install system dependencies (psycopg2 requirements)
RUN apt-get update && apt-get install -y --no-install-recommends \
libpq-dev \
gcc \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
# Copy requirements first (for better layer caching)
COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY app/ ./app/
# Create non-root user for security
RUN useradd --create-home --shell /bin/bash appuser && \
chown -R appuser:appuser /app
USER appuser
# Expose port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

14
Dockerfile.dev Normal file

@@ -0,0 +1,14 @@
FROM python:3.11
ENV PYTHONFAULTHANDLER=1
ENV PYTHONUNBUFFERED=1
ENV MODE="DEVELOPMENT"
WORKDIR /app
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt
COPY . .
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

40
Dockerfile.local Normal file

@@ -0,0 +1,40 @@
# eamco_address_checker - DEVELOPMENT Dockerfile
# Used by docker-compose.local.yml
# Features: Hot reload via volume mount, debug logging
FROM python:3.11-slim
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1
# Set working directory
WORKDIR /app
# Install system dependencies (psycopg2 requirements)
RUN apt-get update && apt-get install -y --no-install-recommends \
libpq-dev \
gcc \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
# Copy requirements first (for better layer caching)
COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy environment file for local development
COPY .env.local .env
# Copy application code (will be overridden by volume mount in compose)
COPY app/ ./app/
# Expose port
EXPOSE 8000
# Development: Run with reload enabled
CMD ["uvicorn", "app.main:app", "--reload", "--host", "0.0.0.0", "--port", "8000"]

49
Dockerfile.prod Normal file

@@ -0,0 +1,49 @@
# eamco_address_checker - PRODUCTION Dockerfile
# Used by docker-compose.prod.yml
# Features: Optimized for production, non-root user, health checks
FROM python:3.11-slim
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PYTHONPATH=/app \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1
# Set working directory
WORKDIR /app
# Install system dependencies (psycopg2 requirements)
RUN apt-get update && apt-get install -y --no-install-recommends \
libpq-dev \
gcc \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
# Copy requirements first (for better layer caching)
COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy environment file for production
COPY .env.prod .env
# Copy application code
COPY app/ ./app/
# Create non-root user for security
RUN useradd --create-home --shell /bin/bash appuser && \
chown -R appuser:appuser /app
USER appuser
# Expose port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
# Production: Run without reload, with workers
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]

3
README.md Normal file

@@ -0,0 +1,3 @@
# EAMCO Address Checker
Batch address-verification microservice (FastAPI). It geocodes customer addresses with Nominatim and corrects misspelled street names via fuzzy matching against OpenStreetMap street data. See `/docs` for the API, or trigger a batch with `curl -X POST http://localhost:8000/verify-addresses`.

1
app/__init__.py Normal file

@@ -0,0 +1 @@
# eamco_address_checker app package

516
app/agent.py Normal file

@@ -0,0 +1,516 @@
"""
Agentic Address Verification Orchestrator.
This module implements a lightweight ReAct-inspired autonomous agent for batch
address verification. The agent follows a structured workflow:
1. PLANNING PHASE: Query records needing verification
2. EXECUTION PHASE: For each record, follow think-act-observe-reflect cycle
- If geocoding fails, attempt fuzzy matching to correct misspellings
- Retry geocoding with corrected address
3. REFLECTION PHASE: Summarize batch results and statistics
The agent is designed for resilience - individual record failures don't stop
the batch, and progress is committed incrementally.
"""
import copy
import logging
from dataclasses import dataclass, field
from datetime import datetime, date
from typing import List, Optional
from geopy.geocoders import Nominatim
from sqlalchemy import or_, func
from sqlalchemy.orm import Session
from app.config import (
BATCH_SIZE,
COMMIT_BATCH_SIZE,
NOMINATIM_USER_AGENT,
)
from app.models import CustomerCustomer
from app.tools import (
build_address,
validate_address_components,
format_address_string,
geocode_address,
validate_geocode_result,
update_record,
rate_limit_sleep,
GeocodeResult,
get_state_abbreviation,
)
from app.streets import correct_address, get_town_street_count
logger = logging.getLogger(__name__)
@dataclass
class BatchStats:
"""Statistics for a batch verification run."""
total_queried: int = 0
processed: int = 0
updated: int = 0
corrected: int = 0 # Addresses fixed via fuzzy matching
failed: int = 0
skipped: int = 0
rate_limited: int = 0
errors: List[str] = field(default_factory=list)
corrections: List[str] = field(default_factory=list) # Log of corrections made
start_time: Optional[datetime] = None
end_time: Optional[datetime] = None
@property
def duration_seconds(self) -> float:
"""Calculate batch duration in seconds."""
if self.start_time and self.end_time:
return (self.end_time - self.start_time).total_seconds()
return 0.0
def to_dict(self) -> dict:
"""Convert stats to dictionary for JSON response."""
return {
"total_queried": self.total_queried,
"processed": self.processed,
"updated": self.updated,
"corrected": self.corrected,
"failed": self.failed,
"skipped": self.skipped,
"rate_limited": self.rate_limited,
"duration_seconds": round(self.duration_seconds, 2),
"errors_count": len(self.errors),
"sample_errors": self.errors[:5] if self.errors else [],
"sample_corrections": self.corrections[:5] if self.corrections else [],
}
class AddressVerificationAgent:
"""
Lightweight autonomous agent for address verification.
Implements a ReAct-inspired workflow where each record goes through:
- OBSERVE: Examine the address data
- THINK: Decide if geocoding should be attempted
- ACT: Call geocoding API
- OBSERVE: Examine the result
- REFLECT: Log decision and update database
Attributes:
session: SQLAlchemy database session
batch_size: Maximum records per batch
commit_size: Records between commits
stats: Running statistics for the batch
geocoder: Nominatim geocoder instance
"""
def __init__(
self,
session: Session,
batch_size: int = BATCH_SIZE,
commit_size: int = COMMIT_BATCH_SIZE,
):
"""
Initialize the address verification agent.
Args:
session: SQLAlchemy session for database operations
batch_size: Max records to process (default from config)
commit_size: Records before intermediate commit
"""
self.session = session
self.batch_size = batch_size
self.commit_size = commit_size
self.stats = BatchStats()
self.geocoder = Nominatim(user_agent=NOMINATIM_USER_AGENT)
logger.info(
f"Agent initialized: batch_size={batch_size}, commit_size={commit_size}"
)
# =========================================================================
# PHASE 1: PLANNING
# =========================================================================
def plan_batch(self) -> List[CustomerCustomer]:
"""
PLANNING PHASE: Query records that need address verification.
Criteria for selection:
- correct_address = FALSE, OR
- verified_at IS NULL, OR
- verified_at < today (not verified today)
Returns:
List of CustomerCustomer records to process
"""
logger.info("=" * 60)
logger.info("PLANNING PHASE: Querying records needing verification")
logger.info("=" * 60)
today = date.today()
# Build query for records needing verification
query = self.session.query(CustomerCustomer).filter(
or_(
CustomerCustomer.correct_address == False, # noqa: E712
CustomerCustomer.verified_at.is_(None),
func.date(CustomerCustomer.verified_at) < today,
)
).limit(self.batch_size)
records = query.all()
self.stats.total_queried = len(records)
logger.info(
f"PLAN RESULT: Found {len(records)} records needing verification",
extra={"record_count": len(records), "batch_limit": self.batch_size}
)
# Log sample of record IDs for debugging
if records:
sample_ids = [r.id for r in records[:10]]
logger.debug(f"Sample record IDs: {sample_ids}")
return records
# =========================================================================
# PHASE 2: EXECUTION (ReAct-style per record)
# =========================================================================
def process_record(self, customer: CustomerCustomer) -> bool:
"""
EXECUTION PHASE: Process a single record with ReAct-style workflow.
Steps:
1. OBSERVE: Build address from record components
2. THINK: Validate address - skip if obviously invalid
3. ACT: Call Nominatim geocoder
4. OBSERVE: Examine geocoding result
5. REFLECT: Log decision and update database
Args:
customer: CustomerCustomer record to process
Returns:
True if record was successfully updated, False otherwise
"""
logger.info("-" * 40)
logger.info(f"Processing record ID: {customer.id}")
# -----------------------------------------------------------------
# STEP 1: OBSERVE - Build address from components
# -----------------------------------------------------------------
logger.debug(f"[OBSERVE] Building address for customer {customer.id}")
address_components = build_address(customer)
# -----------------------------------------------------------------
# STEP 2: THINK - Validate address components
# -----------------------------------------------------------------
logger.debug(f"[THINK] Validating address components")
address_components = validate_address_components(address_components)
if not address_components.is_valid:
# REFLECT: Skip invalid addresses
logger.info(
f"[REFLECT] Skipping record {customer.id}: "
f"{address_components.validation_error}"
)
self.stats.skipped += 1
# Still update the record to mark it as processed
geocode_result = GeocodeResult(
success=False,
skipped=True,
skip_reason=address_components.validation_error,
error_message=address_components.validation_error,
)
update_record(self.session, customer, geocode_result, is_valid=False)
return False
# Format address for geocoding
address_string = format_address_string(address_components)
logger.debug(f"[THINK] Formatted address: {address_string}")
# -----------------------------------------------------------------
# STEP 3: ACT - Call geocoding API
# -----------------------------------------------------------------
logger.debug(f"[ACT] Calling Nominatim geocoder")
geocode_result = geocode_address(address_string, self.geocoder)
# -----------------------------------------------------------------
# STEP 4: OBSERVE - Examine geocoding result
# -----------------------------------------------------------------
logger.debug(f"[OBSERVE] Geocoding result: success={geocode_result.success}")
if not geocode_result.success:
# -----------------------------------------------------------------
# STEP 4a: THINK - Try fuzzy matching to correct address
# -----------------------------------------------------------------
logger.info(
f"[THINK] Geocoding failed, attempting fuzzy street matching..."
)
# Get state abbreviation for fuzzy matching
state_abbr = get_state_abbreviation(customer.customer_state)
town = address_components.city
if state_abbr and town:
# Check if we have street data for this town
street_count = get_town_street_count(self.session, town, state_abbr)
if street_count > 0:
# Try to correct the address
match = correct_address(
session=self.session,
full_address=address_components.street or "",
town=town,
state=state_abbr,
min_confidence=75.0,
)
if match and match.corrected_address:
logger.info(
f"[ACT] Found correction: '{address_components.street}' "
f"-> '{match.corrected_address}' "
f"(confidence: {match.confidence_score:.1f}%)"
)
# Build corrected address string; copy the components first, since
# assigning them directly would alias the original object and clobber
# the original street logged in stats.corrections below
corrected_components = copy.copy(address_components)
corrected_components.street = match.corrected_address
corrected_address_string = format_address_string(corrected_components)
logger.info(f"[ACT] Retrying with corrected address: {corrected_address_string}")
# Rate limit before retry
rate_limit_sleep()
# Retry geocoding with corrected address
geocode_result = geocode_address(corrected_address_string, self.geocoder)
if geocode_result.success:
logger.info(
f"[OBSERVE] Corrected address geocoded successfully!"
)
self.stats.corrected += 1
self.stats.corrections.append(
f"ID {customer.id}: '{address_components.street}' "
f"-> '{match.corrected_address}'"
)
else:
logger.info(
f"[OBSERVE] Corrected address still failed to geocode"
)
else:
logger.debug(
f"[THINK] No confident fuzzy match found"
)
else:
logger.debug(
f"[THINK] No street reference data for {town}, {state_abbr}. "
f"Use POST /streets/{town}/{state_abbr} to populate."
)
# If still failed after correction attempt
if not geocode_result.success:
# -----------------------------------------------------------------
# STEP 5a: REFLECT - Handle failed geocoding
# -----------------------------------------------------------------
logger.info(
f"[REFLECT] Geocoding failed for record {customer.id}: "
f"{geocode_result.error_message}"
)
self.stats.failed += 1
self.stats.errors.append(
f"ID {customer.id}: {geocode_result.error_message}"
)
update_record(self.session, customer, geocode_result, is_valid=False)
return False
# Validate geocode result quality
is_valid, validation_reason = validate_geocode_result(geocode_result)
logger.debug(f"[OBSERVE] Validation: valid={is_valid}, reason={validation_reason}")
# -----------------------------------------------------------------
# STEP 5b: REFLECT - Update database with result
# -----------------------------------------------------------------
if is_valid:
logger.info(
f"[REFLECT] Success for record {customer.id}: "
f"lat={geocode_result.latitude}, lon={geocode_result.longitude}"
)
self.stats.updated += 1
else:
logger.info(
f"[REFLECT] Invalid result for record {customer.id}: {validation_reason}"
)
self.stats.failed += 1
self.stats.errors.append(f"ID {customer.id}: {validation_reason}")
update_record(self.session, customer, geocode_result, is_valid=is_valid)
return is_valid
def execute_batch(self, records: List[CustomerCustomer]) -> None:
"""
Execute the batch processing loop with rate limiting.
Processes records sequentially with proper rate limiting between
geocoding calls. Commits to database periodically.
Args:
records: List of CustomerCustomer records to process
"""
logger.info("=" * 60)
logger.info("EXECUTION PHASE: Processing records")
logger.info("=" * 60)
uncommitted_count = 0
for i, customer in enumerate(records):
try:
# Process the record
self.process_record(customer)
self.stats.processed += 1
uncommitted_count += 1
# Commit in batches
if uncommitted_count >= self.commit_size:
logger.info(f"Committing batch of {uncommitted_count} records")
self.session.commit()
uncommitted_count = 0
# Rate limiting (skip on last record)
if i < len(records) - 1:
rate_limit_sleep()
except Exception as e:
# Handle unexpected errors - continue processing
logger.error(
f"Unexpected error processing record {customer.id}: {e}",
exc_info=True
)
self.stats.failed += 1
self.stats.errors.append(f"ID {customer.id}: Unexpected error: {str(e)}")
self.stats.processed += 1
# Rollback the current transaction and continue
self.session.rollback()
uncommitted_count = 0
# Final commit for any remaining records
if uncommitted_count > 0:
logger.info(f"Final commit of {uncommitted_count} records")
self.session.commit()
# =========================================================================
# PHASE 3: REFLECTION
# =========================================================================
def reflect(self) -> dict:
"""
REFLECTION PHASE: Summarize batch results and statistics.
Logs comprehensive statistics about the batch run and returns
a summary dictionary suitable for API response.
Returns:
Dictionary with batch statistics
"""
self.stats.end_time = datetime.utcnow()
logger.info("=" * 60)
logger.info("REFLECTION PHASE: Batch Summary")
logger.info("=" * 60)
stats_dict = self.stats.to_dict()
logger.info(f"Total queried: {stats_dict['total_queried']}")
logger.info(f"Processed: {stats_dict['processed']}")
logger.info(f"Updated (valid): {stats_dict['updated']}")
logger.info(f"Corrected: {stats_dict['corrected']}")
logger.info(f"Failed: {stats_dict['failed']}")
logger.info(f"Skipped: {stats_dict['skipped']}")
logger.info(f"Duration: {stats_dict['duration_seconds']}s")
if stats_dict['errors_count'] > 0:
logger.warning(f"Errors encountered: {stats_dict['errors_count']}")
for error in stats_dict['sample_errors']:
logger.warning(f" - {error}")
if stats_dict['corrected'] > 0:
logger.info(f"Addresses corrected via fuzzy matching: {stats_dict['corrected']}")
for correction in stats_dict['sample_corrections']:
logger.info(f" - {correction}")
# Calculate success rate
if stats_dict['processed'] > 0:
success_rate = (stats_dict['updated'] / stats_dict['processed']) * 100
logger.info(f"Success rate: {success_rate:.1f}%")
stats_dict['success_rate'] = round(success_rate, 1)
else:
stats_dict['success_rate'] = 0.0
logger.info("=" * 60)
return stats_dict
# =========================================================================
# MAIN ENTRY POINT
# =========================================================================
def run(self) -> dict:
"""
Execute the full agent workflow.
Runs through all three phases:
1. Planning - Query records
2. Execution - Process each record
3. Reflection - Summarize results
Returns:
Dictionary with batch statistics and message
"""
logger.info("*" * 60)
logger.info("ADDRESS VERIFICATION AGENT STARTING")
logger.info("*" * 60)
self.stats.start_time = datetime.utcnow()
try:
# Phase 1: Planning
records = self.plan_batch()
if not records:
logger.info("No records to process - batch complete")
self.stats.end_time = datetime.utcnow()
return {
"status": "success",
"message": "No records needed verification",
**self.stats.to_dict(),
}
# Phase 2: Execution
self.execute_batch(records)
# Phase 3: Reflection
stats = self.reflect()
logger.info("*" * 60)
logger.info("ADDRESS VERIFICATION AGENT COMPLETE")
logger.info("*" * 60)
return {
"status": "success",
"message": f"Batch complete: {stats['updated']} addresses updated",
**stats,
}
except Exception as e:
logger.error(f"Agent failed with error: {e}", exc_info=True)
self.stats.end_time = datetime.utcnow()
return {
"status": "error",
"message": f"Agent failed: {str(e)}",
**self.stats.to_dict(),
}

184
app/config.py Normal file

@@ -0,0 +1,184 @@
"""
Configuration settings for eamco_address_checker.
This module provides configuration with environment-based switching:
- DEVELOPMENT: Uses 'eamco' database, localhost CORS origins
- PRODUCTION: Uses 'auburnoil' database, production domain CORS origins
Environment variables are loaded from .env.local or .env.prod depending
on the Docker compose file used.
"""
import os
from typing import List
from dotenv import load_dotenv
# Load environment variables from .env file if present
load_dotenv()
# =============================================================================
# ENVIRONMENT MODE
# =============================================================================
MODE = os.getenv("MODE", "LOCAL")
CURRENT_SETTINGS = os.getenv("CURRENT_SETTINGS", "DEVELOPMENT")
if CURRENT_SETTINGS == "PRODUCTION":
print("USING PRODUCTION APPLICATIONCONFIG!!!!!")
else:
print("USING DEVELOPMENT APPLICATIONCONFIG!!!!!")
# =============================================================================
# DATABASE CONFIGURATION
# =============================================================================
# Database connection components (can be overridden individually)
POSTGRES_USERNAME = os.getenv("POSTGRES_USERNAME", "postgres")
POSTGRES_PW = os.getenv("POSTGRES_PW", "password")
POSTGRES_SERVER = os.getenv("POSTGRES_SERVER", "192.168.1.204")
POSTGRES_PORT = os.getenv("POSTGRES_PORT", "5432")
# Database name differs by environment
if CURRENT_SETTINGS == "PRODUCTION":
POSTGRES_DBNAME = os.getenv("POSTGRES_DBNAME", "auburnoil")
else:
POSTGRES_DBNAME = os.getenv("POSTGRES_DBNAME", "eamco")
# Build connection URI from components (fallback)
_DEFAULT_DATABASE_URI = "postgresql+psycopg2://{}:{}@{}:{}/{}".format(
POSTGRES_USERNAME,
POSTGRES_PW,
POSTGRES_SERVER,
POSTGRES_PORT,
POSTGRES_DBNAME
)
# Allow full DATABASE_URL override
DATABASE_URL: str = os.getenv("DATABASE_URL", _DEFAULT_DATABASE_URI)
# SQLAlchemy binds (for compatibility)
SQLALCHEMY_DATABASE_URI = DATABASE_URL
SQLALCHEMY_BINDS = {POSTGRES_DBNAME: SQLALCHEMY_DATABASE_URI}
# =============================================================================
# CORS CONFIGURATION
# =============================================================================
# Parse CORS origins from environment (comma-separated) or use defaults
_cors_env = os.getenv("CORS_ORIGINS", "")
if _cors_env:
CORS_ORIGINS: List[str] = [origin.strip() for origin in _cors_env.split(",")]
elif CURRENT_SETTINGS == "PRODUCTION":
# Production CORS origins
CORS_ORIGINS = [
"https://oil.edwineames.com",
"https://edwineames.com",
]
else:
# Development CORS origins
CORS_ORIGINS = [
"http://localhost:9000",
"https://localhost:9513",
"http://localhost:9514",
"http://localhost:9512",
"http://localhost:9511",
"http://localhost:5173", # Frontend port
"http://localhost:9616", # Authorize service port
]
# =============================================================================
# BATCH PROCESSING CONFIGURATION
# =============================================================================
# Maximum records to process in a single batch run
BATCH_SIZE: int = int(os.getenv("BATCH_SIZE", "150"))
# Records to process before committing to database
COMMIT_BATCH_SIZE: int = int(os.getenv("COMMIT_BATCH_SIZE", "20"))
# =============================================================================
# GEOCODING CONFIGURATION (Nominatim)
# =============================================================================
# User agent for Nominatim API (required - identifies your application)
NOMINATIM_USER_AGENT: str = "Unraid-EamcoAddressChecker/1.0 (eeames214@gmail.com)"
# Rate limiting: Sleep range between requests (Nominatim requires 1 req/sec max)
MIN_SLEEP_SECONDS: float = float(os.getenv("MIN_SLEEP", "1.2"))
MAX_SLEEP_SECONDS: float = float(os.getenv("MAX_SLEEP", "1.8"))
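# Example: with the defaults above, the mean delay is (1.2 + 1.8) / 2 = 1.5 s
# per request, so a full 150-record batch takes roughly 150 * 1.5 = 225 s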
# Geocoding timeout in seconds
GEOCODE_TIMEOUT: int = int(os.getenv("GEOCODE_TIMEOUT", "10"))
# =============================================================================
# STATE MAPPING
# =============================================================================
# Integer -> US State Abbreviation mapping
# Replace with proper states table lookup when available
STATE_MAPPING: dict[int, str] = {
1: "AL", # Alabama
2: "AK", # Alaska
3: "AS", # American Samoa
4: "AZ", # Arizona
5: "AR", # Arkansas
6: "CA", # California
7: "CO", # Colorado
8: "CT", # Connecticut
9: "DE", # Delaware
10: "DC", # District of Columbia
11: "FL", # Florida
12: "GA", # Georgia
13: "GU", # Guam
14: "HI", # Hawaii
15: "ID", # Idaho
16: "IL", # Illinois
17: "IN", # Indiana
18: "IA", # Iowa
19: "KS", # Kansas
20: "KY", # Kentucky
21: "LA", # Louisiana
22: "ME", # Maine
23: "MD", # Maryland
24: "MA", # Massachusetts
25: "MI", # Michigan
26: "MN", # Minnesota
27: "MS", # Mississippi
28: "MO", # Missouri
29: "MT", # Montana
30: "NE", # Nebraska
31: "NV", # Nevada
32: "NH", # New Hampshire
33: "NJ", # New Jersey
34: "NM", # New Mexico
35: "NY", # New York
36: "NC", # North Carolina
37: "ND", # North Dakota
38: "OH", # Ohio
39: "OK", # Oklahoma
40: "OR", # Oregon
41: "PA", # Pennsylvania
42: "PR", # Puerto Rico
43: "RI", # Rhode Island
44: "SC", # South Carolina
45: "SD", # South Dakota
46: "TN", # Tennessee
47: "TX", # Texas
48: "UT", # Utah
49: "VT", # Vermont
50: "VA", # Virginia
51: "VI", # Virgin Islands
52: "WA", # Washington
53: "WV", # West Virginia
54: "WI", # Wisconsin
55: "WY", # Wyoming
}
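# Example lookup: STATE_MAPPING.get(24) -> "MA"; unmapped integers return
# None via .get()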
# =============================================================================
# LOGGING CONFIGURATION
# =============================================================================
LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

558
app/main.py Normal file

@@ -0,0 +1,558 @@
"""
eamco_address_checker - FastAPI Address Verification Microservice.
This microservice provides a batch job endpoint for verifying customer addresses
using geocoding. Designed to be triggered via cron from Unraid.
Endpoints:
GET /health - Health check with database connectivity status
POST /verify-addresses - Trigger batch address verification
POST /reset-verifications - Clear all verification data for re-checking
POST /streets/{town}/{state} - Fetch and store streets from OSM for a town
GET /streets/{town}/{state} - Get street count for a town
Usage:
# Development
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
# Production (Docker)
docker run -p 8000:8000 eamco_address_checker
# Trigger from cron
curl -X POST http://localhost:8000/verify-addresses
"""
import logging
import sys
from contextlib import contextmanager
from typing import Generator
from fastapi import FastAPI, Depends, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker, Session
from sqlalchemy.exc import SQLAlchemyError
from app.config import (
DATABASE_URL,
CORS_ORIGINS,
LOG_LEVEL,
LOG_FORMAT,
BATCH_SIZE,
COMMIT_BATCH_SIZE,
)
from app.agent import AddressVerificationAgent
from app.models import CustomerCustomer, StreetReference, Base
from app.streets import (
populate_streets_for_town,
get_town_street_count,
)
# =============================================================================
# LOGGING CONFIGURATION
# =============================================================================
logging.basicConfig(
level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
format=LOG_FORMAT,
handlers=[
logging.StreamHandler(sys.stdout),
]
)
logger = logging.getLogger(__name__)
# =============================================================================
# DATABASE SETUP
# =============================================================================
# Create SQLAlchemy engine with connection pooling
engine = create_engine(
DATABASE_URL,
pool_pre_ping=True, # Verify connections before use
pool_size=5,
max_overflow=10,
echo=False, # Set to True for SQL debugging
)
# Session factory
SessionLocal = sessionmaker(
autocommit=False,
autoflush=False,
bind=engine,
)
def get_db() -> Generator[Session, None, None]:
"""
Dependency that provides a database session.
Yields a SQLAlchemy session and ensures proper cleanup.
"""
db = SessionLocal()
try:
yield db
finally:
db.close()
@contextmanager
def get_db_session() -> Generator[Session, None, None]:
"""
Context manager for database sessions (non-dependency use).
"""
db = SessionLocal()
try:
yield db
finally:
db.close()
def check_db_connection() -> bool:
"""
Test database connectivity.
Returns:
True if database is reachable, False otherwise
"""
try:
with get_db_session() as db:
db.execute(text("SELECT 1"))
return True
except SQLAlchemyError as e:
logger.error(f"Database connection failed: {e}")
return False
# =============================================================================
# FASTAPI APPLICATION
# =============================================================================
app = FastAPI(
title="eamco_address_checker",
description="Address verification microservice using Nominatim geocoding",
version="1.0.0",
docs_url="/docs",
redoc_url="/redoc",
)
# =============================================================================
# CORS MIDDLEWARE
# =============================================================================
app.add_middleware(
CORSMiddleware,
allow_origins=CORS_ORIGINS,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# =============================================================================
# PYDANTIC MODELS (Response Schemas)
# =============================================================================
class HealthResponse(BaseModel):
"""Health check response schema."""
status: str
db_connected: bool
class VerificationResponse(BaseModel):
"""Address verification batch response schema."""
status: str
message: str
total_queried: int
processed: int
updated: int
corrected: int
failed: int
skipped: int
rate_limited: int
duration_seconds: float
success_rate: float
errors_count: int
sample_errors: list
sample_corrections: list
class ResetResponse(BaseModel):
"""Reset verifications response schema."""
status: str
message: str
records_reset: int
class StreetPopulateResponse(BaseModel):
"""Response for street population endpoint."""
status: str
message: str
town: str
state: str
streets_added: int
streets_updated: int
total_found: int
errors: list
class StreetInfoResponse(BaseModel):
"""Response for street info endpoint."""
town: str
state: str
street_count: int
message: str
# =============================================================================
# ENDPOINTS
# =============================================================================
@app.get("/", include_in_schema=False)
async def root():
"""Root endpoint - redirect to docs."""
return {
"service": "eamco_address_checker",
"version": "1.0.0",
"docs": "/docs",
}
@app.get("/health", response_model=HealthResponse, tags=["Health"])
async def health_check():
"""
Health check endpoint.
Returns service status and database connectivity.
Use this endpoint for container health checks and monitoring.
Returns:
HealthResponse with status and db_connected flag
"""
db_connected = check_db_connection()
return HealthResponse(
status="healthy" if db_connected else "degraded",
db_connected=db_connected,
)
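# Example check (illustrative): curl http://localhost:8000/health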
@app.post(
"/verify-addresses",
response_model=VerificationResponse,
tags=["Verification"],
)
async def verify_addresses(db: Session = Depends(get_db)):
"""
Trigger batch address verification.
This endpoint runs a synchronous batch job that:
1. Queries records needing verification (max BATCH_SIZE)
2. Geocodes each address using Nominatim
3. Updates records with lat/long and verification status
4. Returns statistics about the batch run
The batch respects Nominatim rate limits (1 req/sec) so execution
time is approximately BATCH_SIZE * 1.5 seconds.
Use this endpoint from Unraid cron:
curl -X POST http://localhost:8000/verify-addresses
Returns:
VerificationResponse with batch statistics
"""
logger.info("=" * 60)
logger.info("VERIFY-ADDRESSES ENDPOINT CALLED")
logger.info("=" * 60)
logger.info(f"Configuration: BATCH_SIZE={BATCH_SIZE}, COMMIT_SIZE={COMMIT_BATCH_SIZE}")
try:
# Initialize and run the agent
agent = AddressVerificationAgent(
session=db,
batch_size=BATCH_SIZE,
commit_size=COMMIT_BATCH_SIZE,
)
result = agent.run()
logger.info(f"Batch complete: {result.get('message', 'No message')}")
return VerificationResponse(
status=result.get("status", "unknown"),
message=result.get("message", ""),
total_queried=result.get("total_queried", 0),
processed=result.get("processed", 0),
updated=result.get("updated", 0),
corrected=result.get("corrected", 0),
failed=result.get("failed", 0),
skipped=result.get("skipped", 0),
rate_limited=result.get("rate_limited", 0),
duration_seconds=result.get("duration_seconds", 0.0),
success_rate=result.get("success_rate", 0.0),
errors_count=result.get("errors_count", 0),
sample_errors=result.get("sample_errors", []),
sample_corrections=result.get("sample_corrections", []),
)
except SQLAlchemyError as e:
logger.error(f"Database error during verification: {e}", exc_info=True)
raise HTTPException(
status_code=500,
detail=f"Database error: {str(e)}"
)
except Exception as e:
logger.error(f"Unexpected error during verification: {e}", exc_info=True)
raise HTTPException(
status_code=500,
detail=f"Verification failed: {str(e)}"
)
@app.post(
"/reset-verifications",
response_model=ResetResponse,
tags=["Verification"],
)
async def reset_verifications(db: Session = Depends(get_db)):
"""
Reset all address verifications for re-checking.
This endpoint clears verification data for ALL customer records:
- Sets correct_address = FALSE
- Sets verified_at = NULL
- Clears customer_latitude and customer_longitude
After calling this endpoint, all addresses will be eligible for
re-verification on the next /verify-addresses call.
WARNING: This is a mass update operation. Use with caution.
Returns:
ResetResponse with count of records reset
"""
logger.info("=" * 60)
logger.info("RESET-VERIFICATIONS ENDPOINT CALLED")
logger.info("=" * 60)
try:
# Count records before update
total_records = db.query(CustomerCustomer).count()
logger.info(f"Total customer records: {total_records}")
# Mass update to reset all verification data
updated_count = db.query(CustomerCustomer).update(
{
CustomerCustomer.correct_address: False,
CustomerCustomer.verified_at: None,
CustomerCustomer.customer_latitude: None,
CustomerCustomer.customer_longitude: None,
},
synchronize_session=False
)
db.commit()
logger.info(f"Reset {updated_count} records successfully")
return ResetResponse(
status="success",
message=f"Reset {updated_count} address verifications. All addresses are now eligible for re-verification.",
records_reset=updated_count,
)
except SQLAlchemyError as e:
db.rollback()
logger.error(f"Database error during reset: {e}", exc_info=True)
raise HTTPException(
status_code=500,
detail=f"Database error: {str(e)}"
)
except Exception as e:
db.rollback()
logger.error(f"Unexpected error during reset: {e}", exc_info=True)
raise HTTPException(
status_code=500,
detail=f"Reset failed: {str(e)}"
)
# =============================================================================
# STREET REFERENCE ENDPOINTS
# =============================================================================
@app.post(
"/streets/{town}/{state}",
response_model=StreetPopulateResponse,
tags=["Streets"],
)
async def populate_streets(
town: str,
state: str,
clear_existing: bool = False,
db: Session = Depends(get_db)
):
"""
Fetch and store all streets for a town from OpenStreetMap.
This endpoint queries the OSM Overpass API to get all named streets
in the specified town and stores them in the street_reference table
for fuzzy matching during address verification.
Args:
town: Town/city name (e.g., "Boston")
state: 2-letter state abbreviation (e.g., "MA")
clear_existing: If true, delete existing streets for this town first
Example:
curl -X POST http://localhost:8000/streets/Boston/MA
Returns:
StreetPopulateResponse with count of streets added
"""
logger.info("=" * 60)
logger.info(f"POPULATE STREETS: {town}, {state}")
logger.info("=" * 60)
# Validate state abbreviation (2 letters)
if len(state) != 2 or not state.isalpha():
raise HTTPException(
status_code=400,
detail="State must be a 2-letter abbreviation (e.g., MA, NY, CA)"
)
try:
# Ensure the street_reference table exists
Base.metadata.create_all(bind=engine, tables=[StreetReference.__table__])
result = populate_streets_for_town(
session=db,
town=town,
state=state.upper(),
clear_existing=clear_existing,
)
return StreetPopulateResponse(
status="success" if result.success else "partial",
message=result.message,
town=town,
state=state.upper(),
streets_added=result.streets_added,
streets_updated=result.streets_updated,
total_found=result.total_found,
errors=result.errors,
)
except SQLAlchemyError as e:
db.rollback()
logger.error(f"Database error populating streets: {e}", exc_info=True)
raise HTTPException(
status_code=500,
detail=f"Database error: {str(e)}"
)
except Exception as e:
logger.error(f"Error populating streets: {e}", exc_info=True)
raise HTTPException(
status_code=500,
detail=f"Failed to populate streets: {str(e)}"
)
@app.get(
"/streets/{town}/{state}",
response_model=StreetInfoResponse,
tags=["Streets"],
)
async def get_street_info(
town: str,
state: str,
db: Session = Depends(get_db)
):
"""
Get information about streets stored for a town.
Returns the count of streets in the reference table for the
specified town/state combination.
Args:
town: Town/city name
state: 2-letter state abbreviation
Example:
curl http://localhost:8000/streets/Boston/MA
Returns:
StreetInfoResponse with street count
"""
# Validate state abbreviation
if len(state) != 2 or not state.isalpha():
raise HTTPException(
status_code=400,
detail="State must be a 2-letter abbreviation"
)
count = get_town_street_count(db, town, state.upper())
if count == 0:
message = f"No streets found for {town}, {state}. Use POST to populate."
else:
message = f"Found {count} streets for {town}, {state}"
return StreetInfoResponse(
town=town,
state=state.upper(),
street_count=count,
message=message,
)
# =============================================================================
# STARTUP/SHUTDOWN EVENTS
# =============================================================================
@app.on_event("startup")
async def startup_event():
"""Application startup - log configuration and test DB connection."""
logger.info("*" * 60)
logger.info("eamco_address_checker STARTING")
logger.info("*" * 60)
logger.info(f"Database URL: {DATABASE_URL[:50]}...")
logger.info(f"CORS Origins: {CORS_ORIGINS}")
logger.info(f"Batch Size: {BATCH_SIZE}")
logger.info(f"Commit Batch Size: {COMMIT_BATCH_SIZE}")
# Test database connection
if check_db_connection():
logger.info("Database connection: OK")
else:
logger.warning("Database connection: FAILED - service may be degraded")
@app.on_event("shutdown")
async def shutdown_event():
"""Application shutdown - cleanup."""
logger.info("eamco_address_checker SHUTTING DOWN")
engine.dispose()
logger.info("Database connections closed")
# =============================================================================
# MAIN ENTRY POINT (for direct execution)
# =============================================================================
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"app.main:app",
host="0.0.0.0",
port=8000,
reload=True,
log_level="info",
)

127
app/models.py Normal file

@@ -0,0 +1,127 @@
"""
SQLAlchemy 2.x ORM Models for eamco_address_checker.
This module defines the database models using SQLAlchemy's DeclarativeBase.
Models:
CustomerCustomer: Customer records with address fields for geocoding
StreetReference: Known streets by town/state for fuzzy matching corrections
"""
from sqlalchemy import Column, Integer, String, VARCHAR, TIMESTAMP, BOOLEAN, Index
from sqlalchemy.orm import DeclarativeBase
class Base(DeclarativeBase):
"""Base class for all SQLAlchemy models."""
pass
class CustomerCustomer(Base):
"""
Customer model representing address and contact information.
The verified_at timestamp tracks when the address was last geocoded.
The correct_address boolean indicates if geocoding was successful.
Attributes:
id: Primary key
auth_net_profile_id: Authorize.net customer profile ID
account_number: Customer account number (max 25 chars)
customer_last_name: Customer's last name (max 250 chars)
customer_first_name: Customer's first name (max 250 chars)
customer_town: City/town name (max 140 chars)
customer_state: Integer mapping to US state abbreviation
customer_zip: ZIP code (max 25 chars)
customer_first_call: Timestamp of first customer contact
customer_email: Customer email address (max 500 chars)
customer_automatic: Automatic billing flag
customer_phone_number: Phone number (max 25 chars)
customer_home_type: Type of residence
customer_apt: Apartment/unit number (max 140 chars)
customer_address: Street address (max 1000 chars)
company_id: Associated company ID
customer_latitude: Geocoded latitude as string (max 250 chars)
customer_longitude: Geocoded longitude as string (max 250 chars)
correct_address: Flag indicating successful geocoding
verified_at: Timestamp of last verification attempt
"""
__tablename__ = "customer_customer"
__table_args__ = {"schema": "public"}
id = Column(Integer, primary_key=True, autoincrement=True)
auth_net_profile_id = Column(String, unique=True, index=True, nullable=True)
account_number = Column(VARCHAR(25))
customer_last_name = Column(VARCHAR(250))
customer_first_name = Column(VARCHAR(250))
customer_town = Column(VARCHAR(140))
customer_state = Column(Integer) # Integer -> 2-letter US state abbreviation
customer_zip = Column(VARCHAR(25))
customer_first_call = Column(TIMESTAMP)
customer_email = Column(VARCHAR(500))
customer_automatic = Column(Integer)
customer_phone_number = Column(VARCHAR(25))
customer_home_type = Column(Integer)
customer_apt = Column(VARCHAR(140))
customer_address = Column(VARCHAR(1000))
company_id = Column(Integer)
customer_latitude = Column(VARCHAR(250))
customer_longitude = Column(VARCHAR(250))
correct_address = Column(BOOLEAN, default=False, nullable=False)
verified_at = Column(TIMESTAMP, nullable=True)  # Timestamp of the last verification attempt
def __repr__(self) -> str:
return (
f"<CustomerCustomer(id={self.id}, "
f"name='{self.customer_first_name} {self.customer_last_name}', "
f"address='{self.customer_address}', "
f"verified={self.correct_address})>"
)
class StreetReference(Base):
"""
Reference table of known streets for fuzzy matching address corrections.
Streets are populated per town/state from OpenStreetMap data.
Used to correct misspellings and wrong street suffixes (rd vs dr, etc.)
when geocoding fails.
Attributes:
id: Primary key
street_name: Full street name (e.g., "Main Street")
street_name_normalized: Lowercase, cleaned for matching
street_number_low: Lowest known street number (if available)
street_number_high: Highest known street number (if available)
town: Town/city name
town_normalized: Lowercase town name for matching
state: 2-letter state abbreviation (e.g., "MA")
zip_codes: Comma-separated ZIP codes this street spans
osm_id: OpenStreetMap way ID for reference
created_at: When this record was added
"""
__tablename__ = "street_reference"
__table_args__ = (
Index("ix_street_ref_town_state", "town_normalized", "state"),
Index("ix_street_ref_name_town", "street_name_normalized", "town_normalized"),
{"schema": "public"},
)
id = Column(Integer, primary_key=True, autoincrement=True)
street_name = Column(VARCHAR(500), nullable=False)
street_name_normalized = Column(VARCHAR(500), nullable=False, index=True)
street_number_low = Column(Integer, nullable=True)
street_number_high = Column(Integer, nullable=True)
town = Column(VARCHAR(140), nullable=False)
town_normalized = Column(VARCHAR(140), nullable=False)
state = Column(VARCHAR(2), nullable=False)
zip_codes = Column(VARCHAR(100), nullable=True)
osm_id = Column(String, nullable=True, index=True)
created_at = Column(TIMESTAMP, nullable=False)
def __repr__(self) -> str:
return (
f"<StreetReference(id={self.id}, "
f"street='{self.street_name}', "
f"town='{self.town}', state='{self.state}')>"
)
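# Note (illustrative): tables for these models can be created with
#   Base.metadata.create_all(bind=engine)
# app/main.py does this for StreetReference before populating streets.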

572
app/streets.py Normal file

@@ -0,0 +1,572 @@
"""
Street reference tools for address correction.
This module provides functionality to:
1. Fetch streets from OpenStreetMap Overpass API for a given town/state
2. Store streets in the StreetReference table
3. Perform fuzzy matching to correct misspelled addresses
The fuzzy matching handles common issues like:
- Misspelled street names ("Mian St" -> "Main St")
- Wrong suffixes ("Main Rd" -> "Main St")
- Missing/extra spaces
- Abbreviated vs full names ("St" vs "Street")
"""
import logging
import re
import time
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional, Tuple
import requests
from rapidfuzz import fuzz, process
from sqlalchemy.orm import Session
from app.config import STATE_MAPPING
from app.models import StreetReference
logger = logging.getLogger(__name__)
# Overpass API endpoints (multiple for fallback)
OVERPASS_API_URLS = [
"https://overpass-api.de/api/interpreter",
"https://overpass.kumi.systems/api/interpreter",
"https://maps.mail.ru/osm/tools/overpass/api/interpreter",
]
# Common street suffix variations for normalization
STREET_SUFFIXES = {
# Standard -> variations
"street": ["st", "str", "strt"],
"avenue": ["ave", "av", "aven"],
"road": ["rd", "rod"],
"drive": ["dr", "drv", "driv"],
"lane": ["ln", "lne"],
"court": ["ct", "crt", "cour"],
"circle": ["cir", "circ", "crcl"],
"boulevard": ["blvd", "boul", "blv"],
"place": ["pl", "plc"],
"terrace": ["ter", "terr", "trc"],
"way": ["wy"],
"highway": ["hwy", "hiway", "hgwy"],
"parkway": ["pkwy", "pky", "pkway"],
"square": ["sq", "sqr"],
"trail": ["trl", "tr"],
"crossing": ["xing", "crssng"],
"heights": ["hts", "hgts"],
"point": ["pt", "pnt"],
"ridge": ["rdg", "rdge"],
"valley": ["vly", "vlly"],
"view": ["vw", "viw"],
"center": ["ctr", "cntr", "centre"],
"north": ["n"],
"south": ["s"],
"east": ["e"],
"west": ["w"],
"northeast": ["ne"],
"northwest": ["nw"],
"southeast": ["se"],
"southwest": ["sw"],
}
# Build reverse lookup: abbreviation -> full form
SUFFIX_TO_FULL = {}
for full, abbrevs in STREET_SUFFIXES.items():
for abbr in abbrevs:
SUFFIX_TO_FULL[abbr] = full
SUFFIX_TO_FULL[full] = full # Also map full to itself
@dataclass
class StreetMatch:
"""Result of fuzzy street matching."""
original_street: str
matched_street: str
confidence_score: float
town: str
state: str
street_ref_id: int
corrected_address: Optional[str] = None
@dataclass
class FetchResult:
"""Result of fetching streets from OSM."""
success: bool
streets_added: int
streets_updated: int
total_found: int
message: str
errors: List[str]
def normalize_street_name(street: str) -> str:
"""
Normalize a street name for fuzzy matching.
- Lowercase
- Remove extra whitespace
- Expand common abbreviations to full form
- Remove punctuation
Args:
street: Raw street name
Returns:
Normalized street name
"""
if not street:
return ""
# Lowercase and strip
normalized = street.lower().strip()
# Remove punctuation except hyphens
normalized = re.sub(r"[.,']", "", normalized)
# Normalize whitespace
normalized = re.sub(r"\s+", " ", normalized)
# Split into words and expand abbreviations
words = normalized.split()
expanded_words = []
for word in words:
if word in SUFFIX_TO_FULL:
expanded_words.append(SUFFIX_TO_FULL[word])
else:
expanded_words.append(word)
return " ".join(expanded_words)
def extract_street_number(address: str) -> Tuple[Optional[str], str]:
"""
Extract street number from an address string.
Args:
address: Full address like "123 Main Street"
Returns:
Tuple of (street_number, remaining_address)
"""
if not address:
return None, ""
# Match leading number (possibly with letter suffix like "123A")
match = re.match(r"^(\d+[A-Za-z]?)\s+(.+)$", address.strip())
if match:
return match.group(1), match.group(2)
return None, address.strip()
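# Examples (illustrative):
#   extract_street_number("123A Main Street") -> ("123A", "Main Street")
#   extract_street_number("Main Street")      -> (None, "Main Street")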
def get_state_name(state_abbr: str) -> str:
"""
Get full state name from abbreviation for Overpass query.
Args:
state_abbr: 2-letter state abbreviation
Returns:
Full state name
"""
state_names = {
"AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas",
"CA": "California", "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware",
"DC": "District of Columbia", "FL": "Florida", "GA": "Georgia", "HI": "Hawaii",
"ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa",
"KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine",
"MD": "Maryland", "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota",
"MS": "Mississippi", "MO": "Missouri", "MT": "Montana", "NE": "Nebraska",
"NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico",
"NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio",
"OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island",
"SC": "South Carolina", "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas",
"UT": "Utah", "VT": "Vermont", "VA": "Virginia", "WA": "Washington",
"WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming",
"PR": "Puerto Rico", "VI": "Virgin Islands", "GU": "Guam", "AS": "American Samoa",
}
return state_names.get(state_abbr.upper(), state_abbr)
def fetch_streets_from_osm(town: str, state: str) -> Tuple[List[dict], str]:
"""
Fetch all streets in a town from OpenStreetMap using Overpass API.
Args:
town: Town/city name
state: 2-letter state abbreviation
Returns:
Tuple of (list of street dicts, error message or empty string)
"""
state_name = get_state_name(state)
state_upper = state.upper()
# geocodeArea-based query. Note: {{geocodeArea:...}} is an Overpass Turbo
# shorthand that the raw Overpass API does not expand, so this variant
# generally only works behind Turbo-compatible frontends; it is tried
# second, after simple_query below
query = f"""
[out:json][timeout:120];
// Use geocodeArea for reliable city lookup with state context
{{geocodeArea:{town}, {state_name}, United States}}->.city;
// Get all named streets in the city
way["highway"]["name"](area.city);
out tags;
"""
# Alternative query if geocodeArea fails (more explicit)
fallback_query = f"""
[out:json][timeout:120];
// Find state by ISO code
area["ISO3166-2"="US-{state_upper}"]->.state;
// Find city/town within state
(
relation["name"="{town}"]["type"="boundary"](area.state);
way["name"="{town}"]["place"](area.state);
node["name"="{town}"]["place"](area.state);
);
map_to_area->.city;
// Get streets
way["highway"]["name"](area.city);
out tags;
"""
# Plain area-filter query, tried first: resolve the state area by name
# and admin_level=4, then the town area inside it
simple_query = f"""
[out:json][timeout:60];
area["name"="{state_name}"]["boundary"="administrative"]["admin_level"="4"]->.state;
area["name"="{town}"](area.state)->.city;
way["highway"]["name"](area.city);
out tags;
"""
queries = [simple_query, query, fallback_query]
query_names = ["simple", "geocodeArea", "fallback"]
logger.info(f"Fetching streets from OSM for {town}, {state_name}")
last_error = ""
for api_url in OVERPASS_API_URLS:
for q, q_name in zip(queries, query_names):
try:
logger.info(f"Trying {q_name} query on {api_url.split('/')[2]}...")
logger.debug(f"Query: {q}")
response = requests.post(
api_url,
data={"data": q},
timeout=120,
headers={"User-Agent": "EamcoAddressChecker/1.0"}
)
if response.status_code == 429:
logger.warning("Rate limited, waiting 30s...")
time.sleep(30)
continue
if response.status_code == 504:
logger.warning(f"Timeout on {q_name} query, trying next...")
continue
response.raise_for_status()
data = response.json()
elements = data.get("elements", [])
if elements:
logger.info(f"Success with {q_name} query: {len(elements)} street segments")
# Process and return results
streets = []
seen_names = set()
for element in elements:
tags = element.get("tags", {})
name = tags.get("name")
if name and name.lower() not in seen_names:
seen_names.add(name.lower())
streets.append({
"name": name,
"osm_id": str(element.get("id", "")),
"highway_type": tags.get("highway", ""),
})
logger.info(f"Extracted {len(streets)} unique street names")
return streets, ""
else:
logger.debug(f"No results from {q_name} query")
except requests.exceptions.Timeout:
last_error = f"Timeout on {api_url}"
logger.warning(last_error)
continue
except requests.exceptions.RequestException as e:
last_error = f"Request error: {str(e)}"
logger.warning(last_error)
continue
except Exception as e:
last_error = f"Error: {str(e)}"
logger.warning(last_error)
continue
# All attempts failed
error = f"All Overpass queries failed for {town}, {state}. Last error: {last_error}"
logger.error(error)
return [], error
def populate_streets_for_town(
session: Session,
town: str,
state: str,
clear_existing: bool = False
) -> FetchResult:
"""
Fetch streets from OSM and populate the StreetReference table.
Args:
session: SQLAlchemy session
town: Town/city name
state: 2-letter state abbreviation
clear_existing: If True, delete existing streets for this town first
Returns:
FetchResult with statistics
"""
state = state.upper()
town_normalized = town.lower().strip()
errors = []
logger.info(f"Populating streets for {town}, {state}")
# Optionally clear existing streets for this town
if clear_existing:
deleted = session.query(StreetReference).filter(
StreetReference.town_normalized == town_normalized,
StreetReference.state == state
).delete(synchronize_session=False)
session.commit()
logger.info(f"Cleared {deleted} existing street records")
# Fetch from OSM
streets, error = fetch_streets_from_osm(town, state)
if error:
errors.append(error)
if not streets:
return FetchResult(
success=len(errors) == 0,
streets_added=0,
streets_updated=0,
total_found=0,
message=f"No streets found for {town}, {state}",
errors=errors,
)
# Check for existing streets to avoid duplicates
existing_streets = session.query(StreetReference).filter(
StreetReference.town_normalized == town_normalized,
StreetReference.state == state
).all()
existing_names = {s.street_name_normalized for s in existing_streets}
added = 0
now = datetime.utcnow()
for street_data in streets:
name = street_data["name"]
name_normalized = normalize_street_name(name)
if name_normalized in existing_names:
continue
street_ref = StreetReference(
street_name=name,
street_name_normalized=name_normalized,
town=town,
town_normalized=town_normalized,
state=state,
osm_id=street_data.get("osm_id"),
created_at=now,
)
session.add(street_ref)
existing_names.add(name_normalized)
added += 1
session.commit()
logger.info(f"Added {added} new streets for {town}, {state}")
return FetchResult(
success=True,
streets_added=added,
streets_updated=0,
total_found=len(streets),
message=f"Successfully added {added} streets for {town}, {state}",
errors=errors,
)
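# A sketch of a typical populate run. SessionLocal is an assumption about
# where the session factory lives (e.g. app.db); adjust to the real module.
def _example_populate() -> None:
    from app.db import SessionLocal  # assumed session factory location
    session = SessionLocal()
    try:
        result = populate_streets_for_town(session, "Springfield", "MA")
        print(result.message)
    finally:
        session.close()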
def find_matching_street(
session: Session,
street_input: str,
town: str,
state: str,
min_confidence: float = 70.0
) -> Optional[StreetMatch]:
"""
Find the best matching street for a potentially misspelled input.
Uses fuzzy string matching with rapidfuzz to find the closest
match in the StreetReference table.
Args:
session: SQLAlchemy session
street_input: The street name to match (may be misspelled)
town: Town/city to search within
state: State abbreviation
min_confidence: Minimum match confidence (0-100)
Returns:
StreetMatch if found above threshold, None otherwise
"""
state = state.upper()
town_normalized = town.lower().strip()
# Normalize the input for matching
input_normalized = normalize_street_name(street_input)
# Get all streets for this town
streets = session.query(StreetReference).filter(
StreetReference.town_normalized == town_normalized,
StreetReference.state == state
).all()
if not streets:
logger.debug(f"No reference streets found for {town}, {state}")
return None
# Build list of (normalized_name, street_object) for matching
choices = [(s.street_name_normalized, s) for s in streets]
# Use rapidfuzz to find the best match, taking the max over several
# scorers; token_set_ratio in particular handles word-order differences well
best_match = None
best_score = 0
for normalized_name, street_obj in choices:
# Try multiple scoring methods and take the best
scores = [
fuzz.ratio(input_normalized, normalized_name),
fuzz.partial_ratio(input_normalized, normalized_name),
fuzz.token_sort_ratio(input_normalized, normalized_name),
fuzz.token_set_ratio(input_normalized, normalized_name),
]
score = max(scores)
if score > best_score:
best_score = score
best_match = street_obj
if best_match and best_score >= min_confidence:
logger.info(
f"Fuzzy match: '{street_input}' -> '{best_match.street_name}' "
f"(confidence: {best_score:.1f}%)"
)
return StreetMatch(
original_street=street_input,
matched_street=best_match.street_name,
confidence_score=best_score,
town=best_match.town,
state=best_match.state,
street_ref_id=best_match.id,
)
logger.debug(
f"No confident match for '{street_input}' "
f"(best: {best_score:.1f}%, threshold: {min_confidence}%)"
)
return None
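# Why the loop above takes the max over several scorers: each one forgives
# a different kind of error. A self-contained sketch with rapidfuzz:
def _example_scorers() -> None:
    from rapidfuzz import fuzz as _fuzz
    print(_fuzz.ratio("mian st", "main st"))             # high: simple transposition
    print(_fuzz.ratio("st main", "main st"))             # lower: character order differs
    print(_fuzz.token_sort_ratio("st main", "main st"))  # 100.0: tokens sorted first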
def correct_address(
session: Session,
full_address: str,
town: str,
state: str,
min_confidence: float = 75.0
) -> Optional[StreetMatch]:
"""
Attempt to correct a full address using fuzzy street matching.
Extracts the street portion, finds a match, and returns
a corrected address with the matched street name.
Args:
session: SQLAlchemy session
full_address: Full street address (e.g., "123 Mian St")
town: Town/city name
state: State abbreviation
min_confidence: Minimum match confidence
Returns:
StreetMatch with corrected_address if match found, None otherwise
"""
# Extract street number and street name
street_number, street_name = extract_street_number(full_address)
if not street_name:
return None
# Find matching street
match = find_matching_street(
session=session,
street_input=street_name,
town=town,
state=state,
min_confidence=min_confidence,
)
if match:
# Build corrected address
if street_number:
match.corrected_address = f"{street_number} {match.matched_street}"
else:
match.corrected_address = match.matched_street
logger.info(
f"Address correction: '{full_address}' -> '{match.corrected_address}'"
)
return match
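# End-to-end correction sketch. Session setup mirrors the populate example
# above; the misspelled address, town, and state are placeholders.
def _example_correct() -> None:
    from app.db import SessionLocal  # assumed session factory location
    session = SessionLocal()
    try:
        match = correct_address(session, "123 Mian St", "Springfield", "MA")
        if match:
            print(match.corrected_address, match.confidence_score)
    finally:
        session.close()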
def get_town_street_count(session: Session, town: str, state: str) -> int:
"""
Get the number of streets in the reference table for a town.
Args:
session: SQLAlchemy session
town: Town/city name
state: State abbreviation
Returns:
Number of streets in the reference table
"""
return session.query(StreetReference).filter(
StreetReference.town_normalized == town.lower().strip(),
StreetReference.state == state.upper()
).count()

389
app/tools.py Normal file
View File

@@ -0,0 +1,389 @@
"""
Geocoding tools for eamco_address_checker.
This module provides modular tool functions for the agentic address verification
workflow. Each function represents a discrete action in the ReAct-style pipeline.
Tools:
- build_address(): Extracts and cleans address components from a record
- validate_address_components(): Validates required address fields
- format_address_string(): Joins components into a geocodable string
- geocode_address(): Calls Nominatim API to get lat/long
- validate_geocode_result(): Checks quality of geocoding result
- update_record(): Updates database record with geocoding results
"""
import logging
import random
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Optional, Tuple
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError, GeocoderUnavailable
from sqlalchemy.orm import Session
from app.config import (
NOMINATIM_USER_AGENT,
MIN_SLEEP_SECONDS,
MAX_SLEEP_SECONDS,
GEOCODE_TIMEOUT,
STATE_MAPPING,
)
from app.models import CustomerCustomer
logger = logging.getLogger(__name__)
@dataclass
class GeocodeResult:
"""Result from geocoding operation."""
success: bool
latitude: Optional[str] = None
longitude: Optional[str] = None
raw_address: Optional[str] = None
country_code: Optional[str] = None
error_message: Optional[str] = None
skipped: bool = False
skip_reason: Optional[str] = None
@dataclass
class AddressComponents:
"""Structured address components for geocoding."""
street: Optional[str]
apt: Optional[str]
city: Optional[str]
state: Optional[str]
zip_code: Optional[str]
is_valid: bool = True
validation_error: Optional[str] = None
def get_state_abbreviation(state_id: Optional[int]) -> Optional[str]:
"""
Convert state integer ID to 2-letter US state abbreviation.
Args:
state_id: Integer ID from database
Returns:
2-letter state abbreviation or None if not found
Note:
Replace with proper states table lookup when available
"""
if state_id is None:
return None
return STATE_MAPPING.get(state_id)
def build_address(customer: CustomerCustomer) -> AddressComponents:
"""
TOOL: Extract and clean address components from a customer record.
Returns the structured pieces that format_address_string() later joins
into "street, apt, city, state zip, USA" for geocoding.
Args:
customer: CustomerCustomer record with address fields
Returns:
AddressComponents dataclass with parsed components and validation status
"""
# Extract and clean components
street = (customer.customer_address or "").strip()
apt = (customer.customer_apt or "").strip()
city = (customer.customer_town or "").strip()
state = get_state_abbreviation(customer.customer_state)
zip_code = (customer.customer_zip or "").strip()
logger.debug(
"Building address",
extra={
"customer_id": customer.id,
"street": street,
"apt": apt,
"city": city,
"state": state,
"zip": zip_code,
}
)
return AddressComponents(
street=street if street else None,
apt=apt if apt else None,
city=city if city else None,
state=state,
zip_code=zip_code if zip_code else None,
)
def validate_address_components(components: AddressComponents) -> AddressComponents:
"""
TOOL: Validate that address has minimum required components.
An address is considered valid for geocoding if it has:
- Street address (required)
- City (required)
- ZIP code (required)
- State is recommended but not strictly required
Args:
components: AddressComponents to validate
Returns:
Updated AddressComponents with is_valid flag and validation_error
"""
missing = []
if not components.street:
missing.append("street")
if not components.city:
missing.append("city")
if not components.zip_code:
missing.append("zip")
if missing:
components.is_valid = False
components.validation_error = f"Missing required fields: {', '.join(missing)}"
logger.debug(f"Address validation failed: {components.validation_error}")
else:
components.is_valid = True
logger.debug("Address validation passed")
return components
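# A quick sketch of the validation contract: a record missing its ZIP is
# flagged invalid with a message naming the gap. Values are made up.
def _example_validation() -> None:
    components = AddressComponents(
        street="12 Oak St", apt=None, city="Springfield", state="MA", zip_code=None
    )
    components = validate_address_components(components)
    print(components.is_valid)          # False
    print(components.validation_error)  # Missing required fields: zip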
def format_address_string(components: AddressComponents) -> str:
"""
Format address components into a single string for geocoding.
Args:
components: Validated AddressComponents
Returns:
Formatted address string
"""
parts = []
# Street + Apt
if components.street:
if components.apt:
parts.append(f"{components.street}, {components.apt}")
else:
parts.append(components.street)
# City
if components.city:
parts.append(components.city)
# State + ZIP
if components.state and components.zip_code:
parts.append(f"{components.state} {components.zip_code}")
elif components.state:
parts.append(components.state)
elif components.zip_code:
parts.append(components.zip_code)
# Add country for better accuracy
parts.append("USA")
return ", ".join(parts)
def geocode_address(
address_string: str,
geocoder: Optional[Nominatim] = None
) -> GeocodeResult:
"""
TOOL: Call Nominatim API to geocode an address.
Uses geopy's Nominatim geocoder. This function does not sleep on its own;
callers should pair each request with rate_limit_sleep() to respect
Nominatim's 1 request/second policy.
Args:
address_string: Full formatted address to geocode
geocoder: Optional pre-initialized Nominatim instance
Returns:
GeocodeResult with lat/long or error information
"""
if geocoder is None:
geocoder = Nominatim(user_agent=NOMINATIM_USER_AGENT)
logger.info(f"Geocoding address: {address_string}")
try:
# Call Nominatim API with timeout
location = geocoder.geocode(
address_string,
timeout=GEOCODE_TIMEOUT,
addressdetails=True,
country_codes="us", # Limit to USA
)
if location is None:
logger.warning(f"No geocoding result for: {address_string}")
return GeocodeResult(
success=False,
error_message="No location found for address"
)
# Extract country code from raw response if available
country_code = None
if hasattr(location, 'raw') and 'address' in location.raw:
country_code = location.raw['address'].get('country_code', '').upper()
logger.info(
f"Geocoding successful: lat={location.latitude}, lon={location.longitude}",
extra={
"latitude": location.latitude,
"longitude": location.longitude,
"raw_address": location.address,
"country_code": country_code,
}
)
return GeocodeResult(
success=True,
latitude=str(location.latitude),
longitude=str(location.longitude),
raw_address=location.address,
country_code=country_code,
)
except GeocoderTimedOut as e:
logger.error(f"Geocoding timeout: {e}")
return GeocodeResult(
success=False,
error_message=f"Geocoding timed out after {GEOCODE_TIMEOUT}s"
)
except GeocoderServiceError as e:
logger.error(f"Geocoder service error: {e}")
return GeocodeResult(
success=False,
error_message=f"Geocoder service error: {str(e)}"
)
except GeocoderUnavailable as e:
logger.error(f"Geocoder unavailable: {e}")
return GeocodeResult(
success=False,
error_message=f"Geocoder unavailable: {str(e)}"
)
except Exception as e:
logger.error(f"Unexpected geocoding error: {e}", exc_info=True)
return GeocodeResult(
success=False,
error_message=f"Unexpected error: {str(e)}"
)
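# Geocoding sketch: reuse one Nominatim instance across calls so the user
# agent is configured once, and pair each request with rate_limit_sleep()
# (defined below). The address is a placeholder.
def _example_geocode() -> None:
    geocoder = Nominatim(user_agent=NOMINATIM_USER_AGENT)
    result = geocode_address("12 Oak St, Springfield, MA 01101, USA", geocoder=geocoder)
    if result.success:
        print(result.latitude, result.longitude)
    else:
        print(result.error_message)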
def validate_geocode_result(result: GeocodeResult) -> Tuple[bool, str]:
"""
TOOL: Validate quality of geocoding result.
Checks:
- Result was successful
- Country is USA (if available)
- Coordinates are within reasonable US bounds
Args:
result: GeocodeResult to validate
Returns:
Tuple of (is_valid, reason_string)
"""
if not result.success:
return False, f"Geocoding failed: {result.error_message}"
# Check country code if available
if result.country_code and result.country_code != "US":
logger.warning(f"Non-US country code: {result.country_code}")
return False, f"Result is outside USA (country: {result.country_code})"
# Basic bounds check for continental US + Alaska + Hawaii
try:
lat = float(result.latitude)
lon = float(result.longitude)
# Rough US bounds (including Alaska and Hawaii)
if not (18.0 <= lat <= 72.0):
return False, f"Latitude {lat} outside US bounds"
if not (-180.0 <= lon <= -65.0):
return False, f"Longitude {lon} outside US bounds"
except (ValueError, TypeError) as e:
return False, f"Invalid coordinates: {e}"
return True, "Valid US geocode result"
def update_record(
session: Session,
customer: CustomerCustomer,
geocode_result: GeocodeResult,
is_valid: bool
) -> bool:
"""
TOOL: Update customer record with geocoding results.
Sets latitude, longitude, correct_address flag, and verified_at timestamp.
Args:
session: SQLAlchemy session
customer: CustomerCustomer record to update
geocode_result: Result from geocoding operation
is_valid: Whether the geocode result passed validation
Returns:
True if update successful, False otherwise
"""
try:
now = datetime.utcnow()
if is_valid and geocode_result.success:
# Successful geocoding - update all fields
customer.customer_latitude = geocode_result.latitude
customer.customer_longitude = geocode_result.longitude
customer.correct_address = True
customer.verified_at = now
logger.info(
f"Updated record {customer.id}: lat={geocode_result.latitude}, "
f"lon={geocode_result.longitude}, correct_address=True"
)
else:
# Failed geocoding - mark as verified but not correct
customer.correct_address = False
customer.verified_at = now
logger.info(
f"Updated record {customer.id}: correct_address=False "
f"(reason: {geocode_result.error_message or 'validation failed'})"
)
return True
except Exception as e:
logger.error(f"Failed to update record {customer.id}: {e}", exc_info=True)
return False
def rate_limit_sleep() -> float:
"""
Sleep for a random duration to respect Nominatim rate limits.
Nominatim requires max 1 request per second. We sleep between
MIN_SLEEP_SECONDS and MAX_SLEEP_SECONDS (default 1.2-1.8s).
Returns:
Actual sleep duration in seconds
"""
sleep_time = random.uniform(MIN_SLEEP_SECONDS, MAX_SLEEP_SECONDS)
logger.debug(f"Rate limiting: sleeping {sleep_time:.2f}s")
time.sleep(sleep_time)
return sleep_time
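# Putting the tools together: a minimal sketch of the per-record pipeline
# from the module docstring (build -> validate -> format -> geocode ->
# validate result -> update), with a rate-limit sleep after the API call.
# Batching and commit cadence are left to the caller.
def _example_pipeline(session: Session, customer: CustomerCustomer) -> None:
    components = validate_address_components(build_address(customer))
    if not components.is_valid:
        return
    result = geocode_address(format_address_string(components))
    is_valid, _reason = validate_geocode_result(result)
    update_record(session, customer, result, is_valid)
    session.commit()
    rate_limit_sleep()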

23
requirements.txt Normal file
View File

@@ -0,0 +1,23 @@
# eamco_address_checker dependencies
# FastAPI web framework and server
fastapi>=0.109.0,<1.0.0
uvicorn[standard]>=0.27.0,<1.0.0
pydantic>=2.5.0,<3.0.0
# Database
sqlalchemy>=2.0.0,<3.0.0
psycopg2-binary>=2.9.9,<3.0.0
# Geocoding
geopy>=2.4.1,<3.0.0
# Fuzzy string matching for address correction
rapidfuzz>=3.5.0,<4.0.0
# HTTP client (for OSM Overpass API and geopy)
requests>=2.31.0,<3.0.0
urllib3>=2.0.0,<3.0.0
certifi>=2023.0.0
# Configuration
python-dotenv>=1.0.0,<2.0.0