# Repository file-view metadata (not part of the module itself):
#   Files · 2026-01-18 17:53:26 -05:00 · 390 lines · 11 KiB · Python

"""
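Geocoding tools for eamco_address_checker.

This module provides modular tool functions for the agentic address
verification workflow. Each function represents a discrete action in the
ReAct-style pipeline.

Tools:
- build_address(): Extracts and cleans address components from a customer record
- validate_address_components(): Validates required address fields
- geocode_address(): Calls the Nominatim API to get latitude/longitude
- validate_geocode_result(): Checks quality of geocoding result
- update_record(): Updates the customer record with geocoding results

Helpers:
- get_state_abbreviation(): Maps a state ID to its 2-letter abbreviation
- format_address_string(): Joins address components into the string sent to the geocoder
- rate_limit_sleep(): Sleeps a random interval between geocoding requests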
"""
import logging
import random
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Optional, Tuple

from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError, GeocoderUnavailable
from sqlalchemy.orm import Session

from app.config import (
    NOMINATIM_USER_AGENT,
    MIN_SLEEP_SECONDS,
    MAX_SLEEP_SECONDS,
    GEOCODE_TIMEOUT,
    STATE_MAPPING,
)
from app.models import CustomerCustomer
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
@dataclass
class GeocodeResult:
    """
    Outcome of a single geocoding attempt.

    Only ``success`` is required; every other field defaults to "not set",
    so a failure can be described with just an error message.
    """

    # True when the geocoder returned a location.
    success: bool

    # Coordinates, carried as strings rather than floats.
    latitude: Optional[str] = None
    longitude: Optional[str] = None

    # Address text reported by the geocoder for the match.
    raw_address: Optional[str] = None

    # Country code taken from the geocoder's address details, when present.
    country_code: Optional[str] = None

    # Human-readable description of what went wrong on failure.
    error_message: Optional[str] = None

    # Skip markers. NOTE(review): nothing in the visible part of this module
    # sets these; presumably callers use them when geocoding is bypassed.
    skipped: bool = False
    skip_reason: Optional[str] = None
@dataclass
class AddressComponents:
    """
    Individual pieces of a postal address, plus their validation state.

    The five address fields are required constructor arguments (each may be
    None); the validation fields start out as "valid, no error".
    """

    # Address parts; None means the part is absent.
    street: Optional[str]
    apt: Optional[str]
    city: Optional[str]
    state: Optional[str]
    zip_code: Optional[str]

    # Validation outcome, written by validate_address_components().
    is_valid: bool = True
    validation_error: Optional[str] = None
def get_state_abbreviation(state_id: Optional[int]) -> Optional[str]:
    """
    Look up the state abbreviation for a numeric state ID.

    The lookup goes through the STATE_MAPPING table from app.config.

    Args:
        state_id: Integer ID from database, or None

    Returns:
        The mapped abbreviation, or None when state_id is None or has no
        entry in STATE_MAPPING

    Note:
        Replace with proper states table lookup when available
    """
    return None if state_id is None else STATE_MAPPING.get(state_id)
def build_address(customer: CustomerCustomer) -> AddressComponents:
    """
    TOOL: Collect the address fields of a customer record for geocoding.

    Each free-text field is whitespace-trimmed (a missing value is treated as
    an empty string) and the state ID is translated through
    get_state_abbreviation(). Fields that end up empty are returned as None.
    No validation happens here, so the result keeps the dataclass defaults
    for is_valid / validation_error.

    Args:
        customer: CustomerCustomer record with address fields

    Returns:
        AddressComponents holding the cleaned street, apt, city, state and ZIP
    """
    # Trim the free-text fields; None becomes "" before stripping.
    street, apt, city = (
        (raw or "").strip()
        for raw in (
            customer.customer_address,
            customer.customer_apt,
            customer.customer_town,
        )
    )
    state = get_state_abbreviation(customer.customer_state)
    zip_code = (customer.customer_zip or "").strip()

    log_fields = {
        "customer_id": customer.id,
        "street": street,
        "apt": apt,
        "city": city,
        "state": state,
        "zip": zip_code,
    }
    logger.debug("Building address", extra=log_fields)

    # Empty strings collapse to None so "absent" has a single representation.
    return AddressComponents(
        street=street or None,
        apt=apt or None,
        city=city or None,
        state=state,
        zip_code=zip_code or None,
    )
def validate_address_components(components: AddressComponents) -> AddressComponents:
    """
    TOOL: Validate that address has minimum required components.

    An address is considered valid for geocoding if it has:
    - Street address (required)
    - City (required)
    - ZIP code (required)
    - State is recommended but not strictly required

    A required field counts as missing when it is None, empty, or a string
    made only of whitespace.

    The passed-in object is updated in place and also returned:
    - on failure, is_valid is False and validation_error lists the missing
      fields in the order street, city, zip;
    - on success, is_valid is True and validation_error is reset to None so
      that re-validating a corrected address leaves no stale message.

    Args:
        components: AddressComponents to validate

    Returns:
        The same AddressComponents with is_valid and validation_error updated
    """
    required = (
        ("street", components.street),
        ("city", components.city),
        ("zip", components.zip_code),
    )
    missing = [
        label
        for label, value in required
        # Whitespace-only text carries no address information either.
        if not value or (isinstance(value, str) and not value.strip())
    ]

    if missing:
        components.is_valid = False
        components.validation_error = f"Missing required fields: {', '.join(missing)}"
        logger.debug(f"Address validation failed: {components.validation_error}")
    else:
        components.is_valid = True
        components.validation_error = None
        logger.debug("Address validation passed")
    return components
def format_address_string(components: AddressComponents) -> str:
    """
    Render address components as one comma-separated geocoding query.

    Layout: "street[, apt], city, state zip, USA". Absent pieces are simply
    left out; the apartment is only included when a street is present, and
    "USA" is always appended.

    Args:
        components: Validated AddressComponents

    Returns:
        Formatted address string
    """
    # Street line, with the apartment attached when both are present.
    street_line = components.street
    if street_line and components.apt:
        street_line = f"{street_line}, {components.apt}"

    # State and ZIP share one segment; fall back to whichever one exists.
    state, zip_code = components.state, components.zip_code
    if state and zip_code:
        region = f"{state} {zip_code}"
    else:
        region = state or zip_code

    segments = [
        piece for piece in (street_line, components.city, region) if piece
    ]
    # Country suffix for better geocoding accuracy.
    segments.append("USA")
    return ", ".join(segments)
def geocode_address(
    address_string: str,
    geocoder: Optional[Nominatim] = None
) -> GeocodeResult:
    """
    TOOL: Call Nominatim API to geocode an address.

    Sends one forward-geocoding request restricted to the US and converts the
    outcome into a GeocodeResult. No exception escapes: every failure is
    logged and reported through ``success=False`` plus ``error_message``.

    This function does not throttle itself; callers are responsible for
    spacing requests (rate_limit_sleep() in this module exists for that).

    Args:
        address_string: Full formatted address to geocode
        geocoder: Optional pre-initialized Nominatim instance; when omitted a
            new one is created with NOMINATIM_USER_AGENT

    Returns:
        GeocodeResult with lat/long (as strings) or error information
    """
    if geocoder is None:
        geocoder = Nominatim(user_agent=NOMINATIM_USER_AGENT)

    logger.info(f"Geocoding address: {address_string}")

    try:
        # Call Nominatim API with timeout
        location = geocoder.geocode(
            address_string,
            timeout=GEOCODE_TIMEOUT,
            addressdetails=True,
            country_codes="us",  # Limit to USA
        )

        if location is None:
            logger.warning(f"No geocoding result for: {address_string}")
            return GeocodeResult(
                success=False,
                error_message="No location found for address"
            )

        # Extract country code from the raw response if available. A null
        # "address" or "country_code" entry is treated like an absent one
        # instead of failing an otherwise good result.
        country_code = None
        address_details = (getattr(location, "raw", None) or {}).get("address")
        if address_details is not None:
            country_code = (address_details.get("country_code") or "").upper()

        logger.info(
            f"Geocoding successful: lat={location.latitude}, lon={location.longitude}",
            extra={
                "latitude": location.latitude,
                "longitude": location.longitude,
                "raw_address": location.address,
                "country_code": country_code,
            }
        )
        return GeocodeResult(
            success=True,
            latitude=str(location.latitude),
            longitude=str(location.longitude),
            raw_address=location.address,
            country_code=country_code,
        )
    # Handler order matters: GeocoderTimedOut and GeocoderUnavailable both
    # derive from GeocoderServiceError, so the specific ones must come first
    # or they can never be reached.
    except GeocoderTimedOut as e:
        logger.error(f"Geocoding timeout: {e}")
        return GeocodeResult(
            success=False,
            error_message=f"Geocoding timed out after {GEOCODE_TIMEOUT}s"
        )
    except GeocoderUnavailable as e:
        logger.error(f"Geocoder unavailable: {e}")
        return GeocodeResult(
            success=False,
            error_message=f"Geocoder unavailable: {str(e)}"
        )
    except GeocoderServiceError as e:
        logger.error(f"Geocoder service error: {e}")
        return GeocodeResult(
            success=False,
            error_message=f"Geocoder service error: {str(e)}"
        )
    except Exception as e:
        # Last-resort boundary: this tool reports failures, it never raises.
        logger.error(f"Unexpected geocoding error: {e}", exc_info=True)
        return GeocodeResult(
            success=False,
            error_message=f"Unexpected error: {str(e)}"
        )
def validate_geocode_result(result: GeocodeResult) -> Tuple[bool, str]:
    """
    TOOL: Validate quality of geocoding result.

    Checks, in order:
    - Result was successful
    - Country is USA (if a country code is available); the comparison
      ignores case and surrounding whitespace, so "us" and "US" both pass
    - Coordinates parse as numbers and fall inside a rough US bounding box

    Args:
        result: GeocodeResult to validate

    Returns:
        Tuple of (is_valid, reason_string)
    """
    if not result.success:
        return False, f"Geocoding failed: {result.error_message}"

    # Check country code if available. Normalise before comparing so the
    # check does not depend on how the producer cased the value.
    country = str(result.country_code).strip().upper() if result.country_code else ""
    if country and country != "US":
        logger.warning(f"Non-US country code: {result.country_code}")
        return False, f"Result is outside USA (country: {result.country_code})"

    # Only the conversions can raise; keep the try body to just those.
    try:
        lat = float(result.latitude)
        lon = float(result.longitude)
    except (ValueError, TypeError) as e:
        return False, f"Invalid coordinates: {e}"

    # Rough US bounds (including Alaska and Hawaii).
    # NOTE(review): the westernmost Aleutian Islands sit at positive (east)
    # longitudes and therefore fall outside this box.
    if not (18.0 <= lat <= 72.0):
        return False, f"Latitude {lat} outside US bounds"
    if not (-180.0 <= lon <= -65.0):
        return False, f"Longitude {lon} outside US bounds"

    return True, "Valid US geocode result"
def update_record(
    session: Session,
    customer: CustomerCustomer,
    geocode_result: GeocodeResult,
    is_valid: bool
) -> bool:
    """
    TOOL: Update customer record with geocoding results.

    On a valid, successful geocode: sets latitude, longitude,
    correct_address=True and verified_at. Otherwise: sets
    correct_address=False and verified_at, leaving any existing coordinates
    untouched.

    Only attributes on ``customer`` are assigned here; ``session`` is not
    used by this function, so flushing/committing is presumably left to the
    caller (NOTE(review): confirm).

    Args:
        session: SQLAlchemy session (currently unused)
        customer: CustomerCustomer record to update
        geocode_result: Result from geocoding operation
        is_valid: Whether the geocode result passed validation

    Returns:
        True if update successful, False if assigning the fields raised
    """
    try:
        # Naive UTC timestamp: the same value datetime.utcnow() produced,
        # without relying on that call (deprecated since Python 3.12).
        now = datetime.now(timezone.utc).replace(tzinfo=None)

        if is_valid and geocode_result.success:
            # Successful geocoding - update all fields
            customer.customer_latitude = geocode_result.latitude
            customer.customer_longitude = geocode_result.longitude
            customer.correct_address = True
            customer.verified_at = now
            logger.info(
                f"Updated record {customer.id}: lat={geocode_result.latitude}, "
                f"lon={geocode_result.longitude}, correct_address=True"
            )
        else:
            # Failed geocoding - mark as verified but not correct
            customer.correct_address = False
            customer.verified_at = now
            logger.info(
                f"Updated record {customer.id}: correct_address=False "
                f"(reason: {geocode_result.error_message or 'validation failed'})"
            )
        return True
    except Exception as e:
        # Report failure to the caller instead of aborting the pipeline.
        logger.error(f"Failed to update record {customer.id}: {e}", exc_info=True)
        return False
def rate_limit_sleep() -> float:
    """
    Pause for a randomized interval between geocoding requests.

    The delay is drawn uniformly from the range MIN_SLEEP_SECONDS to
    MAX_SLEEP_SECONDS (both from app.config) so that consecutive requests
    stay under Nominatim's limit of one request per second.
    NOTE(review): the configured defaults are said to be 1.2-1.8s; confirm
    in app.config.

    Returns:
        Actual sleep duration in seconds
    """
    delay = random.uniform(MIN_SLEEP_SECONDS, MAX_SLEEP_SECONDS)
    logger.debug(f"Rate limiting: sleeping {delay:.2f}s")
    time.sleep(delay)
    return delay