# eamco_address_checker/app/streets.py

"""
Street reference tools for address correction.

This module provides functionality to:
1. Fetch streets from OpenStreetMap Overpass API for a given town/state
2. Store streets in the StreetReference table
3. Perform fuzzy matching to correct misspelled addresses

The fuzzy matching handles common issues like:
- Misspelled street names ("Mian St" -> "Main St")
- Wrong suffixes ("Main Rd" -> "Main St")
- Missing/extra spaces
- Abbreviated vs full names ("St" vs "Street")
"""

import logging
import re
import time
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional, Tuple

import requests
from rapidfuzz import fuzz, process
from sqlalchemy.orm import Session

from app.config import STATE_MAPPING
from app.models import StreetReference

# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

# Overpass API endpoints (multiple for fallback).
# fetch_streets_from_osm walks this list in order and only moves on to the
# next endpoint after every query variant has failed or returned no elements
# on the current one.
OVERPASS_API_URLS = [
    "https://overpass-api.de/api/interpreter",
    "https://overpass.kumi.systems/api/interpreter",
    "https://maps.mail.ru/osm/tools/overpass/api/interpreter",
]

# Common street suffix and directional variations used for normalization.
# Keys are the canonical (fully spelled) words; each value lists the
# abbreviations and common misspellings that should collapse to that word.
STREET_SUFFIXES = {
    # Standard -> variations
    "street": ["st", "str", "strt"],
    "avenue": ["ave", "av", "aven"],
    "road": ["rd", "rod"],
    "drive": ["dr", "drv", "driv"],
    "lane": ["ln", "lne"],
    "court": ["ct", "crt", "cour"],
    "circle": ["cir", "circ", "crcl"],
    "boulevard": ["blvd", "boul", "blv"],
    "place": ["pl", "plc"],
    "terrace": ["ter", "terr", "trc"],
    "way": ["wy"],
    "highway": ["hwy", "hiway", "hgwy"],
    "parkway": ["pkwy", "pky", "pkway"],
    "square": ["sq", "sqr"],
    "trail": ["trl", "tr"],
    "crossing": ["xing", "crssng"],
    "heights": ["hts", "hgts"],
    "point": ["pt", "pnt"],
    "ridge": ["rdg", "rdge"],
    "valley": ["vly", "vlly"],
    "view": ["vw", "viw"],
    "center": ["ctr", "cntr", "centre"],
    "north": ["n"],
    "south": ["s"],
    "east": ["e"],
    "west": ["w"],
    "northeast": ["ne"],
    "northwest": ["nw"],
    "southeast": ["se"],
    "southwest": ["sw"],
}

# Reverse lookup: variation -> canonical word, with every canonical word also
# mapped to itself so a single dict lookup covers both spellings.
# Built with a comprehension (rather than module-level for-loops) so no loop
# variables are left behind in the module namespace and re-exported by
# ``from app.streets import *``. Insertion order is unchanged: each word's
# variations first, then the word itself.
SUFFIX_TO_FULL = {
    variant: canonical
    for canonical, variants in STREET_SUFFIXES.items()
    for variant in (*variants, canonical)
}


@dataclass
class StreetMatch:
    """Result of fuzzy street matching.

    Instances are built by find_matching_street; correct_address additionally
    fills in ``corrected_address`` on the instance it gets back.
    """
    # Street text exactly as passed in by the caller (not normalized).
    original_street: str
    # ``street_name`` of the StreetReference row that scored best.
    matched_street: str
    # Best fuzzy score for that row; it is compared against a 0-100 threshold.
    confidence_score: float
    # Town and state copied from the matched StreetReference row.
    town: str
    state: str
    # ``id`` of the matched StreetReference row.
    street_ref_id: int
    # "<number> <matched street>", or just the matched street when the input
    # had no leading number. Stays None unless correct_address sets it.
    corrected_address: Optional[str] = None


@dataclass
class FetchResult:
    """Result of fetching streets from OSM.

    Returned by populate_streets_for_town to summarize one fetch-and-store run.
    """
    # True when streets were stored; when nothing was found it is True only
    # if no error was recorded.
    success: bool
    # Number of new StreetReference rows added during the run.
    streets_added: int
    # populate_streets_for_town always reports 0 here (it never updates rows).
    streets_updated: int
    # Number of unique street names returned by the fetch (0 when none found).
    total_found: int
    # Human-readable summary of the outcome.
    message: str
    # Error messages collected during the run; empty when nothing went wrong.
    errors: List[str]


def normalize_street_name(street: str) -> str:
    """
    Produce the canonical form of a street name used for fuzzy comparison.

    The input is lowercased and trimmed, periods/commas/apostrophes are
    dropped (hyphens and all other characters are kept), runs of whitespace
    collapse to a single space, and every word found in SUFFIX_TO_FULL is
    replaced by its fully spelled form ("st" -> "street", "n" -> "north").

    Args:
        street: Raw street name; may be empty or None

    Returns:
        Normalized street name, or "" when the input is falsy
    """
    if not street:
        return ""

    # Strip the punctuation first so "St." and "St" become the same token.
    cleaned = re.sub(r"[.,']", "", street.lower().strip())
    cleaned = re.sub(r"\s+", " ", cleaned)

    # Unknown words fall through unchanged via the .get default.
    return " ".join(SUFFIX_TO_FULL.get(token, token) for token in cleaned.split())


def extract_street_number(address: str) -> Tuple[Optional[str], str]:
    """
    Split a leading house number off an address string.

    A house number is a run of digits, optionally followed by one letter
    (e.g. "123" or "123A"), that is separated from the rest by whitespace.
    Ordinals such as "1st Street" therefore do not count as a number.

    Args:
        address: Full address like "123 Main Street"; may be empty or None

    Returns:
        Tuple of (street_number, remaining_address). street_number is None
        when no leading number is present, in which case the remainder is
        the whitespace-trimmed input ("" for falsy input).
    """
    if not address:
        return None, ""

    trimmed = address.strip()
    found = re.match(r"^(\d+[A-Za-z]?)\s+(.+)$", trimmed)
    if found is None:
        return None, trimmed

    number, remainder = found.groups()
    return number, remainder


def get_state_name(state_abbr: str) -> str:
    """
    Translate a postal state/territory code into its full name.

    The lookup is case-insensitive. Codes that are not in the table are
    handed back exactly as they were passed in.

    Args:
        state_abbr: 2-letter state abbreviation

    Returns:
        Full state name, or the unchanged input when the code is unknown
    """
    names_by_code = {
        "AL": "Alabama",
        "AK": "Alaska",
        "AZ": "Arizona",
        "AR": "Arkansas",
        "CA": "California",
        "CO": "Colorado",
        "CT": "Connecticut",
        "DE": "Delaware",
        "DC": "District of Columbia",
        "FL": "Florida",
        "GA": "Georgia",
        "HI": "Hawaii",
        "ID": "Idaho",
        "IL": "Illinois",
        "IN": "Indiana",
        "IA": "Iowa",
        "KS": "Kansas",
        "KY": "Kentucky",
        "LA": "Louisiana",
        "ME": "Maine",
        "MD": "Maryland",
        "MA": "Massachusetts",
        "MI": "Michigan",
        "MN": "Minnesota",
        "MS": "Mississippi",
        "MO": "Missouri",
        "MT": "Montana",
        "NE": "Nebraska",
        "NV": "Nevada",
        "NH": "New Hampshire",
        "NJ": "New Jersey",
        "NM": "New Mexico",
        "NY": "New York",
        "NC": "North Carolina",
        "ND": "North Dakota",
        "OH": "Ohio",
        "OK": "Oklahoma",
        "OR": "Oregon",
        "PA": "Pennsylvania",
        "RI": "Rhode Island",
        "SC": "South Carolina",
        "SD": "South Dakota",
        "TN": "Tennessee",
        "TX": "Texas",
        "UT": "Utah",
        "VT": "Vermont",
        "VA": "Virginia",
        "WA": "Washington",
        "WV": "West Virginia",
        "WI": "Wisconsin",
        "WY": "Wyoming",
        # Territories
        "PR": "Puerto Rico",
        "VI": "Virgin Islands",
        "GU": "Guam",
        "AS": "American Samoa",
    }

    code = state_abbr.upper()
    if code in names_by_code:
        return names_by_code[code]

    # Unknown code: return the caller's original text, not the uppercased one.
    return state_abbr


def _escape_overpass_string(value: str) -> str:
    """Escape backslashes and double quotes for use inside an Overpass QL "..." literal."""
    return value.replace("\\", "\\\\").replace('"', '\\"')


def _unique_streets(elements: List[dict]) -> List[dict]:
    """Reduce Overpass way elements to one dict per distinct (case-insensitive) street name."""
    streets = []
    seen_names = set()

    for element in elements:
        tags = element.get("tags", {})
        name = tags.get("name")
        if not name:
            continue

        key = name.lower()
        if key in seen_names:
            # A street is usually split into many way segments; keep the first.
            continue

        seen_names.add(key)
        streets.append({
            "name": name,
            "osm_id": str(element.get("id", "")),
            "highway_type": tags.get("highway", ""),
        })

    return streets


def fetch_streets_from_osm(town: str, state: str) -> Tuple[List[dict], str]:
    """
    Fetch all streets in a town from OpenStreetMap using Overpass API.

    Three query variants are tried, in order, against each endpoint in
    OVERPASS_API_URLS. The first response that contains any elements wins;
    its elements are de-duplicated by lowercase street name.

    The town and state names are escaped before being placed inside quoted
    Overpass QL string literals, so names containing a double quote or a
    backslash cannot break or alter the query.

    Args:
        town: Town/city name
        state: 2-letter state abbreviation

    Returns:
        Tuple of (list of street dicts, error message or empty string).
        On success the list holds dicts with "name", "osm_id" and
        "highway_type" keys and the message is "". When every attempt fails
        or comes back empty the list is empty and the message names the last
        problem that was seen (HTTP 429/504, timeout, request error, or an
        empty result).
    """
    state_name = get_state_name(state)
    state_upper = state.upper()

    # Escaped copies for interpolation inside "..." literals only.
    town_q = _escape_overpass_string(town)
    state_name_q = _escape_overpass_string(state_name)
    state_upper_q = _escape_overpass_string(state_upper)

    # NOTE(review): the f-string renders the doubled braces below as a
    # single-brace "{geocodeArea:...}" token. geocodeArea looks like an
    # Overpass Turbo macro rather than Overpass QL, so the raw interpreter
    # endpoints presumably reject this variant - confirm and consider
    # dropping it. It is kept unchanged here (raw names, unquoted context).
    query = f"""
    [out:json][timeout:120];

    // Use geocodeArea for reliable city lookup with state context
    {{geocodeArea:{town}, {state_name}, United States}}->.city;

    // Get all named streets in the city
    way["highway"]["name"](area.city);
    out tags;
    """

    # Alternative query if geocodeArea fails (more explicit)
    fallback_query = f"""
    [out:json][timeout:120];

    // Find state by ISO code
    area["ISO3166-2"="US-{state_upper_q}"]->.state;

    // Find city/town within state
    (
      relation["name"="{town_q}"]["type"="boundary"](area.state);
      way["name"="{town_q}"]["place"](area.state);
      node["name"="{town_q}"]["place"](area.state);
    );
    map_to_area->.city;

    // Get streets
    way["highway"]["name"](area.city);
    out tags;
    """

    # Tried first: look the town up by name inside the state's admin area.
    simple_query = f"""
    [out:json][timeout:60];
    area["name"="{state_name_q}"]["boundary"="administrative"]["admin_level"="4"]->.state;
    area["name"="{town_q}"](area.state)->.city;
    way["highway"]["name"](area.city);
    out tags;
    """

    queries = [simple_query, query, fallback_query]
    query_names = ["simple", "geocodeArea", "fallback"]

    logger.info(f"Fetching streets from OSM for {town}, {state_name}")

    last_error = ""

    for api_url in OVERPASS_API_URLS:
        for q, q_name in zip(queries, query_names):
            try:
                logger.info(f"Trying {q_name} query on {api_url.split('/')[2]}...")
                logger.debug(f"Query: {q}")

                response = requests.post(
                    api_url,
                    data={"data": q},
                    timeout=120,
                    headers={"User-Agent": "EamcoAddressChecker/1.0"}
                )

                if response.status_code == 429:
                    # Record the cause so the final error is not left blank.
                    last_error = f"Rate limited (HTTP 429) on {api_url}"
                    logger.warning("Rate limited, waiting 30s...")
                    time.sleep(30)
                    continue

                if response.status_code == 504:
                    last_error = f"Gateway timeout (HTTP 504) on {api_url} for {q_name} query"
                    logger.warning(f"Timeout on {q_name} query, trying next...")
                    continue

                response.raise_for_status()

                data = response.json()
                elements = data.get("elements", [])

                if elements:
                    logger.info(f"Success with {q_name} query: {len(elements)} street segments")
                    streets = _unique_streets(elements)
                    logger.info(f"Extracted {len(streets)} unique street names")
                    return streets, ""

                # An empty answer may carry a server-side "remark" explaining
                # why; surface it when present.
                remark = data.get("remark")
                last_error = f"No results from {q_name} query on {api_url}"
                if remark:
                    last_error = f"{last_error} ({remark})"
                logger.debug(f"No results from {q_name} query")

            except requests.exceptions.Timeout:
                last_error = f"Timeout on {api_url}"
                logger.warning(last_error)
                continue

            except requests.exceptions.RequestException as e:
                last_error = f"Request error: {str(e)}"
                logger.warning(last_error)
                continue

            except Exception as e:
                # Deliberate best-effort: e.g. a non-JSON body must not abort
                # the remaining query/endpoint attempts.
                last_error = f"Error: {str(e)}"
                logger.warning(last_error)
                continue

    # All attempts failed
    error = f"All Overpass queries failed for {town}, {state}. Last error: {last_error}"
    logger.error(error)
    return [], error


def populate_streets_for_town(
    session: Session,
    town: str,
    state: str,
    clear_existing: bool = False
) -> FetchResult:
    """
    Fetch streets from OSM and populate the StreetReference table.

    The OSM fetch runs before anything is written. Existing rows are only
    cleared once the fetch has produced streets, and the clear happens in the
    same transaction as the inserts, so a failed fetch or a failed write
    never leaves the town without reference data.

    Args:
        session: SQLAlchemy session
        town: Town/city name
        state: 2-letter state abbreviation
        clear_existing: If True, replace the town's existing streets with the
            freshly fetched ones (no rows are deleted when the fetch returns
            nothing)

    Returns:
        FetchResult with statistics. When no streets were found, success is
        True only if the fetch reported no error.

    Raises:
        Exception: any error raised by the database operations is re-raised
            after the session has been rolled back.
    """
    state = state.upper()
    town_normalized = town.lower().strip()
    errors = []

    logger.info(f"Populating streets for {town}, {state}")

    # Fetch first: nothing in the database is touched until we know we have
    # replacement data.
    streets, error = fetch_streets_from_osm(town, state)

    if error:
        errors.append(error)

    if not streets:
        return FetchResult(
            success=not errors,
            streets_added=0,
            streets_updated=0,
            total_found=0,
            message=f"No streets found for {town}, {state}",
            errors=errors,
        )

    town_filter = (
        StreetReference.town_normalized == town_normalized,
        StreetReference.state == state,
    )

    try:
        if clear_existing:
            deleted = session.query(StreetReference).filter(
                *town_filter
            ).delete(synchronize_session=False)
            logger.info(f"Cleared {deleted} existing street records")
            existing_names = set()
        else:
            # Remember what is already stored so duplicates are skipped.
            existing_names = {
                row.street_name_normalized
                for row in session.query(StreetReference).filter(*town_filter).all()
            }

        added = 0
        # NOTE: utcnow() yields a naive datetime holding the UTC wall time.
        now = datetime.utcnow()

        for street_data in streets:
            name = street_data["name"]
            name_normalized = normalize_street_name(name)

            # Two OSM spellings can normalize to the same key; keep the first.
            if name_normalized in existing_names:
                continue

            session.add(StreetReference(
                street_name=name,
                street_name_normalized=name_normalized,
                town=town,
                town_normalized=town_normalized,
                state=state,
                osm_id=street_data.get("osm_id"),
                created_at=now,
            ))
            existing_names.add(name_normalized)
            added += 1

        # Single commit: the optional delete and the inserts succeed or fail
        # together.
        session.commit()
    except Exception:
        session.rollback()
        raise

    logger.info(f"Added {added} new streets for {town}, {state}")

    return FetchResult(
        success=True,
        streets_added=added,
        streets_updated=0,
        total_found=len(streets),
        message=f"Successfully added {added} streets for {town}, {state}",
        errors=errors,
    )


def find_matching_street(
    session: Session,
    street_input: str,
    town: str,
    state: str,
    min_confidence: float = 70.0
) -> Optional[StreetMatch]:
    """
    Find the best matching street for a potentially misspelled input.

    Uses fuzzy string matching with rapidfuzz to find the closest
    match in the StreetReference table for the given town and state.

    Each candidate is scored with the best of four rapidfuzz scorers. Because
    the partial/token-set scorers give 100 to names that merely contain the
    input (e.g. "main street extension" for "main street"), two safeguards
    keep the closest name on top:

    - an exact match on the normalized name wins immediately with score 100;
    - otherwise ties on the best score are broken by the plain
      ``fuzz.ratio``, which penalizes extra or missing words.

    Args:
        session: SQLAlchemy session
        street_input: The street name to match (may be misspelled)
        town: Town/city to search within
        state: State abbreviation
        min_confidence: Minimum match confidence (0-100)

    Returns:
        StreetMatch if found above threshold, None otherwise (including when
        the input normalizes to an empty string or the town has no reference
        streets)
    """
    state = state.upper()
    town_normalized = town.lower().strip()

    # Normalize the input for matching
    input_normalized = normalize_street_name(street_input)
    if not input_normalized:
        # Nothing to compare against; skip the database round trip.
        logger.debug(f"Empty street input for {town}, {state}")
        return None

    # Get all streets for this town
    streets = session.query(StreetReference).filter(
        StreetReference.town_normalized == town_normalized,
        StreetReference.state == state
    ).all()

    if not streets:
        logger.debug(f"No reference streets found for {town}, {state}")
        return None

    best_match = None
    best_score = 0
    best_ratio = 0

    for street_obj in streets:
        candidate = street_obj.street_name_normalized

        if candidate == input_normalized:
            # Exact normalized match: no other row can be a better answer.
            best_match = street_obj
            best_score = 100.0
            break

        # Try multiple scoring methods and take the best
        plain_ratio = fuzz.ratio(input_normalized, candidate)
        score = max(
            plain_ratio,
            fuzz.partial_ratio(input_normalized, candidate),
            fuzz.token_sort_ratio(input_normalized, candidate),
            fuzz.token_set_ratio(input_normalized, candidate),
        )

        # Tuple comparison: higher score wins, plain ratio breaks ties.
        # A candidate scoring 0 never replaces the initial (0, 0).
        if (score, plain_ratio) > (best_score, best_ratio):
            best_score = score
            best_ratio = plain_ratio
            best_match = street_obj

    if best_match and best_score >= min_confidence:
        logger.info(
            f"Fuzzy match: '{street_input}' -> '{best_match.street_name}' "
            f"(confidence: {best_score:.1f}%)"
        )
        return StreetMatch(
            original_street=street_input,
            matched_street=best_match.street_name,
            confidence_score=best_score,
            town=best_match.town,
            state=best_match.state,
            street_ref_id=best_match.id,
        )

    logger.debug(
        f"No confident match for '{street_input}' "
        f"(best: {best_score:.1f}%, threshold: {min_confidence}%)"
    )
    return None


def correct_address(
    session: Session,
    full_address: str,
    town: str,
    state: str,
    min_confidence: float = 75.0
) -> Optional[StreetMatch]:
    """
    Correct the street portion of an address via fuzzy matching.

    The leading house number (if any) is split off, the remaining text is
    matched against the town's reference streets, and on success the house
    number is re-attached to the matched street name.

    Args:
        session: SQLAlchemy session
        full_address: Full street address (e.g., "123 Mian St")
        town: Town/city name
        state: State abbreviation
        min_confidence: Minimum match confidence

    Returns:
        StreetMatch with corrected_address if match found, None otherwise
    """
    house_number, street_part = extract_street_number(full_address)

    # Nothing left to match once the number is removed (or empty input).
    if not street_part:
        return None

    match = find_matching_street(
        session=session,
        street_input=street_part,
        town=town,
        state=state,
        min_confidence=min_confidence,
    )

    if not match:
        return match

    # Re-attach the house number only when the input actually had one.
    match.corrected_address = (
        f"{house_number} {match.matched_street}"
        if house_number
        else match.matched_street
    )

    logger.info(
        f"Address correction: '{full_address}' -> '{match.corrected_address}'"
    )

    return match


def get_town_street_count(session: Session, town: str, state: str) -> int:
    """
    Count the reference streets stored for a town.

    The town is compared lowercased and trimmed, the state uppercased.

    Args:
        session: SQLAlchemy session
        town: Town/city name
        state: State abbreviation

    Returns:
        Number of streets in the reference table
    """
    all_streets = session.query(StreetReference)
    town_key = town.lower().strip()
    state_code = state.upper()

    town_streets = all_streets.filter(
        StreetReference.town_normalized == town_key,
        StreetReference.state == state_code,
    )
    return town_streets.count()