first commit

2026-01-18 17:53:26 -05:00
commit 0b9c0915a1
15 changed files with 2692 additions and 0 deletions
--- a/app/streets.py
+++ b/app/streets.py
@@ -0,0 +1,572 @@
+"""
+Street reference tools for address correction.
+
+This module provides functionality to:
+1. Fetch streets from OpenStreetMap Overpass API for a given town/state
+2. Store streets in the StreetReference table
+3. Perform fuzzy matching to correct misspelled addresses
+
+The fuzzy matching handles common issues like:
+- Misspelled street names ("Mian St" -> "Main St")
+- Wrong suffixes ("Main Rd" -> "Main St")
+- Missing/extra spaces
+- Abbreviated vs full names ("St" vs "Street")
+"""
+
+import logging
+import re
+import time
+from dataclasses import dataclass
+from datetime import datetime
+from typing import List, Optional, Tuple
+
+import requests
+from rapidfuzz import fuzz, process
+from sqlalchemy.orm import Session
+
+from app.config import STATE_MAPPING
+from app.models import StreetReference
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# Overpass API endpoints (multiple for fallback).
+# fetch_streets_from_osm walks this list in order and returns as soon as one
+# endpoint yields a non-empty result, so the preferred server goes first.
+OVERPASS_API_URLS = [
+    "https://overpass-api.de/api/interpreter",
+    "https://overpass.kumi.systems/api/interpreter",
+    "https://maps.mail.ru/osm/tools/overpass/api/interpreter",
+]
+
+# Canonical word -> accepted short forms, used to normalize street names.
+# Despite the name, the table covers compass directions as well as suffixes.
+STREET_SUFFIXES = {
+    # Street-type suffixes
+    "street": ["st", "str", "strt"],
+    "avenue": ["ave", "av", "aven"],
+    "road": ["rd", "rod"],
+    "drive": ["dr", "drv", "driv"],
+    "lane": ["ln", "lne"],
+    "court": ["ct", "crt", "cour"],
+    "circle": ["cir", "circ", "crcl"],
+    "boulevard": ["blvd", "boul", "blv"],
+    "place": ["pl", "plc"],
+    "terrace": ["ter", "terr", "trc"],
+    "way": ["wy"],
+    "highway": ["hwy", "hiway", "hgwy"],
+    "parkway": ["pkwy", "pky", "pkway"],
+    "square": ["sq", "sqr"],
+    "trail": ["trl", "tr"],
+    "crossing": ["xing", "crssng"],
+    "heights": ["hts", "hgts"],
+    "point": ["pt", "pnt"],
+    "ridge": ["rdg", "rdge"],
+    "valley": ["vly", "vlly"],
+    "view": ["vw", "viw"],
+    "center": ["ctr", "cntr", "centre"],
+    # Compass directions
+    "north": ["n"],
+    "south": ["s"],
+    "east": ["e"],
+    "west": ["w"],
+    "northeast": ["ne"],
+    "northwest": ["nw"],
+    "southeast": ["se"],
+    "southwest": ["sw"],
+}
+
+# Reverse lookup: every short form, and every canonical word itself, maps to
+# the canonical word.
+SUFFIX_TO_FULL = {}
+for full, abbrevs in STREET_SUFFIXES.items():
+    SUFFIX_TO_FULL.update(dict.fromkeys(abbrevs, full))
+    SUFFIX_TO_FULL[full] = full
+
+
+@dataclass
+class StreetMatch:
+    """
+    Result of fuzzy street matching.
+
+    Built by find_matching_street from the best-scoring reference street;
+    correct_address additionally fills in corrected_address.
+    """
+    # Street text exactly as the caller supplied it (not normalized).
+    original_street: str
+    # street_name of the reference row that scored best.
+    matched_street: str
+    # Best fuzzy score for the match, on rapidfuzz's 0-100 scale.
+    confidence_score: float
+    # Town and state copied from the matched reference row.
+    town: str
+    state: str
+    # id of the matched StreetReference row.
+    street_ref_id: int
+    # "<number> <matched street>" (or just the street when the input had no
+    # leading number); stays None unless correct_address sets it.
+    corrected_address: Optional[str] = None
+
+
+@dataclass
+class FetchResult:
+    """
+    Result of fetching streets from OSM.
+
+    Returned by populate_streets_for_town to summarize one import run.
+    """
+    # False when nothing was imported and an error was recorded.
+    success: bool
+    # Number of new StreetReference rows inserted.
+    streets_added: int
+    # populate_streets_for_town always reports 0 here.
+    streets_updated: int
+    # Number of unique street names the OSM fetch returned.
+    total_found: int
+    # Human-readable summary of the outcome.
+    message: str
+    # Error messages collected during the run (empty when none occurred).
+    errors: List[str]
+
+
+def normalize_street_name(street: str) -> str:
+    """
+    Produce the canonical comparison form of a street name.
+
+    The name is lowercased and trimmed, periods, commas and apostrophes are
+    dropped (hyphens and any other characters are kept), runs of whitespace
+    collapse to one space, and every word found in SUFFIX_TO_FULL is replaced
+    by its full form ("st" -> "street", "n" -> "north").
+
+    Args:
+        street: Raw street name
+
+    Returns:
+        Normalized street name, or "" for empty/None input
+    """
+    if not street:
+        return ""
+
+    cleaned = re.sub(r"[.,']", "", street.lower().strip())
+    cleaned = re.sub(r"\s+", " ", cleaned)
+
+    # Words without a table entry pass through unchanged.
+    return " ".join(SUFFIX_TO_FULL.get(token, token) for token in cleaned.split())
+
+
+def extract_street_number(address: str) -> Tuple[Optional[str], str]:
+    """
+    Split a leading house number off an address string.
+
+    The number is a run of digits optionally followed by one letter ("123A")
+    and must be separated from the rest by whitespace.
+
+    Args:
+        address: Full address like "123 Main Street"
+
+    Returns:
+        Tuple of (street_number, remaining_address); street_number is None
+        when the trimmed address does not start with a number, and the result
+        is (None, "") for empty/None input
+    """
+    if not address:
+        return None, ""
+
+    trimmed = address.strip()
+    parts = re.match(r"^(\d+[A-Za-z]?)\s+(.+)$", trimmed)
+    return (parts.group(1), parts.group(2)) if parts else (None, trimmed)
+
+
+# USPS two-letter code -> full name.  Built once at import time instead of on
+# every get_state_name call.
+_STATE_NAMES = {
+    "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas",
+    "CA": "California", "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware",
+    "DC": "District of Columbia", "FL": "Florida", "GA": "Georgia", "HI": "Hawaii",
+    "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa",
+    "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine",
+    "MD": "Maryland", "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota",
+    "MS": "Mississippi", "MO": "Missouri", "MT": "Montana", "NE": "Nebraska",
+    "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico",
+    "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio",
+    "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island",
+    "SC": "South Carolina", "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas",
+    "UT": "Utah", "VT": "Vermont", "VA": "Virginia", "WA": "Washington",
+    "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming",
+    "PR": "Puerto Rico", "VI": "Virgin Islands", "GU": "Guam", "AS": "American Samoa",
+    "MP": "Northern Mariana Islands",
+}
+
+
+def get_state_name(state_abbr: str) -> str:
+    """
+    Get full state name from abbreviation for Overpass query.
+
+    The lookup ignores case and surrounding whitespace.
+
+    Args:
+        state_abbr: 2-letter state abbreviation
+
+    Returns:
+        Full state name; the input unchanged when the code is not recognized
+        (so a caller may also pass a full name through); "" for empty/None
+    """
+    if not state_abbr:
+        return ""
+    return _STATE_NAMES.get(state_abbr.strip().upper(), state_abbr)
+
+
+def _escape_overpass_string(value: str) -> str:
+    """Escape *value* for use inside a double-quoted Overpass QL string."""
+    return value.replace("\\", "\\\\").replace('"', '\\"')
+
+
+def _build_overpass_queries(town: str, state: str) -> List[Tuple[str, str]]:
+    """Return the (label, Overpass QL) pairs to try, in priority order."""
+    state_code = state.strip().upper()
+    town_q = _escape_overpass_string(town.strip())
+    state_name_q = _escape_overpass_string(get_state_name(state_code))
+    state_code_q = _escape_overpass_string(state_code)
+
+    # Look up the state area by name, then a named area for the town inside it.
+    simple_query = f"""
+    [out:json][timeout:60];
+    area["name"="{state_name_q}"]["boundary"="administrative"]["admin_level"="4"]->.state;
+    area["name"="{town_q}"](area.state)->.city;
+    way["highway"]["name"](area.city);
+    out tags;
+    """
+
+    # More explicit alternative: find the state by ISO code, then convert any
+    # matching boundary/place object for the town into an area.
+    fallback_query = f"""
+    [out:json][timeout:120];
+
+    // Find state by ISO code
+    area["ISO3166-2"="US-{state_code_q}"]->.state;
+
+    // Find city/town within state
+    (
+      relation["name"="{town_q}"]["type"="boundary"](area.state);
+      way["name"="{town_q}"]["place"](area.state);
+      node["name"="{town_q}"]["place"](area.state);
+    );
+    map_to_area->.city;
+
+    // Get streets
+    way["highway"]["name"](area.city);
+    out tags;
+    """
+
+    # NOTE: no {{geocodeArea:...}} query here.  That macro is expanded by the
+    # overpass-turbo web client, not by the Overpass API interpreter, so
+    # sending it to these endpoints only produced a failing request.
+    return [("simple", simple_query), ("fallback", fallback_query)]
+
+
+def _unique_named_streets(elements: List[dict]) -> List[dict]:
+    """Collapse Overpass elements to one record per distinct street name."""
+    streets = []
+    seen_names = set()
+
+    for element in elements:
+        tags = element.get("tags", {})
+        name = tags.get("name")
+
+        # A street is usually split into many ways; keep the first one seen
+        # for each name (compared case-insensitively).
+        if not name or name.lower() in seen_names:
+            continue
+
+        seen_names.add(name.lower())
+        streets.append({
+            "name": name,
+            "osm_id": str(element.get("id", "")),
+            "highway_type": tags.get("highway", ""),
+        })
+
+    return streets
+
+
+def fetch_streets_from_osm(town: str, state: str) -> Tuple[List[dict], str]:
+    """
+    Fetch all streets in a town from OpenStreetMap using Overpass API.
+
+    Every endpoint in OVERPASS_API_URLS is tried in order, and each query
+    variant is tried on it, until one returns a non-empty result.  Town and
+    state are escaped before being placed in the query text.  This call can
+    block for a long time: each request may take up to 120 seconds and a
+    rate-limit response (HTTP 429) is followed by a 30 second sleep.
+
+    Args:
+        town: Town/city name
+        state: 2-letter state abbreviation
+
+    Returns:
+        Tuple of (list of street dicts, error message or empty string).
+        Each dict has the keys "name", "osm_id" and "highway_type".  On
+        failure the list is empty and the message names the last problem
+        encountered.
+    """
+    state_name = get_state_name(state)
+    queries = _build_overpass_queries(town, state)
+
+    logger.info(f"Fetching streets from OSM for {town}, {state_name}")
+
+    last_error = ""
+
+    for api_url in OVERPASS_API_URLS:
+        host = api_url.split('/')[2]
+
+        for q_name, q in queries:
+            try:
+                logger.info(f"Trying {q_name} query on {host}...")
+                logger.debug(f"Query: {q}")
+
+                response = requests.post(
+                    api_url,
+                    data={"data": q},
+                    timeout=120,
+                    headers={"User-Agent": "EamcoAddressChecker/1.0"}
+                )
+
+                if response.status_code == 429:
+                    last_error = f"Rate limited by {host}"
+                    logger.warning("Rate limited, waiting 30s...")
+                    time.sleep(30)
+                    continue
+
+                if response.status_code == 504:
+                    last_error = f"Gateway timeout on {q_name} query ({host})"
+                    logger.warning(f"Timeout on {q_name} query, trying next...")
+                    continue
+
+                response.raise_for_status()
+
+                data = response.json()
+                elements = data.get("elements", [])
+
+                if not elements:
+                    # Record why this attempt produced nothing so the final
+                    # error is never blank.  Overpass is expected to put
+                    # runtime problems in a "remark" field - TODO confirm.
+                    last_error = (
+                        data.get("remark")
+                        or f"No results from {q_name} query on {host}"
+                    )
+                    logger.debug(f"No results from {q_name} query")
+                    continue
+
+                logger.info(f"Success with {q_name} query: {len(elements)} street segments")
+                streets = _unique_named_streets(elements)
+                logger.info(f"Extracted {len(streets)} unique street names")
+                return streets, ""
+
+            except requests.exceptions.Timeout:
+                last_error = f"Timeout on {api_url}"
+                logger.warning(last_error)
+                continue
+
+            except requests.exceptions.RequestException as e:
+                last_error = f"Request error: {str(e)}"
+                logger.warning(last_error)
+                continue
+
+            except Exception as e:
+                # Best effort: a malformed response from one endpoint must not
+                # stop the remaining endpoints from being tried.
+                last_error = f"Error: {str(e)}"
+                logger.warning(last_error)
+                continue
+
+    # All attempts failed
+    error = f"All Overpass queries failed for {town}, {state}. Last error: {last_error}"
+    logger.error(error)
+    return [], error
+
+
+def populate_streets_for_town(
+    session: Session,
+    town: str,
+    state: str,
+    clear_existing: bool = False
+) -> FetchResult:
+    """
+    Fetch streets from OSM and populate the StreetReference table.
+
+    The OSM fetch runs before anything is deleted, and the optional clear is
+    committed together with the inserts, so a failed fetch or a failed write
+    leaves the town's existing reference rows in place.
+
+    Args:
+        session: SQLAlchemy session
+        town: Town/city name
+        state: 2-letter state abbreviation
+        clear_existing: If True, replace the town's existing streets with the
+            fetched ones; nothing is deleted when the fetch returns no streets
+
+    Returns:
+        FetchResult with statistics
+
+    Raises:
+        Exception: Any error raised while deleting, inserting or committing
+            is re-raised after the session has been rolled back.
+    """
+    state = state.upper()
+    town_normalized = town.lower().strip()
+    errors = []
+
+    logger.info(f"Populating streets for {town}, {state}")
+
+    # Fetch first: clearing before a fetch that then fails would wipe the
+    # town's reference data with nothing to replace it.
+    streets, error = fetch_streets_from_osm(town, state)
+
+    if error:
+        errors.append(error)
+
+    if not streets:
+        return FetchResult(
+            success=len(errors) == 0,
+            streets_added=0,
+            streets_updated=0,
+            total_found=0,
+            message=f"No streets found for {town}, {state}",
+            errors=errors,
+        )
+
+    town_filter = (
+        StreetReference.town_normalized == town_normalized,
+        StreetReference.state == state,
+    )
+
+    try:
+        if clear_existing:
+            deleted = session.query(StreetReference).filter(
+                *town_filter
+            ).delete(synchronize_session=False)
+            logger.info(f"Cleared {deleted} existing street records")
+            existing_names = set()
+        else:
+            # Check for existing streets to avoid duplicates
+            existing_streets = session.query(StreetReference).filter(
+                *town_filter
+            ).all()
+            existing_names = {s.street_name_normalized for s in existing_streets}
+
+        added = 0
+        now = datetime.utcnow()
+
+        for street_data in streets:
+            name = street_data["name"]
+            name_normalized = normalize_street_name(name)
+
+            # Different spellings can normalize to the same key
+            # ("Main St" / "Main Street"); store only the first.
+            if name_normalized in existing_names:
+                continue
+
+            session.add(StreetReference(
+                street_name=name,
+                street_name_normalized=name_normalized,
+                town=town,
+                town_normalized=town_normalized,
+                state=state,
+                osm_id=street_data.get("osm_id"),
+                created_at=now,
+            ))
+            existing_names.add(name_normalized)
+            added += 1
+
+        session.commit()
+    except Exception:
+        # Undo the pending delete/inserts so the session stays usable.
+        session.rollback()
+        raise
+
+    logger.info(f"Added {added} new streets for {town}, {state}")
+
+    return FetchResult(
+        success=True,
+        streets_added=added,
+        streets_updated=0,
+        total_found=len(streets),
+        message=f"Successfully added {added} streets for {town}, {state}",
+        errors=errors,
+    )
+
+
+def find_matching_street(
+    session: Session,
+    street_input: str,
+    town: str,
+    state: str,
+    min_confidence: float = 70.0
+) -> Optional[StreetMatch]:
+    """
+    Find the best matching street for a potentially misspelled input.
+
+    The input is normalized and compared against every reference street
+    stored for the town.  Each candidate's score is the highest of four
+    rapidfuzz scorers (ratio, partial_ratio, token_sort_ratio,
+    token_set_ratio); on a tie the first candidate returned by the query wins.
+
+    Args:
+        session: SQLAlchemy session
+        street_input: The street name to match (may be misspelled)
+        town: Town/city to search within
+        state: State abbreviation
+        min_confidence: Minimum match confidence (0-100)
+
+    Returns:
+        StreetMatch if found above threshold, None otherwise (including when
+        the input normalizes to an empty string or the town has no reference
+        streets)
+    """
+    state = state.upper()
+    town_normalized = town.lower().strip()
+
+    # Normalize the input for matching
+    input_normalized = normalize_street_name(street_input)
+
+    # An empty query cannot identify a street; skip the lookup and scoring.
+    if not input_normalized:
+        logger.debug(f"Nothing to match: '{street_input}' normalizes to an empty street name")
+        return None
+
+    # Get all streets for this town
+    streets = session.query(StreetReference).filter(
+        StreetReference.town_normalized == town_normalized,
+        StreetReference.state == state
+    ).all()
+
+    if not streets:
+        logger.debug(f"No reference streets found for {town}, {state}")
+        return None
+
+    best_match = None
+    best_score = 0
+
+    for street_obj in streets:
+        normalized_name = street_obj.street_name_normalized
+
+        # Try multiple scoring methods and take the best.
+        # NOTE(review): taking the max looks generous for short inputs, since
+        # the partial/token-set scorers reward containment - confirm the
+        # thresholds against real data.
+        score = max(
+            fuzz.ratio(input_normalized, normalized_name),
+            fuzz.partial_ratio(input_normalized, normalized_name),
+            fuzz.token_sort_ratio(input_normalized, normalized_name),
+            fuzz.token_set_ratio(input_normalized, normalized_name),
+        )
+
+        # Strict comparison keeps the earliest candidate on ties.
+        if score > best_score:
+            best_score = score
+            best_match = street_obj
+
+    if best_match and best_score >= min_confidence:
+        logger.info(
+            f"Fuzzy match: '{street_input}' -> '{best_match.street_name}' "
+            f"(confidence: {best_score:.1f}%)"
+        )
+        return StreetMatch(
+            original_street=street_input,
+            matched_street=best_match.street_name,
+            confidence_score=best_score,
+            town=best_match.town,
+            state=best_match.state,
+            street_ref_id=best_match.id,
+        )
+
+    logger.debug(
+        f"No confident match for '{street_input}' "
+        f"(best: {best_score:.1f}%, threshold: {min_confidence}%)"
+    )
+    return None
+
+
+def correct_address(
+    session: Session,
+    full_address: str,
+    town: str,
+    state: str,
+    min_confidence: float = 75.0
+) -> Optional[StreetMatch]:
+    """
+    Correct the street portion of an address via fuzzy matching.
+
+    Any leading house number is split off, the remaining street text is
+    matched against the town's reference streets, and the number is put back
+    in front of the matched street name.
+
+    Args:
+        session: SQLAlchemy session
+        full_address: Full street address (e.g., "123 Mian St")
+        town: Town/city name
+        state: State abbreviation
+        min_confidence: Minimum match confidence
+
+    Returns:
+        StreetMatch with corrected_address set if a match was found, None
+        otherwise
+    """
+    house_number, street_part = extract_street_number(full_address)
+
+    # Nothing left to match once the number is removed.
+    if not street_part:
+        return None
+
+    result = find_matching_street(
+        session=session,
+        street_input=street_part,
+        town=town,
+        state=state,
+        min_confidence=min_confidence,
+    )
+
+    if not result:
+        return result
+
+    # Re-attach the house number, when the input had one.
+    result.corrected_address = (
+        f"{house_number} {result.matched_street}"
+        if house_number
+        else result.matched_street
+    )
+
+    logger.info(
+        f"Address correction: '{full_address}' -> '{result.corrected_address}'"
+    )
+
+    return result
+
+
+def get_town_street_count(session: Session, town: str, state: str) -> int:
+    """
+    Count the reference streets stored for a town.
+
+    The town is compared lowercased and trimmed, the state uppercased.
+
+    Args:
+        session: SQLAlchemy session
+        town: Town/city name
+        state: State abbreviation
+
+    Returns:
+        Number of streets in the reference table
+    """
+    town_key = town.lower().strip()
+    state_key = state.upper()
+
+    town_rows = session.query(StreetReference).filter(
+        StreetReference.town_normalized == town_key,
+        StreetReference.state == state_key,
+    )
+    return town_rows.count()