""" Street reference tools for address correction. This module provides functionality to: 1. Fetch streets from OpenStreetMap Overpass API for a given town/state 2. Store streets in the StreetReference table 3. Perform fuzzy matching to correct misspelled addresses The fuzzy matching handles common issues like: - Misspelled street names ("Mian St" -> "Main St") - Wrong suffixes ("Main Rd" -> "Main St") - Missing/extra spaces - Abbreviated vs full names ("St" vs "Street") """ import logging import re import time from dataclasses import dataclass from datetime import datetime from typing import List, Optional, Tuple import requests from rapidfuzz import fuzz, process from sqlalchemy.orm import Session from app.config import STATE_MAPPING from app.models import StreetReference logger = logging.getLogger(__name__) # Overpass API endpoints (multiple for fallback) OVERPASS_API_URLS = [ "https://overpass-api.de/api/interpreter", "https://overpass.kumi.systems/api/interpreter", "https://maps.mail.ru/osm/tools/overpass/api/interpreter", ] # Common street suffix variations for normalization STREET_SUFFIXES = { # Standard -> variations "street": ["st", "str", "strt"], "avenue": ["ave", "av", "aven"], "road": ["rd", "rod"], "drive": ["dr", "drv", "driv"], "lane": ["ln", "lne"], "court": ["ct", "crt", "cour"], "circle": ["cir", "circ", "crcl"], "boulevard": ["blvd", "boul", "blv"], "place": ["pl", "plc"], "terrace": ["ter", "terr", "trc"], "way": ["wy"], "highway": ["hwy", "hiway", "hgwy"], "parkway": ["pkwy", "pky", "pkway"], "square": ["sq", "sqr"], "trail": ["trl", "tr"], "crossing": ["xing", "crssng"], "heights": ["hts", "hgts"], "point": ["pt", "pnt"], "ridge": ["rdg", "rdge"], "valley": ["vly", "vlly"], "view": ["vw", "viw"], "center": ["ctr", "cntr", "centre"], "north": ["n"], "south": ["s"], "east": ["e"], "west": ["w"], "northeast": ["ne"], "northwest": ["nw"], "southeast": ["se"], "southwest": ["sw"], } # Build reverse lookup: abbreviation -> full form SUFFIX_TO_FULL = {} for full, abbrevs in STREET_SUFFIXES.items(): for abbr in abbrevs: SUFFIX_TO_FULL[abbr] = full SUFFIX_TO_FULL[full] = full # Also map full to itself @dataclass class StreetMatch: """Result of fuzzy street matching.""" original_street: str matched_street: str confidence_score: float town: str state: str street_ref_id: int corrected_address: Optional[str] = None @dataclass class FetchResult: """Result of fetching streets from OSM.""" success: bool streets_added: int streets_updated: int total_found: int message: str errors: List[str] def normalize_street_name(street: str) -> str: """ Normalize a street name for fuzzy matching. - Lowercase - Remove extra whitespace - Expand common abbreviations to full form - Remove punctuation Args: street: Raw street name Returns: Normalized street name """ if not street: return "" # Lowercase and strip normalized = street.lower().strip() # Remove punctuation except hyphens normalized = re.sub(r"[.,']", "", normalized) # Normalize whitespace normalized = re.sub(r"\s+", " ", normalized) # Split into words and expand abbreviations words = normalized.split() expanded_words = [] for word in words: if word in SUFFIX_TO_FULL: expanded_words.append(SUFFIX_TO_FULL[word]) else: expanded_words.append(word) return " ".join(expanded_words) def extract_street_number(address: str) -> Tuple[Optional[str], str]: """ Extract street number from an address string. Args: address: Full address like "123 Main Street" Returns: Tuple of (street_number, remaining_address) """ if not address: return None, "" # Match leading number (possibly with letter suffix like "123A") match = re.match(r"^(\d+[A-Za-z]?)\s+(.+)$", address.strip()) if match: return match.group(1), match.group(2) return None, address.strip() def get_state_name(state_abbr: str) -> str: """ Get full state name from abbreviation for Overpass query. Args: state_abbr: 2-letter state abbreviation Returns: Full state name """ state_names = { "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas", "CA": "California", "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware", "DC": "District of Columbia", "FL": "Florida", "GA": "Georgia", "HI": "Hawaii", "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa", "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland", "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi", "MO": "Missouri", "MT": "Montana", "NE": "Nebraska", "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico", "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio", "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina", "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VT": "Vermont", "VA": "Virginia", "WA": "Washington", "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming", "PR": "Puerto Rico", "VI": "Virgin Islands", "GU": "Guam", "AS": "American Samoa", } return state_names.get(state_abbr.upper(), state_abbr) def fetch_streets_from_osm(town: str, state: str) -> Tuple[List[dict], str]: """ Fetch all streets in a town from OpenStreetMap using Overpass API. Args: town: Town/city name state: 2-letter state abbreviation Returns: Tuple of (list of street dicts, error message or empty string) """ state_name = get_state_name(state) state_upper = state.upper() # Simpler, more reliable Overpass query # Uses geocodeArea which is optimized for place lookups query = f""" [out:json][timeout:120]; // Use geocodeArea for reliable city lookup with state context {{geocodeArea:{town}, {state_name}, United States}}->.city; // Get all named streets in the city way["highway"]["name"](area.city); out tags; """ # Alternative query if geocodeArea fails (more explicit) fallback_query = f""" [out:json][timeout:120]; // Find state by ISO code area["ISO3166-2"="US-{state_upper}"]->.state; // Find city/town within state ( relation["name"="{town}"]["type"="boundary"](area.state); way["name"="{town}"]["place"](area.state); node["name"="{town}"]["place"](area.state); ); map_to_area->.city; // Get streets way["highway"]["name"](area.city); out tags; """ # Most reliable: search by name within bounding box of state # This uses Nominatim-style search which is very reliable simple_query = f""" [out:json][timeout:60]; area["name"="{state_name}"]["boundary"="administrative"]["admin_level"="4"]->.state; area["name"="{town}"](area.state)->.city; way["highway"]["name"](area.city); out tags; """ queries = [simple_query, query, fallback_query] query_names = ["simple", "geocodeArea", "fallback"] logger.info(f"Fetching streets from OSM for {town}, {state_name}") last_error = "" for api_url in OVERPASS_API_URLS: for q, q_name in zip(queries, query_names): try: logger.info(f"Trying {q_name} query on {api_url.split('/')[2]}...") logger.debug(f"Query: {q}") response = requests.post( api_url, data={"data": q}, timeout=120, headers={"User-Agent": "EamcoAddressChecker/1.0"} ) if response.status_code == 429: logger.warning("Rate limited, waiting 30s...") time.sleep(30) continue if response.status_code == 504: logger.warning(f"Timeout on {q_name} query, trying next...") continue response.raise_for_status() data = response.json() elements = data.get("elements", []) if elements: logger.info(f"Success with {q_name} query: {len(elements)} street segments") # Process and return results streets = [] seen_names = set() for element in elements: tags = element.get("tags", {}) name = tags.get("name") if name and name.lower() not in seen_names: seen_names.add(name.lower()) streets.append({ "name": name, "osm_id": str(element.get("id", "")), "highway_type": tags.get("highway", ""), }) logger.info(f"Extracted {len(streets)} unique street names") return streets, "" else: logger.debug(f"No results from {q_name} query") except requests.exceptions.Timeout: last_error = f"Timeout on {api_url}" logger.warning(last_error) continue except requests.exceptions.RequestException as e: last_error = f"Request error: {str(e)}" logger.warning(last_error) continue except Exception as e: last_error = f"Error: {str(e)}" logger.warning(last_error) continue # All attempts failed error = f"All Overpass queries failed for {town}, {state}. Last error: {last_error}" logger.error(error) return [], error def populate_streets_for_town( session: Session, town: str, state: str, clear_existing: bool = False ) -> FetchResult: """ Fetch streets from OSM and populate the StreetReference table. Args: session: SQLAlchemy session town: Town/city name state: 2-letter state abbreviation clear_existing: If True, delete existing streets for this town first Returns: FetchResult with statistics """ state = state.upper() town_normalized = town.lower().strip() errors = [] logger.info(f"Populating streets for {town}, {state}") # Optionally clear existing streets for this town if clear_existing: deleted = session.query(StreetReference).filter( StreetReference.town_normalized == town_normalized, StreetReference.state == state ).delete(synchronize_session=False) session.commit() logger.info(f"Cleared {deleted} existing street records") # Fetch from OSM streets, error = fetch_streets_from_osm(town, state) if error: errors.append(error) if not streets: return FetchResult( success=len(errors) == 0, streets_added=0, streets_updated=0, total_found=0, message=f"No streets found for {town}, {state}", errors=errors, ) # Check for existing streets to avoid duplicates existing_streets = session.query(StreetReference).filter( StreetReference.town_normalized == town_normalized, StreetReference.state == state ).all() existing_names = {s.street_name_normalized for s in existing_streets} added = 0 now = datetime.utcnow() for street_data in streets: name = street_data["name"] name_normalized = normalize_street_name(name) if name_normalized in existing_names: continue street_ref = StreetReference( street_name=name, street_name_normalized=name_normalized, town=town, town_normalized=town_normalized, state=state, osm_id=street_data.get("osm_id"), created_at=now, ) session.add(street_ref) existing_names.add(name_normalized) added += 1 session.commit() logger.info(f"Added {added} new streets for {town}, {state}") return FetchResult( success=True, streets_added=added, streets_updated=0, total_found=len(streets), message=f"Successfully added {added} streets for {town}, {state}", errors=errors, ) def find_matching_street( session: Session, street_input: str, town: str, state: str, min_confidence: float = 70.0 ) -> Optional[StreetMatch]: """ Find the best matching street for a potentially misspelled input. Uses fuzzy string matching with rapidfuzz to find the closest match in the StreetReference table. Args: session: SQLAlchemy session street_input: The street name to match (may be misspelled) town: Town/city to search within state: State abbreviation min_confidence: Minimum match confidence (0-100) Returns: StreetMatch if found above threshold, None otherwise """ state = state.upper() town_normalized = town.lower().strip() # Normalize the input for matching input_normalized = normalize_street_name(street_input) # Get all streets for this town streets = session.query(StreetReference).filter( StreetReference.town_normalized == town_normalized, StreetReference.state == state ).all() if not streets: logger.debug(f"No reference streets found for {town}, {state}") return None # Build list of (normalized_name, street_object) for matching choices = [(s.street_name_normalized, s) for s in streets] # Use rapidfuzz to find best match # We use token_set_ratio which handles word order differences well best_match = None best_score = 0 for normalized_name, street_obj in choices: # Try multiple scoring methods and take the best scores = [ fuzz.ratio(input_normalized, normalized_name), fuzz.partial_ratio(input_normalized, normalized_name), fuzz.token_sort_ratio(input_normalized, normalized_name), fuzz.token_set_ratio(input_normalized, normalized_name), ] score = max(scores) if score > best_score: best_score = score best_match = street_obj if best_match and best_score >= min_confidence: logger.info( f"Fuzzy match: '{street_input}' -> '{best_match.street_name}' " f"(confidence: {best_score:.1f}%)" ) return StreetMatch( original_street=street_input, matched_street=best_match.street_name, confidence_score=best_score, town=best_match.town, state=best_match.state, street_ref_id=best_match.id, ) logger.debug( f"No confident match for '{street_input}' " f"(best: {best_score:.1f}%, threshold: {min_confidence}%)" ) return None def correct_address( session: Session, full_address: str, town: str, state: str, min_confidence: float = 75.0 ) -> Optional[StreetMatch]: """ Attempt to correct a full address using fuzzy street matching. Extracts the street portion, finds a match, and returns a corrected address with the matched street name. Args: session: SQLAlchemy session full_address: Full street address (e.g., "123 Mian St") town: Town/city name state: State abbreviation min_confidence: Minimum match confidence Returns: StreetMatch with corrected_address if match found, None otherwise """ # Extract street number and street name street_number, street_name = extract_street_number(full_address) if not street_name: return None # Find matching street match = find_matching_street( session=session, street_input=street_name, town=town, state=state, min_confidence=min_confidence, ) if match: # Build corrected address if street_number: match.corrected_address = f"{street_number} {match.matched_street}" else: match.corrected_address = match.matched_street logger.info( f"Address correction: '{full_address}' -> '{match.corrected_address}'" ) return match def get_town_street_count(session: Session, town: str, state: str) -> int: """ Get the number of streets in the reference table for a town. Args: session: SQLAlchemy session town: Town/city name state: State abbreviation Returns: Number of streets in the reference table """ return session.query(StreetReference).filter( StreetReference.town_normalized == town.lower().strip(), StreetReference.state == state.upper() ).count()