573 lines
17 KiB
Python
573 lines
17 KiB
Python
"""
|
|
Street reference tools for address correction.
|
|
|
|
This module provides functionality to:
|
|
1. Fetch streets from OpenStreetMap Overpass API for a given town/state
|
|
2. Store streets in the StreetReference table
|
|
3. Perform fuzzy matching to correct misspelled addresses
|
|
|
|
The fuzzy matching handles common issues like:
|
|
- Misspelled street names ("Mian St" -> "Main St")
|
|
- Wrong suffixes ("Main Rd" -> "Main St")
|
|
- Missing/extra spaces
|
|
- Abbreviated vs full names ("St" vs "Street")
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
import time
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from typing import List, Optional, Tuple
|
|
|
|
import requests
|
|
from rapidfuzz import fuzz, process
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.config import STATE_MAPPING
|
|
from app.models import StreetReference
|
|
|
|
logger = logging.getLogger(__name__)

# Public Overpass interpreter endpoints, listed in the order they are tried.
OVERPASS_API_URLS = [
    "https://overpass-api.de/api/interpreter",
    "https://overpass.kumi.systems/api/interpreter",
    "https://maps.mail.ru/osm/tools/overpass/api/interpreter",
]

# Canonical (full) word -> abbreviations/misspellings that should expand to it.
# Covers street-type suffixes as well as compass directionals.
STREET_SUFFIXES = {
    # Street types
    "street": ["st", "str", "strt"],
    "avenue": ["ave", "av", "aven"],
    "road": ["rd", "rod"],
    "drive": ["dr", "drv", "driv"],
    "lane": ["ln", "lne"],
    "court": ["ct", "crt", "cour"],
    "circle": ["cir", "circ", "crcl"],
    "boulevard": ["blvd", "boul", "blv"],
    "place": ["pl", "plc"],
    "terrace": ["ter", "terr", "trc"],
    "way": ["wy"],
    "highway": ["hwy", "hiway", "hgwy"],
    "parkway": ["pkwy", "pky", "pkway"],
    "square": ["sq", "sqr"],
    "trail": ["trl", "tr"],
    "crossing": ["xing", "crssng"],
    "heights": ["hts", "hgts"],
    "point": ["pt", "pnt"],
    "ridge": ["rdg", "rdge"],
    "valley": ["vly", "vlly"],
    "view": ["vw", "viw"],
    "center": ["ctr", "cntr", "centre"],
    # Directionals
    "north": ["n"],
    "south": ["s"],
    "east": ["e"],
    "west": ["w"],
    "northeast": ["ne"],
    "northwest": ["nw"],
    "southeast": ["se"],
    "southwest": ["sw"],
}

# Reverse index used during normalization: every abbreviation points at its
# canonical word, and each canonical word points at itself.
SUFFIX_TO_FULL = {}
for full, abbrevs in STREET_SUFFIXES.items():
    SUFFIX_TO_FULL.update({abbr: full for abbr in abbrevs})
    SUFFIX_TO_FULL[full] = full
|
|
|
|
|
|
@dataclass
class StreetMatch:
    """Outcome of matching one input street against the reference table.

    Attributes:
        original_street: Street text exactly as supplied by the caller.
        matched_street: Name of the reference street chosen as the match.
        confidence_score: Fuzzy-match score on a 0-100 scale.
        town: Town recorded on the matched reference row.
        state: State recorded on the matched reference row.
        street_ref_id: Identifier of the matched StreetReference row.
        corrected_address: Rebuilt address string; stays None until a
            caller fills it in.
    """

    original_street: str
    matched_street: str
    confidence_score: float
    town: str
    state: str
    street_ref_id: int
    corrected_address: Optional[str] = None
|
|
|
|
|
|
@dataclass
class FetchResult:
    """Summary of one attempt to load a town's streets from OSM.

    Attributes:
        success: Whether the operation finished without recorded errors.
        streets_added: Number of new reference rows inserted.
        streets_updated: Number of existing rows updated.
        total_found: Number of unique street names returned by OSM.
        message: Human-readable summary of the outcome.
        errors: Error messages collected along the way (may be empty).
    """

    success: bool
    streets_added: int
    streets_updated: int
    total_found: int
    message: str
    errors: List[str]
|
|
|
|
|
|
def normalize_street_name(street: str) -> str:
    """
    Produce the canonical form of a street name used for fuzzy comparison.

    The input is lowercased and trimmed, periods/commas/apostrophes are
    dropped (other characters, including hyphens, are left alone), runs of
    whitespace collapse to a single space, and every word that appears in
    SUFFIX_TO_FULL is replaced by its full form.

    Args:
        street: Raw street name

    Returns:
        Normalized street name, or "" for empty/None input
    """
    if not street:
        return ""

    # Case-fold and trim, then strip the three punctuation marks.
    cleaned = re.sub(r"[.,']", "", street.lower().strip())

    # Collapse any whitespace run into one space.
    cleaned = re.sub(r"\s+", " ", cleaned)

    # Expand known abbreviations word by word; unknown words pass through.
    return " ".join(
        SUFFIX_TO_FULL[token] if token in SUFFIX_TO_FULL else token
        for token in cleaned.split()
    )
|
|
|
|
|
|
def extract_street_number(address: str) -> Tuple[Optional[str], str]:
    """
    Split a leading house number off an address string.

    A house number is a run of digits, optionally followed by a single
    letter (e.g. "123A"), that is separated from the rest by whitespace.

    Args:
        address: Full address like "123 Main Street"

    Returns:
        Tuple of (street_number, remaining_address). street_number is None
        when the address does not start with a house number; in that case
        remaining_address is the whole trimmed input.
    """
    if not address:
        return None, ""

    trimmed = address.strip()
    parsed = re.match(r"^(\d+[A-Za-z]?)\s+(.+)$", trimmed)
    if parsed is None:
        # No leading number: hand back the trimmed text untouched.
        return None, trimmed

    number, remainder = parsed.groups()
    return number, remainder
|
|
|
|
|
|
def get_state_name(state_abbr: str) -> str:
    """
    Translate a US state/territory postal code into its full name.

    The lookup is case-insensitive. Codes that are not in the table are
    returned exactly as they were passed in.

    Args:
        state_abbr: 2-letter state abbreviation

    Returns:
        Full state name, or the original argument if it is not recognized
    """
    state_names = {
        # States and DC
        "AL": "Alabama",
        "AK": "Alaska",
        "AZ": "Arizona",
        "AR": "Arkansas",
        "CA": "California",
        "CO": "Colorado",
        "CT": "Connecticut",
        "DE": "Delaware",
        "DC": "District of Columbia",
        "FL": "Florida",
        "GA": "Georgia",
        "HI": "Hawaii",
        "ID": "Idaho",
        "IL": "Illinois",
        "IN": "Indiana",
        "IA": "Iowa",
        "KS": "Kansas",
        "KY": "Kentucky",
        "LA": "Louisiana",
        "ME": "Maine",
        "MD": "Maryland",
        "MA": "Massachusetts",
        "MI": "Michigan",
        "MN": "Minnesota",
        "MS": "Mississippi",
        "MO": "Missouri",
        "MT": "Montana",
        "NE": "Nebraska",
        "NV": "Nevada",
        "NH": "New Hampshire",
        "NJ": "New Jersey",
        "NM": "New Mexico",
        "NY": "New York",
        "NC": "North Carolina",
        "ND": "North Dakota",
        "OH": "Ohio",
        "OK": "Oklahoma",
        "OR": "Oregon",
        "PA": "Pennsylvania",
        "RI": "Rhode Island",
        "SC": "South Carolina",
        "SD": "South Dakota",
        "TN": "Tennessee",
        "TX": "Texas",
        "UT": "Utah",
        "VT": "Vermont",
        "VA": "Virginia",
        "WA": "Washington",
        "WV": "West Virginia",
        "WI": "Wisconsin",
        "WY": "Wyoming",
        # Territories
        "PR": "Puerto Rico",
        "VI": "Virgin Islands",
        "GU": "Guam",
        "AS": "American Samoa",
    }

    code = state_abbr.upper()
    if code in state_names:
        return state_names[code]
    # Unknown code: pass the caller's value through unchanged.
    return state_abbr
|
|
|
|
|
|
def _escape_overpass_string(value: str) -> str:
    """Escape backslashes and double quotes for a double-quoted Overpass QL string."""
    return value.replace("\\", "\\\\").replace('"', '\\"')


def _build_overpass_queries(town: str, state_name: str, state_code: str) -> List[Tuple[str, str]]:
    """Return the (label, Overpass QL) pairs to attempt, in order of preference."""
    town_q = _escape_overpass_string(town.strip())
    state_name_q = _escape_overpass_string(state_name)
    state_code_q = _escape_overpass_string(state_code)

    # Look the town up as an area nested inside the state's admin_level=4 area.
    simple_query = (
        "[out:json][timeout:60];\n"
        f'area["name"="{state_name_q}"]["boundary"="administrative"]["admin_level"="4"]->.state;\n'
        f'area["name"="{town_q}"](area.state)->.city;\n'
        'way["highway"]["name"](area.city);\n'
        "out tags;\n"
    )

    # More explicit variant: locate the state by ISO code, find any
    # boundary/place object carrying the town's name, and convert it to an area.
    fallback_query = (
        "[out:json][timeout:120];\n"
        f'area["ISO3166-2"="US-{state_code_q}"]->.state;\n'
        "(\n"
        f'  relation["name"="{town_q}"]["type"="boundary"](area.state);\n'
        f'  way["name"="{town_q}"]["place"](area.state);\n'
        f'  node["name"="{town_q}"]["place"](area.state);\n'
        ");\n"
        "map_to_area->.city;\n"
        'way["highway"]["name"](area.city);\n'
        "out tags;\n"
    )

    # NOTE: an earlier "geocodeArea" attempt was removed. `{{geocodeArea:...}}`
    # is an Overpass Turbo front-end macro, not Overpass QL, and the f-string
    # collapsed the doubled braces to single ones anyway, so the interpreter
    # could only ever reject it.
    return [("simple", simple_query), ("fallback", fallback_query)]


def _unique_streets(elements: List[dict]) -> List[dict]:
    """Collapse OSM way elements into one entry per case-insensitive street name."""
    streets = []
    seen_names = set()
    for element in elements:
        tags = element.get("tags", {})
        name = tags.get("name")
        if not name or name.lower() in seen_names:
            continue
        seen_names.add(name.lower())
        streets.append({
            "name": name,
            "osm_id": str(element.get("id", "")),
            "highway_type": tags.get("highway", ""),
        })
    return streets


def fetch_streets_from_osm(town: str, state: str) -> Tuple[List[dict], str]:
    """
    Fetch all named streets in a town from OpenStreetMap via the Overpass API.

    Each endpoint in OVERPASS_API_URLS is tried in turn with each query
    variant; the first response that contains any elements wins. Failures
    (HTTP errors, timeouts, rate limiting, empty results) are logged and the
    next attempt is made; the most recent failure is reported if nothing
    succeeds.

    Args:
        town: Town/city name
        state: 2-letter state abbreviation

    Returns:
        Tuple of (list of street dicts, error message or empty string).
        Each street dict has the keys "name", "osm_id" and "highway_type".
        On total failure the list is empty and the error message is set.
    """
    state_name = get_state_name(state)
    queries = _build_overpass_queries(town, state_name, state.upper())

    logger.info(f"Fetching streets from OSM for {town}, {state_name}")

    last_error = ""

    for api_url in OVERPASS_API_URLS:
        host = api_url.split('/')[2]
        for q_name, q in queries:
            try:
                logger.info(f"Trying {q_name} query on {host}...")
                logger.debug(f"Query: {q}")

                response = requests.post(
                    api_url,
                    data={"data": q},
                    timeout=120,
                    headers={"User-Agent": "EamcoAddressChecker/1.0"}
                )

                if response.status_code == 429:
                    # Back off before the next attempt, and remember why this
                    # one failed so the final error message is not blank.
                    last_error = f"Rate limited by {host}"
                    logger.warning("Rate limited, waiting 30s...")
                    time.sleep(30)
                    continue

                if response.status_code == 504:
                    last_error = f"Gateway timeout on {q_name} query ({host})"
                    logger.warning(f"Timeout on {q_name} query, trying next...")
                    continue

                response.raise_for_status()

                data = response.json()
                elements = data.get("elements", [])

                if not elements:
                    # Overpass reports server-side runtime errors (e.g. query
                    # timeouts) in a "remark" field alongside an HTTP 200.
                    remark = data.get("remark")
                    last_error = (
                        f"{q_name} query on {host}: {remark}"
                        if remark
                        else f"No results from {q_name} query on {host}"
                    )
                    logger.debug(f"No results from {q_name} query")
                    continue

                logger.info(f"Success with {q_name} query: {len(elements)} street segments")
                streets = _unique_streets(elements)
                logger.info(f"Extracted {len(streets)} unique street names")
                return streets, ""

            except requests.exceptions.Timeout:
                last_error = f"Timeout on {api_url}"
                logger.warning(last_error)
                continue

            except requests.exceptions.RequestException as e:
                last_error = f"Request error: {str(e)}"
                logger.warning(last_error)
                continue

            except Exception as e:
                # Deliberately broad: a malformed payload from one endpoint
                # must not stop us from trying the remaining ones.
                last_error = f"Error: {str(e)}"
                logger.warning(last_error)
                continue

    # All attempts failed
    error = f"All Overpass queries failed for {town}, {state}. Last error: {last_error}"
    logger.error(error)
    return [], error
|
|
|
|
|
|
def populate_streets_for_town(
    session: Session,
    town: str,
    state: str,
    clear_existing: bool = False
) -> FetchResult:
    """
    Fetch streets from OSM and populate the StreetReference table.

    The OSM fetch happens before any database change. Existing rows are
    therefore only cleared once replacement data is in hand, and the delete
    and the inserts are committed together; a failed or empty fetch leaves
    the table untouched even when ``clear_existing`` is True.

    Args:
        session: SQLAlchemy session
        town: Town/city name
        state: 2-letter state abbreviation
        clear_existing: If True, replace the town's existing streets with
            the freshly fetched ones instead of only adding missing names

    Returns:
        FetchResult with statistics

    Raises:
        Exception: Any database error is re-raised after the session has
            been rolled back.
    """
    state = state.upper()
    town_normalized = town.lower().strip()
    errors: List[str] = []

    logger.info(f"Populating streets for {town}, {state}")

    # Fetch first so that a network failure cannot wipe existing data.
    streets, error = fetch_streets_from_osm(town, state)

    if error:
        errors.append(error)

    if not streets:
        return FetchResult(
            success=not errors,
            streets_added=0,
            streets_updated=0,
            total_found=0,
            message=f"No streets found for {town}, {state}",
            errors=errors,
        )

    town_filter = (
        StreetReference.town_normalized == town_normalized,
        StreetReference.state == state,
    )

    added = 0
    # Naive UTC timestamp, as before. NOTE(review): switch to an aware
    # datetime only if the created_at column is timezone-aware - confirm.
    now = datetime.utcnow()

    try:
        if clear_existing:
            deleted = session.query(StreetReference).filter(
                *town_filter
            ).delete(synchronize_session=False)
            logger.info(f"Cleared {deleted} existing street records")
            # Everything for this town was just removed in this transaction.
            existing_names = set()
        else:
            # Names already stored for this town, used to skip duplicates.
            existing_names = {
                s.street_name_normalized
                for s in session.query(StreetReference).filter(*town_filter).all()
            }

        for street_data in streets:
            name = street_data["name"]
            name_normalized = normalize_street_name(name)

            if name_normalized in existing_names:
                continue

            session.add(StreetReference(
                street_name=name,
                street_name_normalized=name_normalized,
                town=town,
                town_normalized=town_normalized,
                state=state,
                osm_id=street_data.get("osm_id"),
                created_at=now,
            ))
            # Also guards against two OSM names normalizing to the same key.
            existing_names.add(name_normalized)
            added += 1

        session.commit()
    except Exception:
        # Undo the delete and any pending inserts, then let the caller see
        # the original error.
        session.rollback()
        raise

    logger.info(f"Added {added} new streets for {town}, {state}")

    return FetchResult(
        success=True,
        streets_added=added,
        streets_updated=0,
        total_found=len(streets),
        message=f"Successfully added {added} streets for {town}, {state}",
        errors=errors,
    )
|
|
|
|
|
|
def find_matching_street(
    session: Session,
    street_input: str,
    town: str,
    state: str,
    min_confidence: float = 70.0
) -> Optional[StreetMatch]:
    """
    Find the best matching street for a potentially misspelled input.

    The input and every reference street for the town are compared in
    normalized form. An exact normalized match wins outright. Otherwise each
    candidate is scored with four rapidfuzz scorers (ratio, partial_ratio,
    token_sort_ratio, token_set_ratio) and the highest of the four is its
    confidence; when several candidates share the same confidence, the one
    with the higher plain ``ratio`` (closest overall spelling) is chosen.

    Args:
        session: SQLAlchemy session
        street_input: The street name to match (may be misspelled)
        town: Town/city to search within
        state: State abbreviation
        min_confidence: Minimum match confidence (0-100)

    Returns:
        StreetMatch if found above threshold, None otherwise (including
        when the input normalizes to an empty string or the town has no
        reference streets)
    """
    state = state.upper()
    town_normalized = town.lower().strip()

    # Normalize the input for matching
    input_normalized = normalize_street_name(street_input)
    if not input_normalized:
        # Nothing meaningful to compare; avoid scoring an empty string.
        logger.debug(f"Empty street input for {town}, {state}; nothing to match")
        return None

    # Get all streets for this town
    streets = session.query(StreetReference).filter(
        StreetReference.town_normalized == town_normalized,
        StreetReference.state == state
    ).all()

    if not streets:
        logger.debug(f"No reference streets found for {town}, {state}")
        return None

    best_match = None
    # (confidence, plain ratio): the second element only breaks ties.
    best_key = (0.0, 0.0)

    for street_obj in streets:
        candidate = street_obj.street_name_normalized

        if candidate == input_normalized:
            # An exact match must not lose to an earlier row that merely
            # contains the input (partial/token_set scorers also give 100).
            best_match = street_obj
            best_key = (100.0, 100.0)
            break

        plain_ratio = fuzz.ratio(input_normalized, candidate)
        score = max(
            plain_ratio,
            fuzz.partial_ratio(input_normalized, candidate),
            fuzz.token_sort_ratio(input_normalized, candidate),
            fuzz.token_set_ratio(input_normalized, candidate),
        )

        key = (score, plain_ratio)
        if key > best_key:
            best_key = key
            best_match = street_obj

    best_score = best_key[0]

    if best_match is not None and best_score >= min_confidence:
        logger.info(
            f"Fuzzy match: '{street_input}' -> '{best_match.street_name}' "
            f"(confidence: {best_score:.1f}%)"
        )
        return StreetMatch(
            original_street=street_input,
            matched_street=best_match.street_name,
            confidence_score=best_score,
            town=best_match.town,
            state=best_match.state,
            street_ref_id=best_match.id,
        )

    logger.debug(
        f"No confident match for '{street_input}' "
        f"(best: {best_score:.1f}%, threshold: {min_confidence}%)"
    )
    return None
|
|
|
|
|
|
def correct_address(
    session: Session,
    full_address: str,
    town: str,
    state: str,
    min_confidence: float = 75.0
) -> Optional[StreetMatch]:
    """
    Try to repair the street portion of an address via fuzzy matching.

    The leading house number (if any) is split off, the remaining text is
    matched against the town's reference streets, and on success the house
    number is re-attached to the matched street name.

    Args:
        session: SQLAlchemy session
        full_address: Full street address (e.g., "123 Mian St")
        town: Town/city name
        state: State abbreviation
        min_confidence: Minimum match confidence

    Returns:
        StreetMatch with corrected_address if match found, None otherwise
    """
    house_number, street_part = extract_street_number(full_address)

    # Nothing left to match once the number is removed.
    if not street_part:
        return None

    result = find_matching_street(
        session=session,
        street_input=street_part,
        town=town,
        state=state,
        min_confidence=min_confidence,
    )

    if not result:
        return result

    # Re-attach the house number when one was present.
    pieces = [house_number, result.matched_street] if house_number else [result.matched_street]
    result.corrected_address = " ".join(pieces)

    logger.info(
        f"Address correction: '{full_address}' -> '{result.corrected_address}'"
    )

    return result
|
|
|
|
|
|
def get_town_street_count(session: Session, town: str, state: str) -> int:
    """
    Count the reference streets stored for a town.

    The town is compared in lowercased, trimmed form and the state in
    uppercase.

    Args:
        session: SQLAlchemy session
        town: Town/city name
        state: State abbreviation

    Returns:
        Number of streets in the reference table
    """
    town_key = town.lower().strip()
    state_key = state.upper()

    town_streets = session.query(StreetReference).filter(
        StreetReference.town_normalized == town_key,
        StreetReference.state == state_key,
    )
    return town_streets.count()
|