first commit

This commit is contained in:
2026-01-18 17:53:26 -05:00
commit 0b9c0915a1
15 changed files with 2692 additions and 0 deletions

572
app/streets.py Normal file
View File

@@ -0,0 +1,572 @@
"""
Street reference tools for address correction.
This module provides functionality to:
1. Fetch streets from OpenStreetMap Overpass API for a given town/state
2. Store streets in the StreetReference table
3. Perform fuzzy matching to correct misspelled addresses
The fuzzy matching handles common issues like:
- Misspelled street names ("Mian St" -> "Main St")
- Wrong suffixes ("Main Rd" -> "Main St")
- Missing/extra spaces
- Abbreviated vs full names ("St" vs "Street")
"""
import logging
import re
import time
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional, Tuple
import requests
from rapidfuzz import fuzz, process
from sqlalchemy.orm import Session
from app.config import STATE_MAPPING
from app.models import StreetReference
logger = logging.getLogger(__name__)
# Overpass API interpreter endpoints. fetch_streets_from_osm() iterates this
# list in order, so entries after the first act as fallbacks.
OVERPASS_API_URLS = [
    "https://overpass-api.de/api/interpreter",
    "https://overpass.kumi.systems/api/interpreter",
    "https://maps.mail.ru/osm/tools/overpass/api/interpreter",
]
# Canonical word -> accepted abbreviations / misspellings, used when
# normalizing street names. Besides street-type suffixes the table also
# carries compass directions so that "N Main St" and "North Main Street"
# normalize to the same text.
STREET_SUFFIXES = {
    # Standard -> variations
    "street": ["st", "str", "strt"],
    "avenue": ["ave", "av", "aven"],
    "road": ["rd", "rod"],
    "drive": ["dr", "drv", "driv"],
    "lane": ["ln", "lne"],
    "court": ["ct", "crt", "cour"],
    "circle": ["cir", "circ", "crcl"],
    "boulevard": ["blvd", "boul", "blv"],
    "place": ["pl", "plc"],
    "terrace": ["ter", "terr", "trc"],
    "way": ["wy"],
    "highway": ["hwy", "hiway", "hgwy"],
    "parkway": ["pkwy", "pky", "pkway"],
    "square": ["sq", "sqr"],
    "trail": ["trl", "tr"],
    "crossing": ["xing", "crssng"],
    "heights": ["hts", "hgts"],
    "point": ["pt", "pnt"],
    "ridge": ["rdg", "rdge"],
    "valley": ["vly", "vlly"],
    "view": ["vw", "viw"],
    "center": ["ctr", "cntr", "centre"],
    "north": ["n"],
    "south": ["s"],
    "east": ["e"],
    "west": ["w"],
    "northeast": ["ne"],
    "northwest": ["nw"],
    "southeast": ["se"],
    "southwest": ["sw"],
}

# Reverse lookup: every abbreviation -> canonical word, plus each canonical
# word mapped to itself. Built with a comprehension so no loop variables are
# left behind in the module namespace (a plain module-level ``for`` loop would
# export them through ``from app.streets import *``).
SUFFIX_TO_FULL = {
    variant: canonical
    for canonical, abbreviations in STREET_SUFFIXES.items()
    for variant in (*abbreviations, canonical)
}
@dataclass
class StreetMatch:
    """Outcome of matching a user-supplied street against the reference table."""

    # Street text exactly as the caller supplied it (before normalization).
    original_street: str
    # Display name of the reference street that scored best.
    matched_street: str
    # Best fuzzy score for the match, on rapidfuzz's 0-100 scale.
    confidence_score: float
    # Town and state copied from the matched reference row.
    town: str
    state: str
    # Primary key of the matched StreetReference row.
    street_ref_id: int
    # Filled in by correct_address(); stays None for a bare street match.
    corrected_address: Optional[str] = None
@dataclass
class FetchResult:
    """Summary returned after trying to load a town's streets from OSM."""

    # False only when the fetch reported an error and produced no streets.
    success: bool
    # Number of new StreetReference rows inserted.
    streets_added: int
    # Number of rows updated (populate_streets_for_town always reports 0).
    streets_updated: int
    # Count of unique street names returned by the fetch.
    total_found: int
    # Human-readable one-line summary.
    message: str
    # Error strings collected along the way; empty when nothing went wrong.
    errors: List[str]
def normalize_street_name(street: str) -> str:
    """
    Produce the canonical form of a street name used for fuzzy comparison.

    The name is lowercased and trimmed, periods/commas/apostrophes are
    dropped (hyphens and any other characters are kept), runs of whitespace
    collapse to a single space, and every word found in SUFFIX_TO_FULL is
    replaced by its full form. Expansion is applied to every word regardless
    of position, so "St. James St" becomes "street james street".

    Args:
        street: Raw street name

    Returns:
        Normalized street name, or "" for empty/None input
    """
    if not street:
        return ""

    cleaned = re.sub(r"[.,']", "", street.lower().strip())
    cleaned = re.sub(r"\s+", " ", cleaned)

    # Unknown words pass through unchanged.
    return " ".join(SUFFIX_TO_FULL.get(token, token) for token in cleaned.split())
def extract_street_number(address: str) -> Tuple[Optional[str], str]:
    """
    Split a leading house number off an address line.

    The number is a run of digits optionally followed by one letter
    (e.g. "123" or "123A") and must be separated from the rest by whitespace.

    Args:
        address: Full address like "123 Main Street"

    Returns:
        Tuple of (street_number, remaining_address); street_number is None
        when the address does not start with a recognizable number, in which
        case the remainder is the trimmed input.
    """
    if not address:
        return None, ""

    trimmed = address.strip()
    found = re.match(r"^(\d+[A-Za-z]?)\s+(.+)$", trimmed)
    if found is None:
        return None, trimmed

    number, remainder = found.groups()
    return number, remainder
# Two-letter postal abbreviation -> full name. Kept at module level so the
# table is built once at import time instead of on every lookup.
_STATE_NAMES = {
    "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas",
    "CA": "California", "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware",
    "DC": "District of Columbia", "FL": "Florida", "GA": "Georgia", "HI": "Hawaii",
    "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa",
    "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine",
    "MD": "Maryland", "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota",
    "MS": "Mississippi", "MO": "Missouri", "MT": "Montana", "NE": "Nebraska",
    "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico",
    "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio",
    "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island",
    "SC": "South Carolina", "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas",
    "UT": "Utah", "VT": "Vermont", "VA": "Virginia", "WA": "Washington",
    "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming",
    "PR": "Puerto Rico", "VI": "Virgin Islands", "GU": "Guam", "AS": "American Samoa",
}


def get_state_name(state_abbr: str) -> str:
    """
    Get full state name from abbreviation for Overpass query.

    The lookup ignores case and surrounding whitespace.

    Args:
        state_abbr: 2-letter state abbreviation

    Returns:
        Full state name, or the input unchanged when the abbreviation is
        not in the table
    """
    return _STATE_NAMES.get(state_abbr.strip().upper(), state_abbr)
def _overpass_quote(value: str) -> str:
    """Escape backslashes and double quotes for a quoted Overpass QL string."""
    return value.replace("\\", "\\\\").replace('"', '\\"')


def _unique_streets(elements: List[dict]) -> List[dict]:
    """Reduce Overpass way elements to one entry per case-insensitive name."""
    streets = []
    seen_names = set()
    for element in elements:
        tags = element.get("tags", {})
        name = tags.get("name")
        if not name or name.lower() in seen_names:
            continue
        # A street is usually split into many ways; keep the first one seen.
        seen_names.add(name.lower())
        streets.append({
            "name": name,
            "osm_id": str(element.get("id", "")),
            "highway_type": tags.get("highway", ""),
        })
    return streets


def fetch_streets_from_osm(town: str, state: str) -> Tuple[List[dict], str]:
    """
    Fetch all streets in a town from OpenStreetMap using Overpass API.

    Every endpoint in OVERPASS_API_URLS is tried in order, and on each one the
    queries below are tried in order; the first response that contains any
    elements wins. Network and HTTP failures are logged and skipped rather
    than raised.

    Args:
        town: Town/city name
        state: 2-letter state abbreviation

    Returns:
        Tuple of (list of street dicts, error message or empty string).
        Each street dict has the keys "name", "osm_id" and "highway_type".
    """
    state_name = get_state_name(state)

    # Town/state text ends up inside double-quoted Overpass QL strings, so
    # escape it; an unescaped quote would break (or alter) the query.
    town_q = _overpass_quote(town.strip())
    state_name_q = _overpass_quote(state_name)
    state_code_q = _overpass_quote(state.strip().upper())

    # Primary query: locate the state area by name, then an area with the
    # town's name inside it, then every named highway way in that area.
    simple_query = f"""
    [out:json][timeout:60];
    area["name"="{state_name_q}"]["boundary"="administrative"]["admin_level"="4"]->.state;
    area["name"="{town_q}"](area.state)->.city;
    way["highway"]["name"](area.city);
    out tags;
    """

    # Fallback: locate the state by ISO 3166-2 code and derive the town area
    # from a boundary relation or place object carrying the town's name.
    fallback_query = f"""
    [out:json][timeout:120];
    // Find state by ISO code
    area["ISO3166-2"="US-{state_code_q}"]->.state;
    // Find city/town within state
    (
    relation["name"="{town_q}"]["type"="boundary"](area.state);
    way["name"="{town_q}"]["place"](area.state);
    node["name"="{town_q}"]["place"](area.state);
    );
    map_to_area->.city;
    // Get streets
    way["highway"]["name"](area.city);
    out tags;
    """

    # NOTE: a third query based on the {{geocodeArea:...}} macro used to be
    # sent here. That macro is expanded client-side by the overpass-turbo web
    # UI and is not Overpass QL, so the interpreter endpoints rejected it on
    # every call. It has been dropped to avoid a guaranteed-failing request
    # per endpoint.
    queries = [("simple", simple_query), ("fallback", fallback_query)]

    logger.info(f"Fetching streets from OSM for {town}, {state_name}")

    last_error = ""
    for api_url in OVERPASS_API_URLS:
        host = api_url.split("/")[2]
        for q_name, q in queries:
            try:
                logger.info(f"Trying {q_name} query on {host}...")
                logger.debug(f"Query: {q}")
                response = requests.post(
                    api_url,
                    data={"data": q},
                    timeout=120,
                    headers={"User-Agent": "EamcoAddressChecker/1.0"},
                )

                if response.status_code == 429:
                    last_error = f"Rate limited by {host}"
                    logger.warning("Rate limited, waiting 30s...")
                    time.sleep(30)
                    continue
                if response.status_code == 504:
                    last_error = f"Gateway timeout on {q_name} query ({host})"
                    logger.warning(f"Timeout on {q_name} query, trying next...")
                    continue

                response.raise_for_status()
                data = response.json()
                elements = data.get("elements", [])

                if elements:
                    logger.info(f"Success with {q_name} query: {len(elements)} street segments")
                    streets = _unique_streets(elements)
                    logger.info(f"Extracted {len(streets)} unique street names")
                    return streets, ""

                # Remember why this attempt produced nothing so the final
                # error message is not left blank. Overpass may attach a
                # "remark" to an otherwise successful response.
                last_error = f"No results from {q_name} query on {host}"
                remark = data.get("remark")
                if remark:
                    last_error += f" ({remark})"
                logger.debug(f"No results from {q_name} query")
            except requests.exceptions.Timeout:
                last_error = f"Timeout on {api_url}"
                logger.warning(last_error)
            except requests.exceptions.RequestException as e:
                last_error = f"Request error: {str(e)}"
                logger.warning(last_error)
            except Exception as e:
                # Deliberately broad: a malformed payload from one mirror
                # must not stop the remaining endpoints from being tried.
                last_error = f"Error: {str(e)}"
                logger.warning(last_error)

    # All attempts failed
    error = f"All Overpass queries failed for {town}, {state}. Last error: {last_error}"
    logger.error(error)
    return [], error
def populate_streets_for_town(
    session: Session,
    town: str,
    state: str,
    clear_existing: bool = False
) -> FetchResult:
    """
    Fetch streets from OSM and populate the StreetReference table.

    The OSM fetch happens before the database is touched, and the optional
    delete plus all inserts are committed as a single transaction. A failed
    or empty fetch therefore leaves the town's existing rows intact even when
    clear_existing is True, and a database error rolls everything back.

    Args:
        session: SQLAlchemy session
        town: Town/city name
        state: 2-letter state abbreviation
        clear_existing: If True, replace the town's existing streets with the
            freshly fetched ones instead of only adding missing names

    Returns:
        FetchResult with statistics

    Raises:
        Exception: any database error is re-raised after the session has
            been rolled back
    """
    state = state.upper()
    town_normalized = town.lower().strip()
    errors = []

    logger.info(f"Populating streets for {town}, {state}")

    # Fetch first: nothing is deleted unless there is data to replace it with.
    streets, error = fetch_streets_from_osm(town, state)
    if error:
        errors.append(error)

    if not streets:
        return FetchResult(
            success=len(errors) == 0,
            streets_added=0,
            streets_updated=0,
            total_found=0,
            message=f"No streets found for {town}, {state}",
            errors=errors,
        )

    town_filter = (
        StreetReference.town_normalized == town_normalized,
        StreetReference.state == state,
    )

    try:
        if clear_existing:
            deleted = session.query(StreetReference).filter(
                *town_filter
            ).delete(synchronize_session=False)
            logger.info(f"Cleared {deleted} existing street records")

        # Names already stored for this town (empty right after a clear);
        # used to skip duplicates, including duplicates within this batch.
        existing_streets = session.query(StreetReference).filter(*town_filter).all()
        existing_names = {s.street_name_normalized for s in existing_streets}

        added = 0
        now = datetime.utcnow()  # naive UTC timestamp shared by the batch
        for street_data in streets:
            name = street_data["name"]
            name_normalized = normalize_street_name(name)
            if name_normalized in existing_names:
                continue

            session.add(StreetReference(
                street_name=name,
                street_name_normalized=name_normalized,
                town=town,
                town_normalized=town_normalized,
                state=state,
                osm_id=street_data.get("osm_id"),
                created_at=now,
            ))
            existing_names.add(name_normalized)
            added += 1

        session.commit()
    except Exception:
        # Leave the session usable for the caller and undo the partial work.
        session.rollback()
        raise

    logger.info(f"Added {added} new streets for {town}, {state}")

    return FetchResult(
        success=True,
        streets_added=added,
        streets_updated=0,
        total_found=len(streets),
        message=f"Successfully added {added} streets for {town}, {state}",
        errors=errors,
    )
def find_matching_street(
    session: Session,
    street_input: str,
    town: str,
    state: str,
    min_confidence: float = 70.0
) -> Optional[StreetMatch]:
    """
    Find the best matching street for a potentially misspelled input.

    The input and every reference street for the town are compared in
    normalized form. An exact normalized match wins outright. Otherwise each
    candidate is scored with the maximum of rapidfuzz's ratio, partial_ratio,
    token_sort_ratio and token_set_ratio, and ties on that maximum are broken
    by the plain ratio. The tie-break matters because the partial/token-set
    scorers give 100 to any candidate that merely contains the input.

    Args:
        session: SQLAlchemy session
        street_input: The street name to match (may be misspelled)
        town: Town/city to search within
        state: State abbreviation
        min_confidence: Minimum match confidence (0-100)

    Returns:
        StreetMatch if found above threshold, None otherwise (including when
        the input normalizes to an empty string or the town has no
        reference streets)
    """
    state = state.upper()
    town_normalized = town.lower().strip()

    # Normalize the input for matching
    input_normalized = normalize_street_name(street_input)
    if not input_normalized:
        # Nothing to compare; avoid the query and any degenerate scores.
        logger.debug(f"Empty street input for {town}, {state}")
        return None

    # Get all streets for this town
    streets = session.query(StreetReference).filter(
        StreetReference.town_normalized == town_normalized,
        StreetReference.state == state
    ).all()

    if not streets:
        logger.debug(f"No reference streets found for {town}, {state}")
        return None

    best_match = None
    best_score = 0
    best_rank = (0, 0)  # (max score, plain ratio) of the current best
    for street_obj in streets:
        candidate = street_obj.street_name_normalized

        if candidate == input_normalized:
            # An exact match cannot be beaten; stop scanning.
            best_match = street_obj
            best_score = 100.0
            break

        plain = fuzz.ratio(input_normalized, candidate)
        score = max(
            plain,
            fuzz.partial_ratio(input_normalized, candidate),
            fuzz.token_sort_ratio(input_normalized, candidate),
            fuzz.token_set_ratio(input_normalized, candidate),
        )
        rank = (score, plain)
        if rank > best_rank:
            best_rank = rank
            best_score = score
            best_match = street_obj

    if best_match and best_score >= min_confidence:
        logger.info(
            f"Fuzzy match: '{street_input}' -> '{best_match.street_name}' "
            f"(confidence: {best_score:.1f}%)"
        )
        return StreetMatch(
            original_street=street_input,
            matched_street=best_match.street_name,
            confidence_score=best_score,
            town=best_match.town,
            state=best_match.state,
            street_ref_id=best_match.id,
        )

    logger.debug(
        f"No confident match for '{street_input}' "
        f"(best: {best_score:.1f}%, threshold: {min_confidence}%)"
    )
    return None
def correct_address(
    session: Session,
    full_address: str,
    town: str,
    state: str,
    min_confidence: float = 75.0
) -> Optional[StreetMatch]:
    """
    Correct the street portion of an address line via fuzzy matching.

    The leading house number (if any) is split off, the remaining street
    text is matched against the town's reference streets, and on success the
    house number is re-attached in front of the matched street name.

    Args:
        session: SQLAlchemy session
        full_address: Full street address (e.g., "123 Mian St")
        town: Town/city name
        state: State abbreviation
        min_confidence: Minimum match confidence

    Returns:
        StreetMatch with corrected_address if match found, None otherwise
    """
    house_number, street_part = extract_street_number(full_address)
    if not street_part:
        return None

    result = find_matching_street(
        session=session,
        street_input=street_part,
        town=town,
        state=state,
        min_confidence=min_confidence,
    )
    if result is None:
        return None

    # Without a house number the corrected address is just the street name.
    result.corrected_address = (
        f"{house_number} {result.matched_street}"
        if house_number
        else result.matched_street
    )
    logger.info(
        f"Address correction: '{full_address}' -> '{result.corrected_address}'"
    )
    return result
def get_town_street_count(session: Session, town: str, state: str) -> int:
    """
    Count the reference streets stored for a town.

    The town is compared in lowercase/trimmed form and the state in
    uppercase, matching how the rows are keyed.

    Args:
        session: SQLAlchemy session
        town: Town/city name
        state: State abbreviation

    Returns:
        Number of streets in the reference table
    """
    town_streets = session.query(StreetReference).filter(
        StreetReference.town_normalized == town.lower().strip(),
        StreetReference.state == state.upper(),
    )
    return town_streets.count()