feat(CRIT-010): add zone-to-county mapping and county_id to oil_prices

Add ZONE_COUNTY_MAP for all 5 scraped states (42 zone-to-county entries).
Scraper now resolves county_id at startup and assigns it to each record.
Upsert logic deduplicates by (name, state, county_id) to prevent duplicates
when multiple zones map to the same county. Also adds County model for
DB lookups and fixes Rhode Island zone count (4, not 5).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-09 18:31:46 -05:00
parent 6daa706e5a
commit 8f45f4c209
9 changed files with 746 additions and 34 deletions

View File

@@ -31,7 +31,7 @@ SITES_CONFIG = [
"zone1", "zone2", "zone3", "zone4", "zone5", "zone6"
],
"rhodeisland": [
"zone1", "zone2", "zone3", "zone4", "zone5"
"zone1", "zone2", "zone3", "zone4"
],
@@ -40,23 +40,64 @@ SITES_CONFIG = [
{
"site_name": "MaineOil",
"base_url": "https://www.maineoil.com",
# URL template for MaineOil using numeric zones like zone1.asp, zone2.asp
# {zone_slug} will be "zone1", "zone2", etc.
# No {state_slug} is needed in this part of the path for maineoil.com
"url_template": "{base_url}/{zone_slug}.asp?type={oil_type}",
"oil_type": 0,
"locations": {
# "maine" is our internal key for the state.
# The zone_slugs are "zone1", "zone2", etc.
# YOU NEED TO VERIFY THE ACTUAL ZONE SLUGS AND COUNT FOR MAINEOIL.COM
"maine": [
"zone1", "zone2", "zone3", "zone4", "zone5",
"zone6", "zone7" # Example: Add/remove based on actual zones on maineoil.com
"zone6", "zone7"
]
}
}
]
# --- ZONE-TO-COUNTY MAPPING ---
# Maps (state_key, zone_number) -> (state_abbrev, county_name).
# Counties are listed per state in zone order (index 0 == zone 1) and
# expanded below into the flat tuple-keyed dict the scraper looks up.
_ZONE_COUNTIES_BY_STATE = {
    ("connecticut", "CT"): [
        "New London", "Windham", "New Haven", "Middlesex", "New Haven",
        "Hartford", "Litchfield", "Fairfield", "Tolland", "Litchfield",
    ],
    ("massachusetts", "MA"): [
        "Suffolk", "Middlesex", "Norfolk", "Plymouth", "Middlesex",
        "Bristol", "Barnstable", "Essex", "Essex", "Worcester",
        "Worcester", "Hampshire", "Hampden", "Franklin", "Berkshire",
    ],
    ("newhampshire", "NH"): [
        "Coos", "Strafford", "Merrimack", "Grafton", "Cheshire",
        "Hillsborough",
    ],
    ("rhodeisland", "RI"): [
        "Newport", "Providence", "Washington", "Kent",
    ],
    ("maine", "ME"): [
        "Cumberland", "Kennebec", "Androscoggin", "York", "Knox",
        "Penobscot", "Washington",
    ],
}
ZONE_COUNTY_MAP = {
    (state_key, zone_num): (abbrev, county)
    for (state_key, abbrev), county_list in _ZONE_COUNTIES_BY_STATE.items()
    for zone_num, county in enumerate(county_list, start=1)
}
LOG_FILE = "oil_scraper.log"
logging.basicConfig(
filename=LOG_FILE,
@@ -125,7 +166,7 @@ def parse_price_table(soup, state_name_key, zone_slug_str):
if not is_price_table:
continue
candidate_tables_found += 1
tbody = table.find('tbody')
if not tbody:
@@ -139,7 +180,7 @@ def parse_price_table(soup, state_name_key, zone_slug_str):
for row_index, row in enumerate(rows):
cells = row.find_all('td')
max_required_index = max(actual_column_indices.values()) if actual_column_indices else -1
if max_required_index == -1:
logging.error(f"Logic error: is_price_table true but no column indices for {state_name_key}/{zone_slug_str}")
continue
@@ -172,11 +213,31 @@ def parse_price_table(soup, state_name_key, zone_slug_str):
})
elif len(cells) > 0:
logging.warning(f"Skipping row {row_index+1} with insufficient cells ({len(cells)}, need {max_required_index+1}) in {state_name_key}/{zone_slug_str}")
if candidate_tables_found == 0:
logging.warning(f"No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.")
return data_dicts
# --- Helper: Build county lookup ---
def build_county_lookup(db_session):
    """Build (state_abbrev, county_name) -> county_id lookup from DB.

    Queries every row of the counties table once so zone resolution can
    run without further DB round-trips.
    """
    county_rows = db_session.query(models.County).all()
    id_by_key = {(row.state, row.name): row.id for row in county_rows}
    logging.info(f"Built county lookup with {len(id_by_key)} entries")
    return id_by_key
def resolve_county_id(state_key, zone_number, county_lookup):
    """Resolve county_id from ZONE_COUNTY_MAP and county lookup.

    Returns None when the (state_key, zone_number) pair is unmapped or
    the mapped county is missing from *county_lookup*.
    """
    county_key = ZONE_COUNTY_MAP.get((state_key, zone_number))
    if county_key is None:
        return None
    # county_key is already the (state_abbrev, county_name) tuple used
    # as the lookup key, so it can be passed straight through.
    return county_lookup.get(county_key)
# --- Main Script ---
def main():
logging.info("Starting oil price scraper job.")
@@ -191,22 +252,24 @@ def main():
total_records_added_this_run = 0
try:
# Build county lookup at startup
county_lookup = build_county_lookup(db_session)
for site_config in SITES_CONFIG:
site_name = site_config["site_name"]
base_url = site_config["base_url"]
url_template = site_config["url_template"]
oil_type = site_config["oil_type"]
logging.info(f"--- Processing site: {site_name} ---")
for state_key_in_config, zone_slugs_list in site_config["locations"].items():
# state_key_in_config is "connecticut", "maine", etc.
for zone_slug_from_list in zone_slugs_list: # e.g., "zone1", "zonema5"
for zone_slug_from_list in zone_slugs_list:
format_params = {
"base_url": base_url,
"state_slug": state_key_in_config, # Used if {state_slug} in template
"zone_slug": zone_slug_from_list, # This is "zone1", "zonema5", etc.
"state_slug": state_key_in_config,
"zone_slug": zone_slug_from_list,
"oil_type": oil_type
}
target_url = url_template.format(**format_params)
@@ -215,44 +278,61 @@ def main():
soup = make_request(target_url)
if soup:
# Pass state_key_in_config as state_name_key
# Pass zone_slug_from_list (e.g. "zone1") as zone_slug_str for parsing to int
parsed_items = parse_price_table(soup, state_key_in_config, zone_slug_from_list)
if parsed_items:
for item_dict in parsed_items: # item_dict["zone"] will be an integer
# Check if a record with the same name, state, and zone already exists
existing_record = db_session.query(models.OilPrice).filter(
models.OilPrice.name == item_dict["name"],
models.OilPrice.state == item_dict["state"],
models.OilPrice.zone == item_dict["zone"]
).first()
# Resolve county_id for this zone
zone_int = parse_zone_slug_to_int(zone_slug_from_list)
county_id = None
if zone_int is not None:
county_id = resolve_county_id(state_key_in_config, zone_int, county_lookup)
for item_dict in parsed_items:
# Match by county_id when available to avoid duplicates
# when multiple zones map to the same county
if county_id is not None:
existing_record = db_session.query(models.OilPrice).filter(
models.OilPrice.name == item_dict["name"],
models.OilPrice.state == item_dict["state"],
models.OilPrice.county_id == county_id
).first()
else:
existing_record = db_session.query(models.OilPrice).filter(
models.OilPrice.name == item_dict["name"],
models.OilPrice.state == item_dict["state"],
models.OilPrice.zone == item_dict["zone"]
).first()
if existing_record:
# If record exists, check if company_id is not null
if existing_record.company_id is not None:
logging.debug(f"Skipping update for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} due to non-null company_id")
else:
# If company_id is null, check if price is different
updated = False
if county_id is not None and existing_record.county_id != county_id:
existing_record.county_id = county_id
updated = True
if existing_record.price != item_dict["price"]:
existing_record.price = item_dict["price"]
existing_record.date = item_dict["date"]
existing_record.scrapetimestamp = datetime.utcnow()
logging.info(f"Updated price for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} to {item_dict['price']}")
elif updated:
existing_record.scrapetimestamp = datetime.utcnow()
logging.info(f"Updated county_id for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} to {county_id}")
else:
logging.debug(f"Price unchanged for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}")
else:
# If no record exists, create a new one
oil_price_record = models.OilPrice(
state=item_dict["state"],
zone=item_dict["zone"],
name=item_dict["name"],
price=item_dict["price"],
date=item_dict["date"],
county_id=county_id,
scrapetimestamp=datetime.utcnow()
)
db_session.add(oil_price_record)
logging.info(f"Added new record for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}")
logging.info(f"Added new record for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} (county_id={county_id})")
total_records_added_this_run += len(parsed_items)
logging.info(f"Queued {len(parsed_items)} records from {site_name} - {state_key_in_config}/{zone_slug_from_list} for DB insertion.")
else: