feat(CRIT-010): add zone-to-county mapping and county_id to oil_prices
Add ZONE_COUNTY_MAP for all 5 scraped states (42 zone-to-county entries). Scraper now resolves county_id at startup and assigns it to each record. Upsert logic deduplicates by (name, state, county_id) to prevent duplicates when multiple zones map to the same county. Also adds County model for DB lookups and fixes Rhode Island zone count (4, not 5). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
144
fuel_scraper.py
144
fuel_scraper.py
@@ -31,7 +31,7 @@ SITES_CONFIG = [
|
||||
"zone1", "zone2", "zone3", "zone4", "zone5", "zone6"
|
||||
],
|
||||
"rhodeisland": [
|
||||
"zone1", "zone2", "zone3", "zone4", "zone5"
|
||||
"zone1", "zone2", "zone3", "zone4"
|
||||
],
|
||||
|
||||
|
||||
@@ -40,23 +40,64 @@ SITES_CONFIG = [
|
||||
{
|
||||
"site_name": "MaineOil",
|
||||
"base_url": "https://www.maineoil.com",
|
||||
# URL template for MaineOil using numeric zones like zone1.asp, zone2.asp
|
||||
# {zone_slug} will be "zone1", "zone2", etc.
|
||||
# No {state_slug} is needed in this part of the path for maineoil.com
|
||||
"url_template": "{base_url}/{zone_slug}.asp?type={oil_type}",
|
||||
"oil_type": 0,
|
||||
"locations": {
|
||||
# "maine" is our internal key for the state.
|
||||
# The zone_slugs are "zone1", "zone2", etc.
|
||||
# YOU NEED TO VERIFY THE ACTUAL ZONE SLUGS AND COUNT FOR MAINEOIL.COM
|
||||
"maine": [
|
||||
"zone1", "zone2", "zone3", "zone4", "zone5",
|
||||
"zone6", "zone7" # Example: Add/remove based on actual zones on maineoil.com
|
||||
"zone6", "zone7"
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
# --- ZONE-TO-COUNTY MAPPING ---
|
||||
# Maps (state_key, zone_number) -> (state_abbrev, county_name)
|
||||
# Per-state table: state_key -> (state_abbrev, county names ordered by zone).
# The county at position i (1-based) is the county assigned to zone i, so a
# county may legitimately appear more than once within a state.
_ZONE_COUNTIES_BY_STATE = {
    "connecticut": ("CT", (
        "New London",   # zone 1
        "Windham",      # zone 2
        "New Haven",    # zone 3
        "Middlesex",    # zone 4
        "New Haven",    # zone 5
        "Hartford",     # zone 6
        "Litchfield",   # zone 7
        "Fairfield",    # zone 8
        "Tolland",      # zone 9
        "Litchfield",   # zone 10
    )),
    "massachusetts": ("MA", (
        "Suffolk",      # zone 1
        "Middlesex",    # zone 2
        "Norfolk",      # zone 3
        "Plymouth",     # zone 4
        "Middlesex",    # zone 5
        "Bristol",      # zone 6
        "Barnstable",   # zone 7
        "Essex",        # zone 8
        "Essex",        # zone 9
        "Worcester",    # zone 10
        "Worcester",    # zone 11
        "Hampshire",    # zone 12
        "Hampden",      # zone 13
        "Franklin",     # zone 14
        "Berkshire",    # zone 15
    )),
    "newhampshire": ("NH", (
        "Coos",          # zone 1
        "Strafford",     # zone 2
        "Merrimack",     # zone 3
        "Grafton",       # zone 4
        "Cheshire",      # zone 5
        "Hillsborough",  # zone 6
    )),
    "rhodeisland": ("RI", (
        "Newport",      # zone 1
        "Providence",   # zone 2
        "Washington",   # zone 3
        "Kent",         # zone 4
    )),
    "maine": ("ME", (
        "Cumberland",    # zone 1
        "Kennebec",      # zone 2
        "Androscoggin",  # zone 3
        "York",          # zone 4
        "Knox",          # zone 5
        "Penobscot",     # zone 6
        "Washington",    # zone 7
    )),
}

# Flattened view: (state_key, zone_number) -> (state_abbrev, county_name).
ZONE_COUNTY_MAP = {
    (state_key, zone_number): (state_abbrev, county_name)
    for state_key, (state_abbrev, county_names) in _ZONE_COUNTIES_BY_STATE.items()
    for zone_number, county_name in enumerate(county_names, start=1)
}
|
||||
|
||||
LOG_FILE = "oil_scraper.log"
|
||||
logging.basicConfig(
|
||||
filename=LOG_FILE,
|
||||
@@ -125,7 +166,7 @@ def parse_price_table(soup, state_name_key, zone_slug_str):
|
||||
|
||||
if not is_price_table:
|
||||
continue
|
||||
|
||||
|
||||
candidate_tables_found += 1
|
||||
tbody = table.find('tbody')
|
||||
if not tbody:
|
||||
@@ -139,7 +180,7 @@ def parse_price_table(soup, state_name_key, zone_slug_str):
|
||||
for row_index, row in enumerate(rows):
|
||||
cells = row.find_all('td')
|
||||
max_required_index = max(actual_column_indices.values()) if actual_column_indices else -1
|
||||
|
||||
|
||||
if max_required_index == -1:
|
||||
logging.error(f"Logic error: is_price_table true but no column indices for {state_name_key}/{zone_slug_str}")
|
||||
continue
|
||||
@@ -172,11 +213,31 @@ def parse_price_table(soup, state_name_key, zone_slug_str):
|
||||
})
|
||||
elif len(cells) > 0:
|
||||
logging.warning(f"Skipping row {row_index+1} with insufficient cells ({len(cells)}, need {max_required_index+1}) in {state_name_key}/{zone_slug_str}")
|
||||
|
||||
|
||||
if candidate_tables_found == 0:
|
||||
logging.warning(f"No tables matching expected price table structure found for {state_name_key} - {zone_slug_str}.")
|
||||
return data_dicts
|
||||
|
||||
# --- Helper: Build county lookup ---
|
||||
def build_county_lookup(db_session):
    """Load every County row and index its id by (state, name).

    Args:
        db_session: Session object used to query ``models.County``.

    Returns:
        dict keyed by ``(county.state, county.name)`` with ``county.id`` as
        the value. The keys are meant to line up with the
        ``(state_abbrev, county_name)`` pairs stored in ZONE_COUNTY_MAP.
        If two rows share a key, the row returned later by the query wins.
    """
    county_ids = {
        (county.state, county.name): county.id
        for county in db_session.query(models.County).all()
    }
    logging.info(f"Built county lookup with {len(county_ids)} entries")
    return county_ids
|
||||
|
||||
|
||||
def resolve_county_id(state_key, zone_number, county_lookup):
    """Translate a scraped (state, zone) pair into a county id.

    Args:
        state_key: State key as used in ZONE_COUNTY_MAP (e.g. "connecticut").
        zone_number: Integer zone number within that state.
        county_lookup: dict mapping (state_abbrev, county_name) -> county id.

    Returns:
        The county id from ``county_lookup``, or None when the zone has no
        entry in ZONE_COUNTY_MAP or the mapped county is absent from
        ``county_lookup``.
    """
    county_key = ZONE_COUNTY_MAP.get((state_key, zone_number))
    if county_key:
        state_abbrev, county_name = county_key
        return county_lookup.get((state_abbrev, county_name))
    # Unknown zone: the caller treats None as "no county available".
    return None
|
||||
|
||||
|
||||
# --- Main Script ---
|
||||
def main():
|
||||
logging.info("Starting oil price scraper job.")
|
||||
@@ -191,22 +252,24 @@ def main():
|
||||
total_records_added_this_run = 0
|
||||
|
||||
try:
|
||||
# Build county lookup at startup
|
||||
county_lookup = build_county_lookup(db_session)
|
||||
|
||||
for site_config in SITES_CONFIG:
|
||||
site_name = site_config["site_name"]
|
||||
base_url = site_config["base_url"]
|
||||
url_template = site_config["url_template"]
|
||||
oil_type = site_config["oil_type"]
|
||||
|
||||
|
||||
logging.info(f"--- Processing site: {site_name} ---")
|
||||
|
||||
for state_key_in_config, zone_slugs_list in site_config["locations"].items():
|
||||
# state_key_in_config is "connecticut", "maine", etc.
|
||||
|
||||
for zone_slug_from_list in zone_slugs_list: # e.g., "zone1", "zonema5"
|
||||
|
||||
for zone_slug_from_list in zone_slugs_list:
|
||||
format_params = {
|
||||
"base_url": base_url,
|
||||
"state_slug": state_key_in_config, # Used if {state_slug} in template
|
||||
"zone_slug": zone_slug_from_list, # This is "zone1", "zonema5", etc.
|
||||
"state_slug": state_key_in_config,
|
||||
"zone_slug": zone_slug_from_list,
|
||||
"oil_type": oil_type
|
||||
}
|
||||
target_url = url_template.format(**format_params)
|
||||
@@ -215,44 +278,61 @@ def main():
|
||||
|
||||
soup = make_request(target_url)
|
||||
if soup:
|
||||
# Pass state_key_in_config as state_name_key
|
||||
# Pass zone_slug_from_list (e.g. "zone1") as zone_slug_str for parsing to int
|
||||
parsed_items = parse_price_table(soup, state_key_in_config, zone_slug_from_list)
|
||||
|
||||
|
||||
if parsed_items:
|
||||
for item_dict in parsed_items: # item_dict["zone"] will be an integer
|
||||
# Check if a record with the same name, state, and zone already exists
|
||||
existing_record = db_session.query(models.OilPrice).filter(
|
||||
models.OilPrice.name == item_dict["name"],
|
||||
models.OilPrice.state == item_dict["state"],
|
||||
models.OilPrice.zone == item_dict["zone"]
|
||||
).first()
|
||||
|
||||
# Resolve county_id for this zone
|
||||
zone_int = parse_zone_slug_to_int(zone_slug_from_list)
|
||||
county_id = None
|
||||
if zone_int is not None:
|
||||
county_id = resolve_county_id(state_key_in_config, zone_int, county_lookup)
|
||||
|
||||
for item_dict in parsed_items:
|
||||
# Match by county_id when available to avoid duplicates
|
||||
# when multiple zones map to the same county
|
||||
if county_id is not None:
|
||||
existing_record = db_session.query(models.OilPrice).filter(
|
||||
models.OilPrice.name == item_dict["name"],
|
||||
models.OilPrice.state == item_dict["state"],
|
||||
models.OilPrice.county_id == county_id
|
||||
).first()
|
||||
else:
|
||||
existing_record = db_session.query(models.OilPrice).filter(
|
||||
models.OilPrice.name == item_dict["name"],
|
||||
models.OilPrice.state == item_dict["state"],
|
||||
models.OilPrice.zone == item_dict["zone"]
|
||||
).first()
|
||||
|
||||
if existing_record:
|
||||
# If record exists, check if company_id is not null
|
||||
if existing_record.company_id is not None:
|
||||
logging.debug(f"Skipping update for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} due to non-null company_id")
|
||||
else:
|
||||
# If company_id is null, check if price is different
|
||||
updated = False
|
||||
if county_id is not None and existing_record.county_id != county_id:
|
||||
existing_record.county_id = county_id
|
||||
updated = True
|
||||
if existing_record.price != item_dict["price"]:
|
||||
existing_record.price = item_dict["price"]
|
||||
existing_record.date = item_dict["date"]
|
||||
existing_record.scrapetimestamp = datetime.utcnow()
|
||||
logging.info(f"Updated price for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} to {item_dict['price']}")
|
||||
elif updated:
|
||||
existing_record.scrapetimestamp = datetime.utcnow()
|
||||
logging.info(f"Updated county_id for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} to {county_id}")
|
||||
else:
|
||||
logging.debug(f"Price unchanged for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}")
|
||||
else:
|
||||
# If no record exists, create a new one
|
||||
oil_price_record = models.OilPrice(
|
||||
state=item_dict["state"],
|
||||
zone=item_dict["zone"],
|
||||
name=item_dict["name"],
|
||||
price=item_dict["price"],
|
||||
date=item_dict["date"],
|
||||
county_id=county_id,
|
||||
scrapetimestamp=datetime.utcnow()
|
||||
)
|
||||
db_session.add(oil_price_record)
|
||||
logging.info(f"Added new record for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']}")
|
||||
logging.info(f"Added new record for {item_dict['name']} in {item_dict['state']} zone {item_dict['zone']} (county_id={county_id})")
|
||||
total_records_added_this_run += len(parsed_items)
|
||||
logging.info(f"Queued {len(parsed_items)} records from {site_name} - {state_key_in_config}/{zone_slug_from_list} for DB insertion.")
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user