crawler/test.py

import requests
from bs4 import BeautifulSoup
url = "https://www.newenglandoil.com/connecticut/zone1.asp?type=0"
headers_req = {  # Named headers_req to avoid shadowing the headers= keyword argument passed to requests.get() below
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
try:
    response = requests.get(url, headers=headers_req, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    all_tables = soup.find_all('table')
    print(f"Found {len(all_tables)} table(s) in total.")

    if all_tables:
        table = all_tables[0]  # Assuming it's the first (and only) table
        thead = table.find('thead')
        if thead:
            # Get the exact header texts
            actual_headers = [th.get_text(strip=True) for th in thead.find_all('th')]
            print(f"Actual headers found in the first table's thead: {actual_headers}")
            # Get the lowercased versions for easy comparison
            actual_headers_lower = [th.get_text(strip=True).lower() for th in thead.find_all('th')]
            print(f"Actual headers (lowercase): {actual_headers_lower}")
        else:
            print("The first table found does not have a <thead> element.")
    else:
        print("No tables found on the page.")
except requests.exceptions.RequestException as e:
    print(f"Error fetching page: {e}")