{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IESO Coincident Peak Prediction — Data Acquisition & Exploratory Analysis\n",
    "\n",
    "This notebook fetches 15 years of IESO hourly demand data and Open-Meteo historical weather for Toronto,\n",
    "merges the datasets, and performs comprehensive exploratory analysis of Ontario's peak demand patterns.\n",
    "\n",
    "**Data sources:**\n",
    "- IESO public hourly demand CSVs (2010–2021 from `reports-public.ieso.ca`)\n",
    "- Local ICI demand CSVs (2022–2025 base periods)\n",
    "- Open-Meteo historical weather archive for Toronto (43.65°N, -79.38°W)\n",
    "- Ground truth peak data from IESO Top-10 and Top-5 archives (xlsx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.dates as mdates\n",
    "import seaborn as sns\n",
    "import requests\n",
    "import openpyxl\n",
    "import os\n",
    "import io\n",
    "import warnings\n",
    "from pathlib import Path\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "warnings.filterwarnings('ignore', category=FutureWarning)\n",
    "sns.set_theme(style='whitegrid', font_scale=1.1)\n",
    "plt.rcParams['figure.figsize'] = (12, 6)\n",
    "plt.rcParams['figure.dpi'] = 100\n",
    "\n",
    "# Paths\n",
    "PROJECT_ROOT = Path(r'C:/wamp64/www/Spec_Driven_Dev_Website')\n",
    "DATA_DIR = PROJECT_ROOT / 'assets' / 'sample data'\n",
    "OUTPUT_DIR = PROJECT_ROOT / 'notebooks' / 'source' / 'data'\n",
    "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# Constants\n",
    "TORONTO_LAT = 43.65\n",
    "TORONTO_LON = -79.38\n",
    "IESO_BASE_URL = 'https://reports-public.ieso.ca/public/Demand'\n",
    "\n",
    "print(f'Project root: {PROJECT_ROOT}')\n",
    "print(f'Data directory: {DATA_DIR}')\n",
    "print(f'Output directory: {OUTPUT_DIR}')\n",
    "print(f'Local data files: {list(DATA_DIR.glob(\"*\"))}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Local ICI Demand CSVs (2022–2025)\n",
    "\n",
    "These files cover ICI base periods (May 1 – April 30) and have 3 metadata header rows\n",
    "starting with `\\\\`. Format: `Date, Hour, Ontario Demand` with hour-ending convention (1–24 EST)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_ici_demand(filepath):\n",
    "    \"\"\"Load an IESO ICI demand CSV, skipping metadata header rows.\"\"\"\n",
    "    df = pd.read_csv(\n",
    "        filepath,\n",
    "        skiprows=3,  # Skip 3 header lines starting with \\\\\n",
    "        names=['Date', 'Hour', 'Ontario Demand'],\n",
    "        parse_dates=['Date']\n",
    "    )\n",
    "    df = df.dropna(subset=['Ontario Demand'])\n",
    "    df['Ontario Demand'] = pd.to_numeric(df['Ontario Demand'], errors='coerce')\n",
    "    df['Hour'] = pd.to_numeric(df['Hour'], errors='coerce').astype(int)\n",
    "    return df\n",
    "\n",
    "# Load each local ICI file\n",
    "ici_files = {\n",
    "    2022: DATA_DIR / 'PUB_ICIDemand_2022_v8647.csv',\n",
    "    2023: DATA_DIR / 'PUB_ICIDemand_2023_v8776.csv',\n",
    "    2024: DATA_DIR / 'PUB_ICIDemand_2024_v8680.csv',\n",
    "    2025: DATA_DIR / 'PUB_ICIDemand_2025_v6828.csv',\n",
    "}\n",
    "\n",
    "local_dfs = []\n",
    "for year, path in ici_files.items():\n",
    "    if path.exists():\n",
    "        df = load_ici_demand(path)\n",
    "        df['source'] = f'local_ici_{year}'\n",
    "        local_dfs.append(df)\n",
    "        print(f'Loaded {year}: {len(df)} rows, '\n",
    "              f'{df[\"Date\"].min().strftime(\"%Y-%m-%d\")} to '\n",
    "              f'{df[\"Date\"].max().strftime(\"%Y-%m-%d\")}')\n",
    "    else:\n",
    "        print(f'WARNING: {path.name} not found')\n",
    "\n",
    "local_demand = pd.concat(local_dfs, ignore_index=True)\n",
    "print(f'\\nTotal local ICI rows: {len(local_demand):,}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch IESO Historical Demand (2010–2021)\n",
    "\n",
    "The IESO publishes calendar-year hourly demand at:\n",
    "`https://reports-public.ieso.ca/public/Demand/PUB_Demand_[YYYY].csv`\n",
    "\n",
    "Format: `Date, Hour, Market Demand, Ontario Demand` with 3 metadata header rows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetch_ieso_demand(year):\n",
    "    \"\"\"Fetch IESO historical demand CSV for a calendar year.\"\"\"\n",
    "    url = f'{IESO_BASE_URL}/PUB_Demand_{year}.csv'\n",
    "    print(f'  Fetching {url}...', end=' ')\n",
    "    try:\n",
    "        response = requests.get(url, timeout=30)\n",
    "        response.raise_for_status()\n",
    "        df = pd.read_csv(\n",
    "            io.StringIO(response.text),\n",
    "            skiprows=3,\n",
    "            names=['Date', 'Hour', 'Market Demand', 'Ontario Demand'],\n",
    "            parse_dates=['Date']\n",
    "        )\n",
    "        df = df.dropna(subset=['Ontario Demand'])\n",
    "        df['Ontario Demand'] = pd.to_numeric(df['Ontario Demand'], errors='coerce')\n",
    "        df['Hour'] = pd.to_numeric(df['Hour'], errors='coerce').astype(int)\n",
    "        df = df[['Date', 'Hour', 'Ontario Demand']]\n",
    "        print(f'{len(df)} rows')\n",
    "        return df\n",
    "    except Exception as e:\n",
    "        print(f'FAILED: {e}')\n",
    "        return pd.DataFrame()\n",
    "\n",
    "print('Fetching IESO historical demand (2010-2021)...')\n",
    "remote_dfs = []\n",
    "for year in range(2010, 2022):\n",
    "    df = fetch_ieso_demand(year)\n",
    "    if not df.empty:\n",
    "        df['source'] = f'ieso_public_{year}'\n",
    "        remote_dfs.append(df)\n",
    "\n",
    "remote_demand = pd.concat(remote_dfs, ignore_index=True)\n",
    "print(f'\\nTotal remote rows: {len(remote_demand):,}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Combine into Unified Demand Dataset\n",
    "\n",
    "Merge local ICI demand (2022–2025) with remote IESO demand (2010–2021) into a single\n",
    "time-indexed dataframe. Create a proper datetime index in EST using IESO's hour-ending\n",
    "convention (Hour 1 = midnight to 1 AM)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combine local and remote data\n",
    "demand = pd.concat([remote_demand, local_demand], ignore_index=True)\n",
    "\n",
    "# Remove duplicates — local ICI files may overlap with remote calendar-year files\n",
    "# at the May transition boundaries. Keep the first occurrence.\n",
    "demand = demand.drop_duplicates(subset=['Date', 'Hour'], keep='first')\n",
    "\n",
    "# Create datetime index from IESO hour-ending convention:\n",
    "# Hour 1 = 00:00-01:00 EST, so the timestamp for Hour 1 is 01:00\n",
    "demand['datetime'] = pd.to_datetime(demand['Date']) + pd.to_timedelta(demand['Hour'], unit='h')\n",
    "demand = demand.sort_values('datetime').reset_index(drop=True)\n",
    "\n",
    "# Assign base period (May 1 of year Y to April 30 of year Y+1 = base period Y)\n",
    "demand['base_period'] = demand['Date'].apply(\n",
    "    lambda d: d.year if d.month >= 5 else d.year - 1\n",
    ")\n",
    "\n",
    "print(f'Unified demand dataset: {len(demand):,} rows')\n",
    "print(f'Date range: {demand[\"Date\"].min()} to {demand[\"Date\"].max()}')\n",
    "print(f'Base periods: {sorted(demand[\"base_period\"].unique())}')\n",
    "print(f'\\nRows per base period:')\n",
    "print(demand.groupby('base_period').size().to_string())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Ground Truth Peak Data\n",
    "\n",
    "Read the user's spreadsheet files containing:\n",
    "1. **Top-10 peaks** for 13 base periods (2012–2025) — Date, Hour, Demand (MW)\n",
    "2. **Top-5 peaks with system-wide consumption** for 15 base periods (2010–2025) — includes AQEW"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Top-10 Peaks Archive\n",
    "top10_path = DATA_DIR / 'Top-Ten-Ontario-Demand-Peaks-Archive(1).xlsx'\n",
    "wb = openpyxl.load_workbook(top10_path, data_only=True)\n",
    "\n",
    "peaks_records = []\n",
    "for sheet_name in wb.sheetnames:\n",
    "    ws = wb[sheet_name]\n",
    "    # Each sheet represents a base period, e.g. \"2024\" or \"2024-2025\"\n",
    "    # Extract base period year from sheet name\n",
    "    try:\n",
    "        bp_year = int(sheet_name.split('-')[0].strip())\n",
    "    except (ValueError, IndexError):\n",
    "        continue\n",
    "    \n",
    "    # Find the header row and data rows\n",
    "    for row in ws.iter_rows(min_row=1, max_row=ws.max_row, values_only=False):\n",
    "        cells = [c.value for c in row]\n",
    "        # Look for data rows: Rank is an integer, Date is a date\n",
    "        if (isinstance(cells[0], (int, float)) and \n",
    "            cells[0] in range(1, 11) and\n",
    "            cells[1] is not None):\n",
    "            rank = int(cells[0])\n",
    "            date_val = cells[1]\n",
    "            hour_val = cells[2]\n",
    "            demand_val = cells[3]\n",
    "            \n",
    "            # Parse date\n",
    "            if isinstance(date_val, datetime):\n",
    "                date_parsed = date_val.date()\n",
    "            elif isinstance(date_val, str):\n",
    "                date_parsed = pd.to_datetime(date_val).date()\n",
    "            else:\n",
    "                continue\n",
    "            \n",
    "            peaks_records.append({\n",
    "                'base_period': bp_year,\n",
    "                'rank': rank,\n",
    "                'date': pd.Timestamp(date_parsed),\n",
    "                'hour_ending': int(hour_val) if hour_val else None,\n",
    "                'ontario_demand_mw': float(demand_val) if demand_val else None,\n",
    "                'source': 'top10_archive'\n",
    "            })\n",
    "\n",
    "peaks_df = pd.DataFrame(peaks_records)\n",
    "print(f'Loaded {len(peaks_df)} peak records across {peaks_df[\"base_period\"].nunique()} base periods')\n",
    "print(f'Base periods: {sorted(peaks_df[\"base_period\"].unique())}')\n",
    "\n",
    "# Separate top-5 as our primary ground truth labels\n",
    "top5_peaks = peaks_df[peaks_df['rank'] <= 5].copy()\n",
    "print(f'\\nTop-5 peaks: {len(top5_peaks)} records')\n",
    "print(top5_peaks.head(10).to_string(index=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Top-5 Peaks with System-Wide Consumption\n",
    "top5_system_path = DATA_DIR / 'Top-5-Peaks-Hours-and-System-Wide-Consumption.xlsx'\n",
    "wb2 = openpyxl.load_workbook(top5_system_path, data_only=True)\n",
    "\n",
    "system_records = []\n",
    "for sheet_name in wb2.sheetnames:\n",
    "    ws = wb2[sheet_name]\n",
    "    try:\n",
    "        bp_year = int(sheet_name.split('-')[0].strip())\n",
    "    except (ValueError, IndexError):\n",
    "        continue\n",
    "    \n",
    "    for row in ws.iter_rows(min_row=1, max_row=ws.max_row, values_only=False):\n",
    "        cells = [c.value for c in row]\n",
    "        if (isinstance(cells[0], (int, float)) and \n",
    "            cells[0] in range(1, 6) and\n",
    "            cells[1] is not None):\n",
    "            system_records.append({\n",
    "                'base_period': bp_year,\n",
    "                'rank': int(cells[0]),\n",
    "                'date': pd.Timestamp(cells[1]) if isinstance(cells[1], datetime) else pd.to_datetime(str(cells[1])),\n",
    "                'hour_ending': int(cells[2]) if cells[2] else None,\n",
    "                'aqew_mwh': float(cells[3]) if cells[3] else None,\n",
    "                'embedded_gen_mwh': float(cells[4]) if len(cells) > 4 and cells[4] else None,\n",
    "                'total_mwh': float(cells[-1]) if cells[-1] and isinstance(cells[-1], (int, float)) else None,\n",
    "            })\n",
    "\n",
    "system_df = pd.DataFrame(system_records)\n",
    "print(f'Loaded {len(system_df)} system consumption records')\n",
    "print(f'Base periods: {sorted(system_df[\"base_period\"].unique())}')\n",
    "print(f'\\nSample records:')\n",
    "print(system_df.head(10).to_string(index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fetch Open-Meteo Historical Weather\n",
    "\n",
    "Fetch hourly weather data for Toronto (43.65°N, -79.38°W) from the Open-Meteo historical\n",
    "archive API. Variables: temperature_2m, relative_humidity_2m, dewpoint_2m, wind_speed_10m,\n",
    "shortwave_radiation.\n",
    "\n",
    "The API accepts date ranges up to ~1 year per request, so we batch by year."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetch_openmeteo_weather(start_date, end_date, lat=TORONTO_LAT, lon=TORONTO_LON):\n",
    "    \"\"\"Fetch hourly weather from Open-Meteo historical archive.\"\"\"\n",
    "    url = 'https://archive-api.open-meteo.com/v1/archive'\n",
    "    params = {\n",
    "        'latitude': lat,\n",
    "        'longitude': lon,\n",
    "        'start_date': start_date,\n",
    "        'end_date': end_date,\n",
    "        'hourly': 'temperature_2m,relative_humidity_2m,dewpoint_2m,wind_speed_10m,shortwave_radiation',\n",
    "        'timezone': 'America/Toronto'\n",
    "    }\n",
    "    response = requests.get(url, params=params, timeout=60)\n",
    "    response.raise_for_status()\n",
    "    data = response.json()\n",
    "    \n",
    "    hourly = data['hourly']\n",
    "    df = pd.DataFrame({\n",
    "        'datetime': pd.to_datetime(hourly['time']),\n",
    "        'temperature_c': hourly['temperature_2m'],\n",
    "        'relative_humidity': hourly['relative_humidity_2m'],\n",
    "        'dewpoint_c': hourly['dewpoint_2m'],\n",
    "        'wind_speed_kmh': hourly['wind_speed_10m'],\n",
    "        'shortwave_radiation': hourly['shortwave_radiation'],\n",
    "    })\n",
    "    return df\n",
    "\n",
    "print('Fetching Open-Meteo historical weather for Toronto (2010-2025)...')\n",
    "weather_dfs = []\n",
    "for year in range(2010, 2026):\n",
    "    start = f'{year}-01-01'\n",
    "    end = f'{year}-12-31' if year < 2026 else '2026-03-08'\n",
    "    print(f'  {year}...', end=' ')\n",
    "    try:\n",
    "        wdf = fetch_openmeteo_weather(start, end)\n",
    "        weather_dfs.append(wdf)\n",
    "        print(f'{len(wdf)} rows')\n",
    "    except Exception as e:\n",
    "        print(f'FAILED: {e}')\n",
    "\n",
    "weather = pd.concat(weather_dfs, ignore_index=True)\n",
    "weather = weather.drop_duplicates(subset=['datetime']).sort_values('datetime').reset_index(drop=True)\n",
    "print(f'\\nTotal weather rows: {len(weather):,}')\n",
    "print(f'Date range: {weather[\"datetime\"].min()} to {weather[\"datetime\"].max()}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Timestamp Alignment & Merge\n",
    "\n",
    "**Critical alignment:** IESO uses hour-ending convention (Hour 1 = 00:00–01:00 EST,\n",
    "timestamp at 01:00). Open-Meteo uses hour-beginning convention in local timezone\n",
    "(hour 0 = 00:00–01:00, timestamp at 00:00).\n",
    "\n",
    "To align: IESO Hour 1 at 01:00 → Open-Meteo hour 0 at 00:00. So we shift IESO's\n",
    "datetime back by 1 hour, or equivalently, shift Open-Meteo forward by 1 hour."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open-Meteo timestamps are hour-beginning in local (EST/EDT) time.\n",
    "# IESO datetime is hour-ending: Hour=1 means 01:00 for the 00:00-01:00 interval.\n",
    "# To align: shift Open-Meteo forward by 1 hour so both represent hour-ending.\n",
    "weather['datetime_he'] = weather['datetime'] + pd.Timedelta(hours=1)\n",
    "\n",
    "# Merge on the aligned datetime\n",
    "merged = demand.merge(\n",
    "    weather,\n",
    "    left_on='datetime',\n",
    "    right_on='datetime_he',\n",
    "    how='left',\n",
    "    suffixes=('', '_weather')\n",
    ")\n",
    "\n",
    "# Check merge quality\n",
    "weather_coverage = merged['temperature_c'].notna().mean() * 100\n",
    "print(f'Merged dataset: {len(merged):,} rows')\n",
    "print(f'Weather coverage: {weather_coverage:.1f}% of demand rows have weather data')\n",
    "print(f'\\nMissing weather by year:')\n",
    "missing_by_year = merged.groupby(merged['Date'].dt.year)['temperature_c'].apply(lambda x: x.isna().sum())\n",
    "print(missing_by_year[missing_by_year > 0].to_string())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data validation: check hourly completeness\n",
    "print('Hours per calendar year (expected ~8,760):')\n",
    "hours_by_year = demand.groupby(demand['Date'].dt.year).size()\n",
    "for year, count in hours_by_year.items():\n",
    "    expected = 8784 if (year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)) else 8760\n",
    "    status = 'OK' if abs(count - expected) < 48 else f'GAP ({expected - count} missing)'\n",
    "    # Note: DST transitions and partial years (2025) will show slight deviations\n",
    "    print(f'  {year}: {count:,} hours — {status}')\n",
    "\n",
    "# Forward-fill small weather gaps (< 3 hours)\n",
    "weather_cols = ['temperature_c', 'relative_humidity', 'dewpoint_c', 'wind_speed_kmh', 'shortwave_radiation']\n",
    "merged[weather_cols] = merged[weather_cols].fillna(method='ffill', limit=3)\n",
    "\n",
    "remaining_gaps = merged[weather_cols].isna().sum()\n",
    "print(f'\\nRemaining weather gaps after forward-fill (limit=3):')\n",
    "print(remaining_gaps.to_string())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## EDA: Annual Peak Demand Trend (2010–2025)\n",
    "\n",
    "How has Ontario's peak system demand evolved over 15 years? The trend reflects\n",
    "embedded generation growth, conservation programs, and structural demand shifts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Annual maximum Ontario Demand by calendar year\n",
    "annual_peak = demand.groupby(demand['Date'].dt.year)['Ontario Demand'].max().reset_index()\n",
    "annual_peak.columns = ['Year', 'Peak Demand (MW)']\n",
    "# Exclude partial year 2026\n",
    "annual_peak = annual_peak[annual_peak['Year'] <= 2025]\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 5))\n",
    "ax.plot(annual_peak['Year'], annual_peak['Peak Demand (MW)'], 'o-', \n",
    "        color='#d32f2f', linewidth=2, markersize=8)\n",
    "ax.fill_between(annual_peak['Year'], annual_peak['Peak Demand (MW)'], \n",
    "                alpha=0.1, color='#d32f2f')\n",
    "ax.set_xlabel('Year')\n",
    "ax.set_ylabel('Annual Peak Ontario Demand (MW)')\n",
    "ax.set_title('Ontario Annual Peak System Demand (2010–2025)')\n",
    "ax.set_ylim(18000, 26000)\n",
    "ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:,.0f}'))\n",
    "\n",
    "# Annotate notable years\n",
    "for _, row in annual_peak.iterrows():\n",
    "    ax.annotate(f'{row[\"Peak Demand (MW)\"]:.0f}', \n",
    "                (row['Year'], row['Peak Demand (MW)']),\n",
    "                textcoords='offset points', xytext=(0, 10),\n",
    "                fontsize=8, ha='center', color='#555')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(OUTPUT_DIR / 'annual_peak_demand.png', dpi=150, bbox_inches='tight')\n",
    "plt.show()\n",
    "print(annual_peak.to_string(index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## EDA: Monthly Demand Box Plots\n",
    "\n",
    "Demand follows a strong seasonal pattern — summer cooling load drives the highest demands,\n",
    "with a secondary winter heating peak in some years."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Daily max demand for box plots\n",
    "daily_max = demand.groupby('Date')['Ontario Demand'].max().reset_index()\n",
    "daily_max.columns = ['Date', 'Daily Max Demand (MW)']\n",
    "daily_max['Month'] = daily_max['Date'].dt.month\n",
    "daily_max['Month Name'] = daily_max['Date'].dt.strftime('%b')\n",
    "\n",
    "month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', \n",
    "               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "sns.boxplot(data=daily_max, x='Month Name', y='Daily Max Demand (MW)',\n",
    "            order=month_order, palette='coolwarm', ax=ax,\n",
    "            fliersize=2, linewidth=1)\n",
    "ax.set_xlabel('Month')\n",
    "ax.set_ylabel('Daily Maximum Demand (MW)')\n",
    "ax.set_title('Ontario Daily Peak Demand by Month (2010–2025)')\n",
    "ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:,.0f}'))\n",
    "\n",
    "# Mark the peak season\n",
    "ax.axvspan(4.5, 7.5, alpha=0.08, color='red', label='Peak season (Jun–Aug)')\n",
    "ax.legend(loc='upper left')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(OUTPUT_DIR / 'monthly_demand_boxplots.png', dpi=150, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## EDA: Hourly Demand Heatmap\n",
    "\n",
    "Average demand by hour-of-day and month reveals the afternoon summer peak cluster\n",
    "where coincident peaks concentrate."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "demand['Month'] = demand['Date'].dt.month\n",
    "heatmap_data = demand.pivot_table(\n",
    "    values='Ontario Demand',\n",
    "    index='Hour',\n",
    "    columns='Month',\n",
    "    aggfunc='mean'\n",
    ")\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 8))\n",
    "sns.heatmap(heatmap_data, cmap='YlOrRd', annot=False, fmt='.0f',\n",
    "            xticklabels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',\n",
    "                         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],\n",
    "            yticklabels=range(1, 25),\n",
    "            ax=ax, cbar_kws={'label': 'Average Demand (MW)'})\n",
    "ax.set_xlabel('Month')\n",
    "ax.set_ylabel('Hour Ending (EST)')\n",
    "ax.set_title('Average Ontario Demand by Hour and Month (2010–2025)\\n'\n",
    "             'Coincident peaks cluster in the hot upper-right region: Jun–Aug, Hours 15–19')\n",
    "\n",
    "# Draw box around peak zone\n",
    "from matplotlib.patches import Rectangle\n",
    "rect = Rectangle((5, 13), 3, 6, linewidth=2, edgecolor='black', \n",
    "                  facecolor='none', linestyle='--')\n",
    "ax.add_patch(rect)\n",
    "ax.text(6.5, 12.5, 'Peak Zone', ha='center', fontsize=10, fontweight='bold')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(OUTPUT_DIR / 'hourly_demand_heatmap.png', dpi=150, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## EDA: Peak Analysis\n",
    "\n",
    "Distribution of actual top-5 coincident peaks by month, day-of-week, and hour-of-day\n",
    "across all available base periods."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze top-5 peaks\n",
    "top5 = top5_peaks.copy()\n",
    "top5['month'] = top5['date'].dt.month\n",
    "top5['month_name'] = top5['date'].dt.strftime('%b')\n",
    "top5['day_of_week'] = top5['date'].dt.day_name()\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(16, 5))\n",
    "\n",
    "# Month distribution\n",
    "month_counts = top5['month'].value_counts().reindex(range(1, 13), fill_value=0)\n",
    "colors = ['#2196F3' if m not in [6, 7, 8] else '#d32f2f' for m in range(1, 13)]\n",
    "axes[0].bar(range(1, 13), month_counts.values, color=colors)\n",
    "axes[0].set_xticks(range(1, 13))\n",
    "axes[0].set_xticklabels(['J', 'F', 'M', 'A', 'M', 'J', 'J', 'A', 'S', 'O', 'N', 'D'])\n",
    "axes[0].set_xlabel('Month')\n",
    "axes[0].set_ylabel('Count')\n",
    "axes[0].set_title('Top-5 Peaks by Month')\n",
    "pct_summer = (month_counts[[6, 7, 8]].sum() / month_counts.sum()) * 100\n",
    "axes[0].text(0.95, 0.95, f'Jun–Aug: {pct_summer:.0f}%', transform=axes[0].transAxes,\n",
    "             ha='right', va='top', fontsize=10, fontweight='bold',\n",
    "             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))\n",
    "\n",
    "# Day of week distribution\n",
    "dow_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']\n",
    "dow_counts = top5['day_of_week'].value_counts().reindex(dow_order, fill_value=0)\n",
    "dow_colors = ['#4CAF50' if d not in ['Saturday', 'Sunday'] else '#9E9E9E' for d in dow_order]\n",
    "axes[1].bar(range(7), dow_counts.values, color=dow_colors)\n",
    "axes[1].set_xticks(range(7))\n",
    "axes[1].set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])\n",
    "axes[1].set_xlabel('Day of Week')\n",
    "axes[1].set_ylabel('Count')\n",
    "axes[1].set_title('Top-5 Peaks by Day of Week')\n",
    "axes[1].text(0.95, 0.95, '100% weekdays', transform=axes[1].transAxes,\n",
    "             ha='right', va='top', fontsize=10, fontweight='bold',\n",
    "             bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.5))\n",
    "\n",
    "# Hour distribution\n",
    "hour_counts = top5['hour_ending'].value_counts().reindex(range(1, 25), fill_value=0)\n",
    "hour_colors = ['#FF9800' if h in range(15, 20) else '#BDBDBD' for h in range(1, 25)]\n",
    "axes[2].bar(range(1, 25), hour_counts.values, color=hour_colors)\n",
    "axes[2].set_xlabel('Hour Ending (EST)')\n",
    "axes[2].set_ylabel('Count')\n",
    "axes[2].set_title('Top-5 Peaks by Hour')\n",
    "pct_window = (hour_counts[15:20].sum() / hour_counts.sum()) * 100\n",
    "axes[2].text(0.95, 0.95, f'HE 15–19: {pct_window:.0f}%', transform=axes[2].transAxes,\n",
    "             ha='right', va='top', fontsize=10, fontweight='bold',\n",
    "             bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.5))\n",
    "\n",
    "plt.suptitle('When Do Ontario\\'s Coincident Peaks Occur?', fontsize=14, fontweight='bold', y=1.02)\n",
    "plt.tight_layout()\n",
    "plt.savefig(OUTPUT_DIR / 'peak_distributions.png', dpi=150, bbox_inches='tight')\n",
    "plt.show()\n",
    "\n",
    "# Summary table\n",
    "print('\\n=== Top-5 Peak Summary Statistics ===')\n",
    "print(f'Total peaks analyzed: {len(top5)}')\n",
    "print(f'June–August concentration: {pct_summer:.1f}%')\n",
    "print(f'Weekday concentration: 100.0%')\n",
    "print(f'Hours 15–19 concentration: {pct_window:.1f}%')\n",
    "print(f'\\nDemand range: {top5[\"ontario_demand_mw\"].min():,.0f} – {top5[\"ontario_demand_mw\"].max():,.0f} MW')\n",
    "print(f'Most common hour: HE {top5[\"hour_ending\"].mode().values[0]}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## EDA: Weather-Demand Relationship\n",
    "\n",
    "The physical basis for prediction: daily maximum temperature vs. daily maximum demand.\n",
    "Above approximately 22°C, AC cooling load drives demand nonlinearly upward."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create daily summary for scatter plot\n",
    "daily_summary = merged.groupby('Date').agg({\n",
    "    'Ontario Demand': 'max',\n",
    "    'temperature_c': 'max'\n",
    "}).reset_index()\n",
    "daily_summary.columns = ['Date', 'Daily Max Demand', 'Daily Max Temp']\n",
    "daily_summary['Month'] = daily_summary['Date'].dt.month\n",
    "daily_summary = daily_summary.dropna()\n",
    "\n",
    "# Season labels\n",
    "def get_season(month):\n",
    "    if month in [6, 7, 8]:\n",
    "        return 'Summer (Jun–Aug)'\n",
    "    elif month in [12, 1, 2]:\n",
    "        return 'Winter (Dec–Feb)'\n",
    "    elif month in [3, 4, 5]:\n",
    "        return 'Spring (Mar–May)'\n",
    "    else:\n",
    "        return 'Fall (Sep–Nov)'\n",
    "\n",
    "daily_summary['Season'] = daily_summary['Month'].apply(get_season)\n",
    "\n",
    "# Scatter plot\n",
    "fig, ax = plt.subplots(figsize=(12, 7))\n",
    "season_colors = {\n",
    "    'Summer (Jun–Aug)': '#d32f2f',\n",
    "    'Winter (Dec–Feb)': '#1565C0',\n",
    "    'Spring (Mar–May)': '#4CAF50',\n",
    "    'Fall (Sep–Nov)': '#FF9800'\n",
    "}\n",
    "\n",
    "for season, color in season_colors.items():\n",
    "    mask = daily_summary['Season'] == season\n",
    "    ax.scatter(daily_summary.loc[mask, 'Daily Max Temp'],\n",
    "              daily_summary.loc[mask, 'Daily Max Demand'],\n",
    "              alpha=0.3, s=10, color=color, label=season)\n",
    "\n",
    "# Mark actual top-5 peak days\n",
    "peak_dates = set(top5_peaks['date'].dt.date)\n",
    "peak_mask = daily_summary['Date'].dt.date.isin(peak_dates)\n",
    "if peak_mask.any():\n",
    "    ax.scatter(daily_summary.loc[peak_mask, 'Daily Max Temp'],\n",
    "              daily_summary.loc[peak_mask, 'Daily Max Demand'],\n",
    "              marker='*', s=200, color='black', zorder=5, label='Top-5 Peak Days',\n",
    "              edgecolors='gold', linewidths=1)\n",
    "\n",
    "# Mark the ~22°C inflection point\n",
    "ax.axvline(x=22, color='gray', linestyle='--', alpha=0.5)\n",
    "ax.text(22.5, ax.get_ylim()[0] + 500, 'AC inflection\\n~22°C', fontsize=9, color='gray')\n",
    "\n",
    "ax.set_xlabel('Daily Maximum Temperature (°C)')\n",
    "ax.set_ylabel('Daily Maximum Ontario Demand (MW)')\n",
    "ax.set_title('Temperature vs. Demand — The Physical Basis for Peak Prediction')\n",
    "ax.legend(loc='upper left', framealpha=0.9)\n",
    "ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:,.0f}'))\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(OUTPUT_DIR / 'weather_demand_scatter.png', dpi=150, bbox_inches='tight')\n",
    "plt.show()\n",
    "\n",
    "# Correlation analysis\n",
    "summer = daily_summary[daily_summary['Month'].isin([6, 7, 8])]\n",
    "corr_all = daily_summary[['Daily Max Demand', 'Daily Max Temp']].corr().iloc[0, 1]\n",
    "corr_summer = summer[['Daily Max Demand', 'Daily Max Temp']].corr().iloc[0, 1]\n",
    "print(f'Correlation (all seasons): {corr_all:.3f}')\n",
    "print(f'Correlation (summer only): {corr_summer:.3f}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save Cleaned Dataset\n",
    "\n",
    "Export the merged demand + weather dataset as parquet for downstream notebooks.\n",
    "Also save the ground truth peak labels separately."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save merged hourly dataset\n",
    "cols_to_save = ['Date', 'Hour', 'Ontario Demand', 'datetime', 'base_period',\n",
    "                'temperature_c', 'relative_humidity', 'dewpoint_c', \n",
    "                'wind_speed_kmh', 'shortwave_radiation']\n",
    "output_df = merged[cols_to_save].copy()\n",
    "output_df.to_parquet(OUTPUT_DIR / 'ieso_demand_weather_2010_2025.parquet', index=False)\n",
    "print(f'Saved hourly dataset: {len(output_df):,} rows')\n",
    "print(f'  -> {OUTPUT_DIR / \"ieso_demand_weather_2010_2025.parquet\"}')\n",
    "\n",
    "# Save peak labels\n",
    "peaks_df.to_parquet(OUTPUT_DIR / 'ieso_peak_labels.parquet', index=False)\n",
    "print(f'\\nSaved peak labels: {len(peaks_df)} records')\n",
    "print(f'  -> {OUTPUT_DIR / \"ieso_peak_labels.parquet\"}')\n",
    "\n",
    "# Save system consumption data\n",
    "system_df.to_parquet(OUTPUT_DIR / 'ieso_system_consumption.parquet', index=False)\n",
    "print(f'\\nSaved system consumption: {len(system_df)} records')\n",
    "print(f'  -> {OUTPUT_DIR / \"ieso_system_consumption.parquet\"}')\n",
    "\n",
    "# Save daily summary for quick access\n",
    "daily_summary.to_parquet(OUTPUT_DIR / 'ieso_daily_summary.parquet', index=False)\n",
    "print(f'\\nSaved daily summary: {len(daily_summary)} records')\n",
    "print(f'  -> {OUTPUT_DIR / \"ieso_daily_summary.parquet\"}')\n",
    "\n",
    "print('\\n=== Notebook 1 complete ===')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}