Source code for climada_petals.hazard.emulator.emulator

"""
This file is part of CLIMADA.

Copyright (C) 2017 ETH Zurich, CLIMADA contributors listed in AUTHORS.

CLIMADA is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free
Software Foundation, version 3.

CLIMADA is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with CLIMADA. If not, see <https://www.gnu.org/licenses/>.

---

Hazard event emulator.
"""

import logging
import sys

import numpy as np
import pandas as pd

import climada_petals.hazard.emulator.const as const
import climada_petals.hazard.emulator.stats as stats
import climada_petals.hazard.emulator.random as random

LOGGER = logging.getLogger(__name__)

class HazardEmulator():
    """Draw samples for a time period driven by climate forcing

    Draw samples from the given pool of hazard events while making sure that
    the frequency and intensity are as predicted according to given climate
    indices.
    """

    explaineds = ['intensity_mean', 'eventcount']

    def __init__(self, haz_events, haz_events_obs, region, freq_norm, pool=None):
        """Initialize HazardEmulator

        Parameters
        ----------
        haz_events : DataFrame
            Output of `stats.haz_max_events`.
        haz_events_obs : DataFrame
            Observed events for normalization. Output of `stats.haz_max_events`.
        region : HazRegion object
            The geographical region for which to run emulations.
        freq_norm : DataFrame { year, freq }
            Information about the relative surplus of events in `tracks`, i.e.,
            if `freq_norm` specifies the value 0.2 in some year, then it is
            assumed that the number of events given for that year is 5 times as
            large as it is predicted to be. Usually, the value will be smaller
            than 1 because the event set should be a good representation of the
            TC distribution, but this is not necessary.
        pool : EventPool object, optional
            If omitted, draws are made from `haz_events`.
        """
        self.pool = EventPool(haz_events) if pool is None else pool
        self.region = region

        haz_stats = stats.seasonal_statistics(haz_events, region.season)
        haz_stats_obs = stats.seasonal_statistics(haz_events_obs, region.season)
        self.stats = stats.normalize_seasonal_statistics(haz_stats, haz_stats_obs, freq_norm)

        self.stats_pred = None
        self.fit_info = None
        self.ci_cols = []

        norm_period = [haz_events_obs['year'].min(), haz_events_obs['year'].max()]
        idx = self.stats.index[(self.stats['year'] >= norm_period[0])
                               & (self.stats['year'] <= norm_period[1])]
        norm_mean = self.stats.loc[idx, "intensity_mean_obs"].mean()
        self.pool.init_drop(norm_period, norm_mean)
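
    # Illustrative `freq_norm` input for the constructor (values are made up):
    # a value of 0.2 in 2030 means the event set contains 1 / 0.2 = 5 times as
    # many events for that year as are predicted to occur.
    #
    # >>> freq_norm = pd.DataFrame({"year": [2029, 2030], "freq": [0.25, 0.2]})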

    def calibrate_statistics(self, climate_indices):
        """Statistically fit hazard data to given climate indices

        The internal statistics are truncated to fit the temporal range of the
        climate indices.

        Parameters
        ----------
        climate_indices : list of DataFrames { year, month, ... }
            Yearly or monthly time series of GMT, ESOI etc.
        """
        if len(self.ci_cols) > 0:
            self.stats = self.stats.drop(labels=self.ci_cols, axis=1)
        self.ci_cols = []
        for cidx in climate_indices:
            ci_name = cidx.columns.values.tolist()
            ci_name.remove("year")
            ci_name.remove("month")
            self.ci_cols += ci_name
            avg_season = const.PDO_SEASON if "pdo" in ci_name else self.region.season
            avg = stats.seasonal_average(cidx, avg_season)
            self.stats = pd.merge(self.stats, avg, on="year", how="inner", sort=True)
        self.stats = self.stats.dropna(axis=0, how="any",
                                       subset=self.explaineds + self.ci_cols)
        self.fit_info = {}
        for explained in self.explaineds:
            self.fit_info[explained] = stats.fit_data(
                self.stats, explained, self.ci_cols,
                poisson=(explained == 'eventcount'))
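
    # Illustrative shape of a climate index DataFrame accepted by
    # `calibrate_statistics` (the column name "gmt" and all values are made
    # up): every column other than "year" and "month" is taken as a climate
    # index and added to the set of explanatory variables.
    #
    # >>> gmt = pd.DataFrame({
    # ...     "year":  [1980, 1980, 1981],
    # ...     "month": [1, 2, 1],
    # ...     "gmt":   [0.10, 0.12, 0.15],
    # ... })
    # >>> emulator.calibrate_statistics([gmt])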

    def predict_statistics(self, climate_indices=None):
        """Predict hypothetical hazard statistics according to climate indices

        The statistical fit from `calibrate_statistics` is used to predict the
        frequency and intensity of hazard events. The standard deviation of
        yearly residuals is used to define the yearly acceptable deviation of
        sample intensity.

        Without calibration, the prediction is done according to the
        (bias-corrected) within-year statistics of the event pool. In this
        case, the within-year standard deviation of intensity is taken as the
        acceptable deviation of samples for that year.

        Parameters
        ----------
        climate_indices : list of DataFrames { year, month, ... }
            Yearly or monthly time series of GMT, ESOI etc. including at least
            those passed to `calibrate_statistics`. If omitted, and if
            `calibrate_statistics` has been called before, the climate indices
            from calibration are reused for prediction. Otherwise, the internal
            (within-year) statistics of the data set are used to predict
            frequency and intensity.
        """
        reuse_indices = False
        if not climate_indices:
            reuse_indices = True
        elif len(climate_indices) > 0 and len(self.ci_cols) == 0:
            self.calibrate_statistics(climate_indices)
            reuse_indices = True

        if len(self.ci_cols) == 0:
            LOGGER.info("Predicting statistics without climate index predictor...")
            self.stats_pred = self.stats[['year', 'intensity_mean', 'eventcount']]
            self.stats_pred["intensity_mean_residuals"] = self.stats["intensity_std"]
            self.stats_pred["eventcount_residuals"] = 0
        elif reuse_indices:
            LOGGER.info("Predicting statistics with climate indices from calibration...")
            self.stats_pred = self.stats[['year'] + self.ci_cols]
            for explained in self.explaineds:
                sm_results = self.fit_info[explained][-1]
                self.stats_pred[explained] = sm_results.fittedvalues
                self.stats_pred[f"{explained}_residuals"] = sm_results.resid
        else:
            LOGGER.info("Predicting statistics with new climate index time series...")
            ci_avg = None
            for cidx in climate_indices:
                ci_name = cidx.columns.values.tolist()
                ci_name.remove("year")
                ci_name.remove("month")
                avg_season = const.PDO_SEASON if "pdo" in ci_name else self.region.season
                avg = stats.seasonal_average(cidx, avg_season)
                if ci_avg is None:
                    ci_avg = avg
                else:
                    ci_avg = pd.merge(ci_avg, avg, on="year", how="inner")
            self.stats_pred = ci_avg[["year"] + self.ci_cols]
            ci_data = self.stats_pred[self.ci_cols]
            ci_data['const'] = 1.0
            for explained in self.explaineds:
                sm_results = self.fit_info[explained][-1]
                explanatory = sm_results.params.index.tolist()
                haz_stats_pred = sm_results.predict(ci_data[explanatory])
                self.stats_pred[explained] = haz_stats_pred
                # use standard deviation of calibration residuals as "residuals":
                self.stats_pred[f"{explained}_residuals"] = float(sm_results.resid.std())
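
    # Illustrative calls, assuming `calibrate_statistics` has been run with a
    # made-up GMT index as sketched above (`gmt_future` is hypothetical):
    #
    # >>> emulator.predict_statistics()              # reuse calibration indices
    # >>> emulator.predict_statistics([gmt_future])  # predict from new series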

    def draw_realizations(self, nrealizations, period):
        """Draw samples for given time period according to calibration

        Draws for a specific year in the given period are not necessarily
        restricted to events in the pool that are explicitly assigned to that
        year because the pool might be too small to allow for draws of the
        expected sample size and mean intensity.

        Parameters
        ----------
        nrealizations : int
            Number of samples to draw.
        period : pair of ints [minyear, maxyear]
            Period for which to make draws.

        Returns
        -------
        draws : DataFrame { id, name, year, real_id }
            Draws for all realizations over the whole period, concatenated
            into a single DataFrame. The `year` column is set to the draw
            year and `real_id` identifies the realization a row belongs to.
        """
        if self.stats_pred is None:
            raise Exception("Run `predict_statistics` before making draws!")
        LOGGER.info("Drawing %d realizations for period (%d, %d)",
                    nrealizations, period[0], period[1])
        year_draws = []
        for year in range(period[0], period[1] + 1):
            sys.stdout.write(f"\r{period[0]} ... {year} ... {period[1]}")
            sys.stdout.flush()
            year_idx = self.stats_pred.index[self.stats_pred['year'] == year]
            freq_poisson = self.stats_pred.loc[year_idx, 'eventcount'].values[0]
            intensity_mean = self.stats_pred.loc[year_idx, 'intensity_mean'].values[0]
            intensity_std = self.stats_pred.loc[year_idx, 'intensity_mean_residuals'].values[0]
            intensity_std = np.clip(np.abs(intensity_std), 0.5, 10)
            draws = self.pool.draw_realizations(nrealizations, freq_poisson,
                                                intensity_mean, intensity_std)
            for real_id, draw in enumerate(draws):
                draw['year'] = year
                draw['real_id'] = real_id
                draws[real_id] = draw[['id', 'name', 'year', 'real_id']]
            year_draws += draws
        sys.stdout.write(f"\r{period[0]} ... {period[1]} ... {period[1]}\n")
        return pd.concat(year_draws, ignore_index=True)
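
# A minimal end-to-end usage sketch (illustrative only): `haz_events`,
# `haz_events_obs`, `region`, `freq_norm` and `gmt` are assumed to be prepared
# as documented in the methods above; sample size and period are arbitrary.
#
# >>> emulator = HazardEmulator(haz_events, haz_events_obs, region, freq_norm)
# >>> emulator.calibrate_statistics([gmt])
# >>> emulator.predict_statistics()
# >>> draws = emulator.draw_realizations(100, (2020, 2050))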


class EventPool():
    """Make draws from a hazard event pool according to given statistics

    The event pool might cover an arbitrary number of years and an arbitrary
    geographical region since the time and geo information fields are ignored
    when making draws.

    No assumptions are made about where the statistics come from that are
    used in making the draw.

    Example
    -------
    Let `haz_events` be a given dataset of all TC events making landfall in
    Belize between 1980 and 2050, together with their respective maximum wind
    speeds on land. Assume that we expect (from some other statistical model)
    5 events of annual mean maximum wind speed 30 ± 10 m/s in the year 2025.
    Then, we can draw 100 realizations of hypothetical 2025 TC event sets
    hitting Belize with the following commands:

    >>> pool = EventPool(haz_events)
    >>> draws = pool.draw_realizations(100, 5, 30, 10)

    The realization `draws[i]` might contain events from any year between
    1980 and 2050, but the size of the realization and the mean maximum wind
    speed will be according to the given statistics.
    """

    def __init__(self, haz_events):
        """Initialize instance of EventPool

        Parameters
        ----------
        haz_events : DataFrame
            Output of `stats.haz_max_events`.
        """
        self.events = haz_events
        self.drop = None

    def init_drop(self, norm_period, norm_mean):
        """Use a drop rule when making draws

        With the drop rule, a random choice of entries is dropped from the
        event pool before the actual drawing is done in order to speed up the
        process in case of data sets where the acceptable mean is far from the
        input data mean.

        Parameters
        ----------
        norm_period : pair of ints [minyear, maxyear]
            Normalization period for which a specific mean intensity is
            expected.
        norm_mean : float
            Desired mean intensity of events in the given time period.
        """
        self.drop = random.estimate_drop(self.events, 'year', 'intensity',
                                         norm_period, norm_mean=norm_mean)
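
    # Illustrative call (values are made up): expecting a mean intensity of
    # 32 m/s over the 1980-2010 normalization period, a random subset of
    # entries is dropped so that subsequent draws reach that mean faster.
    #
    # >>> pool.init_drop([1980, 2010], 32.0)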

    def draw_realizations(self, nrealizations, freq_poisson,
                          intensity_mean, intensity_std):
        """Draw samples from the event pool according to given statistics

        If `EventPool.init_drop` has been called before, the drop rule is
        applied.

        Parameters
        ----------
        nrealizations : int
            Number of samples to draw.
        freq_poisson : float
            Expected sample size ("frequency", Poisson distributed).
        intensity_mean : float
            Expected sample mean intensity.
        intensity_std : float
            Acceptable deviation from `intensity_mean`.

        Returns
        -------
        draws : list of DataFrames, length `nrealizations`
            Each entry is a sample, given as a DataFrame with columns as in
            `self.events`.
        """
        draws = []
        while len(draws) < nrealizations:
            intensity_accept = [intensity_mean - intensity_std,
                                intensity_mean + intensity_std]
            drawn = None
            while drawn is None:
                drawn = random.draw_poisson_events(
                    freq_poisson, self.events, 'intensity', intensity_accept,
                    drop=self.drop)
                # widen the acceptance interval for the next attempt in case
                # no draw within the current interval was possible
                intensity_accept[0] -= 1
                intensity_accept[1] += 1
            draws.append(self.events.loc[drawn])
        return draws
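
# Note on the acceptance interval in `EventPool.draw_realizations`: if a
# Poisson draw does not satisfy the intensity constraint, the acceptable band
# is widened by one intensity unit on each side before retrying. For example,
# with intensity_mean=30 and intensity_std=10 the band grows
# [20, 40] -> [19, 41] -> [18, 42] -> ... until a draw is accepted.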