"""
This file is part of CLIMADA.
Copyright (C) 2017 ETH Zurich, CLIMADA contributors listed in AUTHORS.
CLIMADA is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free
Software Foundation, version 3.
CLIMADA is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with CLIMADA. If not, see <https://www.gnu.org/licenses/>.
---
Statistical tools for the hazard event emulator.
"""
import datetime
import logging
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.discrete.discrete_model as smd
LOGGER = logging.getLogger(__name__)
[docs]
def seasonal_average(data, season):
"""Compute seasonal average from monthly-time series.
For seasons that are across newyear, the months after June are attributed to the following
year's season. For example: The 6-month season from November 1980 till April 1981 is attributed
to the year 1981.
The two seasons that are truncated at the beginning/end of the dataset's time period are
discarded. When the input data is 1980-2010, the output data will be 1981-2010, where 2010
corresponds to the 2009/2010 season and 1981 corresponds to the 1980/1981 season.
Parameters
----------
data : DataFrame { year, month, ... }
All further columns will be averaged over.
season : pair of ints
Start/end month of season.
Returns
-------
averaged_data : DataFrame { year, ... }
Same format as input, but with month column removed.
"""
start, end = season
if data['year'].unique().size == data.shape[0] and "month" in data.columns:
data = data.drop(labels=['month'], axis=1)
if "month" not in data.columns:
return data.iloc[1:] if start > end else data
if start > end:
msk = (data['month'] >= start) | (data['month'] <= end)
else:
msk = (data['month'] >= start) & (data['month'] <= end)
data = data[msk]
if start > end:
year_min, year_max = data['year'].min(), data['year'].max()
data['year'][data['month'] > 6] += 1
data = data[(data['year'] > year_min) & (data['year'] <= year_max)]
data = data.reset_index(drop=True)
data = data.groupby('year').mean().reset_index()
return data.drop('month', axis=1)
[docs]
def seasonal_statistics(events, season):
"""Compute seasonal statistics from given hazard event data
Parameters
----------
events : DataFrame { year, month, intensity, ... }
Events outside of the given season are ignored.
season : pair of ints
Start/end month of season.
Returns
-------
haz_stats : DataFrame { year, events, intensity_mean, intensity_std, intensity_max }
For seasons that are across newyear, this might cover one year less than the input data
since truncated seasons are discarded.
"""
events = events.reindex(columns=['year', 'month', 'eventcount', 'intensity'])
events['eventcount'] = 1
year_min, year_max = events['year'].min(), events['year'].max()
sea_start, sea_end = season
if sea_start > sea_end:
events['year'][events['month'] > 6] += 1
msk = (events['month'] >= sea_start) | (events['month'] <= sea_end)
else:
msk = (events['month'] >= sea_start) & (events['month'] <= sea_end)
events = events[msk].drop(labels=['month'], axis=1)
def collapse(group):
new_cols = ['eventcount', 'intensity_mean', 'intensity_std', 'intensity_max']
new_vals = [group['eventcount'].sum(),
group['intensity'].mean(),
group['intensity'].std(ddof=0),
group['intensity'].max()]
return pd.Series(new_vals, index=new_cols)
haz_stats = events.groupby(['year']).apply(collapse).reset_index()
if sea_end < sea_start:
# drop first and last years as they are incomplete
haz_stats = haz_stats[(haz_stats['year'] > year_min)
& (haz_stats['year'] <= year_max)].reset_index(drop=True)
return haz_stats
[docs]
def haz_max_events(hazard, min_thresh=0):
"""Table of max intensity events for given hazard
Parameters
----------
hazard : climada.hazard.Hazard object
min_thresh : float
Minimum intensity for event to be registered.
Returns
-------
events : DataFrame { id, name, year, month, day, lat, lon, intensity }
The integer value in column `id` refers to the internal order of events in the given
`hazard` object. `lat`, `lon` and `intensity` specify location and intensity of the maximum
intensity registered.
"""
inten = hazard.intensity
if min_thresh == 0:
# this might require considerable amounts of memory
exp_hazards = inten.todense() >= min_thresh
else:
exp_hazards = (inten >= min_thresh).todense()
exp_hazards = np.where(np.any(exp_hazards, axis=1))[0]
LOGGER.info("Condensing %d hazards to %d max events ...", inten.shape[0], exp_hazards.size)
inten = inten[exp_hazards]
inten_max_ids = np.asarray(inten.argmax(axis=1)).ravel()
inten_max = inten[range(inten.shape[0]), inten_max_ids]
dates = hazard.date[exp_hazards]
dates = [datetime.date.fromordinal(d) for d in dates]
return pd.DataFrame({
'id': exp_hazards,
'name': [hazard.event_name[s] for s in exp_hazards],
'year': np.int64([d.year for d in dates]),
'month': np.int64([d.month for d in dates]),
'day': np.int64([d.day for d in dates]),
'lat': hazard.centroids.lat[inten_max_ids],
'lon': hazard.centroids.lon[inten_max_ids],
'intensity': np.asarray(inten_max).ravel(),
})
[docs]
def normalize_seasonal_statistics(haz_stats, haz_stats_obs, freq_norm):
"""Bias-corrected annual hazard statistics
Parameters
----------
haz_stats : DataFrame { ... }
Output of `seasonal_statistics`.
haz_stats_obs : DataFrame { ... }
Output of `seasonal_statistics`.
freq_norm : DataFrame { year, freq }
Information about the relative surplus of hazard events per year, i.e.,
if `freq_norm` specifies the value 0.2 in some year, then it is
assumed that the number of events given for that year is 5 times as
large as it is predicted to be.
Returns
-------
statistics : DataFrame { year, intensity_max, intensity_mean, eventcount,
intensity_max_obs, intensity_mean_obs, eventcount_obs }
Normalized and observed hazard statistics.
"""
norm_period = [haz_stats_obs['year'].min(), haz_stats_obs['year'].max()]
# Merge observed into modelled statistics for comparison
haz_stats = pd.merge(haz_stats, haz_stats_obs, suffixes=('', '_obs'),
on="year", how="left", sort=True)
# Normalize `eventcount` according to simulated frequency.
# In case of season across newyear, this normalizes by the year with most of
# hazard season, ignoring the fractional contribution from the year before.
haz_stats = pd.merge(haz_stats, freq_norm, on='year', how='left', sort=True)
haz_stats['eventcount'] *= haz_stats['freq']
haz_stats = haz_stats.drop(labels=['freq'], axis=1)
# Bias-correct intensity and frequency to observations in norm period
for col in ['eventcount', 'intensity_mean', 'intensity_std', 'intensity_max']:
idx = haz_stats.index[(haz_stats['year'] >= norm_period[0]) \
& (haz_stats['year'] <= norm_period[1])]
col_data = haz_stats.loc[idx, col]
col_data_obs = haz_stats.loc[idx, f"{col}_obs"].dropna()
if col == 'eventcount':
fact = col_data_obs.sum() / col_data.sum()
else:
fact = col_data_obs.mean() / col_data.mean()
haz_stats[col] *= fact
return haz_stats
[docs]
def fit_data(data, explained, explanatory, poisson=False):
"""Fit a response variable (e.g. intensity) to a list of explanatory variables
The fitting is run twice, restricting to the significant explanatory
variables in the second run.
Parameters
----------
data : DataFrame { year, `explained`, `explanatory`, ... }
An intercept column is added automatically.
explained : str
Name of explained variable, e.g. 'intensity'.
explanatory : list of str
Names of explanatory variables, e.g. ['gmt','esoi'].
poisson : boolean
Optionally, use Poisson regression for fitting.
If False (default), uses ordinary least squares (OLS) regression.
Returns
-------
sm_results : pair of statsmodels Results object
Results for first and second run.
"""
d_explained = data[explained]
d_explanatory = data[explanatory]
# for the first run, assume that all variables are significant
significant = explanatory
sm_results = []
for _ in range(2):
# restrict to variables with significant relationship
d_explanatory = d_explanatory[significant]
# add column for intercept
d_explanatory['const'] = 1.0
if poisson:
mod = smd.Poisson(d_explained, d_explanatory)
res = mod.fit(maxiter=100, disp=0, cov_type='HC1')
else:
mod = sm.OLS(d_explained, d_explanatory)
res = mod.fit(maxiter=100, disp=0, cov_type='HC1', use_t=True)
significant = fit_significant(res)
sm_results.append(res)
return sm_results
[docs]
def fit_significant(sm_results):
"""List significant variables in `sm_results`
Note: The last variable (usually intercept) is omitted!
"""
significant = []
cols = sm_results.params.index.tolist()
for i, pval in enumerate(sm_results.pvalues[:-1]):
if pval <= 0.1:
significant.append(cols[i])
return significant
[docs]
def fit_significance(sm_results):
"""Extract and visualize significance of model parameters"""
significance = ['***' if el <= 0.01 else \
'**' if el <= 0.05 else \
'*' if el <= 0.1 else \
'-' for el in sm_results.pvalues[:-1]]
significance = dict(zip(fit_significant(sm_results), significance))
return significance