Source code for climada_petals.hazard.emulator.random
"""
This file is part of CLIMADA.
Copyright (C) 2017 ETH Zurich, CLIMADA contributors listed in AUTHORS.
CLIMADA is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free
Software Foundation, version 3.
CLIMADA is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with CLIMADA. If not, see <https://www.gnu.org/licenses/>.
---
Randomized sampling tools for the hazard event emulator.
"""
import logging
import numpy as np
LOGGER = logging.getLogger(__name__)
random_state = np.random.RandomState(123456789)
# random_state = np.random.default_rng(123456789)
[docs]
def estimate_drop(events, time_col, val_col, norm_period, norm_fact=None, norm_mean=None):
"""Determine fraction of outlying events to be dropped
If the mean intensity of events in the given time period `norm_period`
is far from the desired mean `norm_mean`, sampling from `events` will
usually yield draws whose mean is far from the desired mean, so that many
resamplings will be necessary in order to get an acceptable draw.
Dropping events off the desired mean before sampling can reduce the
necessary number of samplings.
This function estimates which portion of the events should be dropped.
Parameters
----------
events : DataFrame
Each row describes one event.
The dataset should contain at least the columns `time_col` and `val_col`.
time_col : str
Name of time column in `events`.
val_col : str
Name of value column in `events`.
norm_period : pair of timestamps (e.g. floats or ints)
Normalization period for which a specific mean intensity is expected.
norm_mean : float
Desired mean intensity of events in the given time period.
norm_fact : float
Instead of `norm_mean`, the ratio between desired and observed
intensity in the given time period can be given.
Returns
-------
drop : pair [expr, frac]
Only events satisfying the pandas query expression `expr` should be
eligible for dropping. `frac` specifies the fraction of these events
that are to be dropped.
"""
all_idx = events.index[(events[time_col] >= norm_period[0])
& (events[time_col] <= norm_period[1])]
all_mean = events.loc[all_idx, val_col].mean()
all_std = events.loc[all_idx, val_col].std()
if norm_mean is None:
assert norm_fact is not None
norm_mean = all_mean * norm_fact
else:
norm_fact = norm_mean / all_mean
drop_expr = f"{val_col} {'<' if norm_fact > 1 else '>'} {all_mean}"
drop_frac = 0.0
if 0.98 < norm_fact < 1.02:
return drop_expr, drop_frac
step_size = 0.5 * np.abs(norm_mean - all_mean) / all_std
drop_frac += step_size
diff = 0.1
sub_mean = 0
while drop_frac < 1.0 and np.abs(diff) > 0.025 and diff > 0:
drop_idx = events.query(drop_expr) \
.sample(frac=drop_frac, random_state=random_state) \
.index
events_sub = events.drop(drop_idx).reset_index(drop=True)
sub_idx = events_sub.index[(events_sub[time_col] >= norm_period[0])
& (events_sub[time_col] <= norm_period[1])]
sub_mean = events_sub.loc[sub_idx, val_col].mean()
diff = (norm_mean - sub_mean) / np.abs(norm_mean)
diff = diff if norm_fact > 1.0 else -diff
if np.abs(diff) > 0.025:
drop_frac += step_size
drop_frac = min(1.0, drop_frac)
LOGGER.info("Results of intensity normalization by subsampling:")
LOGGER.info("- drop %d%% of entries satisfying '%s'", int(100 * drop_frac), drop_expr)
LOGGER.info("- mean intensity of simulated events before dropping is %.4f", all_mean)
LOGGER.info("- mean intensity of simulated events after dropping is %.4f", sub_mean)
LOGGER.info("- mean intensity of observed events is %.4f", norm_mean)
return drop_expr, drop_frac
[docs]
def draw_poisson_events(poisson, events, val_col, val_accept, drop=None):
"""Draw poisson distributed events with acceptable value statistics
The size of the draw is poisson distributed. Redraws are made until the
draw mean is within the range specified by `val_accept`.
If `drop` is specified, a random choice of entries is dropped from `events`
before the actual drawing is done in order to speed up the process in case
of data sets where the acceptable mean is far from the input data mean.
Parameters
----------
poisson : float
Poisson parameter.
events : DataFrame
Each row describes one event.
The dataset should contain at least the column `val_col`.
val_col : str
Name of value column in `events`.
val_accept : pair of floats
Acceptable range of draw means.
drop : pair [expr, frac] or None
If given, only events satisfying the pandas query expression `expr` are dropped.
`frac` specifies the fraction of these events that is dropped.
Returns
-------
draw_idx : Series or None
Indices into `events`.
If no acceptable draw was among the first 10,000 attempts, the return value is None.
"""
fail_counts = 0
if drop is not None:
drop_query, drop_frac = drop
while True:
events_sub = events
if drop is not None:
drop_idx = events.query(drop_query) \
.sample(frac=drop_frac, random_state=random_state) \
.index
events_sub = events.drop(drop_idx)
draw_size = max(1, random_state.poisson(poisson, 1)[0])
draw_inds = random_state.choice(events_sub.shape[0], draw_size,
replace=(events_sub.shape[0] < 1.2 * draw_size))
draw_mean = events_sub[val_col].iloc[draw_inds].mean()
if val_accept[0] <= draw_mean <= val_accept[1]:
return events_sub.index[draw_inds]
fail_counts += 1
if fail_counts >= 10000:
return None