Source code for climada_petals.hazard.emulator.random

"""
This file is part of CLIMADA.

Copyright (C) 2017 ETH Zurich, CLIMADA contributors listed in AUTHORS.

CLIMADA is free software: you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free
Software Foundation, version 3.

CLIMADA is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with CLIMADA. If not, see <https://www.gnu.org/licenses/>.

---

Randomized sampling tools for the hazard event emulator.
"""

import logging
import numpy as np

LOGGER = logging.getLogger(__name__)

random_state = np.random.RandomState(123456789)
# random_state = np.random.default_rng(123456789)


[docs]
def estimate_drop(events, time_col, val_col, norm_period, norm_fact=None, norm_mean=None):
    """Determine fraction of outlying events to be dropped

    If the mean intensity of events in the given time period `norm_period`
    is far from the desired mean `norm_mean`, sampling from `events` will
    usually yield draws whose mean is far from the desired mean, so that many
    resamplings will be necessary in order to get an acceptable draw.

    Dropping events off the desired mean before sampling can reduce the
    necessary number of samplings.

    This function estimates which portion of the events should be dropped.

    Parameters
    ----------
    events : DataFrame
        Each row describes one event.
        The dataset should contain at least the columns `time_col` and `val_col`.
    time_col : str
        Name of time column in `events`.
    val_col : str
        Name of value column in `events`.
    norm_period : pair of timestamps (e.g. floats or ints)
        Normalization period for which a specific mean intensity is expected.
    norm_mean : float
        Desired mean intensity of events in the given time period.
    norm_fact : float
        Instead of `norm_mean`, the ratio between desired and observed
        intensity in the given time period can be given.

    Returns
    -------
    drop : pair [expr, frac]
        Only events satisfying the pandas query expression `expr` should be
        eligible for dropping. `frac` specifies the fraction of these events
        that are to be dropped.
    """
    all_idx = events.index[(events[time_col] >= norm_period[0])
                           & (events[time_col] <= norm_period[1])]
    all_mean = events.loc[all_idx, val_col].mean()
    all_std = events.loc[all_idx, val_col].std()

    if norm_mean is None:
        assert norm_fact is not None
        norm_mean = all_mean * norm_fact
    else:
        norm_fact = norm_mean / all_mean

    drop_expr = f"{val_col} {'<' if norm_fact > 1 else '>'} {all_mean}"
    drop_frac = 0.0
    if 0.98 < norm_fact < 1.02:
        return drop_expr, drop_frac

    step_size = 0.5 * np.abs(norm_mean - all_mean) / all_std
    drop_frac += step_size
    diff = 0.1
    sub_mean = 0
    while drop_frac < 1.0 and np.abs(diff) > 0.025 and diff > 0:
        drop_idx = events.query(drop_expr) \
                         .sample(frac=drop_frac, random_state=random_state) \
                         .index
        events_sub = events.drop(drop_idx).reset_index(drop=True)
        sub_idx = events_sub.index[(events_sub[time_col] >= norm_period[0])
                                   & (events_sub[time_col] <= norm_period[1])]
        sub_mean = events_sub.loc[sub_idx, val_col].mean()

        diff = (norm_mean - sub_mean) / np.abs(norm_mean)
        diff = diff if norm_fact > 1.0 else -diff
        if np.abs(diff) > 0.025:
            drop_frac += step_size
    drop_frac = min(1.0, drop_frac)
    LOGGER.info("Results of intensity normalization by subsampling:")
    LOGGER.info("- drop %d%% of entries satisfying '%s'", int(100 * drop_frac), drop_expr)
    LOGGER.info("- mean intensity of simulated events before dropping is %.4f", all_mean)
    LOGGER.info("- mean intensity of simulated events after dropping is %.4f", sub_mean)
    LOGGER.info("- mean intensity of observed events is %.4f", norm_mean)

    return drop_expr, drop_frac




[docs]
def draw_poisson_events(poisson, events, val_col, val_accept, drop=None):
    """Draw poisson distributed events with acceptable value statistics

    The size of the draw is poisson distributed. Redraws are made until the
    draw mean is within the range specified by `val_accept`.

    If `drop` is specified, a random choice of entries is dropped from `events`
    before the actual drawing is done in order to speed up the process in case
    of data sets where the acceptable mean is far from the input data mean.

    Parameters
    ----------
    poisson : float
        Poisson parameter.
    events : DataFrame
        Each row describes one event.
        The dataset should contain at least the column `val_col`.
    val_col : str
        Name of value column in `events`.
    val_accept : pair of floats
        Acceptable range of draw means.
    drop : pair [expr, frac] or None
        If given, only events satisfying the pandas query expression `expr` are dropped.
        `frac` specifies the fraction of these events that is dropped.

    Returns
    -------
    draw_idx : Series or None
        Indices into `events`.
        If no acceptable draw was among the first 10,000 attempts, the return value is None.
    """
    fail_counts = 0
    if drop is not None:
        drop_query, drop_frac = drop
    while True:
        events_sub = events
        if drop is not None:
            drop_idx = events.query(drop_query) \
                             .sample(frac=drop_frac, random_state=random_state) \
                             .index
            events_sub = events.drop(drop_idx)

        draw_size = max(1, random_state.poisson(poisson, 1)[0])
        draw_inds = random_state.choice(events_sub.shape[0], draw_size,
                                        replace=(events_sub.shape[0] < 1.2 * draw_size))
        draw_mean = events_sub[val_col].iloc[draw_inds].mean()


        if val_accept[0] <= draw_mean <= val_accept[1]:
            return events_sub.index[draw_inds]

        fail_counts += 1
        if fail_counts >= 10000:
            return None