calibration_lib.py

# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python2, python3
"""Library of calibration metrics."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import scipy.special
import six
from six.moves import range
from sklearn.metrics import confusion_matrix as sklearn_cm


# import tensorflow as tf


# import tensorflow.compat.v2 as tf


def bin_predictions_and_accuracies(probabilities, ground_truth, bins=10):
    """A helper function which histograms a vector of probabilities into bins.
  
    Args:
      probabilities: A numpy vector of N probabilities assigned to each prediction
      ground_truth: A numpy vector of N ground truth labels in {0,1}
      bins: Number of equal width bins to bin predictions into in [0, 1], or an
        array representing bin edges.
  
    Returns:
      bin_edges: Numpy vector of floats containing the edges of the bins
        (including leftmost and rightmost).
      accuracies: Numpy vector of floats for the average accuracy of the
        predictions in each bin.
      counts: Numpy vector of ints containing the number of examples per bin.
    """
    _validate_probabilities(probabilities)
    _check_rank_nonempty(rank=1,
                         probabilities=probabilities,
                         ground_truth=ground_truth)

    if len(probabilities) != len(ground_truth):
        raise ValueError(
            'Probabilies and ground truth must have the same number of elements.')

    if [v for v in ground_truth if v not in [0., 1., True, False]]:
        raise ValueError(
            'Ground truth must contain binary labels {0,1} or {False, True}.')

    if isinstance(bins, int):
        num_bins = bins
    else:
        num_bins = bins.size - 1

    # Ensure probabilities are never 0, since the bins in np.digitize are open on
    # one side.
    probabilities = np.where(probabilities == 0, 1e-8, probabilities)
    counts, bin_edges = np.histogram(probabilities, bins=bins, range=[0., 1.])
    indices = np.digitize(probabilities, bin_edges, right=True)
    accuracies = np.array([np.mean(ground_truth[indices == i])
                           for i in range(1, num_bins + 1)])
    return bin_edges, accuracies, counts


def bin_centers_of_mass(probabilities, bin_edges):
    probabilities = np.where(probabilities == 0, 1e-8, probabilities)
    indices = np.digitize(probabilities, bin_edges, right=True)
    return np.array([np.mean(probabilities[indices == i])
                     for i in range(1, len(bin_edges))])


def expected_calibration_error(probabilities, ground_truth, bins=15):
    """Compute the expected calibration error of a set of preditions in [0, 1].
  
    Args:
      probabilities: A numpy vector of N probabilities assigned to each prediction
      ground_truth: A numpy vector of N ground truth labels in {0,1, True, False}
      bins: Number of equal width bins to bin predictions into in [0, 1], or
        an array representing bin edges.
    Returns:
      Float: the expected calibration error.
    """
    bin_edges, accuracies, counts = bin_predictions_and_accuracies(
        probabilities, ground_truth, bins)
    bin_centers = bin_centers_of_mass(probabilities, bin_edges)
    num_examples = np.sum(counts)

    ece = np.sum([(counts[i] / float(num_examples)) * np.sum(
        np.abs(bin_centers[i] - accuracies[i]))
                  for i in range(bin_centers.size) if counts[i] > 0])
    return ece


def accuracy_top_k(probabilities, labels, top_k):
    """Computes the top-k accuracy of predictions.
  
    A prediction is considered correct if the ground-truth class is among the k
    classes with the highest predicted probabilities.
  
    Args:
      probabilities: Array of probabilities of shape [num_samples, num_classes].
      labels: Integer array labels of shape [num_samples].
      top_k: Integer. Number of highest-probability classes to consider.
    Returns:
      float: Top-k accuracy of predictions.
    """
    _, ground_truth = _filter_top_k(probabilities, labels, top_k)
    return ground_truth.any(axis=-1).mean()


def _filter_top_k(probabilities, labels, top_k):
    """Extract top k predicted probabilities and corresponding ground truths."""

    labels_one_hot = np.zeros(probabilities.shape)
    labels_one_hot[np.arange(probabilities.shape[0]), labels] = 1

    if top_k is None:
        return probabilities, labels_one_hot

    # Negate probabilities for easier use with argpartition (which sorts from
    # lowest)
    negative_prob = -1. * probabilities

    ind = np.argpartition(negative_prob, top_k - 1, axis=-1)
    top_k_ind = ind[:, :top_k]
    rows = np.expand_dims(np.arange(probabilities.shape[0]), axis=1)
    lowest_k_negative_probs = negative_prob[rows, top_k_ind]
    output_probs = -1. * lowest_k_negative_probs

    labels_one_hot_k = labels_one_hot[rows, top_k_ind]
    return output_probs, labels_one_hot_k


def get_multiclass_predictions_and_correctness(probabilities, labels, top_k=1):
    """Returns predicted class, correctness boolean vector."""
    _validate_probabilities(probabilities, multiclass=True)
    _check_rank_nonempty(rank=1, labels=labels)
    _check_rank_nonempty(rank=2, probabilities=probabilities)

    if top_k == 1:
        class_predictions = np.argmax(probabilities, -1)
        top_k_probs = probabilities[np.arange(len(labels)), class_predictions]
        is_correct = np.equal(class_predictions, labels)
    else:
        top_k_probs, is_correct = _filter_top_k(probabilities, labels, top_k)

    return top_k_probs, is_correct


def expected_calibration_error_multiclass(probabilities, labels, bins=15,
                                          top_k=1):
    """Computes expected calibration error from Guo et al. 2017.
  
    For details, see https://arxiv.org/abs/1706.04599.
    Note: If top_k is None, this only measures calibration of the argmax
      prediction.
  
    Args:
      probabilities: Array of probabilities of shape [num_samples, num_classes].
      labels: Integer array labels of shape [num_samples].
      bins: Number of equal width bins to bin predictions into in [0, 1], or
        an array representing bin edges.
      top_k: Integer or None. If integer, use the top k predicted
        probabilities in ECE calculation (can be informative for problems with
        many classes and lower top-1 accuracy). If None, use all classes.
    Returns:
      float: Expected calibration error.
    """
    top_k_probs, is_correct = get_multiclass_predictions_and_correctness(
        probabilities, labels, top_k)
    top_k_probs = top_k_probs.flatten()
    is_correct = is_correct.flatten()
    return expected_calibration_error(top_k_probs, is_correct, bins)


# TODO(yovadia): Write unit-tests.
def compute_accuracies_at_confidences(labels, probs, thresholds):
    """Compute accuracy of samples above each confidence threshold.
  
    Args:
      labels: Array of integer categorical labels.
      probs: Array of categorical probabilities.
      thresholds: Array of floating point probability thresholds in [0, 1).
    Returns:
      accuracies: Array of accuracies over examples with confidence > T for each T
          in thresholds.
      counts: Count of examples with confidence > T for each T in thresholds.
    """
    assert probs.shape[:-1] == labels.shape

    predict_class = probs.argmax(-1)
    predict_confidence = probs.max(-1)

    shape = (len(thresholds),) + probs.shape[:-2]
    accuracies = np.zeros(shape)
    counts = np.zeros(shape)

    eq = np.equal(predict_class, labels)
    for i, thresh in enumerate(thresholds):
        mask = predict_confidence >= thresh
        counts[i] = mask.sum(-1)
        accuracies[i] = np.ma.masked_array(eq, mask=~mask).mean(-1)
    return accuracies, counts


def brier_scores(labels, probs=None, logits=None):
    """Compute elementwise Brier score.
  
    Args:
      labels: Tensor of integer labels shape [N1, N2, ...]
      probs: Tensor of categorical probabilities of shape [N1, N2, ..., M].
      logits: If `probs` is None, class probabilities are computed as a softmax
        over these logits, otherwise, this argument is ignored.
    Returns:
      Tensor of shape [N1, N2, ...] consisting of Brier score contribution from
      each element. The full-dataset Brier score is an average of these values.

    I will let my labels shape to be [num_samples], and the probs shape should be 
    [num_samples, num_classes]
    """
    assert (probs is None) != (logits is None)
    if probs is None:
        probs = scipy.special.softmax(logits, axis=-1)
    nlabels = probs.shape[-1]
    flat_probs = probs.reshape([-1, nlabels])
    flat_labels = labels.reshape([len(flat_probs)])

    plabel = flat_probs[np.arange(len(flat_labels)), flat_labels]
    out = np.square(flat_probs).sum(axis=-1) - 2 * plabel
    return out.reshape(labels.shape)


# def brier_decompositions(labels, probs):
#     """Compute Brier decompositions for batches of datasets.

#     Args:
#       labels: Tensor of integer labels shape [S1, S2, ..., N]
#       probs: Tensor of categorical probabilities of shape [S1, S2, ..., N, M].
#     Returns:
#       Tensor of shape [S1, S2, ..., 3] consisting of 3-component Brier
#       decompositions for each series of probabilities and labels. The components
#       are ordered as <uncertainty, resolution, reliability>.
#     """
#     labels = tf.cast(labels, tf.int32)
#     probs = tf.cast(probs, tf.float32)
#     batch_shape = labels.shape[:-1]
#     flatten, unflatten = _make_flatten_unflatten_fns(batch_shape)
#     labels = flatten(labels)
#     probs = flatten(probs)
#     out = []
#     for labels_i, probs_i in zip(labels, probs):
#         out_i = brier_decomposition(labels_i, probabilities=probs_i)
#         out.append(tf.stack(out_i, axis=-1))
#     out = tf.stack(out)
#     return unflatten(out)


# def brier_decomposition(labels=None, logits=None, probabilities=None):
#     r"""Decompose the Brier score into uncertainty, resolution, and reliability.

#     [Proper scoring rules][1] measure the quality of probabilistic predictions;
#     any proper scoring rule admits a [unique decomposition][2] as
#     `Score = Uncertainty - Resolution + Reliability`, where:

#     * `Uncertainty`, is a generalized entropy of the average predictive
#       distribution; it can both be positive or negative.
#     * `Resolution`, is a generalized variance of individual predictive
#       distributions; it is always non-negative.  Difference in predictions reveal
#       information, that is why a larger resolution improves the predictive score.
#     * `Reliability`, a measure of calibration of predictions against the true
#       frequency of events.  It is always non-negative and a lower value here
#       indicates better calibration.

#     This method estimates the above decomposition for the case of the Brier
#     scoring rule for discrete outcomes.  For this, we need to discretize the space
#     of probability distributions; we choose a simple partition of the space into
#     `nlabels` events: given a distribution `p` over `nlabels` outcomes, the index
#     `k` for which `p_k > p_i` for all `i != k` determines the discretization
#     outcome; that is, `p in M_k`, where `M_k` is the set of all distributions for
#     which `p_k` is the largest value among all probabilities.

#     The estimation error of each component is O(k/n), where n is the number
#     of instances and k is the number of labels.  There may be an error of this
#     order when compared to `brier_score`.

#     #### References
#     [1]: Tilmann Gneiting, Adrian E. Raftery.
#          Strictly Proper Scoring Rules, Prediction, and Estimation.
#          Journal of the American Statistical Association, Vol. 102, 2007.
#          https://www.stat.washington.edu/raftery/Research/PDF/Gneiting2007jasa.pdf
#     [2]: Jochen Broecker.  Reliability, sufficiency, and the decomposition of
#          proper scores.
#          Quarterly Journal of the Royal Meteorological Society, Vol. 135, 2009.
#          https://rmets.onlinelibrary.wiley.com/doi/epdf/10.1002/qj.456

#     Args:
#       labels: Tensor, (n,), with tf.int32 or tf.int64 elements containing ground
#         truth class labels in the range [0,nlabels].
#       logits: Tensor, (n, nlabels), with logits for n instances and nlabels.
#       probabilities: Tensor, (n, nlabels), with predictive probability
#         distribution (alternative to logits argument).

#     Returns:
#       uncertainty: Tensor, scalar, the uncertainty component of the
#         decomposition.
#       resolution: Tensor, scalar, the resolution component of the decomposition.
#       reliability: Tensor, scalar, the reliability component of the
#         decomposition.
#     """
#     if (logits is None) == (probabilities is None):
#         raise ValueError(
#             'brier_decomposition expects exactly one of logits or probabilities.')
#     if probabilities is None:
#         probabilities = scipy.special.softmax(logits, axis=1)
#     _, nlabels = probabilities.shape  # Implicit rank check.

#     # Compute pbar, the average distribution
#     pred_class = tf.argmax(probabilities, axis=1, output_type=tf.int32)
#     confusion_matrix = tf.math.confusion_matrix(pred_class, labels, nlabels,
#                                                 dtype=tf.float32)
#     dist_weights = tf.reduce_sum(confusion_matrix, axis=1)
#     dist_weights /= tf.reduce_sum(dist_weights)
#     pbar = tf.reduce_sum(confusion_matrix, axis=0)
#     pbar /= tf.reduce_sum(pbar)

#     # dist_mean[k,:] contains the empirical distribution for the set M_k
#     # Some outcomes may not realize, corresponding to dist_weights[k] = 0
#     dist_mean = confusion_matrix / tf.expand_dims(
#         tf.reduce_sum(confusion_matrix, axis=1) + 1.0e-7, 1)

#     # Uncertainty: quadratic entropy of the average label distribution
#     uncertainty = -tf.reduce_sum(tf.square(pbar))

#     # Resolution: expected quadratic divergence of predictive to mean
#     resolution = tf.square(tf.expand_dims(pbar, 1) - dist_mean)
#     resolution = tf.reduce_sum(dist_weights * tf.reduce_sum(resolution, axis=1))

#     # Reliability: expected quadratic divergence of predictive to true
#     prob_true = tf.gather(dist_mean, pred_class, axis=0)
#     reliability = tf.reduce_sum(tf.square(prob_true - probabilities), axis=1)
#     reliability = tf.reduce_mean(reliability)

#     return uncertainty, resolution, reliability


def brier_decomp_npy(labels=None, logits=None, probabilities=None):
    if (logits is None) == (probabilities is None):
        raise ValueError("brier decomposition expects either logits or probabilities")
    if probabilities is None:
        probabilities = scipy.special.softmax(logits, axis=1)
    _, nlabels = probabilities.shape  # [num_samples, num_class]
    pred_class = np.argmax(probabilities, axis=1)  # predicted label
    confusion_matrix = sklearn_cm(y_pred=labels, y_true=pred_class,
                                  labels=np.arange(nlabels))
    dist_weights = np.sum(confusion_matrix, axis=1)
    dist_weights = dist_weights / np.sum(dist_weights)
    pbar = np.sum(confusion_matrix, axis=0)
    pbar = pbar / np.sum(pbar)

    dist_mean = confusion_matrix / np.expand_dims(np.sum(confusion_matrix, axis=1) + 1.0e-7, 1)
    uncertainty = -np.sum(pbar ** 2)

    resolution = (np.expand_dims(pbar, 1) - dist_mean) ** 2
    resolution = np.sum(dist_weights * np.sum(resolution, axis=1))

    prob_true = dist_mean[pred_class]
    reliability = np.sum((prob_true - probabilities) ** 2, axis=1)
    reliability = np.mean(reliability)

    return uncertainty, resolution, reliability


def nll(probs):
    """Returns the negative loglikehood NLL calibration score
    Args:
      probs: [num_samples, num_class]
    """
    pred_prob = np.max(probs, axis=-1)
    nll_loss = np.log(pred_prob + 1e-8)
    return -np.sum(nll_loss)


def soften_probabilities(probs, epsilon=1e-8):
    """Returns heavily weighted average of categorical distribution and uniform.
  
    Args:
      probs: Categorical probabilities of shape [num_samples, num_classes].
      epsilon: Small positive value for weighted average.
    Returns:
      epsilon * uniform + (1-epsilon) * probs
    """
    uniform = np.ones_like(probs) / probs.shape[-1]
    return epsilon * uniform + (1 - epsilon) * probs


def get_quantile_bins(num_bins, probs, top_k=1):
    """Find quantile bin edges.
  
    Args:
      num_bins: int, number of bins desired.
      probs: Categorical probabilities of shape [num_samples, num_classes].
      top_k: int, number of highest-predicted classes to consider in binning.
    Returns:
      Numpy vector, quantile bin edges.
    """
    edge_percentiles = np.linspace(0, 100, num_bins + 1)

    if len(probs.shape) == 1:
        probs = np.stack([probs, 1 - probs]).T

    if top_k == 1:
        max_probs = probs.max(-1)
    else:
        unused_labels = np.zeros(probs.shape[0]).astype(np.int32)
        max_probs, _ = _filter_top_k(probs, unused_labels, top_k)

    bins = np.percentile(max_probs, edge_percentiles)
    bins[0], bins[-1] = 0., 1.
    return bins


def _validate_probabilities(probabilities, multiclass=False):
    if np.max(probabilities) > 1. or np.min(probabilities) < 0.:
        raise ValueError('All probabilities must be in [0,1].')
    if multiclass and not np.allclose(1, np.sum(probabilities, axis=-1),
                                      atol=1e-5):
        raise ValueError(
            'Multiclass probabilities must sum to 1 along the last dimension.')


def _check_rank_nonempty(rank, **kwargs):
    for key, array in six.iteritems(kwargs):
        if len(array) <= 1 or array.ndim != rank:
            raise ValueError(
                '%s must be a rank-1 array of length > 1; actual shape is %s.' %
                (key, array.shape))

# def _make_flatten_unflatten_fns(batch_shape):
#     """Builds functions for flattening and unflattening batch dimensions."""
#     batch_shape = tuple(batch_shape)
#     batch_rank = len(batch_shape)
#     ndims = np.prod(batch_shape)

#     def flatten_fn(x):
#         x_shape = tuple(x.shape)
#         if x_shape[:batch_rank] != batch_shape:
#             raise ValueError('Expected batch-shape=%s; received array of shape=%s' %
#                              (batch_shape, x_shape))
#         flat_shape = (ndims,) + x_shape[batch_rank:]
#         return tf.reshape(x, flat_shape)

#     def unflatten_fn(x):
#         x_shape = tuple(x.shape)
#         if x_shape[0] != ndims:
#             raise ValueError('Expected batch-size=%d; received shape=%s' %
#                              (ndims, x_shape))
#         return tf.reshape(x, batch_shape + x_shape[1:])

#     return flatten_fn, unflatten_fn