# Source code for empulse.metrics.common

import numpy as np

from .._types import FloatArrayLike, FloatNDArray



# [docs]
def classification_threshold(y_true: FloatArrayLike, y_score: FloatArrayLike, customer_threshold: float) -> float:
    """
    Translate a customer threshold into the matching classification threshold.

    The customer threshold is the fraction of samples that should be targeted.
    This function looks up the score cut-off whose targeted fraction lies
    closest to that value.

    Parameters
    ----------
    y_true : 1D array-like, shape=(n_samples,)
        Binary target values ('positive': 1, 'negative': 0).

    y_score : 1D array-like, shape=(n_samples,)
        Target scores, can either be probability estimates or non-thresholded decision values.

    customer_threshold : float
        Customer threshold determined by value-driven metric.

    Returns
    -------
    threshold : float
        Score cut-off whose targeted fraction is nearest to ``customer_threshold``.
        When nobody is targeted the cut-off is the constant ``1``.

    Examples
    --------
    >>> from empulse.metrics import classification_threshold
    >>> from empulse.metrics import empc
    >>> y_true = [0, 1, 0, 1, 0, 1, 0, 1]
    >>> y_score = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
    >>> score, threshold = empc(y_true, y_score)
    >>> classification_threshold(y_true, y_score, threshold)
    0.2
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_score)
    counts, ranking, tied_positions = _compute_confusion_matrix(labels, scores)

    # Candidate cut-offs: a leading 1 stands for "target nobody", followed by
    # the scores from highest to lowest.
    candidate_cutoffs = np.insert(scores[ranking], 0, 1)
    # Remove the same positions the confusion matrix dropped for tied scores,
    # so cut-offs and confusion-matrix columns stay aligned.
    candidate_cutoffs = np.delete(candidate_cutoffs, tied_positions)  # type: ignore[arg-type]

    # TP + FP per column is the number of targeted samples at that cut-off.
    targeted_fractions = counts.sum(axis=0) / scores.shape[0]
    nearest = np.argmin(np.abs(targeted_fractions - customer_threshold))
    return float(candidate_cutoffs[nearest])



def _compute_confusion_matrix(
    y_true: FloatNDArray, y_pred: FloatNDArray
) -> tuple[FloatNDArray, FloatNDArray, FloatNDArray]:
    # sort true labels and predictions by highest to the lowest predicted score
    sorted_indices = y_pred.argsort()[::-1]
    sorted_labels = y_true[sorted_indices]
    sorted_predictions = y_pred[sorted_indices]

    # calculate the TP & FP at each new lead targeted
    true_positives = np.pad(np.cumsum(sorted_labels), pad_width=(1, 0))
    false_positives = np.pad(np.cumsum(sorted_labels == 0), pad_width=(1, 0))

    # merge consecutive equal prediction values
    duplicated_prediction_indices = np.where(np.diff(sorted_predictions) == 0)[0] + 1
    true_positives = np.delete(true_positives, duplicated_prediction_indices)
    false_positives = np.delete(false_positives, duplicated_prediction_indices)

    return np.array([true_positives, false_positives]), sorted_indices, duplicated_prediction_indices


def _compute_prior_class_probabilities(y_true: FloatNDArray) -> tuple[float, float]:
    """Calculate prior class probabilities from target values."""
    positive_class_prob = float(np.mean(y_true))  # pi_0
    negative_class_prob = 1 - positive_class_prob  # pi_1

    return positive_class_prob, negative_class_prob


def _compute_tpr_fpr_diffs(
    true_positive_rates: FloatNDArray, false_positive_rates: FloatNDArray
) -> tuple[FloatNDArray, FloatNDArray]:
    """Calculate differences between subsequent true positive rates and false positive rates."""
    tpr_diff = np.diff(true_positive_rates, axis=0)  # F_0(T_i) - F_0(T_{i-1})
    fpr_diff = np.diff(false_positive_rates, axis=0)  # F_1(T_i) - F_1(T_{i-1})

    return tpr_diff, fpr_diff


def _compute_profits(
    y_true: FloatNDArray, y_pred: FloatNDArray, cost_benefits: FloatNDArray
) -> tuple[FloatNDArray, FloatNDArray]:
    """
    Compute the per-sample profit and targeted fraction at every cut-off.

    The cumulative TP/FP counts from ``_compute_confusion_matrix`` are combined
    with ``cost_benefits`` via a dot product and divided by the number of
    samples. The targeted fraction at each cut-off is (TP + FP) over the number
    of samples.

    Returns
    -------
    profit_matrix : array
        ``confusion_matrix.T . cost_benefits / n_samples``.
    customer_thresholds : array
        Fraction of samples targeted at each cut-off.
    """
    sample_count = y_pred.shape[0]
    counts, *_ = _compute_confusion_matrix(y_true, y_pred)

    # One row of `counts.T` per cut-off: [TP, FP].
    profits = counts.T.dot(cost_benefits) / sample_count
    targeted_fractions = counts.sum(axis=0) / sample_count
    return profits, targeted_fractions