# Source code for empulse.metrics.churn.stochastic

import warnings

import numpy as np
from scipy import stats as st

from ..._types import FloatArrayLike, FloatNDArray
from .._cy_convex_hull import convex_hull
from ..common import _compute_prior_class_probabilities, _compute_tpr_fpr_diffs
from ._validation import _validate_input_emp, _validate_input_empb


def empc_score(
    y_true: FloatArrayLike,
    y_score: FloatArrayLike,
    *,
    alpha: float = 6,
    beta: float = 14,
    clv: float | FloatArrayLike = 200,
    incentive_cost: float = 10,
    contact_cost: float = 1,
    check_input: bool = True,
) -> float:
    """
    :func:`~empulse.metrics.empc()` but only returning the EMPC score.

    EMPC presumes a situation where identified churners are contacted and offered an incentive to remain customers.
    Only a fraction of churners accepts the incentive offer,
    this fraction is described by a :math:`Beta(\\alpha, \\beta)` distribution.
    As opposed to :func:`~empulse.metrics.empb`, the incentive cost is a fixed value,
    rather than a fraction of the customer lifetime value.
    For detailed information, consult the paper [1]_.

    .. seealso::

        :func:`~empulse.metrics.empc` : to also return the fraction of the customer base
        that should be targeted to maximize profit.

        :func:`~empulse.metrics.mpc_score` : for a deterministic version of this metric.

        :func:`~empulse.metrics.empb_score` : for a similar metric, but with a variable incentive cost.

    Parameters
    ----------
    y_true : 1D array-like, shape=(n_samples,)
        Binary target values ('churn': 1, 'no churn': 0).

    y_score : 1D array-like, shape=(n_samples,)
        Target scores, can either be probability estimates or non-thresholded decision values.

    alpha : float, default=6
        Shape parameter of the beta distribution of the probability that a churner accepts the incentive
        (``alpha > 1``).

    beta : float, default=14
        Shape parameter of the beta distribution of the probability that a churner accepts the incentive
        (``beta > 1``).

    clv : float or 1D array-like, shape=(n_samples), default=200
        If ``float``: average customer lifetime value of retained customers (``clv > incentive_cost``).
        If ``array``: customer lifetime value of each customer when retained (``mean(clv) > incentive_cost``).

        .. note::
            Passing a CLV array is equivalent to passing a float with the average CLV of that array.

    incentive_cost : float, default=10
        Cost of incentive offered to a customer (``incentive_cost > 0``).

    contact_cost : float, default=1
        Cost of contacting a customer (``contact_cost > 0``).

    check_input : bool, default=True
        Perform input validation.
        Turning off improves performance, useful when using this metric as a loss function.

    Returns
    -------
    empc : float
        Expected Maximum Profit Measure for Customer Churn.

    Notes
    -----
    The EMPC is defined as [1]_:

    .. math::

        \\int_\\gamma CLV (\\gamma (1 - \\delta) - \\phi) \\pi_0 F_0(T)
        - CLV (\\delta + \\phi) \\pi_1 F_1(T) d\\gamma

    The EMPC requires that the churn class is encoded as 0, and it is NOT interchangeable (see [3]_ p37).
    However, this implementation assumes the standard notation ('churn': 1, 'no churn': 0).

    An equivalent R implementation is available in [2]_.

    References
    ----------
    .. [1] Verbraken, T., Verbeke, W. and Baesens, B. (2013).
        A Novel Profit Maximizing Metric for Measuring Classification
        Performance of Customer Churn Prediction Models. IEEE Transactions on
        Knowledge and Data Engineering, 25(5), 961-973. Available Online:
        http://ieeexplore.ieee.org/iel5/69/6486492/06165289.pdf?arnumber=6165289

    .. [2] Bravo, C. and Vanden Broucke, S. and Verbraken, T. (2019).
        EMP: Expected Maximum Profit Classification Performance Measure.
        R package version 2.0.5. Available Online:
        http://cran.r-project.org/web/packages/EMP/index.html

    .. [3] Verbraken, T. (2013). Business-Oriented Data Analytics:
        Theory and Case Studies. Ph.D. dissertation, Dept. LIRIS, KU Leuven,
        Leuven, Belgium, 2013.

    Examples
    --------
    >>> from empulse.metrics import empc_score
    >>>
    >>> y_true = [0, 1, 0, 1, 0, 1, 0, 1]
    >>> y_score = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
    >>> empc_score(y_true, y_score)
    23.875593418348124

    Using scorer:

    >>> import numpy as np
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.model_selection import cross_val_score, StratifiedKFold
    >>> from sklearn.metrics import make_scorer
    >>> from empulse.metrics import empc_score
    >>>
    >>> X, y = make_classification(random_state=42)
    >>> model = LogisticRegression()
    >>> cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    >>> scorer = make_scorer(
    ...     empc_score,
    ...     response_method='predict_proba',
    ...     clv=300,
    ...     incentive_cost=15,
    ... )
    >>> np.mean(cross_val_score(model, X, y, cv=cv, scoring=scorer))
    42.09000050753503
    """
    # empc() yields (score, targeted fraction); only the score is exposed here.
    score, _targeted_fraction = empc(
        y_true,
        y_score,
        alpha=alpha,
        beta=beta,
        clv=clv,
        incentive_cost=incentive_cost,
        contact_cost=contact_cost,
        check_input=check_input,
    )
    return score


def empc(
    y_true: FloatArrayLike,
    y_score: FloatArrayLike,
    *,
    alpha: float = 6,
    beta: float = 14,
    clv: float | FloatArrayLike = 200,
    incentive_cost: float = 10,
    contact_cost: float = 1,
    check_input: bool = True,
) -> tuple[float, float]:
    """
    Expected Maximum Profit Measure for Customer Churn (EMPC).

    EMPC presumes a situation where identified churners are contacted and offered an incentive to remain customers.
    Only a fraction of churners accepts the incentive offer,
    this fraction is described by a :math:`Beta(\\alpha, \\beta)` distribution.
    As opposed to :func:`~empulse.metrics.empb`, the incentive cost is a fixed value,
    rather than a fraction of the customer lifetime value.
    For detailed information, consult the paper [1]_.

    .. seealso::

        :func:`~empulse.metrics.empc_score` : to only return the EMPC score.

        :func:`~empulse.metrics.mpc` : for a deterministic version of this metric.

        :func:`~empulse.metrics.empb` : for a similar metric, but with a variable incentive cost.

    Parameters
    ----------
    y_true : 1D array-like, shape=(n_samples,)
        Binary target values ('churn': 1, 'no churn': 0).

    y_score : 1D array-like, shape=(n_samples,)
        Target scores, can either be probability estimates or non-thresholded decision values.

    alpha : float, default=6
        Shape parameter of the beta distribution of the probability that a churner accepts the incentive
        (``alpha > 1``).

    beta : float, default=14
        Shape parameter of the beta distribution of the probability that a churner accepts the incentive
        (``beta > 1``).

    clv : float or 1D array-like, shape=(n_samples), default=200
        If ``float``: average customer lifetime value of retained customers (``clv > incentive_cost``).
        If ``array``: customer lifetime value of each customer when retained (``mean(clv) > incentive_cost``).

        .. note::
            Passing a CLV array is equivalent to passing a float with the average CLV of that array.

    incentive_cost : float, default=10
        Cost of incentive offered to a customer (``incentive_cost > 0``).

    contact_cost : float, default=1
        Cost of contacting a customer (``contact_cost > 0``).

    check_input : bool, default=True
        Perform input validation.
        Turning off improves performance, useful when using this metric as a loss function.

    Returns
    -------
    empc : float
        Expected Maximum Profit Measure for Customer Churn

    threshold : float
        Fraction of the customer base that should be targeted to maximize profit

    Notes
    -----
    The EMPC is defined as [1]_:

    .. math::

        \\int_\\gamma CLV (\\gamma (1 - \\delta) - \\phi) \\pi_0 F_0(T)
        - CLV (\\delta + \\phi) \\pi_1 F_1(T) d\\gamma

    The EMPC requires that the churn class is encoded as 0, and it is NOT interchangeable (see [3]_ p37).
    However, this implementation assumes the standard notation ('churn': 1, 'no churn': 0).

    An equivalent R implementation is available in [2]_.

    References
    ----------
    .. [1] Verbraken, T., Verbeke, W. and Baesens, B. (2013).
        A Novel Profit Maximizing Metric for Measuring Classification
        Performance of Customer Churn Prediction Models. IEEE Transactions on
        Knowledge and Data Engineering, 25(5), 961-973. Available Online:
        http://ieeexplore.ieee.org/iel5/69/6486492/06165289.pdf?arnumber=6165289

    .. [2] Bravo, C. and Vanden Broucke, S. and Verbraken, T. (2019).
        EMP: Expected Maximum Profit Classification Performance Measure.
        R package version 2.0.5. Available Online:
        http://cran.r-project.org/web/packages/EMP/index.html

    .. [3] Verbraken, T. (2013). Business-Oriented Data Analytics:
        Theory and Case Studies. Ph.D. dissertation, Dept. LIRIS, KU Leuven,
        Leuven, Belgium, 2013.

    Examples
    --------
    >>> from empulse.metrics import empc
    >>>
    >>> y_true = [0, 1, 0, 1, 0, 1, 0, 1]
    >>> y_score = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
    >>> empc(y_true, y_score)
    (23.875593418348124, 0.8743700763487141)
    """  # noqa: D401
    if check_input:
        y_true, y_score, clv = _validate_input_emp(y_true, y_score, alpha, beta, clv, incentive_cost, contact_cost)
    else:
        # Without validation only the array conversion and dtype casts are applied.
        y_true = np.asarray(y_true).astype(np.int32)
        y_score = np.asarray(y_score).astype(np.float64)
        clv = np.asarray(clv)

    # A CLV array is collapsed to its mean: the metric works with the average CLV.
    if isinstance(clv, np.ndarray):
        clv = float(np.mean(clv))

    # Express both costs as fractions of the (average) customer lifetime value.
    delta = incentive_cost / clv
    phi = contact_cost / clv

    pos_prior, neg_prior = _compute_prior_class_probabilities(y_true)
    hull_tpr, hull_fpr = convex_hull(y_true, y_score)
    d_tpr, d_fpr = _compute_tpr_fpr_diffs(hull_tpr, hull_fpr)

    tpr_coef = phi * pos_prior
    fpr_coef = (delta + phi) * neg_prior

    # Beta probability mass inside each pair of consecutive gamma bounds.
    bounds = _compute_gamma_bounds(tpr_coef, fpr_coef, delta, d_tpr, d_fpr, pos_prior)
    mass = np.diff(st.beta.cdf(bounds, a=alpha, b=beta))
    mass_alpha_plus_1 = np.diff(st.beta.cdf(bounds, a=alpha + 1, b=beta))

    # Bounds >= 1 were discarded, so drop the trailing hull points that no longer have an interval.
    surplus = len(hull_tpr) - len(mass)
    if surplus > 0:
        hull_tpr = hull_tpr[:-surplus]
        hull_fpr = hull_fpr[:-surplus]

    # The gamma-weighted benefit uses mean(gamma) together with the Beta(alpha + 1, beta) mass.
    mean_gamma = st.beta.mean(a=alpha, b=beta)
    gamma_benefit = mean_gamma * (clv * (1 - delta) * pos_prior * hull_tpr)
    fixed_cost = clv * (tpr_coef * hull_tpr + fpr_coef * hull_fpr)
    expected_profit = (gamma_benefit * mass_alpha_plus_1 - fixed_cost * mass).sum()

    targeted_fraction = (mass * (pos_prior * hull_tpr + neg_prior * hull_fpr)).sum()
    return expected_profit, targeted_fraction


def _compute_gamma_bounds(
    tpr_coef: float,
    fpr_coef: float,
    delta: float,
    tpr_diff: FloatNDArray,
    fpr_diff: FloatNDArray,
    positive_class_prob: float,
) -> FloatNDArray:
    """
    Compute the gamma bounds of the integral.

    Each candidate bound is the element-wise ratio
    ``(fpr_coef * fpr_diff + tpr_coef * tpr_diff) / (positive_class_prob * (1 - delta) * tpr_diff)``.
    Only candidates strictly below 1 are kept; the result always starts with 0 and ends with 1.

    Parameters
    ----------
    tpr_coef : float
        Coefficient applied to ``tpr_diff`` in the numerator.
    fpr_coef : float
        Coefficient applied to ``fpr_diff`` in the numerator.
    delta : float
        Incentive cost as a fraction of the customer lifetime value.
    tpr_diff : ndarray
        Differences between consecutive true positive rates.
    fpr_diff : ndarray
        Differences between consecutive false positive rates.
    positive_class_prob : float
        Prior probability of the positive class.

    Returns
    -------
    gamma_bounds : ndarray
        ``[0, <ratios below 1, in their original order>, 1]``.
    """
    numerator = fpr_coef * fpr_diff + tpr_coef * tpr_diff
    denominator = positive_class_prob * (1 - delta) * tpr_diff

    # A zero tpr_diff yields inf (x / 0) or nan (0 / 0). Both fail the ``< 1`` test below and are
    # dropped, so the floating-point flags are silenced locally. np.errstate is used instead of
    # warnings.catch_warnings because it leaves the process-wide warning filters untouched and
    # also holds when the caller runs NumPy with errors set to 'raise'.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratios = numerator / denominator

    return np.concatenate(([0.0], ratios[ratios < 1], [1.0]))


def empb_score(
    y_true: FloatArrayLike,
    y_score: FloatArrayLike,
    *,
    clv: FloatArrayLike,
    alpha: float = 6,
    beta: float = 14,
    incentive_fraction: float = 0.05,
    contact_cost: float = 15,
    check_input: bool = True,
) -> float:
    """
    :func:`~empulse.metrics.empb()` but only returning the EMPB score.

    EMPB presumes a situation where identified churners are contacted and offered an incentive to remain customers.
    Only a fraction of churners accepts the incentive offer,
    this fraction is described by a :math:`Beta(\\alpha, \\beta)` distribution.
    As opposed to :func:`~empulse.metrics.empc`, the incentive cost is a fraction of the customer lifetime value,
    rather than a fixed value.
    For detailed information, consult the paper [1]_.

    .. seealso::

        :func:`~empulse.metrics.empb` : to also return the fraction of the customer base
        that should be targeted to maximize profit.

        :func:`~empulse.metrics.empc_score` : for a similar metric, but with a fixed incentive cost.

    Parameters
    ----------
    y_true : 1D array-like, shape=(n_samples,)
        Binary target values ('churn': 1, 'no churn': 0).

    y_score : 1D array-like, shape=(n_samples,)
        Target scores, can either be probability estimates or non-thresholded decision values.

    clv : float or 1D array-like, shape=(n_samples)
        If ``float``: average customer lifetime value of retained customers.
        If ``array``: customer lifetime value of each customer when retained.

    alpha : float, default=6
        Shape parameter of the beta distribution of the probability that a churner accepts the incentive
        (``alpha > 1``).

    beta : float, default=14
        Shape parameter of the beta distribution of the probability that a churner accepts the incentive
        (``beta > 1``).

    incentive_fraction : float, default=0.05
        Cost of incentive offered to a customer, as a fraction of customer lifetime value
        (``0 < incentive_fraction < 1``).

    contact_cost : float, default=15
        Cost of contacting a customer (``contact_cost > 0``).

    check_input : bool, default=True
        Perform input validation.
        Turning off improves performance, useful when using this metric as a loss function.

    Returns
    -------
    empb : float
        Expected Maximum Profit Measure for B2B Customer Churn

    References
    ----------
    .. [1] Janssens, B., Bogaert, M., Bagué, A., & Van den Poel, D. (2022).
        B2Boost: Instance-dependent profit-driven modelling of B2B churn.
        Annals of Operations Research, 1-27.
    """
    # empb() yields (maximum profit, targeted fraction); only the profit is exposed here.
    max_profit, _targeted_fraction = empb(
        y_true,
        y_score,
        alpha=alpha,
        beta=beta,
        clv=clv,
        contact_cost=contact_cost,
        incentive_fraction=incentive_fraction,
        check_input=check_input,
    )
    return max_profit


def empb(
    y_true: FloatArrayLike,
    y_score: FloatArrayLike,
    *,
    clv: FloatArrayLike,
    alpha: float = 6,
    beta: float = 14,
    incentive_fraction: float = 0.05,
    contact_cost: float = 15,
    check_input: bool = True,
) -> tuple[float, float]:
    """
    Expected Maximum Profit Measure for B2B Customer Churn (EMPB).

    EMPB presumes a situation where identified churners are contacted and offered an incentive to remain customers.
    Only a fraction of churners accepts the incentive offer,
    this fraction is described by a :math:`Beta(\\alpha, \\beta)` distribution.
    As opposed to :func:`~empulse.metrics.empc`, the incentive cost is a fraction of the customer lifetime value,
    rather than a fixed value.
    For detailed information, consult the paper [1]_.

    Customers are ranked by descending ``y_score``. For every prefix of that ranking the cumulative
    expected profit is computed, using the mean acceptance rate ``alpha / (alpha + beta)``.
    The best prefix (possibly the empty one, with profit 0) determines the returned values.

    .. seealso::

        :func:`~empulse.metrics.empb_score` : to only return the EMPB score.

        :func:`~empulse.metrics.empc` : for a similar metric, but with a fixed incentive cost.

    Parameters
    ----------
    y_true : 1D array-like, shape=(n_samples,)
        Binary target values ('churn': 1, 'no churn': 0).

    y_score : 1D array-like, shape=(n_samples,)
        Target scores, can either be probability estimates or non-thresholded decision values.

    clv : float or 1D array-like, shape=(n_samples)
        If ``float``: average customer lifetime value of retained customers.
        If ``array``: customer lifetime value of each customer when retained.
        For an array with more than one dimension only the first column is used.

    alpha : float, default=6
        Shape parameter of the beta distribution of the probability that a churner accepts the incentive
        (``alpha > 1``).

    beta : float, default=14
        Shape parameter of the beta distribution of the probability that a churner accepts the incentive
        (``beta > 1``).

    incentive_fraction : float, default=0.05
        Cost of incentive offered to a customer, as a fraction of customer lifetime value
        (``0 < incentive_fraction < 1``).

    contact_cost : float, default=15
        Cost of contacting a customer (``contact_cost > 0``).

    check_input : bool, default=True
        Perform input validation.
        Turning off improves performance, useful when using this metric as a loss function.

    Returns
    -------
    empb : float
        Expected Maximum Profit Measure for B2B Customer Churn

    threshold : float
        Fraction of the customer base that should be targeted to maximize profit.
        ``0.0`` when the input is empty.

    References
    ----------
    .. [1] Janssens, B., Bogaert, M., Bagué, A., & Van den Poel, D. (2022).
        B2Boost: Instance-dependent profit-driven modelling of B2B churn.
        Annals of Operations Research, 1-27.
    """  # noqa: D401
    if check_input:
        y_true, y_score, clv = _validate_input_empb(y_true, y_score, clv, alpha, beta, incentive_fraction, contact_cost)
    else:
        y_true = np.asarray(y_true)
        y_score = np.asarray(y_score)

    # Bring clv to one value per sample: a scalar is repeated for every customer, and for
    # multi-dimensional input the first column is taken (same rule as auepc_score). Without this,
    # a scalar fails on indexing and an (n, 1) array broadcasts against y_true to an (n, n) matrix.
    clv = np.asarray(clv)
    if clv.ndim == 0:
        clv = np.broadcast_to(clv, y_true.shape)
    elif clv.ndim > 1:
        clv = clv[:, 0]

    n_samples = len(y_score)
    if n_samples == 0:
        # Nobody to contact: zero profit and nothing to target (avoids a 0 / 0 threshold).
        return 0.0, 0.0

    # Mean of the Beta(alpha, beta) acceptance-rate distribution.
    gamma = alpha / (alpha + beta)

    # Sort by predicted probabilities
    sorted_indices = np.argsort(y_score)[::-1]
    sorted_y_true = y_true[sorted_indices]
    sorted_clv = clv[sorted_indices]

    # Calculate cumulative sums for benefits and costs
    cumulative_benefits = np.cumsum(gamma * ((1 - incentive_fraction) * sorted_clv - contact_cost) * sorted_y_true)
    cumulative_costs = np.cumsum((-contact_cost - incentive_fraction * sorted_clv) * (1 - sorted_y_true))
    cumulative_profits = cumulative_benefits + cumulative_costs

    # Add a zero at the beginning to indicate not contacting anyone
    cumulative_profits = np.insert(cumulative_profits, 0, 0)

    # Find the maximum profit and corresponding threshold
    max_profit_index = np.argmax(cumulative_profits)
    max_profit = cumulative_profits[max_profit_index]
    threshold = max_profit_index / n_samples

    return float(max_profit), float(threshold)


def auepc_score(
    y_true: FloatArrayLike,
    y_score: FloatArrayLike,
    *,
    clv: FloatArrayLike,
    alpha: float = 6,
    beta: float = 14,
    incentive_fraction: float = 0.05,
    contact_cost: float = 15,
    normalize: bool = True,
    check_input: bool = True,
) -> float:
    """
    Area Under the Expected Profit Curve (AUEPC).

    Calculate the area under the ratio of the expected profit of the model and the perfect model.
    The expected profit is based on the EMPB's definition of profit.

    AUEPC presumes a situation where identified churners are contacted and offered an incentive to remain customers.
    Only a fraction of churners accepts the incentive offer,
    this fraction is described by a :math:`Beta(\\alpha, \\beta)` distribution.
    For detailed information, consult the paper [1]_.

    .. seealso::

        :func:`~empulse.metrics.empb` : to return the maximum profit and threshold.

    Parameters
    ----------
    y_true : 1D array-like, shape=(n_samples,)
        Binary target values ('churn': 1, 'no churn': 0).

    y_score : 1D array-like, shape=(n_samples,)
        Target scores, can either be probability estimates or non-thresholded decision values.

    clv : float or 1D array-like, shape=(n_samples)
        If ``float``: average customer lifetime value of retained customers.
        If ``array``: customer lifetime value of each customer when retained.
        For an array with more than one dimension only the first column is used.

    alpha : float, default=6
        Shape parameter of the beta distribution of the probability that a churner accepts the incentive
        (``alpha > 1``).

    beta : float, default=14
        Shape parameter of the beta distribution of the probability that a churner accepts the incentive
        (``beta > 1``).

    incentive_fraction : float, default=0.05
        Cost of incentive offered to a customer, as a fraction of customer lifetime value
        (``0 < incentive_fraction < 1``).

    contact_cost : float, default=15
        Cost of contacting a customer (``contact_cost > 0``).

    normalize : bool, default=True
        Whether to normalize the AUEPC score.
        If True, the score is 1 when the model is perfect.
        This parameter is only useful if a part of the expected profit curve is negative.

    check_input : bool, default=True
        Perform input validation.
        Turning off improves performance, useful when using this metric as a loss function.

    Returns
    -------
    auepc : float
        Area under the curve of the model's cumulative expected profit divided by the perfect model's
        cumulative expected profit. The curve is cut off at the first point where the perfect model's
        cumulative profit is negative. ``0.0`` is returned when fewer than two points remain,
        since no area can be computed.

    References
    ----------
    .. [1] Rahman, S., Janssens, B., Bogaert, M. (2025).
        Profit-Driven Pre-Processing in B2B Customer Churn Modeling using Fairness Techniques.
        Journal of Business Research.
    """
    if check_input:
        y_true, y_score, clv = _validate_input_empb(y_true, y_score, clv, alpha, beta, incentive_fraction, contact_cost)
    else:
        y_true = np.asarray(y_true)
        y_score = np.asarray(y_score)

    # Bring clv to one value per sample: repeat a scalar, take the first column of 2D input.
    clv = np.asarray(clv)
    if clv.ndim == 0:
        clv = np.broadcast_to(clv, y_true.shape)
    elif clv.ndim > 1:
        clv = clv[:, 0]

    # Mean of the Beta(alpha, beta) acceptance-rate distribution.
    accept_rate = alpha / (alpha + beta)

    def _cumulative_profits(order):  # type: ignore[no-untyped-def]
        """Cumulative expected profit when customers are contacted in the given order."""
        targets = y_true[order]
        clv_targets = clv[order]
        benefits = np.cumsum(accept_rate * ((1 - incentive_fraction) * clv_targets - contact_cost) * targets)
        costs = np.cumsum((-contact_cost - incentive_fraction * clv_targets) * (1 - targets))
        return benefits + costs

    # Calculate the expected profit vector for the perfect model:
    # churners first by descending CLV, then non-churners by ascending CLV.
    perfect_pred_indices = np.argsort(np.where(y_true == 1, 1, -1) * clv)[::-1]
    perfect_profits = _cumulative_profits(perfect_pred_indices)

    # Calculate the expected profit vector for the evaluated model
    profits = _cumulative_profits(y_score.argsort()[::-1])

    # Stop at the point where perfect profits become negative
    is_negative = perfect_profits < 0
    stop_index = int(np.argmax(is_negative)) if is_negative.any() else len(perfect_profits)

    # The trapezoidal rule needs at least two points. With fewer, the area is zero and the
    # normalisation factor (stop_index - 1) / n would be zero or negative.
    if stop_index < 2:
        return 0.0

    # np.trapezoid was introduced in NumPy 2.0; older releases only provide np.trapz.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz  # type: ignore[attr-defined]

    # Calculate the AUEPC
    n_samples = len(profits)
    score = float(trapezoid(profits[:stop_index] / perfect_profits[:stop_index], dx=1 / n_samples))
    if normalize:
        # A perfect model has ratio 1 everywhere, i.e. an area of (stop_index - 1) / n_samples.
        score /= (stop_index - 1) / n_samples
    return score