import warnings
import numpy as np
import scipy.stats as st
from ..._types import FloatArrayLike, FloatNDArray
from .._cy_convex_hull import convex_hull
from ..common import _compute_prior_class_probabilities, _compute_tpr_fpr_diffs
from ._validation import _validate_input_stochastic
[docs]
def empa_score(
y_true: FloatArrayLike,
y_score: FloatArrayLike,
*,
alpha: float = 12,
beta: float = 0.0015,
contact_cost: float = 50,
sales_cost: float = 500,
direct_selling: float = 1,
commission: float = 0.1,
check_input: bool = True,
) -> float:
"""
:func:`~empulse.metrics.empa()` but only returning the EMPA score.
EMPA presumes a situation where leads are targeted either directly or indirectly.
Directly targeted leads are contacted and handled by the internal sales team.
Indirectly targeted leads are contacted and then referred to intermediaries,
which receive a commission.
The contribution of a successful acquisition is modeled as a :math:`Gamma(\\alpha, \\beta)` distribution.
.. seealso::
:func:`~empulse.metrics.empa` : to also return the fraction of the leads
that should be targeted to maximize profit.
:func:`~empulse.metrics.mpa_score` : for a deterministic version of this metric.
Parameters
----------
y_true : 1D array-like, shape=(n_samples,)
Binary target values ('acquisition': 1, 'no acquisition': 0).
y_score : 1D array-like, shape=(n_samples,)
Target scores, can either be probability estimates or non-thresholded decision values.
alpha : float, default=10
Shape parameter of the gamma distribution of the average contribution of a new customer. (``alpha > 0``)
beta : float, default=10
Rate parameter of the gamma distribution of the average contribution of a new customer. (``beta > 0``)
sales_cost : float, default=500
Average sale conversion cost of targeted leads handled by the company (``sales_cost ≥ 0``).
contact_cost : float, default=50
Average contact cost of targeted leads (``contact_cost ≥ 0``).
direct_selling : float, default=1
Fraction of leads sold to directly (``0 ≤ direct_selling ≤ 1``).
`direct_selling` = 0 for indirect channel.
`direct_selling` = 1 for direct channel.
commission : float, default=0.1
Fraction of contribution paid to the intermedaries (``0 ≤ commission ≤ 1``).
.. note::
The commission is only relevant when there is an indirect channel (``direct_selling < 1``).
check_input : bool, default=True
Perform input validation.
Turning off improves performance, useful when using this metric as a loss function.
Returns
-------
empa : tuple[float]
Expected Maximum Profit measure for customer Acquisition.
Notes
-----
The EMPA is defined as:
.. math::
\\int_{R} [[ \\rho(R-c-S)+(1-\\rho)(\\gamma R - c)] \\pi_0 F_0(t) - c \\pi_1 F_1(t)] \\cdot g(CLV) \\, dCLV
The EMPA requires that the acquisition class is encoded as 0, and it is NOT interchangeable.
However, this implementation assumes the standard notation ('acquisition': 1, 'no acquisition': 0).
Examples
--------
Direct channel (rho = 1):
>>> from empulse.metrics import empa_score
>>>
>>> y_true = [0, 1, 0, 1, 0, 1, 0, 1]
>>> y_score = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
>>> empa_score(y_true, y_score, direct_selling=1)
3706.2500000052773
Indirect channel using scorer (rho = 0):
>>> import numpy as np
>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import cross_val_score, StratifiedKFold
>>> from sklearn.metrics import make_scorer
>>> from empulse.metrics import empa_score
>>>
>>> X, y = make_classification(random_state=42)
>>> model = LogisticRegression()
>>> cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
>>> scorer = make_scorer(
... empa_score,
... response_method='predict_proba',
... alpha=10,
... beta=0.001,
... sales_cost=2_000,
... contact_cost=100,
... direct_selling=0,
... )
>>> np.mean(cross_val_score(model, X, y, cv=cv, scoring=scorer))
4449.0
"""
return empa(
y_true,
y_score,
alpha=alpha,
beta=beta,
contact_cost=contact_cost,
sales_cost=sales_cost,
direct_selling=direct_selling,
commission=commission,
check_input=check_input,
)[0]
[docs]
def empa(
y_true: FloatArrayLike,
y_score: FloatArrayLike,
*,
alpha: float = 12,
beta: float = 0.0015,
contact_cost: float = 50,
sales_cost: float = 500,
direct_selling: float = 1,
commission: float = 0.1,
check_input: bool = True,
) -> tuple[float, float]:
"""
Expected Maximum Profit measure for customer Acquisition (EMPA).
EMPA presumes a situation where leads are targeted either directly or indirectly.
Directly targeted leads are contacted and handled by the internal sales team.
Indirectly targeted leads are contacted and then referred to intermediaries,
which receive a commission.
The contribution of a successful acquisition is modeled as a :math:`Gamma(\\alpha, \\beta)` distribution.
.. seealso::
:func:`~empulse.metrics.empa_score` : to only return the EMPA score.
:func:`~empulse.metrics.mpa` : for a deterministic version of this metric.
Parameters
----------
y_true : 1D array-like, shape=(n_samples,)
Binary target values ('acquisition': 1, 'no acquisition': 0).
y_score : 1D array-like, shape=(n_samples,)
Target scores, can either be probability estimates or non-thresholded decision values.
alpha : float, default=10
Shape parameter of the gamma distribution of the average contribution of a new customer. (``alpha > 0``)
beta : float, default=10
Rate parameter of the gamma distribution of the average contribution of a new customer. (``beta > 0``)
sales_cost : float, default=500
Average sale conversion cost of targeted leads handled by the company (``sales_cost ≥ 0``).
contact_cost : float, default=50
Average contact cost of targeted leads (``contact_cost ≥ 0``).
direct_selling : float, default=1
Fraction of leads sold to directly (``0 ≤ direct_selling ≤ 1``).
`direct_selling` = 0 for indirect channel.
`direct_selling` = 1 for direct channel.
commission : float, default=0.1
Fraction of contribution paid to the intermedaries (``0 ≤ commission ≤ 1``).
.. note::
The commission is only relevant when there is an indirect channel (``direct_selling < 1``).
check_input : bool, default=True
Perform input validation.
Turning off improves performance, useful when using this metric as a loss function.
Returns
-------
empa : float
Expected Maximum Profit measure for customer Acquisition
threshold : float
Fraction of the leads that should be targeted to maximize profit
Notes
-----
The EMPA is defined as:
.. math::
\\int_{R} [[ \\rho(R-c-S)+(1-\\rho)(\\gamma R - c)] \\pi_0 F_0(t) - c \\pi_1 F_1(t)] \\cdot g(CLV) \\, dCLV
The EMPA requires that the acquisition class is encoded as 0, and it is NOT interchangeable.
However, this implementation assumes the standard notation ('acquisition': 1, 'no acquisition': 0).
Examples
--------
Direct channel (rho = 1):
>>> from empulse.metrics import empa
>>>
>>> y_true = [0, 1, 0, 1, 0, 1, 0, 1]
>>> y_score = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
>>> empa(y_true, y_score, direct_selling=1)
(3706.2500000052773, 0.8749999997947746)
Indirect channel (rho = 0):
>>> from empulse.metrics import empa
>>>
>>> y_true = [0, 1, 0, 1, 0, 1, 0, 1]
>>> y_score = [0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
>>> empa(y_true, y_score, direct_selling=0)
(3556.25, 0.875)
""" # noqa: D401
if check_input:
y_true, y_score = _validate_input_stochastic(
y_true, y_score, alpha, beta, contact_cost, sales_cost, direct_selling, commission
)
else:
y_true = np.asarray(y_true)
y_score = np.asarray(y_score)
y_true = y_true.astype(np.int32)
y_score = y_score.astype(np.float64)
positive_class_prob, negative_class_prob = _compute_prior_class_probabilities(y_true)
true_positive_rates, false_positive_rates = convex_hull(y_true, y_score)
true_positive_rates = np.expand_dims(true_positive_rates, axis=1)
false_positive_rates = np.expand_dims(false_positive_rates, axis=1)
tpr_diff, fpr_diff = _compute_tpr_fpr_diffs(true_positive_rates, false_positive_rates)
fpr_coef = contact_cost * negative_class_prob
tpr_coef = (-direct_selling * sales_cost - contact_cost) * positive_class_prob
denominator = (direct_selling + (1 - direct_selling) * (1 - commission)) * positive_class_prob
bounds = _compute_integration_bounds(tpr_coef, fpr_coef, denominator, tpr_diff, fpr_diff)
cdf_diff = np.diff(st.gamma.cdf(bounds, a=alpha, loc=0, scale=1 / beta), axis=0)
cdf_1_diff = np.diff(st.gamma.cdf(bounds, a=alpha + 1, loc=0, scale=1 / beta), axis=0)
cdf_coef = tpr_coef * true_positive_rates - fpr_coef * false_positive_rates
cdf_1_coef = denominator * true_positive_rates
expected_profit = (alpha / beta) * (cdf_1_coef * cdf_1_diff).sum(axis=0) + (cdf_coef * cdf_diff).sum(axis=0)
threshold = (
cdf_diff * (positive_class_prob * true_positive_rates + negative_class_prob * false_positive_rates)
).sum()
return expected_profit.sum(), threshold
def _compute_integration_bounds(
tpr_coef: float,
fpr_coef: float,
denominator: float,
tpr_diff: FloatNDArray,
fpr_diff: FloatNDArray,
) -> FloatNDArray:
"""Compute the integration bounds for the contribution of a new customer."""
# ignore division by zero warning
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=RuntimeWarning)
clv_bounds = (fpr_coef * fpr_diff - tpr_coef * tpr_diff) / (denominator * tpr_diff)
# add zero and infinity to bounds
if clv_bounds.ndim == 2:
return np.concatenate([
np.zeros((1, clv_bounds.shape[1])),
clv_bounds,
np.full(shape=(1, clv_bounds.shape[1]), fill_value=np.inf),
])
elif clv_bounds.ndim == 1:
integration_bounds: FloatNDArray = np.concatenate([[0], clv_bounds, [np.inf]]).reshape(-1, 1)
return integration_bounds
else:
raise ValueError(f'Invalid number of dimensions: {clv_bounds.ndim}')