Source code for empulse.models.cost_sensitive.b2boost

from numbers import Real
from typing import Any, ClassVar, Self, TypeVar

import numpy as np
import sympy
from numpy.typing import ArrayLike
from sklearn.utils._param_validation import HasMethods

from ..._types import FloatArrayLike, ParameterConstraint

try:
    from xgboost import XGBClassifier
except ImportError:
    XGBClassifier = TypeVar('XGBClassifier')  # type: ignore[misc, assignment]
try:
    from lightgbm import LGBMClassifier
except ImportError:
    LGBMClassifier = TypeVar('LGBMClassifier')  # type: ignore[misc, assignment]
try:
    from catboost import CatBoostClassifier
except ImportError:
    CatBoostClassifier = TypeVar('CatBoostClassifier')  # type: ignore[misc, assignment]

from ..._common import Parameter
from ...metrics import Cost, CostMatrix, Metric
from .csboost import CSBoostClassifier


[docs] class B2BoostClassifier(CSBoostClassifier): """ Cost-sensitive gradient boosting classifier for B2B customer churn. B2BoostClassifier supports :class:`xgboost:xgboost.XGBClassifier`, :class:`lightgbm:lightgbm.LGBMClassifier` and :class:`catboost.CatBoostClassifier`. By default, it uses XGBoost classifier with default hyperparameters. Read more in the :ref:`User Guide <csboost>`. Parameters ---------- estimator : :class:`xgboost:xgboost.XGBClassifier`, :class:`lightgbm:lightgbm.LGBMClassifier` \ or :class:`catboost.CatBoostClassifier`, optional XGBoost or LightGBM classifier to be fit with desired hyperparameters. If not provided, a XGBoost classifier with default hyperparameters is used. accept_rate : float, default=0.3 Probability of a customer responding to the retention offer (0 < `accept_rate` < 1). Is overwritten if another `accept_rate` is passed to the ``fit`` method. clv : float or 1D array-like, shape=(n_samples), default=200 If ``float``: constant customer lifetime value per retained customer (``clv > incentive_cost``). If ``array``: individualized customer lifetime value of each customer when retained (``mean(clv) > incentive_cost``). Is overwritten if another `clv` is passed to the ``fit`` method. .. note:: It is not recommended to pass instance-dependent costs to the ``__init__`` method. Instead, pass them to the ``fit`` method. incentive_fraction : float, default=0.05 Cost of incentive offered to a customer, as a fraction of customer lifetime value (``0 < incentive_fraction < 1``). Is overwritten if another `incentive_fraction` is passed to the ``fit`` method. contact_cost : float, default=15 Constant cost of contact (``contact_cost > 0``). Is overwritten if another `contact_cost` is passed to the ``fit`` method. Attributes ---------- classes_ : numpy.ndarray, shape=(n_classes,) Unique classes in the target. estimator_ : :class:`xgboost:xgboost.XGBClassifier` Fitted XGBoost classifier. Notes ----- The instance-specific cost function for customer churn is defined as [1]_: .. math:: C(s_i) = y_i[s_i(f-\\gamma (1-\\delta )CLV_i] + (1-y_i)[s_i(\\delta CLV_i + f)] The measure requires that the churn class is encoded as 0, and it is NOT interchangeable. However, this implementation assumes the standard notation ('churn': 1, 'no churn': 0). .. seealso:: :func:`~empulse.metrics.create_objective_churn` : Creates the instance-dependent cost function for customer churn. Examples -------- .. code-block:: python import numpy as np from empulse.models import B2BoostClassifier from sklearn.datasets import make_classification X, y = make_classification() clv = np.random.rand(y.size) * 100 model = B2BoostClassifier() model.fit(X, y, clv=clv, incentive_fraction=0.1) .. code-block:: python import numpy as np from empulse.models import B2BoostClassifier from sklearn import set_config from sklearn.datasets import make_classification from sklearn.model_selection import cross_val_score from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler set_config(enable_metadata_routing=True) X, y = make_classification(n_samples=50) clv = np.random.rand(y.size) * 100 pipeline = Pipeline([ ('scaler', StandardScaler()), ('model', B2BoostClassifier(contact_cost=10).set_fit_request(clv=True)) ]) cross_val_score(pipeline, X, y, params={'clv': clv}) .. code-block:: python import numpy as np from empulse.metrics import empb_score from empulse.models import B2BoostClassifier from sklearn import set_config from sklearn.datasets import make_classification from sklearn.model_selection import GridSearchCV from sklearn.metrics import make_scorer from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from xgboost import XGBClassifier set_config(enable_metadata_routing=True) X, y = make_classification() clv = np.random.rand(y.size) * 100 contact_cost = 10 pipeline = Pipeline([ ('scaler', StandardScaler()), ('model', B2BoostClassifier( XGBClassifier(n_jobs=2, n_estimators=10), contact_cost=contact_cost ).set_fit_request(clv=True)) ]) param_grid = { 'model__estimator__learning_rate': np.logspace(-5, 0, 5), } scorer = make_scorer( empb_score, response_method='predict_proba', contact_cost=contact_cost ) scorer = scorer.set_score_request(clv=True) grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring=scorer) grid_search.fit(X, y, clv=clv) References ---------- .. [1] Janssens, B., Bogaert, M., Bagué, A., & Van den Poel, D. (2022). B2Boost: Instance-dependent profit-driven modelling of B2B churn. Annals of Operations Research, 1-27. """ _parameter_constraints: ClassVar[ParameterConstraint] = { 'estimator': [HasMethods(['fit', 'predict_proba']), None], 'accept_rate': [Real], 'clv': ['array-like', Real], 'incentive_fraction': [Real], 'contact_cost': [Real], } _LOSS = Metric( CostMatrix() .add_tp_benefit( sympy.Symbol('gamma') * (sympy.Symbol('clv') - sympy.Symbol('clv') * sympy.Symbol('delta') - sympy.Symbol('f')) ) .add_tp_benefit((1 - sympy.Symbol('gamma')) * -sympy.Symbol('f')) .add_fp_cost(sympy.Symbol('clv') * sympy.Symbol('delta') + sympy.Symbol('f')) .alias('accept_rate', sympy.Symbol('gamma')) .alias('incentive_fraction', sympy.Symbol('delta')) .alias('contact_cost', sympy.Symbol('f')) .alias('clv', sympy.Symbol('clv')), Cost(), ) def __init__( self, estimator: XGBClassifier | LGBMClassifier | CatBoostClassifier | None = None, *, accept_rate: float = 0.3, clv: float | FloatArrayLike = 200, incentive_fraction: float = 0.05, contact_cost: float = 15, ) -> None: super().__init__(estimator=estimator, loss=self._LOSS) self.clv = clv self.incentive_fraction = incentive_fraction self.contact_cost = contact_cost self.accept_rate = accept_rate
[docs] def fit( self, X: FloatArrayLike, y: ArrayLike, *, accept_rate: float | Parameter = Parameter.UNCHANGED, clv: ArrayLike | float | Parameter = Parameter.UNCHANGED, incentive_fraction: float | Parameter = Parameter.UNCHANGED, contact_cost: float | Parameter = Parameter.UNCHANGED, fit_params: dict[str, Any] | None = None, **loss_params: Any, ) -> Self: """ Fit the model. Parameters ---------- X : array-like of shape (n_samples, n_features) y : array-like of shape (n_samples,) accept_rate : float, default=0.3 Probability of a customer responding to the retention offer (``0 < accept_rate < 1``). clv : float or 1D array-like, shape=(n_samples), default=200 If ``float``: constant customer lifetime value per retained customer (``clv > incentive_cost``). If ``array``: individualized customer lifetime value of each customer when retained (``mean(clv) > incentive_cost``). incentive_fraction : float, default=0.05 Cost of incentive offered to a customer, as a fraction of customer lifetime value (``0 < incentive_fraction < 1``). contact_cost : float, default=15 Constant cost of contact (``contact_cost > 0``). fit_params : dict, optional Additional parameters to pass to the estimator's fit method. loss_params : dict Additional keyword arguments to pass to the loss function. Returns ------- self : B2BoostClassifier Fitted B2Boost model. """ if accept_rate is Parameter.UNCHANGED: accept_rate = self.accept_rate if clv is Parameter.UNCHANGED: clv = self.clv if incentive_fraction is Parameter.UNCHANGED: incentive_fraction = self.incentive_fraction if contact_cost is Parameter.UNCHANGED: contact_cost = self.contact_cost if not isinstance(clv, float | int): clv = np.asarray(clv) super().fit( X, y, accept_rate=accept_rate, clv=clv, incentive_fraction=incentive_fraction, contact_cost=contact_cost, fit_params=fit_params, **loss_params, ) return self