Source code for empulse.samplers.cost_sampler

import warnings
from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self

import numpy as np
from imblearn.base import BaseSampler
from numpy.typing import ArrayLike, NDArray
from sklearn.utils import check_random_state
from sklearn.utils._param_validation import Interval, Real, StrOptions

from .._common import Parameter
from .._types import FloatArrayLike, IntNDArray, ParameterConstraint
from ..utils._sklearn_compat import ClassifierTags, Tags  # type: ignore


class CostSensitiveSampler(BaseSampler):  # type: ignore[misc]
    """
    Sampler which performs cost-proportionate resampling.

    This method adjusts the sampling probability of each sample based on the cost of misclassification.
    This is done either by rejection sampling [1]_ or oversampling [2]_.

    Read more in the :ref:`User Guide <cost_sampling>`.

    Parameters
    ----------
    method : {'rejection sampling', 'oversampling'}, default='rejection sampling'
        Method to perform the cost-proportionate sampling,
        either 'rejection sampling' or 'oversampling'.

    oversampling_norm: float, default=0.1
        Oversampling norm for the cost.
        The smaller the oversampling_norm, the more samples are generated.
        Must be strictly positive when ``method='oversampling'``.

    percentile_threshold: float, default=0.975
        Outlier adjustment for the cost.
        Costs are normalized and cost values above the percentile_threshold'th percentile are set to 1.

    random_state : int or :class:`numpy:numpy.random.RandomState`, optional
        Random number generator seed for reproducibility.

    fp_cost : float or array-like, shape=(n_samples,), default=0.0
        Cost of false positives. If ``float``, then all false positives have the same cost.
        If array-like, then it is the cost of each false positive classification.
        Is overwritten if another `fp_cost` is passed to the ``fit_resample`` method.

        .. note::
            It is not recommended to pass instance-dependent costs to the ``__init__`` method.
            Instead, pass them to the ``fit_resample`` method.

    fn_cost : float or array-like, shape=(n_samples,), default=0.0
        Cost of false negatives. If ``float``, then all false negatives have the same cost.
        If array-like, then it is the cost of each false negative classification.
        Is overwritten if another `fn_cost` is passed to the ``fit_resample`` method.

        .. note::
            It is not recommended to pass instance-dependent costs to the ``__init__`` method.
            Instead, pass them to the ``fit_resample`` method.

    Attributes
    ----------
    sample_indices_ : numpy.ndarray
        Indices of the samples that were selected.

    References
    ----------
    .. [1] B. Zadrozny, J. Langford, N. Abe, "Cost-sensitive learning by
           cost-proportionate example weighting", in Proceedings of the
           Third IEEE International Conference on Data Mining, 435-442, 2003.

    .. [2] C. Elkan, "The foundations of Cost-Sensitive Learning",
           in Seventeenth International Joint Conference on Artificial Intelligence,
           973-978, 2001.

    Notes
    -----
    code modified from `costcla.sampling.cost_sampling`.

    Examples
    --------
    .. code-block:: python

        import numpy as np
        from empulse.samplers import CostSensitiveSampler
        from sklearn.datasets import make_classification

        X, y = make_classification()
        fp_cost = np.ones_like(y) * 10
        fn_cost = np.ones_like(y)

        sampler = CostSensitiveSampler(method='oversampling', random_state=42)
        X_re, y_re = sampler.fit_resample(X, y, fp_cost=fp_cost, fn_cost=fn_cost)
    """

    _sampling_type: ClassVar[str] = 'bypass'
    _parameter_constraints: ClassVar[ParameterConstraint] = {
        'method': [StrOptions({'oversampling', 'rejection sampling'})],
        'oversampling_norm': [Interval(Real, 0, 1, closed='both')],
        'percentile_threshold': [Interval(Real, 0, 1, closed='both')],
        'random_state': ['random_state'],
        'fp_cost': [Real, 'array-like'],
        'fn_cost': [Real, 'array-like'],
    }

    if TYPE_CHECKING:  # pragma: no cover
        # BaseEstimator should dynamically generate the method signature at runtime
        def set_fit_resample_request(self, *, fp_cost: bool = False, fn_cost: bool = False) -> Self:  # noqa: D102
            pass

    def __init__(
        self,
        method: Literal['rejection sampling', 'oversampling'] = 'rejection sampling',
        *,
        oversampling_norm: float = 0.1,
        percentile_threshold: float = 0.975,
        random_state: int | np.random.RandomState | None = None,
        fp_cost: float | FloatArrayLike = 0.0,
        fn_cost: float | FloatArrayLike = 0.0,
    ):
        super().__init__()
        self.method = method
        self.oversampling_norm = oversampling_norm
        self.percentile_threshold = percentile_threshold
        self.random_state = random_state
        self.fp_cost = fp_cost
        self.fn_cost = fn_cost

    def _more_tags(self) -> dict[str, bool]:
        return {
            'binary_only': True,
            'poor_score': True,
        }

    def __sklearn_tags__(self) -> Tags:
        tags = super().__sklearn_tags__()
        tags.classifier_tags = ClassifierTags(multi_class=False)
        tags.sampler_tags.sample_indices = True
        return tags

    def fit_resample(
        self,
        X: ArrayLike,
        y: ArrayLike,
        *,
        fp_cost: float | ArrayLike | Parameter = Parameter.UNCHANGED,
        fn_cost: float | ArrayLike | Parameter = Parameter.UNCHANGED,
    ) -> tuple[NDArray[Any], NDArray[Any]]:
        """
        Resample the dataset.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        y : array-like of shape (n_samples,)

        fp_cost : float or array-like, shape=(n_samples,), default=$UNCHANGED$
            Cost of false positives. If ``float``, then all false positives have the same cost.
            If array-like, then it is the cost of each false positive classification.

        fn_cost : float or array-like, shape=(n_samples,), default=$UNCHANGED$
            Cost of false negatives. If ``float``, then all false negatives have the same cost.
            If array-like, then it is the cost of each false negative classification.

        Returns
        -------
        X_resampled : ndarray of shape (n_samples_new, n_features)
            The array containing the resampled data.
        y_resampled : ndarray of shape (n_samples_new,)
            The corresponding label of `X_resampled`.
        """
        # The parent implementation forwards the cost keywords to ``_fit_resample``.
        X_resampled: NDArray[Any]
        y_resampled: NDArray[Any]
        X_resampled, y_resampled = super().fit_resample(X, y, fp_cost=fp_cost, fn_cost=fn_cost)
        return X_resampled, y_resampled

    @staticmethod
    def _as_cost_array(cost: float | FloatArrayLike, shape: tuple[int, ...]) -> NDArray[np.float64]:
        """Return ``cost`` as a fresh float64 array; scalars are broadcast to ``shape``."""
        if isinstance(cost, Real):
            # Build the array as float64 explicitly: deriving the dtype from the
            # (integer) label array would truncate fractional costs such as 0.5 to 0.
            return np.full(shape, cost, dtype=np.float64)
        # ``np.array`` copies, so the caller's array is never modified in place.
        return np.array(cost, dtype=np.float64)

    def _fit_resample(
        self,
        X: NDArray[Any],
        y: IntNDArray,
        fp_cost: float | FloatArrayLike | Parameter = 0.0,
        fn_cost: float | FloatArrayLike | Parameter = 0.0,
    ) -> tuple[NDArray[Any], NDArray[Any]]:
        """
        Select samples proportionally to their misclassification cost.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        y : ndarray of shape (n_samples,)
            Binary labels; samples with ``y == 1`` are weighted by `fn_cost`,
            all other samples by `fp_cost`.
        fp_cost : float or array-like, default=0.0
            Cost of false positives. ``Parameter.UNCHANGED`` falls back to ``self.fp_cost``.
        fn_cost : float or array-like, default=0.0
            Cost of false negatives. ``Parameter.UNCHANGED`` falls back to ``self.fn_cost``.

        Returns
        -------
        X_resampled : ndarray
        y_resampled : ndarray

        Raises
        ------
        ValueError
            If ``method`` is not a supported option, or if ``method='oversampling'``
            and ``oversampling_norm`` is not strictly positive.
        """
        if fp_cost is Parameter.UNCHANGED:
            fp_cost = self.fp_cost
        if fn_cost is Parameter.UNCHANGED:
            fn_cost = self.fn_cost

        if (
            all(isinstance(cost, Real) for cost in (fp_cost, fn_cost))
            and sum(abs(cost) for cost in (fp_cost, fn_cost)) == 0.0  # type: ignore[misc, arg-type]
        ):
            warnings.warn(
                'All costs are zero. Setting fp_cost=1 and fn_cost=1. '
                f'To avoid this warning, set costs explicitly in the {self.__class__.__name__}.fit_resample() method.',
                UserWarning,
                stacklevel=2,
            )
            fp_cost = 1
            fn_cost = 1

        fp_cost_array = self._as_cost_array(fp_cost, y.shape)  # type: ignore[arg-type]
        fn_cost_array = self._as_cost_array(fn_cost, y.shape)  # type: ignore[arg-type]

        rng = check_random_state(self.random_state)

        # Positives can only be misclassified as false negatives, every other
        # sample only as a false positive. ``fp_cost_array`` is a private copy,
        # so overwriting it in place is safe.
        is_positive = y == 1
        misclassification_costs = fp_cost_array
        misclassification_costs[is_positive] = fn_cost_array[is_positive]

        # Scale by the chosen percentile and clip at 1 so outlying costs do not dominate.
        # NOTE: a percentile of 0 (e.g. all costs zero) makes this division yield nan/inf.
        cost_scale = np.percentile(misclassification_costs, self.percentile_threshold * 100)
        normalized_costs = np.minimum(misclassification_costs / cost_scale, 1)

        n_samples = X.shape[0]

        if self.method == 'rejection sampling':
            # Keep each sample with probability equal to its normalized cost.
            rejection_probability = rng.rand(n_samples)
            self.sample_indices_ = np.arange(n_samples)[rejection_probability <= normalized_costs]
        elif self.method == 'oversampling':
            if self.oversampling_norm <= 0:
                # Dividing by zero would produce inf/nan repeat counts which only
                # fail later with an unrelated error (or a huge allocation).
                raise ValueError(
                    f"oversampling_norm must be strictly positive when method='oversampling', "
                    f'got {self.oversampling_norm!r}.'
                )
            # repeat each sample based on the normalized costs
            sample_repeats = np.ceil(normalized_costs / self.oversampling_norm).astype(np.int64)
            self.sample_indices_ = np.repeat(np.arange(n_samples), sample_repeats)
        else:
            raise ValueError(f"Method not valid. Expected 'rejection sampling' or 'oversampling', got {self.method!r}.")

        X_re = X[self.sample_indices_]
        y_re = y[self.sample_indices_]

        return X_re, y_re