Source code for empulse.datasets.datasets

from dataclasses import dataclass
from os.path import dirname, join
from typing import Generic, Literal, TypeVar, overload

import numpy as np
import pandas as pd

from .._types import FloatNDArray

Frame = TypeVar('Frame', pd.DataFrame, FloatNDArray)
Series = TypeVar('Series', pd.Series, FloatNDArray)


[docs] @dataclass(frozen=True) class Dataset(Generic[Frame, Series]): """ Container object for datasets returned by the load functions. Attributes ---------- data : :class:`pandas:pandas.DataFrame` or :class:`numpy:numpy.ndarray` Features of the dataset. target : :class:`pandas:pandas.Series` or :class:`numpy:numpy.ndarray` The classification labels. tp_cost : :class:`pandas:pandas.Series`, :class:`numpy:numpy.ndarray` or float The cost of true positives. tn_cost : :class:`pandas:pandas.Series`, :class:`numpy:numpy.ndarray` or float The cost of true negatives. fp_cost : :class:`pandas:pandas.Series`, :class:`numpy:numpy.ndarray` or float The cost of false positives. fn_cost : :class:`pandas:pandas.Series`, :class:`numpy:numpy.ndarray` or float The cost of false negatives. feature_names : :class:`pandas:pandas.Series` or :class:`numpy:numpy.ndarray` The meaning of the features. target_names : :class:`pandas:pandas.Series` or :class:`numpy:numpy.ndarray` The meaning of the labels. name : str The name of the dataset. DESCR : str The full description of the dataset """ data: Frame target: Series tp_cost: Series | float tn_cost: Series | float fp_cost: Series | float fn_cost: Series | float feature_names: Series target_names: Series name: str DESCR: str
@overload def load_churn_tv_subscriptions( *, as_frame: Literal[True], return_X_y_costs: Literal[False] = False, ) -> Dataset[pd.DataFrame, pd.Series]: ... @overload def load_churn_tv_subscriptions( *, as_frame: Literal[False] = False, return_X_y_costs: Literal[False] = False, ) -> Dataset[FloatNDArray, FloatNDArray]: ... @overload def load_churn_tv_subscriptions( *, as_frame: Literal[True], return_X_y_costs: Literal[True], ) -> tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series, pd.Series, pd.Series]: ... @overload def load_churn_tv_subscriptions( *, as_frame: Literal[False] = False, return_X_y_costs: Literal[True], ) -> tuple[FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray]: ...
[docs] def load_churn_tv_subscriptions( *, as_frame: bool = False, return_X_y_costs: bool = False ) -> ( Dataset[pd.DataFrame, pd.Series] | Dataset[FloatNDArray, FloatNDArray] | tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series, pd.Series, pd.Series] | tuple[FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray] ): """ Load the TV Subscription Churn dataset (binary classification). The goal is to predict whether a customer will churn or not. The target variable is whether the customer churned, 'yes' = 1 and 'no' = 0. This dataset is from a TV cable provider containing all 9410 customers active during the first semester of 2014. Features names are anonymized to protect the privacy of the customers. For additional information about the dataset, consult the :ref:`User Guide <churn_tv_subscriptions>`. ================= ============== Classes 2 Churners 455 Non-churners 8955 Samples 9410 Features 45 ================= ============== Parameters ---------- as_frame : bool, default=False If True, the output will be a pandas DataFrames or Series instead of numpy arrays. return_X_y_costs : bool, default=False If True, return (data, target, tp_cost, fp_cost, tn_cost, fn_cost) instead of a Dataset object. Returns ------- dataset : :class:`~empulse.datasets.Dataset` or tuple of (data, target, tp_cost, fp_cost, tn_cost, fn_cost) Returns a Dataset object if `return_X_y_costs=False` (default), otherwise a tuple. Notes ----- Cost matrix .. list-table:: * - - Actual positive :math:`y_i = 1` - Actual negative :math:`y_i = 0` * - Predicted positive :math:`\\hat{y}_i = 1` - ``tp_cost`` :math:`= \\gamma_i d_i + (1 - \\gamma_i) (CLV_i + c_i)` - ``fp_cost`` :math:`= d_i + c_i` * - Predicted negative :math:`\\hat{y}_i = 0` - ``fn_cost`` :math:`= CLV_i` - ``tn_cost`` :math:`= 0` with - :math:`\\gamma_i` : probability of the customer accepting the retention offer - :math:`CLV_i` : customer lifetime value of the retained customer - :math:`d_i` : cost of incentive offered to the customer - :math:`c_i` : cost of contacting the customer References ---------- .. [1] A. Correa Bahnsen, D.Aouada, B, Ottersten, `"A novel cost-sensitive framework for customer churn predictive modeling" <http://www.decisionanalyticsjournal.com/content/pdf/s40165-015-0014-6.pdf>`__, Decision Analytics, 2:5, 2015. Examples -------- .. code-block:: python from empulse.datasets import load_churn_tv_subscriptions from sklearn.model_selection import train_test_split dataset = load_churn_tv_subscriptions() X_train, X_test, y_train, y_test = train_test_split( dataset.data, dataset.target, random_state=42 ) """ module_path = dirname(__file__) raw_data = pd.read_csv( join(module_path, 'data', 'churn_tv_subscriptions.csv.gz'), delimiter=',', compression='gzip' ) with open(join(module_path, 'descriptions', 'churn_tv_subscriptions.rst'), encoding='utf-8') as f: description = f.read() data = raw_data.iloc[:, 1:-5] if return_X_y_costs: if as_frame: return data, raw_data.target, raw_data['C_TP'], raw_data['C_FP'], raw_data['C_TN'], raw_data['C_FN'] else: return ( data.to_numpy(), raw_data.target.to_numpy().astype(np.int8), raw_data['C_TP'].to_numpy(), raw_data['C_FP'].to_numpy(), raw_data['C_TN'].to_numpy(), raw_data['C_FN'].to_numpy(), ) else: return Dataset( data=data.to_numpy() if not as_frame else data, target=raw_data.target.to_numpy().astype(np.int8) if not as_frame else raw_data['target'], tp_cost=raw_data.C_TP.to_numpy() if not as_frame else raw_data['C_TP'], fp_cost=raw_data.C_FP.to_numpy() if not as_frame else raw_data['C_FP'], tn_cost=raw_data.C_TN.to_numpy() if not as_frame else raw_data['C_TN'], fn_cost=raw_data.C_FN.to_numpy() if not as_frame else raw_data['C_FN'], feature_names=data.columns.to_numpy() if not as_frame else data.columns, target_names=np.array(['no churn', 'churn']) if not as_frame else pd.Series(['no churn', 'churn']), name='Churn TV subscriptions', DESCR=description, )
@overload def load_upsell_bank_telemarketing( *, as_frame: Literal[True], return_X_y_costs: Literal[False] = False, interest_rate: float = 0.02463333, term_deposit_fraction: float = 0.25, contact_cost: float = 1, ) -> Dataset[pd.DataFrame, pd.Series]: ... @overload def load_upsell_bank_telemarketing( *, as_frame: Literal[False] = False, return_X_y_costs: Literal[False] = False, interest_rate: float = 0.02463333, term_deposit_fraction: float = 0.25, contact_cost: float = 1, ) -> Dataset[FloatNDArray, FloatNDArray]: ... @overload def load_upsell_bank_telemarketing( *, as_frame: Literal[True], return_X_y_costs: Literal[True], interest_rate: float = 0.02463333, term_deposit_fraction: float = 0.25, contact_cost: float = 1, ) -> tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series, pd.Series, pd.Series]: ... @overload def load_upsell_bank_telemarketing( *, as_frame: Literal[False] = False, return_X_y_costs: Literal[True], interest_rate: float = 0.02463333, term_deposit_fraction: float = 0.25, contact_cost: float = 1, ) -> tuple[FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray]: ...
[docs] def load_upsell_bank_telemarketing( *, as_frame: bool = False, return_X_y_costs: bool = False, interest_rate: float = 0.02463333, term_deposit_fraction: float = 0.25, contact_cost: float = 1, ) -> ( Dataset[pd.DataFrame, pd.Series] | Dataset[FloatNDArray, FloatNDArray] | tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series, pd.Series, pd.Series] | tuple[FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray] ): """ Load the bank telemarketing dataset (binary classification). The goal is to predict whether a client will subscribe to a term deposit after being called by the bank. The target variable is whether the client subscribed to the term deposit, 'yes' = 1 and 'no' = 0. The dataset is related to a direct marketing campaigns (phone calls) of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be or not subscribed. Features recorded before the contact event are removed from the original dataset [1]_ to avoid data leakage. Only clients with a positive balance are considered, since clients in debt are not eligible for term deposits. For a full data description and additional information about the dataset, consult the :ref:`User Guide <upsell_bank_telemarketing>`. ================= ============== Classes 2 Subscribers 4787 Non-subscribers 33144 Samples 37931 Features 10 ================= ============== Parameters ---------- as_frame : bool, default=False If True, the output will be a pandas DataFrames or Series instead of numpy arrays. return_X_y_costs : bool, default=False If True, return (data, target, tp_cost, fp_cost, tn_cost, fn_cost) instead of a Dataset object. interest_rate : float, default=0.02463333 Interest rate of the term deposit. term_deposit_fraction : float, default=0.25 Fraction of the client's balance that is deposited in the term deposit. contact_cost : float, default=1 Cost of contacting the client. Returns ------- dataset : :class:`~empulse.datasets.Dataset` or tuple of (data, target, tp_cost, fp_cost, tn_cost, fn_cost) Returns a Dataset object if `return_X_y_costs=False` (default), otherwise a tuple. Notes ----- Cost matrix .. list-table:: * - - Actual positive :math:`y_i = 1` - Actual negative :math:`y_i = 0` * - Predicted positive :math:`\\hat{y}_i = 1` - ``tp_cost`` :math:`= c` - ``fp_cost`` :math:`= c` * - Predicted negative :math:`\\hat{y}_i = 0` - ``fn_cost`` :math:`= r \\, d_i \\, b_i` - ``tn_cost`` :math:`= 0` with - :math:`c` : cost of contacting the client - :math:`r` : interest rate of the term deposit - :math:`d_i` : fraction of the client's balance that is deposited in the term deposit - :math:`b_i` : client's balance Using default parameters, it is assumed that :math:`c = 1`, :math:`r = 0.02463333`, :math:`d_i = 0.25` for all clients. References ---------- .. [1] Moro, S., Rita, P., & Cortez, P. (2014). Bank Marketing [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C5K306. .. [2] S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimaraes, Portugal, October, 2011. EUROSIS. [bank.zip] .. [3] A. Correa Bahnsen, A. Stojanovic, D.Aouada, B, Ottersten, "Improving Credit Card Fraud Detection with Calibrated Probabilities", in Proceedings of the fourteenth SIAM International Conference on Data Mining, 677-685, 2014. Examples -------- .. code-block:: python from empulse.datasets import load_upsell_bank_telemarketing from sklearn.model_selection import train_test_split dataset = load_upsell_bank_telemarketing() X_train, X_test, y_train, y_test = train_test_split( dataset.data, dataset.target, random_state=42 ) """ module_path = dirname(__file__) raw_data = pd.read_csv(join(module_path, 'data', 'bankmarketing.csv.gz'), delimiter=';', compression='gzip') with open(join(module_path, 'descriptions', 'bankmarketing.rst'), encoding='utf-8') as f: description = f.read() # only use features pre-contact: # 1 - age (numeric) # 2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur", # "student","blue-collar","self-employed","retired","technician","services") # 3 - marital : marital status (categorical: "married","divorced","single"; # note: "divorced" means divorced or widowed) # 4 - education (categorical: "unknown","secondary","primary","tertiary") # 5 - default: has credit in default? (binary: "yes","no") # 6 - balance: average yearly balance, in euros (numeric) # 7 - housing: has housing loan? (binary: "yes","no") # 8 - loan: has personal loan? (binary: "yes","no") # 15 - previous: number of contacts performed before this campaign and for this client (numeric) # 16 - poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success") # Following features exclude because are collected after the contact event # # related with the last contact of the current campaign: # 9 - contact: contact communication type (categorical: "unknown","telephone","cellular") # 10 - day: last contact day of the month (numeric) # 11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec") # 12 - duration: last contact duration, in seconds (numeric) # # other attributes: # 13 - campaign: number of contacts performed during this campaign and for this client # 14 - pdays: number of days that passed by after the client was last contacted from a # previous campaign (numeric, -1 means client was not previously contacted) # Filter if balance>0 raw_data = raw_data.loc[raw_data['balance'] > 0] target = (raw_data.y.to_numpy() == 'yes').astype(np.int8) data = raw_data[ ['age', 'balance', 'previous', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'poutcome'] ] fp_cost = contact_cost fn_cost = np.maximum(data['balance'].to_numpy() * interest_rate * term_deposit_fraction, contact_cost) tp_cost = contact_cost tn_cost = 0.0 data['default'] = data['default'].map({'yes': 1, 'no': 0}) data['housing'] = data['housing'].map({'yes': 1, 'no': 0}) data['loan'] = data['loan'].map({'yes': 1, 'no': 0}) data = data.astype({ 'age': np.uint8, 'balance': np.int32, 'previous': np.uint8, 'job': 'category', 'marital': 'category', 'education': 'category', 'default': np.uint8, 'housing': np.uint8, 'loan': np.uint8, 'poutcome': 'category', }) data = data.rename( columns={ 'poutcome': 'previous_outcome', 'loan': 'has_personal_loan', 'housing': 'has_housing_loan', 'default': 'has_credit_in_default', } ) if return_X_y_costs: if as_frame: return ( data, pd.Series(target, name='subscription'), tp_cost, fp_cost, tn_cost, pd.Series(fn_cost, name='fn_cost'), ) else: return (data.to_numpy(), target, tp_cost, fp_cost, tn_cost, fn_cost) else: target_names = ['no subscription', 'subscription'] return Dataset( data=data.to_numpy() if not as_frame else data, target=target if not as_frame else pd.Series(target, name='subscription'), tp_cost=tp_cost, fp_cost=fp_cost, tn_cost=tn_cost, fn_cost=fn_cost if not as_frame else pd.Series(fn_cost, name='fn_cost'), feature_names=data.columns.to_numpy() if not as_frame else data.columns, target_names=np.array(target_names) if not as_frame else pd.Series(target_names, name='target'), name='Bank Telemarketing', DESCR=description, )
@overload def load_give_me_some_credit( *, as_frame: Literal[True], return_X_y_costs: Literal[False] = False, interest_rate: float = 0.0479, fund_cost: float = 0.0294, max_credit_line: float = 25000, loss_given_default: float = 0.75, term_length_months: int = 24, loan_to_income_ratio: float = 3, ) -> Dataset[pd.DataFrame, pd.Series]: ... @overload def load_give_me_some_credit( *, as_frame: Literal[False] = False, return_X_y_costs: Literal[False] = False, interest_rate: float = 0.0479, fund_cost: float = 0.0294, max_credit_line: float = 25000, loss_given_default: float = 0.75, term_length_months: int = 24, loan_to_income_ratio: float = 3, ) -> Dataset[FloatNDArray, FloatNDArray]: ... @overload def load_give_me_some_credit( *, as_frame: Literal[True], return_X_y_costs: Literal[True], interest_rate: float = 0.0479, fund_cost: float = 0.0294, max_credit_line: float = 25000, loss_given_default: float = 0.75, term_length_months: int = 24, loan_to_income_ratio: float = 3, ) -> tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series, pd.Series, pd.Series]: ... @overload def load_give_me_some_credit( *, as_frame: Literal[False] = False, return_X_y_costs: Literal[True], interest_rate: float = 0.0479, fund_cost: float = 0.0294, max_credit_line: float = 25000, loss_given_default: float = 0.75, term_length_months: int = 24, loan_to_income_ratio: float = 3, ) -> tuple[FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray]: ...
[docs] def load_give_me_some_credit( *, as_frame: bool = False, return_X_y_costs: bool = False, interest_rate: float = 0.0479, fund_cost: float = 0.0294, max_credit_line: float = 25000, loss_given_default: float = 0.75, term_length_months: int = 24, loan_to_income_ratio: float = 3, ) -> ( Dataset[pd.DataFrame, pd.Series] | Dataset[FloatNDArray, FloatNDArray] | tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series, pd.Series, pd.Series] | tuple[FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray] ): """ Load the "Give Me Some Credit" Kaggle credit scoring competition dataset (binary classification). The goal is to predict whether a customer will default on a loan in the next two years. The target variable is whether the customer defaulted, 'yes' = 1 and 'no' = 0. Only customers with a positive monthly income and a debt ratio less than 1 are considered. For a full data description and additional information about the dataset, consult the :ref:`User Guide <give_me_some_credit>`. ================= ============== Classes 2 Defaulters 7616 Non-defaulters 105299 Samples 112915 Features 10 ================= ============== Parameters ---------- as_frame : bool, default=False If True, the output will be a pandas DataFrames or Series instead of numpy arrays. return_X_y_costs : bool, default=False If True, return (data, target, tp_cost, fp_cost, tn_cost, fn_cost) instead of a Dataset object. interest_rate : float, default=0.0479 Annual interest rate of the term deposit. fund_cost : float, default=0.0294 Annual cost of funds. max_credit_line : float, default=25000 The maximum amount a client can borrow. loss_given_default : float, default=0.75 The fraction of the loan amount which is lost if the client defaults. term_length_months : int, default=24 The length of the loan term in months. loan_to_income_ratio : float, default=3 The ratio of the loan amount to the client's income. Returns ------- dataset : :class:`~empulse.datasets.Dataset` or tuple of (data, target, tp_cost, fp_cost, tn_cost, fn_cost) Returns a Dataset object if `return_X_y_costs=False` (default), otherwise a tuple. Notes ----- Cost matrix .. list-table:: * - - Actual positive :math:`y_i = 1` - Actual negative :math:`y_i = 0` * - Predicted positive :math:`\\hat{y}_i = 1` - ``tp_cost`` :math:`= 0` - ``fp_cost`` :math:`= r_i + -\\bar{r} \\cdot \\pi_0 + \\bar{Cl} \\cdot L_{gd} \\cdot \\pi_1` * - Predicted negative :math:`\\hat{y}_i = 0` - ``fn_cost`` :math:`= Cl_i \\cdot L_{gd}` - ``tn_cost`` :math:`= 0` with - :math:`r_i` : loss in profit by rejecting what would have been a good loan - :math:`\\bar{r}` : average loss in profit by rejecting what would have been a good loan - :math:`\\pi_0` : percentage of defaulters - :math:`\\pi_1` : percentage of non-defaulters - :math:`Cl_i` : credit line of the client - :math:`\\bar{Cl}` : average credit line - :math:`L_{gd}` : the fraction of the loan amount which is lost if the client defaults References ---------- .. [1] A. Correa Bahnsen, D.Aouada, B, Ottersten, "Example-Dependent Cost-Sensitive Logistic Regression for Credit Scoring", in Proceedings of the International Conference on Machine Learning and Applications, 2014. Examples -------- .. code-block:: python from empulse.datasets import load_give_me_some_credit from sklearn.model_selection import train_test_split dataset = load_give_me_some_credit() X_train, X_test, y_train, y_test = train_test_split( dataset.data, dataset.target, random_state=42 ) """ module_path = dirname(__file__) raw_data = pd.read_csv(join(module_path, 'data', 'creditscoring1.csv.gz'), delimiter=',', compression='gzip') with open(join(module_path, 'descriptions', 'creditscoring1.rst'), encoding='utf-8') as f: description = f.read() # Exclude MonthlyIncome = nan or =0 or DebtRatio >1 raw_data = raw_data.dropna() raw_data = raw_data.loc[(raw_data['MonthlyIncome'] > 0)] raw_data = raw_data.loc[(raw_data['DebtRatio'] < 1)] target = raw_data['SeriousDlqin2yrs'].to_numpy().astype(np.int64) data = raw_data.drop(['SeriousDlqin2yrs', 'id'], axis=1) # Calculate cost_mat (see[1]) cost_mat_parameters = { 'int_r': interest_rate / 12, 'int_cf': fund_cost / 12, 'cl_max': max_credit_line, 'n_term': term_length_months, 'k': loan_to_income_ratio, 'lgd': loss_given_default, } pi_1 = target.mean() # cost_mat[FP,FN,TP,TN] cost_mat = _creditscoring_costmat( data['MonthlyIncome'].to_numpy(), data['DebtRatio'].to_numpy(), pi_1, cost_mat_parameters ) # unroll into separate costs fp_cost = cost_mat[:, 0] fn_cost = cost_mat[:, 1] # normalize feature names column_mapping = { 'RevolvingUtilizationOfUnsecuredLines': 'revolving_utilization', 'age': 'age', 'NumberOfTime30-59DaysPastDueNotWorse': 'n_times_late_30_59_days', 'DebtRatio': 'debt_ratio', 'MonthlyIncome': 'monthly_income', 'NumberOfOpenCreditLinesAndLoans': 'n_open_credit_lines', 'NumberOfTimes90DaysLate': 'n_times_late_over_90_days', 'NumberRealEstateLoansOrLines': 'n_real_estate_loans', 'NumberOfTime60-89DaysPastDueNotWorse': 'n_times_late_60_89_days', 'NumberOfDependents': 'n_dependents', } data = data.rename(columns=column_mapping) new_order = [ 'monthly_income', 'debt_ratio', 'revolving_utilization', 'age', 'n_dependents', 'n_open_credit_lines', 'n_real_estate_loans', 'n_times_late_30_59_days', 'n_times_late_60_89_days', 'n_times_late_over_90_days', ] # Reorder columns data = data.reindex(columns=new_order) data = data.astype({ 'monthly_income': np.float64, 'debt_ratio': np.float64, 'revolving_utilization': np.float64, 'age': np.uint8, 'n_dependents': np.uint8, 'n_open_credit_lines': np.uint8, 'n_real_estate_loans': np.uint8, 'n_times_late_30_59_days': np.uint8, 'n_times_late_60_89_days': np.uint8, 'n_times_late_over_90_days': np.uint8, }) if return_X_y_costs: if as_frame: return ( data, pd.Series(target, name='default'), 0.0, pd.Series(fp_cost, name='fp_cost'), 0.0, pd.Series(fn_cost, name='fn_cost'), ) else: return (data.to_numpy(), target, 0.0, fp_cost, 0.0, fn_cost) else: target_names = ['no default', 'default'] return Dataset( data=data.to_numpy() if not as_frame else data, target=target if not as_frame else pd.Series(target, name='default'), tp_cost=0.0, fp_cost=fp_cost if not as_frame else pd.Series(fp_cost, name='fp_cost'), tn_cost=0.0, fn_cost=fn_cost if not as_frame else pd.Series(fn_cost, name='fn_cost'), feature_names=data.columns.to_numpy() if not as_frame else data.columns, target_names=np.array(target_names) if not as_frame else pd.Series(target_names, name='target'), name='Give Me Some Credit', DESCR=description, )
@overload def load_credit_scoring_pakdd( *, as_frame: Literal[True], return_X_y_costs: Literal[False] = False, interest_rate: float = 0.0479, fund_cost: float = 0.0294, max_credit_line: float = 25000, loss_given_default: float = 0.75, term_length_months: int = 24, loan_to_income_ratio: float = 3, ) -> Dataset[pd.DataFrame, pd.Series]: ... @overload def load_credit_scoring_pakdd( *, as_frame: Literal[False] = False, return_X_y_costs: Literal[False] = False, interest_rate: float = 0.0479, fund_cost: float = 0.0294, max_credit_line: float = 25000, loss_given_default: float = 0.75, term_length_months: int = 24, loan_to_income_ratio: float = 3, ) -> Dataset[FloatNDArray, FloatNDArray]: ... @overload def load_credit_scoring_pakdd( *, as_frame: Literal[True], return_X_y_costs: Literal[True], interest_rate: float = 0.0479, fund_cost: float = 0.0294, max_credit_line: float = 25000, loss_given_default: float = 0.75, term_length_months: int = 24, loan_to_income_ratio: float = 3, ) -> tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series, pd.Series, pd.Series]: ... @overload def load_credit_scoring_pakdd( *, as_frame: Literal[False] = False, return_X_y_costs: Literal[True], interest_rate: float = 0.0479, fund_cost: float = 0.0294, max_credit_line: float = 25000, loss_given_default: float = 0.75, term_length_months: int = 24, loan_to_income_ratio: float = 3, ) -> tuple[FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray]: ...
[docs] def load_credit_scoring_pakdd( *, as_frame: bool = False, return_X_y_costs: bool = False, interest_rate: float = 0.63, fund_cost: float = 0.165, max_credit_line: float = 25000, loss_given_default: float = 0.75, term_length_months: int = 24, loan_to_income_ratio: float = 3, ) -> ( Dataset[pd.DataFrame, pd.Series] | Dataset[FloatNDArray, FloatNDArray] | tuple[pd.DataFrame, pd.Series, pd.Series, pd.Series, pd.Series, pd.Series] | tuple[FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray, FloatNDArray] ): """ Load the credit scoring PAKDD 2009 competition dataset (binary classification). The goal is to predict whether a customer will default on a loan in the next two years. The target variable is whether the customer defaulted, 'yes' = 1 and 'no' = 0. Only clients with a personal income between 100 and 10000 are considered. For a full data description and additional information about the dataset, consult the :ref:`User Guide <credit_scoring_pakdd>`. ================= ============== Classes 2 Defaulters 7743 Non-defaulters 31195 Samples 38938 Features 25 ================= ============== Parameters ---------- as_frame : bool, default=False If True, the output will be a pandas DataFrames or Series instead of numpy arrays. return_X_y_costs : bool, default=False If True, return (data, target, tp_cost, fp_cost, tn_cost, fn_cost) instead of a Dataset object. interest_rate : float, default=0.63 Annual interest rate of the term deposit. fund_cost : float, default=0.165 Annual cost of funds. max_credit_line : float, default=25000 The maximum amount a client can borrow. loss_given_default : float, default=0.75 The fraction of the loan amount which is lost if the client defaults. term_length_months : int, default=24 The length of the loan term in months. loan_to_income_ratio : float, default=3 The ratio of the loan amount to the client's income. Returns ------- dataset : :class:`~empulse.datasets.Dataset` or tuple of (data, target, tp_cost, fp_cost, tn_cost, fn_cost) Returns a Dataset object if `return_X_y_costs=False` (default), otherwise a tuple. Notes ----- Cost matrix .. list-table:: * - - Actual positive :math:`y_i = 1` - Actual negative :math:`y_i = 0` * - Predicted positive :math:`\\hat{y}_i = 1` - ``tp_cost`` :math:`= 0` - ``fp_cost`` :math:`= r_i + -\\bar{r} \\cdot \\pi_0 + \\bar{Cl} \\cdot L_{gd} \\cdot \\pi_1` * - Predicted negative :math:`\\hat{y}_i = 0` - ``fn_cost`` :math:`= Cl_i \\cdot L_{gd}` - ``tn_cost`` :math:`= 0` with - :math:`r_i` : loss in profit by rejecting what would have been a good loan - :math:`\\bar{r}` : average loss in profit by rejecting what would have been a good loan - :math:`\\pi_0` : percentage of defaulters - :math:`\\pi_1` : percentage of non-defaulters - :math:`Cl_i` : credit line of the client - :math:`\\bar{Cl}` : average credit line - :math:`L_{gd}` : the fraction of the loan amount which is lost if the client defaults References ---------- .. [1] A. Correa Bahnsen, D.Aouada, B, Ottersten, "Example-Dependent Cost-Sensitive Logistic Regression for Credit Scoring", in Proceedings of the International Conference on Machine Learning and Applications, 2014. Examples -------- .. code-block:: python from empulse.datasets import load_credit_scoring_pakdd from sklearn.model_selection import train_test_split dataset = load_credit_scoring_pakdd() X_train, X_test, y_train, y_test = train_test_split( dataset.data, dataset.target, random_state=42 ) """ module_path = dirname(__file__) raw_data = pd.read_csv(join(module_path, 'data', 'creditscoring2.csv.gz'), delimiter='\t', compression='gzip') with open(join(module_path, 'descriptions', 'creditscoring2.rst'), encoding='utf-8') as f: description = f.read() # Exclude TARGET_LABEL_BAD=1 == 'N' raw_data = raw_data.loc[raw_data['TARGET_LABEL_BAD=1'] != 'N'] # Exclude 100<PERSONAL_NET_INCOME<10000 raw_data = raw_data.loc[(raw_data['PERSONAL_NET_INCOME'].to_numpy().astype(np.float64) > 100)] raw_data = raw_data.loc[(raw_data['PERSONAL_NET_INCOME'].to_numpy().astype(np.float64) < 10000)] target = raw_data['TARGET_LABEL_BAD=1'].to_numpy().astype(np.int64) data = raw_data.drop(['TARGET_LABEL_BAD=1'], axis=1) # drop the last column data = data.iloc[:, :-1] continuous_columns = ['MATE_INCOME', 'PERSONAL_NET_INCOME'] integer_columns = [ 'ID_SHOP', 'AGE', 'AREA_CODE_RESIDENCIAL_PHONE', 'PAYMENT_DAY', 'SHOP_RANK', 'MONTHS_IN_RESIDENCE', 'MONTHS_IN_THE_JOB', 'PROFESSION_CODE', 'QUANT_ADDITIONAL_CARDS_IN_THE_APPLICATION', ] data[continuous_columns] = data[continuous_columns].astype('float64') data[integer_columns] = data[integer_columns].astype('int32') # Calculate cost_mat (see[1]) cost_mat_parameters = { 'int_r': interest_rate / 12, 'int_cf': fund_cost / 12, 'cl_max': max_credit_line * 0.33, 'n_term': term_length_months, 'k': loan_to_income_ratio, 'lgd': loss_given_default, } n_samples = data.shape[0] pi_1 = target.mean() monthly_income = data['PERSONAL_NET_INCOME'].to_numpy() * 0.33 cost_mat = _creditscoring_costmat(monthly_income, np.zeros(n_samples), pi_1, cost_mat_parameters) # unroll into separate costs fp_cost = cost_mat[:, 0] fn_cost = cost_mat[:, 1] # convert all columns which start with flag to integers flag_columns = [col for col in data.columns if col.startswith('FLAG')] data[flag_columns] = (data[flag_columns] == 'Y').astype(np.int8) # normalize feature names data.columns = data.columns.str.lower().str.replace('#', '').str.replace('quant', 'n') data.columns = data.columns.str.replace('_in_the_application', '').str.replace('residencial', 'residential') data['sex'] = data['sex'].map({'M': 1, 'F': 0}) # fill in single missing value as male data['sex'] = data.sex.fillna('1') column_mapping = { 'flag_residence_town_eq_working_town': 'lives_in_work_town', 'flag_residence_state_eq_working_state': 'lives_in_work_state', 'flag_residential_address_eq_postal_address': 'has_same_postal_address', 'flag_residential_phone': 'has_residential_phone', 'sex': 'is_male', 'flag_mothers_name': 'filled_in_mothers_name', 'flag_fathers_name': 'filled_in_fathers_name', 'mate_income': 'partner_income', 'flag_other_card': 'has_other_card', 'flag_mobile_phone': 'has_mobile_phone', 'flag_contact_phone': 'has_contact_phone', 'cod_application_booth': 'application_booth_code', 'flag_card_insurance_option': 'has_card_insurance', 'id_shop': 'shop_code', } data = data.rename(columns=column_mapping) # remap values of matiral_status to more readable values data['marital_status'] = data['marital_status'].map({ 'S': 'single', 'M': 'married', 'D': 'divorced', 'W': 'widowed', 'O': 'other', }) # remap values of residence_type to more readable values data['residence_type'] = data['residence_type'].map({ 'P': 'owned', 'A': 'rented', 'C': 'parents', 'O': 'other', }) # Desired column order new_order = [ 'age', 'personal_net_income', 'partner_income', 'months_in_residence', 'months_in_the_job', 'payment_day', 'n_banking_accounts', 'n_additional_cards', 'is_male', 'has_residential_phone', 'has_mobile_phone', 'has_contact_phone', 'has_same_postal_address', 'has_other_card', 'has_card_insurance', 'lives_in_work_town', 'lives_in_work_state', 'filled_in_mothers_name', 'filled_in_fathers_name', 'shop_rank', 'marital_status', 'residence_type', 'area_code_residential_phone', 'shop_code', 'application_booth_code', 'profession_code', ] # Reorder columns data = data.reindex(columns=new_order) data = data.astype({ 'age': np.uint8, 'personal_net_income': np.float32, 'partner_income': np.float32, 'months_in_residence': np.uint8, 'months_in_the_job': np.uint8, 'payment_day': np.uint8, 'n_banking_accounts': np.uint8, 'n_additional_cards': np.uint8, 'is_male': np.uint8, 'has_residential_phone': np.uint8, 'has_mobile_phone': np.uint8, 'has_contact_phone': np.uint8, 'has_same_postal_address': np.uint8, 'has_other_card': np.uint8, 'has_card_insurance': np.uint8, 'lives_in_work_town': np.uint8, 'lives_in_work_state': np.uint8, 'filled_in_mothers_name': np.uint8, 'filled_in_fathers_name': np.uint8, 'shop_rank': 'category', 'marital_status': 'category', 'residence_type': 'category', 'area_code_residential_phone': 'category', 'shop_code': 'category', 'application_booth_code': 'category', 'profession_code': 'category', }) # remove flag_card_insurance_option data = data.drop(['has_card_insurance'], axis=1) if return_X_y_costs: if as_frame: return ( data, pd.Series(target, name='default'), 0.0, pd.Series(fp_cost, name='fp_cost'), 0.0, pd.Series(fn_cost, name='fn_cost'), ) else: return (data.to_numpy(), target, 0.0, fp_cost, 0.0, fn_cost) else: target_names = ['no default', 'default'] return Dataset( data=data.to_numpy() if not as_frame else data, target=target if not as_frame else pd.Series(target, name='default'), tp_cost=0.0, fp_cost=fp_cost if not as_frame else pd.Series(fp_cost, name='fp_cost'), tn_cost=0.0, fn_cost=fn_cost if not as_frame else pd.Series(fn_cost, name='fn_cost'), feature_names=data.columns.to_numpy() if not as_frame else data.columns, target_names=np.array(target_names) if not as_frame else pd.Series(target_names, name='target'), name='Credit Scoring PAKDD 2009', DESCR=description, )
def _creditscoring_costmat( income: FloatNDArray, debt: FloatNDArray, pi_1: float, cost_mat_parameters: dict[str, float] ) -> FloatNDArray: """Private function to calculate the cost matrix of credit scoring models. Parameters ---------- income : array of shape = [n_samples] Monthly income of each example debt : array of shape = [n_samples] Debt ratio each example pi_1 : float Percentage of positives in the training set References ---------- .. [1] A. Correa Bahnsen, D.Aouada, B, Ottersten, "Example-Dependent Cost-Sensitive Logistic Regression for Credit Scoring", in Proceedings of the International Conference on Machine Learning and Applications, , 2014. Returns ------- cost_mat : array-like of shape = [n_samples, 4] Cost matrix of the classification problem Where the columns represent the costs of: false positives, false negatives, true positives and true negatives, for each example. """ def calculate_a(cl_i: float, int_: float, n_term: float) -> float: return float(cl_i * ((int_ * (1 + int_) ** n_term) / ((1 + int_) ** n_term - 1))) def calculate_pv(a: float, int_: float, n_term: float) -> float: return float(a / int_ * (1 - 1 / (1 + int_) ** n_term)) # Calculate credit line Cl def calculate_cl(k: float, inc_i: float, cl_max: float, debt_i: float, int_r: float, n_term: float) -> float: cl_k = k * inc_i a = calculate_a(cl_k, int_r, n_term) cl_debt = calculate_pv(inc_i * min(a / inc_i, 1 - debt_i), int_r, n_term) return min(cl_k, cl_max, cl_debt) # calculate costs def calculate_cost_fn(cl_i: float, lgd: float) -> float: return cl_i * lgd def calculate_cost_fp( cl_i: float, int_r: float, n_term: float, int_cf: float, pi_1: float, lgd: float, cl_avg: float ) -> float: a = calculate_a(cl_i, int_r, n_term) pv = calculate_pv(a, int_cf, n_term) r = pv - cl_i r_avg = calculate_pv(calculate_a(cl_avg, int_r, n_term), int_cf, n_term) - cl_avg cost_fp = r - (1 - pi_1) * r_avg + pi_1 * calculate_cost_fn(cl_avg, lgd) return max(0, cost_fp) v_calculate_cost_fp = np.vectorize(calculate_cost_fp) v_calculate_cost_fn = np.vectorize(calculate_cost_fn) v_calculate_cl = np.vectorize(calculate_cl) # Parameters k = cost_mat_parameters['k'] int_r = cost_mat_parameters['int_r'] n_term = cost_mat_parameters['n_term'] int_cf = cost_mat_parameters['int_cf'] lgd = cost_mat_parameters['lgd'] cl_max = cost_mat_parameters['cl_max'] cl = v_calculate_cl(k, income, cl_max, debt, int_r, n_term) cl_avg = cl.mean() n_samples = income.shape[0] cost_mat = np.zeros((n_samples, 4)) # cost_mat[FP,FN,TP,TN] cost_mat[:, 0] = v_calculate_cost_fp(cl, int_r, n_term, int_cf, pi_1, lgd, cl_avg) cost_mat[:, 1] = v_calculate_cost_fn(cl, lgd) cost_mat[:, 2] = 0.0 cost_mat[:, 3] = 0.0 return cost_mat