Predicting Bank Customer Churn¶
In this notebook, the task is to predict whether a customer will close their bank account (indicated by the target variable Exited) based on financial and demographic information. The dataset is simulated from an original real-world dataset such that the features have similar (but not identical) distributions. After processing the predictor variables, various ML models are fit to the data, from logistic regression to XGBoost. The final XGBoost model, the best of those tried here, achieved a ROC AUC score of $0.92868$ on the Kaggle competition test set.¶
In [326]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
import xgboost
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
Load in and inspect the data¶
In [214]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_train['dataset'] = 'train'
df_test['dataset'] = 'test'
df_all = pd.concat([df_train, df_test], ignore_index=True)
In [215]:
display(df_all.head())
display(df_all.describe())
| | id | CustomerId | Surname | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited | dataset |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 15778838.0 | Mazzanti | 582.0 | Germany | Female | 39.0 | 8.0 | 125534.51 | 1.0 | 1.0 | 0.0 | 131281.28 | 0.0 | train |
| 1 | 1 | 15788151.0 | Burtch | 744.0 | France | Female | 29.0 | 7.0 | 0.00 | 2.0 | 1.0 | 1.0 | 93829.17 | 0.0 | train |
| 2 | 2 | 15588928.0 | Frolov | 752.0 | Spain | Male | 32.0 | 3.0 | 0.00 | 2.0 | 0.0 | 0.0 | 125979.36 | 0.0 | train |
| 3 | 3 | 15694890.0 | Lu | 749.0 | Germany | Female | 54.0 | 7.0 | 82916.43 | 2.0 | 1.0 | 1.0 | 131736.23 | 0.0 | train |
| 4 | 4 | 15742841.0 | Macleod | 770.0 | Spain | Male | 43.0 | 2.0 | 0.00 | 2.0 | 1.0 | 0.0 | 157527.60 | 0.0 | train |
| | id | CustomerId | CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedSalary | Exited |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 25000.000000 | 2.500000e+04 | 25000.000000 | 25000.000000 | 25000.000000 | 25000.000000 | 25000.00000 | 25000.00000 | 25000.000000 | 25000.000000 | 15000.000000 |
| mean | 12499.500000 | 1.569232e+07 | 658.432440 | 37.818120 | 5.031560 | 42796.704554 | 1.58992 | 0.78268 | 0.494720 | 117618.123325 | 0.205933 |
| std | 7217.022701 | 1.562388e+05 | 72.453969 | 8.207853 | 2.791716 | 59784.712943 | 0.52802 | 0.41243 | 0.499982 | 45690.982334 | 0.404395 |
| min | 0.000000 | 1.563152e+05 | 431.000000 | 18.000000 | 0.000000 | 0.000000 | 1.00000 | 0.00000 | 0.000000 | 11.580000 | 0.000000 |
| 25% | 6249.750000 | 1.563639e+07 | 603.000000 | 32.000000 | 3.000000 | 0.000000 | 1.00000 | 1.00000 | 0.000000 | 82709.800000 | 0.000000 |
| 50% | 12499.500000 | 1.569090e+07 | 661.000000 | 37.000000 | 5.000000 | 0.000000 | 2.00000 | 1.00000 | 0.000000 | 122960.980000 | 0.000000 |
| 75% | 18749.250000 | 1.575881e+07 | 709.000000 | 42.000000 | 7.000000 | 110153.270000 | 2.00000 | 1.00000 | 1.000000 | 156774.940000 | 0.000000 |
| max | 24999.000000 | 1.581569e+07 | 850.000000 | 74.000000 | 15.000000 | 208614.240000 | 4.00000 | 1.00000 | 1.000000 | 626144.090000 | 1.000000 |
EDA¶
Let's take a closer look at the data. We will use a bar plot for categorical features and a histogram for numerical features.¶
The aim of this is two-fold:¶
1. Look at the values of the different features to ensure there is nothing that may distort the resulting models (e.g., anomalous or seemingly incorrect data); a quick programmatic check follows below.
2. Verify that the train and test sets have similar distributions, such that the former can be used to predict the latter.¶
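As a first pass at aim 1, here is a minimal sanity-check sketch (the exact checks are a judgment call and not part of the original pipeline): missing values or duplicate ids would both warrant investigation before modeling.¶
In [ ]:
# Sanity checks: missing values and duplicate ids.
# Exited is expected to be NaN for the test rows, since they are unlabeled.
print(df_all.isna().sum())
print('duplicate ids:', df_all['id'].duplicated().sum())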
In [264]:
def bar_plots(columns):
    # Compare the train and test proportions of each categorical column
    for column in columns:
        plt.figure(figsize=(10, 6))
        combined_density = pd.concat([
            df_train[column].value_counts(normalize=True).rename('Proportion').reset_index().assign(Dataset='Train'),
            df_test[column].value_counts(normalize=True).rename('Proportion').reset_index().assign(Dataset='Test')
        ]).rename(columns={'index': column})  # older pandas names the reset column 'index'
        sns.barplot(x=column, y='Proportion', hue='Dataset', data=combined_density, alpha=0.6)
        plt.title(fr'Distribution of ${column}$')
        plt.xlabel(column)
        plt.ylabel('Proportion')
        plt.legend(title='Dataset')
        plt.show()

bar_plots(['Geography', 'Gender', 'HasCrCard'])
In [266]:
def hist_plots(df_all, columns):
    for column in columns:
        plt.figure(figsize=(10, 6))
        sns.histplot(data=df_all, x=column, hue='dataset', element='step', stat='density', common_norm=False, alpha=0.4)
        plt.title(rf'Distribution of ${column}$')
        plt.xlabel(column)
        plt.ylabel('Density')
        plt.show()

hist_plots(df_all, ['EstimatedSalary', 'Balance', 'CreditScore'])
These plots assure us that the data are similarly distributed between the train and test sets, giving a green light for subsequent modeling. Also, because of the stark divide between customers with low and high balances at around Balance = 25,000, we create a new binary variable, LowBalance, with this cut-off.
In [218]:
df_all['LowBalance'] = (df_all['Balance'] < 25000).astype(int)
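A quick check on this cut-off (a small illustrative sketch): the divide is driven largely by accounts whose balance is exactly zero, consistent with the median Balance of 0 in the summary table above.¶
In [ ]:
# Share of zero-balance accounts and the resulting LowBalance split
print('fraction with zero balance:', (df_all['Balance'] == 0).mean())
print(df_all['LowBalance'].value_counts(normalize=True))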
Preprocessing¶
Now we will process the data to get it into a format that can be handled by ML models. Namely, categorical columns will be one-hot encoded and numerical columns will be standardized.¶
In [220]:
cat_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
num_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
In [221]:
df_all = pd.get_dummies(df_all, columns=cat_cols)
In [222]:
scaler = StandardScaler()
df_all[num_cols] = scaler.fit_transform(df_all[num_cols])
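One hedged aside: the scaler above is fit on the train and test rows together. Since the two sets are similarly distributed this should be harmless, but a variant that fits on the training rows only, sketched below on a fresh copy (so it does not re-scale the already-standardized columns), would rule out even mild test-set leakage.¶
In [ ]:
# Alternative scaling sketch: fit the scaler on training rows only
df_alt = pd.concat([df_train, df_test], ignore_index=True)
train_mask = df_alt['dataset'] == 'train'
scaler_alt = StandardScaler().fit(df_alt.loc[train_mask, num_cols])
df_alt[num_cols] = scaler_alt.transform(df_alt[num_cols])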
In [288]:
X_train = df_all[df_all['dataset'] == 'train'].drop(columns=['dataset', 'Exited', 'id', 'CustomerId', 'Surname'])
y_train = df_all[df_all['dataset'] == 'train']['Exited']
X_test_ids = df_all[df_all['dataset'] == 'test'].id
X_test = df_all[df_all['dataset'] == 'test'].drop(columns=['dataset', 'id', 'Exited', 'CustomerId', 'Surname'])
X_train_full = X_train.copy()
y_train_full = y_train.copy()
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.2, random_state=0, stratify=y_train)
The below heatmap shows the pairwise correlations between the variables in the dataset. We are most interested in seeing how correlated the predictor variables are to Exited, as this indicates that they have predictive potential. Fortunately, as seen in the bottom row, such is the case for many predictors, with Age being the most correlated.¶
In [224]:
sns.heatmap(pd.concat([X_train, y_train], axis=1).corr().abs(), cmap='gray')
plt.show()
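To put numbers behind the visual read of the bottom row, a short companion sketch (not in the original pipeline) sorts the absolute correlations with Exited directly:¶
In [ ]:
# Absolute correlation of each predictor with the target, sorted
corr_with_target = pd.concat([X_train, y_train], axis=1).corr()['Exited'].drop('Exited').abs()
print(corr_with_target.sort_values(ascending=False))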
Models¶
Logistic Regression¶
In [225]:
Cs = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
logistic = LogisticRegressionCV(
    Cs=Cs, cv=4, penalty='l1', solver='liblinear', scoring='roc_auc'
).fit(X_train_full, y_train_full)
In [226]:
logistic.C_
Out[226]:
array([0.1])
In [227]:
cv_scores = logistic.scores_[1]  # scores_ is keyed by class label; 1 is the positive class
# Calculate mean and standard deviation of scores for each C
mean_cv_scores = np.mean(cv_scores, axis=0)
std_cv_scores = np.std(cv_scores, axis=0)
# Plotting
plt.figure(figsize=(10, 6))
plt.errorbar(np.log10(Cs), mean_cv_scores, yerr=std_cv_scores, label='Validation Score (AUC)', fmt='-o')
plt.axvline(np.log10(logistic.C_[0]), linestyle='--', color='black', label=rf'Cross-validated $C={logistic.C_[0]}$')
plt.xlabel('Log10 of C')
plt.ylabel('ROC AUC Score')
plt.title('Logistic Regression with L1 Penalty')
plt.legend()
plt.grid(True)
plt.show()
In [228]:
# Function to save predictions for submission in competition
def get_submission(model, model_name=''):
    pred_probas = model.predict_proba(X_test)[:, 1]
    preds_df = pd.DataFrame({
        'id': X_test_ids,
        'Exited': pred_probas
    }).set_index('id')
    if model_name:
        model_name = model_name + '_'
    preds_df.to_csv(f'predictions/{model_name}predictions.csv')
In [229]:
get_submission(logistic, 'logistic')
kNN¶
In [230]:
ks = list(range(60, 120, 5))
cv = 5
training_scores = []   # these hold ROC AUC scores, so higher is better
validation_scores = []
validation_std = []
for k in ks:
    knn = KNeighborsClassifier(k)
    knn_cv = cross_validate(knn, X_train_full, y_train_full, cv=cv, scoring='roc_auc', return_train_score=True)
    training_scores.append(np.mean(knn_cv['train_score']))
    validation_scores.append(np.mean(knn_cv['test_score']))
    validation_std.append(np.std(knn_cv['test_score']))
best_k = ks[np.argmax(validation_scores)]

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(ks, training_scores, label='training score')
ax.plot(ks, validation_scores, label='validation score')
ax.set_xlabel(r'$k$', fontsize=10)
ax.set_ylabel('ROC Area Under Curve', fontsize=10)
ax.axvline(best_k, color='black', linestyle='--', label=rf'best $k={best_k}$')
ax.set_title('kNN: Train vs. Validation', fontsize=12)
ax.grid(linestyle=':', alpha=0.4)
ax.legend(loc='best')
plt.tight_layout()
In [231]:
best_knn = KNeighborsClassifier(best_k).fit(X_train_full, y_train_full)
In [232]:
get_submission(best_knn, 'knn')
Decision Tree¶
In [290]:
single_tree = GridSearchCV(estimator=DecisionTreeClassifier(random_state=109),
                           param_grid={'min_samples_split': range(3, 15), 'max_depth': range(3, 10)},
                           cv=5,
                           scoring='roc_auc',
                           n_jobs=-1)
single_tree.fit(X_train_full, y_train_full);
In [291]:
best_max_depth = single_tree.best_params_['max_depth']
best_min_samples = single_tree.best_params_['min_samples_split']
best_tree = single_tree.best_estimator_
single_tree.best_params_
Bagging¶
In [236]:
depths = range(3, 10)
min_samples_splits = range(3, 7)
train_scores = []
oob_scores = []
best_params = {}
best_oob_score = 0
for depth in depths:
    for min_samples in min_samples_splits:
        bagger = BaggingClassifier(estimator=DecisionTreeClassifier(min_samples_split=min_samples, max_depth=depth), oob_score=True)
        bagger.fit(X_train_full, y_train_full)
        train_scores.append(bagger.score(X_train_full, y_train_full))
        oob_scores.append(bagger.oob_score_)
        if bagger.oob_score_ > best_oob_score:
            best_oob_score = bagger.oob_score_
            best_params['depth'] = depth
            best_params['min_samples_split'] = min_samples
In [237]:
best_depth = best_params['depth']
best_min_samples = best_params['min_samples_split']
print(f"best_depth: {best_depth}\nbest_min_samples: {best_min_samples}")
best_depth: 6
best_min_samples: 5
Rather than using cross-validation, we can use the out-of-bag (OOB) score: because each tree is fit on a bootstrap sample of the training data, it can be evaluated on the data points it never saw during training.¶
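To make the mechanics concrete, here is a minimal illustrative sketch (not part of the pipeline) that reconstructs OOB predictions by hand, using the fitted ensemble's estimators_samples_ attribute to recover each tree's bootstrap rows:¶
In [ ]:
# Manual OOB scoring sketch: each tree votes only on rows outside its bootstrap sample
demo_bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=best_depth, min_samples_split=best_min_samples),
    n_estimators=50, oob_score=True, random_state=0
).fit(X_train_full, y_train_full)
X_arr = X_train_full.to_numpy(dtype=float)
n = len(X_arr)
proba_sums = np.zeros(n)   # summed P(Exited=1) over trees for which a row was out-of-bag
oob_counts = np.zeros(n)   # number of such trees per row
for tree, sample_ix in zip(demo_bag.estimators_, demo_bag.estimators_samples_):
    oob_mask = np.ones(n, dtype=bool)
    oob_mask[sample_ix] = False  # rows drawn into the bootstrap sample are in-bag
    proba_sums[oob_mask] += tree.predict_proba(X_arr[oob_mask])[:, 1]
    oob_counts[oob_mask] += 1
oob_proba = proba_sums / np.maximum(oob_counts, 1)
# Majority-vote OOB accuracy should roughly match the built-in demo_bag.oob_score_
print(((oob_proba > 0.5) == y_train_full.to_numpy()).mean(), demo_bag.oob_score_)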
In [238]:
bag = BaggingClassifier(estimator=DecisionTreeClassifier(min_samples_split=best_min_samples, max_depth=best_depth), oob_score=True)
max_estimators = 50
train_scores = np.zeros(max_estimators)
oob_scores = np.zeros_like(train_scores)
for i, n in enumerate(range(1, max_estimators + 1)):
    bag.n_estimators = n
    bag.fit(X_train, y_train)
    train_scores[i] = bag.score(X_train, y_train)
    oob_scores[i] = bag.oob_score_
In [296]:
# Visualize the plateau as n_estimators increases (and select the best)
plt.plot(range(1, max_estimators+1),
train_scores,
label = 'Train Scores')
plt.plot(range(1, max_estimators+1),
oob_scores,
label='OOB Scores')
best_idx = oob_scores.argmax()
best_n_estimators = best_idx + 1
plt.axvline(best_n_estimators, color='black', linestyle='--', label = rf'best n_estimators $= {best_n_estimators}$')
plt.title('Bagging Classifier')
plt.xlabel('Number of Estimators')
plt.ylabel('Accuracy')
plt.ylim(.75,1)
plt.legend();
In [240]:
best_bagger = BaggingClassifier(estimator=DecisionTreeClassifier(min_samples_split=best_min_samples, max_depth=best_depth),
                                n_estimators=best_n_estimators
                                ).fit(X_train_full, y_train_full)
In [241]:
get_submission(best_bagger, 'bagging')
Random Forest¶
In [242]:
rf_cv = GridSearchCV(estimator=RandomForestClassifier(random_state=109),
                     param_grid={'min_samples_split': [6, 7, 8, 9], 'max_features': [3, 4, 5]},
                     cv=5)
rf_cv.fit(X_train_full, y_train_full);
In [243]:
best_min_samples = rf_cv.best_params_['min_samples_split']
best_max_features = rf_cv.best_params_['max_features']
print(f"best_min_samples: {best_min_samples}\nbest_max_features: {best_max_features}")
best_min_samples: 9
best_max_features: 3
In [244]:
best_rf = RandomForestClassifier(min_samples_split=best_min_samples, max_features=best_max_features,
                                 random_state=109).fit(X_train_full, y_train_full)
We can also use the built-in feature_importances_ attribute to rank the features based on how much they decreased the impurity of the decision trees that make up the random forest. This allows us to take a look under the hood and build some intuition about which customer attributes and behaviors serve as the best predictors of churn.¶
In [305]:
importances = best_rf.feature_importances_
fig, ax = plt.subplots(figsize=(12, 6))
order = np.argsort(importances)
ax.barh(range(len(importances)), importances[order], tick_label=X_train.columns[order])
ax.set_title("Random Forest: Variable Importance")
plt.show()
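As a complementary check (a hedged sketch, not in the original pipeline): impurity-based importances can favor high-cardinality numeric features, so permutation importance, which measures the drop in AUC when a single feature's values are shuffled, offers a second opinion. Note that best_rf was fit on the full training set, so scoring on X_val here is an in-sample check rather than a true hold-out.¶
In [ ]:
from sklearn.inspection import permutation_importance

# Mean AUC drop over 5 shuffles of each feature
perm = permutation_importance(best_rf, X_val, y_val, scoring='roc_auc', n_repeats=5, random_state=109)
for ix in np.argsort(perm.importances_mean)[::-1]:
    print(f'{X_val.columns[ix]}: {perm.importances_mean[ix]:.4f}')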
In [245]:
get_submission(best_rf, 'rf')
AdaBoost¶
In [306]:
# Find the best learning rate
lrs = [.025, .05, .1, .2]
depth = 2
n_estimators = 1000
best_val_scores = {}
val_auc_scores = {}
for lr in lrs:
    ada = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=depth),
                             n_estimators=n_estimators, learning_rate=lr, random_state=109).fit(X_train, y_train)
    # Track the validation AUC after each boosting stage
    val_auc_scores[lr] = []
    for stage_pred_probas in ada.staged_predict_proba(X_val):
        auc_score = roc_auc_score(y_val, stage_pred_probas[:, 1])
        val_auc_scores[lr].append(auc_score)
    # Store the best validation score and its number of estimators for this learning rate
    best_stage_ix = np.argmax(val_auc_scores[lr])
    best_val_scores[lr] = (best_stage_ix + 1, val_auc_scores[lr][best_stage_ix])
In [307]:
best_lr = max(best_val_scores, key=lambda k: best_val_scores[k][1])
best_n_estimators, best_score = best_val_scores[best_lr]
print(f"best_lr: {best_lr}\nbest_n_estimators: {best_n_estimators}\nbest_score: {best_score:.2}")
best_lr: 0.05
best_n_estimators: 197
best_score: 0.92
In [310]:
plt.plot(range(1, n_estimators + 1),
         val_auc_scores[best_lr],
         color='orange',
         label='Validation Scores')
best_n_estimators = np.argmax(val_auc_scores[best_lr]) + 1
plt.axvline(best_n_estimators, color='black', linestyle='--', label=rf'best n_estimators $= {best_n_estimators}$')
plt.title('AdaBoost Performance vs. Number of Trees')
plt.xlabel('Number of Estimators')
plt.ylabel('ROC Area Under Curve')
plt.legend();
In [249]:
best_ada = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=depth),
                              learning_rate=best_lr, n_estimators=best_n_estimators, random_state=109
                              ).fit(X_train_full, y_train_full)
In [250]:
get_submission(best_ada, 'ada')
XGBoost¶
First we fit a preliminary model with intuitively selected hyperparameters to get a sense of XGBoost's potential.¶
In [251]:
# Preliminary model
xgb = XGBClassifier(objective='binary:logistic',
                    eval_metric=roc_auc_score,
                    n_estimators=1000,
                    max_depth=3,
                    learning_rate=0.1,
                    random_state=0)
xgb.fit(X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False);
In [252]:
# Extract the predictions with the best ROC AUC score
roc_auc_scores = xgb.evals_result()['validation_0']['roc_auc_score']
best_iteration = np.argmax(roc_auc_scores)
print(f"best_iteration: {best_iteration}")
print(f"ROC AUC Score: {roc_auc_score(y_val, xgb.predict_proba(X_val, iteration_range=(0,best_iteration+1))[:,1]):.4f}")
best_iteration: 109
ROC AUC Score: 0.9230
In [253]:
# Train the preliminary model on the full training set
xgb = XGBClassifier(objective='binary:logistic',
                    eval_metric=roc_auc_score,
                    n_estimators=best_iteration + 1,
                    max_depth=3,
                    learning_rate=0.1,
                    random_state=0)
xgb.fit(X_train_full, y_train_full, verbose=False);
In [254]:
get_submission(xgb, 'xgb_prelim')
Now, we take a more structured approach by finding the best max_depth and learning_rate hyperparameters with 5-fold cross-validation.¶
We also use early stopping since, unlike bagging or random forests, boosting models will overfit if they are trained for too many iterations (as seen in the AdaBoost plot above). With save_best=True, early stopping automatically keeps the boosting iteration that scored best on the validation set. We use the ROC AUC score, as this is the metric used for the Kaggle competition.¶
In [255]:
# Grid search cross-validation with early stopping
param_grid = {
    'max_depth': [1, 2, 3, 4],
    'learning_rate': [0.05, 0.1, 0.15, 0.2]
}
es = xgboost.callback.EarlyStopping(
    rounds=10,
    min_delta=1e-3,
    save_best=True,
    maximize=True,
    data_name="validation_0",
)
xgb_cv = XGBClassifier(objective='binary:logistic',
                       eval_metric=roc_auc_score,
                       n_estimators=1000,
                       random_state=0,
                       callbacks=[es])
grid_search = GridSearchCV(estimator=xgb_cv, param_grid=param_grid, cv=5, scoring='roc_auc', return_train_score=True)
grid_search.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False);
In [313]:
results = pd.DataFrame(grid_search.cv_results_)
# Extract mean training and validation scores for each parameter combination
mean_train_scores = results['mean_train_score']
mean_test_scores = results['mean_test_score']
params = results['param_max_depth'].astype(str) + ', ' + results['param_learning_rate'].astype(str)
best_ix = results['rank_test_score'].idxmin()  # index of the best-ranked combination
# Plot the results
plt.figure(figsize=(14, 7))
plt.plot(params, mean_train_scores, label='Mean Train Score', marker='o')
plt.plot(params, mean_test_scores, label='Mean Validation Score', marker='o')
# Add titles and labels
plt.title('XGBoost: Cross Validation Results')
plt.xlabel(r'Parameter Combination $(max\_depth, learning\_rate)$')
plt.ylabel('ROC AUC Score')
plt.axvline(params[best_ix], linestyle='--', color='black', label=f'Best Parameters: $({params[best_ix]})$')
plt.xticks(rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.show()
In [257]:
print("Best iteration:", grid_search.best_estimator_.best_iteration)
print("Best Parameters:", grid_search.best_params_)
print(f"Best Validation ROC AUC Score: {grid_search.best_score_:.4f}")
Best iteration: 50
Best Parameters: {'learning_rate': 0.15, 'max_depth': 3}
Best Validation ROC AUC Score: 0.9410
In [334]:
# Compute the ROC curve and ROC area
y_pred_proba = grid_search.best_estimator_.predict_proba(X_val)[:, 1]
fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
roc_auc = roc_auc_score(y_val, y_pred_proba)
# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('XGBoost: ROC Curve', fontsize=15)
plt.legend(loc='lower right')
plt.show()
Now that we've identified the best hyperparameters, we train our final XGBoost model on the full training set (i.e., the training and validation splits combined).¶
In [258]:
# Fit the final XGB model with the tuned settings
best_iteration = grid_search.best_estimator_.best_iteration + 1
best_lr = grid_search.best_params_['learning_rate']
best_depth = grid_search.best_params_['max_depth']
best_xgb = XGBClassifier(objective='binary:logistic',
                         n_estimators=best_iteration,
                         max_depth=best_depth,
                         learning_rate=best_lr,
                         random_state=0)
best_xgb.fit(X_train_full, y_train_full, verbose=False);
In [259]:
get_submission(best_xgb, 'xgb')