[This article was first published on T. Moudiki's Webpage - R, and kindly contributed to R-bloggers]. (You can report an issue about the content on this page here)
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
            Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
This week’s post is about mlsauce (again), and LSBoost in particular. No new working paper (still working on it), but:
- An updated R version, working at least on Linux and macOS (Windows users: if it does not work on your machine, try the Windows Subsystem for Linux, WSL)
- A new updated documentation page
- My first StackOverflow question ever (still unanswered)
The examples below probably include some kind of leakage (great if you can spot it), but take them as an illustration.
0 – import packages
Importing mlsauce from GitHub remains the preferred way to install it.
#!pip install numpy matplotlib scikit-learn !pip install git+https://github.com/Techtonique/mlsauce.git --verbose # Importing necessary libraries import mlsauce as ms import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import load_breast_cancer from sklearn.preprocessing import StandardScaler from sklearn.decomposition import KernelPCA # Non-linear dimensionality reduction through the use of kernels from sklearn.model_selection import cross_val_score, train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score
1 – Data preprocessing
# Load breast cancer dataset
data = load_breast_cancer()
X = data.data  # feature matrix
y = data.target  # class labels
print(X.shape)
print(y.shape)
# Output:
# (569, 30)
# (569,)
1 – 1 Kernel PCA features
# Standardize the features
# NOTE: the scaler is fitted on the full matrix X, i.e. before the train/test
# split below, so the held-out rows contribute to the scaling statistics
# (one source of the leakage mentioned in the introduction).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Perform Kernel PCA to extract 2 'good' features
# (easier to visualize)
# NOTE: no `kernel` argument is passed, so scikit-learn's default kernel is
# used (documented as 'linear' -- confirm for your installed version).
# Like the scaler, the projection is fitted on all rows, test rows included.
kpca = KernelPCA(n_components=2)
X_kpca = kpca.fit_transform(X_scaled)
# Splitting the dataset into training and testing sets
# (20% of the rows are held out; random_state=32 fixes the shuffle)
X_train_kpca, X_test_kpca, y_train, y_test = train_test_split(X_kpca, y, test_size=0.2,
                                                    random_state=32)
# Plotting the two kernel principal components of the test rows only,
# coloured by their true label
plt.figure(figsize=(8, 6))
plt.scatter(X_test_kpca[:, 0], X_test_kpca[:, 1], c=y_test, cmap='viridis')
plt.xlabel('Kernel Principal Component 1')
plt.ylabel('Kernel Principal Component 2')
plt.title('Kernel PCA of Breast Cancer Dataset')
plt.colorbar(label='Malignant (0) / Benign (1)')
plt.show()
1 – 2 ‘Important’ features
# Training a Random Forest classifier
# NOTE: the forest is fitted on every row of X and y, before the train/test
# split below, so the feature ranking has already seen the held-out labels
# (another source of the leakage mentioned in the introduction).
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X, y)
# Feature importances
importances = rf_classifier.feature_importances_
print(importances)
# Column indices sorted from most to least important (argsort is ascending,
# hence the reversal)
indices = np.argsort(importances)[::-1]
print(indices)
# Select top 2 features
top_two_indices = indices[:2]
print(data.feature_names[top_two_indices])
X_rf = X[:,top_two_indices]
# Splitting the dataset into training and testing sets
# y_train / y_test are re-assigned here; test_size and random_state are the
# same as in the Kernel PCA split above.
X_train_rf, X_test_rf, y_train, y_test = train_test_split(X_rf, y, test_size=0.2,
                                                    random_state=32)
# Plotting the two most 'important' raw features of the test rows only,
# coloured by their true label
plt.figure(figsize=(8, 6))
plt.scatter(X_test_rf[:, 0], X_test_rf[:, 1], c=y_test, cmap='viridis')
plt.xlabel("Most 'important' feature 1")
plt.ylabel("Most 'important' feature 2")
plt.title('Response for Breast Cancer Dataset')
plt.colorbar(label='Malignant (0) / Benign (1)')
plt.show()
[0.03484323 0.01522515 0.06799034 0.06046164 0.00795845 0.01159704
 0.06691736 0.10704566 0.00342279 0.00261508 0.0142637  0.00374427
 0.01008506 0.02955283 0.00472157 0.00561183 0.00581969 0.00375975
 0.00354597 0.00594233 0.08284828 0.01748526 0.0808497  0.13935694
 0.01223202 0.01986386 0.03733871 0.13222509 0.00817908 0.00449731]
[23 27  7 20 22  2  6  3 26  0 13 25 21  1 10 24  5 12 28  4 19 16 15 14
 29 17 11 18  8  9]
['worst area' 'worst concave points']
2 – Adjust LSBoostClassifier
!pip install GPopt
import GPopt as gp
import mlsauce as ms
from sklearn.model_selection import cross_val_score
# Collects the dict returned by each optimize_lsboost(...) call, in call order
opt_objects_lsboost = []
def lsboost_cv(X_train, y_train,
               n_estimators=100,
               learning_rate=0.1,
               n_hidden_features=5,
               reg_lambda=0.1,
               dropout=0,
               tolerance=1e-4,
               n_clusters=2,
               seed=123,
               solver="ridge"):
    """Return the negated mean 5-fold macro-F1 of an LSBoostClassifier.

    An ``ms.LSBoostClassifier`` is built from the given hyperparameters
    (``n_estimators``, ``n_hidden_features`` and ``n_clusters`` are truncated
    to ``int`` first, so float-valued candidates are accepted) and scored
    with ``cross_val_score`` on ``(X_train, y_train)``.

    The sign is flipped, presumably so that the value can be fed to a
    minimizer: a lower return value means a better macro-F1.
    """
    model_kwargs = {
        "n_estimators": int(n_estimators),
        "learning_rate": learning_rate,
        "n_hidden_features": int(n_hidden_features),
        "reg_lambda": reg_lambda,
        "dropout": dropout,
        "tolerance": tolerance,
        "n_clusters": int(n_clusters),
        "seed": seed,
        "solver": solver,
        "verbose": 0,
    }
    model = ms.LSBoostClassifier(**model_kwargs)
    fold_scores = cross_val_score(model, X_train, y_train,
                                  scoring='f1_macro', cv=5)
    return -fold_scores.mean()
def optimize_lsboost(X_train, y_train, solver="ridge"):
    """Tune LSBoost hyperparameters on ``(X_train, y_train)`` with ``gp.GPOpt``.

    Seven hyperparameters are searched inside the box defined below; each
    candidate point is scored by ``lsboost_cv``. The search runs with
    ``n_init=10``, ``n_iter=190`` and ``seed=123``.

    Returns a dict with two entries:
      * ``'parameters'``: whatever ``gp_opt.optimize(...)`` returns,
      * ``'opt_object'``: the ``GPOpt`` instance itself.
    """
    # Search space, one column per hyperparameter (same order in all three).
    searched = ["n_estimators", "learning_rate",
                "n_hidden_features", "reg_lambda",
                "dropout", "tolerance", "n_clusters"]
    lower = np.array([ 10, 0.001,   5, 1e-2,   0,    0, 0])
    upper = np.array([250,   0.4, 250,  1e4, 0.7, 1e-1, 4])

    # objective function for hyperparams tuning
    def crossval_objective(x):
        # Integer-valued hyperparameters are truncated from the continuous
        # candidate vector before being handed over.
        candidate = {
            "n_estimators": int(x[0]),
            "learning_rate": x[1],
            "n_hidden_features": int(x[2]),
            "reg_lambda": x[3],
            "dropout": x[4],
            "tolerance": x[5],
            "n_clusters": int(x[6]),
        }
        return lsboost_cv(X_train=X_train, y_train=y_train,
                          solver=solver, **candidate)

    gp_opt = gp.GPOpt(objective_func=crossval_objective,
                      lower_bound=lower,
                      upper_bound=upper,
                      params_names=searched,
                      n_init=10, n_iter=190, seed=123)
    outcome = gp_opt.optimize(verbose=2, abs_tol=1e-2)
    return {'parameters': outcome, 'opt_object': gp_opt}
# Tune LSBoost once per feature representation: Kernel PCA features first
# (stored at index 0), then the two random-forest-selected features (index 1).
for train_features in (X_train_kpca, X_train_rf):
    opt_objects_lsboost.append(
        optimize_lsboost(train_features, y_train, solver="ridge")
    )
3 – Graphs
# Show the raw (float-valued) optimum found for each feature set.
for run_idx in (0, 1):
    display(opt_objects_lsboost[run_idx]['parameters'].best_params)

# The optimizer reports every hyperparameter as a float; truncate the
# count-like ones to int, in place, before they are reused as keyword arguments.
for int_param in ('n_estimators', 'n_hidden_features', 'n_clusters'):
    for run_idx in (0, 1):
        best = opt_objects_lsboost[run_idx]['parameters'].best_params
        best[int_param] = int(best[int_param])
{'n_estimators': 221.10595703125,
 'learning_rate': 0.12772097778320313,
 'n_hidden_features': 45.053253173828125,
 'reg_lambda': 2496.6505697631837,
 'dropout': 0.2851226806640625,
 'tolerance': 0.0047698974609375,
 'n_clusters': 3.1986083984375}
{'n_estimators': 193.544921875,
 'learning_rate': 0.3466668701171875,
 'n_hidden_features': 208.9971923828125,
 'reg_lambda': 1866.4632116699217,
 'dropout': 0.37947998046875,
 'tolerance': 0.01290283203125,
 'n_clusters': 3.04443359375}
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier
# Models compared in every row of the figure: two scikit-learn ensembles with
# default settings and one LSBoostClassifier per tuned hyperparameter set
# (index 0 was tuned on the Kernel PCA features, index 1 on the two
# random-forest-selected features).
classifiers = [RandomForestClassifier(),
               GradientBoostingClassifier(),
               ms.LSBoostClassifier(**opt_objects_lsboost[0]['parameters'].best_params),
               ms.LSBoostClassifier(**opt_objects_lsboost[1]['parameters'].best_params)]
names = ["rf", "gb", "lsboost_pca", "lsboost_rf"]
figure = plt.figure(figsize=(27, 9))
i = 1  # 1-based index of the next subplot to fill
datasets = [(X_kpca, y), (X_rf, y)]

# Loop invariants: colormaps and grid width (one "input data" column plus one
# column per classifier).
cm = plt.cm.RdBu
cm_bright = ListedColormap(["#FF0000", "#0000FF"])
n_cols = len(classifiers) + 1

# iterate over datasets (one figure row each)
# X_ds / y_ds are used instead of X / y so the notebook-level dataset
# variables are not overwritten by this cell.
for ds_cnt, (X_ds, y_ds) in enumerate(datasets):
    # split into training and test part (40% held out)
    X_train, X_test, y_train, y_test = train_test_split(
        X_ds, y_ds, test_size=0.4, random_state=42
    )
    # Axis limits: data range of each of the two features, padded by 0.5
    x_min, x_max = X_ds[:, 0].min() - 0.5, X_ds[:, 0].max() + 0.5
    y_min, y_max = X_ds[:, 1].min() - 0.5, X_ds[:, 1].max() + 0.5

    # just plot the dataset first
    ax = plt.subplot(len(datasets), n_cols, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")
    # Plot the testing points (semi-transparent)
    ax.scatter(
        X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6, edgecolors="k"
    )
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers (one figure column each)
    for name, clf in zip(names, classifiers):
        ax = plt.subplot(len(datasets), n_cols, i)
        model = make_pipeline(StandardScaler(), clf)
        model.fit(X_train, y_train)
        try:
            score = model.score(X_test, y_test)
        except Exception:  # no scoring method available yet for prediction sets
            # Fall back to accuracy computed from the most probable class.
            # `except Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt / SystemExit propagate.
            score = np.mean(model.predict_proba(X_test).argmax(axis=1) == y_test)
        DecisionBoundaryDisplay.from_estimator(
            model, X_ds, cmap=cm, alpha=0.8, ax=ax, eps=0.5
        )
        # Plot the training points
        ax.scatter(
            X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
        )
        # Plot the testing points
        ax.scatter(
            X_test[:, 0],
            X_test[:, 1],
            c=y_test,
            cmap=cm_bright,
            edgecolors="k",
            alpha=0.6,
        )
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(name)
        # Test score in the lower-right corner, without the leading zero
        ax.text(
            x_max - 0.3,
            y_min + 0.3,
            f"{score:.2f}".lstrip("0"),
            size=15,
            horizontalalignment="right",
        )
        i += 1
plt.tight_layout()
plt.show()
 43%|████▎     | 94/221 [00:00<00:00, 178.28it/s]
 26%|██▋       | 51/193 [00:02<00:07, 18.66it/s]
 54%|█████▍    | 51/94 [00:00<00:00, 449.07it/s]
100%|██████████| 51/51 [00:00<00:00, 61.11it/s]
To leave a comment for the author, please follow the link and comment on their blog:  T. Moudiki's Webpage - R.
R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
