Supervised Learning with scikit-learn

Before We Start

We will use and apply the following with sklearn.

Datasets

  1. breast cancer: sklearn-provided binary classification dataset; predicts whether a patient’s tumor is benign or malignant
  2. boston: sklearn-provided regression dataset; predicts house prices in Boston

Model Selection

  1. train test split
  2. cross validate (k-fold)

Feature Preprocessing

  1. standard scaler
  2. polynomial features

Algorithms

  1. k nearest neighbors
  2. linear, logistic regression
  3. ridge regression
  4. lasso regression
  5. support vector machine
  6. decision tree
  7. random forest
  8. gradient boosting
  9. neural network

Evaluation Metric

  1. accuracy
  2. r-squared score
  3. precision
  4. recall
  5. ROC AUC
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer, load_boston

from sklearn.model_selection import train_test_split, cross_validate, cross_val_score

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor

np.set_printoptions(precision=2)
np.random.seed(0)

Function to evaluate different models

def evaluate_model(model, scaling, poly_degree=None):
    """Fit ``model`` on the current training split and print train/test scores.

    The data is not passed in: the function reads the module-level names
    ``X_train``, ``X_test``, ``y_train`` and ``y_test``, so whichever
    ``train_test_split`` cell ran last decides what is evaluated.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted
        in place.
    scaling : bool
        If truthy, standardise the features with a ``StandardScaler`` that is
        fitted on the training features only and then applied to both splits.
    poly_degree : int, optional
        If truthy, expand the features with
        ``PolynomialFeatures(degree=poly_degree)`` (fitted on the training
        features) before any scaling. A falsy value (``None``, ``False``,
        ``0``) skips the expansion.

    Returns
    -------
    estimator
        The same ``model`` object, now fitted.

    Notes
    -----
    The printed values are whatever ``model.score`` returns. The fitted
    polynomial expander and scaler are local to this call and are not
    returned, so the returned model can only be applied directly to raw
    ``X_test`` when it was evaluated with ``scaling=False`` and no
    ``poly_degree``.
    """
    train_features, test_features = X_train, X_test

    if poly_degree:
        # Expansion happens before scaling so the generated terms are
        # standardised as well.
        poly = PolynomialFeatures(degree=poly_degree)
        train_features = poly.fit_transform(train_features)
        test_features = poly.transform(test_features)

    if scaling:
        # Fit the scaler on the training features only, then reuse it for the
        # test features so no test statistics leak into preprocessing.
        scaler = StandardScaler()
        train_features = scaler.fit_transform(train_features)
        test_features = scaler.transform(test_features)

    model.fit(train_features, y_train)

    train_score = model.score(train_features, y_train)
    test_score = model.score(test_features, y_test)

    print('Train Score: ' + str(train_score))
    print('Test Score:' + str(test_score))
    print('')

    return model

Dataset for classification: Breast Cancer

# Load the breast cancer dataset bundled with scikit-learn.
data = load_breast_cancer()

# NOTE: despite the ``_iris`` suffix these hold the breast cancer features and
# labels; the names are kept because the cross-validation cell further down
# refers to them.
X_iris, y_iris = data['data'], data['target']
target_names, feature_names = data['target_names'], data['feature_names']
print(target_names, feature_names)

# Split into the train/test globals that evaluate_model() reads;
# random_state=1 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, random_state=1)
['malignant' 'benign'] ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

k-NN Classifier

# Compare neighbourhood sizes on standardised features:
# the default (n_neighbors = 5), then 10, then 1.
for knn_params in ({}, {'n_neighbors': 10}, {'n_neighbors': 1}):
    model = KNeighborsClassifier(**knn_params)
    trained_model = evaluate_model(model, scaling=True)
Train Score: 0.9812206572769953
Test Score:0.951048951048951

Train Score: 0.9788732394366197
Test Score:0.958041958041958

Train Score: 1.0
Test Score:0.958041958041958

Logistic Regression

# C=1 (Default)
model = LogisticRegression()
trained_model = evaluate_model(model, scaling=False)

# C=10
model = LogisticRegression(C=10)
trained_model = evaluate_model(model, scaling=False)

# The model above was fitted on the raw features (scaling=False, no polynomial
# expansion), so raw X_test is the matching input here. Predict once and reuse
# the result for every label-based metric instead of re-predicting per metric.
y_pred = trained_model.predict(X_test)
# ROC AUC needs a continuous score: take the predicted probability in column 1.
y_score = trained_model.predict_proba(X_test)[:, 1]
print('Test precision: ' + str(precision_score(y_test, y_pred)))
print('Test recall: ' + str(recall_score(y_test, y_pred)))
print('Test f1 score: ' + str(f1_score(y_test, y_pred)))
print('Test ROC AUC score: ' + str(roc_auc_score(y_test, y_score)) + '\n')

# C=0.1
model = LogisticRegression(C=0.1)
trained_model = evaluate_model(model, scaling=False)
Train Score: 0.9577464788732394
Test Score:0.951048951048951

Train Score: 0.9671361502347418
Test Score:0.965034965034965

Test precision: 0.9662921348314607
Test recall: 0.9772727272727273
Test f1 score: 0.9717514124293786
Test ROC AUC score: 0.9915289256198347

Train Score: 0.9460093896713615
Test Score:0.951048951048951

Linear Support Vector Classifier

# Linear-kernel SVC on standardised features for three values of C:
# the default (C=1), then C=10, then C=0.1.
for svc_params in ({}, {'C': 10}, {'C': 0.1}):
    model = SVC(kernel='linear', **svc_params)
    trained_model = evaluate_model(model, scaling=True)
Train Score: 0.9882629107981221
Test Score:0.965034965034965

Train Score: 0.9929577464788732
Test Score:0.958041958041958

Train Score: 0.9882629107981221
Test Score:0.958041958041958

Kernelized Support Vector Classifier

# SVC with its default kernel on standardised features, in the order
# C=1 (default), C=10, C=0.1.
candidates = (SVC(), SVC(C=10), SVC(C=0.1))
for model in candidates:
    trained_model = evaluate_model(model, scaling=True)
Train Score: 0.9882629107981221
Test Score:0.965034965034965

Train Score: 0.9953051643192489
Test Score:0.958041958041958

Train Score: 0.9530516431924883
Test Score:0.951048951048951

Decision Tree

# Tree with no depth limit (max_depth=None is the default).
model = DecisionTreeClassifier()
trained_model = evaluate_model(model, scaling=False)

# Show each feature name next to the importance the fitted tree assigned it.
print('Feature Importances')
importance_pairs = [
    (name, weight)
    for name, weight in zip(feature_names, trained_model.feature_importances_)
]
print(importance_pairs)
print('')

# Same model family, but capped at depth 3.
model = DecisionTreeClassifier(max_depth=3)
trained_model = evaluate_model(model, scaling=False)

# The tree was fitted on raw features (scaling=False), so raw X_test is the
# matching input for the per-class report.
test_predictions = trained_model.predict(X_test)
print(classification_report(y_test, test_predictions))
Train Score: 1.0
Test Score:0.9440559440559441

Feature Importances
[('mean radius', 0.0), ('mean texture', 0.022007779353930466), ('mean perimeter', 0.0), ('mean area', 0.0), ('mean smoothness', 0.010012730496503279), ('mean compactness', 0.0), ('mean concavity', 0.006724599247034309), ('mean concave points', 0.021009493579550387), ('mean symmetry', 0.0), ('mean fractal dimension', 0.0), ('radius error', 0.0033206182232256025), ('texture error', 0.0), ('perimeter error', 0.0), ('area error', 0.0), ('smoothness error', 0.0), ('compactness error', 0.036312835933985266), ('concavity error', 0.0), ('concave points error', 0.0), ('symmetry error', 0.0), ('fractal dimension error', 0.0), ('worst radius', 0.0), ('worst texture', 0.08065645725141522), ('worst perimeter', 0.7505955544414274), ('worst area', 0.0), ('worst smoothness', 0.0), ('worst compactness', 0.0), ('worst concavity', 0.022191177515213226), ('worst concave points', 0.047168753957714735), ('worst symmetry', 0.0), ('worst fractal dimension', 0.0)]

Train Score: 0.971830985915493
Test Score:0.8951048951048951

             precision    recall  f1-score   support

          0       0.87      0.85      0.86        55
          1       0.91      0.92      0.92        88

avg / total       0.89      0.90      0.89       143

RandomForest Classifier

# Random forests on raw features: default settings, then max_depth=3,
# then max_depth=10.
for forest_params in ({}, {'max_depth': 3}, {'max_depth': 10}):
    model = RandomForestClassifier(**forest_params)
    trained_model = evaluate_model(model, scaling=False)
Train Score: 1.0
Test Score:0.9370629370629371

Train Score: 0.9788732394366197
Test Score:0.951048951048951

Train Score: 0.9953051643192489
Test Score:0.9300699300699301

Gradient Boosted Decision Tree Classifier

# Gradient boosting on raw features: default settings, only 5 estimators,
# and a very small learning rate.
boosters = (
    GradientBoostingClassifier(),
    GradientBoostingClassifier(n_estimators=5),
    GradientBoostingClassifier(learning_rate=0.001),
)
for model in boosters:
    trained_model = evaluate_model(model, scaling=False)
Train Score: 1.0
Test Score:0.965034965034965

Train Score: 0.9647887323943662
Test Score:0.916083916083916

Train Score: 0.6314553990610329
Test Score:0.6153846153846154

Neural Network

# Multi-layer perceptron classifier with the iteration cap set to 10000.
model = MLPClassifier(max_iter=10000)
# scaling=True makes evaluate_model standardise the features before fitting.
trained_model = evaluate_model(model, scaling=True)
Train Score: 0.9953051643192489
Test Score:0.972027972027972

Dataset for Regression: Boston

# Load the Boston housing regression data.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release to run.
boston = load_boston()
X_boston, y_boston = boston['data'], boston['target']
# Rebinds feature_names (previously the breast cancer feature names).
feature_names = boston['feature_names']

# Replace the classification split: from here on evaluate_model() reads the
# Boston train/test globals.
X_train, X_test, y_train, y_test = train_test_split(X_boston, y_boston, random_state=1)

KNeighbors Regression

# k-NN regression on unscaled features: default settings, then n_neighbors=4.
for knn_params in ({}, {'n_neighbors': 4}):
    model = KNeighborsRegressor(**knn_params)
    trained_model = evaluate_model(model, scaling=False)
Train Score: 0.6667843569655112
Test Score:0.5281871748119744

Train Score: 0.7169330804027261
Test Score:0.5847379837420476

Linear regression

# Plain least squares on raw features, then the regularised variants on
# standardised features. Lasso runs last on purpose: the next statements read
# its coefficients from trained_model.
linear_runs = (
    (LinearRegression(), False),
    (Ridge(), True),
    (Ridge(alpha=20), True),
    (Lasso(), True),
)
for model, use_scaling in linear_runs:
    trained_model = evaluate_model(model, scaling=use_scaling)

print('Features with non-zero weight (sorted by absolute magnitude):')

# Pair every feature name with its fitted coefficient and keep only the ones
# the model left non-zero, so the listing matches the header above.
nonzero_weights = [
    (name, coef)
    for name, coef in zip(feature_names, trained_model.coef_)
    if coef != 0
]
# Largest absolute coefficient first.
for name, coef in sorted(nonzero_weights, key=lambda pair: -abs(pair[1])):
    print('\t{}, {:.3f}'.format(name, coef))
Train Score: 0.7167286808673382
Test Score:0.7790257749137307

Train Score: 0.7167018610210802
Test Score:0.7790559050301669

Train Score: 0.7120104783011092
Test Score:0.7743235092832248

Train Score: 0.6350155488350683
Test Score:0.669415418149101

Features with non-zero weight (sorted by absolute magnitude):
	LSTAT, -3.882
	RM, 1.981
	PTRATIO, -1.353
	CRIM, -0.000
	ZN, 0.000
	INDUS, -0.000
	CHAS, 0.000
	NOX, -0.000
	AGE, -0.000
	DIS, -0.000
	RAD, -0.000
	TAX, -0.000
	B, 0.000

Polynomial regression

# Baseline linear fit, then degree-2 features without scaling, then ridge
# regression on scaled degree-2 and degree-3 features.
poly_runs = (
    (LinearRegression(), {'scaling': False}),
    (LinearRegression(), {'scaling': False, 'poly_degree': 2}),
    (Ridge(), {'scaling': True, 'poly_degree': 2}),
    (Ridge(), {'scaling': True, 'poly_degree': 3}),
)
for model, eval_kwargs in poly_runs:
    trained_model = evaluate_model(model, **eval_kwargs)
Train Score: 0.7167286808673382
Test Score:0.7790257749137307

Train Score: 0.7389256270176032
Test Score:0.5867916321234202

Train Score: 0.9014299197022099
Test Score:0.9191961799874806

Train Score: 0.9490398580910372
Test Score:0.9321929473192055

Support Vector Regression

# Support vector regression on standardised features: default settings,
# a larger C, and a linear kernel.
for svr_params in ({}, {'C': 40}, {'kernel': 'linear'}):
    model = SVR(**svr_params)
    trained_model = evaluate_model(model, scaling=True)
Train Score: 0.6474681713166364
Test Score:0.6629378122683037

Train Score: 0.9298640839552765
Test Score:0.9269844037332291

Train Score: 0.6812467645514385
Test Score:0.7740854988602284

Neural Network

# Multi-layer perceptron regressor with the iteration cap set to 10000.
model = MLPRegressor(max_iter=10000)
# scaling=True makes evaluate_model standardise the features before fitting.
trained_model = evaluate_model(model, scaling=True)
Train Score: 0.9360686347144158
Test Score:0.9165525614065948

Cross-validation

# 10-fold cross-validation of plain linear regression on the full Boston data,
# scored with R^2 and keeping the per-fold training scores as well.
model = LinearRegression()
cv_scores = cross_validate(
    model,
    X_boston,
    y_boston,
    scoring='r2',
    cv=10,
    return_train_score=True,
)
# One row per fold; the bare expression below renders the table in the notebook.
result = pd.DataFrame(cv_scores)
result
fit_time score_time test_score train_score
0 0.001995 0.000000 0.733349 0.738790
1 0.000997 0.000000 0.472298 0.747077
2 0.000000 0.000000 -1.010977 0.743820
3 0.000999 0.000000 0.641263 0.717158
4 0.004989 0.000000 0.547098 0.741726
5 0.002018 0.000976 0.736102 0.704297
6 0.000997 0.000000 0.377618 0.746045
7 0.000000 0.000000 -0.130269 0.838793
8 0.000000 0.000000 -0.783723 0.736517
9 0.000000 0.000997 0.418618 0.742091
# Average each column (timings and scores) over the cross-validation folds.
result.mean(axis=0)
fit_time       0.001199
score_time     0.000197
test_score     0.200138
train_score    0.745632
dtype: float64
# 10-fold cross-validated accuracy of logistic regression on the breast cancer
# data (held in the X_iris / y_iris variables).
model = LogisticRegression()
cv_scores = cross_validate(
    model,
    X_iris,
    y_iris,
    scoring='accuracy',
    cv=10,
    return_train_score=True,
)
result = pd.DataFrame(cv_scores)
# Column-wise mean over the folds; the bare expression is shown as cell output.
result.mean(axis=0)
fit_time       0.008273
score_time     0.000399
test_score     0.950900
train_score    0.958212
dtype: float64

Leave a Comment