For classification, the target value is a discrete class value
For regression, the target value is continuous (floating point / real-valued)
It's a specific mathematical or computational description that expresses the relationship between a set of input variables and one or more outcome variables that are being studied or predicted. In statistical terms the input variables are called independent variable
and outcome variables are termed dependent variables
. In machine learning we use the term features
to refer to the input, or independent variables, and target value
or target label
to refer to the output, dependent varbales
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
np.set_printoptions(precision=2)
fruits = pd.read_table('fruit_data_with_colors.txt')
feature_names_fruits = ['height', 'width', 'mass', 'color_score']
X_fruits = fruits[feature_names_fruits]
y_fruits = fruits['fruit_label']
target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']
X_fruits_2d = fruits[['height', 'width']]
y_fruits_2d = fruits['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(X_fruits, y_fruits, random_state=0)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
# we must apply the scaling to the test set that we computed for the training set
X_test_scaled = scaler.transform(X_test)
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train_scaled, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
.format(knn.score(X_train_scaled, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
.format(knn.score(X_test_scaled, y_test)))
example_fruit = [[5.5, 2.2, 10, 0.70]]
example_fruit_scaled = scaler.transform(example_fruit)
print('Predicted fruit type for ', example_fruit, ' is ',
target_names_fruits[knn.predict(example_fruit_scaled)[0]-1])
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer
from adspy_shared_utilities import load_crime_dataset
cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])
# synthetic dataset for simple regression
from sklearn.datasets import make_regression
plt.figure()
plt.title('Sample regression problem with one input variable')
X_R1, y_R1 = make_regression(n_samples = 100, n_features=1,
n_informative=1, bias = 150.0,
noise = 30, random_state=0)
plt.scatter(X_R1, y_R1, marker= 'o', s=50)
plt.show()
# synthetic dataset for more complex regression
from sklearn.datasets import make_friedman1
plt.figure()
plt.title('Complex regression problem with one input variable')
X_F1, y_F1 = make_friedman1(n_samples = 100,
n_features = 7, random_state=0)
plt.scatter(X_F1[:, 2], y_F1, marker= 'o', s=50)
plt.show()
# synthetic dataset for classification (binary)
plt.figure()
plt.title('Sample binary classification problem with two informative features')
X_C2, y_C2 = make_classification(n_samples = 100, n_features=2,
n_redundant=0, n_informative=2,
n_clusters_per_class=1, flip_y = 0.1,
class_sep = 0.5, random_state=0)
plt.scatter(X_C2[:, 0], X_C2[:, 1], c=y_C2,
marker= 'o', s=50, cmap=cmap_bold)
plt.show()
# more difficult synthetic dataset for classification (binary)
# with classes that are not linearly separable
X_D2, y_D2 = make_blobs(n_samples = 100, n_features = 2, centers = 8,
cluster_std = 1.3, random_state = 4)
y_D2 = y_D2 % 2
plt.figure()
plt.title('Sample binary classification problem with non-linearly separable classes')
plt.scatter(X_D2[:,0], X_D2[:,1], c=y_D2,
marker= 'o', s=50, cmap=cmap_bold)
plt.show()
# Breast cancer dataset for classification
cancer = load_breast_cancer()
(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)
# Communities and Crime dataset
(X_crime, y_crime) = load_crime_dataset()
from adspy_shared_utilities import plot_two_class_knn
X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2,
random_state=0)
plot_two_class_knn(X_train, y_train, 1, 'uniform', X_test, y_test)
plot_two_class_knn(X_train, y_train, 3, 'uniform', X_test, y_test)
plot_two_class_knn(X_train, y_train, 11, 'uniform', X_test, y_test)
from sklearn.neighbors import KNeighborsRegressor
X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state = 0)
knnreg = KNeighborsRegressor(n_neighbors = 5).fit(X_train, y_train)
print(knnreg.predict(X_test))
print('R-squared test score: {:.3f}'
.format(knnreg.score(X_test, y_test)))
fig, subaxes = plt.subplots(1, 2, figsize=(8,4))
X_predict_input = np.linspace(-3, 3, 50).reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5], random_state = 0)
for thisaxis, K in zip(subaxes, [1, 3]):
knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)
y_predict_output = knnreg.predict(X_predict_input)
thisaxis.set_xlim([-2.5, 0.75])
thisaxis.plot(X_predict_input, y_predict_output, '^', markersize = 10,
label='Predicted', alpha=0.8)
thisaxis.plot(X_train, y_train, 'o', label='True Value', alpha=0.8)
thisaxis.set_xlabel('Input feature')
thisaxis.set_ylabel('Target value')
thisaxis.set_title('KNN regression (K={})'.format(K))
thisaxis.legend()
plt.tight_layout()
# plot k-NN regression on sample dataset for different values of K
fig, subaxes = plt.subplots(5, 1, figsize=(5,20))
X_predict_input = np.linspace(-3, 3, 500).reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1,
random_state = 0)
for thisaxis, K in zip(subaxes, [1, 3, 7, 15, 55]):
knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)
y_predict_output = knnreg.predict(X_predict_input)
train_score = knnreg.score(X_train, y_train)
test_score = knnreg.score(X_test, y_test)
thisaxis.plot(X_predict_input, y_predict_output)
thisaxis.plot(X_train, y_train, 'o', alpha=0.9, label='Train')
thisaxis.plot(X_test, y_test, '^', alpha=0.9, label='Test')
thisaxis.set_xlabel('Input feature')
thisaxis.set_ylabel('Target value')
thisaxis.set_title('KNN Regression (K={})\n\
Train $R^2 = {:.3f}$, Test $R^2 = {:.3f}$'
.format(K, train_score, test_score))
thisaxis.legend()
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
Model complexity:
n_neighbors
: number of nearest neighbors (k) to considermetric
: distance function between data points$y = w_0x + b$
$w_0$: linreg.coef_
$b$: linreg.intercept_
Note: If a Scikit-learn object attribute ends with an underscore, this means that these attributes were derived from training data, and not say quantities that were set by the user.
from sklearn.linear_model import LinearRegression
X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1,
random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)
print('linear model coeff (w): {}'
.format(linreg.coef_))
print('linear model intercept (b): {:.3f}'
.format(linreg.intercept_))
print('R-squared score (training): {:.3f}'
.format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
.format(linreg.score(X_test, y_test)))
plt.figure(figsize=(5,4))
plt.scatter(X_R1, y_R1, marker= 'o', s=50, alpha=0.8)
plt.plot(X_R1, linreg.coef_ * X_R1 + linreg.intercept_, 'r-')
plt.title('Least-squares linear regression')
plt.xlabel('Feature value (x)')
plt.ylabel('Target value (y)')
plt.show()
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)
print('Crime dataset')
print('linear model intercept: {}'
.format(linreg.intercept_))
print('linear model coeff:\n{}'
.format(linreg.coef_))
print('R-squared score (training): {:.3f}'
.format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
.format(linreg.score(X_test, y_test)))
w
, b
using the same least-squares criterion but adds a penalty for large variations in w
parameters$$RSS_{RIDGS}(w, b) = \sum_{i=1}^N(y_i - (w\cdot x_i + b))^2 + \alpha\sum_{j=1}^pw_j^2$$
w
entriesThink for a moment intuitively, what ridge regression is doing. It's regularizing the linear regression by imposing that sum of squares penalty on the size of the w
coefficients. So the effect of increasing alpha
is to shrink the w
coefficients toward zero and towards each other. But if the input variables, the features have very different scales, then when this shrinkage happens of the coefficients, input variables with different scales will have different contributions to this L2 penalty, because the L2 penalty is a sum of squares of all the coefficients. So transforming the input features, then they're all on the same scale, means the ridge penalty is in some sense applied more fairly to all features without unduly weighting some more than others.
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
random_state = 0)
linridge = Ridge(alpha=20.0).fit(X_train, y_train)
print('Crime dataset')
print('ridge regression linear model intercept: {}'
.format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
.format(linridge.coef_))
print('R-squared score (training): {:.3f}'
.format(linridge.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
.format(linridge.score(X_test, y_test)))
print('Number of non-zero features: {}'
.format(np.sum(linridge.coef_ != 0)))
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
random_state = 0)
# `fit` method using the training data to compute the min and max feature values
# To apply the scalar,then call `transform` method
# Note: use `fit_transform` method to efficiently perform fitting and transforming in a single step on the training set
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)
print('Crime dataset')
print('ridge regression linear model intercept: {}'
.format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
.format(linridge.coef_))
print('R-squared score (training): {:.3f}'
.format(linridge.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
.format(linridge.score(X_test_scaled, y_test)))
print('Number of non-zero features: {}'
.format(np.sum(linridge.coef_ != 0)))
Feature Normalization: The test set must use identical scaling to the training set
One downside to performing feature normalization is that the resulting model and the transformed features may be harder to interpret.
In the end, the type of feature normalization that's best to apply can depend on the data set learning task and learning algorithm to be used
Summary: In general, regularisation works especially well when you have relatively small amounts of training data compared to the number of features in your model. Regularisation becomes less important as the amount of training data you have increases.
Significantly larger or smaller values of alpha both lead to significantly worse model fit. This is another illustration of the general relationship between model complexity and test set performance. Where there's often an intermediate best value of a model of complexity parameter that does not lead to either under or overfitting
print('Ridge regression: effect of alpha regularization parameter\n')
for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:
linridge = Ridge(alpha = this_alpha).fit(X_train_scaled, y_train)
r2_train = linridge.score(X_train_scaled, y_train)
r2_test = linridge.score(X_test_scaled, y_test)
num_coeff_bigger = np.sum(abs(linridge.coef_) > 1.0)
print('Alpha = {:.2f}\nnum abs(coeff) > 1.0: {}, \
r-squared training: {:.2f}, r-squared test: {:.2f}\n'
.format(this_alpha, num_coeff_bigger, r2_train, r2_test))
Lasso regression is another form of regularized linear regression that uses an L1 regularization penalty for training (instead of ridge's L2 penalty)
w
to zero for the least influential variables. This is called a sparse solution: a kind of feature selectionfrom sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
random_state = 0)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
linlasso = Lasso(alpha=2.0, max_iter = 10000).fit(X_train_scaled, y_train)
print('Crime dataset')
print('lasso regression linear model intercept: {}'
.format(linlasso.intercept_))
print('lasso regression linear model coeff:\n{}'
.format(linlasso.coef_))
print('Non-zero features: {}'
.format(np.sum(linlasso.coef_ != 0)))
print('R-squared score (training): {:.3f}'
.format(linlasso.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}\n'
.format(linlasso.score(X_test_scaled, y_test)))
print('Features with non-zero weight (sorted by absolute magnitude):')
for e in sorted (list(zip(list(X_crime), linlasso.coef_)),
key = lambda e: -abs(e[1])):
if e[1] != 0:
print('\t{}, {:.3f}'.format(e[0], e[1]))
Code book
top five strongest features:
PctKidsBornNeverMar, 1488.365
: percentage of kids born to people who never PctKids2Par, -1188.740
: percentage of kids in family housing with two parentsHousVacant, 459.538
: number of vacant householdsPctPersDenseHous, 339.045
: percent of persons in dense housing (more than 1 person/room)NumInShelters, 264.932
: number of people in homeless sheltersprint('Lasso regression: effect of alpha regularization\n\
parameter on number of features kept in final model\n')
for alpha in [0.5, 1, 2, 3, 5, 10, 20, 50]:
linlasso = Lasso(alpha, max_iter = 10000).fit(X_train_scaled, y_train)
r2_train = linlasso.score(X_train_scaled, y_train)
r2_test = linlasso.score(X_test_scaled, y_test)
print('Alpha = {:.2f}\nFeatures kept: {}, r-squared training: {:.2f}, \
r-squared test: {:.2f}\n'
.format(alpha, np.sum(linlasso.coef_ != 0), r2_train, r2_test))
Polynomial Features with Linear Regression
$$x=(x_0, x_1) => x^{'}=(x_0, x_1, x_0^2, x_0x_1, x_1^2)$$
$$\hat{y}= \hat{w_0}x_0+\hat{w_1}x_1+\hat{w_{00}}x_0^2+\hat{w_{01}}x_0x_1+\hat{w_{11}}x_1^2$$
w
and b
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
X_train, X_test, y_train, y_test = train_test_split(X_F1, y_F1,
random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)
print('linear model coeff (w): {}'
.format(linreg.coef_))
print('linear model intercept (b): {:.3f}'
.format(linreg.intercept_))
print('R-squared score (training): {:.3f}'
.format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
.format(linreg.score(X_test, y_test)))
print('\nNow we transform the original input data to add\n\
polynomial features up to degree 2 (quadratic)\n')
poly = PolynomialFeatures(degree=2)
X_F1_poly = poly.fit_transform(X_F1)
X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1,
random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)
print('(poly deg 2) linear model coeff (w):\n{}'
.format(linreg.coef_))
print('(poly deg 2) linear model intercept (b): {:.3f}'
.format(linreg.intercept_))
print('(poly deg 2) R-squared score (training): {:.3f}'
.format(linreg.score(X_train, y_train)))
print('(poly deg 2) R-squared score (test): {:.3f}\n'
.format(linreg.score(X_test, y_test)))
print('\nAddition of many polynomial features often leads to\n\
overfitting, so we often use polynomial features in combination\n\
with regression that has a regularization penalty, like ridge\n\
regression.\n')
X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1,
random_state = 0)
linreg = Ridge().fit(X_train, y_train)
print('(poly deg 2 + ridge) linear model coeff (w):\n{}'
.format(linreg.coef_))
print('(poly deg 2 + ridge) linear model intercept (b): {:.3f}'
.format(linreg.intercept_))
print('(poly deg 2 + ridge) R-squared score (training): {:.3f}'
.format(linreg.score(X_train, y_train)))
print('(poly deg 2 + ridge) R-squared score (test): {:.3f}'
.format(linreg.score(X_test, y_test)))
Logistic regression can be seen as a kind of generalized linear model, and like ordinary least squares and other regression methods, logistic regression takes a set input variables, the features, and estimates a target value. However, unlike ordinary linear regression, in it's most basic form logistic repressions target value is a binary variable instead of a continuous value. There are flavors of logistic regression that can be also be used in cases where the target value to be predicted is multi class categorical variable, not just binary.
Linear Regression $$\hat{y}=\hat{b}+\hat{w_1}\cdot{x_1}+...\hat{w_n}\cdot{x_n}$$
Linear models for classification: Logistic Regression
$$\hat{y}=logistic(\hat{b}+\hat{w_1}\cdot{x_1}+...\hat{w_n}\cdot{x_n})$$
$$= \frac{1}{1+e^{-(\hat{b}+\hat{w_1}\cdot{x_1}+...\hat{w_n}\cdot{x_n})}}$$
The logistic function transforms real-valued input to an output number y
between 0 and 1, interpreted as the probability the input object belongs to the positive class, given its input features $(x_0, x_1,...,x_n)$
from sklearn.linear_model import LogisticRegression
from adspy_shared_utilities import (
plot_class_regions_for_classifier_subplot)
fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
y_fruits_apple = y_fruits_2d == 1 # make into a binary problem: predicting whether an object is apples vs everything else
X_train, X_test, y_train, y_test = (
train_test_split(X_fruits_2d.as_matrix(), #using only height and width as the features space
y_fruits_apple.as_matrix(),
random_state = 0))
clf = LogisticRegression(C=100).fit(X_train, y_train)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None,
None, 'Logistic regression \
for binary classification\nFruit dataset: Apple vs others',
subaxes)
h = 6
w = 8
print('A fruit with height {} and width {} is predicted to be: {}'
.format(h,w, ['not an apple', 'an apple'][clf.predict([[h,w]])[0]]))
h = 10
w = 7
print('A fruit with height {} and width {} is predicted to be: {}'
.format(h,w, ['not an apple', 'an apple'][clf.predict([[h,w]])[0]]))
subaxes.set_xlabel('height')
subaxes.set_ylabel('width')
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
.format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
.format(clf.score(X_test, y_test)))
from sklearn.linear_model import LogisticRegression
from adspy_shared_utilities import (
plot_class_regions_for_classifier_subplot)
X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2,
random_state = 0)
fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
clf = LogisticRegression().fit(X_train, y_train)
title = 'Logistic regression, simple synthetic dataset C = {:.3f}'.format(1.0)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
None, None, title, subaxes)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
.format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
.format(clf.score(X_test, y_test)))
on
by default (like ridge regression)C
controls amount of regularization (default 1.0)X_train, X_test, y_train, y_test = (
train_test_split(X_fruits_2d.as_matrix(),
y_fruits_apple.as_matrix(),
random_state=0))
fig, subaxes = plt.subplots(3, 1, figsize=(4, 10))
for this_C, subplot in zip([0.1, 1, 100], subaxes):
clf = LogisticRegression(C=this_C).fit(X_train, y_train)
title ='Logistic regression (apple vs rest), C = {:.3f}'.format(this_C)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
X_test, y_test, title,
subplot)
plt.tight_layout()
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)
clf = LogisticRegression().fit(X_train, y_train)
print('Breast cancer dataset')
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
.format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
.format(clf.score(X_test, y_test)))
from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot
X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state = 0)
fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
this_C = 1.0
clf = SVC(kernel = 'linear', C=this_C).fit(X_train, y_train)
title = 'Linear SVC, C = {:.3f}'.format(this_C)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None, None, title, subaxes)
C
C
: less regularizationC
: more regularizationfrom sklearn.svm import LinearSVC
from adspy_shared_utilities import plot_class_regions_for_classifier
X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state = 0)
fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))
for this_C, subplot in zip([0.00001, 100], subaxes):
clf = LinearSVC(C=this_C).fit(X_train, y_train)
title = 'Linear SVC, C = {:.5f}'.format(this_C)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
None, None, title, subplot)
plt.tight_layout()
from sklearn.svm import LinearSVC
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)
clf = LinearSVC().fit(X_train, y_train)
print('Breast cancer dataset')
print('Accuracy of Linear SVC classifier on training set: {:.2f}'
.format(clf.score(X_train, y_train)))
print('Accuracy of Linear SVC classifier on test set: {:.2f}'
.format(clf.score(X_test, y_test)))
Pros:
Cons:
Scikit-learn makes it every easy to learn multi-class classification models. Essentially, it does this by converting a multi-class classification problem into a series of binary probelms. When you pass in a dataset that has a categorical variable for the target value, Scikit-learn detects this automatically and then for each class to be predicted, Scikit-learn creates one binary classifier that predicts that class against all the other classes. So for example, in the fruit dataset there are four categories of fruit, so Scikit-learn learns four different binary classifiers. To predict a new data instance, what is then does is takes that data instance to be predicted whose labels to be predict, and runs it against each of the binary classifiers in turn, and classifier that has the highest score is the one that whose class it uses as prediction value
from sklearn.svm import LinearSVC
X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_2d, random_state = 0)
clf = LinearSVC(C=5, random_state = 67).fit(X_train, y_train)
print('Coefficients:\n', clf.coef_)
print('Intercepts:\n', clf.intercept_)
$$y_{apple} = -0.26*height + 0.71*width - 3.29$$
height=2, width=6: $y_{apple}$ = +0.549 (>=0 : predict apple)
height=2, width=2: $y_{apple}$ = -2.340 (<0 : predict other)
plt.figure(figsize=(6,6))
colors = ['r', 'g', 'b', 'y']
cmap_fruits = ListedColormap(['#FF0000', '#00FF00', '#0000FF','#FFFF00'])
plt.scatter(X_fruits_2d[['height']], X_fruits_2d[['width']],
c=y_fruits_2d, cmap=cmap_fruits, edgecolor = 'black', alpha=.7)
x_0_range = np.linspace(-10, 15)
for w, b, color in zip(clf.coef_, clf.intercept_, ['r', 'g', 'b', 'y']):
# Since class prediction with a linear model uses the formula y = w_0 x_0 + w_1 x_1 + b,
# and the decision boundary is defined as being all points with y = 0, to plot x_1 as a
# function of x_0 we just solve w_0 x_0 + w_1 x_1 + b = 0 for x_1:
plt.plot(x_0_range, -(x_0_range * w[0] + b) / w[1], c=color, alpha=.8)
plt.legend(target_names_fruits)
plt.xlabel('height')
plt.ylabel('width')
plt.xlim(-2, 12)
plt.ylim(-2, 15)
plt.show()
from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)
# The default SVC kernel is radial basis function (RBF)
plot_class_regions_for_classifier(SVC().fit(X_train, y_train),
X_train, y_train, None, None,
'Support Vector Classifier: RBF kernel')
# Compare decision boundries with polynomial kernel, degree = 3
plot_class_regions_for_classifier(SVC(kernel = 'poly', degree = 3) #polynomial kernal
.fit(X_train, y_train), X_train,
y_train, None, None,
'Support Vector Classifier: Polynomial kernel, degree = 3')
$$K(x,x^{'}) = e^{-\gamma \cdot ||x - x^{'}||}$$ $gamma(\gamma)$ : kernel width parameter
gamma controls how far the influence of a single trending example reaches, which in turn affects how tightly the decision boundaries end up surrounding points in the input space. Small gamma means a larger similarity radius. So that points farther apart are considered similar, which result in more points being group together and smoother decision boundaries. On the other hand, for larger values of gamma, the kernel value to K
is more quickly and points have to be very close to be considered similar. This results in more complex, tightly constrained decision boundaries,
from adspy_shared_utilities import plot_class_regions_for_classifier
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)
fig, subaxes = plt.subplots(3, 1, figsize=(4, 11))
for this_gamma, subplot in zip([0.01, 1.0, 10.0], subaxes):
clf = SVC(kernel = 'rbf', gamma=this_gamma).fit(X_train, y_train)
title = 'Support Vector Classifier: \nRBF kernel, gamma = {:.2f}'.format(this_gamma)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
None, None, title, subplot)
plt.tight_layout()
SVMs also have a regularization parameter C
that controls the tradeoff between satisfying the maximum margin criterion to find the simple decision boundary, and avoiding misclassification errors on the training set. The C
parameter is also an important one for kernelized SVMs, and it interacts with the gamma
parameter. If gamma
is large, then C
will have little to no effect. Well, if gamma
is small, the model is much more constrained and the effective C
will be similar to how it would affect a linear classifier. Typically gamma
and C
are turned together with the optimal combination typically in an intermediate range of values, for example, gamma
between 0.0001 and 10, and C
between 0.1 and 100. Kernelizaed SVMs are pretty sensitive to settings of gamma
.
The most important thing to remember when applying SVMs is that it's important to normalize the input data, so that all the features have comparable units that are on the same scale.
from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)
fig, subaxes = plt.subplots(3, 4, figsize=(15, 10), dpi=50)
for this_gamma, this_axis in zip([0.01, 1, 5], subaxes):
for this_C, subplot in zip([0.1, 1, 15, 250], this_axis):
title = 'gamma = {:.2f}, C = {:.2f}'.format(this_gamma, this_C)
clf = SVC(kernel = 'rbf', gamma = this_gamma,
C = this_C).fit(X_train, y_train)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
X_test, y_test, title,
subplot)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,
random_state = 0)
clf = SVC(C=10).fit(X_train, y_train)
print('Breast cancer dataset (unnormalized features)')
print('Accuracy of RBF-kernel SVC on training set: {:.2f}'
.format(clf.score(X_train, y_train)))
print('Accuracy of RBF-kernel SVC on test set: {:.2f}'
.format(clf.score(X_test, y_test)))
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf = SVC(C=10).fit(X_train_scaled, y_train)
print('Breast cancer dataset (normalized with MinMax scaling)')
print('RBF-kernel SVC (with MinMax scaling) training set accuracy: {:.2f}'
.format(clf.score(X_train_scaled, y_train)))
print('RBF-kernel SVC (with MinMax scaling) test set accuracy: {:.2f}'
.format(clf.score(X_test_scaled, y_test)))
Pros:
Cons:
Platt scaling
which transforms the output of the classifier to a probability distribution over classes by fitting a logistic regression model to the classifiers course.)Model complexity
kernel
: Type of kernel function to be usedC
: regularization parameterC
and gamma
are turned at the same timeWhen applying a number of supervised learning methods, we follow a consistent series of steps.
First, partitioning the data set into training and test set using the train-test-split
function
Second, calling the fit
method on the training set to estimate the model
Finally, applying the model by using the predict
method to estimate a target value for the new data instances, or by using the score
method to evaluate the trained model's performance on the test set
We divide the original data into training and test sets was to use the test set as a way to estimate how well the model trained on the training data would generalize to new previously unseen data. The test set represented data that had not been seen during training but had the same general attributes as the original data set, or in more technical language, which was drawn from the same underlying distribution as the training set.
Cross-validation is a method that goes beyond evaluating a single model using a single train/test split of the data by using multiple train/test splits, each of which is used to train and evaluate a separate model. So why is this better than our original method of a single train/test split? cross-validation basically gives more stable and reliable estimates of how the classifiers likely to perform on average by running multiple different training/test splits and then averaging the results, instead of relying entirely on a single particular training set
from sklearn.model_selection import cross_val_score
clf = KNeighborsClassifier(n_neighbors = 5)
X = X_fruits_2d.as_matrix()
y = y_fruits_2d.as_matrix()
#By default, `cross_val_score()` does threehold cross-validation.
#If you want to change the number of folds, you can set the `cv` parameter.
cv_scores = cross_val_score(clf, X, y)
print('Cross-validation scores (3-fold):', cv_scores)
#It's typical to then compute the mean of all the accuracy scores across the folds and
#report the mean cross validation score as a measure of how accurate we can expect the
#the model to be on average.
print('Mean cross-validation score (3-fold): {:.3f}'
.format(np.mean(cv_scores)))
One benefit of computing the accuracy of a model on multiple splits instead of a single split, is that it gives us notentially useful information about how sensitive the model is to the nature of the specific training set. We can look at the distribution of these multiple scores across all the cross-validation folds to see how likely it is that by chance, the model will perform very badly or very well on any new data set. So we can do a sort of worst case or best case performance estimate from these multiple scores.
This extra information does come with extra cost. It does take more time and computation to do cross-validation
In some cases (e.g. when feature values have very different ranges), we've seen the need to scale or normalize the training and test sets before use with a classifier. The proper way to do cross-validation when you need to scale the data is not to scale the entire dataset with a single transform, since this will indirectly leak information into the training data about the whole dataset, including the test data (see the lecture on data leakage later in the course). Instead, scaling/normalizing must be computed and applied for each cross-validation fold separately. To do this, the easiest way in scikit-learn is to use pipelines. While these are beyond the scope of this course, further information is available in the scikit-learn documentation here:
http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
or the Pipeline section in the recommended textbook: Introduction to Machine Learning with Python by Andreas C. Müller and Sarah Guido (O'Reilly Media).
Sometimes we want to evaluate the effect that an important parameter of a model has on the cross-validation scores, the useful function validation_curve
makes it easy to run this type of expriment.
Like cross_val_score
, validation_cure
will do threefold cross-validation by default, but you can adjust this with the cv
parameter as well. Unlike cross_val_score
you can also specify a classifier parameter name and set of parameter values you want to sween across.
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve
param_range = np.logspace(-3, 3, 4)
# First pass in the estimator object or that is the classifier or regression object to use,
# followd by the data set samples `X` and target values `Y`
# the name of the parameter to sweep
# and the array of parameter values that parameter should take on in doing the sweep
train_scores, test_scores = validation_curve(SVC(), X, y,
param_name='gamma',
param_range=param_range, cv=3)
validation_curve
will return two-dimensional arrays corresponding to evaluating on the training set and the test set.
One row per parameter sweep value : Each array has one row per parameter value in the sweep
One column per CV fold : the number of columns is the number of cross-validation folds that are used.
print(train_scores)
print(test_scores)
The validation curve shows the mean cross-validation accuracy (solid lines) for training (orange) and test (blue) set as a function of the SVM parameter (gamma). It also shows the variation around the mean (shaded region) as computed from k-fold cross-validation scores.
Finally as a reminder, cross-validation is used to evaluate the model and not learn or tune a new model. To do model tuning, we'll look at how to turn the model parameters using something called Grid Search
# This code based on scikit-learn validation_plot example
# See: http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
plt.figure()
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title('Validation Curve with SVM')
plt.xlabel('$\gamma$ (gamma)')
plt.ylabel('Score')
plt.ylim(0.0, 1.1)
lw = 2
plt.semilogx(param_range, train_scores_mean, label='Training score',
color='darkorange', lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.2,
color='darkorange', lw=lw)
plt.semilogx(param_range, test_scores_mean, label='Cross-validation score',
color='navy', lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.2,
color='navy', lw=lw)
plt.legend(loc='best')
plt.show()
Decision trees are a popular supervised learning method and can be used for both regression and classification. It's a good exploratory method if you're interested in getting a better idea about what the influential features are in your dataset.
Basically, decision trees learn a series of explicit if then rules on feature values that result in a decision that predicts the target value.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from sklearn.model_selection import train_test_split
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 3)
clf = DecisionTreeClassifier().fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
.format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
.format(clf.score(X_test, y_test)))
clf2 = DecisionTreeClassifier(max_depth = 3).fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
.format(clf2.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
.format(clf2.score(X_test, y_test)))
plot_decision_tree(clf, iris.feature_names, iris.target_names)
plot_decision_tree(clf2, iris.feature_names, iris.target_names)
This is one of the most useful and widely used types of summary analysis you can perform on a supervised learning model.
In Scikit-learn feature importance values are stored as a list in an estimated property called feature_importances_
, note the underscore at end of the name which indicates it's property of the object that's set as a result of fitting the model and not say as a user defined property. The shared_utilities
python file contain a function called plot_feature_importances
that you can import and use to visualize future importance. It plot a horizental bar chart with the features listed along the y axis by name and feature importances along the axis
from adspy_shared_utilities import plot_feature_importances
plt.figure(figsize=(10,4), dpi=80)
plot_feature_importances(clf, iris.feature_names)
plt.show()
print('Feature importances: {}'.format(clf.feature_importances_))
Note: if a feature has a low feature importance value, that doesn't necessarily mean that the feature is not important for prediction. It simply means that the particular feature wasn't chosen at an early level of the tree and this could be because the future may be identical and highly correlated with anther informative feature and so doesn't provide any new additional signal for prediction.
Feature importance values don't tell us which specific classes a feature might be especially predictive for, and they are also don't indicate more complex relationship between features that may influence prediction. Even so the feature importance values provide an easy to understand summary that can give useful insight about individual features in the learning model.
Because feature importance can vary depending on the specific model learned for a particular train/test split for example. It's common when computing feature importance to use an average over multiple train/test splits. For example when performing cross-validation
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 0)
fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))
pair_list = [[0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]
tree_max_depth = 4
for pair, axis in zip(pair_list, subaxes):
X = X_train[:, pair]
y = y_train
clf = DecisionTreeClassifier(max_depth=tree_max_depth).fit(X, y)
title = 'Decision Tree, max_depth = {:d}'.format(tree_max_depth)
plot_class_regions_for_classifier_subplot(clf, X, y, None,
None, title, axis,
iris.target_names)
axis.set_xlabel(iris.feature_names[pair[0]])
axis.set_ylabel(iris.feature_names[pair[1]])
plt.tight_layout()
plt.show()
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from adspy_shared_utilities import plot_feature_importances
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)
clf = DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 8,
random_state = 0).fit(X_train, y_train)
plot_decision_tree(clf, cancer.feature_names, cancer.target_names)
print('Breast cancer dataset: decision tree')
print('Accuracy of DT classifier on training set: {:.2f}'
.format(clf.score(X_train, y_train)))
print('Accuracy of DT classifier on test set: {:.2f}'
.format(clf.score(X_test, y_test)))
plt.figure(figsize=(10,6),dpi=80)
plot_feature_importances(clf, cancer.feature_names)
plt.tight_layout()
plt.show()
Pros:
Cons:
max_depth
: controls maximum depth (number of split points). Most common way to reduce tree complexity and overfittingmin_samples_leaf
: threshold for the minimum # of data instances a leaf can have to avoid further splittingmax_leaf_nodes
: limits total number of leaves in the treemax_depth
) is enough to reduce overfitting