In this assignment you will train several models and evaluate how effectively they predict instances of fraud, using data based on this dataset from Kaggle.

Each row in fraud_data.csv corresponds to a credit card transaction. Features include the confidential variables V1 through V28, as well as Amount, which is the amount of the transaction. The target is stored in the Class column, where a value of 1 corresponds to an instance of fraud and 0 corresponds to an instance of not fraud.
import numpy as np
import pandas as pd
Import the data from fraud_data.csv. What percentage of the observations in the dataset are instances of fraud?

This function should return a float between 0 and 1.
def answer_one():
    # Fraction of transactions labelled as fraud (Class == 1)
    df = pd.read_csv("./Data_Science_with_Python/Applied_Machine_with_Python/week3/fraud_data.csv")
    return df['Class'].sum() / df.shape[0]
df = pd.read_csv("./Data_Science_with_Python/Applied_Machine_with_Python/week3/fraud_data.csv")
df.head()
sum(df['Class']) / df.shape[0]
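The same figure can be cross-checked with value_counts; a minimal sketch, assuming df has been loaded as above:

# Relative frequency of each class; the row for Class == 1 is the fraud rate
df['Class'].value_counts(normalize=True)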
# Use X_train, X_test, y_train, y_test for all of the following questions
from sklearn.model_selection import train_test_split
df = pd.read_csv('fraud_data.csv')
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
Using X_train, X_test, y_train, and y_test (as defined above), train a dummy classifier that classifies everything as the majority class of the training data. What is the accuracy of this classifier? What is the recall?

This function should return a tuple with two floats, i.e. (accuracy score, recall score).
def answer_two():
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import recall_score, accuracy_score

    # Majority-class baseline: always predicts the most frequent training label (0)
    dummy_majority = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
    y_dummy_predictions = dummy_majority.predict(X_test)
    return (accuracy_score(y_test, y_dummy_predictions),
            recall_score(y_test, y_dummy_predictions))
from sklearn.dummy import DummyClassifier
from sklearn.metrics import recall_score, accuracy_score
# Negative class (0) is most frequent
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
# Therefore the dummy 'most_frequent' classifier always predicts class 0
y_dummy_predictions = dummy_majority.predict(X_test)
dummy_majority.score(X_test, y_test)
(accuracy_score(y_test, y_dummy_predictions), recall_score(y_test, y_dummy_predictions))
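Since the overwhelming majority of transactions are not fraud, the majority-class baseline scores high on accuracy while catching no fraud at all (recall of 0). A quick sketch of that relationship, assuming y_test from the split above:

# Always predicting the majority class (0) yields accuracy equal to the
# share of non-fraud transactions in the test set; recall for fraud is 0.
baseline_accuracy = 1 - y_test.mean()
baseline_accuracy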
Using X_train, X_test, y_train, y_test (as defined above), train an SVC classifier using the default parameters. What are the accuracy, recall, and precision of this classifier?

This function should return a tuple with three floats, i.e. (accuracy score, recall score, precision score).
def answer_three():
    from sklearn.svm import SVC
    from sklearn.metrics import recall_score, precision_score, accuracy_score

    # SVC with default parameters (RBF kernel)
    svm = SVC().fit(X_train, y_train)
    svm_predicted = svm.predict(X_test)
    return (accuracy_score(y_test, svm_predicted),
            recall_score(y_test, svm_predicted),
            precision_score(y_test, svm_predicted))
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.svm import SVC
svm = SVC().fit(X_train, y_train)
svm_predicted = svm.predict(X_test)
(accuracy_score(y_test, svm_predicted),recall_score(y_test, svm_predicted),precision_score(y_test, svm_predicted))
Using the SVC classifier with parameters {'C': 1e9, 'gamma': 1e-07}, what is the confusion matrix when using a threshold of -220 on the decision function? Use X_test and y_test.

This function should return a confusion matrix, a 2x2 numpy array with 4 integers.
def answer_four():
    from sklearn.svm import SVC
    from sklearn.metrics import confusion_matrix

    svm = SVC(C=1e9, gamma=1e-07).fit(X_train, y_train)
    # Predict fraud whenever the decision function exceeds the -220 threshold
    y_scores_svm = svm.decision_function(X_test) > -220
    return confusion_matrix(y_test, y_scores_svm)
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
svm = SVC(C=1e9, gamma = 1e-07).fit(X_train, y_train)
y_scores_svm = svm.decision_function(X_test) > -220
confusion = confusion_matrix(y_test, y_scores_svm)
confusion
# Thresholded decision-function predictions (boolean array)
y_scores_svm
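To read the 2x2 result, ravel() unpacks the entries in the order true negatives, false positives, false negatives, true positives; a small sketch assuming confusion from the cell above:

# Unpack confusion-matrix counts for the negative (0) and positive (1) classes
tn, fp, fn, tp = confusion.ravel()
print(tn, fp, fn, tp)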
Train a logistic regression classifier with default parameters using X_train and y_train.

For the logistic regression classifier, create a precision-recall curve and a ROC curve using y_test and the probability estimates for X_test (probability it is fraud).

Looking at the precision-recall curve, what is the recall when the precision is 0.75?

Looking at the ROC curve, what is the true positive rate when the false positive rate is 0.16?

This function should return a tuple with two floats, i.e. (recall, true positive rate).
def answer_five():
    %matplotlib notebook
    import matplotlib.pyplot as plt
    from sklearn.metrics import precision_recall_curve
    from sklearn.linear_model import LogisticRegression

    lr = LogisticRegression().fit(X_train, y_train)
    # decision_function scores are a monotonic transform of the predicted fraud
    # probabilities, so they yield the same precision-recall and ROC curves
    y_scores_lr = lr.decision_function(X_test)

    # Precision-recall curve
    precision, recall, thresholds = precision_recall_curve(y_test, y_scores_lr)
    closest_zero = np.argmin(np.abs(thresholds))
    closest_zero_p = precision[closest_zero]
    closest_zero_r = recall[closest_zero]

    plt.figure()
    plt.xlim([0.0, 1.01])
    plt.ylim([0.0, 1.01])
    plt.plot(precision, recall, label='Precision-Recall Curve')
    plt.plot(closest_zero_p, closest_zero_r, 'o', markersize=12, fillstyle='none', c='r', mew=3)
    plt.xlabel('Precision', fontsize=16)
    plt.ylabel('Recall', fontsize=16)
    plt.gca().set_aspect('equal')
    plt.show()

    # ROC curve
    from sklearn.metrics import roc_curve, auc
    fpr_lr, tpr_lr, _ = roc_curve(y_test, y_scores_lr)
    roc_auc_lr = auc(fpr_lr, tpr_lr)

    plt.figure()
    plt.xlim([-0.01, 1.00])
    plt.ylim([-0.01, 1.01])
    plt.plot(fpr_lr, tpr_lr, lw=3, label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc_lr))
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.title('ROC curve (fraud classifier)', fontsize=16)
    plt.legend(loc='lower right', fontsize=13)
    plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
    plt.gca().set_aspect('equal')
    plt.show()

    # Recall at precision 0.75 and TPR at FPR 0.16, read off the two curves
    return (0.83, 0.94)
%matplotlib notebook
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(X_train, y_train)
y_scores_lr = lr.decision_function(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_scores_lr)
closest_zero = np.argmin(np.abs(thresholds))
closest_zero_p = precision[closest_zero]
closest_zero_r = recall[closest_zero]
plt.figure()
plt.xlim([0.0, 1.01])
plt.ylim([0.0, 1.01])
plt.plot(precision, recall, label='Precision-Recall Curve')
plt.plot(closest_zero_p, closest_zero_r, 'o', markersize = 12, fillstyle = 'none', c='r', mew=3)
plt.xlabel('Precision', fontsize=16)
plt.ylabel('Recall', fontsize=16)
plt.gca().set_aspect('equal')
plt.show()
from sklearn.metrics import roc_curve, auc
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_scores_lr)
roc_auc_lr = auc(fpr_lr, tpr_lr)
plt.figure()
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr_lr, tpr_lr, lw=3, label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc_lr))
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curve (fraud classifier)', fontsize=16)
plt.legend(loc='lower right', fontsize=13)
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
plt.gca().set_aspect('equal')
plt.show()
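Rather than reading the two values off the plots by eye, they can be looked up in the curve arrays directly; a minimal sketch, assuming precision, recall, fpr_lr and tpr_lr from the cells above (the lookup picks the curve point closest to the requested precision/FPR):

# Recall at the curve point where precision is closest to 0.75
recall_at_p075 = recall[np.argmin(np.abs(precision - 0.75))]

# True positive rate at the curve point where the false positive rate is closest to 0.16
tpr_at_fpr016 = tpr_lr[np.argmin(np.abs(fpr_lr - 0.16))]

print(recall_at_p075, tpr_at_fpr016)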
Perform a grid search over the parameters listed below for a Logistic Regression classifier, using recall for scoring and the default 3-fold cross validation.

'penalty': ['l1', 'l2']

'C': [0.01, 0.1, 1, 10, 100]

From .cv_results_, create an array of the mean test scores of each parameter combination, i.e.
|      | l1 | l2 |
|------|----|----|
| 0.01 | ?  | ?  |
| 0.1  | ?  | ?  |
| 1    | ?  | ?  |
| 10   | ?  | ?  |
| 100  | ?  | ?  |
This function should return a 5 by 2 numpy array with 10 floats.
Note: do not return a DataFrame, just the values denoted by '?' above in a numpy array. You might need to reshape your raw result to meet the format we are looking for.
def answer_six():
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression

    grid_values = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10, 100]}
    # solver='liblinear' supports both l1 and l2 penalties (required by newer scikit-learn)
    lr = LogisticRegression(solver='liblinear')
    grid_lr_recall = GridSearchCV(lr, param_grid=grid_values, scoring='recall', cv=3)
    grid_lr_recall.fit(X_train, y_train)
    # mean_test_score varies penalty fastest, so reshape(5, 2) gives rows = C, columns = penalty
    return grid_lr_recall.cv_results_['mean_test_score'].reshape(5, 2)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
grid_values = {'penalty': ['l1', 'l2'],'C':[0.01, 0.1, 1, 10, 100]}
# solver='liblinear' supports both l1 and l2 penalties (required by newer scikit-learn)
lr = LogisticRegression(solver='liblinear')
grid_lr_acc = GridSearchCV(lr, param_grid = grid_values, scoring = 'recall', cv=3)
ans = grid_lr_acc.fit(X_train, y_train).cv_results_
ans
ans['mean_test_score'].reshape(5,2)
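To confirm which score belongs in which cell of the 5x2 grid, the parameter combinations in cv_results_['params'] can be listed next to the mean test scores; a short sketch assuming ans from the cell above:

# Each parameter combination paired with its mean cross-validated recall;
# 'penalty' varies fastest, so reshape(5, 2) puts C on the rows and penalty on the columns
for params, score in zip(ans['params'], ans['mean_test_score']):
    print(params, score)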
# Use the following function to help visualize results from the grid search
def GridSearch_Heatmap(scores):
    %matplotlib notebook
    import seaborn as sns
    import matplotlib.pyplot as plt

    plt.figure()
    # Rows are C values, columns are penalties, matching the reshape in answer_six
    sns.heatmap(scores.reshape(5, 2), xticklabels=['l1', 'l2'], yticklabels=[0.01, 0.1, 1, 10, 100])
    plt.yticks(rotation=0);
GridSearch_Heatmap(answer_six())