An insurance company has approached you with a dataset of previous claims of their clients. The insurance company wants you to develop a model to help them predict which claims look fraudulent. By doing so you hope to save the company millions of dollars annually.
#Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
%matplotlib inline
# Read the claims file into a DataFrame and preview the first rows
df = pd.read_csv('insurance_claims.csv')
df.head()
# Dimensions of the frame as (rows, columns)
df.shape
# Per-column dtype and non-null count
df.info()
# Names of the columns that hold at least one missing value
df.columns[df.isna().any()]
# Class balance of the target column
df['fraud_reported'].value_counts()
# Pairwise relationships between the columns seaborn can plot
sns.pairplot(df)
# Distinct values of the low-cardinality numeric columns
df['policy_deductable'].unique()
df['number_of_vehicles_involved'].unique()
df['number_of_vehicles_involved'].value_counts()
df['bodily_injuries'].unique()
df['witnesses'].unique()
# Capital gains: rows at zero, in (0, 50000], and above 50000
capital_gains = df['capital-gains']
(capital_gains == 0).sum()
((capital_gains > 0) & (capital_gains <= 50000)).sum()
(capital_gains > 50000).sum()
# Capital losses: rows at zero, in [-50000, 0), and in [-100000, -50000)
capital_loss = df['capital-loss']
(capital_loss == 0).sum()
((capital_loss < 0) & (capital_loss >= -50000)).sum()
((capital_loss >= -100000) & (capital_loss < -50000)).sum()
df['umbrella_limit'].value_counts()
# Age distribution in five-year bins starting at 19
sns.distplot(df['age'], bins=np.arange(19, 64, 5))
# Names of every column stored with the generic object dtype
objectcols = [name for name, dtype in df.dtypes.items() if dtype == 'O']
objectcols
# Plot the distribution of the categorical variables, one count plot per
# column, laid out on a 5 x 4 grid (19 of the 20 cells are used).
categorical_columns = [
    'policy_state', 'policy_csl', 'insured_sex', 'insured_education_level',
    'insured_occupation', 'insured_hobbies', 'insured_relationship',
    'incident_type', 'collision_type', 'incident_severity',
    'authorities_contacted', 'incident_state', 'incident_city',
    # NOTE(review): incident_location looks like a free-text address, so this
    # panel may have one bar per row - confirm it is worth plotting.
    'incident_location',
    'property_damage', 'police_report_available', 'auto_make', 'auto_model',
    'fraud_reported',
]
fig = plt.figure(figsize=(12, 12))
# subplot positions are 1-based, hence start=1
for position, column in enumerate(categorical_columns, start=1):
    plt.subplot(5, 4, position)
    sns.countplot(x=column, data=df)
fig.tight_layout()
plt.show()
# Binary target: 1 = fraud reported, 0 = not reported.
df.fraud_reported = df.fraud_reported.map({'Y': 1, 'N': 0})
# '?' marks an unanswered field. Use replace(), not map(): map() with a dict
# turns every value that is not a key into NaN, which would erase the real
# answers and leave these columns empty.
for column in ('property_damage', 'police_report_available', 'collision_type'):
    df[column] = df[column].replace('?', 'UNKNOWN')
df.insured_sex = df.insured_sex.map({'MALE': 1, 'FEMALE': 0})
# Ordinal encoding of education.
# NOTE(review): 'College' and 'Associate' share rank 2 - confirm that the tie
# is intended and not a typo.
df.insured_education_level = df.insured_education_level.map({
    'High School': 1, 'College': 2, 'Associate': 2, 'JD': 3,
    'Masters': 4, 'MD': 5, 'PhD': 6,
})
# Ordinal encoding of severity, from least to most severe, so that a larger
# number always means more damage.
df.incident_severity = df.incident_severity.map({
    'Trivial Damage': 1, 'Minor Damage': 2, 'Major Damage': 3, 'Total Loss': 4,
})
# policy_csl is written as '<per person>/<per accident>'; split it once and
# keep each half as its own column.
csl_parts = df.policy_csl.str.split('/', expand=True)
df['csl_per_person'] = csl_parts[0]
df['csl_per_accident'] = csl_parts[1]
# Columns kept for modelling; the target 'fraud_reported' stays in the list so
# it travels with the features into df_final.
features = [
    'months_as_customer', 'age', 'policy_state', 'csl_per_person',
    'csl_per_accident', 'policy_deductable', 'policy_annual_premium',
    'umbrella_limit', 'insured_sex', 'insured_education_level',
    'insured_occupation', 'insured_hobbies', 'insured_relationship',
    'capital-gains', 'capital-loss', 'incident_type', 'collision_type',
    'incident_severity', 'authorities_contacted', 'incident_state',
    'incident_hour_of_the_day', 'number_of_vehicles_involved',
    'property_damage', 'bodily_injuries', 'witnesses',
    'police_report_available', 'total_claim_amount', 'auto_make',
    'auto_model', 'fraud_reported',
]
# One-hot encode the remaining text columns, dropping the first level of each
# so every category set has an implicit baseline.
selected_columns = df[features]
df_final = pd.get_dummies(selected_columns, drop_first=True)
df_final.head()
# Import sklearn's train_test_split for the later train/test splits, then
# separate the predictors (X) from the target (y).
from sklearn.model_selection import train_test_split
y = df['fraud_reported'].values
X = df_final.drop(columns=['fraud_reported'])
# Column names of X, in order
feature_name = list(X.columns)
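Feature scaling standardizes the range of our independent variables. It matters for the models used below whose results depend on feature magnitudes (logistic regression, k-nearest neighbours, support vector machines) and, because of the massive amount of computation taking place in deep learning, it is in practice required for the neural network to train well.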
# Sanity check: first the shapes, then the container types, of X and y
for dataset in (X, y):
    print(dataset.shape)
for dataset in (X, y):
    print(type(dataset))
# Cumulative explained-variance curve for a PCA of the full feature matrix X.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# PCA centres its input but does not rescale it, so standardize first.
# Otherwise the columns measured in large units (claim amounts, umbrella
# limits) would dominate the components.
pca = PCA().fit(StandardScaler().fit_transform(X))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# LDA can produce at most n_classes - 1 discriminant axes. The target is
# binary, so one component is the maximum; asking for more is rejected by
# current scikit-learn releases.
lda = LinearDiscriminantAnalysis(n_components=1)
X_r2 = lda.fit(X, y).transform(X)
# Share of between-class variance explained by each retained component
print('LDA explained variance ratio: %s'
      % str(lda.explained_variance_ratio_))
# Check the correlations among all columns of df_final (target included)
corrmat = df_final.corr()
f, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corrmat)
# Zoom in on the k columns with the largest correlation with the target.
# nlargest ranks by signed value, so strongly negative correlations are not
# picked, and 'fraud_reported' itself is always the first entry.
k = 20  # number of variables for heatmap
cols = corrmat.nlargest(k, 'fraud_reported')['fraud_reported'].index
# Reuse the matrix computed above instead of recomputing it from raw values
cm = corrmat.loc[cols, cols].values
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 annot_kws={'size': 8}, yticklabels=cols.values,
                 xticklabels=cols.values)
plt.show()
def cor_selector(X, y, num_feats=100):
    """Select the columns of X most strongly correlated with y.

    Args:
        X: DataFrame of candidate features, one column per feature.
        y: Target values, one per row of X.
        num_feats: Maximum number of features to keep (default 100). If X
            has fewer columns, all of them are kept.

    Returns:
        A ``(cor_support, cor_feature)`` tuple. ``cor_support`` is a list of
        booleans aligned with ``X.columns`` (True = selected) and
        ``cor_feature`` lists the selected column names ordered by ascending
        absolute Pearson correlation with ``y``.
    """
    columns = X.columns.tolist()
    # A constant column has zero variance, so its correlation is NaN; silence
    # the resulting divide warning and treat it as "no correlation" below.
    with np.errstate(divide='ignore', invalid='ignore'):
        cor_list = [np.corrcoef(X[name], y)[0, 1] for name in columns]
    cor_list = [0 if np.isnan(value) else value for value in cor_list]
    # Rank by absolute correlation and keep the last num_feats positions.
    # An explicit start index avoids the [-0:] pitfall, which would select
    # everything when num_feats is 0.
    ranked = np.argsort(np.abs(cor_list))
    start = max(len(columns) - num_feats, 0)
    cor_feature = X.iloc[:, ranked[start:]].columns.tolist()
    # Build the mask from X's own columns rather than a module-level list so
    # the function works on any DataFrame.
    selected = set(cor_feature)
    cor_support = [name in selected for name in columns]
    return cor_support, cor_feature
# Pearson-correlation vote: boolean mask plus the names of the kept features
cor_support, cor_feature = cor_selector(X, y)
print('{} selected features'.format(len(cor_feature)))
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature to [0, 1]; X_norm is reused by the selectors below
X_norm = MinMaxScaler().fit(X).transform(X)
# Chi-squared vote: keep the 100 highest-scoring features
chi_selector = SelectKBest(score_func=chi2, k=100)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.columns[chi_support].tolist()
print('{} selected features'.format(len(chi_feature)))
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination vote: drop 10 features per round until 100
# remain, ranking them with a logistic regression.
rfe_estimator = LogisticRegression()
rfe_selector = RFE(rfe_estimator, n_features_to_select=100, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.columns[rfe_support].tolist()
print('{} selected features'.format(len(rfe_feature)))
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# Embedded vote: keep the features whose L1-penalised logistic-regression
# coefficient exceeds 1.25 x the median coefficient magnitude.
# - The L1 penalty needs a solver that supports it; 'liblinear' does, the
#   current default solver does not.
# - threshold is passed by keyword because newer scikit-learn releases accept
#   it as a keyword only.
embeded_lr_selector = SelectFromModel(
    LogisticRegression(penalty="l1", solver="liblinear"),
    threshold='1.25*median')
embeded_lr_selector.fit(X_norm, y)
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:, embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')
from sklearn.ensemble import RandomForestClassifier
# Embedded vote: keep the features whose random-forest importance exceeds
# 1.25 x the median importance (fitted on the unscaled X).
forest_for_selection = RandomForestClassifier(n_estimators=100)
embeded_rf_selector = SelectFromModel(forest_for_selection, threshold='1.25*median')
embeded_rf_selector.fit(X, y)
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.columns[embeded_rf_support].tolist()
print('{} selected features'.format(len(embeded_rf_feature)))
from lightgbm import LGBMClassifier
# Embedded vote: keep the features whose LightGBM importance exceeds
# 1.25 x the median importance (fitted on the unscaled X).
lgbm_settings = dict(
    n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
    reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40,
)
lgbc = LGBMClassifier(**lgbm_settings)
embeded_lgb_selector = SelectFromModel(lgbc, threshold='1.25*median')
embeded_lgb_selector.fit(X, y)
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.columns[embeded_lgb_support].tolist()
print('{} selected features'.format(len(embeded_lgb_feature)))
# Put all selections together: one row per feature, one boolean column per
# selection method (True = that method kept the feature).
feature_selection_df = pd.DataFrame({'Feature': feature_name, 'Pearson': cor_support, 'Chi-2': chi_support, 'RFE': rfe_support, 'Logistics': embeded_lr_support,
                                     'Random Forest': embeded_rf_support, 'LightGBM': embeded_lgb_support})
# Count how many methods selected each feature. Sum only the vote columns:
# including the text 'Feature' column in a row-wise sum fails on current
# pandas.
vote_columns = ['Pearson', 'Chi-2', 'RFE', 'Logistics', 'Random Forest', 'LightGBM']
feature_selection_df['Total'] = feature_selection_df[vote_columns].sum(axis=1)
# Most-voted features first (ties broken by name, descending), renumbered from 1
feature_selection_df = feature_selection_df.sort_values(['Total', 'Feature'], ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df) + 1)
feature_selection_df
# Feature-name subsets by number of votes: exactly 6, then at least 5, 4, 3, 2
vote_totals = feature_selection_df['Total']
support6 = feature_selection_df.loc[vote_totals == 6, 'Feature']
support56 = feature_selection_df.loc[vote_totals >= 5, 'Feature']
support456 = feature_selection_df.loc[vote_totals >= 4, 'Feature']
support3456 = feature_selection_df.loc[vote_totals >= 3, 'Feature']
support23456 = feature_selection_df.loc[vote_totals >= 2, 'Feature']
# Compare logistic-regression ROC-AUC (10-fold CV on the training part) across
# progressively larger feature subsets, ending with all features.
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
sc = StandardScaler()
candidate_feature_sets = [
    ('6 votes', support6),
    ('>= 5 votes', support56),
    ('>= 4 votes', support456),
    ('>= 3 votes', support3456),
    ('>= 2 votes', support23456),
    ('all features', X.columns),
]
for subset_label, subset_columns in candidate_feature_sets:
    # No feature may have collected that many votes; there is nothing to fit.
    if len(subset_columns) == 0:
        print('%s: no features selected, skipped' % subset_label)
        continue
    # A fixed random_state gives every subset the same train/test rows, so the
    # scores differ only because of the features, not because of the split.
    X_train, X_test, y_train, y_test = train_test_split(
        X[subset_columns], y, test_size=0.3, random_state=42)
    # Fit the scaler on the training rows only, then apply it to the test rows
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    lr = LogisticRegression()
    lr_score = cross_validate(lr, X_train, y_train, cv=10, scoring='roc_auc')
    print('%s: %s' % (subset_label, lr_score['test_score'].mean()))
# Re-split on the ">= 5 votes" feature subset; every model below, and the
# cells that follow, use this split. A fixed random_state makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X[support56], y, test_size=0.3, random_state=42)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Each model is scored by mean ROC-AUC over 10-fold CV on the training rows.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=10)
dtc_score = cross_validate(dtc, X_train, y_train, cv=10, scoring='roc_auc')
print(dtc_score['test_score'].mean())
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=50, max_depth=10)
rfc_score = cross_validate(rfc, X_train, y_train, cv=10, scoring='roc_auc')
print(rfc_score['test_score'].mean())
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=15)
knc_score = cross_validate(knc, X_train, y_train, cv=10, scoring='roc_auc')
print(knc_score['test_score'].mean())
# Kernel SVM first, then the linear SVM; `svc` ends up bound to the latter
from sklearn.svm import SVC
svc = SVC()
svc_score = cross_validate(svc, X_train, y_train, cv=10, scoring='roc_auc')
print(svc_score['test_score'].mean())
from sklearn.svm import LinearSVC
svc = LinearSVC()
svc_score = cross_validate(svc, X_train, y_train, cv=10, scoring='roc_auc')
print(svc_score['test_score'].mean())
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# n_components is left at its default: with a binary target LDA has at most
# one discriminant axis, so the former n_components=10 was invalid.
lda = LinearDiscriminantAnalysis()
lda_score = cross_validate(lda, X_train, y_train, cv=10, scoring='roc_auc')
print(lda_score['test_score'].mean())
# Gradient boosting: fit once on the training rows, then report five
# cross-validated metrics (10 folds).
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=50, max_depth=10, min_samples_split=20)
gbc = gbc.fit(X_train, y_train)
scoring = {'AUC': 'roc_auc', 'ACC': 'accuracy', 'pre': 'precision', 'rec': 'recall', 'f1_score': 'f1'}
gbc_cv_score = cross_validate(gbc, X_train, y_train, scoring=scoring, cv=10)
# (printed label, key in the cross_validate result) in output order
gbc_report = (
    ('ROC_AUC', 'test_AUC'),
    ('Accuracy', 'test_ACC'),
    ('Precision', 'test_pre'),
    ('Recall', 'test_rec'),
    ('F1-Score', 'test_f1_score'),
)
print(*['%s: %.4f' % (label, gbc_cv_score[key].mean()) for label, key in gbc_report])
# XGBoost: grid-search max_depth and min_child_weight by 10-fold CV on F1
from xgboost import XGBClassifier
# sklearn.grid_search was removed from scikit-learn; GridSearchCV lives in
# sklearn.model_selection (the module this file already uses elsewhere).
from sklearn.model_selection import GridSearchCV
cv_params = {'max_depth': [3, 5, 7], 'min_child_weight': [15, 20, 25]}
# Settings held fixed while the grid is searched
ind_params = {'learning_rate': 0.1, 'n_estimators': 50, 'seed': 0, 'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'binary:logistic'}
optimized_GBM = GridSearchCV(XGBClassifier(**ind_params), cv_params, scoring='f1', cv=10)
optimized_GBM.fit(X_train, y_train)
# Per-combination scores; cv_results_ replaces the removed grid_scores_
pd.DataFrame(optimized_GBM.cv_results_)[['params', 'mean_test_score', 'std_test_score']]
print(optimized_GBM.best_params_)
print(optimized_GBM.best_score_)
# XGBoost native API: cross-validate with early stopping, train the final
# booster, and score the held-out rows.
import xgboost as xgb
xgdmat = xgb.DMatrix(X_train, y_train)
tuned_params = {'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'binary:logistic', 'max_depth': 5, 'min_child_weight': 20}
xgb_cv_score = xgb.cv(params=tuned_params, dtrain=xgdmat, num_boost_round=50, nfold=10, metrics=['auc', 'error'], early_stopping_rounds=25)
print(xgb_cv_score.tail(5))
print("ROC_AUC: %.4f" % xgb_cv_score["test-auc-mean"].iloc[-1], "Accuracy: %.4f" % (1 - xgb_cv_score["test-error-mean"].iloc[-1]))
# With early stopping, xgb.cv returns the history only up to the best round,
# so its length is the number of rounds to train. This replaces a hard-coded
# 7 that only matched one particular run.
final_gb = xgb.train(tuned_params, xgdmat, num_boost_round=len(xgb_cv_score))
# Plot feature importances
sns.set(font_scale=1.5, rc={'figure.figsize': (4, 6)})
xgb.plot_importance(final_gb)
testdmat = xgb.DMatrix(X_test)
# With the binary:logistic objective, predict() returns probabilities
y_pred = final_gb.predict(testdmat)
y_pred[0:10]
# Histogram of the predicted probabilities for class 1, drawn on the current axes
hist_ax = plt.gca()
hist_ax.hist(y_pred, bins=8)
hist_ax.set_xlim(0, 1)
hist_ax.set_title('Histogram of predicted probabilities')
hist_ax.set_xlabel('Predicted probability of fraud claims')
hist_ax.set_ylabel('Frequency')
plt.show()
# Predict the positive class when the predicted probability is greater than 0.5
from sklearn.preprocessing import binarize
# binarize expects a 2-D input, hence the wrapping list and the [0] afterwards;
# threshold is passed by keyword because newer scikit-learn requires it.
y_pred_class = binarize([y_pred], threshold=0.5)[0]
# Confusion matrix at the 0.5 threshold (rows = actual, columns = predicted)
df_confusion = metrics.confusion_matrix(y_test, y_pred_class)
df_confusion
def plot_confusion_matrix(df_confusion, title='Confusion matrix', cmap=plt.cm.gray_r):
    """Draw a confusion matrix as a shaded grid with a colour bar.

    Args:
        df_confusion: 2-D array-like of counts, rows = actual class,
            columns = predicted class (matching the axis labels set here).
        title: Text shown above the plot.
        cmap: Matplotlib colormap used to shade the cells.
    """
    plt.matshow(df_confusion, cmap=cmap)
    # Honour the caller's title; it used to be ignored in favour of a
    # hard-coded string.
    plt.title(title)
    plt.colorbar()
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
plot_confusion_matrix(df_confusion)
# Recompute the confusion matrix and unpack its four cells
# (row = actual class, column = predicted class)
confusion = metrics.confusion_matrix(y_test, y_pred_class)
(TN, FP), (FN, TP) = confusion
for cell_label, cell_count in (('True Positive: ', TP),
                               ('True Negative: ', TN),
                               ('False Positive: ', FP),
                               ('False Negative: ', FN)):
    print(cell_label, str(cell_count))
# Accuracy and its complement
accuracy = metrics.accuracy_score(y_test, y_pred_class)
print('Classification Rate: ', accuracy)
print('Misclassification Rate: ', 1 - accuracy)
# Recall and precision for the positive class
recall = metrics.recall_score(y_test, y_pred_class)
precision = metrics.precision_score(y_test, y_pred_class)
print('Recall: ', str(recall))
print('Precision: ', str(precision))
# Plot the ROC curve from the predicted probabilities (not the hard classes)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# The model predicts fraudulent claims; the old title described a different
# ("defaulter") project.
plt.title('ROC curve for fraud classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
# AUC is the fraction of the ROC plot that lies underneath the curve
print('Area under ROC curve: ', str(metrics.roc_auc_score(y_test, y_pred)))
# Hyper-parameter tuning for Random Forest (10-fold CV, scored by ROC-AUC)
from sklearn.model_selection import GridSearchCV
param_grid = dict(n_estimators=[40, 50, 60], max_depth=[3, 5, 7], min_samples_split=[10, 20, 30], criterion=['gini', 'entropy'])
grid = GridSearchCV(rfc, param_grid, cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)
# Check the best score and best fitting params
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)
# Check the score on the held-out rows
from sklearn import metrics
y_pred_class = grid.predict(X_test)
# ROC-AUC ranks continuous scores, so feed it the positive-class probability.
# Hard 0/1 labels collapse the curve to one point and give a score that is
# not comparable with the cross-validated best_score_ above.
y_pred_proba = grid.predict_proba(X_test)[:, 1]
metrics.roc_auc_score(y_test, y_pred_proba)
# Hyper-parameter tuning for Gradient Boosting (10-fold CV, scored by ROC-AUC)
param_grid = dict(n_estimators=[40, 50, 60], max_depth=[7, 10, 12], min_samples_split=[10, 20, 30])
grid = GridSearchCV(gbc, param_grid, cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)
# Check the best score and best fitting params
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)
y_pred_class = grid.predict(X_test)
# ROC-AUC ranks continuous scores, so feed it the positive-class probability
# rather than the hard 0/1 labels.
y_pred_proba = grid.predict_proba(X_test)[:, 1]
metrics.roc_auc_score(y_test, y_pred_proba)
#import keras and its modules
import keras
from keras.models import Sequential #Sequential module is required to initialize ANN
from keras.layers import Dense #Dense module is required to build the layers of ANN
Next we need to initialize our ANN by creating an instance of Sequential. The Sequential function initializes a linear stack of layers. This allows us to add more layers later using the Dense module.
# Start with an empty Sequential model; the layers are appended to it with
# classifier.add(...) in the cells that follow.
classifier = Sequential()
Adding input layer (First Hidden Layer)
We use the add method to add different layers to our ANN. The first parameter is the number of nodes you want to add to this layer. There is no rule of thumb as to how many nodes you should add. However a common strategy is to choose the number of nodes as the average of nodes in the input layer and the number of nodes in the output layer.
Say for example you had five independent variables and one output. Then you would take the sum of that and divide by two, which is three. You can also decide to experiment with a technique called parameter tuning. The second parameter, kernel_initializer, is the function that will be used to initialize the weights. In this case, it will use a uniform distribution to make sure that the weights are small numbers close to zero. The next parameter is the activation function. We use the Rectifier function, shortened as relu. We mostly use this function for the hidden layer in ANN. The final parameter is input_dim, which is the number of nodes in the input layer. It represents the number of independent variables.
# First hidden layer: 5 ReLU units with uniformly initialised weights.
# input_dim is read from the training matrix instead of being hard-coded to
# 63, so the network always matches the number of selected features.
classifier.add(
    Dense(5, kernel_initializer='uniform',
          activation='relu', input_dim=X_train.shape[1]))
Adding Second Hidden Layer
Adding the second hidden layer is similar to adding the first hidden layer.
We don’t need to specify the input_dim parameter because we have already specified it in the first hidden layer. In the first hidden layer we specified this in order to let the layer know how many input nodes to expect. In the second hidden layer the ANN already knows how many input nodes to expect so we don’t need to repeat ourselves.
# Second hidden layer: 5 ReLU units, uniformly initialised weights
second_hidden_layer = Dense(units=5, kernel_initializer='uniform', activation='relu')
classifier.add(second_hidden_layer)
Adding the Output layer
We change the first parameter because our output layer needs only one node: we are only interested in knowing whether a claim was fraudulent or not. We change the activation function because we want to get the probability that a claim is fraudulent. We do this by using the Sigmoid activation function. If you were dealing with a classification problem that has more than two classes (e.g. classifying cats, dogs, and monkeys) we’d need to change two things. We’d change the first parameter to 3 and change the activation function to softmax. Softmax generalizes the sigmoid function to more than two classes: it converts the output layer's values into one probability per class, and those probabilities sum to one.
# Output layer: a single sigmoid unit, uniformly initialised weights
output_layer = Dense(units=1, kernel_initializer='uniform', activation='sigmoid')
classifier.add(output_layer)
Compiling the ANN
Compiling configures how the neural network will be trained. The first parameter is the optimizer, i.e. the algorithm used to find the optimal set of weights in the neural network. The algorithms used for this are variants of stochastic gradient descent; a very efficient one to use is adam. The second parameter is the loss function that the optimizer minimizes. Since our categories are binary we use the binary_crossentropy loss function. Otherwise we would have used categorical_crossentropy. The final argument is the criterion we’ll use to evaluate our model. In this case we use the accuracy.
# Training configuration: adam optimizer, binary cross-entropy loss, and
# accuracy reported as the evaluation metric.
compile_options = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['accuracy'],
}
classifier.compile(**compile_options)
# Print the layer-by-layer summary of the model
classifier.summary()
Fitting ANN to the training set
X_train represents the independent variables we’re using to train our ANN, and y_train represents the column we’re predicting. Epochs represents the number of times we’re going to pass our full dataset through the ANN. Batch_size is the number of observations after which the weights will be updated.
# Train for 30 epochs, updating the weights after every 10 observations.
# validation_data must be an (inputs, targets) tuple; a list is not accepted
# by current Keras releases.
# NOTE(review): the test split is used for validation here, so the later test
# metrics are not from fully unseen data - confirm this is acceptable.
classifier.fit(X_train, y_train, batch_size=10, epochs=30, verbose=2,
               validation_data=(X_test, y_test))
Predicting using the test set
# Network outputs for the held-out rows (first ten shown)
y_pred = classifier.predict(X_test)
y_pred[:10]
# Turn the outputs into booleans using a 0.5 cut-off
y_pred = y_pred > 0.5
y_pred[:10]
Checking the confusion matrix
from sklearn.metrics import confusion_matrix
# Confusion matrix of the thresholded network predictions against y_test
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm