Insurance Claims Fraud Detection

Business Problem¶

An insurance company has approached you with a dataset of previous claims of their clients. The insurance company wants you to develop a model to help them predict which claims look fraudulent. By doing so you hope to save the company millions of dollars annually.

#Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Load the dataset into a dataframe
df = pd.read_csv('insurance_claims.csv')
df.head()

#Check the shape of the dataframe
df.shape

(1000, 39)

#check the data types of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 39 columns):
months_as_customer             1000 non-null int64
age                            1000 non-null int64
policy_number                  1000 non-null int64
policy_bind_date               1000 non-null object
policy_state                   1000 non-null object
policy_csl                     1000 non-null object
policy_deductable              1000 non-null int64
policy_annual_premium          1000 non-null float64
umbrella_limit                 1000 non-null int64
insured_zip                    1000 non-null int64
insured_sex                    1000 non-null object
insured_education_level        1000 non-null object
insured_occupation             1000 non-null object
insured_hobbies                1000 non-null object
insured_relationship           1000 non-null object
capital-gains                  1000 non-null int64
capital-loss                   1000 non-null int64
incident_date                  1000 non-null object
incident_type                  1000 non-null object
collision_type                 1000 non-null object
incident_severity              1000 non-null object
authorities_contacted          1000 non-null object
incident_state                 1000 non-null object
incident_city                  1000 non-null object
incident_location              1000 non-null object
incident_hour_of_the_day       1000 non-null int64
number_of_vehicles_involved    1000 non-null int64
property_damage                1000 non-null object
bodily_injuries                1000 non-null int64
witnesses                      1000 non-null int64
police_report_available        1000 non-null object
total_claim_amount             1000 non-null int64
injury_claim                   1000 non-null int64
property_claim                 1000 non-null int64
vehicle_claim                  1000 non-null int64
auto_make                      1000 non-null object
auto_model                     1000 non-null object
auto_year                      1000 non-null int64
fraud_reported                 1000 non-null object
dtypes: float64(1), int64(17), object(21)
memory usage: 304.8+ KB

df.columns[df.isnull().any()]

Index([], dtype='object')

df.fraud_reported.value_counts()

N    753
Y    247
Name: fraud_reported, dtype: int64

sns.pairplot(df)

<seaborn.axisgrid.PairGrid at 0x151dc6d8>

df.policy_deductable.unique()

array([1000, 2000,  500], dtype=int64)

df.number_of_vehicles_involved.unique()

array([1, 3, 4, 2], dtype=int64)

df.number_of_vehicles_involved.value_counts()

1    581
3    358
4    31 
2    30 
Name: number_of_vehicles_involved, dtype: int64

df.bodily_injuries.unique()

array([1, 0, 2], dtype=int64)

df.witnesses.unique()

array([2, 0, 3, 1], dtype=int64)

df[df['capital-gains'] == 0].shape[0]

508

df[(df['capital-gains'] > 0) & (df['capital-gains'] <=50000)].shape[0]

233

df[df['capital-gains'] > 50000].shape[0]

259

df[df['capital-loss'] == 0].shape[0]

475

df[(df['capital-loss'] < 0) & (df['capital-loss'] >=-50000)].shape[0]

255

df[(df['capital-loss'] >= -100000) & (df['capital-loss'] < -50000)].shape[0]

269

df.umbrella_limit.value_counts()

 0           798
 6000000     57 
 5000000     46 
 4000000     39 
 7000000     29 
 3000000     12 
 8000000     8  
 9000000     5  
 2000000     3  
 10000000    2  
-1000000     1  
Name: umbrella_limit, dtype: int64

sns.distplot(df.age, bins=np.arange(19,64,5))

C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py:6448: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "

<matplotlib.axes._subplots.AxesSubplot at 0x3caa2390>

objectcols = [col for col in df.columns if df[col].dtype == 'O']
objectcols

['policy_bind_date',
 'policy_state',
 'policy_csl',
 'insured_sex',
 'insured_education_level',
 'insured_occupation',
 'insured_hobbies',
 'insured_relationship',
 'incident_date',
 'incident_type',
 'collision_type',
 'incident_severity',
 'authorities_contacted',
 'incident_state',
 'incident_city',
 'incident_location',
 'property_damage',
 'police_report_available',
 'auto_make',
 'auto_model',
 'fraud_reported']

#Plot the distribution of categorical variables
fig = plt.figure(figsize=(12, 12))

sub1 = plt.subplot(5, 4, 1)
sns.countplot(x='policy_state', data=df)

sub2 = plt.subplot(5, 4, 2)
sns.countplot(x='policy_csl', data=df)

sub3 = plt.subplot(5, 4, 3)
sns.countplot(x='insured_sex', data=df)

sub4 = plt.subplot(5, 4, 4)
sns.countplot(x='insured_education_level', data=df)

sub5 = plt.subplot(5, 4, 5)
sns.countplot(x='insured_occupation', data=df)

sub6 = plt.subplot(5, 4, 6)
sns.countplot(x='insured_hobbies', data=df)

sub7 = plt.subplot(5, 4, 7)
sns.countplot(x='insured_relationship', data=df)

sub8 = plt.subplot(5, 4, 8)
sns.countplot(x='incident_type', data=df)

sub9 = plt.subplot(5, 4, 9)
sns.countplot(x='collision_type', data=df)

sub9 = plt.subplot(5, 4, 10)
sns.countplot(x='incident_severity', data=df)
sub9 = plt.subplot(5, 4, 11)
sns.countplot(x='authorities_contacted', data=df)
sub9 = plt.subplot(5, 4, 12)
sns.countplot(x='incident_state', data=df)
sub9 = plt.subplot(5, 4, 13)
sns.countplot(x='incident_city', data=df)
sub9 = plt.subplot(5, 4, 14)
sns.countplot(x='incident_location', data=df)
sub9 = plt.subplot(5, 4, 15)
sns.countplot(x='property_damage', data=df)
sub9 = plt.subplot(5, 4, 16)
sns.countplot(x='police_report_available', data=df)
sub9 = plt.subplot(5, 4, 17)
sns.countplot(x='auto_make', data=df)
sub9 = plt.subplot(5, 4, 18)
sns.countplot(x='auto_model', data=df)
sub9 = plt.subplot(5, 4, 19)
sns.countplot(x='fraud_reported', data=df)
    
fig.tight_layout()
plt.show()

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1428: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)

df.fraud_reported = df.fraud_reported.map({'Y':1, 'N':0})
df.property_damage = df.property_damage.map({'?':'UNKNOWN'})
df.police_report_available = df.police_report_available.map({'?':'UNKNOWN'})
df.collision_type = df.collision_type.map({'?':'UNKNOWN'})
df.insured_sex = df.insured_sex.map({'MALE':1,'FEMALE':0})
df.insured_education_level = df.insured_education_level.map({'High School': 1, 'College': 2, 'Associate': 2, 'JD': 3,\
                                                             'Masters': 4, 'MD': 5, 'PhD': 6})
df.incident_severity = df.incident_severity.map({'Minor Damage': 1, 'Major Damage': 2, 'Total Loss': 3, 'Trivial Damage': 4})

df['csl_per_person'] = df.policy_csl.str.split('/', expand=True)[0]
df['csl_per_accident'] = df.policy_csl.str.split('/', expand=True)[1]

features = ['months_as_customer','age','policy_state','csl_per_person','csl_per_accident','policy_deductable',\
            'policy_annual_premium','umbrella_limit','insured_sex','insured_education_level','insured_occupation',\
            'insured_hobbies','insured_relationship','capital-gains','capital-loss','incident_type','collision_type',\
            'incident_severity','authorities_contacted','incident_state','incident_hour_of_the_day','number_of_vehicles_involved',\
            'property_damage','bodily_injuries','witnesses','police_report_available','total_claim_amount','auto_make',\
            'auto_model','fraud_reported']
df_final = pd.get_dummies(df[features],drop_first=True)
df_final.head()

#We use sklearn’s train_test_split to split the data into a training set and a test set.
from sklearn.model_selection import train_test_split

X = df_final.drop('fraud_reported', axis=1)
y = df['fraud_reported'].values
feature_name = X.columns.tolist()

Due to the massive amounts of computations taking place in deep learning, feature scaling is compulsory. Feature scaling standardizes the range of our independent variables.

print(X.shape)
print(y.shape)

(1000, 122)
(1000,)

print(type(X))
print(type(y))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>

# apply PCA on X_train
from sklearn.decomposition import PCA
pca = PCA().fit(X)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
#plt.annotate('90',xy=(90, .90))

Text(0,0.5,'Cumulative explained variance')

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=10)
X_r2 = lda.fit(X, y).transform(X)
# Percentage of variance explained for each components
print('LDA explained variance ratio (first two components): %s'
      % str(lda.explained_variance_ratio_))

LDA explained variance ratio (first two components): [ 1.]

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")

#Check the correlations among feature variables
corrmat = df_final.corr()
f, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corrmat);

#saleprice correlation matrix
k = 20 #number of variables for heatmap
cols = corrmat.nlargest(k, 'fraud_reported')['fraud_reported'].index
cm = np.corrcoef(df_final[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 8}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

def cor_selector(X, y):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-100:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

cor_support, cor_feature = cor_selector(X, y)
print(str(len(cor_feature)), 'selected features')

100 selected features

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=100)
chi_selector.fit(X_norm, y)

SelectKBest(k=100, score_func=<function chi2 at 0x000000002EE620D0>)

chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

100 selected features

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=100, step=10, verbose=5)
rfe_selector.fit(X_norm, y)

Fitting estimator with 122 features.
Fitting estimator with 112 features.
Fitting estimator with 102 features.

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=100, step=10, verbose=5)

rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

100 selected features

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), '1.25*median')
embeded_lr_selector.fit(X_norm, y)

SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold='1.25*median')

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

56 selected features

from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold='1.25*median')
embeded_rf_selector.fit(X, y)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        norm_order=1, prefit=False, threshold='1.25*median')

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

41 selected features

from lightgbm import LGBMClassifier

lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, threshold='1.25*median')
embeded_lgb_selector.fit(X, y)

SelectFromModel(estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.2,
        learning_rate=0.05, max_depth=-1, min_child_samples=20,
        min_child_weight=40, min_split_gain=0.01, n_estimators=500,
        n_jobs=-1, num_leaves=32, objective=None, random_state=None,
        reg_alpha=3, reg_lambda=1, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1),
        norm_order=1, prefit=False, threshold='1.25*median')

embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

122 selected features

# put all selection together
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgb_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df

support6 = feature_selection_df[feature_selection_df['Total'] == 6]['Feature']
support56 = feature_selection_df[feature_selection_df['Total'] >= 5]['Feature']
support456 = feature_selection_df[feature_selection_df['Total'] >= 4]['Feature']
support3456 = feature_selection_df[feature_selection_df['Total'] >= 3]['Feature']
support23456 = feature_selection_df[feature_selection_df['Total'] >= 2]['Feature']

X_train, X_test, y_train, y_test = train_test_split(X[support6], y, test_size=0.3)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
lr = LogisticRegression()
lr_score = cross_validate(lr, X_train, y_train, cv=10, scoring='roc_auc')
print(lr_score['test_score'].mean())

0.705018758834

X_train, X_test, y_train, y_test = train_test_split(X[support56], y, test_size=0.3)

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
lr = LogisticRegression()
lr_score = cross_validate(lr, X_train, y_train, cv=10, scoring='roc_auc')
print(lr_score['test_score'].mean())

0.727044025157

X_train, X_test, y_train, y_test = train_test_split(X[support456], y, test_size=0.3)

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
lr = LogisticRegression()
lr_score = cross_validate(lr, X_train, y_train, cv=10, scoring='roc_auc')
print(lr_score['test_score'].mean())

0.662650355256

X_train, X_test, y_train, y_test = train_test_split(X[support3456], y, test_size=0.3)

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
lr = LogisticRegression()
lr_score = cross_validate(lr, X_train, y_train, cv=10, scoring='roc_auc')
print(lr_score['test_score'].mean())

0.684268429191

X_train, X_test, y_train, y_test = train_test_split(X[support23456], y, test_size=0.3)

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
lr = LogisticRegression()
lr_score = cross_validate(lr, X_train, y_train, cv=10, scoring='roc_auc')
print(lr_score['test_score'].mean())

0.703924651622

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
lr = LogisticRegression()
lr_score = cross_validate(lr, X_train, y_train, cv=10, scoring='roc_auc')
print(lr_score['test_score'].mean())

0.685278228369

X_train, X_test, y_train, y_test = train_test_split(X[support56], y, test_size=0.3)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth = 10)
dtc_score = cross_validate(dtc, X_train, y_train, cv=10, scoring='roc_auc')
print(dtc_score['test_score'].mean())

0.787026167506

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 50, max_depth = 10)
rfc_score = cross_validate(rfc, X_train, y_train, cv=10, scoring = 'roc_auc')
print(rfc_score['test_score'].mean())

0.871235426924

from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=15)
knc_score = cross_validate(knc, X_train, y_train, cv=10, scoring = 'roc_auc')
print(knc_score['test_score'].mean())

0.712926757544

from sklearn.svm import SVC
svc = SVC()
svc_score = cross_validate(svc, X_train, y_train, cv=10, scoring = 'roc_auc')
print(svc_score['test_score'].mean())

0.697608307958

from sklearn.svm import LinearSVC
svc = LinearSVC()
svc_score = cross_validate(svc, X_train, y_train, cv=10, scoring = 'roc_auc')
print(svc_score['test_score'].mean())

0.723937552767

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=10)
lda_score = cross_validate(lda, X_train, y_train, cv=10, scoring = 'roc_auc')
print(lda_score['test_score'].mean())

0.72503557301

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\discriminant_analysis.py:388: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")

#Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators = 50, max_depth=10, min_samples_split=20).fit(X_train, y_train)
scoring = {'AUC':'roc_auc','ACC':'accuracy','pre':'precision','rec':'recall', 'f1_score':'f1'}
gbc_cv_score = cross_validate(gbc, X_train, y_train, scoring=scoring, cv=10)
print("ROC_AUC: %.4f" %gbc_cv_score['test_AUC'].mean(), "Accuracy: %.4f" %gbc_cv_score['test_ACC'].mean(), "Precision: %.4f" %gbc_cv_score['test_pre'].mean(),
      "Recall: %.4f" %gbc_cv_score['test_rec'].mean(), "F1-Score: %.4f" %gbc_cv_score['test_f1_score'].mean())

ROC_AUC: 0.8689 Accuracy: 0.8386 Precision: 0.6874 Recall: 0.6611 F1-Score: 0.6671

#XGBoost
from xgboost import XGBClassifier
from sklearn.grid_search import GridSearchCV

cv_params = {'max_depth': [3,5,7], 'min_child_weight': [15,20,25]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 50, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'binary:logistic'}
optimized_GBM = GridSearchCV(XGBClassifier(**ind_params), cv_params, scoring = 'f1', cv = 10) 
optimized_GBM.fit(X_train, y_train)
optimized_GBM.grid_scores_

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)

[mean: 0.58597, std: 0.09518, params: {'max_depth': 3, 'min_child_weight': 15},
 mean: 0.59815, std: 0.11564, params: {'max_depth': 3, 'min_child_weight': 20},
 mean: 0.47234, std: 0.10682, params: {'max_depth': 3, 'min_child_weight': 25},
 mean: 0.58794, std: 0.09291, params: {'max_depth': 5, 'min_child_weight': 15},
 mean: 0.60185, std: 0.11954, params: {'max_depth': 5, 'min_child_weight': 20},
 mean: 0.47234, std: 0.10682, params: {'max_depth': 5, 'min_child_weight': 25},
 mean: 0.58794, std: 0.09291, params: {'max_depth': 7, 'min_child_weight': 15},
 mean: 0.60185, std: 0.11954, params: {'max_depth': 7, 'min_child_weight': 20},
 mean: 0.47234, std: 0.10682, params: {'max_depth': 7, 'min_child_weight': 25}]

print(optimized_GBM.best_params_)
print(optimized_GBM.best_score_)

{'max_depth': 5, 'min_child_weight': 20}
0.6018523878355063

#XGBoost
import xgboost as xgb
xgdmat = xgb.DMatrix(X_train, y_train)
tuned_params = {'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'binary:logistic', 'max_depth':5, 'min_child_weight':20} 
xgb_cv_score = xgb.cv(params = tuned_params, dtrain = xgdmat, num_boost_round = 50, nfold = 10, metrics = ['auc','error'], early_stopping_rounds = 25)
print(xgb_cv_score.tail(5))
print("ROC_AUC: %.4f" %xgb_cv_score["test-auc-mean"].iloc[-1], "Accuracy: %.4f" %(1-xgb_cv_score["test-error-mean"].iloc[-1]))

   test-auc-mean  test-auc-std  test-error-mean  test-error-std  \
3  0.776344       0.076861      0.195714         0.043822         
4  0.778546       0.070135      0.182857         0.040305         
5  0.793067       0.062178      0.184286         0.037498         
6  0.793279       0.058047      0.171429         0.031298         
7  0.794825       0.058356      0.168571         0.036027         

   train-auc-mean  train-auc-std  train-error-mean  train-error-std  
3  0.796112        0.041989       0.201111          0.028922         
4  0.796901        0.039881       0.188571          0.025285         
5  0.816002        0.006980       0.186508          0.019816         
6  0.818155        0.006886       0.178730          0.008872         
7  0.820103        0.007114       0.174445          0.008014         
ROC_AUC: 0.7948 Accuracy: 0.8314

final_gb = xgb.train(tuned_params, xgdmat, num_boost_round = 7)

#Plot feature importances
sns.set(font_scale = 1.5, rc={'figure.figsize':(4,6)})
xgb.plot_importance(final_gb)

<matplotlib.axes._subplots.AxesSubplot at 0x3ce65518>

testdmat = xgb.DMatrix(X_test)
y_pred = final_gb.predict(testdmat)
y_pred[0:10]

array([ 0.53634608,  0.13929342,  0.62235302,  0.18702345,  0.57508171,
        0.16546598,  0.13965239,  0.57508171,  0.18330199,  0.56660396], dtype=float32)

#Plot the histogram of predicted probabilities for Class 1
plt.hist(y_pred, bins=8)
plt.xlim(0, 1)
plt.title('Histogram of predicted probabilities')
plt.xlabel('Predicted probability of fraud claims')
plt.ylabel('Frequency')
plt.show()

#Predict target classes if the probability is greater than 0.7
from sklearn.preprocessing import binarize
y_pred_class = binarize([y_pred], 0.5)[0]

#Get the new confusion matrix (threshold of 0.7)
df_confusion = metrics.confusion_matrix(y_test, y_pred_class)
df_confusion

array([[185,  39],
       [ 26,  50]], dtype=int64)

def plot_confusion_matrix(df_confusion, title='Confusion matrix', cmap=plt.cm.gray_r):
    plt.matshow(df_confusion, cmap=cmap)
    plt.title('Confusion Matrix')
    plt.colorbar()
    plt.ylabel('Actual')
    plt.xlabel('Predicted')

plot_confusion_matrix(df_confusion)

# save confusion matrix and slice into four pieces
confusion = metrics.confusion_matrix(y_test, y_pred_class)
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

print('True Positive: ', str(TP))
print('True Negative: ', str(TN))
print('False Positive: ', str(FP))
print('False Negative: ', str(FN))

True Positive:  50
True Negative:  185
False Positive:  39
False Negative:  26

#Print classification and mis-classification scores
print('Classification Rate: ',metrics.accuracy_score(y_test, y_pred_class))
print('Misclassification Rate: ', 1 - metrics.accuracy_score(y_test, y_pred_class))

Classification Rate:  0.783333333333
Misclassification Rate:  0.216666666667

#Print Recall and Precision scores
print('Recall: ', str(metrics.recall_score(y_test, y_pred_class)))
print('Precision: ', str(metrics.precision_score(y_test, y_pred_class)))

Recall:  0.657894736842
Precision:  0.561797752809

#Plot the AUC-ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for defaulter classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)

# AUC is the percentage of the ROC plot that is underneath the curve:
print('Area under ROC curve: ', str(metrics.roc_auc_score(y_test, y_pred)))

Area under ROC curve:  0.744243421053

#Hyper-parameter tuning for Random Forest
from sklearn.model_selection import GridSearchCV
param_grid = dict(n_estimators=[40,50,60], max_depth=[3,5,7], min_samples_split=[10,20,30], criterion=['gini','entropy'])
grid = GridSearchCV(rfc, param_grid, cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [40, 50, 60], 'max_depth': [3, 5, 7], 'min_samples_split': [10, 20, 30], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

#Check the best score and best fitting params
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

0.880191423449
{'criterion': 'entropy', 'max_depth': 7, 'min_samples_split': 20, 'n_estimators': 60}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

#Check the prediction score
from sklearn import metrics
y_pred_class = grid.predict(X_test)
metrics.roc_auc_score(y_test, y_pred_class)

0.52619830827067682

param_grid = dict(n_estimators=[40,50,60], max_depth=[7,10,12], min_samples_split=[10,20,30])
grid = GridSearchCV(gbc, param_grid, cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=20,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [40, 50, 60], 'max_depth': [7, 10, 12], 'min_samples_split': [10, 20, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

#Check the best score and best fitting params
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

0.881020233928
{'max_depth': 7, 'min_samples_split': 30, 'n_estimators': 40}
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=7,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=30,
              min_weight_fraction_leaf=0.0, n_estimators=40,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

y_pred_class = grid.predict(X_test)
metrics.roc_auc_score(y_test, y_pred_class)

0.74447838345864659

#import keras and its modules
import keras
from keras.models import Sequential #Sequential module is required to initialize ANN
from keras.layers import Dense #Dense module is required to build the layers of ANN

Next we need to initialize our ANN by creating an instance of Sequential. The Sequential function initializes a linear stack of layers. This allows us to add more layers later using the Dense module.

classifier = Sequential()

Adding input layer (First Hidden Layer)

We use the add method to add different layers to our ANN. The first parameter is the number of nodes you want to add to this layer. There is no rule of thumb as to how many nodes you should add. However a common strategy is to choose the number of nodes as the average of nodes in the input layer and the number of nodes in the output layer.

Say for example you had five independent variables and one output. Then you would take the sum of that and divide by two, which is three. You can also decide to experiment with a technique called parameter tuning. The second parameter, kernel_initializer, is the function that will be used to initialize the weights. In this case, it will use a uniform distribution to make sure that the weights are small numbers close to zero. The next parameter is the activation function. We use the Rectifier function, shortened as relu. We mostly use this function for the hidden layer in ANN. The final parameter is input_dim, which is the number of nodes in the input layer. It represents the number of independent variables.

classifier.add(
        Dense(5, kernel_initializer = 'uniform',
              activation = 'relu', input_dim=63))

Adding Second Hidden Layer

Adding the second hidden layer is similar to adding the first hidden layer.

We don’t need to specify the input_dim parameter because we have already specified it in the first hidden layer. In the first hidden layer we specified this in order to let the layer know how many input nodes to expect. In the second hidden layer the ANN already knows how many input nodes to expect so we don’t need to repeat ourselves.

classifier.add(
        Dense(5, kernel_initializer = 'uniform',
              activation = 'relu'))

Adding the Output layer

We change the first parameter because in our output node we expect one node. This is because we are only interested in knowing whether a claim was fraudulent or not. We change the activation function because we want to get the probabilities that a claim is fraudulent. We do this by using the Sigmoid activation function. In case you’re dealing with a classification problem that has more than two classes (i.e. classifying cats, dogs, and monkeys) we’d need to change two things. We ‘d change the first parameter to 3 and change the activation function to softmax. Softmax is a sigmoid function applied to an independent variable with more than two categories.

classifier.add(
     Dense(1, kernel_initializer = 'uniform',
           activation = 'sigmoid'))

Compiling the ANN

Compiling is basically applying a stochastic gradient descent to the whole neural network. The first parameter is the algorithm you want to use to get the optimal set of weights in the neural network. The algorithm used here is a stochastic gradient algorithm. There are many variants of this. A very efficient one to use is adam. The second parameter is the loss function within the stochastic gradient algorithm. Since our categories are binary we use the binary_crossentropy loss function. Otherwise we would have used categorical_crossentopy. The final argument is the criterion we’ll use to evaluate our model. In this case we use the accuracy.

classifier.compile(optimizer= 'adam',
                  loss = 'binary_crossentropy',
                  metrics = ['accuracy'])

classifier.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_13 (Dense)             (None, 5)                 320       
_________________________________________________________________
dense_14 (Dense)             (None, 5)                 30        
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 6         
=================================================================
Total params: 356
Trainable params: 356
Non-trainable params: 0
_________________________________________________________________

Fitting ANN to the training set

X_train represents the independent variables we’re using to train our ANN, and y_train represents the column we’re predicting. Epochs represents the number of times we’re going to pass our full dataset through the ANN. Batch_size is the number of observations after which the weights will be updated.

classifier.fit(X_train, y_train, batch_size = 10, epochs = 30, verbose=2, validation_data = [X_test, y_test])

Train on 700 samples, validate on 300 samples
Epoch 1/30
 - 0s - loss: 0.0767 - acc: 0.9843 - val_loss: 1.5046 - val_acc: 0.7333
Epoch 2/30
 - 0s - loss: 0.0752 - acc: 0.9871 - val_loss: 1.5116 - val_acc: 0.7233
Epoch 3/30
 - 0s - loss: 0.0733 - acc: 0.9871 - val_loss: 1.5220 - val_acc: 0.7233
Epoch 4/30
 - 0s - loss: 0.0728 - acc: 0.9871 - val_loss: 1.5476 - val_acc: 0.7367
Epoch 5/30
 - 0s - loss: 0.0716 - acc: 0.9886 - val_loss: 1.5337 - val_acc: 0.7333
Epoch 6/30
 - 0s - loss: 0.0709 - acc: 0.9871 - val_loss: 1.5334 - val_acc: 0.7200
Epoch 7/30
 - 0s - loss: 0.0692 - acc: 0.9886 - val_loss: 1.5594 - val_acc: 0.7300
Epoch 8/30
 - 0s - loss: 0.0694 - acc: 0.9871 - val_loss: 1.5727 - val_acc: 0.7200
Epoch 9/30
 - 0s - loss: 0.0676 - acc: 0.9871 - val_loss: 1.5844 - val_acc: 0.7300
Epoch 10/30
 - 0s - loss: 0.0684 - acc: 0.9871 - val_loss: 1.5828 - val_acc: 0.7300
Epoch 11/30
 - 0s - loss: 0.0670 - acc: 0.9871 - val_loss: 1.5720 - val_acc: 0.7233
Epoch 12/30
 - 0s - loss: 0.0648 - acc: 0.9886 - val_loss: 1.6168 - val_acc: 0.7333
Epoch 13/30
 - 0s - loss: 0.0646 - acc: 0.9871 - val_loss: 1.6149 - val_acc: 0.7167
Epoch 14/30
 - 0s - loss: 0.0631 - acc: 0.9886 - val_loss: 1.6409 - val_acc: 0.7233
Epoch 15/30
 - 0s - loss: 0.0627 - acc: 0.9886 - val_loss: 1.6567 - val_acc: 0.7267
Epoch 16/30
 - 0s - loss: 0.0624 - acc: 0.9886 - val_loss: 1.6462 - val_acc: 0.7267
Epoch 17/30
 - 0s - loss: 0.0615 - acc: 0.9886 - val_loss: 1.6664 - val_acc: 0.7300
Epoch 18/30
 - 0s - loss: 0.0605 - acc: 0.9886 - val_loss: 1.6580 - val_acc: 0.7267
Epoch 19/30
 - 0s - loss: 0.0592 - acc: 0.9886 - val_loss: 1.6837 - val_acc: 0.7267
Epoch 20/30
 - 0s - loss: 0.0595 - acc: 0.9886 - val_loss: 1.6930 - val_acc: 0.7300
Epoch 21/30
 - 0s - loss: 0.0589 - acc: 0.9886 - val_loss: 1.7183 - val_acc: 0.7167
Epoch 22/30
 - 0s - loss: 0.0584 - acc: 0.9886 - val_loss: 1.7377 - val_acc: 0.7267
Epoch 23/30
 - 0s - loss: 0.0584 - acc: 0.9886 - val_loss: 1.7151 - val_acc: 0.7167
Epoch 24/30
 - 0s - loss: 0.0567 - acc: 0.9886 - val_loss: 1.7325 - val_acc: 0.7200
Epoch 25/30
 - 0s - loss: 0.0570 - acc: 0.9886 - val_loss: 1.7250 - val_acc: 0.7200
Epoch 26/30
 - 0s - loss: 0.0563 - acc: 0.9886 - val_loss: 1.7345 - val_acc: 0.7233
Epoch 27/30
 - 0s - loss: 0.0566 - acc: 0.9886 - val_loss: 1.7609 - val_acc: 0.7200
Epoch 28/30
 - 0s - loss: 0.0566 - acc: 0.9886 - val_loss: 1.7662 - val_acc: 0.7333
Epoch 29/30
 - 0s - loss: 0.0556 - acc: 0.9886 - val_loss: 1.7704 - val_acc: 0.7300
Epoch 30/30
 - 0s - loss: 0.0541 - acc: 0.9886 - val_loss: 1.7940 - val_acc: 0.7267

<keras.callbacks.History at 0x38e9b278>

Predicting using the training set

y_pred = classifier.predict(X_test)
y_pred[0:10]

array([[  9.79284209e-07],
       [  9.70875135e-07],
       [  9.70475185e-07],
       [  9.70506676e-07],
       [  9.70475185e-07],
       [  9.70530778e-07],
       [  9.70475185e-07],
       [  9.70475185e-07],
       [  9.73435817e-07],
       [  9.70475185e-07]], dtype=float32)

y_pred = (y_pred > 0.5)
y_pred[0:10]

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False]], dtype=bool)

Checking the confusion matrix

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[225,   0],
       [ 75,   0]], dtype=int64)

	months_as_customer	age	policy_number	policy_bind_date	policy_state	policy_csl	policy_deductable	policy_annual_premium	umbrella_limit	insured_zip	insured_sex	insured_education_level	insured_occupation	insured_hobbies	insured_relationship	capital-gains	capital-loss	incident_date	incident_type	collision_type	incident_severity	authorities_contacted	incident_state	incident_city	incident_location	incident_hour_of_the_day	number_of_vehicles_involved	property_damage	bodily_injuries	witnesses	police_report_available	total_claim_amount	injury_claim	property_claim	vehicle_claim	auto_make	auto_model	auto_year	fraud_reported
0	328	48	521585	2014-10-17	OH	250/500	1000	1406.91	0	466132	MALE	MD	craft-repair	sleeping	husband	53300	0	2015-01-25	Single Vehicle Collision	Side Collision	Major Damage	Police	SC	Columbus	9935 4th Drive	5	1	YES	1	2	YES	71610	6510	13020	52080	Saab	92x	2004	Y
1	228	42	342868	2006-06-27	IN	250/500	2000	1197.22	5000000	468176	MALE	MD	machine-op-inspct	reading	other-relative	0	0	2015-01-21	Vehicle Theft	?	Minor Damage	Police	VA	Riverwood	6608 MLK Hwy	8	1	?	0	0	?	5070	780	780	3510	Mercedes	E400	2007	Y
2	134	29	687698	2000-09-06	OH	100/300	2000	1413.14	5000000	430632	FEMALE	PhD	sales	board-games	own-child	35100	0	2015-02-22	Multi-vehicle Collision	Rear Collision	Minor Damage	Police	NY	Columbus	7121 Francis Lane	7	3	NO	2	3	NO	34650	7700	3850	23100	Dodge	RAM	2007	N
3	256	41	227811	1990-05-25	IL	250/500	2000	1415.74	6000000	608117	FEMALE	PhD	armed-forces	board-games	unmarried	48900	-62400	2015-01-10	Single Vehicle Collision	Front Collision	Major Damage	Police	OH	Arlington	6956 Maple Drive	5	1	?	1	2	NO	63400	6340	6340	50720	Chevrolet	Tahoe	2014	Y
4	228	44	367455	2014-06-06	IL	500/1000	1000	1583.91	6000000	610706	MALE	Associate	sales	board-games	unmarried	66000	-46000	2015-02-17	Vehicle Theft	?	Minor Damage	None	NY	Arlington	3041 3rd Ave	20	1	NO	0	1	NO	6500	1300	650	4550	Accura	RSX	2009	N

	months_as_customer	age	policy_deductable	policy_annual_premium	umbrella_limit	insured_sex	insured_education_level	capital-gains	capital-loss	incident_severity	incident_hour_of_the_day	number_of_vehicles_involved	bodily_injuries	witnesses	total_claim_amount	fraud_reported	policy_state_IN	policy_state_OH	csl_per_person_250	csl_per_person_500	csl_per_accident_300	csl_per_accident_500	insured_occupation_armed-forces	insured_occupation_craft-repair	insured_occupation_machine-op-inspct	insured_occupation_sales	insured_hobbies_board-games	insured_hobbies_reading	insured_hobbies_sleeping	insured_relationship_other-relative	insured_relationship_own-child	insured_relationship_unmarried	incident_type_Single Vehicle Collision	incident_type_Vehicle Theft	authorities_contacted_None	authorities_contacted_Police	incident_state_NY	incident_state_OH	incident_state_SC	incident_state_VA	auto_make_Chevrolet	auto_make_Dodge	auto_make_Mercedes	auto_make_Saab	auto_model_92x	auto_model_E400	auto_model_RAM	auto_model_RSX	auto_model_Tahoe
0	328	48	1000	1406.91	0	1	5	53300	0	2	5	1	1	2	71610	1	0	1	1	0	0	1	0	1	0	0	0	0	1	0	0	0	1	0	0	1	0	0	1	0	0	0	0	1	1	0	0	0	0
1	228	42	2000	1197.22	5000000	1	5	0	0	1	8	1	0	0	5070	1	1	0	1	0	0	1	0	0	1	0	0	1	0	1	0	0	0	1	0	1	0	0	0	1	0	0	1	0	0	1	0	0	0
2	134	29	2000	1413.14	5000000	0	6	35100	0	1	7	3	2	3	34650	0	0	1	0	0	1	0	0	0	0	1	1	0	0	0	1	0	0	0	0	1	1	0	0	0	0	1	0	0	0	0	1	0	0
3	256	41	2000	1415.74	6000000	0	6	48900	-62400	2	5	1	1	2	63400	1	0	0	1	0	0	1	1	0	0	0	1	0	0	0	0	1	1	0	0	1	0	1	0	0	1	0	0	0	0	0	0	0	1
4	228	44	1000	1583.91	6000000	1	2	66000	-46000	1	20	1	0	1	6500	0	0	0	0	1	0	0	0	0	0	1	1	0	0	0	0	1	0	1	1	0	1	0	0	0	0	0	0	0	0	0	0	1	0

	Chi-2	Feature	LightGBM	Logistics	Pearson	RFE	Random Forest	Total
1	True	witnesses	True	True	True	True	True	6
2	True	umbrella_limit	True	True	True	True	True	6
3	True	insured_relationship_wife	True	True	True	True	True	6
4	True	insured_relationship_other-relative	True	True	True	True	True	6
5	True	insured_relationship_not-in-family	True	True	True	True	True	6
6	True	insured_occupation_exec-managerial	True	True	True	True	True	6
7	True	insured_occupation_craft-repair	True	True	True	True	True	6
8	True	insured_hobbies_cross-fit	True	True	True	True	True	6
9	True	insured_hobbies_chess	True	True	True	True	True	6
10	True	incident_state_WV	True	True	True	True	True	6
11	True	incident_state_NY	True	True	True	True	True	6
12	True	csl_per_person_500	True	True	True	True	True	6
13	True	capital-gains	True	True	True	True	True	6
14	True	auto_make_Audi	True	True	True	True	True	6
15	True	authorities_contacted_Other	True	True	True	True	True	6
16	True	total_claim_amount	True	False	True	True	True	5
17	True	policy_state_OH	True	False	True	True	True	5
18	True	number_of_vehicles_involved	True	False	True	True	True	5
19	True	insured_sex	True	False	True	True	True	5
20	True	insured_occupation_tech-support	True	False	True	True	True	5
21	True	insured_occupation_protective-serv	True	True	True	True	False	5
22	True	insured_occupation_priv-house-serv	True	True	True	True	False	5
23	True	insured_occupation_other-service	True	True	True	True	False	5
24	True	insured_occupation_handlers-cleaners	True	True	True	True	False	5
25	True	insured_occupation_farming-fishing	True	True	True	True	False	5
26	True	insured_hobbies_yachting	True	True	True	True	False	5
27	True	insured_hobbies_sleeping	True	True	True	True	False	5
28	True	insured_hobbies_movies	True	True	True	True	False	5
29	True	insured_hobbies_kayaking	True	True	True	True	False	5
30	True	insured_hobbies_golf	True	True	True	True	False	5
31	True	insured_hobbies_dancing	True	True	True	True	False	5
32	True	insured_hobbies_camping	True	True	True	True	False	5
33	True	insured_hobbies_bungie-jumping	True	True	True	True	False	5
34	True	insured_hobbies_board-games	True	True	True	True	False	5
35	True	incident_type_Vehicle Theft	True	True	True	True	False	5
36	True	incident_type_Single Vehicle Collision	True	False	True	True	True	5
37	True	incident_type_Parked Car	True	True	True	True	False	5
38	True	incident_state_VA	True	False	True	True	True	5
39	True	incident_state_SC	True	False	True	True	True	5
40	True	incident_state_OH	True	True	True	True	False	5
41	True	incident_severity	True	False	True	True	True	5
42	True	csl_per_person_250	True	False	True	True	True	5
43	True	csl_per_accident_300	True	False	True	True	True	5
44	True	bodily_injuries	True	False	True	True	True	5
45	True	auto_model_X6	True	True	True	True	False	5
46	True	auto_model_Wrangler	True	True	True	True	False	5
47	True	auto_model_TL	True	True	True	True	False	5
48	True	auto_model_RSX	True	True	True	True	False	5
49	True	auto_model_RAM	True	True	True	True	False	5
50	True	auto_model_Pathfinder	True	True	True	True	False	5
51	True	auto_model_Neon	True	True	True	True	False	5
52	True	auto_model_Maxima	True	True	True	True	False	5
53	True	auto_model_Malibu	True	True	True	True	False	5
54	True	auto_model_ML350	True	True	True	True	False	5
55	True	auto_model_M5	True	True	True	True	False	5
56	True	auto_model_Legacy	True	True	True	True	False	5
57	True	auto_model_Impreza	True	True	True	True	False	5
58	True	auto_model_F150	True	True	True	True	False	5
59	True	auto_model_Civic	True	True	True	True	False	5
60	True	auto_model_CRV	True	True	True	True	False	5
61	True	auto_make_Nissan	True	True	True	True	False	5
62	True	auto_make_Mercedes	True	False	True	True	True	5
63	True	authorities_contacted_None	True	True	True	True	False	5
64	False	policy_state_IN	True	True	False	True	True	4
65	False	policy_deductable	True	False	True	True	True	4
66	False	policy_annual_premium	True	False	True	True	True	4
67	False	months_as_customer	True	False	True	True	True	4
68	True	insured_relationship_own-child	True	False	True	False	True	4
69	True	insured_occupation_transport-moving	True	False	True	True	False	4
70	True	insured_hobbies_reading	True	True	False	True	False	4
71	True	insured_hobbies_exercise	True	False	True	True	False	4
72	True	insured_hobbies_basketball	True	False	True	True	False	4
73	False	insured_education_level	True	False	True	True	True	4
74	False	incident_hour_of_the_day	True	True	False	True	True	4
75	True	csl_per_accident_500	True	False	True	False	True	4
76	False	capital-loss	True	False	True	True	True	4
77	True	auto_model_Ultima	True	False	True	True	False	4
78	True	auto_model_Tahoe	True	False	True	True	False	4
79	True	auto_model_Silverado	True	False	True	True	False	4
80	True	auto_model_Jetta	True	False	True	True	False	4
81	True	auto_model_Forrestor	True	False	True	True	False	4
82	True	auto_model_Corolla	True	False	True	True	False	4
83	True	auto_model_Camry	True	False	True	True	False	4
84	True	auto_model_C300	True	False	True	True	False	4
85	True	auto_model_A5	True	False	True	True	False	4
86	True	auto_model_95	True	False	True	True	False	4
87	True	auto_model_93	True	False	True	True	False	4
88	True	auto_model_92x	True	False	True	True	False	4
89	True	auto_make_Volkswagen	True	False	True	True	False	4
90	True	auto_make_Saab	True	False	True	True	False	4
91	True	auto_make_Jeep	True	False	True	True	False	4
92	True	authorities_contacted_Police	True	False	True	False	True	4
93	True	authorities_contacted_Fire	True	False	True	False	True	4
94	False	age	True	True	False	True	True	4
95	True	insured_occupation_sales	True	False	True	False	False	3
96	True	insured_occupation_prof-specialty	True	False	True	False	False	3
97	True	insured_hobbies_video-games	True	False	True	False	False	3
98	True	insured_hobbies_skydiving	True	False	False	True	False	3
99	True	insured_hobbies_polo	True	False	True	False	False	3
100	False	incident_state_PA	True	True	False	True	False	3
101	True	auto_model_X5	True	False	True	False	False	3
102	True	auto_model_Passat	True	False	False	True	False	3
103	True	auto_model_MDX	True	False	False	True	False	3
104	True	auto_model_Fusion	True	False	True	False	False	3
105	False	auto_model_Escape	True	True	False	True	False	3
106	True	auto_make_Toyota	True	False	True	False	False	3
107	True	auto_make_Ford	True	False	True	False	False	3
108	True	auto_make_Chevrolet	True	False	True	False	False	3
109	True	auto_make_BMW	True	False	True	False	False	3
110	False	insured_relationship_unmarried	True	False	False	False	True	2
111	False	insured_occupation_machine-op-inspct	True	False	False	True	False	2
112	True	insured_hobbies_paintball	True	False	False	False	False	2
113	False	insured_hobbies_hiking	True	False	False	True	False	2
114	False	auto_model_Highlander	True	False	False	True	False	2
115	False	auto_model_Grand Cherokee	True	False	False	True	False	2
116	False	auto_make_Suburu	True	False	False	True	False	2
117	False	insured_occupation_armed-forces	True	False	False	False	False	1
118	False	auto_model_E400	True	False	False	False	False	1
119	False	auto_model_Accord	True	False	False	False	False	1
120	False	auto_model_A3	True	False	False	False	False	1
121	False	auto_make_Honda	True	False	False	False	False	1
122	False	auto_make_Dodge	True	False	False	False	False	1