An insurance company has approached you with a dataset of its clients' past claims. The company wants you to develop a model that predicts which claims look fraudulent. A reliable model could save the company millions of dollars annually.
#Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#Load the dataset into a dataframe
df = pd.read_csv('insurance_claims.csv')
df.head()
#Check the shape of the dataframe
df.shape
#check the data types of each column
df.info()
# Columns that contain at least one missing value
df.columns[df.isnull().any()]
# Class balance of the target variable ('Y'/'N' for fraud reported)
df.fraud_reported.value_counts()
# Pairwise scatter/histogram plots of all numeric columns
sns.pairplot(df)
# Distinct values of a few discrete numeric columns
df.policy_deductable.unique()
df.number_of_vehicles_involved.unique()
df.number_of_vehicles_involved.value_counts()
df.bodily_injuries.unique()
df.witnesses.unique()
# Bucket counts for capital-gains: zero / (0, 50k] / above 50k
df[df['capital-gains'] == 0].shape[0]
df[(df['capital-gains'] > 0) & (df['capital-gains'] <=50000)].shape[0]
df[df['capital-gains'] > 50000].shape[0]
# Bucket counts for capital-loss: zero / [-50k, 0) / [-100k, -50k)
df[df['capital-loss'] == 0].shape[0]
df[(df['capital-loss'] < 0) & (df['capital-loss'] >=-50000)].shape[0]
df[(df['capital-loss'] >= -100000) & (df['capital-loss'] < -50000)].shape[0]
df.umbrella_limit.value_counts()
# Age distribution in 5-year bins
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 — consider histplot/displot
sns.distplot(df.age, bins=np.arange(19,64,5))
# Columns with object (string) dtype — the categorical candidates
objectcols = [col for col in df.columns if df[col].dtype == 'O']
objectcols
#Plot the distribution of categorical variables
# One countplot per categorical column on a 5x4 grid (19 plots).
# The original repeated the subplot/countplot pair 19 times (reusing the
# same `sub9` variable); a loop produces the identical figure.
fig = plt.figure(figsize=(12, 12))
categorical_columns = [
    'policy_state', 'policy_csl', 'insured_sex', 'insured_education_level',
    'insured_occupation', 'insured_hobbies', 'insured_relationship',
    'incident_type', 'collision_type', 'incident_severity',
    'authorities_contacted', 'incident_state', 'incident_city',
    'incident_location', 'property_damage', 'police_report_available',
    'auto_make', 'auto_model', 'fraud_reported',
]
for position, column in enumerate(categorical_columns, start=1):
    plt.subplot(5, 4, position)  # subplot positions are 1-based
    sns.countplot(x=column, data=df)
fig.tight_layout()
plt.show()
# Encode the target: fraud reported 'Y' -> 1, 'N' -> 0
df.fraud_reported = df.fraud_reported.map({'Y': 1, 'N': 0})
# BUG FIX: the original used .map({'?': 'UNKNOWN'}), but Series.map sends
# every value NOT present in the dict to NaN, wiping out all legitimate
# categories. Series.replace substitutes only '?' and leaves the rest intact.
df.property_damage = df.property_damage.replace('?', 'UNKNOWN')
df.police_report_available = df.police_report_available.replace('?', 'UNKNOWN')
df.collision_type = df.collision_type.replace('?', 'UNKNOWN')
# Binary-encode sex
df.insured_sex = df.insured_sex.map({'MALE': 1, 'FEMALE': 0})
# Ordinal-encode education level (College and Associate share a rank)
df.insured_education_level = df.insured_education_level.map(
    {'High School': 1, 'College': 2, 'Associate': 2, 'JD': 3,
     'Masters': 4, 'MD': 5, 'PhD': 6})
# Ordinal-encode incident severity
# NOTE(review): 'Trivial Damage' (4) ranks above 'Total Loss' (3) — confirm
# this ordering is intended before treating the column as ordinal.
df.incident_severity = df.incident_severity.map(
    {'Minor Damage': 1, 'Major Damage': 2, 'Total Loss': 3, 'Trivial Damage': 4})
# Split the combined-single-limit string 'per_person/per_accident' once,
# instead of calling str.split twice as the original did.
csl_split = df.policy_csl.str.split('/', expand=True)
df['csl_per_person'] = csl_split[0]
df['csl_per_accident'] = csl_split[1]
# Columns to feed the model, in the order they should appear in df_final.
features = [
    'months_as_customer', 'age', 'policy_state', 'csl_per_person',
    'csl_per_accident', 'policy_deductable', 'policy_annual_premium',
    'umbrella_limit', 'insured_sex', 'insured_education_level',
    'insured_occupation', 'insured_hobbies', 'insured_relationship',
    'capital-gains', 'capital-loss', 'incident_type', 'collision_type',
    'incident_severity', 'authorities_contacted', 'incident_state',
    'incident_hour_of_the_day', 'number_of_vehicles_involved',
    'property_damage', 'bodily_injuries', 'witnesses',
    'police_report_available', 'total_claim_amount', 'auto_make',
    'auto_model', 'fraud_reported',
]
# One-hot encode the remaining categorical columns; drop_first avoids
# the redundant (perfectly collinear) dummy column per category.
df_final = pd.get_dummies(df[features], drop_first=True)
df_final.head()
#We use sklearn’s train_test_split to split the data into a training set and a test set.
# NOTE(review): train_test_split is imported here but not called in this
# section — verify the split happens later, or remove the import.
from sklearn.model_selection import train_test_split
# Feature matrix: every encoded column except the target
X = df_final.drop('fraud_reported', axis=1)
# Target vector as a plain numpy array (already mapped to 0/1 above)
y = df['fraud_reported'].values
# Keep the feature names for later reference (e.g. feature importances)
feature_name = X.columns.tolist()
Because model training involves a massive number of numeric computations, feature scaling is essential: it standardizes the ranges of the independent variables so that no single feature dominates purely because of its units.
# Sanity checks: dimensions and container types of the features and target
print(X.shape)
print(y.shape)
print(type(X))
print(type(y))
# apply PCA on X — plot cumulative explained variance vs. component count
from sklearn.decomposition import PCA
pca = PCA().fit(X)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
#plt.annotate('90',xy=(90, .90))
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# BUG FIX: LDA supports at most min(n_classes - 1, n_features) components.
# fraud_reported is binary (two classes), so the maximum is 1;
# n_components=10 makes scikit-learn raise a ValueError.
lda = LinearDiscriminantAnalysis(n_components=1)
X_r2 = lda.fit(X, y).transform(X)
# Percentage of variance explained by each discriminant component
print('LDA explained variance ratio: %s'
      % str(lda.explained_variance_ratio_))
#Check the correlations among feature variables
# Heatmap of the pairwise correlation matrix of the encoded features.
correlation_matrix = df_final.corr()
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(correlation_matrix, ax=ax);