!pip install statsmodels
!pip install -U imbalanced-learn
from IPython.display import IFrame
IFrame("report.pdf", width=1000, height=600)
• What are the key features behind purchasing a vehicle?
• How large an influence does price have on customers’ purchasing behaviour?
• Plots and tables detailing your exploration of the data
• A basic statistical or machine learning model that can explain the data
• A discussion of the 2 key questions
• A plan for future work and how the insight gained could be actioned in a business scenario
# %matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import RandomOverSampler
Checking the number of features in the data
!ls -l -h
!head -1 Sample\ Availability\ Data | tr '|' '\n' | wc -l
Some Important Features:
fmt_names = "v{}"
names = [fmt_names.format(i + 1) for i in range(25)]
df_raw = pd.read_csv("Sample Availability Data", sep = "|", header = None, encoding = "utf-8", names=names)
display(df_raw.head())
display(df_raw.describe())
display(df_raw.dtypes)
ts_v3 = pd.DatetimeIndex(df_raw['v3'].values.tolist())
ts_v23 = pd.DatetimeIndex(df_raw['v23'].values.tolist())
display(ts_v3[0])
display(ts_v23[0])
d = ts_v23[0] - ts_v3[0]
display(d)
display(d.total_seconds() / (60 * 60 * 24))
def encode(x):
    # Convert object (string) columns to integer category codes; numeric columns pass through unchanged
    if x.dtype == np.dtype('O'):
        return x.astype('category').cat.codes
    return x

def normalize(df):
    # Mean-centred min-max scaling
    return (df - df.mean()) / (df.max() - df.min())
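As a quick, hypothetical illustration of these helpers (the toy frame below is not part of the dataset):
# Hypothetical demo frame: the object column becomes integer category codes,
# and the numeric column is scaled around its mean.
demo = pd.DataFrame({"colour": ["red", "blue", "red"], "price": [10.0, 20.0, 30.0]})
display(demo.apply(encode))
display(normalize(demo[["price"]]))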
display(np.sort(ts_v3.month.unique()))
display(np.sort(ts_v3.hour.unique()))
display(np.sort(ts_v3.minute.unique()))
display(np.sort(ts_v3.second.unique()))
display(np.sort(ts_v23.day.unique()))
display(np.sort(ts_v23.month.unique()))
display(np.sort(ts_v23.hour.unique()))
display(np.sort(ts_v23.minute.unique()))
display(np.sort(ts_v23.second.unique()))
display(np.sort(ts_v23.year.unique()))
We applied feature extraction to derive new features from the initial timestamp features, below.
# The hour of the query date is converted into one of four day parts
# (early morning, morning, afternoon, evening).
def convert_to_day_part(ts):
    # Map hour 0 (midnight) to 24 so it falls into the evening bucket
    hour = 24 if ts.hour == 0 else ts.hour
    # "early_morning": 01:00 - 05:59
    if hour <= 5:
        return 0
    # "morning": 06:00 - 11:59
    elif hour <= 11:
        return 1
    # "afternoon": 12:00 - 17:59
    elif hour <= 17:
        return 2
    # "evening": 18:00 - 00:59
    else:
        return 3
# df_ex is the extended dataframe for the new features we will be creating below.
df_ex = df_raw.copy()
# Year is ignored since both timestamps contain only one value (2017)
df_ex["v3_day_part"] = list(map(convert_to_day_part, ts_v3))
df_ex["v3_day"] = ts_v3.day
df_ex["v3_week"] = ts_v3.week
df_ex["v3_dayofweek"] = ts_v3.dayofweek
df_ex["v3_dayofyear"] = ts_v3.dayofyear
# Time and year are ignored since both have only one value (00:00:00 and 2017, respectively)
df_ex["v23_day"] = ts_v23.day
df_ex["v23_week"] = ts_v23.week
df_ex["v23_dayofweek"] = ts_v23.dayofweek
df_ex["v23_dayofyear"] = ts_v23.dayofyear
df_ex["v23_month"] = ts_v23.month
# Taking the difference of the timestamps [pickup_date - query_date], expressed in days.
df_ex["diff_v23_v3"] = list(map(lambda x: (x[1] - x[0]).total_seconds() / (60 * 60 * 24), zip(ts_v3, ts_v23)))
Regular Booked means that the time difference is positive and the booked status is 1.
Regular Not-Booked means that the time difference is positive and the booked status is 0.
When the time difference is negative, there are two further possible cases for a query.
For booked status 1, the query is either a revisit of an already-booked ad or a booking of an expired ad; the latter should not occur in a robust system, so the revisit is the more reasonable explanation. For booked status 0, the query is treated as querying an expired ad.
# We can define new features according to the value of time difference and booked status
# Regular Booked means that time difference is positive value, and booked status is 1
df_ex['regular_booked'] = ((df_ex["diff_v23_v3"] >= 0) & (df_ex["v25"] == 1)).apply(lambda x: int(x))
nb_regular_booked = len(df_ex[df_ex['regular_booked'] == True])
# Regular Not-Booked means that time difference is positive value, and booked status is 0
df_ex['regular_not_booked'] = ((df_ex["diff_v23_v3"] >= 0) & (df_ex["v25"] == 0)).apply(lambda x: int(x))
nb_regular_not_booked = len(df_ex[df_ex['regular_not_booked'] == True])
# When the time difference is negative and the booked status is 1, there are two possible explanations:
# 1- the customer revisited the ad after booking it
# 2- the customer booked an expired ad
# However, the latter should not happen in a robust system, so the first case is the more plausible one.
df_ex['revisited_booked_ad'] = ((df_ex["diff_v23_v3"] < 0) & (df_ex["v25"] == 1)).apply(lambda x: int(x))
revisited_booked_ad = df_ex[df_ex['revisited_booked_ad'] == True]
nb_revisited_booked_ad = len(revisited_booked_ad)
# querying expired ad
df_ex['queried_expired_ad'] = ((df_ex["diff_v23_v3"] < 0) & (df_ex["v25"] == 0)).apply(lambda x: int(x))
queried_expired_ad = df_ex[df_ex['queried_expired_ad'] == True]
nb_queried_expired_ad = len(queried_expired_ad)
display('#regular_booked: {}'.format(nb_regular_booked))
display('#regular_not_booked: {}'.format(nb_regular_not_booked))
display('#revisited_booked_ad: {}'.format(nb_revisited_booked_ad))
display('#queried_expired_ad: {}'.format(nb_queried_expired_ad))
display(df_ex.head())
display(df_ex.describe())
display(df_ex.dtypes)
df_cleaned= df_ex.dropna()
df_cleaned = df_cleaned.drop(["v1", "v2", "v3", "v23"], 1)
assert df_cleaned.isnull().values.any() == False
df_tmp = df_cleaned[(df_cleaned.regular_booked == 1) & (df_cleaned.regular_not_booked == 1) ]
assert len(df_tmp) == 0
len(df_cleaned)
df_encoded = df_cleaned.apply(encode)
g = sns.clustermap(df_encoded.corr(), linewidths=1, square=True, annot=True, fmt=".3f", figsize = (25, 25))
plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0);
In this step, we investigate customer tendencies with respect to two different times: the querying time and the pickup time. Customers show noticeably different tendencies across these time periods.
def plot_percentage(by, df):
    # For booked rows, plot the percentage share of mean price (v6), booking counts,
    # and mean rental duration (v22), grouped by the given time-based column.
    df_selected = df[[by, 'v6', 'regular_booked', 'v22']][df['regular_booked'] == 1]
    figure, axes = plt.subplots(1, 3, figsize=(30, 5))
    a = df_selected.groupby(by)['v6'].mean()
    p = a / a.sum()
    ax = sns.barplot(x=p.index, y=p.values, ax=axes[0])
    ax.set_ylabel('v6 (mean %)')
    a = df_selected.groupby(by)['regular_booked'].sum()
    p = a / a.sum()
    ax = sns.barplot(x=p.index, y=p.values, ax=axes[1])
    ax.set_ylabel('regular_booked (sum %)')
    a = df_selected.groupby(by)['v22'].mean()
    p = a / a.sum()
    ax = sns.barplot(x=p.index, y=p.values, ax=axes[2])
    ax.set_ylabel('v22 (mean %)')
0: Early Morning, 1: Morning, 2: Afternoon, 3: Evening
According to the day-part plots in the 1st row
According to the week plots in the 2nd row
plot_percentage('v3_day_part', df_cleaned)
plot_percentage('v3_week', df_cleaned)
According to the month plots in the 1st row
According to the week plots in the 2nd row
plot_percentage('v23_month', df_cleaned)
plot_percentage('v23_week', df_cleaned)
As we discussed before, when creating the new feature from the two timestamps (querying date and pickup date), diff_v23_v3 is the difference D between them, which means that a car is booked D days in advance by the customer.
Since the meaning of two other features is already known (v6 is the price and v22 is the rental duration), we used them as well. We could also use the remaining continuous variables, but without knowing what they represent we cannot draw meaningful conclusions about customers.
To expose this insight clearly, we used the empirical CDF in this step.
def plot_cdf(p,
ax,
deltax=None,
xlog=False,
xlim=[0, 1],
deltay=0.25,
ylog=False,
ylim=[0,1],
xlabel = 'x'):
ecdf = sm.distributions.ECDF(p)
x = ecdf.x
y = ecdf.y
assert len(x) == len(y)
if deltax is not None:
x_ticks = np.arange(xlim[0], xlim[1] + deltax, deltax)
ax.set_xticks(x_ticks)
ax.set_xlabel(xlabel)
ax.set_xlim(xlim[0], xlim[1])
ax.vlines(np.mean(p), min(y), max(y), color='red', label='mean', linewidth=2)
ax.vlines(np.median(p), min(y), max(y), color='orange', label='median', linewidth=2)
ax.vlines(np.mean(p) + 2 * np.std(p), min(y), max(y), color='blue', label='mean + 2 * std', linewidth=2)
ax.vlines(np.mean(p) + 3 * np.std(p), min(y), max(y), color='green', label='mean + 3 * std', linewidth=2)
y_ticks = np.arange(ylim[0], ylim[1] + deltay, deltay)
ax.set_ylabel('CDF')
ax.set_yticks(y_ticks)
ax.set_ylim(ylim[0], ylim[1])
if xlog is True:
ax.set_xscale('log')
if ylog is True:
ax.set_yscale('log')
ax.grid(which='minor', alpha=0.5)
ax.grid(which='major', alpha=0.9)
ax.legend(loc=4)
sns.set_style('whitegrid')
sns.regplot(x=x, y=y, fit_reg=False, scatter=True, ax = ax)
def plot_cdf_by_feature(by, df, deltax, booked_types = ['regular_not_booked', 'regular_booked']):
cdf_by_1 = df[df[booked_types[1]] == True][by].values.tolist()
figure, axes = plt.subplots(1, figsize=(15,5))
plot_cdf(cdf_by_1,
axes,
deltax=deltax,
xlim=[0, np.mean(cdf_by_1) + 3 * np.std(cdf_by_1) + 50],
deltay = 0.05,
ylim=[0, 1.00], xlabel=by)
CDF of diff_v23_v3 on 1st graph (Time Diff)
CDF of v6 on 2nd graph (Price)
CDF of v22 on 3rd graph (Rental Duration)
# CDF of time difference on original cleaned data
plot_cdf_by_feature('diff_v23_v3', df_cleaned[['regular_not_booked', 'regular_booked', 'diff_v23_v3']], 10)
# CDF of price of car on original cleaned data
plot_cdf_by_feature('v6', df_cleaned[['regular_not_booked', 'regular_booked', 'v6']], 50)
# CDF of rental duration on original cleaned data
plot_cdf_by_feature('v22', df_cleaned[['regular_not_booked', 'regular_booked', 'v22']], 5)
To expose the distribution of the labelled feature values (0 and 1), we plotted boxplots of regular_booked and v5 against diff_v23_v3, v6 and v22.
According to the regular_booked plots in the 1st row
According to the v5 plots in the 2nd row
According to both types of plots, the labelled feature was most likely generated synthetically, because there is only a small shift between the 0 and 1 distributions in all plots. However, the number of instances per class is not balanced, as you can see in the next step. This is therefore an imbalanced-data problem, since the labelled feature is skewed.
def plot_boxplot(by, df):
figure, axes = plt.subplots(1, 3, figsize=(15,2))
ax = sns.boxplot(x=by, y="diff_v23_v3", data=df[[by, 'diff_v23_v3']], ax = axes[0])
ax.set_yscale('log')
ax = sns.boxplot(x=by, y="v6", data=df[[by, 'v6']], ax = axes[1])
ax.set_yscale('log')
ax = sns.boxplot(x=by, y="v22", data=df[[by, 'v22']], ax = axes[2])
ax.set_yscale('log')
plot_boxplot('regular_booked', df_cleaned[['regular_booked', 'diff_v23_v3', 'v6', 'v22']])
plot_boxplot('v5', df_cleaned[['v5', 'diff_v23_v3', 'v6', 'v22']])
As we discussed in the previous step, the labelled feature is skewed.
The distribution of the labels is shown below.
Before addressing that problem, we applied a few checks to verify everything so far.
grouped_by_v25 = df_cleaned.groupby('v25')['v25']
p = grouped_by_v25.size() / len(df_cleaned)
sns.barplot(x=p.index, y=p.values, log=True)
display(p)
We designed two different approaches to process the data: approach_1 trains on a stratified random sample and evaluates on the held-out remainder, while approach_2 runs 10-fold cross-validation over the full dataset.
def create_train_test(df, by, sample_size):
grouped = df.groupby(by)[by]
frac = grouped.size().values / len(df)
sample_sizes_by_class = sample_size * frac
class_0_train = df[df[by] == 0].sample(int(sample_sizes_by_class[0]))
class_0_test = df[df[by] == 0].drop(class_0_train.index)
class_1_train = df[df[by] == 1].sample(int(sample_sizes_by_class[1]))
class_1_test = df[df[by] == 1].drop(class_1_train.index)
df_train = class_0_train.append(class_1_train , ignore_index=True)
df_test = class_0_test.append(class_1_test, ignore_index=True)
X_train = df_train.drop(by, 1).values.tolist()
y_train = df_train[by].values.tolist()
X_test = df_test.drop(by, 1).values.tolist()
y_test = df_test[by].values.tolist()
return X_train, y_train, X_test, y_test
def report_classification(clf, X_train, y_train, X_test, y_test, target_names = ['not_booked', 'booked']):
y_test_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)
figure, axes = plt.subplots(1, 2, figsize=(10,5))
cm_test = confusion_matrix(y_test, y_test_pred)
df_cm_test = pd.DataFrame(cm_test, index = target_names, columns = target_names)
ax = sns.heatmap(df_cm_test, annot=True, ax = axes[0], square= True)
ax.set_title('Test CM')
cm_train = confusion_matrix(y_train, y_train_pred)
df_cm_train = pd.DataFrame(cm_train, index = target_names, columns = target_names)
ax = sns.heatmap(df_cm_train, annot=True, ax = axes[1], square= True)
ax.set_title('Train CM')
print('-' * 20 + 'Testing Performance' + '-' * 20)
print(classification_report(y_test, y_test_pred, target_names=target_names))
print('acc: ', metrics.accuracy_score(y_test, y_test_pred))
print('-' * 20 + 'Training Performance' + '-' * 20)
print(classification_report(y_train, y_train_pred, target_names=target_names))
print('acc: ', metrics.accuracy_score(y_train, y_train_pred))
def apply_cross_val(clf, X_train, y_train):
val_scores = cross_val_score(clf, X_train, y_train, cv = 10, verbose= 1)
print("Cross Validation Scores: ", val_scores)
print("Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (val_scores.mean(), val_scores.std() * 2))
def plot_importances(clf, df, target_label='v25'):
df_imp = pd.DataFrame(index=df.drop([target_label], 1).columns.tolist(),
data=clf.feature_importances_, columns=['importance'])
df_imp.sort_values(by='importance', ascending=False, inplace=True)
    print('Most important features ordered by importance')
print(df_imp.index.tolist())
figure, axes = plt.subplots(1, figsize=(15,8))
ax = sns.barplot(x = df_imp.importance, y = df_imp.index, ax=axes)
ax.set_xscale('log')
plt.grid(True, which="both")
def approach_1(clf, df, cross_val = False, target_label='v25', sample_size=1000):
X_train, y_train, X_test, y_test = create_train_test(df, target_label, sample_size)
if cross_val:
apply_cross_val(clf, X_train, y_train)
clf = clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)
report_classification(clf, X_train, y_train, X_test, y_test)
plot_importances(clf, df, target_label)
def approach_2(clf, df, cross_val=False, target_label='v25'):
    # Run 10-fold cross-validation over the whole dataframe passed in
    X_train = df.drop(target_label, 1).values.tolist()
    y_train = df[target_label].values.tolist()
    apply_cross_val(clf, X_train, y_train)
clf = RandomForestClassifier(n_jobs=-1, n_estimators=10)
As we discussed before, we removed the features that are highly correlated with v25: 'regular_booked' and 'regular_not_booked'.
df = df_cleaned.drop(['regular_booked', 'regular_not_booked'], 1)
approach_1(clf, df, True, sample_size=1000)
df = df_cleaned.drop(['regular_booked', 'regular_not_booked'], 1)
approach_2(clf, df)
There are a few sampling techniques to overcome that problem.
We tried both random oversampling and SMOTE. In the end we chose random oversampling, since it is much faster than the SVM-based SMOTE in terms of computation time on our hardware.
# from imblearn.over_sampling import SMOTE
# kind = ['regular', 'borderline1', 'borderline2', 'svm']
# ros = SMOTE(kind=kind[3])
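For reference, a rough sketch of the SVM-based variant under the current imbalanced-learn API, where the old kind= argument was split into dedicated classes; it is left commented out, like the block above, because we did not use it:
# from imblearn.over_sampling import SVMSMOTE
# smote = SVMSMOTE(sampling_strategy=0.7)
# X_smoted, y_smoted = smote.fit_resample(X, y)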
df = df_cleaned.drop(['regular_booked', 'regular_not_booked'], 1)
X = df.drop('v25', 1)
y = df['v25']
# Oversample the minority class until it reaches 70% of the majority class size
ros = RandomOverSampler(sampling_strategy=0.7)
X_resampled, y_resampled = ros.fit_resample(X, y)
df_oversampled = pd.DataFrame(X_resampled, columns=X.columns)
df_oversampled['v25'] = y_resampled
grouped_by_v25 = df_oversampled.groupby('v25')['v25']
sns.barplot(x=grouped_by_v25.size().index, y=grouped_by_v25.size().values)
df_encoded = df_oversampled.apply(encode)
g = sns.clustermap(df_encoded.corr(), linewidths=1, square=True, annot=True, fmt=".3f", figsize = (25, 25))
plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0);
According to the feature importances of the Random Forest classifier, we can comment on each feature's contribution to the labelled feature.
You can find the features ordered by their importance here:
approach_1(clf, df_oversampled, True, sample_size=10000)
clf = RandomForestClassifier(n_jobs=-1, n_estimators=10, max_depth=5, criterion='entropy')
approach_2(clf, df_oversampled)