Machine Learning Through Real-World Problems - Week 4
Minimizing Subscription Churn Through Data Analysis
This week's project is about minimizing churn on a subscription product by analyzing users' financial habits!
Subscription products are often a company's main source of revenue, so we need to identify the patterns of users who end up cancelling and act to prevent it.
Model target: every customer currently subscribed to our company's subscription product
Project goal: predict which users are likely to churn so that they can be brought back to re-subscribe
Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
dataset = pd.read_csv('churn_data.csv') # Users who were 60 days enrolled, churn in the next 30
EDA
dataset.head(5) # Viewing the Data
dataset.columns
dataset.describe() # Distribution of Numerical Variables

dataset[dataset.credit_score < 300]
dataset = dataset[dataset.credit_score >= 300]
dataset.isna().any()
dataset.isna().sum()
dataset = dataset.drop(columns = ['credit_score', 'rewards_earned'])
dataset2 = dataset.drop(columns = ['user', 'churn'])
fig = plt.figure(figsize=(15, 12))
plt.suptitle('Histograms of Numerical Columns', fontsize=20)
for i in range(1, dataset2.shape[1] + 1):
    plt.subplot(6, 5, i)
    f = plt.gca()
    f.axes.get_yaxis().set_visible(False)
    f.set_title(dataset2.columns.values[i - 1])
    vals = np.size(dataset2.iloc[:, i - 1].unique())
    plt.hist(dataset2.iloc[:, i - 1], bins=vals, color='#3F5D7D')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

dataset2 = dataset[['housing', 'is_referred', 'app_downloaded',
                    'web_user', 'app_web_user', 'ios_user',
                    'android_user', 'registered_phones', 'payment_type',
                    'waiting_4_loan', 'cancelled_loan',
                    'received_loan', 'rejected_loan', 'zodiac_sign',
                    'left_for_two_month_plus', 'left_for_one_month']]
fig = plt.figure(figsize=(15, 12))
plt.suptitle('Pie Chart Distributions', fontsize=20)
for i in range(1, dataset2.shape[1] + 1):
    plt.subplot(6, 3, i)
    f = plt.gca()
    f.axes.get_yaxis().set_visible(False)
    f.set_title(dataset2.columns.values[i - 1])
    values = dataset2.iloc[:, i - 1].value_counts(normalize=True).values
    index = dataset2.iloc[:, i - 1].value_counts(normalize=True).index
    plt.pie(values, labels=index, autopct='%1.1f%%')
    plt.axis('equal')
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

Exploring Uneven Features
dataset[dataset2.waiting_4_loan == 1].churn.value_counts()
dataset[dataset2.cancelled_loan == 1].churn.value_counts()
dataset[dataset2.received_loan == 1].churn.value_counts()
dataset[dataset2.rejected_loan == 1].churn.value_counts()
dataset[dataset2.left_for_one_month == 1].churn.value_counts()
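The value_counts calls above only show raw frequencies per group. As a hedged complement of my own (not part of the original notes), the same information can be summarized as churn rates, which makes these sparse flags easier to compare:
# Hedged addition: churn rate among users where each rare flag equals 1.
# Assumes dataset / dataset2 are the frames defined above.
for col in ['waiting_4_loan', 'cancelled_loan', 'received_loan',
            'rejected_loan', 'left_for_one_month']:
    group = dataset[dataset2[col] == 1]
    print(col, 'n =', len(group), 'churn rate =', round(group.churn.mean(), 3))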
Correlation with Response Variable
dataset2.drop(columns=['housing', 'payment_type',
                       'registered_phones', 'zodiac_sign']
              ).corrwith(dataset.churn).plot.bar(figsize=(20, 10),
                                                 title='Correlation with Response variable',
                                                 fontsize=15, rot=45,
                                                 grid=True)
Correlation Matrix
sn.set(style="white")
corr = dataset.drop(columns=['user', 'churn']).corr()
mask = np.zeros_like(corr, dtype=bool)  # np.bool is removed in recent NumPy versions
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(18, 15))
cmap = sn.diverging_palette(220, 10, as_cmap=True)
sn.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
           square=True, linewidths=.5, cbar_kws={"shrink": .5})

Removing Correlated Fields
dataset = dataset.drop(columns = ['app_web_user'])
dataset.to_csv('new_churn_data.csv', index = False)
Import Data
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
dataset = pd.read_csv('churn_data.csv') # Users who were 60 days enrolled, churn in the next 30
dataset = dataset.drop(columns = ['app_web_user'])
Data Preparation
user_identifier = dataset['user']
dataset = dataset.drop(columns = ['user'])
# Cleaning Data
dataset[dataset.credit_score < 300]
dataset = dataset[dataset.credit_score >= 300]
# Removing NaN
dataset.isna().any()
dataset.isna().sum()
dataset = dataset.drop(columns = ['credit_score', 'rewards_earned'])
One Hot Encoding
dataset.housing.value_counts()
dataset.groupby('housing')['churn'].nunique().reset_index()
dataset = pd.get_dummies(dataset)
dataset.columns
dataset = dataset.drop(columns=['housing_na', 'zodiac_sign_na', 'payment_type_na'])
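To make the one-hot step concrete, here is a minimal toy example of my own (the toy frame is hypothetical, not the project dataset). pd.get_dummies turns each category into its own 0/1 column, which is why a literal 'na' category in the raw data produces columns such as housing_na that are dropped above.
# Toy illustration of pd.get_dummies (hypothetical values, not churn_data.csv).
toy = pd.DataFrame({'housing': ['O', 'R', 'na', 'O']})
print(pd.get_dummies(toy))
# Produces housing_O, housing_R, housing_na as 0/1 indicator columns.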
Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns='churn'), dataset['churn'],
                                                     test_size=0.2,
                                                     random_state=0)
Balancing the Training Set
y_train.value_counts()
pos_index = y_train[y_train.values == 1].index
neg_index = y_train[y_train.values == 0].index
if len(pos_index) > len(neg_index):
    higher = pos_index
    lower = neg_index
else:
    higher = neg_index
    lower = pos_index
np.random.seed(0)  # seed NumPy's RNG, since np.random.choice is used below
higher = np.random.choice(higher, size=len(lower))
lower = np.asarray(lower)
new_indexes = np.concatenate((lower, higher))
X_train = X_train.loc[new_indexes, ]
y_train = y_train[new_indexes]
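A small sanity check of my own, assuming the downsampling above ran as intended: both classes should now appear in equal numbers in the training set.
# Hedged sanity check: the two classes should now be balanced.
print(y_train.value_counts())
print(X_train.shape, y_train.shape)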
Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))
X_test2 = pd.DataFrame(sc_X.transform(X_test))
X_train2.columns = X_train.columns.values
X_test2.columns = X_test.columns.values
X_train2.index = X_train.index.values
X_test2.index = X_test.index.values
X_train = X_train2
X_test = X_test2
Model Building
Fitting Model to the Training Set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)
precision_score(y_test, y_pred) # tp / (tp + fp)
recall_score(y_test, y_pred) # tp / (tp + fn)
f1_score(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = (0, 1), columns = (0, 1))
plt.figure(figsize = (10,7))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='g')
print("Test Data Accuracy: %0.4f" % accuracy_score(y_test, y_pred))

from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print("Logistic Regression Accuracy: %0.3f (+/- %0.3f)" % (accuracies.mean(), accuracies.std() * 2))
pd.concat([pd.DataFrame(X_train.columns, columns=["features"]),
           pd.DataFrame(np.transpose(classifier.coef_), columns=["coef"])
           ], axis=1)

Feature Selection
Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe = RFE(classifier, n_features_to_select=20)  # keep the 20 most predictive features
rfe = rfe.fit(X_train, y_train)
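A quick way to see what RFE decided (my addition): support_ is a boolean mask over the columns, and ranking_ assigns 1 to every kept feature and larger numbers to features eliminated earlier.
# Hedged addition: inspect which features RFE kept.
print(X_train.columns[rfe.support_])
print(pd.Series(rfe.ranking_, index=X_train.columns).sort_values())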
New Correlation Matrix
sn.set(style="white")
corr = X_train[X_train.columns[rfe.support_]].corr()
mask = np.zeros_like(corr, dtype=bool)  # np.bool is removed in recent NumPy versions
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(18, 15))
# Generate a custom diverging colormap
cmap = sn.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sn.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
           square=True, linewidths=.5, cbar_kws={"shrink": .5})

Fitting Model to the Training Set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train[X_train.columns[rfe.support_]], y_train)
# Predicting Test Set
y_pred = classifier.predict(X_test[X_train.columns[rfe.support_]])
Evaluating Results
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)
precision_score(y_test, y_pred) # tp / (tp + fp)
recall_score(y_test, y_pred) # tp / (tp + fn)
f1_score(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = (0, 1), columns = (0, 1))  # confusion_matrix orders labels ascending
plt.figure(figsize = (10,7))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, fmt='g')
print("Test Data Accuracy: %0.4f" % accuracy_score(y_test, y_pred))

Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier,
                             X=X_train[X_train.columns[rfe.support_]],
                             y=y_train, cv=10)
print("Logistic Regression Accuracy: %0.3f (+/- %0.3f)" % (accuracies.mean(), accuracies.std() * 2))
Analyzing Coefficients
pd.concat([pd.DataFrame(X_train[X_train.columns[rfe.support_]].columns, columns=["features"]),
           pd.DataFrame(np.transpose(classifier.coef_), columns=["coef"])
           ], axis=1)

Formatting Final Results
final_results = pd.concat([y_test, user_identifier], axis=1).dropna()
final_results['predicted_churn'] = y_pred
final_results = final_results[['user', 'churn', 'predicted_churn']].reset_index(drop=True)
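For a quick look at the output (my addition), the first few rows pair each test-set user with the true label and the model's prediction; users flagged with predicted_churn == 1 are the ones to target with retention efforts, which is the stated project goal.
# Hedged usage example: a quick look at the formatted results.
print(final_results.head())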
This study session was based on the Udemy course <【한글자막】 Machine Learning 완벽 실습 : 6가지 실제 사례 직접 해결하기> (Machine Learning Practical: 6 Real-World Applications, with Korean subtitles). More details about the course can be found below.
The Pming (프밍) study group is run in partnership with Udemy Korea.