Sunteți pe pagina 1din 17

Predicting

Employee Retention

In [1]:

import pandas as pd
import numpy as np

In [2]:

# contains total number of votes, and also if the employee is still an active employee.
churn = pd.read_csv('churn.csv')
# contains actual votes (hapiness rate of the employee)
votes = pd.read_csv('votes.csv')
# contains how many times a comment was liked.
clean = pd.read_csv('comments_clean_anonimized.csv')
# contains comments that was liked by a certain employee
interactions = pd.read_csv('commentInteractions.csv')

/home/benjamincabalonajr/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py:30
57: DtypeWarning: Columns (2,3) have mixed types. Specify dtype option on import or set low_mem
ory=False.
interactivity=interactivity, compiler=compiler, result=result)

In [3]:

interactions.head()

Out[3]:

employee companyAlias liked disliked commentId

0 307 56aec740f1ef260003e307d6 True False 58d018d7e010990004e38070

1 36 56aec740f1ef260003e307d6 True False 58d018d7e010990004e38070

2 276 56aec740f1ef260003e307d6 True False 58d018d7e010990004e38070

3 24 56aec740f1ef260003e307d6 True False 58d018d7e010990004e38070

4 382 56aec740f1ef260003e307d6 True False 58d0179ae010990004e3806d


In [4]:

clean.head()

Out[4]:

employee companyAlias commentId txt likes dislikes commentD

Mon Ma
0 307 56aec740f1ef260003e307d6 58d018d7e010990004e38070 **********************************************... 4.0 0.0 19:00:17

Mon Ma
1 382 56aec740f1ef260003e307d6 58d0179ae010990004e3806d ***************************** 1.0 2.0 18:55:16

Mon Ma
2 172 56aec740f1ef260003e307d6 58cff8cde010990004e37f6a *************************** 3.0 0.0 16:44:02

Mon Ma
3 135 56aec740f1ef260003e307d6 58cfefeee010990004e37f60 *************************** 1.0 1.0 16:06:08

Mon Ma
4 225 56aec740f1ef260003e307d6 58cfd9b4e010990004e37f52 ********************************* 3.0 2.0 14:30:50

In [5]:

votes.head()

Out[5]:

employee companyAlias voteDate vote

0 31 56aec740f1ef260003e307d6 Mon Feb 01 01:00:00 CET 2016 4

1 33 56aec740f1ef260003e307d6 Mon Feb 01 01:00:00 CET 2016 4

2 79 56aec740f1ef260003e307d6 Mon Feb 01 01:00:00 CET 2016 4

3 94 56aec740f1ef260003e307d6 Mon Feb 01 01:00:00 CET 2016 4

4 16 56aec740f1ef260003e307d6 Mon Feb 01 01:00:00 CET 2016 2

In [6]:

churn.head()

Out[6]:

employee companyAlias numVotes lastParticipationDate stillExists

0 512 56aec740f1ef260003e307d6 4 Thu Feb 23 12:48:04 CET 2017 True

1 -2 56aec740f1ef260003e307d6 0 Wed Jan 18 14:00:55 CET 2017 False

2 2 56aec740f1ef260003e307d6 72 Fri Mar 17 01:00:00 CET 2017 True

3 487 56aec740f1ef260003e307d6 14 Sat Nov 19 15:02:14 CET 2016 False

4 3 56aec740f1ef260003e307d6 22 Thu Feb 16 01:00:00 CET 2017 True

Data Cleaning
First, we will drop employees with negative value, as stated on the description of this Dataset.

In [7]:

interactions = interactions[interactions['employee']>0]

In [8]:

votes = votes[votes['employee']>0]

In [9]:

clean = clean[clean['employee']>0]
In [10]:

churn = churn[(churn['employee']>0) & (churn['numVotes']>0)]

Data Cleaning: Churn


We will first work with the churn data frame. Note that there are certain pairs (Employee,Company) that has a record that is both
Present and Absent at the company. This is due to the fact that an employee might be present in June, but could leave by July.
This will be handled by checking if employee A has an entry of both True and False . If so we will only keep false.

In [11]:

churn.sort_values(by=['employee', 'companyAlias'], ascending=True, inplace=True)

In [12]:

churn.drop_duplicates(subset=['employee','companyAlias'], keep='first', inplace=True)

Data Cleaning: Votes


Using a pivot table, we will aggregate by employee and companyAlias using the average vote. Then, we will apply the ceiling
function.

In [13]:

import math
vote_clean = pd.pivot_table(index=['employee','companyAlias'],values='vote',aggfunc=np.mean,data=votes)

In [14]:
vote_clean['vote']=vote_clean['vote'].apply(math.ceil)

In [15]:

vote_clean.reset_index(inplace = True)

In [16]:

vote_clean.head()

Out[16]:

employee companyAlias vote

0 1 5474b9cde4b0bf7614b2c66f 4

1 1 54d43612e4b0f6a40755d93e 4

2 1 54e52607e4b01191dc064966 3

3 1 5641f96713664c000332c8cd 4

4 1 56558cfd07a5de00030908fb 4

Combining Votes and Churn Data


In [17]:

churn_vote = pd.merge(churn, vote_clean, on=['employee', 'companyAlias'])


In [18]:

churn_vote.head()

Out[18]:

employee companyAlias numVotes lastParticipationDate stillExists vote

0 1 5474b9cde4b0bf7614b2c66f 2 Wed Nov 26 01:00:00 CET 2014 True 4

1 1 54d43612e4b0f6a40755d93e 16 Wed Jun 08 02:00:00 CEST 2016 False 4

2 1 54e52607e4b01191dc064966 135 Thu Sep 29 02:00:00 CEST 2016 False 3

3 1 5641f96713664c000332c8cd 257 Sun Mar 19 01:00:00 CET 2017 True 4

4 1 56558cfd07a5de00030908fb 2 Sun Nov 29 01:00:00 CET 2015 True 4

In [19]:

churn_vote.isnull().sum() # Confirming that there are no missing values.

Out[19]:

employee 0
companyAlias 0
numVotes 0
lastParticipationDate 0
stillExists 0
vote 0
dtype: int64

In [20]:
result_dupe = churn_vote[churn_vote.duplicated(subset=['employee','companyAlias'], keep=False)]

In [21]:

result_dupe # confirming that we have fixed duplicate values.

Out[21]:

employee companyAlias numVotes lastParticipationDate stillExists vote

Engagement
Total number of likes/dislikes of a comment

In [22]:

interactions['liked'] = pd.get_dummies(interactions['liked'],drop_first=True)
interactions['disliked'] = pd.get_dummies(interactions['disliked'],drop_first=True)

In [23]:

interactions.head()

Out[23]:

employee companyAlias liked disliked commentId

0 307 56aec740f1ef260003e307d6 1 0 58d018d7e010990004e38070

1 36 56aec740f1ef260003e307d6 1 0 58d018d7e010990004e38070

2 276 56aec740f1ef260003e307d6 1 0 58d018d7e010990004e38070

3 24 56aec740f1ef260003e307d6 1 0 58d018d7e010990004e38070

4 382 56aec740f1ef260003e307d6 1 0 58d0179ae010990004e3806d

In [24]:

engagement_sum = interactions.groupby(['employee', 'companyAlias']).sum()


In [25]:

engagement_sum.reset_index(inplace=True)

In [26]:

engagement_sum.head()

Out[26]:

employee companyAlias liked disliked

0 1 54e52607e4b01191dc064966 11.0 2.0

1 1 5641f96713664c000332c8cd 151.0 37.0

2 1 567011c035dce00003a07fa4 2.0 0.0

3 1 56e2a905e3b6fe0003e32855 175.0 35.0

4 1 5742d699f839a10003a407d2 599.0 69.0

In [27]:

churn_vote_engagement = pd.merge(churn_vote, engagement_sum, on=['employee', 'companyAlias'])

In [28]:
churn_vote_engagement.head()

Out[28]:

employee companyAlias numVotes lastParticipationDate stillExists vote liked disliked

Thu Sep 29 02:00:00 CEST


0 1 54e52607e4b01191dc064966 135 False 3 11.0 2.0
2016

1 1 5641f96713664c000332c8cd 257 Sun Mar 19 01:00:00 CET 2017 True 4 151.0 37.0

Wed Jun 08 13:44:07 CEST


2 1 567011c035dce00003a07fa4 1 True 3 2.0 0.0
2016

3 1 56e2a905e3b6fe0003e32855 181 Mon Mar 20 01:00:00 CET 2017 True 3 175.0 35.0

4 1 5742d699f839a10003a407d2 295 Mon Mar 20 14:11:12 CET 2017 True 4 599.0 69.0

In [29]:

churn_vote_engagement.rename(columns={'liked':'likes_given', 'disliked' :'dislikes_given'}, inplace=True)

Likeability
In [30]:
clean.drop(['txt','commentDate'],axis=1,inplace=True) # remove un necessary columns.

In [31]:

likeability = clean.groupby(['employee', 'companyAlias']).sum() #sum all likes

In [32]:

likeability.reset_index(inplace=True)

In [33]:

result_final = pd.merge(churn_vote_engagement, likeability, on=['employee', 'companyAlias'], how='inner')


In [34]:

result_final.isnull().sum() #no missing values

Out[34]:

employee 0
companyAlias 0
numVotes 0
lastParticipationDate 0
stillExists 0
vote 0
likes_given 0
dislikes_given 0
likes 0
dislikes 0
dtype: int64

In [35]:

result_final[result_final.duplicated(subset=['employee','companyAlias'])]
# Confirming there are no duplicate values
Out[35]:

employee companyAlias numVotes lastParticipationDate stillExists vote likes_given dislikes_given likes dislikes

In [36]:

#drop participation Date


result_final.drop('lastParticipationDate',axis=1,inplace=True)

In [37]:

result_final.shape

Out[37]:

(2557, 9)

In [38]:

result_final['stillExists'] = pd.get_dummies(result_final['stillExists'],drop_first=True)

In [39]:

result_final.head()

Out[39]:

employee companyAlias numVotes stillExists vote likes_given dislikes_given likes dislikes

0 1 5641f96713664c000332c8cd 257 1 4 151.0 37.0 1740.0 708.0

1 1 56e2a905e3b6fe0003e32855 181 1 3 175.0 35.0 482.0 118.0

2 1 5742d699f839a10003a407d2 295 1 4 599.0 69.0 216.0 22.0

3 1 574c423856b6300003009953 123 1 4 229.0 14.0 402.0 68.0

4 1 57908a2622881200033b34d7 113 1 4 28.0 2.0 82.0 12.0

In [ ]:

In [ ]:

In [ ]:

In [ ]:


Exploratory Data Analysis
In [40]:

import seaborn as sns


import matplotlib.pyplot as plt

%matplotlib inline
sns.set(style='darkgrid')

In [41]:

draft=(pd.pivot_table(index='companyAlias',values='stillExists',aggfunc=sum,data=result_final).reset_index()
.sort_values(by='stillExists',ascending=False).head(15))
sns.barplot(x='companyAlias',y='stillExists',data=draft)

Out[41]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f2e381ccac8>

In [ ]:

In [42]:

result_final[result_final['stillExists']==1].shape
# This implies that 93% of the employees in our cleaned dataset is still working with their current employer

Out[42]:

(2390, 9)

In [ ]:

In [43]:

result_final[result_final['stillExists']==0].shape
# This implies that 93% of the employees in our cleaned dataset is still working with their current employer
Out[43]:

(167, 9)
In [44]:

plt.figure(figsize=(12,4))
sns.countplot(result_final['stillExists'])
plt.ylabel('Counf of Employees')
plt.xlabel('Employment Status')
plt.show()

Notice how skewed our target class is. There are a lot of ways in dealing with imbalanced data set when doing predictive
modelling. (i.e. Evaluate at a different metric other than accuracy, random over/undersampling, SMOTE, using models that
penalizes frequent classes, assigning weights to the class etc.) In our case, we will use SMOTE. (by Nitesh V. Chawla et.al) We will
handle this during predictive modelling.

For the purpose of data exploration, we will only use our original data, and instead we will undersample from our majority class.
We will take a random sample of 150 obsevations from both classes.

In [ ]:

In [45]:

Positive = result_final[result_final['stillExists']==1].sample(150,random_state=42)
Negative = result_final[result_final['stillExists']==0].sample(150,random_state=42)
Sample = pd.concat([Positive,Negative])
Sample.head()
# Taking 150 random samples from each class, then combining them together
Out[45]:

employee companyAlias numVotes stillExists vote likes_given dislikes_given likes dislikes

1386 136 581b08041a0ef8000308aef6 63 1 4 2.0 1.0 58.0 8.0

1486 151 5641f96713664c000332c8cd 57 1 3 11.0 1.0 172.0 12.0

559 38 56aec740f1ef260003e307d6 64 1 3 6.0 2.0 50.0 6.0

583 39 57fcf18712cdbd000396e310 7 1 4 3.0 0.0 4.0 0.0

241 16 57dd2d6a4018d9000339ca43 3 1 4 1.0 0.0 10.0 0.0


In [46]:

plt.figure(figsize=(16,6))
sns.set(style='darkgrid')
sns.countplot(x='vote',hue='stillExists',data=Sample)
plt.xlabel('Vote')
plt.ylabel('Votes per class')
#plt.title('Happiness is not a strong predictior of churn')
plt.show()

In [ ]:

In [47]:

plt.figure(figsize=(12,6))
sns.scatterplot(x='likes',y='stillExists',data=Sample)
plt.show()
In [48]:

plt.figure(figsize=(12,6))
sns.scatterplot(x='dislikes',y='stillExists',data=Sample)
plt.show()

In [49]:

plt.figure(figsize=(12,6))
sns.scatterplot(x='likes_given',y='stillExists',data=Sample)
plt.show()
In [50]:

plt.figure(figsize=(12,6))
sns.scatterplot(x='dislikes_given',y='stillExists',data=Sample)
plt.show()

In [51]:
plt.figure(figsize=(16,6))
sns.heatmap(Sample.corr())
plt.show()

Precision is the ratio of correctly predicted positive observations to the total predicted positive observations.
Recall is the ratio of correctly predicted positive observations to the all observations in actual class.
F1 Score is the weighted average of Precision and Recall. Therefore, this score takes both false positives and false negatives into
account.

In [52]:

from sklearn.preprocessing import LabelEncoder


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,roc_curve, auc

In [53]:
le = LabelEncoder()
result_final['companyAlias'] = le.fit_transform(result_final['companyAlias'])
result_final.head(1)

Out[53]:

employee companyAlias numVotes stillExists vote likes_given dislikes_given likes dislikes

0 1 5 257 1 4 151.0 37.0 1740.0 708.0


In [54]:

X = result_final.drop(['employee','stillExists','likes','likes_given','dislikes_given',],axis=1)
y = result_final.stillExists

In [55]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [56]:

def model(model,X_train,y_train,X_test,y_test):
model.fit(X_train,y_train)
pred = model.predict(X_test)
print(classification_report(y_test,pred))
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print('Rows = Actual Class')
print('Columns = Predicted Class ')

In [57]:

from sklearn.linear_model import LogisticRegressionCV

In [58]:
lmp = LogisticRegressionCV(class_weight={0:9,1:1},cv=5)

In [59]:

model(lmp,X_train,y_train,X_test,y_test)

precision recall f1-score support

0 0.21 0.73 0.33 37


1 0.97 0.79 0.87 475

accuracy 0.79 512


macro avg 0.59 0.76 0.60 512
weighted avg 0.92 0.79 0.83 512

[[ 27 10]
[100 375]]

Rows = Actual Class


Columns = Predicted Class

In [60]:

Coefficients = pd.DataFrame(lmp.coef_,columns=X.columns)
Coefficients

Out[60]:

companyAlias numVotes vote dislikes

0 0.139141 0.000546 0.004513 -0.001853

In [61]:

def sigmoid(x):
return (1/(1+math.exp(-x)))

In [62]:
from imblearn.over_sampling import SMOTE

In [63]:

sm = SMOTE()
In [64]:

X_os, y_os = sm.fit_sample(X_train,y_train)

In [65]:

lr = LogisticRegressionCV(cv=3)

In [66]:

model(lr,X_os,y_os,X_test,y_test)

precision recall f1-score support

0 0.18 0.73 0.29 37


1 0.97 0.74 0.84 475

accuracy 0.74 512


macro avg 0.58 0.73 0.56 512
weighted avg 0.91 0.74 0.80 512

[[ 27 10]
[124 351]]

Rows = Actual Class


Columns = Predicted Class

In [67]:

X.corr()

Out[67]:

companyAlias numVotes vote dislikes

companyAlias 1.000000 -0.332744 0.000985 -0.172828

numVotes -0.332744 1.000000 -0.039459 0.441457

vote 0.000985 -0.039459 1.000000 -0.106803

dislikes -0.172828 0.441457 -0.106803 1.000000

In [68]:

import statsmodels.discrete.discrete_model as sm

In [69]:

lrm = sm.Logit(y_train,X_train)

In [70]:

result = lrm.fit()

Optimization terminated successfully.


Current function value: 0.348001
Iterations 9
In [71]:

result.summary()

/home/benjamincabalonajr/.local/lib/python3.6/site-packages/statsmodels/base/model.py:492: Hess
ianInversionWarning: Inverting hessian failed, no bse or cov_params available
'available', HessianInversionWarning)
/home/benjamincabalonajr/.local/lib/python3.6/site-packages/statsmodels/base/model.py:492: Hess
ianInversionWarning: Inverting hessian failed, no bse or cov_params available
'available', HessianInversionWarning)
/home/benjamincabalonajr/.local/lib/python3.6/site-packages/statsmodels/discrete/discrete_model
.py:3390: RuntimeWarning: divide by zero encountered in double_scalars
return 1 - self.llf/self.llnull

Out[71]:

Logit Regression Results

Dep. Variable: stillExists No. Observations: 2045

Model: Logit Df Residuals: 2041

Method: MLE Df Model: 3

Date: Thu, 25 Jul 2019 Pseudo R-squ.: inf

Time: 16:01:13 Log-Likelihood: -711.66

converged: True LL-Null: 0.0000

Covariance Type: nonrobust LLR p-value: 1.000

coef std err z P>|z| [0.025 0.975]

companyAlias 0.2082 0.019 10.849 0.000 0.171 0.246

numVotes 0.0010 0.001 1.016 0.310 -0.001 0.003

vote 0.2342 0.048 4.831 0.000 0.139 0.329

dislikes -0.0020 0.001 -2.542 0.011 -0.004 -0.000

In [72]:

Coefficients

Out[72]:

companyAlias numVotes vote dislikes

0 0.139141 0.000546 0.004513 -0.001853

In [73]:
result_final.head()

Out[73]:

employee companyAlias numVotes stillExists vote likes_given dislikes_given likes dislikes

0 1 5 257 1 4 151.0 37.0 1740.0 708.0

1 1 11 181 1 3 175.0 35.0 482.0 118.0

2 1 14 295 1 4 599.0 69.0 216.0 22.0

3 1 15 123 1 4 229.0 14.0 402.0 68.0

4 1 17 113 1 4 28.0 2.0 82.0 12.0


In [74]:

result_final.describe()

Out[74]:

employee companyAlias numVotes stillExists vote likes_given dislikes_given likes dislikes

count 2557.000000 2557.000000 2557.000000 2557.000000 2557.000000 2557.000000 2557.000000 2557.00000 2557.000000

mean 173.437622 15.057880 70.650372 0.934689 3.345718 101.928041 18.813062 210.06492 38.685960

std 183.411596 9.258406 82.898413 0.247122 0.678990 203.662501 53.115962 422.70343 97.700203

min 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.00000 0.000000

25% 44.000000 7.000000 15.000000 1.000000 3.000000 8.000000 1.000000 22.00000 2.000000

50% 122.000000 15.000000 40.000000 1.000000 3.000000 33.000000 4.000000 72.00000 10.000000

75% 229.000000 22.000000 96.000000 1.000000 4.000000 103.000000 15.000000 210.00000 34.000000

max 999.000000 33.000000 740.000000 1.000000 4.000000 3151.000000 891.000000 6558.00000 1940.000000

In [75]:

dataset = pd.DataFrame()

In [76]:

dataset['numVotes'] = np.random.randint(20,103,size=(9870,))
dataset['likes_given'] = np.random.randint(40,311,size=(9870,))
dataset['dislikes_given'] = np.random.randint(20,53,size=(9870,))
dataset['likes'] = np.random.randint(60,223,size=(9870,))
dataset['dislikes'] = np.random.randint(23,62,size=(9870,))
dataset['vote'] = [1]*357 + [2]* 472 + [3]* 4703 + [4] * 4338

In [77]:
sns.countplot(dataset['vote'])

Out[77]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f2dfc511240>

In [78]:

dataset = dataset.sample(frac=1).reset_index(drop=True) #shuffle the dataset

In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [79]:

import names

In [ ]:

In [80]:

var = []

for i in range(1,9871):
var.append((names.get_full_name()))

In [81]:

dataset['username'] = var

In [82]:

dataset['username'].nunique()

Out[82]:

9738

In [83]:
dataset.head()

Out[83]:

numVotes likes_given dislikes_given likes dislikes vote username

0 73 239 47 199 33 3 Jill Nahhas

1 44 179 46 129 56 4 Leticia Rump

2 90 144 37 220 53 3 Joan Wright

3 81 177 47 184 45 4 Russell Jones

4 66 123 51 208 60 2 Cynthia Lofgren

In [84]:

dataset.drop_duplicates(subset='username',keep='first', inplace=True)

In [ ]:

In [ ]:

comp_list[0:5]

In [ ]:

comp= pd.read_csv('/home/benjamincabalonajr/Documents/web/csv_files_backup/Company-2019-07-23.csv')

In [ ]:

comp_list = list(comp.company_name)

In [ ]:

dataset['company'] = [1]*777 + [2]* 1603 + [3]* 759 + [4] * (630+353) + [5]* 982 + [6] * 1050 + [7] * 666 +
[8] * 803 + [9] * 1001 + [10]* 1119

In [ ]:


In [ ]:

dataset = dataset.sample(frac=1).reset_index(drop=True) #shuffle the dataset

In [ ]:

dataset.to_csv('engineer_calura.csv')

In [ ]:

S-ar putea să vă placă și