ML Basic

Predicting
Employee Retention
In [1]:
import pandas as pd
import numpy as np
In [2]:
# contains total number of votes, and also if the employee is still an active employee.
churn = pd.read_csv('churn.csv')
# contains actual votes (hapiness rate of the employee)
votes = pd.read_csv('votes.csv')
# contains how many times a comment was liked.
clean = pd.read_csv('comments_clean_anonimized.csv')
# contains comments that was liked by a certain employee
interactions = pd.read_csv('commentInteractions.csv')
/home/benjamincabalonajr/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py:30
57: DtypeWarning: Columns (2,3) have mixed types. Specify dtype option on import or set low_mem
ory=False.
interactivity=interactivity, compiler=compiler, result=result)
In [3]:
interactions.head()
Out[3]:
employee companyAlias liked disliked commentId
0 307 56aec740f1ef260003e307d6 True False 58d018d7e010990004e38070
4 382 56aec740f1ef260003e307d6 True False 58d0179ae010990004e3806d

In [4]:
clean.head()
Out[4]:
employee companyAlias commentId txt likes dislikes commentD
Mon Ma
0 307 56aec740f1ef260003e307d6 58d018d7e010990004e38070 **********************************************... 4.0 0.0 19:00:17
Mon Ma
1 382 56aec740f1ef260003e307d6 58d0179ae010990004e3806d ***************************** 1.0 2.0 18:55:16
Mon Ma
2 172 56aec740f1ef260003e307d6 58cff8cde010990004e37f6a *************************** 3.0 0.0 16:44:02
Mon Ma
3 135 56aec740f1ef260003e307d6 58cfefeee010990004e37f60 *************************** 1.0 1.0 16:06:08
Mon Ma
4 225 56aec740f1ef260003e307d6 58cfd9b4e010990004e37f52 ********************************* 3.0 2.0 14:30:50
In [5]:
votes.head()
Out[5]:
employee companyAlias voteDate vote
0 31 56aec740f1ef260003e307d6 Mon Feb 01 01:00:00 CET 2016 4
In [6]:
churn.head()
Out[6]:
employee companyAlias numVotes lastParticipationDate stillExists
0 512 56aec740f1ef260003e307d6 4 Thu Feb 23 12:48:04 CET 2017 True
1 -2 56aec740f1ef260003e307d6 0 Wed Jan 18 14:00:55 CET 2017 False
2 2 56aec740f1ef260003e307d6 72 Fri Mar 17 01:00:00 CET 2017 True
3 487 56aec740f1ef260003e307d6 14 Sat Nov 19 15:02:14 CET 2016 False
4 3 56aec740f1ef260003e307d6 22 Thu Feb 16 01:00:00 CET 2017 True
Data Cleaning
First, we will drop employees with negative value, as stated on the description of this Dataset.
In [7]:
interactions = interactions[interactions['employee']>0]
In [8]:
votes = votes[votes['employee']>0]
In [9]:
clean = clean[clean['employee']>0]
In [10]:
churn = churn[(churn['employee']>0) & (churn['numVotes']>0)]
Data Cleaning: Churn

We will first work with the churn data frame. Note that there are certain pairs (Employee,Company) that has a record that is both
Present and Absent at the company. This is due to the fact that an employee might be present in June, but could leave by July.
This will be handled by checking if employee A has an entry of both True and False . If so we will only keep false.
In [11]:
churn.sort_values(by=['employee', 'companyAlias'], ascending=True, inplace=True)
In [12]:
churn.drop_duplicates(subset=['employee','companyAlias'], keep='first', inplace=True)
Data Cleaning: Votes

Using a pivot table, we will aggregate by employee and companyAlias using the average vote. Then, we will apply the ceiling
function.
In [13]:
import math
vote_clean = pd.pivot_table(index=['employee','companyAlias'],values='vote',aggfunc=np.mean,data=votes)
In [14]:
vote_clean['vote']=vote_clean['vote'].apply(math.ceil)
In [15]:
vote_clean.reset_index(inplace = True)
In [16]:
vote_clean.head()
Out[16]:
employee companyAlias vote
0 1 5474b9cde4b0bf7614b2c66f 4
1 1 54d43612e4b0f6a40755d93e 4
2 1 54e52607e4b01191dc064966 3
3 1 5641f96713664c000332c8cd 4
4 1 56558cfd07a5de00030908fb 4
Combining Votes and Churn Data

In [17]:
churn_vote = pd.merge(churn, vote_clean, on=['employee', 'companyAlias'])

In [18]:
churn_vote.head()
Out[18]:
employee companyAlias numVotes lastParticipationDate stillExists vote
0 1 5474b9cde4b0bf7614b2c66f 2 Wed Nov 26 01:00:00 CET 2014 True 4
1 1 54d43612e4b0f6a40755d93e 16 Wed Jun 08 02:00:00 CEST 2016 False 4
2 1 54e52607e4b01191dc064966 135 Thu Sep 29 02:00:00 CEST 2016 False 3
3 1 5641f96713664c000332c8cd 257 Sun Mar 19 01:00:00 CET 2017 True 4
4 1 56558cfd07a5de00030908fb 2 Sun Nov 29 01:00:00 CET 2015 True 4
In [19]:
churn_vote.isnull().sum() # Confirming that there are no missing values.
Out[19]:
employee 0
companyAlias 0
numVotes 0
lastParticipationDate 0
stillExists 0
vote 0
dtype: int64
In [20]:
result_dupe = churn_vote[churn_vote.duplicated(subset=['employee','companyAlias'], keep=False)]
In [21]:
result_dupe # confirming that we have fixed duplicate values.
Out[21]:
employee companyAlias numVotes lastParticipationDate stillExists vote
Engagement
Total number of likes/dislikes of a comment
In [22]:
interactions['liked'] = pd.get_dummies(interactions['liked'],drop_first=True)
interactions['disliked'] = pd.get_dummies(interactions['disliked'],drop_first=True)
In [23]:
interactions.head()
Out[23]:
employee companyAlias liked disliked commentId
0 307 56aec740f1ef260003e307d6 1 0 58d018d7e010990004e38070
1 36 56aec740f1ef260003e307d6 1 0 58d018d7e010990004e38070
2 276 56aec740f1ef260003e307d6 1 0 58d018d7e010990004e38070
3 24 56aec740f1ef260003e307d6 1 0 58d018d7e010990004e38070
4 382 56aec740f1ef260003e307d6 1 0 58d0179ae010990004e3806d
In [24]:
engagement_sum = interactions.groupby(['employee', 'companyAlias']).sum()

In [25]:
engagement_sum.reset_index(inplace=True)
In [26]:
engagement_sum.head()
Out[26]:
employee companyAlias liked disliked
0 1 54e52607e4b01191dc064966 11.0 2.0
1 1 5641f96713664c000332c8cd 151.0 37.0
2 1 567011c035dce00003a07fa4 2.0 0.0
3 1 56e2a905e3b6fe0003e32855 175.0 35.0
4 1 5742d699f839a10003a407d2 599.0 69.0
In [27]:
churn_vote_engagement = pd.merge(churn_vote, engagement_sum, on=['employee', 'companyAlias'])
In [28]:
churn_vote_engagement.head()
Out[28]:
employee companyAlias numVotes lastParticipationDate stillExists vote liked disliked
Thu Sep 29 02:00:00 CEST

0 1 54e52607e4b01191dc064966 135 False 3 11.0 2.0
2016
1 1 5641f96713664c000332c8cd 257 Sun Mar 19 01:00:00 CET 2017 True 4 151.0 37.0
Wed Jun 08 13:44:07 CEST

2 1 567011c035dce00003a07fa4 1 True 3 2.0 0.0
2016
3 1 56e2a905e3b6fe0003e32855 181 Mon Mar 20 01:00:00 CET 2017 True 3 175.0 35.0
4 1 5742d699f839a10003a407d2 295 Mon Mar 20 14:11:12 CET 2017 True 4 599.0 69.0
In [29]:
churn_vote_engagement.rename(columns={'liked':'likes_given', 'disliked' :'dislikes_given'}, inplace=True)
Likeability
In [30]:
clean.drop(['txt','commentDate'],axis=1,inplace=True) # remove un necessary columns.
In [31]:
likeability = clean.groupby(['employee', 'companyAlias']).sum() #sum all likes
In [32]:
likeability.reset_index(inplace=True)
In [33]:
result_final = pd.merge(churn_vote_engagement, likeability, on=['employee', 'companyAlias'], how='inner')

In [34]:
result_final.isnull().sum() #no missing values
Out[34]:
employee 0
companyAlias 0
numVotes 0
lastParticipationDate 0
stillExists 0
vote 0
likes_given 0
dislikes_given 0
likes 0
dislikes 0
dtype: int64
In [35]:
result_final[result_final.duplicated(subset=['employee','companyAlias'])]
# Confirming there are no duplicate values
Out[35]:
employee companyAlias numVotes lastParticipationDate stillExists vote likes_given dislikes_given likes dislikes
In [36]:
#drop participation Date

result_final.drop('lastParticipationDate',axis=1,inplace=True)
In [37]:
result_final.shape
Out[37]:
(2557, 9)
In [38]:
result_final['stillExists'] = pd.get_dummies(result_final['stillExists'],drop_first=True)
In [39]:
result_final.head()
Out[39]:
employee companyAlias numVotes stillExists vote likes_given dislikes_given likes dislikes
0 1 5641f96713664c000332c8cd 257 1 4 151.0 37.0 1740.0 708.0
1 1 56e2a905e3b6fe0003e32855 181 1 3 175.0 35.0 482.0 118.0
2 1 5742d699f839a10003a407d2 295 1 4 599.0 69.0 216.0 22.0
3 1 574c423856b6300003009953 123 1 4 229.0 14.0 402.0 68.0
4 1 57908a2622881200033b34d7 113 1 4 28.0 2.0 82.0 12.0
In [ ]:
In [ ]:
In [ ]:
In [ ]:

Exploratory Data Analysis
In [40]:
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style='darkgrid')
In [41]:
draft=(pd.pivot_table(index='companyAlias',values='stillExists',aggfunc=sum,data=result_final).reset_index()
.sort_values(by='stillExists',ascending=False).head(15))
sns.barplot(x='companyAlias',y='stillExists',data=draft)
Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2e381ccac8>
In [ ]:
In [42]:
result_final[result_final['stillExists']==1].shape
# This implies that 93% of the employees in our cleaned dataset is still working with their current employer
Out[42]:
(2390, 9)
In [ ]:
In [43]:
result_final[result_final['stillExists']==0].shape
# This implies that 93% of the employees in our cleaned dataset is still working with their current employer
Out[43]:
(167, 9)
In [44]:
plt.figure(figsize=(12,4))
sns.countplot(result_final['stillExists'])
plt.ylabel('Counf of Employees')
plt.xlabel('Employment Status')
plt.show()
Notice how skewed our target class is. There are a lot of ways in dealing with imbalanced data set when doing predictive
modelling. (i.e. Evaluate at a different metric other than accuracy, random over/undersampling, SMOTE, using models that
penalizes frequent classes, assigning weights to the class etc.) In our case, we will use SMOTE. (by Nitesh V. Chawla et.al) We will
handle this during predictive modelling.
For the purpose of data exploration, we will only use our original data, and instead we will undersample from our majority class.
We will take a random sample of 150 obsevations from both classes.
In [ ]:

In [45]:
Positive = result_final[result_final['stillExists']==1].sample(150,random_state=42)
Negative = result_final[result_final['stillExists']==0].sample(150,random_state=42)
Sample = pd.concat([Positive,Negative])
Sample.head()
# Taking 150 random samples from each class, then combining them together
Out[45]:
1386 136 581b08041a0ef8000308aef6 63 1 4 2.0 1.0 58.0 8.0
1486 151 5641f96713664c000332c8cd 57 1 3 11.0 1.0 172.0 12.0
559 38 56aec740f1ef260003e307d6 64 1 3 6.0 2.0 50.0 6.0
583 39 57fcf18712cdbd000396e310 7 1 4 3.0 0.0 4.0 0.0
241 16 57dd2d6a4018d9000339ca43 3 1 4 1.0 0.0 10.0 0.0

In [46]:
sns.set(style='darkgrid')
sns.countplot(x='vote',hue='stillExists',data=Sample)
plt.xlabel('Vote')
plt.ylabel('Votes per class')
#plt.title('Happiness is not a strong predictior of churn')
plt.show()
In [ ]:

In [47]:
sns.scatterplot(x='likes',y='stillExists',data=Sample)
plt.show()
In [48]:
sns.scatterplot(x='dislikes',y='stillExists',data=Sample)
plt.show()
In [49]:
sns.scatterplot(x='likes_given',y='stillExists',data=Sample)
plt.show()
In [50]:
sns.scatterplot(x='dislikes_given',y='stillExists',data=Sample)
plt.show()
In [51]:
sns.heatmap(Sample.corr())
plt.show()
Precision is the ratio of correctly predicted positive observations to the total predicted positive observations.
Recall is the ratio of correctly predicted positive observations to the all observations in actual class.
F1 Score is the weighted average of Precision and Recall. Therefore, this score takes both false positives and false negatives into
account.
In [52]:
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,roc_curve, auc
In [53]:
le = LabelEncoder()
result_final['companyAlias'] = le.fit_transform(result_final['companyAlias'])
result_final.head(1)
Out[53]:
0 1 5 257 1 4 151.0 37.0 1740.0 708.0

In [54]:
X = result_final.drop(['employee','stillExists','likes','likes_given','dislikes_given',],axis=1)
y = result_final.stillExists
In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
In [56]:
def model(model,X_train,y_train,X_test,y_test):
model.fit(X_train,y_train)
pred = model.predict(X_test)
print(classification_report(y_test,pred))
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print('Rows = Actual Class')
print('Columns = Predicted Class ')
In [57]:
from sklearn.linear_model import LogisticRegressionCV
In [58]:
lmp = LogisticRegressionCV(class_weight={0:9,1:1},cv=5)
In [59]:
model(lmp,X_train,y_train,X_test,y_test)
precision recall f1-score support
0 0.21 0.73 0.33 37

1 0.97 0.79 0.87 475
accuracy 0.79 512

macro avg 0.59 0.76 0.60 512
weighted avg 0.92 0.79 0.83 512
[[ 27 10]
[100 375]]
Rows = Actual Class

Columns = Predicted Class
In [60]:
Coefficients = pd.DataFrame(lmp.coef_,columns=X.columns)
Coefficients
Out[60]:
companyAlias numVotes vote dislikes
0 0.139141 0.000546 0.004513 -0.001853
In [61]:
def sigmoid(x):
return (1/(1+math.exp(-x)))
In [62]:
from imblearn.over_sampling import SMOTE
In [63]:
sm = SMOTE()
In [64]:
X_os, y_os = sm.fit_sample(X_train,y_train)
In [65]:
lr = LogisticRegressionCV(cv=3)
In [66]:
model(lr,X_os,y_os,X_test,y_test)
precision recall f1-score support
0 0.18 0.73 0.29 37

1 0.97 0.74 0.84 475
accuracy 0.74 512

macro avg 0.58 0.73 0.56 512
weighted avg 0.91 0.74 0.80 512
[[ 27 10]
[124 351]]
Rows = Actual Class

Columns = Predicted Class
In [67]:
X.corr()
Out[67]:
companyAlias 1.000000 -0.332744 0.000985 -0.172828
numVotes -0.332744 1.000000 -0.039459 0.441457
vote 0.000985 -0.039459 1.000000 -0.106803
dislikes -0.172828 0.441457 -0.106803 1.000000
In [68]:
import statsmodels.discrete.discrete_model as sm
In [69]:
lrm = sm.Logit(y_train,X_train)
In [70]:
result = lrm.fit()
Optimization terminated successfully.

Current function value: 0.348001
Iterations 9
In [71]:
result.summary()
/home/benjamincabalonajr/.local/lib/python3.6/site-packages/statsmodels/base/model.py:492: Hess
ianInversionWarning: Inverting hessian failed, no bse or cov_params available
'available', HessianInversionWarning)
/home/benjamincabalonajr/.local/lib/python3.6/site-packages/statsmodels/base/model.py:492: Hess
ianInversionWarning: Inverting hessian failed, no bse or cov_params available
'available', HessianInversionWarning)
/home/benjamincabalonajr/.local/lib/python3.6/site-packages/statsmodels/discrete/discrete_model
.py:3390: RuntimeWarning: divide by zero encountered in double_scalars
return 1 - self.llf/self.llnull
Out[71]:
Logit Regression Results
Dep. Variable: stillExists No. Observations: 2045
Model: Logit Df Residuals: 2041
Method: MLE Df Model: 3
Date: Thu, 25 Jul 2019 Pseudo R-squ.: inf
Time: 16:01:13 Log-Likelihood: -711.66
converged: True LL-Null: 0.0000
Covariance Type: nonrobust LLR p-value: 1.000
coef std err z P>|z| [0.025 0.975]
companyAlias 0.2082 0.019 10.849 0.000 0.171 0.246
numVotes 0.0010 0.001 1.016 0.310 -0.001 0.003
vote 0.2342 0.048 4.831 0.000 0.139 0.329
dislikes -0.0020 0.001 -2.542 0.011 -0.004 -0.000
In [72]:
Coefficients
Out[72]:
0 0.139141 0.000546 0.004513 -0.001853
In [73]:
result_final.head()
Out[73]:
0 1 5 257 1 4 151.0 37.0 1740.0 708.0
1 1 11 181 1 3 175.0 35.0 482.0 118.0
2 1 14 295 1 4 599.0 69.0 216.0 22.0
3 1 15 123 1 4 229.0 14.0 402.0 68.0
4 1 17 113 1 4 28.0 2.0 82.0 12.0

In [74]:
result_final.describe()
Out[74]:
count 2557.000000 2557.000000 2557.000000 2557.000000 2557.000000 2557.000000 2557.000000 2557.00000 2557.000000
mean 173.437622 15.057880 70.650372 0.934689 3.345718 101.928041 18.813062 210.06492 38.685960
std 183.411596 9.258406 82.898413 0.247122 0.678990 203.662501 53.115962 422.70343 97.700203
min 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.00000 0.000000
25% 44.000000 7.000000 15.000000 1.000000 3.000000 8.000000 1.000000 22.00000 2.000000
50% 122.000000 15.000000 40.000000 1.000000 3.000000 33.000000 4.000000 72.00000 10.000000
75% 229.000000 22.000000 96.000000 1.000000 4.000000 103.000000 15.000000 210.00000 34.000000
max 999.000000 33.000000 740.000000 1.000000 4.000000 3151.000000 891.000000 6558.00000 1940.000000
In [75]:
dataset = pd.DataFrame()
In [76]:
dataset['numVotes'] = np.random.randint(20,103,size=(9870,))
dataset['likes_given'] = np.random.randint(40,311,size=(9870,))
dataset['dislikes_given'] = np.random.randint(20,53,size=(9870,))
dataset['likes'] = np.random.randint(60,223,size=(9870,))
dataset['dislikes'] = np.random.randint(23,62,size=(9870,))
dataset['vote'] = [1]*357 + [2]* 472 + [3]* 4703 + [4] * 4338
In [77]:
sns.countplot(dataset['vote'])
Out[77]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2dfc511240>
In [78]:
dataset = dataset.sample(frac=1).reset_index(drop=True) #shuffle the dataset
In [ ]:
In [ ]:
In [ ]:

In [ ]:
In [79]:
import names
In [ ]:
In [80]:
var = []
for i in range(1,9871):
var.append((names.get_full_name()))
In [81]:
dataset['username'] = var
In [82]:
dataset['username'].nunique()
Out[82]:
9738
In [83]:
dataset.head()
Out[83]:
numVotes likes_given dislikes_given likes dislikes vote username
0 73 239 47 199 33 3 Jill Nahhas
1 44 179 46 129 56 4 Leticia Rump
2 90 144 37 220 53 3 Joan Wright
3 81 177 47 184 45 4 Russell Jones
4 66 123 51 208 60 2 Cynthia Lofgren
In [84]:
dataset.drop_duplicates(subset='username',keep='first', inplace=True)
In [ ]:
In [ ]:
comp_list[0:5]
In [ ]:
comp= pd.read_csv('/home/benjamincabalonajr/Documents/web/csv_files_backup/Company-2019-07-23.csv')
In [ ]:
comp_list = list(comp.company_name)
In [ ]:
dataset['company'] = [1]*777 + [2]* 1603 + [3]* 759 + [4] * (630+353) + [5]* 982 + [6] * 1050 + [7] * 666 +
[8] * 803 + [9] * 1001 + [10]* 1119
In [ ]:

In [ ]:
dataset = dataset.sample(frac=1).reset_index(drop=True) #shuffle the dataset
In [ ]:
dataset.to_csv('engineer_calura.csv')
In [ ]:

ML Basic

Încărcat de

Informații document

Drepturi de autor

Formate disponibile

Partajați acest document

Partajați sau inserați document

Opțiuni de partajare

Vi se pare util acest document?

Este necorespunzător acest conținut?

Drepturi de autor:

Formate disponibile

ML Basic

Încărcat de

Drepturi de autor:

Formate disponibile

Predicting

employee companyAlias liked disliked commentId

0 307 56aec740f1ef260003e307d6 True False 58d018d7e010990004e38070

1 36 56aec740f1ef260003e307d6 True False 58d018d7e010990004e38070

2 276 56aec740f1ef260003e307d6 True False 58d018d7e010990004e38070

3 24 56aec740f1ef260003e307d6 True False 58d018d7e010990004e38070

4 382 56aec740f1ef260003e307d6 True False 58d0179ae010990004e3806d

employee companyAlias commentId txt likes dislikes commentD

employee companyAlias voteDate vote

0 31 56aec740f1ef260003e307d6 Mon Feb 01 01:00:00 CET 2016 4

1 33 56aec740f1ef260003e307d6 Mon Feb 01 01:00:00 CET 2016 4

2 79 56aec740f1ef260003e307d6 Mon Feb 01 01:00:00 CET 2016 4

3 94 56aec740f1ef260003e307d6 Mon Feb 01 01:00:00 CET 2016 4

4 16 56aec740f1ef260003e307d6 Mon Feb 01 01:00:00 CET 2016 2

employee companyAlias numVotes lastParticipationDate stillExists

0 512 56aec740f1ef260003e307d6 4 Thu Feb 23 12:48:04 CET 2017 True

1 -2 56aec740f1ef260003e307d6 0 Wed Jan 18 14:00:55 CET 2017 False

2 2 56aec740f1ef260003e307d6 72 Fri Mar 17 01:00:00 CET 2017 True

3 487 56aec740f1ef260003e307d6 14 Sat Nov 19 15:02:14 CET 2016 False

4 3 56aec740f1ef260003e307d6 22 Thu Feb 16 01:00:00 CET 2017 True

churn = churn[(churn['employee']>0) & (churn['numVotes']>0)]

Data Cleaning: Churn

churn.sort_values(by=['employee', 'companyAlias'], ascending=True, inplace=True)

churn.drop_duplicates(subset=['employee','companyAlias'], keep='first', inplace=True)

Data Cleaning: Votes

employee companyAlias vote

Combining Votes and Churn Data

churn_vote = pd.merge(churn, vote_clean, on=['employee', 'companyAlias'])

employee companyAlias numVotes lastParticipationDate stillExists vote

0 1 5474b9cde4b0bf7614b2c66f 2 Wed Nov 26 01:00:00 CET 2014 True 4

1 1 54d43612e4b0f6a40755d93e 16 Wed Jun 08 02:00:00 CEST 2016 False 4

2 1 54e52607e4b01191dc064966 135 Thu Sep 29 02:00:00 CEST 2016 False 3

3 1 5641f96713664c000332c8cd 257 Sun Mar 19 01:00:00 CET 2017 True 4

4 1 56558cfd07a5de00030908fb 2 Sun Nov 29 01:00:00 CET 2015 True 4

churn_vote.isnull().sum() # Confirming that there are no missing values.

result_dupe # confirming that we have fixed duplicate values.

employee companyAlias numVotes lastParticipationDate stillExists vote

employee companyAlias liked disliked commentId

0 307 56aec740f1ef260003e307d6 1 0 58d018d7e010990004e38070

2 276 56aec740f1ef260003e307d6 1 0 58d018d7e010990004e38070

4 382 56aec740f1ef260003e307d6 1 0 58d0179ae010990004e3806d

engagement_sum = interactions.groupby(['employee', 'companyAlias']).sum()

employee companyAlias liked disliked

0 1 54e52607e4b01191dc064966 11.0 2.0

1 1 5641f96713664c000332c8cd 151.0 37.0

2 1 567011c035dce00003a07fa4 2.0 0.0

3 1 56e2a905e3b6fe0003e32855 175.0 35.0

4 1 5742d699f839a10003a407d2 599.0 69.0

churn_vote_engagement = pd.merge(churn_vote, engagement_sum, on=['employee', 'companyAlias'])

employee companyAlias numVotes lastParticipationDate stillExists vote liked disliked

Thu Sep 29 02:00:00 CEST

Wed Jun 08 13:44:07 CEST

churn_vote_engagement.rename(columns={'liked':'likes_given', 'disliked' :'dislikes_given'}, inplace=True)

likeability = clean.groupby(['employee', 'companyAlias']).sum() #sum all likes

result_final = pd.merge(churn_vote_engagement, likeability, on=['employee', 'companyAlias'], how='inner')

result_final.isnull().sum() #no missing values

#drop participation Date

employee companyAlias numVotes stillExists vote likes_given dislikes_given likes dislikes

0 1 5641f96713664c000332c8cd 257 1 4 151.0 37.0 1740.0 708.0

1 1 56e2a905e3b6fe0003e32855 181 1 3 175.0 35.0 482.0 118.0

2 1 5742d699f839a10003a407d2 295 1 4 599.0 69.0 216.0 22.0

3 1 574c423856b6300003009953 123 1 4 229.0 14.0 402.0 68.0

4 1 57908a2622881200033b34d7 113 1 4 28.0 2.0 82.0 12.0

import seaborn as sns