Employee Retention
In [1]:
import pandas as pd
import numpy as np
In [2]:
# contains the total number of votes per employee and whether the employee is still active
churn = pd.read_csv('churn.csv')
# contains the actual votes (happiness rating of the employee)
votes = pd.read_csv('votes.csv')
# contains the comments and how many times each comment was liked/disliked
clean = pd.read_csv('comments_clean_anonimized.csv')
# contains which comments were liked/disliked by a given employee
interactions = pd.read_csv('commentInteractions.csv')
/home/benjamincabalonajr/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3057: DtypeWarning: Columns (2,3) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [3]:
interactions.head()
Out[3]:
In [4]:
clean.head()
Out[4]:
   employee              companyAlias                 commentId                                                 txt  likes  dislikes       commentDate
0       307  56aec740f1ef260003e307d6  58d018d7e010990004e38070  **********************************************...    4.0       0.0  Mon Ma… 19:00:17
1       382  56aec740f1ef260003e307d6  58d0179ae010990004e3806d  *****************************                        1.0       2.0  Mon Ma… 18:55:16
2       172  56aec740f1ef260003e307d6  58cff8cde010990004e37f6a  ***************************                          3.0       0.0  Mon Ma… 16:44:02
3       135  56aec740f1ef260003e307d6  58cfefeee010990004e37f60  ***************************                          1.0       1.0  Mon Ma… 16:06:08
4       225  56aec740f1ef260003e307d6  58cfd9b4e010990004e37f52  *********************************                    3.0       2.0  Mon Ma… 14:30:50
In [5]:
votes.head()
Out[5]:
In [6]:
churn.head()
Out[6]:
Data Cleaning
First, we will drop rows with negative employee IDs, as stated in the description of this dataset.
In [7]:
interactions = interactions[interactions['employee']>0]
In [8]:
votes = votes[votes['employee']>0]
In [9]:
clean = clean[clean['employee']>0]
In [10]:
# The same filter presumably applied to the churn table (reconstructed: the original cell was lost in export)
churn = churn[churn['employee']>0]
In [11]:
In [12]:
In [13]:
import math
vote_clean = pd.pivot_table(index=['employee','companyAlias'],values='vote',aggfunc=np.mean,data=votes)
In [14]:
vote_clean['vote']=vote_clean['vote'].apply(math.ceil)
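Averaging then rounding up collapses each employee's vote history into a single integer happiness level; a tiny illustration of that step:
# e.g. an employee who voted [3, 4, 4] averages 3.67,
# which math.ceil rounds up to happiness level 4
math.ceil(np.mean([3, 4, 4]))   # -> 4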
In [15]:
vote_clean.reset_index(inplace = True)
In [16]:
vote_clean.head()
Out[16]:
   employee              companyAlias  vote
0         1  5474b9cde4b0bf7614b2c66f     4
1         1  54d43612e4b0f6a40755d93e     4
2         1  54e52607e4b01191dc064966     3
3         1  5641f96713664c000332c8cd     4
4         1  56558cfd07a5de00030908fb     4
In [17]:
# Merge churn with the cleaned votes (reconstructed: the original cell was lost in export)
churn_vote = pd.merge(churn, vote_clean, on=['employee','companyAlias'])
In [18]:
churn_vote.head()
Out[18]:
In [19]:
churn_vote.isnull().sum()  # check for missing values (reconstructed from the output below)
Out[19]:
employee 0
companyAlias 0
numVotes 0
lastParticipationDate 0
stillExists 0
vote 0
dtype: int64
In [20]:
result_dupe = churn_vote[churn_vote.duplicated(subset=['employee','companyAlias'], keep=False)]
In [21]:
result_dupe  # shows an empty frame, i.e. no duplicate employee/company pairs (reconstructed)
Out[21]:
Engagement
Total number of likes/dislikes each employee has given to comments.
In [22]:
# Convert the boolean liked/disliked flags to 0/1 integers
interactions['liked'] = pd.get_dummies(interactions['liked'],drop_first=True)
interactions['disliked'] = pd.get_dummies(interactions['disliked'],drop_first=True)
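As a side note, the same boolean-to-0/1 conversion can be written more directly; a minimal equivalent sketch, assuming the columns hold only True/False values:
# Equivalent, more direct conversion of boolean flags to integers
interactions['liked'] = interactions['liked'].astype(int)
interactions['disliked'] = interactions['disliked'].astype(int)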
In [23]:
interactions.head()
Out[23]:
   employee              companyAlias  liked  disliked                 commentId
1        36  56aec740f1ef260003e307d6      1         0  58d018d7e010990004e38070
3        24  56aec740f1ef260003e307d6      1         0  58d018d7e010990004e38070
In [24]:
# Total likes/dislikes given per employee (reconstructed: the original groupby cell was lost in export)
engagement_sum = (interactions.groupby(['employee','companyAlias'])[['liked','disliked']].sum()
                  .rename(columns={'liked':'likes_given','disliked':'dislikes_given'}))
engagement_sum.reset_index(inplace=True)
In [26]:
engagement_sum.head()
Out[26]:
In [27]:
# Merge engagement totals into the churn/vote table (reconstructed: the original cell was lost in export)
churn_vote_engagement = pd.merge(churn_vote, engagement_sum, on=['employee','companyAlias'])
In [28]:
churn_vote_engagement.head()
Out[28]:
   employee              companyAlias  numVotes         lastParticipationDate  stillExists  vote  likes_given  dislikes_given
1         1  5641f96713664c000332c8cd       257  Sun Mar 19 01:00:00 CET 2017         True     4        151.0            37.0
3         1  56e2a905e3b6fe0003e32855       181  Mon Mar 20 01:00:00 CET 2017         True     3        175.0            35.0
4         1  5742d699f839a10003a407d2       295  Mon Mar 20 14:11:12 CET 2017         True     4        599.0            69.0
In [29]:
Likeability
Total number of likes/dislikes received on each employee's comments.
In [30]:
clean.drop(['txt','commentDate'],axis=1,inplace=True) # remove unnecessary columns
In [31]:
# Total likes/dislikes received per employee (reconstructed: the original cell was lost in export)
likeability = clean.groupby(['employee','companyAlias'])[['likes','dislikes']].sum()
In [32]:
likeability.reset_index(inplace=True)
In [33]:
# Merge everything into the final modelling table (reconstructed: the original cell was lost in export)
result_final = pd.merge(churn_vote_engagement, likeability, on=['employee','companyAlias'])
In [34]:
result_final.isnull().sum()
Out[34]:
employee 0
companyAlias 0
numVotes 0
lastParticipationDate 0
stillExists 0
vote 0
likes_given 0
dislikes_given 0
likes 0
dislikes 0
dtype: int64
In [35]:
result_final[result_final.duplicated(subset=['employee','companyAlias'])]
# Confirming there are no duplicate values
Out[35]:
employee companyAlias numVotes lastParticipationDate stillExists vote likes_given dislikes_given likes dislikes
In [36]:
# Drop the raw date column before modelling (reconstructed: the shape below shows one column was removed)
result_final.drop('lastParticipationDate',axis=1,inplace=True)
In [37]:
result_final.shape
Out[37]:
(2557, 9)
In [38]:
# Convert the True/False stillExists flag to 1/0
result_final['stillExists'] = pd.get_dummies(result_final['stillExists'],drop_first=True)
In [39]:
result_final.head()
Out[39]:
Exploratory Data Analysis
In [40]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style='darkgrid')
In [41]:
draft=(pd.pivot_table(index='companyAlias',values='stillExists',aggfunc=sum,data=result_final).reset_index()
.sort_values(by='stillExists',ascending=False).head(15))
sns.barplot(x='companyAlias',y='stillExists',data=draft)
Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2e381ccac8>
In [42]:
result_final[result_final['stillExists']==1].shape
# 2390 of 2557 employees (about 93%) in our cleaned dataset are still working for their current employer
Out[42]:
(2390, 9)
In [43]:
result_final[result_final['stillExists']==0].shape
# The remaining 167 employees (about 7%) have left their employer
Out[43]:
(167, 9)
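The class balance can also be read off directly; a quick check (2390/2557 ≈ 0.93):
# Proportion of employees per class: ~93% active (1), ~7% churned (0)
result_final['stillExists'].value_counts(normalize=True)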
In [44]:
plt.figure(figsize=(12,4))
sns.countplot(result_final['stillExists'])
plt.ylabel('Count of Employees')
plt.xlabel('Employment Status')
plt.show()
Notice how skewed our target class is. There are many ways of dealing with an imbalanced dataset in predictive
modelling: evaluating with a metric other than accuracy, random over/undersampling, SMOTE, using models that penalize the
frequent class, assigning weights to the classes, etc. In our case we will use SMOTE (Nitesh V. Chawla et al.), and we will
handle this during predictive modelling.
For the purpose of data exploration we will use only our original data, undersampling from the majority class instead:
we will take a random sample of 150 observations from each class.
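For intuition, SMOTE synthesizes new minority samples by interpolating between a minority observation and one of its nearest minority-class neighbours. A minimal sketch of that core idea (not the imblearn implementation used later; smote_point is an illustrative name):
import numpy as np

def smote_point(x_i, x_neighbour, rng=np.random):
    # pick a random point on the segment between a minority
    # observation and one of its minority-class nearest neighbours
    lam = rng.uniform(0, 1)
    return x_i + lam * (x_neighbour - x_i)

# e.g. interpolating between two minority-class feature vectors
x_new = smote_point(np.array([1.0, 5.0]), np.array([3.0, 9.0]))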
In [45]:
Positive = result_final[result_final['stillExists']==1].sample(150,random_state=42)
Negative = result_final[result_final['stillExists']==0].sample(150,random_state=42)
Sample = pd.concat([Positive,Negative])
Sample.head()
# Taking 150 random samples from each class, then combining them together
Out[45]:
In [46]:
plt.figure(figsize=(16,6))
sns.set(style='darkgrid')
sns.countplot(x='vote',hue='stillExists',data=Sample)
plt.xlabel('Vote')
plt.ylabel('Count of employees')
#plt.title('Happiness is not a strong predictor of churn')
plt.show()
In [47]:
plt.figure(figsize=(12,6))
sns.scatterplot(x='likes',y='stillExists',data=Sample)
plt.show()
In [48]:
plt.figure(figsize=(12,6))
sns.scatterplot(x='dislikes',y='stillExists',data=Sample)
plt.show()
In [49]:
plt.figure(figsize=(12,6))
sns.scatterplot(x='likes_given',y='stillExists',data=Sample)
plt.show()
In [50]:
plt.figure(figsize=(12,6))
sns.scatterplot(x='dislikes_given',y='stillExists',data=Sample)
plt.show()
In [51]:
plt.figure(figsize=(16,6))
sns.heatmap(Sample.corr())
plt.show()
Precision is the ratio of correctly predicted positive observations to all predicted positive observations.
Recall is the ratio of correctly predicted positive observations to all observations that are actually positive.
F1 Score is the harmonic mean of precision and recall, so it takes both false positives and false negatives into
account.
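For a binary problem these three metrics fall straight out of the confusion-matrix counts; a minimal sketch (tp, fp, and fn are illustrative names, not variables from this notebook):
def prf1(tp, fp, fn):
    # precision: of everything predicted positive, how much was right
    precision = tp / (tp + fp)
    # recall: of everything actually positive, how much was found
    recall = tp / (tp + fn)
    # F1: harmonic mean of the two
    f1 = 2 * precision * recall / (precision + recall)
    return precision, recall, f1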
In [52]:
# Modelling imports (reconstructed: the original cell was lost in export)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix
In [53]:
le = LabelEncoder()
result_final['companyAlias'] = le.fit_transform(result_final['companyAlias'])
result_final.head(1)
Out[53]:
In [54]:
X = result_final.drop(['employee','stillExists','likes','likes_given','dislikes_given'],axis=1)
y = result_final.stillExists
In [55]:
# Train/test split (reconstructed: a 20% test set matches the 512 test observations below)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [56]:
def model(model,X_train,y_train,X_test,y_test):
    # Fit the estimator, predict on the test set, and print evaluation metrics
    model.fit(X_train,y_train)
    pred = model.predict(X_test)
    print(classification_report(y_test,pred))
    print('\n')
    print(confusion_matrix(y_test,pred))
    print('\n')
    print('Rows = Actual Class')
    print('Columns = Predicted Class')
In [57]:
In [58]:
# Weight the minority class (leavers, 0) heavily to counter the roughly 14:1 class imbalance
lmp = LogisticRegressionCV(class_weight={0:9,1:1},cv=5)
In [59]:
model(lmp,X_train,y_train,X_test,y_test)
[[ 27 10]
[100 375]]
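Reading this matrix (rows = actual, columns = predicted): of the 37 employees who actually left (class 0), 27 were identified, a recall of 27/37 ≈ 0.73 for leavers. Precision for that class is only 27/(27+100) ≈ 0.21, since the heavy class weight flags many stayers as leavers.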
In [60]:
Coefficients = pd.DataFrame(lmp.coef_,columns=X.columns)
Coefficients
Out[60]:
In [61]:
def sigmoid(x):
    return 1/(1+math.exp(-x))
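The sigmoid maps a log-odds value to a probability, which helps when reading the logistic-regression coefficients above; for example:
sigmoid(0)     # 0.5   (zero log-odds is a coin flip)
sigmoid(2.0)   # ~0.88
sigmoid(-2.0)  # ~0.12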
In [62]:
from imblearn.over_sampling import SMOTE
In [63]:
sm = SMOTE()
In [64]:
# Oversample the minority class in the training set (reconstructed: the original cell was lost in export;
# older imblearn versions used fit_sample instead of fit_resample)
X_os, y_os = sm.fit_resample(X_train, y_train)
In [65]:
lr = LogisticRegressionCV(cv=3)
In [66]:
model(lr,X_os,y_os,X_test,y_test)
[[ 27 10]
[124 351]]
In [67]:
X.corr()
Out[67]:
In [68]:
import statsmodels.discrete.discrete_model as smd  # renamed from 'sm' to avoid shadowing the SMOTE instance above
In [69]:
lrm = smd.Logit(y_train,X_train)
In [70]:
result = lrm.fit()
result.summary()
/home/benjamincabalonajr/.local/lib/python3.6/site-packages/statsmodels/base/model.py:492: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
  'available', HessianInversionWarning)
/home/benjamincabalonajr/.local/lib/python3.6/site-packages/statsmodels/base/model.py:492: HessianInversionWarning: Inverting hessian failed, no bse or cov_params available
  'available', HessianInversionWarning)
/home/benjamincabalonajr/.local/lib/python3.6/site-packages/statsmodels/discrete/discrete_model.py:3390: RuntimeWarning: divide by zero encountered in double_scalars
  return 1 - self.llf/self.llnull
Out[71]:
In [72]:
Coefficients
Out[72]:
In [73]:
result_final.head()
Out[73]:
In [74]:
result_final.describe()
Out[74]:
          employee  companyAlias     numVotes  stillExists         vote  likes_given  dislikes_given       likes     dislikes
count  2557.000000   2557.000000  2557.000000  2557.000000  2557.000000  2557.000000     2557.000000  2557.00000  2557.000000
mean    173.437622     15.057880    70.650372     0.934689     3.345718   101.928041       18.813062   210.06492    38.685960
std     183.411596      9.258406    82.898413     0.247122     0.678990   203.662501       53.115962   422.70343    97.700203
min       1.000000      0.000000     1.000000     0.000000     1.000000     0.000000        0.000000     0.00000     0.000000
25%      44.000000      7.000000    15.000000     1.000000     3.000000     8.000000        1.000000    22.00000     2.000000
50%     122.000000     15.000000    40.000000     1.000000     3.000000    33.000000        4.000000    72.00000    10.000000
75%     229.000000     22.000000    96.000000     1.000000     4.000000   103.000000       15.000000   210.00000    34.000000
max     999.000000     33.000000   740.000000     1.000000     4.000000  3151.000000      891.000000  6558.00000  1940.000000
In [75]:
# Build a synthetic dataset with randomly generated feature values
dataset = pd.DataFrame()
In [76]:
dataset['numVotes'] = np.random.randint(20,103,size=(9870,))
dataset['likes_given'] = np.random.randint(40,311,size=(9870,))
dataset['dislikes_given'] = np.random.randint(20,53,size=(9870,))
dataset['likes'] = np.random.randint(60,223,size=(9870,))
dataset['dislikes'] = np.random.randint(23,62,size=(9870,))
dataset['vote'] = [1]*357 + [2]* 472 + [3]* 4703 + [4] * 4338
In [77]:
sns.countplot(dataset['vote'])
Out[77]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2dfc511240>
In [78]:
In [79]:
import names  # third-party package that generates random human names
In [80]:
# Generate 9,870 random full names, one per synthetic employee
var = [names.get_full_name() for _ in range(9870)]
In [81]:
dataset['username'] = var
In [82]:
dataset['username'].nunique()
Out[82]:
9738
In [83]:
dataset.head()
Out[83]:
In [84]:
dataset.drop_duplicates(subset='username',keep='first', inplace=True)
In [ ]:
comp = pd.read_csv('/home/benjamincabalonajr/Documents/web/csv_files_backup/Company-2019-07-23.csv')
In [ ]:
comp_list = list(comp.company_name)
In [ ]:
comp_list[0:5]
In [ ]:
dataset['company'] = ([1]*777 + [2]*1603 + [3]*759 + [4]*(630+353) + [5]*982 + [6]*1050 + [7]*666 +
                      [8]*803 + [9]*1001 + [10]*1119)
In [ ]:
dataset.to_csv('engineer_calura.csv')