# Page 1 of 4 (web-export navigation text: "Sunteți pe pagina 1 din 4")

#import required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the work-order export from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer='WorkOrderData.csv')

# Print column names, dtypes and non-null counts for a first look at the data.
data.info()
"""
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28922 entries, 0 to 28921
Data columns (total 14 columns):
site_id 28922 non-null int64
maintenance_typ 27879 non-null object
category_cd 27345 non-null object
status_cd 28922 non-null object
closed_time_id 28922 non-null int64
Closed_date 25148 non-null object
required_time_id 28922 non-null int64
Required_date 28922 non-null object
planned_end_time_id 28922 non-null int64
Planned_date 13666 non-null object
wo_count 28922 non-null int64
wo_ontime_count 28922 non-null int64
wo_late_count 28922 non-null int64
wo_upcoming_count 28922 non-null int64
dtypes: int64(8), object(6)
memory usage: 3.1+ MB

"""

# Discard every row that has at least one missing value.
data = data.dropna()

# Preview the first five remaining rows.
data.head(5)

"""
site_id maintenance_typ category_cd status_cd closed_time_id
Closed_date required_time_id Required_date planned_end_time_id
Planned_date wo_count wo_ontime_count wo_late_count
wo_upcoming_count
0 265 BREAKDOWN REPAIR Asset Maintenance Orders COMPLETED 20170206
00:00.0 20170207 00:00.0 20170206 00:00.0 1 1 0
0
1 265 BREAKDOWN REPAIR Asset Maintenance Orders COMPLETED 20170206
00:00.0 20170207 00:00.0 20170206 00:00.0 1 1 0
0
2 334 PREDICTIVE TASK Asset Maintenance Orders CANCELLED 20170406
00:00.0 20170413 00:00.0 20170406 00:00.0 1 0 1
0
3 334 PREDICTIVE TASK Asset Maintenance Orders COMPLETED 20170408
00:00.0 20170413 00:00.0 20170406 00:00.0 1 1 0
0
4 324 CORRECTIVE TASK Asset Maintenance Orders COMPLETED 20170408
00:00.0 20170413 00:00.0 20170408 00:00.0 1 1 0
0

"""

#convert str-dates to timestamp columns


def conv_dt(clm, fmt='%Y%m%d'):
    """Convert compact date values into pandas timestamps.

    Args:
        clm: Date values laid out according to ``fmt``, for example a
            Series holding ``20170206`` as ints or strings.
        fmt: ``strftime``-style layout of the input values. Defaults to
            ``'%Y%m%d'`` (``yyyymmdd``).

    Returns:
        The result of ``pd.to_datetime(clm, format=fmt)``; timestamps are
        displayed in ``yyyy-mm-dd`` form.
    """
    return pd.to_datetime(clm, format=fmt)

# Parse the three yyyymmdd id columns into timestamp columns.
for date_col in ('closed_time_id', 'required_time_id', 'planned_end_time_id'):
    data[date_col] = conv_dt(data[date_col])

data.shape  # (12093, 14)

# Restrict the analysis to work orders that were closed during 2017.
data_year = data.loc[data['closed_time_id'].dt.year == 2017]

data_year.shape  # (11944, 14)

# Row count per status code (taken from `data`, not from the 2017 subset).
data['status_cd'].value_counts()

"""
COMPLETED 8727
CLOSED 3208
CANCELLED 158
Name: status_cd, dtype: int64

"""

# Keep only the work orders whose maintenance is completed, so that the
# completion date is predicted only for completed work.
# .copy() makes an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
data_completed = data_year[data_year['status_cd'] == 'COMPLETED'].copy()

data_completed.shape  # (8614, 14)

# Express each timestamp column as its day of the year.
data_completed['closed_day'] = data_completed['closed_time_id'].dt.dayofyear
data_completed['required_time(day)'] = (
    data_completed['required_time_id'].dt.dayofyear)
data_completed['planned_time(day)'] = (
    data_completed['planned_end_time_id'].dt.dayofyear)


# Assemble the frame used for training and testing the model: identifiers,
# the text fields, the work-order counters and the three day-of-year columns.
data_new = data_completed.loc[:, [
    'site_id', 'maintenance_typ', 'category_cd', 'status_cd',
    'wo_count', 'wo_ontime_count', 'wo_late_count', 'wo_upcoming_count',
    'closed_day', 'required_time(day)', 'planned_time(day)',
]].copy()

data_new.shape  # (8614, 11)

# Number of distinct sites.
data_new['site_id'].nunique()  # 40

# Row count per work-order category.
data_new['category_cd'].value_counts()

"""
Asset Maintenance Orders 5161
Sanitation Work Orders 1609
Integrated Pest Management 802
Safety Task Work Orders 385
Final Product Zone Order 365
Autonomous Maintenance 147
Food Safety 130
Pest Management Work Order 5
Environmental Task Work Orders 4
Capital Work Orders 4
COR - Corrective Maintenance Work Orders 2
Name: category_cd, dtype: int64
"""

#import the preprocessing module and LabelEncoder


#to perform the preprocessing and label encoding of the string columns

from sklearn import preprocessing


from sklearn.preprocessing import LabelEncoder

#get the linear regression model


from sklearn.linear_model import LinearRegression

# Replace the two text columns with integer codes, one encoder per column.
# NOTE(review): the encoder names look like leftovers from another example;
# they are kept unchanged in case later code refers to them.
le_color = LabelEncoder()
data_new['maintenance_typ'] = le_color.fit_transform(
    data_new['maintenance_typ'])

le_make = LabelEncoder()
data_new['category_cd'] = le_make.fit_transform(data_new['category_cd'])

# Feature matrix: every modelling column except the status and the target.
X = data_new.loc[:, [
    'site_id', 'maintenance_typ', 'category_cd',
    'wo_count', 'wo_ontime_count', 'wo_late_count', 'wo_upcoming_count',
    'required_time(day)', 'planned_time(day)',
]]

# Target: the day of the year on which the work order was closed.
y = data_new.loc[:, 'closed_day']

lm = LinearRegression()

# Split the data into training and testing sets (30% held out for testing).
# train_test_split is imported from sklearn.model_selection because the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101)

# Fit the model on the training split.
lm.fit(X_train, y_train)

# Predict the closing date as a day-of-year number; it can be converted
# back to yyyy-mm-dd format.
pred_train = lm.predict(X_train)
pred_test = lm.predict(X_test)

"""
pred_train
array([109.07451724, 43.69171545, 54.58952948, ..., 46.84330053,
52.12862088, 57.95741627])

pred_test
array([ 73.39881804, 95.87136349, 82.24623855, ..., 90.58016066,
102.27159995, 98.12409611])

"""

# You might also like (web-export footer text: "S-ar putea să vă placă și")