Sunteți pe pagina 1din 5

Part 1 SAS:

1) A screen shot of your project window

2) A listing or screen shot of the min, max and mean of the predicted and actual amounts in
your data.
3) A listing of the first 15 observations after imputation and prediction.

Part 2: Python
1) A copy of your Python program
#Importing Library
import pandas as pd
import numpy as np
from AdvancedAnalytics import ReplaceImputeEncode

#Reading Data
df = pd.read_excel("diamondswmissing.xlsx")

#Missing Values
data_map = {\
'obs': [4,(1,53940)],\
'Carat':[0,(0.2,5.5)],\
'cut':[2,('Fair','Good','Premium','Very Good')],\
'color':[2,('D','E','F','G','H','I','J')],\
'clarity':[2,('I1','IF','SI1','SI2','VS1','VS2','VVS1','VVS2')],\
'depth':[0,(40,80)],\
'table':[0,(40,100)],\
'x':[0,(0,11)],\
'y':[0,(0,60)],\
'z':[0,(0,32)],\
'price':[0,(300,20000)]\
}
rie = ReplaceImputeEncode(data_map=data_map,display=True)
df.rie = rie.fit_transform(df)

#Imputing Missing Values


from sklearn import preprocessing
interval_attributes = ['Carat','depth','table','x','y','z']
interval_data = df.as_matrix(columns = interval_attributes)
interval_imputer = preprocessing.Imputer(strategy = 'mean')
imputed_interval_data = interval_imputer.fit_transform(interval_data)

print("Imputed Interval Data:\n", imputed_interval_data)

# Convert String Categorical Attribute to Numbers for further assesment


# Mapping of categories to numbers for attribute 'cut'
cut_map = {'Ideal':0, 'Premium':1, 'Good':2, 'Very Good':3, 'Fair':4}
df['cut'] = df['cut'].map(cut_map)
# Mapping of categories to numbers for attribute 'color'
color_map = {'E':0,'I':1,'J':2,'H':3,'F':4,'G':5,'D':6}
df['color'] = df['color'].map(color_map)
# Mapping of categories to numbers for attribute 'clarity'
clarity_map = {'SI2':0,'SI':1,'VS1':2,'VS2':3,'VVS2':4,'VVS1':5,'I1':6,'IF':7}
df['clarity'] = df['clarity'].map(clarity_map)
print(df)

# Converting nominal data from the dataframe into a numpy array


nominal_attributes = ['cut','color','clarity']
nominal_data = df.as_matrix(columns=nominal_attributes)
# Create Imputer for Categorical Data
cat_imputer = preprocessing.Imputer(strategy='most_frequent')
# Imputing missing values in the Categorical Data
imputed_nominal_data = cat_imputer.fit_transform(nominal_data)
#inserting imputed data in the data frame
df[['cut','color','clarity']] = imputed_nominal_data
df[['Carat','depth','table','x','y','z']] = imputed_interval_data
df.head()

#Encoding
scaler = preprocessing.StandardScaler() # Create an instance of StandardScaler()
scaler.fit(imputed_interval_data)
scaled_interval_data = scaler.transform(imputed_interval_data)
print("Imputed & Scaled Interval Data\n", scaled_interval_data)

# Create an instance of the OneHotEncoder & Selecting Attributes


onehot = preprocessing.OneHotEncoder()
hot_array = onehot.fit_transform(imputed_nominal_data).toarray()
print(hot_array)

from AdvancedAnalytics import linreg


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

y = df['price']
x = df.drop('price',axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

lr=LinearRegression()

col=[]
for i in range(x_train.shape[1]):
col.append('X'+str(i))

lr.fit(x_train,y_train)
print("\n*** LINEAR REGRESSION ***")
linreg.display_coef(lr, x_train, y_train, col)
linreg.display_metrics(lr, x_train, y_train)

y_hat= lr.predict(x_test)
xtestarr = np.asanyarray(x_test)
ytestarr = np.asanyarray(y_test)

print("Residual sum of squares: %.2f"


% np.mean((y_hat - y_test) ** 2))

#Maximum, minimum and mean of predicted value


prediction_min = y_hat.min()
print("\nPredicted minimum\n",prediction_min)
prediction_max = y_hat.max()
print("\nPredicted maximum\n",prediction_max)
prediction_mean = y_hat.mean()
print("\nPredicted mean\n",prediction_mean)

#Maximum, minimum and mean of imputed value


imputed_min = y_test.idxmin(axis = 0)
print("\nImputed minimum\n",imputed_min)
imputed_max = y_test.idxmax(axis = 0)
print("\nImputed maximum\n",imputed_max)
imputed_mean = y_test.mean(axis = 0)
print("\nImputed mean\n",imputed_mean)

# Printing first 15 predicted values


print("\nFirst 15 predicted values\n",y_hat[0:14])

# Printing table
final_table = df.head(15)
from pandas import ExcelWriter
writer = ExcelWriter('PythonHW.xlsx')
final_table.to_excel(writer)
writer.save()

2) A listing or screen shot of the min, max and mean of the predicted and actual amounts in
your data.
Predicted minimum
-8484.009878847311

Predicted maximum
27244.01984476174

Predicted mean
3950.5366225183566

Actual minimum
2

Actual maximum
27746

Actual mean
3900.195464095909

3) A listing of the first 15 observations after imputation and prediction.


First 15 predicted values
[ 131.20374427 7045.54015631 3251.63967449 -152.36320968 6716.05896494
1077.15222802 6289.70190779 -116.45970468 9951.5798897 823.26302334
-242.72890304 3120.9538463 3851.97449455 430.84315957]

S-ar putea să vă placă și