
assignment_3

August 9, 2019

1 Assignment 3 - Classification
In this assignment we will take a closer look at two popular datasets and work with the sklearn
API for model fitting.
1. Load the Iris dataset.
[1]: import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
from sklearn import datasets
iris = datasets.load_iris()
print(iris.keys())
print(iris.target)
print(iris.target_names)

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names',
'filename'])
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2]
['setosa' 'versicolor' 'virginica']

[2]: print(iris.data)

[[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5. 3.6 1.4 0.2]
[5.4 3.9 1.7 0.4]
[4.6 3.4 1.4 0.3]
[5. 3.4 1.5 0.2]
[4.4 2.9 1.4 0.2]
[4.9 3.1 1.5 0.1]

[5.4 3.7 1.5 0.2]
[4.8 3.4 1.6 0.2]
[4.8 3. 1.4 0.1]
[4.3 3. 1.1 0.1]
[5.8 4. 1.2 0.2]
[5.7 4.4 1.5 0.4]
[5.4 3.9 1.3 0.4]
[5.1 3.5 1.4 0.3]
[5.7 3.8 1.7 0.3]
[5.1 3.8 1.5 0.3]
[5.4 3.4 1.7 0.2]
[5.1 3.7 1.5 0.4]
[4.6 3.6 1. 0.2]
[5.1 3.3 1.7 0.5]
[4.8 3.4 1.9 0.2]
[5. 3. 1.6 0.2]
[5. 3.4 1.6 0.4]
[5.2 3.5 1.5 0.2]
[5.2 3.4 1.4 0.2]
[4.7 3.2 1.6 0.2]
[4.8 3.1 1.6 0.2]
[5.4 3.4 1.5 0.4]
[5.2 4.1 1.5 0.1]
[5.5 4.2 1.4 0.2]
[4.9 3.1 1.5 0.2]
[5. 3.2 1.2 0.2]
[5.5 3.5 1.3 0.2]
[4.9 3.6 1.4 0.1]
[4.4 3. 1.3 0.2]
[5.1 3.4 1.5 0.2]
[5. 3.5 1.3 0.3]
[4.5 2.3 1.3 0.3]
[4.4 3.2 1.3 0.2]
[5. 3.5 1.6 0.6]
[5.1 3.8 1.9 0.4]
[4.8 3. 1.4 0.3]
[5.1 3.8 1.6 0.2]
[4.6 3.2 1.4 0.2]
[5.3 3.7 1.5 0.2]
[5. 3.3 1.4 0.2]
[7. 3.2 4.7 1.4]
[6.4 3.2 4.5 1.5]
[6.9 3.1 4.9 1.5]
[5.5 2.3 4. 1.3]
[6.5 2.8 4.6 1.5]
[5.7 2.8 4.5 1.3]
[6.3 3.3 4.7 1.6]
[4.9 2.4 3.3 1. ]

[6.6 2.9 4.6 1.3]
[5.2 2.7 3.9 1.4]
[5. 2. 3.5 1. ]
[5.9 3. 4.2 1.5]
[6. 2.2 4. 1. ]
[6.1 2.9 4.7 1.4]
[5.6 2.9 3.6 1.3]
[6.7 3.1 4.4 1.4]
[5.6 3. 4.5 1.5]
[5.8 2.7 4.1 1. ]
[6.2 2.2 4.5 1.5]
[5.6 2.5 3.9 1.1]
[5.9 3.2 4.8 1.8]
[6.1 2.8 4. 1.3]
[6.3 2.5 4.9 1.5]
[6.1 2.8 4.7 1.2]
[6.4 2.9 4.3 1.3]
[6.6 3. 4.4 1.4]
[6.8 2.8 4.8 1.4]
[6.7 3. 5. 1.7]
[6. 2.9 4.5 1.5]
[5.7 2.6 3.5 1. ]
[5.5 2.4 3.8 1.1]
[5.5 2.4 3.7 1. ]
[5.8 2.7 3.9 1.2]
[6. 2.7 5.1 1.6]
[5.4 3. 4.5 1.5]
[6. 3.4 4.5 1.6]
[6.7 3.1 4.7 1.5]
[6.3 2.3 4.4 1.3]
[5.6 3. 4.1 1.3]
[5.5 2.5 4. 1.3]
[5.5 2.6 4.4 1.2]
[6.1 3. 4.6 1.4]
[5.8 2.6 4. 1.2]
[5. 2.3 3.3 1. ]
[5.6 2.7 4.2 1.3]
[5.7 3. 4.2 1.2]
[5.7 2.9 4.2 1.3]
[6.2 2.9 4.3 1.3]
[5.1 2.5 3. 1.1]
[5.7 2.8 4.1 1.3]
[6.3 3.3 6. 2.5]
[5.8 2.7 5.1 1.9]
[7.1 3. 5.9 2.1]
[6.3 2.9 5.6 1.8]
[6.5 3. 5.8 2.2]
[7.6 3. 6.6 2.1]

[4.9 2.5 4.5 1.7]
[7.3 2.9 6.3 1.8]
[6.7 2.5 5.8 1.8]
[7.2 3.6 6.1 2.5]
[6.5 3.2 5.1 2. ]
[6.4 2.7 5.3 1.9]
[6.8 3. 5.5 2.1]
[5.7 2.5 5. 2. ]
[5.8 2.8 5.1 2.4]
[6.4 3.2 5.3 2.3]
[6.5 3. 5.5 1.8]
[7.7 3.8 6.7 2.2]
[7.7 2.6 6.9 2.3]
[6. 2.2 5. 1.5]
[6.9 3.2 5.7 2.3]
[5.6 2.8 4.9 2. ]
[7.7 2.8 6.7 2. ]
[6.3 2.7 4.9 1.8]
[6.7 3.3 5.7 2.1]
[7.2 3.2 6. 1.8]
[6.2 2.8 4.8 1.8]
[6.1 3. 4.9 1.8]
[6.4 2.8 5.6 2.1]
[7.2 3. 5.8 1.6]
[7.4 2.8 6.1 1.9]
[7.9 3.8 6.4 2. ]
[6.4 2.8 5.6 2.2]
[6.3 2.8 5.1 1.5]
[6.1 2.6 5.6 1.4]
[7.7 3. 6.1 2.3]
[6.3 3.4 5.6 2.4]
[6.4 3.1 5.5 1.8]
[6. 3. 4.8 1.8]
[6.9 3.1 5.4 2.1]
[6.7 3.1 5.6 2.4]
[6.9 3.1 5.1 2.3]
[5.8 2.7 5.1 1.9]
[6.8 3.2 5.9 2.3]
[6.7 3.3 5.7 2.5]
[6.7 3. 5.2 2.3]
[6.3 2.5 5. 1.9]
[6.5 3. 5.2 2. ]
[6.2 3.4 5.4 2.3]
[5.9 3. 5.1 1.8]]

[3]: print(iris.data[0:50,1])
print(iris.data[50:100,1])
print(iris.data[100:150,1])

[3.5 3. 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 3.7 3.4 3. 3. 4. 4.4 3.9 3.5
3.8 3.8 3.4 3.7 3.6 3.3 3.4 3. 3.4 3.5 3.4 3.2 3.1 3.4 4.1 4.2 3.1 3.2
3.5 3.6 3. 3.4 3.5 2.3 3.2 3.5 3.8 3. 3.8 3.2 3.7 3.3]
[3.2 3.2 3.1 2.3 2.8 2.8 3.3 2.4 2.9 2.7 2. 3. 2.2 2.9 2.9 3.1 3. 2.7
2.2 2.5 3.2 2.8 2.5 2.8 2.9 3. 2.8 3. 2.9 2.6 2.4 2.4 2.7 2.7 3. 3.4
3.1 2.3 3. 2.5 2.6 3. 2.6 2.3 2.7 3. 2.9 2.9 2.5 2.8]
[3.3 2.7 3. 2.9 3. 3. 2.5 2.9 2.5 3.6 3.2 2.7 3. 2.5 2.8 3.2 3. 3.8
2.6 2.2 3.2 2.8 2.8 2.7 3.3 3.2 2.8 3. 2.8 3. 2.8 3.8 2.8 2.8 2.6 3.
3.4 3.1 3. 3.1 3.1 3.1 2.7 3.2 3.3 3. 2.5 3. 3.4 3. ]

[4]: iris.keys()
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width
(cm)']
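
As an aside, pandas is imported above but never used. A small optional sketch for inspecting the same arrays as a table (assumes `iris` and `pd` from the cells above):

# Optional: the features and labels as a pandas DataFrame.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
print(df.head())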

2. Make a scatter plot for the sepal width (cm) against sepal length (cm). Colour the data
points according to their class (setosa/versicolor/virginica) and include a legend.
[5]: x_setosa = iris.data[0:50,1]
x_versicolor = iris.data[50:100,1]
x_virginica = iris.data[100:150,1]
y_setosa = iris.data[0:50,0]
y_versicolor = iris.data[50:100,0]
y_virginica = iris.data[100:150,0]

plt.scatter(x_setosa, y_setosa, color='red')
plt.scatter(x_versicolor, y_versicolor, color='blue')
plt.scatter(x_virginica, y_virginica, color='green')
plt.xlabel('sepal width (cm)')
plt.ylabel('sepal length (cm)')
plt.title('Scatter plot of the Iris dataset')
plt.legend(['Setosa','Versicolor','Virginica'])
plt.grid()

3. Split the data into train and test sets with an 80/20 split. Use the training set to build a
logistic regression classifier to discriminate the setosa class from the other two classes, using
the sepal width and sepal length features. Compute the accuracy on train and test sets.
[6]: from sklearn.model_selection import train_test_split
irissplit = iris.data[:, :1]  # NOTE: this keeps only sepal length; iris.data[:, :2] would select both sepal features, as the task asks
x_train, x_test, y_train, y_test = train_test_split(irissplit, iris.target, test_size=0.20)

[7]: print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(120, 1)
(30, 1)
(120,)
(30,)
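
These shapes (and every accuracy below) depend on a freshly drawn random split. A sketch of a reproducible, class-balanced split; `random_state` and `stratify` are additions, not part of the original cell:

# Fixing the seed makes reruns comparable; stratifying keeps the 50/50/50 class balance.
x_train, x_test, y_train, y_test = train_test_split(
    irissplit, iris.target, test_size=0.20, random_state=0, stratify=iris.target)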

[8]: d = 0
train_where_d = np.where(y_train==d)[0]
test_where_d = np.where(y_test==d)[0]

y_train_d = np.zeros((len(y_train),), dtype=np.float64)
y_train_d[train_where_d] = 1

y_test_d = np.zeros((len(y_test),), dtype=np.float64)
y_test_d[test_where_d] = 1
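
The same one-vs-rest labels can be built with a boolean cast; a one-line equivalent of the cell above:

# True where the class equals d, cast to float: identical to the np.where construction.
y_train_d = (y_train == d).astype(np.float64)
y_test_d = (y_test == d).astype(np.float64)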
[9]: from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000)
log_reg.fit(x_train, y_train_d)
y_pred = log_reg.predict(x_test)
[10]: from sklearn.metrics import accuracy_score
print('The accuracy score is:', accuracy_score(y_test_d, y_pred))

The accuracy score is: 0.9

4. Plot the contours of the logistic regression model together with the dataset.
[11]: from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))  # NOTE: mixes the 3-class y_test with the binary y_pred

[[ 1 7 0]
[ 6 2 0]
[14 0 0]]

[12]: plt.imshow(np.log(confusion_matrix(y_test, y_pred)), cmap='PuRd', interpolation='nearest')
# np.log maps zero counts to -inf, which triggers the RuntimeWarning below;
# np.log1p(confusion_matrix(...)) would avoid it.
plt.grid()
plt.ylabel('true')
plt.xlabel('predicted')
plt.style.use('seaborn')
plt.show()

C:\Users\DODY\Anaconda3\lib\site-packages\ipykernel_launcher.py:1:
RuntimeWarning: divide by zero encountered in log
"""Entry point for launching an IPython kernel.

5. Build logistic regression classifiers using sepal length and sepal width features to
discriminate versicolor and setosa classes. Compute the accuracy on the train and test sets.
Plot the decision boundaries for both classifiers.
[13]: for d in range(3):
    train_where_d = np.where(y_train==d)[0]
    test_where_d = np.where(y_test==d)[0]

    y_train_d = np.zeros((len(y_train),), dtype=np.float64)
    y_train_d[train_where_d] = 1

    y_test_d = np.zeros((len(y_test),), dtype=np.float64)
    y_test_d[test_where_d] = 1

    log_reg = LogisticRegression(solver='lbfgs', max_iter=90)
    log_reg.fit(x_train, y_train_d)
    y_pred = log_reg.predict(x_test)

    print(f'Accuracy score for identifying flower {d}: {100 * accuracy_score(y_test_d, y_pred):.2f}%')  # REVISED LATER

Accuracy score for identifying flower 0: 90.00%
Accuracy score for identifying flower 1: 73.33%
Accuracy score for identifying flower 2: 73.33%
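
Task 5 also asks for train accuracy, which the loop above does not report. One extra line inside the loop (after fit) covers it:

# Mean accuracy on the training split, via the estimator's own score method.
print(f'Train accuracy for flower {d}: {100 * log_reg.score(x_train, y_train_d):.2f}%')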

6. Try to improve the accuracy of one of the versicolor/virginica classifiers by increasing the
number of basis functions (still only using sepal width and sepal length). Compute the train
and test accuracy and plot the new decision boundary.
[14]: from sklearn.model_selection import train_test_split
irissplit = iris.data[:, :1]  # NOTE: again only sepal length; iris.data[:, :2] would use both sepal features
Xtrain, Xtest, ytrain, ytest = train_test_split(irissplit, iris.target, test_size=0.20)

print(Xtrain.shape)
print(Xtest.shape)
print(ytrain.shape)
print(ytest.shape)

d = 1
train_where_d = np.where(ytrain==d)[0]
test_where_d = np.where(ytest==d)[0]

ytrain_d = np.zeros((len(ytrain),), dtype=np.float64)
ytrain_d[train_where_d] = 1

ytest_d = np.zeros((len(ytest),), dtype=np.float64)
ytest_d[test_where_d] = 1

log_reg = LogisticRegression(solver='newton-cg', max_iter=1000)
log_reg.fit(Xtrain, ytrain_d)
ypred = log_reg.predict(Xtest)

accuracy_score(ytest_d, ypred)

(120, 1)
(30, 1)
(120,)
(30,)

[14]: 0.6
[15]: d = 2
train_where_d = np.where(ytrain==d)[0]
test_where_d = np.where(ytest==d)[0]

ytrain_d = np.zeros((len(ytrain),), dtype=np.float64)
ytrain_d[train_where_d] = 1

ytest_d = np.zeros((len(ytest),), dtype=np.float64)
ytest_d[test_where_d] = 1

log_reg = LogisticRegression(solver='newton-cg', max_iter=1000)
log_reg.fit(Xtrain, ytrain_d)
ypred = log_reg.predict(Xtest)

accuracy_score(ytest_d, ypred)
[15]: 0.8
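
Cells [14] and [15] change the solver rather than adding basis functions. A sketch of the basis expansion task 6 actually asks for, reusing Xtrain/ytrain_d from above; the polynomial degree is an assumption:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Degree-3 polynomial features of the inputs serve as the extra basis functions.
model = make_pipeline(PolynomialFeatures(degree=3),
                      LogisticRegression(solver='lbfgs', max_iter=1000))
model.fit(Xtrain, ytrain_d)
print('train accuracy:', model.score(Xtrain, ytrain_d))
print('test accuracy: ', model.score(Xtest, ytest_d))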
7. Another classification algorithm available in sklearn is the support vector machine
(SVM). The model is named SVC (support vector classifier, to distinguish it from support vector
regression) and lives in the sklearn.svm module. Take a look at the docs for this classifier and
experiment with fitting SVM models to the data, again computing train and test accuracy and
plotting the decision boundary.
[16]: from sklearn import svm

for d in range(3):
    train_where_d = np.where(y_train==d)[0]
    test_where_d = np.where(y_test==d)[0]

    y_train_d = np.zeros((len(y_train),), dtype=np.float64)
    y_train_d[train_where_d] = 1

    y_test_d = np.zeros((len(y_test),), dtype=np.float64)
    y_test_d[test_where_d] = 1

    svc = svm.SVC(kernel='sigmoid', gamma='scale')
    svc.fit(x_train, y_train_d)
    y_predic = svc.predict(x_test)

    # NOTE: accuracy_score is given y_pred (the stale logistic-regression predictions),
    # not y_predic from the SVM; this is why cells [16]-[19] print identical scores.
    print(f'Accuracy score for identifying flower {d}: {100 * accuracy_score(y_test_d, y_pred):.2f}%')

plt.imshow(np.log(confusion_matrix(y_test, y_pred)), cmap='PuRd', interpolation='nearest')
plt.grid(True)
plt.ylabel('true')
plt.xlabel('predicted')
plt.style.use('seaborn')
plt.show()

Accuracy score for identifying flower 0: 40.00%
Accuracy score for identifying flower 1: 53.33%
Accuracy score for identifying flower 2: 73.33%

C:\Users\DODY\Anaconda3\lib\site-packages\ipykernel_launcher.py:20:
RuntimeWarning: divide by zero encountered in log
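
With the y_predic fix applied, kernels can be compared directly; a sketch over a hand-picked kernel list (an assumption, as is the setosa-vs-rest choice):

d = 0  # setosa-vs-rest, as an example
y_train_d = (y_train == d).astype(np.float64)
y_test_d = (y_test == d).astype(np.float64)
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    svc = svm.SVC(kernel=kernel, gamma='scale')
    svc.fit(x_train, y_train_d)
    # score() is mean accuracy; report both splits as the task asks.
    print(f'{kernel}: train {svc.score(x_train, y_train_d):.3f}, test {svc.score(x_test, y_test_d):.3f}')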

[17]: for d in range(3):
    train_where_d = np.where(y_train==d)[0]
    test_where_d = np.where(y_test==d)[0]

    y_train_d = np.zeros((len(y_train),), dtype=np.float64)
    y_train_d[train_where_d] = 1

    y_test_d = np.zeros((len(y_test),), dtype=np.float64)
    y_test_d[test_where_d] = 1

    svcnu = svm.NuSVC(nu=0.1, kernel='sigmoid', gamma=0.75)
    svcnu.fit(x_train, y_train_d)
    y_predic = svcnu.predict(x_test)

    # NOTE: same stale-y_pred issue as cell [16]; should be y_predic.
    print(f'Accuracy score for identifying flower {d}: {100 * accuracy_score(y_test_d, y_pred):.2f}%')

plt.imshow(np.log(confusion_matrix(y_test, y_pred)), cmap='PuRd', interpolation='nearest')
plt.grid(True)
plt.ylabel('true')
plt.xlabel('predicted')
plt.style.use('seaborn')
plt.show()

Accuracy score for identifying flower 0: 40.00%
Accuracy score for identifying flower 1: 53.33%
Accuracy score for identifying flower 2: 73.33%

C:\Users\DODY\Anaconda3\lib\site-packages\ipykernel_launcher.py:18:
RuntimeWarning: divide by zero encountered in log

[18]: for d in range(3):
    train_where_d = np.where(y_train==d)[0]
    test_where_d = np.where(y_test==d)[0]

    y_train_d = np.zeros((len(y_train),), dtype=np.float64)
    y_train_d[train_where_d] = 1

    y_test_d = np.zeros((len(y_test),), dtype=np.float64)
    y_test_d[test_where_d] = 1

    svc = svm.SVC(kernel='rbf', gamma='scale')
    svc.fit(x_train, y_train_d)
    y_predic = svc.predict(x_test)

    # NOTE: same stale-y_pred issue as cell [16]; should be y_predic.
    print(f'Accuracy score for identifying flower {d}: {100 * accuracy_score(y_test_d, y_pred):.2f}%')

plt.imshow(np.log(confusion_matrix(y_test, y_pred)), cmap='PuRd', interpolation='nearest')
plt.grid(True)
plt.ylabel('true')
plt.xlabel('predicted')
plt.style.use('seaborn')
plt.show()

Accuracy score for identifying flower 0: 40.00%
Accuracy score for identifying flower 1: 53.33%
Accuracy score for identifying flower 2: 73.33%

C:\Users\DODY\Anaconda3\lib\site-packages\ipykernel_launcher.py:18:
RuntimeWarning: divide by zero encountered in log

[19]: for d in range(3):
    train_where_d = np.where(y_train==d)[0]
    test_where_d = np.where(y_test==d)[0]

    y_train_d = np.zeros((len(y_train),), dtype=np.float64)
    y_train_d[train_where_d] = 1

    y_test_d = np.zeros((len(y_test),), dtype=np.float64)
    y_test_d[test_where_d] = 1

    svcrbf = svm.SVC(kernel='rbf', gamma=0.01)
    svcrbf.fit(x_train, y_train_d)
    y_predic = svcrbf.predict(x_test)

    # NOTE: same stale-y_pred issue as cell [16]; should be y_predic.
    print(f'Accuracy score for identifying flower {d}: {100 * accuracy_score(y_test_d, y_pred):.2f}%')

plt.imshow(np.log(confusion_matrix(y_test, y_pred)), cmap='PuRd', interpolation='nearest')
plt.grid(True)
plt.ylabel('true')
plt.xlabel('predicted')
plt.style.use('seaborn')
plt.show()

Accuracy score for identifying flower 0: 40.00%
Accuracy score for identifying flower 1: 53.33%
Accuracy score for identifying flower 2: 73.33%

C:\Users\DODY\Anaconda3\lib\site-packages\ipykernel_launcher.py:18:
RuntimeWarning: divide by zero encountered in log
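
Rather than trying gamma values one cell at a time, a cross-validated grid search is the systematic route; a sketch with an assumed parameter grid and a setosa-vs-rest example:

from sklearn.model_selection import GridSearchCV

# Search C and gamma for the rbf kernel; 5-fold CV on the training split.
y_train_d = (y_train == 0).astype(np.float64)
y_test_d = (y_test == 0).astype(np.float64)
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 'scale']}
search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=5)
search.fit(x_train, y_train_d)
print(search.best_params_, search.best_score_)
print('test accuracy:', search.score(x_test, y_test_d))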
