You are on page 1 of 18

# Date: --/ -- /----

1. Implement and demonstrate the FIND-S algorithm for finding the most specific
hypothesis based on a given set of training data samples. Read the training data from a
.CSV file.

Program:

import csv


def find_s(rows):
    """Return the most specific hypothesis consistent with the positive rows.

    Each row is a list of attribute values whose last element is the class
    label ("True" marks a positive example).  Starting from the maximally
    specific hypothesis ['0', ...], each positive example generalizes the
    hypothesis only as far as it must (the FIND-S algorithm).
    """
    if not rows:
        return []
    # '0' = "no value seen yet"; '?' = "any value acceptable".
    h = ['0'] * (len(rows[0]) - 1)
    for row in rows:
        print(row)
        if row[-1] == "True":  # negative examples are ignored by FIND-S
            for j, value in enumerate(row[:-1]):
                if h[j] == '0':
                    h[j] = value      # first positive example: copy the value
                elif h[j] != value:
                    h[j] = '?'        # conflicting values: generalize
    return h


if __name__ == "__main__":
    # NOTE(review): the csv-reading lines were lost in extraction; reconstructed
    # from the surrounding `with open('tennis.csv')` context.
    with open('tennis.csv', 'r') as f:
        rows = list(csv.reader(f))
    h = find_s(rows)
    print("Most specific hypothesis is")
    print([h])  # the original printed the hypothesis nested in a list
Output

'Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same',True

'Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same',True
'Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change',False
'Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change',True

Maximally Specific set

[['Sunny', 'Warm', '?', 'Strong', '?', '?']]

Page 1
Date: --/ -- /----

2. For a given set of training data examples stored in a .CSV file, implement and
demonstrate the Candidate-Elimination algorithm to output a description of the set of all
hypotheses consistent with the training examples.

Program:
import csv

def get_domains(examples):
    """Return, for every column of `examples`, the sorted list of distinct values.

    As printed, the inner loop had no body (the `return` sat inside it), so the
    domain sets were never filled; the lost `d[i].add(xi)` line is restored.
    """
    d = [set() for i in examples[0]]
    for x in examples:
        for i, xi in enumerate(x):
            d[i].add(xi)  # restored: collect each column's value domain
    return [list(sorted(x)) for x in d]

def more_general(h1, h2):
    """True if hypothesis h1 is more general than or equal to h2, attribute-wise.

    '?' matches any value; '0' matches no value.  (The def line had been
    mangled into a "## " heading by extraction.)
    """
    more_general_parts = []
    for x, y in zip(h1, h2):
        # x covers y when x is '?', or x is a concrete value equal to y,
        # or y is the empty value '0'.
        mg = x == "?" or (x != "0" and (x == y or y == "0"))
        more_general_parts.append(mg)
    return all(more_general_parts)

def fulfills(example, hypothesis):
    """True if `hypothesis` covers `example` (i.e. is more general than it)."""
    return more_general(hypothesis, example)

def min_generalizations(h, x):
    """Return the minimal generalization of hypothesis h that covers example x."""
    h_new = list(h)
    for i in range(len(h)):
        if not fulfills(x[i:i+1], h[i:i+1]):
            # an empty slot '0' can take the observed value;
            # a conflicting concrete value must widen to '?'
            h_new[i] = '?' if h[i] != '0' else x[i]
    return [tuple(h_new)]

def min_specializations(h, domains, x):
    """Return all minimal specializations of h that exclude example x.

    `domains[i]` lists the possible values of attribute i.
    """
    results = []
    for i in range(len(h)):
        if h[i] == "?":
            # replace '?' with every domain value the example does NOT have
            for val in domains[i]:
                if x[i] != val:
                    h_new = h[:i] + (val,) + h[i+1:]
                    results.append(h_new)
        elif h[i] != "0":
            # a concrete value can only specialize to the empty value '0'
            h_new = h[:i] + ('0',) + h[i+1:]
            results.append(h_new)
    return results

def generalize_S(x, G, S):
    """Generalize the specific boundary S just enough to cover positive example x.

    Each hypothesis in S that fails to cover x is replaced by its minimal
    generalizations, kept only if still bounded above by some member of G.
    (Page-break junk that the PDF extraction injected mid-call is removed.)
    """
    S_prev = list(S)
    for s in S_prev:
        if s not in S:
            continue
        if not fulfills(x, s):
            S.remove(s)
            Splus = min_generalizations(s, x)
            # keep only generalizations still below some general hypothesis g
            S.update([h for h in Splus if
                      any([more_general(g, h)
                           for g in G])])
            # prune members that are strictly more general than another member
            S.difference_update([h for h in S if
                                 any([more_general(h, h1)
                                      for h1 in S if h != h1])])
    return S

def specialize_G(x, domains, G, S):
    """Specialize the general boundary G just enough to exclude negative example x.

    Each hypothesis in G that covers x is replaced by its minimal
    specializations, kept only if still bounded below by some member of S.
    """
    G_prev = list(G)
    for g in G_prev:
        if g not in G:
            continue
        if fulfills(x, g):
            G.remove(g)
            Gminus = min_specializations(g, domains, x)
            # keep only specializations still above some specific hypothesis s
            G.update([h for h in Gminus if any([more_general(h, s)
                                                for s in S])])
            # prune members that are strictly less general than another member
            G.difference_update([h for h in G if
                                 any([more_general(g1, h)
                                      for g1 in G if h != g1])])
    return G

def candidate_elimination(examples):
    """Run Candidate-Elimination over `examples`, printing S and G at each step.

    Each example's last element is the class label; 'Y' marks a positive
    example.  (The first print statement had been mangled into a "## "
    heading by extraction and is restored.)
    """
    domains = get_domains(examples)[:-1]  # drop the label column's domain
    n = len(domains)
    G = set([("?",)*n])                   # most general boundary
    S = set([("0",)*n])                   # most specific boundary
    print("Maximally specific hypotheses - S ")
    print("Maximally general hypotheses - G ")
    i = 0
    print("\nS[0]:", str(S), "\nG[0]:", str(G))
    for xcx in examples:
        i = i + 1
        x, cx = xcx[:-1], xcx[-1]
        if cx == 'Y':  # positive: prune G, generalize S
            G = {g for g in G if fulfills(x, g)}
            S = generalize_S(x, G, S)
        else:          # negative: prune S, specialize G
            S = {s for s in S if not fulfills(x, s)}
            G = specialize_G(x, domains, G, S)
        print("\nS[{0}]:".format(i), S)
        print("G[{0}]:".format(i), G)
    return

Page 3
Date: --/ -- /----

if __name__ == "__main__":
    # Driver: read the training examples and run Candidate-Elimination.
    # (The `with` line had been mangled into a "## " heading by extraction.)
    with open('c1.csv') as csvFile:
        examples = [tuple(line) for line in csv.reader(csvFile)]
    candidate_elimination(examples)

Output:

Page 4
Date: --/ -- /----

3. Write a program to demonstrate the working of the decision tree based ID3 algorithm.
Use an appropriate data set for building the decision tree and apply this knowledge to
classify a new sample.

Program:

import math
import csv

def load_csv(filename):
    """Read a CSV file; return (data rows, header row).

    NOTE(review): only `dataset = list(lines)` survived extraction here; the
    surrounding loader is reconstructed from the standard ID3 lab program
    (header row holds the attribute names) — confirm against the intended
    dataset format.
    """
    with open(filename, "r") as f:
        lines = csv.reader(f)
        dataset = list(lines)
    headers = dataset.pop(0)  # first row = attribute names
    return dataset, headers

class Node:
    """A decision-tree node.

    `attribute` names the attribute this node splits on (empty for leaves);
    `children` holds (attribute-value, subtree) pairs added during tree
    construction.
    """

    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []

def subtables(data, col, delete):
    """Partition `data` by the values of column `col`.

    Returns (attr, dic): the distinct values of the column, and a dict mapping
    each value to the rows that hold it.  When `delete` is true the column is
    removed from every row in place (used once an attribute is consumed by a
    split).  (The def line had been mangled into a "## " heading.)
    """
    dic = {}
    coldata = [row[col] for row in data]
    attr = list(set(coldata))  # all values of the attribute
    for k in attr:
        dic[k] = []
    for y in range(len(data)):
        key = data[y][col]
        if delete:
            del data[y][col]  # drop the consumed column from the row itself
        dic[key].append(data[y])
    return attr, dic

def entropy(S):
    """Shannon entropy (base 2) of a label list S.

    Assumes at most two distinct label values ('yes'/'no'); with more classes
    only the first two are counted, as in the original listing.
    """
    attr = list(set(S))
    if len(attr) == 1:  # all labels identical -> zero entropy
        return 0
    counts = [0, 0]     # only two label values are supported
    for i in range(2):
        counts[i] = sum([1 for x in S if attr[i] == x]) / (len(S) * 1.0)
    sums = 0
    for cnt in counts:
        sums += -1 * cnt * math.log(cnt, 2)
    return sums

def compute_gain(data, col):
    """Information gain obtained by splitting `data` on column `col`.

    As printed, the function never returned its result (the trailing
    `return total_entropy` was lost at a page break); restored here.
    """
    attValues, dic = subtables(data, col, delete=False)
    total_entropy = entropy([row[-1] for row in data])
    for x in range(len(attValues)):
        # subtract each subset's entropy, weighted by its share of the rows
        ratio = len(dic[attValues[x]]) / (len(data) * 1.0)
        entro = entropy([row[-1] for row in dic[attValues[x]]])
        total_entropy -= ratio * entro
    return total_entropy

def build_tree(data, features):
    """Recursively build an ID3 decision tree over `data` labelled by `features`.

    Leaves are Nodes with an empty attribute and an `answer` holding the class
    label; the `node.answer` assignments were lost in the listing (print_tree
    and classify both read them) and are restored here.
    """
    lastcol = [row[-1] for row in data]
    if (len(set(lastcol))) == 1:
        # all samples share one label -> leaf carrying that label
        node = Node("")
        node.answer = lastcol[0]
        return node
    n = len(data[0]) - 1
    gains = [compute_gain(data, col) for col in range(n)]
    split = gains.index(max(gains))  # attribute with the highest gain
    node = Node(features[split])
    node.answer = ""                 # inner node: no class label
    fea = features[:split] + features[split+1:]
    attr, dic = subtables(data, split, delete=True)
    for x in range(len(attr)):
        child = build_tree(dic[attr[x]], fea)
        node.children.append((attr[x], child))
    return node

def print_tree(node, level):
    """Pretty-print the decision tree, indenting two spaces per level.

    As printed, the function returned unconditionally after the first print;
    the lost leaf guard (`if node.answer != ""`) is restored so inner nodes
    and children are actually printed.
    """
    if node.answer != "":
        print(" " * level, node.answer)  # leaf: the class label
        return
    print(" " * level, node.attribute)   # inner node: the attribute name
    for value, n in node.children:
        print(" " * (level + 1), value)
        print_tree(n, level + 2)

def classify(node, x_test, features):
    """Classify one sample by walking the tree; prints the predicted label.

    As printed, the body began with a bare `return`, making everything below
    unreachable; the lost leaf branch is restored.
    """
    if node.answer != "":
        print(node.answer)  # leaf reached: this is the prediction
        return
    pos = features.index(node.attribute)
    for value, n in node.children:
        if x_test[pos] == value:
            classify(n, x_test, features)

if __name__ == "__main__":
    # NOTE(review): the data-loading lines were lost in extraction; the file
    # names below are placeholders reconstructed from the standard ID3 lab
    # program — confirm the intended CSV names.
    dataset, features = load_csv('id3.csv')
    testdata, _ = load_csv('id3_test.csv')

    node = build_tree(dataset, features)  # build the decision tree
    print("The decision tree for the dataset using ID3 algorithm is ")
    print_tree(node, 0)
    for xtest in testdata:
        print("The test instance : ", xtest)
        print("The predicted label : ", end="")
        classify(node, xtest, features)

Page 6
Date: --/ -- /----

Output:

Page 7
Date: --/ -- /----

4. Build an Artificial Neural Network by implementing the Backpropagation algorithm

and test the same using appropriate data sets.
Program:
import numpy as np

# Training data: two input features per sample, one target score each.
X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
X = X / np.amax(X, axis=0)  # scale features to [0, 1] column-wise
y = y / 100                 # scale targets to [0, 1]


def sigmoid(x):
    """Logistic activation."""
    return 1 / (1 + np.exp(-x))


def sigmoid_grad(x):
    """Derivative of the sigmoid, expressed in terms of its OUTPUT x.

    (The def line was lost in extraction, leaving this body orphaned.)
    """
    return x * (1 - x)


epoch = 1000  # training iterations
eta = 0.2     # learning rate
input_neurons = 2
hidden_neurons = 3
output_neurons = 1

# Random initialisation of weights and biases.
wh = np.random.uniform(size=(input_neurons, hidden_neurons))
bh = np.random.uniform(size=(1, hidden_neurons))
wout = np.random.uniform(size=(hidden_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # Forward pass.
    h_ip = np.dot(X, wh) + bh
    h_act = sigmoid(h_ip)
    o_ip = np.dot(h_act, wout) + bout
    output = sigmoid(o_ip)
    # Backpropagation: the delta computations (d_output, d_hidden) were lost
    # in the listing although the update step referenced them; restored here.
    Eo = y - output
    d_output = Eo * sigmoid_grad(output)
    Eh = d_output.dot(wout.T)
    d_hidden = Eh * sigmoid_grad(h_act)
    # Weight updates (the original listing did not update the biases).
    wout += h_act.T.dot(d_output) * eta
    wh += X.T.dot(d_hidden) * eta

print("Normalized Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n", output)

Output:

Page 8
Date: --/ -- /----

5. Write a program to implement the naïve Bayesian classifier for a sample training data
set stored as a .CSV file. Compute the accuracy of the classifier, considering few test data
sets.

Program:

## import csv, random, math

import statistics as st

def loadCsv(filename):
    """Read a numeric CSV file into a list of float-valued rows.

    NOTE(review): the def line and the csv.reader call were lost in
    extraction (only the list/float-conversion lines survived); reconstructed
    from the standard naive-Bayes lab program — confirm the filename handling.
    """
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

def splitDataset(dataset, splitRatio):
    """Randomly split `dataset`; return [trainSet, testSet].

    `splitRatio` is the fraction of rows moved into the TEST set (the driver
    script passes 0.2).  (The def line had been mangled into a "## " heading.)
    """
    testSize = int(len(dataset) * splitRatio)
    trainSet = list(dataset)
    testSet = []
    while len(testSet) < testSize:
        # move a uniformly random row from train to test
        index = random.randrange(len(trainSet))
        testSet.append(trainSet.pop(index))
    return [trainSet, testSet]

def separateByClass(dataset):
    """Group the rows of `dataset` by class label (the last element of each row)."""
    separated = {}
    for row in dataset:
        label = row[-1]
        separated.setdefault(label, []).append(row)
    return separated

def compute_mean_std(dataset):
    """Per-attribute (mean, stdev) pairs for `dataset`, label column excluded."""
    stats = [(st.mean(column), st.stdev(column)) for column in zip(*dataset)]
    del stats[-1]  # discard the statistics of the trailing class column
    return stats

def summarizeByClass(dataset):
    """Map each class label to the (mean, stdev) summary of its rows."""
    summary = {}
    for label, rows in separateByClass(dataset).items():
        summary[label] = compute_mean_std(rows)
    return summary

def estimateProbability(x, mean, stdev):
    """Gaussian probability density of x under N(mean, stdev).

    NOTE: raises ZeroDivisionError when stdev is 0 (a constant attribute),
    as in the original listing.  (The def line had been mangled into a
    "## " heading.)
    """
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, testVector):
    """Naive-Bayes likelihood of `testVector` under each class's Gaussian model.

    (Page-break junk that the PDF extraction injected after the def line is
    removed.)
    """
    p = {}
    for classValue, classSummaries in summaries.items():
        p[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = testVector[i]
            # naive independence assumption: multiply per-attribute densities
            p[classValue] *= estimateProbability(x, mean, stdev)
    return p

def predict(summaries, testVector):
    """Return the class label with the highest naive-Bayes probability.

    (The def line had been mangled into a "## " heading by extraction.)
    """
    all_p = calculateClassProbabilities(summaries, testVector)
    bestLabel, bestProb = None, -1
    for lbl, p in all_p.items():
        if bestLabel is None or p > bestProb:
            bestProb = p
            bestLabel = lbl
    return bestLabel

def perform_classification(summaries, testSet):
    """Predict a label for every row of `testSet`.

    (The def line had been mangled into a "## " heading by extraction.)
    """
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

def getAccuracy(testSet, predictions):
    """Percentage of rows whose actual label (last column) matches the prediction.

    (The def line had been mangled into a "## " heading by extraction.)
    """
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(testSet))) * 100.0

if __name__ == "__main__":
    # NOTE(review): the dataset-loading line was lost in extraction;
    # 'pima-indians-diabetes.csv' is what this lab program conventionally
    # reads — confirm the intended file name.
    dataset = loadCsv('pima-indians-diabetes.csv')

    print('Total instances available :', len(dataset))
    print('Total attributes present :', len(dataset[0]) - 1)
    print("First Five instances of dataset:")
    for i in range(5):
        print(i + 1, ':', dataset[i])

    splitRatio = 0.2
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('\nDataset is split into training and testing set.')
    print('Training examples = {0} \nTesting examples = {1}'.format(
        len(trainingSet), len(testSet)))

    summaries = summarizeByClass(trainingSet)
    predictions = perform_classification(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('\nAccuracy of the Naive Baysian Classifier is :', accuracy)

Page 10
Date: --/ -- /----

Output:

Page 11
Date: --/ -- /----

6. Assuming a set of documents that need to be classified, use the naïve Bayesian Classifier
model to perform this task. Built-in Java classes/API can be used to write the program.
Calculate the accuracy, precision, and recall for your data set.

Program:
import pandas as pd

# NOTE(review): the loading line was lost in extraction; 'naivetext.csv' with
# columns message,label is what this lab program conventionally reads —
# confirm the intended file name.
msg = pd.read_csv('naivetext.csv', names=['message', 'label'])
print('Total instances in the dataset:', msg.shape[0])

# Map text labels to numbers for the classifier.
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
Y = msg.labelnum
print('\nThe message and its label of first 5 instances are listed below')
X5, Y5 = X[0:5], msg.label[0:5]
for x, y in zip(X5, Y5):
    print(x, ',', y)

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)
print('\nDataset is split into Training and Testing samples')
print('Total training instances :', xtrain.shape[0])
print('Total testing instances :', xtest.shape[0])

from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: fit the vocabulary on the training texts only.
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)
print('\nTotal features extracted using CountVectorizer:', xtrain_dtm.shape[1])
print('\nFeatures for first 5 training instances are listed below')
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; on
# modern versions use get_feature_names_out().
df = pd.DataFrame(xtrain_dtm.toarray(),
                  columns=count_vect.get_feature_names())
print(df[0:5])

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)
print('\nClassstification results of testing samples are given below')
for doc, p in zip(xtest, predicted):
    pred = 'pos' if p == 1 else 'neg'
    print('%s -> %s ' % (doc, pred))

from sklearn import metrics
print('\nAccuracy metrics')
print('Accuracy of the classifer is', metrics.accuracy_score(ytest, predicted))
print('Recall :', metrics.recall_score(ytest, predicted),
      '\nPrecison :', metrics.precision_score(ytest, predicted))

Page 12
Date: --/ -- /----

Output:

Page 13
Date: --/ -- /----

8. Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same data set
for clustering using k-Means algorithm. Compare the results of these two algorithms and
comment on the quality of clustering. You can add Java/Python ML library classes/API in
the program

Program:

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

# Restored: the iris load call was lost in extraction (X below reads iris.data).
iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
X.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']
y = pd.DataFrame(iris.target)
y.columns = ['Targets']

# K-Means with the known number of species.
model = KMeans(n_clusters=3)
model.fit(X)

plt.figure(figsize=(14, 14))
colormap = np.array(['red', 'lime', 'black'])

# Panel 1: ground-truth species labels.
plt.subplot(2, 2, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Real Clusters')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

# Panel 2: K-Means cluster assignments.
plt.subplot(2, 2, 2)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[model.labels_], s=40)
plt.title('K-Means Clustering')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

# Standardize the features before fitting the Gaussian mixture (EM).
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns=X.columns)

from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
gmm_y = gmm.predict(xs)

# Panel 3: GMM (EM) cluster assignments.
plt.subplot(2, 2, 3)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[gmm_y], s=40)
plt.title('GMM Clustering')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

print('Observation: The GMM using EM algorithm based clustering matched '
      'the true labels more closely than the Kmeans.')

Page 14
Date: --/ -- /----

Output:

Page 15
Date: --/ -- /----

9. Write a program to implement k-Nearest Neighbour algorithm to classify the iris data
set. Print both correct and wrong predictions. Java/Python ML library classes can be used
for this problem.
Program:

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets

# Restored: the iris load call was lost in extraction (the split reads iris.data).
iris = datasets.load_iris()

# Hold out 10% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.1)
print("Dataset is split into training and testing...")
print("Size of trainng data and its label", x_train.shape, y_train.shape)
print("Size of trainng data and its label", x_test.shape, y_test.shape)
for i in range(len(iris.target_names)):
    print("Label", i, "-", str(iris.target_names[i]))

# 1-nearest-neighbour classifier.
classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print("Results of Classification using K-nn with K=1 ")
for r in range(0, len(x_test)):
    print(" Sample:", str(x_test[r]), " Actual-label:", str(y_test[r]),
          " Predicted-label:", str(y_pred[r]))
print("Classification Accuracy :", classifier.score(x_test, y_test))

Output:

Page 16
Date: --/ -- /----

10. Implement the non-parametric Locally Weighted Regression algorithm in order to fit
data points. Select appropriate data set for your experiment and draw graphs.
Program:

## from numpy import *

import operator
from os import listdir
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np1
import numpy.linalg as np
from scipy.stats.stats import pearsonr

def kernel(point, xmat, k):
    """Diagonal weight matrix for locally weighted regression.

    The weight of training row j decays as a Gaussian of its distance from
    `point`, with bandwidth k.  FIX: the listing referenced the module-level
    global X instead of the `xmat` parameter (every call site passes the same
    X, so results are unchanged, but the function was not self-contained).
    """
    m, n = np1.shape(xmat)
    weights = np1.mat(np1.eye((m)))
    for j in range(m):
        diff = point - xmat[j]
        weights[j, j] = np1.exp(diff * diff.T / (-2.0 * k ** 2))
    return weights

def localWeight(point, xmat, ymat, k):
    """Weighted least-squares coefficients at `point`: (X'wX)^-1 (X'w y).

    FIX: the listing used the module-level global X instead of the `xmat`
    parameter (call sites pass the same X, so results are unchanged, but the
    function was not self-contained).
    """
    wei = kernel(point, xmat, k)
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W

def localWeightRegression(xmat, ymat, k):
    """Predict a response for every row of xmat with locally weighted regression.

    A separate weighted linear model is fitted, centred on each query row.
    """
    m, n = np1.shape(xmat)
    ypred = np1.zeros(m)
    for idx in range(m):
        beta = localWeight(xmat[idx], xmat, ymat, k)  # local fit at row idx
        ypred[idx] = xmat[idx] * beta
    return ypred

bill = np1.array(data.total_bill)
tip = np1.array(data.tip)
mbill = np1.mat(bill)
mtip = np1.mat(tip)
m= np1.shape(mbill)[1]
one = np1.mat(np1.ones(m))
X= np1.hstack((one.T,mbill.T))

#set k here
ypred = localWeightRegression(X,mtip,2)
SortIndex = X[:,1].argsort(0)
xsort = X[SortIndex][:,0]
fig = plt.figure()