Documente Academic
Documente Profesional
Documente Cultură
1 Assignment - 3
1.1 Apply k-NN on the Amazon reviews dataset
Note: Used 5000 reviews — 2500 positive and 2500 negative
In [1]: %matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# https://stackoverflow.com/questions/9031783/hide-all-warnings-in-ipython
#Hide Warnings in ipython notebook
import warnings
warnings.filterwarnings('ignore')
1.1.1 Created a pickle file for the Amazon dataset with 2500 positive and 2500 negative responses
In [2]: df=pd.read_pickle('./Amazon_5000.pkl')
#sorting values TBS
df = df.sort_values(['Time'],ascending=True)
In [3]: #Creating Train and test data (80-20 split, Since i ahve small dataset i need more data
# therefore converted in 80-20 split instead of 70-30)
## 80% of sorted data will = Total number of rows (sorted) *0.8
X_train = df.iloc[:int(len(df)*.8),1]
Y_train = df.iloc[:int(len(df)*.8),-1]
# len(Y_train)== len(X_train)
X_test = df.iloc[int(len(df)*.8)+1:,1]
Y_test = df.iloc[int(len(df)*.8)+1:,-1]
# len(Y_test)== len(X_test)
1
1.2 BoW
In [5]: from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-Words: learn the vocabulary on the TRAINING reviews only, then
# encode the test reviews with that same fitted vocabulary (no leakage).
vectorizer = CountVectorizer()
Xtrain = vectorizer.fit_transform(X_train.values)
Xtest = vectorizer.transform(X_test.values)
In [6]: # https://kevinzakka.github.io/2016/07/13/k-nearest-neighbor/
# Hyper-parameter search: k-NN on the BoW features for odd K in 1..99,
# timing each iteration.
# NOTE(review): this cell is incomplete in this extract — the statements that
# compute `scores` (presumably a cross-validation call on `knn`), stop the
# timer and append to `timelist` were lost, so `scores` is undefined here.
# The original indentation was also flattened; left byte-identical.
cv_scores = []
timelist=[]
for k in range(1,100,2):
start_time = time.time()
knn = KNeighborsClassifier(n_neighbors=k)
cv_scores.append(scores.mean())
2
Training accuracy for K = 41 is 68.15
Test accuracy 69.87 for k= 41 Average time taken training in (seconds) 0.0014
# Confusion-matrix heatmap for the BoW k-NN model. `CM` is the confusion
# matrix computed in a (lost) cell above; `le` is the LabelEncoder for the
# class labels, so inverse_transform recovers the original label names.
# FIX: a stray PDF page number ("3") had been extracted into the middle of
# this pd.DataFrame(...) expression; the statement is reconstructed here.
# NOTE(review): newer scikit-learn requires array input for
# inverse_transform, i.e. le.inverse_transform([0])[0] — confirm version.
df_cm = pd.DataFrame(CM,
                     columns=[le.inverse_transform(0), le.inverse_transform(1)],
                     index=[le.inverse_transform(0), le.inverse_transform(1)])
plt.figure(figsize=(7, 7))
plt.title('BoW - Confusion Matrix')
sn.set(font_scale=1)  # for label size
sn.heatmap(df_cm, annot=True, annot_kws={"size": 16})  # annotation font size
2 TF-IDF
In [10]: # from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# (the line below is a stray PDF page number left from extraction)
4
# TF-IDF features: fit the vocabulary and IDF weights on the TRAINING
# reviews only, then transform the test reviews with the same fitted
# vectorizer (no leakage).
tf_idf_vect = TfidfVectorizer()
Xtrain_tf = tf_idf_vect.fit_transform(X_train.values)
Xtest_tf = tf_idf_vect.transform(X_test.values)
In [11]: # Hyper-parameter search: k-NN on the TF-IDF features for odd K in 1..99.
# NOTE(review): incomplete in this extract — the statements computing
# `scores` and recording elapsed time into `timelist_1` were lost, so
# `scores` is undefined here. Indentation was also flattened; left
# byte-identical.
cv_scores_1 = []
timelist_1=[]
for k in range(1,100,2):
start_time = time.time()
knn_1 = KNeighborsClassifier(n_neighbors=k)
cv_scores_1.append(scores.mean())
5
Training accuracy for K = 97 is 76.23
6
3 Avg. W2V
In [15]: from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
7
# NOTE(review): this fragment is the tail of a tokenisation helper whose
# `def` line (and the loop over sentences binding `sent` / `list_sent`)
# was lost at a page break; indentation was flattened. Left byte-identical.
# For each sentence: keep only purely alphabetic tokens, lower-cased,
# then collect the cleaned sentences into `list_sent`.
filtered_sentence=[]
for w in sent.split():
for cleaned_words in w.split():
if(cleaned_words.isalpha()):
filtered_sentence.append(cleaned_words.lower())
else:
continue
list_sent.append(filtered_sentence)
return list_sent
In [19]: Traing_vectors = []# the avg-w2v for each sentence/review is stored in this list
for sent in W2VTraining_list: # for each review/sentence
sent_vec = np.zeros(50) # as word vectors are of zero length
cnt_words =0 # num of words with a valid vector in the sentence/review
for word in sent: # for each word in a review/sentence
try:
vec = w2v_model.wv[word]
sent_vec += vec
cnt_words += 1
except:
pass
sent_vec /= cnt_words
Traing_vectors.append(sent_vec)
In [20]: # Hyper-parameter search: k-NN on the Avg-W2V vectors for odd K in 1..99.
# NOTE(review): incomplete in this extract — the statements computing
# `scores` and appending to `timelist_2` were lost, and a stray PDF page
# number ("8") was extracted into the loop body below. Left byte-identical.
cv_scores_2 = []
timelist_2=[]
for k in range(1,100,2):
start_time = time.time()
knn_2 = KNeighborsClassifier(n_neighbors=k)
8
cv_scores_2.append(scores.mean())
In [22]: Test_vectors = []# the avg-w2v for each sentence/review is stored in this list
for sent in W2VTest_list: # for each review/sentence
i=0
9
sent_vec = np.zeros(50) # as word vectors are of zero length
cnt_words =1 # num of words with a valid vector in the sentence/review
for word in sent: # for each word in a review/sentence
try:
vec = w2v_model.wv[word]
sent_vec += vec
cnt_words += 1
except:
pass
# i+=1
sent_vec /= cnt_words
# print(i,'\t',cnt_words)
Test_vectors.append(sent_vec)
10
4 TFIDF-W2V
In [25]: tfidf_feat = tf_idf_vect.get_feature_names()
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; newer
# versions use get_feature_names_out(). Confirm the pinned sklearn version.
# tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf
11
# NOTE(review): the cell below is incomplete in this extract — the outer
# `for sent in ...` loop header and the initialisations of `sent_vec`,
# `weight_sum`, `row` and `tfidf_train_vectors` were lost at a page break,
# and the indentation was flattened. Left byte-identical.
# TF-IDF weighted Word2Vec: each word's vector is scaled by that word's
# tf-idf in the current review, and the sum is normalised by the total
# tf-idf weight.
# NOTE(review): tfidf_feat.index(word) is O(vocabulary) per word — a
# precomputed {word: column} dict would make this cell linear.
for word in sent: # for each word in a review/sentence
try:
vec = w2v_model.wv[word]
# print(vec)
# obtain the tf_idfidf of a word in a sentence/review
tfidf = Xtrain_tf[row, tfidf_feat.index(word)]
# print(tfidf)
sent_vec += (vec * tfidf)
# print(sent_vec)
weight_sum += tfidf
# print(weight_sum)
except:
pass
sent_vec /= weight_sum
# print(sent_vec)
tfidf_train_vectors.append(sent_vec)
row += 1
In [26]: # Hyper-parameter search: k-NN on the TFIDF-W2V vectors for odd K in 1..99.
# NOTE(review): incomplete in this extract — the statements computing
# `scores` and appending to `timelist_3` were lost, so `scores` is
# undefined here. Indentation was flattened; left byte-identical.
cv_scores_3 = []
timelist_3=[]
for k in range(1,100,2):
start_time = time.time()
knn_3 = KNeighborsClassifier(n_neighbors=k)
cv_scores_3.append(scores.mean())
12
The optimal number of neighbors is 49
In [28]: tfidf_test_vectors = []; # the tfidf-w2v for each sentence/review is stored in this lis
row=0;
for sent in W2VTest_list: # for each review/sentence
sent_vec = np.zeros(50) # as word vectors are of zero length
weight_sum =1; # num of words with a valid vector in the sentence/review
for word in sent: # for each word in a review/sentence
try:
vec = w2v_model.wv[word]
# print(vec)
# obtain the tf_idfidf of a word in a sentence/review
tfidf = Xtest_tf[row, tfidf_feat.index(word)]
# print(tfidf)
sent_vec += (vec * tfidf)
# print(sent_vec)
weight_sum += tfidf
# print(weight_sum)
except:
pass
13
sent_vec /= weight_sum
# print(sent_vec)
tfidf_test_vectors.append(sent_vec)
row += 1
14
5 RESULTS
In [31]: # Summary table: optimal K, cross-validated (training) accuracy and test
# accuracy for each featurisation.
# NOTE(review): the four result lines below were truncated at the page
# margin during extraction (they end mid-expression); left byte-identical.
# NOTE(review): rows 3 and 4 pair the *_2 variables with "TFID W2V" and the
# *_3 variables with "Avg W2V", but the *_2 results were computed from the
# Avg-W2V cells above — the two labels may be swapped; confirm.
print("Vectroziation method \t optimal K \t Training Accuracy \t Test Accuracy")
print("-------------------- \t ----------\t ----------------- \t -------------")
print(" Bag of Words \t\t ",optimal_k,'\t\t\t',round(max(cv_scores)*100,2),'\t\t ',ro
print(" TFID \t\t ",optimal_k_1,'\t\t\t',round(max(cv_scores_1)*100,2),'\t\t
print(" TFID W2V \t\t ",optimal_k_2,'\t\t\t',round(max(cv_scores_2)*100,2),'\t\t
print(" Avg W2V \t\t ",optimal_k_3,'\t\t\t',round(max(cv_scores_3)*100,2),'\t\t
15
TFID 97 76.23 78.08
TFID W2V 55 60.77 59.96
Avg W2V 49 59.23 57.96
In [32]: # https://www.kaggle.com/andyxie/matplotlib-plot-multiple-lines
plt.figure(figsize = (10,10))
plt.plot(MSE, label="BoW")
plt.plot(MSE_1, label="TF-IDF")
plt.plot(MSE_2, label="Avg W2V")
plt.plot(MSE_3, label="TFIDF W2V")
# Add legend
plt.legend(loc='Top left')
# Add title and x, y labels
plt.title("Camparing Accuracy of Diff. Tech.", fontsize=15, fontweight='bold')
plt.suptitle("Performance comparision", fontsize=10)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.show()
16
17