
# Data Loading
import pandas as pd

imdb = pd.read_csv('imdb.csv')
imdb.columns = ["index", "text", "label"]
print(imdb.head(5))

-------------------------------------------------------------

# Basic inspection: dimensions, column names and a per-label summary
data_size = imdb.shape
print(data_size)

imdb_col_names = list(imdb.columns)
print(imdb_col_names)

print(imdb.groupby('label').describe())
print(imdb.head(3))

-------------------------------------------------------------

imdb_target = imdb['label']
print(imdb_target)

-------------------------------------------------------------

import nltk
from nltk.tokenize import word_tokenize

# Downloading everything is heavyweight; 'punkt', 'wordnet' and 'stopwords' would suffice
nltk.download('all')

def split_tokens(text):
    # Lowercase first, then tokenize the lowercased text
    message = text.lower()
    word_tokens = word_tokenize(message)
    return word_tokens

imdb['tokenized_message'] = imdb.apply(lambda row: split_tokens(row['text']), axis=1)
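A quick sanity check on a made-up review (illustrative output; the exact tokens depend on the NLTK version):

print(split_tokens("This movie was great!"))
# e.g. ['this', 'movie', 'was', 'great', '!']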

-------------------------------------------------------------

from nltk.stem.wordnet import WordNetLemmatizer

def split_into_lemmas(text):
    # Reduce each token to its dictionary form (lemma)
    lemma = []
    lemmatizer = WordNetLemmatizer()
    for word in text:
        a = lemmatizer.lemmatize(word)
        lemma.append(a)
    return lemma

imdb['lemmatized_message'] = imdb.apply(
    lambda row: split_into_lemmas(row['tokenized_message']), axis=1)

print('Tokenized message:', imdb['tokenized_message'][55])
print('Lemmatized message:', imdb['lemmatized_message'][55])
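Note that WordNetLemmatizer treats every word as a noun unless a part-of-speech tag is passed, so verb forms survive unchanged in this pipeline:

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('movies'))        # 'movie'
print(lemmatizer.lemmatize('was', pos='v'))  # 'be' (only when tagged as a verb)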

-------------------------------------------------------------

from nltk.corpus import stopwords

def stopword_removal(text):
    # Drop common English function words and join the rest back into one string
    stop_words = set(stopwords.words('english'))
    filtered_sentence = ' '.join([word for word in text if word not in stop_words])
    return filtered_sentence

imdb['preprocessed_message'] = imdb.apply(
    lambda row: stopword_removal(row['lemmatized_message']), axis=1)

print('Preprocessed message:', imdb['preprocessed_message'])

training_data = pd.Series(list(imdb['preprocessed_message']))
training_label = pd.Series(list(imdb['label']))
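For example (exact output depends on NLTK's stopword list):

print(stopword_removal(['this', 'movie', 'was', 'great']))  # 'movie great'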

-------------------------------------------------------------

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Unigrams and bigrams; min_df keeps any term appearing at least once,
# max_df drops terms present in more than 70% of the reviews
tf_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                min_df=(1 / len(training_label)),
                                max_df=0.7)

Total_Dictionary_TDM = tf_vectorizer.fit(training_data)
message_data_TDM = Total_Dictionary_TDM.transform(training_data)
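The result is a sparse term-document matrix whose width is the vocabulary size (numbers are corpus-dependent; get_feature_names_out requires scikit-learn >= 1.0):

print(message_data_TDM.shape)                      # (n_reviews, n_ngram_features)
print(len(tf_vectorizer.get_feature_names_out()))  # matches the column count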

-------------------------------------------------------------
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                   min_df=(1 / len(training_label)),
                                   max_df=0.7)

Total_Dictionary_TFIDF = tfidf_vectorizer.fit(training_data)
message_data_TFIDF = Total_Dictionary_TFIDF.transform(training_data)
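TF-IDF yields a matrix of the same shape as the count version, but terms occurring across many reviews are down-weighted and each row is L2-normalised by default:

print(message_data_TFIDF.shape)  # same shape as message_data_TDM
print(message_data_TFIDF[0])     # sparse (row, feature_index) -> tf-idf weight pairs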

-------------------------------------------------------------

from sklearn.model_selection import train_test_split

# Splitting the data for training and testing (90% train / 10% test)
train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM, training_label, test_size=0.1)
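The split is random; passing random_state makes it reproducible across runs (illustrative, reusing the seed defined in the next section):

train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM, training_label, test_size=0.1, random_state=9)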

-------------------------------------------------------------

seed = 9
from sklearn.svm import SVC

train_data_shape = train_data.shape
test_data_shape = test_data.shape

print("The shape of train data : ", train_data.shape)
print("The shape of test data : ", test_data.shape)

# Linear-kernel SVM with a small C (strong regularisation)
classifier = SVC(kernel="linear", C=0.025, random_state=seed)
classifier = classifier.fit(train_data, train_label)

# Accuracy on the held-out test split
score = classifier.score(test_data, test_label)
print('SVM Classifier : ', score)

with open('output.txt', 'w') as file:
    file.write(str((imdb['tokenized_message'][55], imdb['lemmatized_message'][55])))
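A new review can be scored by running it through the same preprocessing and vectorisation before predicting (a sketch; the review text is a placeholder):

new_review = "An absolute waste of time"  # placeholder text
vec = tf_vectorizer.transform(
    [stopword_removal(split_into_lemmas(split_tokens(new_review)))])
print(classifier.predict(vec))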

-------------------------------------------------------------

from sklearn.linear_model import SGDClassifier

train_data, test_data, train_label, test_label = train_test_split(
    message_data_TDM, training_label, test_size=0.1)

train_data_shape = train_data.shape
test_data_shape = test_data.shape

print("The shape of train data : ", train_data.shape)
print("The shape of test data : ", test_data.shape)

# Linear model trained with stochastic gradient descent
classifier = SGDClassifier(loss='modified_huber', shuffle=True, random_state=seed)
classifier = classifier.fit(train_data, train_label)

score = classifier.score(test_data, test_label)
print('SGD classifier : ', score)

with open('output1.txt', 'w') as file:
    file.write(str(imdb['preprocessed_message'][55]))
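One benefit of loss='modified_huber' over the default hinge loss is that the fitted model exposes class probabilities via predict_proba:

proba = classifier.predict_proba(test_data[:3])
print(proba)  # one row per review, one column per class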

-------------------------------------------------------------
