
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import spacy
import string
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer
import warnings
warnings.filterwarnings("ignore")

path1 = "/content/drugsComTest_raw.tsv"

class DataFrame_Loader():

    def __init__(self, error_bad_lines, sep):
        self.error_bad_lines = error_bad_lines
        self.sep = sep
        print("Loading DataFrame")

    def load_data_files(self, path1):
        # Use the configured separator and bad-line handling rather than hard-coded values
        dftrain = pd.read_csv(path1, error_bad_lines=self.error_bad_lines, sep=self.sep)
        return dftrain

load = DataFrame_Loader(True, '\t')
df = load.load_data_files(path1)
df.head()
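
A quick sanity check on the loaded frame is worthwhile before any preprocessing; a minimal sketch (it assumes the standard drugsCom TSV layout, with review and rating columns as used later in this listing):

# Inspect shape, columns, and the rating distribution before cleaning
print(df.shape)
print(df.columns.tolist())
df['rating'].describe()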

class DataFrame_Preprocessor():

    def __init__(self, n_rare_words):
        self.n_rare_words = n_rare_words
        self.cnt = Counter()
        print("Preprocessor object created")

    def __remove_punctuation(self, text):
        """Custom function to remove the punctuation."""
        PUNCT_TO_REMOVE = string.punctuation
        return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

    def __remove_stopwords(self, text):
        """Custom function to remove the stopwords."""
        STOPWORDS = set(stopwords.words('english'))
        return " ".join([word for word in str(text).split() if word not in STOPWORDS])

    def Get_Most_Common(self, data):
        # Build a word-frequency counter over the raw reviews;
        # this must run before the frequent/rare-word removal below.
        self.cnt = Counter()
        for text in data["review"].values:
            for word in text.split():
                self.cnt[word] += 1
        return self.cnt.most_common(10)

    def __remove_freqwords(self, text):
        """Custom function to remove the most frequent words."""
        FREQWORDS = set([w for (w, wc) in self.cnt.most_common(10)])
        return " ".join([word for word in str(text).split() if word not in FREQWORDS])

    def __remove_rarewords(self, text):
        """Custom function to remove the rare words."""
        RAREWORDS = set([w for (w, wc) in self.cnt.most_common()[:-self.n_rare_words-1:-1]])
        return " ".join([word for word in str(text).split() if word not in RAREWORDS])

    def __stem_words(self, text):
        """Custom function to stem words with the Porter stemmer."""
        stemmer = PorterStemmer()
        return " ".join([stemmer.stem(word) for word in text.split()])

    def Text_Preprocessing(self, data):
        try:
            data = data[['review', 'rating']]
            data["review"] = data["review"].apply(lambda text: self.__remove_punctuation(text))
            data["review"] = data["review"].apply(lambda text: self.__remove_stopwords(text))
            data["review"] = data["review"].apply(lambda text: self.__remove_freqwords(text))
            data["review"] = data["review"].apply(lambda text: self.__remove_rarewords(text))
            data["review"] = data["review"].apply(lambda text: self.__stem_words(text))
            # Drop non-ASCII characters, then strip digits from the cleaned reviews
            data = data.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
            data['review'] = data['review'].str.replace(r'\d+', '', regex=True)
            return data
        except ValueError as ve:
            raise ValueError("Error in Text Preprocessing {}".format(ve))

preprocess = DataFrame_Preprocessor(10)
count = preprocess.Get_Most_Common(df)
count
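
The cleaning pipeline above is defined but never actually applied in this listing; a minimal sketch of wiring it in before the train/test split:

# Apply the full cleaning pipeline (requires Get_Most_Common to have run first)
df = preprocess.Text_Preprocessing(df)
df.head()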
from sklearn.model_selection import train_test_split

class Sentiment_Preprocessor():
    # Separate class: builds the binary sentiment label and splits train/test

    def __init__(self):
        print("Preprocessor object created")

    def preprocess(self, data):
        data['rating'] = pd.to_numeric(data['rating'], errors='coerce')
        # Ratings above 6 are treated as positive sentiment (1), the rest as negative (0)
        data['Sentiment'] = np.where(data['rating'] > 6, 1, 0)
        data = data[['review', 'Sentiment']]
        x = data['review']
        y = data['Sentiment']
        return train_test_split(x, y, test_size=0.1, random_state=0)

PR = Sentiment_Preprocessor()
X_train, X_test, y_train, y_test = PR.preprocess(df)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
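
Thresholding ratings at 6 can leave the two classes imbalanced, which is worth checking before training; a quick sketch:

# Class proportions in the train and test splits
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))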
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import text
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

class Keras_Tokenizer():

    def __init__(self, max_features):
        self.max_features = max_features
        print("Tokenizer object created")

    def __label_encoding(self, y_train):
        """
        Encode the given list of class labels
        :y_train_enc: returns list of encoded classes
        :labels: actual class labels
        """
        lbl_enc = LabelEncoder()
        y_train_enc = lbl_enc.fit_transform(y_train)
        labels = lbl_enc.classes_
        return y_train_enc, labels

    def __word_embedding(self, train, test, max_features, max_len=200):
        try:
            # Keras Tokenizer: keep only the max_features most frequent words
            tokenizer = text.Tokenizer(num_words=max_features)
            tokenizer.fit_on_texts(train)
            train_data = tokenizer.texts_to_sequences(train)
            test_data = tokenizer.texts_to_sequences(test)

            # Vocabulary size (word_index is 1-based, hence the +1)
            vocab_size = len(tokenizer.word_index) + 1

            # Pad the sequences to max_len
            x_train = sequence.pad_sequences(train_data, maxlen=max_len, padding='post')
            x_test = sequence.pad_sequences(test_data, maxlen=max_len, padding='post')

            # Return tokenizer, padded train/test data, and vocab size
            return tokenizer, x_train, x_test, vocab_size
        except ValueError as ve:
            raise ValueError("Error in word embedding {}".format(ve))

    def __feature_extraction(self, train, test):
        # Alternative feature extraction using TF-IDF vectorization (not used below)
        tfidf_vectorizer = TfidfVectorizer(max_features=self.max_features)
        x_train_tfidf = tfidf_vectorizer.fit_transform(train)
        x_test_tfidf = tfidf_vectorizer.transform(test)
        return x_train_tfidf, x_test_tfidf

    def preprocess(self, X_train, X_test):
        return self.__word_embedding(X_train, X_test, self.max_features)

KT = Keras_Tokenizer(6000)

tokenizer, x_pad_train, x_pad_valid, vocab_size = KT.preprocess(X_train, X_test)

x_pad_train.shape, x_pad_valid.shape, vocab_size
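
To confirm the tokenizer behaves as expected, one padded sequence can be decoded back into words; a minimal sketch using the tokenizer's index_word map:

# Decode the first padded training sequence, skipping the 0 padding index
sample = x_pad_train[0]
decoded = " ".join(tokenizer.index_word.get(i, '?') for i in sample if i != 0)
print(decoded)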

from tensorflow import keras


class RNN_Bidirectional_lstm_Build_Pack():

    def __init__(self,
                 input_length,
                 output_length,
                 vocab_size,
                 optimizer,
                 loss,
                 metrics,
                 batch_size,
                 epochs,
                 verbose):

        # Store the constructor arguments instead of hard-coded values
        self.input_length = input_length
        self.output_length = output_length
        self.vocab_size = vocab_size
        self.optimizer = optimizer
        self.loss = loss
        self.metrics = metrics
        self.batch_size = batch_size
        self.epochs = epochs
        self.verbose = verbose

        print("Model builder object created")

    def build_rnn(self, vocab_size, output_dim, input_dim):

        model = Sequential([
            keras.layers.Embedding(vocab_size, output_dim=self.output_length,
                                   input_length=self.input_length),
            keras.layers.BatchNormalization(),
            keras.layers.Bidirectional(keras.layers.LSTM(256, return_sequences=True)),
            keras.layers.GlobalMaxPool1D(),
            keras.layers.Dense(225, activation='relu'),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(150, activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(95, activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(64, activation='relu'),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(34, activation='relu'),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(32, activation='relu'),
            keras.layers.Dense(output_dim, activation='sigmoid')
        ])

        return model

    def Compile_and_Fit(self, rnn_model):

        try:
            rnn_model.compile(optimizer=self.optimizer, loss=self.loss,
                              metrics=self.metrics)

            rnn_model.fit(x_pad_train,
                          y_train,
                          batch_size=self.batch_size,
                          epochs=self.epochs,
                          verbose=self.verbose)  # verbose controls how much progress is displayed

            score = rnn_model.evaluate(x_pad_valid, y_test, verbose=1)
            print("Loss: %.3f Accuracy: %.3f" % (score[0], score[1]))

            return rnn_model

        except ValueError as Model_Error:
            raise ValueError("Model Compiling Error {}".format(Model_Error))

Rnn_Model = RNN_Bidirectional_lstm_Build_Pack(200, 200, 33068, 'adam',
                                              'binary_crossentropy',
                                              ['acc'], 256, 10, 1)

rnn_model = Rnn_Model.build_rnn(vocab_size, 1, 200)
rnn_model.summary()
rnn_model = Rnn_Model.Compile_and_Fit(rnn_model)
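
Fitting for a fixed epoch count can overfit a model this deep; an alternative fit call with Keras EarlyStopping is sketched below (the patience and validation_split values are assumptions, not from the original):

from tensorflow.keras.callbacks import EarlyStopping

# Stop when validation loss stops improving and keep the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
rnn_model.fit(x_pad_train, y_train,
              validation_split=0.1,
              batch_size=256, epochs=10,
              callbacks=[early_stop])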
y_preds = rnn_model.predict(x_pad_valid)

print("y_preds Shape ::", y_preds.shape)

# Threshold the sigmoid outputs at 0.5 to get hard class labels
for arr in y_preds:
    for i in range(len(arr)):
        if arr[i] > 0.5:
            arr[i] = 1
        else:
            arr[i] = 0

y_preds = y_preds.astype('int32')

pred_df = pd.DataFrame(y_preds, columns=['pred'])

print(pred_df.shape)
pred_df.head()

pred_df.value_counts()

from sklearn import metrics

print(metrics.accuracy_score(y_test, pred_df))
print(metrics.confusion_matrix(y_test, pred_df))
print(metrics.classification_report(y_test, pred_df))
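
seaborn is imported at the top of this listing but never used; a sketch that puts it to work visualising the confusion matrix:

# Plot the confusion matrix as an annotated heatmap
cm = metrics.confusion_matrix(y_test, pred_df)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()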

rnn_model.save("rnn_model.h5")

import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
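
To reuse the saved artifacts later, both the model and the tokenizer have to be reloaded; a minimal inference sketch (the sample review text is made up for illustration):

from tensorflow.keras.models import load_model

loaded_model = load_model("rnn_model.h5")
with open('tokenizer.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

# Tokenise and pad a new review exactly as the training data was
seq = loaded_tokenizer.texts_to_sequences(["this medicine worked well for me"])
padded = sequence.pad_sequences(seq, maxlen=200, padding='post')
print(loaded_model.predict(padded))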
