
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import spacy
import string
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import PorterStemmer
import warnings
warnings.filterwarnings("ignore")

path1 = "/content/drugsComTest_raw.tsv"

class DataFrame_Loader():

    def __init__(self, error_bad_lines, sep):
        self.error_bad_lines = error_bad_lines
        self.sep = sep
        print("Loading DataFrame")

    def load_data_files(self, path1):
        # Use the configured separator and bad-line handling rather than hard-coded values
        dftrain = pd.read_csv(path1, error_bad_lines=self.error_bad_lines, sep=self.sep)
        return dftrain

load = DataFrame_Loader(True, '\t')
df = load.load_data_files(path1)
df.head()
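
A quick sanity check on the loaded frame is worthwhile before any preprocessing; a minimal sketch (it assumes the standard drugsCom TSV layout, with review and rating columns as used later in this listing):

# Inspect shape, columns, and the rating distribution before cleaning
print(df.shape)
print(df.columns.tolist())
df['rating'].describe()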

class DataFrame_Preprocessor():

    def __init__(self, n_rare_words):
        self.n_rare_words = n_rare_words
        self.cnt = Counter()
        print("Preprocessor object created")

    def __remove_punctuation(self, text):
        """Custom function to remove the punctuation."""
        PUNCT_TO_REMOVE = string.punctuation
        return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

    def __remove_stopwords(self, text):
        """Custom function to remove the stopwords."""
        STOPWORDS = set(stopwords.words('english'))
        return " ".join([word for word in str(text).split() if word not in STOPWORDS])

    def Get_Most_Common(self, data):
        # Build a word-frequency counter over the raw reviews;
        # this must run before the frequent/rare-word removal below.
        self.cnt = Counter()
        for text in data["review"].values:
            for word in text.split():
                self.cnt[word] += 1
        return self.cnt.most_common(10)

    def __remove_freqwords(self, text):
        """Custom function to remove the most frequent words."""
        FREQWORDS = set([w for (w, wc) in self.cnt.most_common(10)])
        return " ".join([word for word in str(text).split() if word not in FREQWORDS])

    def __remove_rarewords(self, text):
        """Custom function to remove the rare words."""
        RAREWORDS = set([w for (w, wc) in self.cnt.most_common()[:-self.n_rare_words-1:-1]])
        return " ".join([word for word in str(text).split() if word not in RAREWORDS])

    def __stem_words(self, text):
        """Custom function to stem words with the Porter stemmer."""
        stemmer = PorterStemmer()
        return " ".join([stemmer.stem(word) for word in text.split()])

    def Text_Preprocessing(self, data):
        try:
            data = data[['review', 'rating']]
            data["review"] = data["review"].apply(lambda text: self.__remove_punctuation(text))
            data["review"] = data["review"].apply(lambda text: self.__remove_stopwords(text))
            data["review"] = data["review"].apply(lambda text: self.__remove_freqwords(text))
            data["review"] = data["review"].apply(lambda text: self.__remove_rarewords(text))
            data["review"] = data["review"].apply(lambda text: self.__stem_words(text))
            # Drop non-ASCII characters, then strip digits from the cleaned reviews
            data = data.astype(str).apply(lambda x: x.str.encode('ascii', 'ignore').str.decode('ascii'))
            data['review'] = data['review'].str.replace(r'\d+', '', regex=True)
            return data
        except ValueError as ve:
            raise ValueError("Error in Text Preprocessing {}".format(ve))

preprocess = DataFrame_Preprocessor(10)
count = preprocess.Get_Most_Common(df)
count
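
The cleaning pipeline above is defined but never actually applied in this listing; a minimal sketch of wiring it in before the train/test split:

# Apply the full cleaning pipeline (requires Get_Most_Common to have run first)
df = preprocess.Text_Preprocessing(df)
df.head()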
from sklearn.model_selection import train_test_split

class Sentiment_Preprocessor():
    # Separate class: builds the binary sentiment label and splits train/test

    def __init__(self):
        print("Preprocessor object created")

    def preprocess(self, data):
        data['rating'] = pd.to_numeric(data['rating'], errors='coerce')
        # Ratings above 6 are treated as positive sentiment (1), the rest as negative (0)
        data['Sentiment'] = np.where(data['rating'] > 6, 1, 0)
        data = data[['review', 'Sentiment']]
        x = data['review']
        y = data['Sentiment']
        return train_test_split(x, y, test_size=0.1, random_state=0)

PR = Sentiment_Preprocessor()
X_train, X_test, y_train, y_test = PR.preprocess(df)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
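
Thresholding ratings at 6 can leave the two classes imbalanced, which is worth checking before training; a quick sketch:

# Class proportions in the train and test splits
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))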
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import text
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

class Keras_Tokenizer():

    def __init__(self, max_features):
        self.max_features = max_features
        print("Tokenizer object created")

    def __label_encoding(self, y_train):
        """
        Encode the given list of class labels
        :y_train_enc: returns list of encoded classes
        :labels: actual class labels
        """
        lbl_enc = LabelEncoder()
        y_train_enc = lbl_enc.fit_transform(y_train)
        labels = lbl_enc.classes_
        return y_train_enc, labels

    def __word_embedding(self, train, test, max_features, max_len=200):
        try:
            # Keras Tokenizer: keep only the max_features most frequent words
            tokenizer = text.Tokenizer(num_words=max_features)
            tokenizer.fit_on_texts(train)
            train_data = tokenizer.texts_to_sequences(train)
            test_data = tokenizer.texts_to_sequences(test)

            # Vocabulary size (word_index is 1-based, hence the +1)
            vocab_size = len(tokenizer.word_index) + 1

            # Pad the sequences to max_len
            x_train = sequence.pad_sequences(train_data, maxlen=max_len, padding='post')
            x_test = sequence.pad_sequences(test_data, maxlen=max_len, padding='post')

            # Return tokenizer, padded train/test data, and vocab size
            return tokenizer, x_train, x_test, vocab_size
        except ValueError as ve:
            raise ValueError("Error in word embedding {}".format(ve))

    def __feature_extraction(self, train, test):
        # Alternative feature extraction using TF-IDF vectorization (not used below)
        tfidf_vectorizer = TfidfVectorizer(max_features=self.max_features)
        x_train_tfidf = tfidf_vectorizer.fit_transform(train)
        x_test_tfidf = tfidf_vectorizer.transform(test)
        return x_train_tfidf, x_test_tfidf

    def preprocess(self, X_train, X_test):
        return self.__word_embedding(X_train, X_test, self.max_features)

KT = Keras_Tokenizer(6000)

tokenizer, x_pad_train, x_pad_valid, vocab_size = KT.preprocess(X_train, X_test)

x_pad_train.shape, x_pad_valid.shape, vocab_size
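
To confirm the tokenizer behaves as expected, one padded sequence can be decoded back into words; a minimal sketch using the tokenizer's index_word map:

# Decode the first padded training sequence, skipping the 0 padding index
sample = x_pad_train[0]
decoded = " ".join(tokenizer.index_word.get(i, '?') for i in sample if i != 0)
print(decoded)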

from tensorflow import keras


class RNN_Bidirectional_lstm_Build_Pack():

    def __init__(self,
                 input_length,
                 output_length,
                 vocab_size,
                 optimizer,
                 loss,
                 metrics,
                 batch_size,
                 epochs,
                 verbose):

        # Store the constructor arguments instead of hard-coded values
        self.input_length = input_length
        self.output_length = output_length
        self.vocab_size = vocab_size
        self.optimizer = optimizer
        self.loss = loss
        self.metrics = metrics
        self.batch_size = batch_size
        self.epochs = epochs
        self.verbose = verbose

        print("Model builder object created")

    def build_rnn(self, vocab_size, output_dim, input_dim):

        model = Sequential([
            keras.layers.Embedding(vocab_size, output_dim=self.output_length,
                                   input_length=self.input_length),
            keras.layers.BatchNormalization(),
            keras.layers.Bidirectional(keras.layers.LSTM(256, return_sequences=True)),
            keras.layers.GlobalMaxPool1D(),
            keras.layers.Dense(225, activation='relu'),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(150, activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(95, activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(64, activation='relu'),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(34, activation='relu'),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(32, activation='relu'),
            keras.layers.Dense(output_dim, activation='sigmoid')
        ])

        return model

    def Compile_and_Fit(self, rnn_model):

        try:
            rnn_model.compile(optimizer=self.optimizer, loss=self.loss,
                              metrics=self.metrics)

            rnn_model.fit(x_pad_train,
                          y_train,
                          batch_size=self.batch_size,
                          epochs=self.epochs,
                          verbose=self.verbose)  # verbose controls how much progress is displayed

            score = rnn_model.evaluate(x_pad_valid, y_test, verbose=1)
            print("Loss: %.3f Accuracy: %.3f" % (score[0], score[1]))

            return rnn_model

        except ValueError as Model_Error:
            raise ValueError("Model Compiling Error {}".format(Model_Error))

Rnn_Model = RNN_Bidirectional_lstm_Build_Pack(200, 200, 33068, 'adam',
                                              'binary_crossentropy',
                                              ['acc'], 256, 10, 1)

rnn_model = Rnn_Model.build_rnn(vocab_size, 1, 200)
rnn_model.summary()
rnn_model = Rnn_Model.Compile_and_Fit(rnn_model)
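
Fitting for a fixed epoch count can overfit a model this deep; an alternative fit call with Keras EarlyStopping is sketched below (the patience and validation_split values are assumptions, not from the original):

from tensorflow.keras.callbacks import EarlyStopping

# Stop when validation loss stops improving and keep the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
rnn_model.fit(x_pad_train, y_train,
              validation_split=0.1,
              batch_size=256, epochs=10,
              callbacks=[early_stop])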
y_preds = rnn_model.predict(x_pad_valid)

print("y_preds Shape ::", y_preds.shape)

# Threshold the sigmoid outputs at 0.5 to get hard class labels
for arr in y_preds:
    for i in range(len(arr)):
        if arr[i] > 0.5:
            arr[i] = 1
        else:
            arr[i] = 0

y_preds = y_preds.astype('int32')

pred_df = pd.DataFrame(y_preds, columns=['pred'])

print(pred_df.shape)
pred_df.head()

pred_df.value_counts()

from sklearn import metrics

print(metrics.accuracy_score(y_test, pred_df))
print(metrics.confusion_matrix(y_test, pred_df))
print(metrics.classification_report(y_test, pred_df))
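
seaborn is imported at the top of this listing but never used; a sketch that puts it to work visualising the confusion matrix:

# Plot the confusion matrix as an annotated heatmap
cm = metrics.confusion_matrix(y_test, pred_df)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()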

rnn_model.save("rnn_model.h5")

import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
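
To reuse the saved artifacts later, both the model and the tokenizer have to be reloaded; a minimal inference sketch (the sample review text is made up for illustration):

from tensorflow.keras.models import load_model

loaded_model = load_model("rnn_model.h5")
with open('tokenizer.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

# Tokenise and pad a new review exactly as the training data was
seq = loaded_tokenizer.texts_to_sequences(["this medicine worked well for me"])
padded = sequence.pad_sequences(seq, maxlen=200, padding='post')
print(loaded_model.predict(padded))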
