Professional Documents
Culture Documents
Sample
Sample
import string
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

%matplotlib inline
# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Path of the .tsv file that is handed to the loader further down.
path1 = "/content/drugsComTest_raw.tsv"
class DataFrame_Loader():
    """Load a delimited text file into a pandas DataFrame.

    Args:
        error_bad_lines: When truthy, a malformed row aborts the load with a
            parser error; when falsy, malformed rows are reported with a
            warning and skipped.
        sep: Field delimiter passed to ``pd.read_csv``.
    """

    def __init__(self, error_bad_lines, sep):
        self.error_bad_lines = error_bad_lines
        self.sep = sep
        print("Loadind DataFrame")

    def load_data_files(self, path1):
        """Read the file at *path1* with the options given at construction.

        Args:
            path1: Path (or file-like object) accepted by ``pd.read_csv``.

        Returns:
            The parsed ``pandas.DataFrame``.
        """
        # The constructor arguments used to be stored and then ignored in
        # favour of hard-coded values; honour them here.
        # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0,
        # so translate it to its replacement, ``on_bad_lines``.
        on_bad_lines = "error" if self.error_bad_lines else "warn"
        dftrain = pd.read_csv(path1, sep=self.sep, on_bad_lines=on_bad_lines)
        return dftrain
# Create a loader configured for tab-separated input, read the file into
# `df`, and show its first rows.
load = DataFrame_Loader(error_bad_lines=True, sep='\t')
df = load.load_data_files(path1)
df.head()
class DataFrame_Preprocessor():
    """Clean the ``review`` text column of a reviews DataFrame.

    The pipeline strips punctuation, English stopwords, the most frequent
    words, the rarest words, stems what is left, drops non-ASCII characters
    and finally removes digits.

    NOTE(review): in the source this class had no indentation, the bodies of
    ``__remove_freqwords`` / ``__remove_rarewords`` were empty and the
    ``try:`` in ``Text_Preprocessing`` had no handler. The two removal
    methods are reconstructed as frequency-based filters (top
    ``N_FREQ_WORDS`` words and bottom ``n_rare_words`` words of the corpus);
    confirm against the original notebook.

    Args:
        n_rare_words: How many of the least frequent words to strip.
    """

    # Number of top-frequency words reported by Get_Most_Commom and stripped
    # by the pipeline.
    N_FREQ_WORDS = 10

    # Built once: translation table that deletes every punctuation character.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, n_rare_words):
        # Previously hard-coded to 10 regardless of the argument.
        self.n_rare_words = n_rare_words
        # Created lazily so constructing the object needs no NLTK data.
        self._stopwords = None
        self._stemmer = None
        # Filled in by Text_Preprocessing from the corpus being cleaned.
        self._freq_words = frozenset()
        self._rare_words = frozenset()

    def __count_words(self, texts):
        """Return a Counter of whitespace-separated words over *texts*."""
        return Counter(word for text in texts for word in text.split())

    def __remove_punctuation(self, text):
        """Return *text* without any ``string.punctuation`` character."""
        return str(text).translate(self._PUNCT_TABLE)

    def __remove_stopwords(self, text):
        """Return *text* without NLTK English stopwords."""
        if self._stopwords is None:
            # Loading the list per row was the dominant cost; cache it.
            self._stopwords = set(stopwords.words('english'))
        return " ".join(
            word for word in str(text).split() if word not in self._stopwords)

    def Get_Most_Commom(self, data):
        """Return the ``N_FREQ_WORDS`` most common words of ``data['review']``.

        Args:
            data: Mapping/DataFrame whose ``'review'`` entry iterates strings.

        Returns:
            A list of ``(word, count)`` pairs, most frequent first.
        """
        # Use the argument; the global ``df`` used to be read instead.
        cnt = self.__count_words(data["review"])
        return cnt.most_common(self.N_FREQ_WORDS)

    def __remove_freqwords(self, text):
        """Return *text* without the corpus' most frequent words."""
        return " ".join(
            word for word in str(text).split() if word not in self._freq_words)

    def __remove_rarewords(self, text):
        """Return *text* without the corpus' rarest words."""
        return " ".join(
            word for word in str(text).split() if word not in self._rare_words)

    def __stem_words(self, text):
        """Return *text* with every word reduced by the Porter stemmer."""
        if self._stemmer is None:
            self._stemmer = PorterStemmer()
        stem = self._stemmer.stem
        return " ".join(stem(word) for word in text.split())

    def Text_Preprocessing(self, data):
        """Run the full cleaning pipeline.

        Args:
            data: DataFrame with at least ``'review'`` and ``'rating'``.

        Returns:
            A new two-column DataFrame (``review``, ``rating``) of strings;
            the input frame is not modified.
        """
        # Copy so the column assignments below do not write into a view.
        data = data[['review', 'rating']].copy()
        data["review"] = data["review"].apply(self.__remove_punctuation)
        data["review"] = data["review"].apply(self.__remove_stopwords)

        # Frequency tables are taken after stopword removal so that the
        # "frequent" set is not just stopwords.
        counts = self.__count_words(data["review"])
        self._freq_words = frozenset(
            word for word, _ in counts.most_common(self.N_FREQ_WORDS))
        rarest = counts.most_common()[:-self.n_rare_words - 1:-1]
        self._rare_words = frozenset(word for word, _ in rarest)

        data["review"] = data["review"].apply(self.__remove_freqwords)
        data["review"] = data["review"].apply(self.__remove_rarewords)
        data["review"] = data["review"].apply(self.__stem_words)
        # Drop any character that cannot be represented in ASCII.
        data = data.astype(str).apply(
            lambda col: col.str.encode('ascii', 'ignore').str.decode('ascii'))
        # regex=True is required: since pandas 2.0 the default is a literal
        # match, which would leave the digits in place.
        data['review'] = data['review'].str.replace(r'\d+', '', regex=True)
        return data
# Build the text preprocessor and list the most common words found in the
# 'review' column of `df`.
preprocess = DataFrame_Preprocessor(n_rare_words=10)
count = preprocess.Get_Most_Commom(data=df)
count
from sklearn.model_selection import train_test_split
# NOTE(review): this class reuses the name DataFrame_Preprocessor, so from this
# point on it replaces the text-cleaning class defined earlier in the file.
# NOTE(review): the block looks truncated. Indentation is gone, `__init__` has
# no body of its own, and the statements below read a `data` variable that no
# visible signature supplies. The caller below invokes `PR.preprocess(df)` and
# unpacks four values, yet no `preprocess` definition, no creation of the
# 'Sentiment' column and no `return` are visible here. Confirm against the
# original notebook before relying on this block.
class DataFrame_Preprocessor():
def __init__(self):
# Convert 'rating' to numeric; errors='coerce' turns unparseable values into NaN.
data['rating'] = pd.to_numeric(data['rating'],errors='coerce')
# Keep only the review text and the 'Sentiment' column.
data= data[['review','Sentiment']]
# x is the 'review' column, y is the 'Sentiment' column.
x = data['review']
y = data['Sentiment']
# Instantiate the preprocessor, unpack the four pieces returned by
# `preprocess`, and show the shape of each one.
PR = DataFrame_Preprocessor()
X_train, X_test, y_train, y_test = PR.preprocess(df)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import text
from keras.utils import np_utils
from keras.models import Sequential
# NOTE(review): this class looks truncated (no indentation, missing `def` line
# before the `try:` below, no `except`, no `return` statements). The comments
# describe only what the visible lines do.
class Keras_Tokenizer():
def __init__(self,max_features):
# NOTE(review): the `max_features` argument is ignored; 6000 is always stored.
self.max_features =6000
def __label_encoding(self,y_train):
"""
Encode the given list of class labels
:y_train_enc: returns list of encoded classes
:labels: actual class labels
"""
# NOTE(review): `LabelEncoder` is not imported anywhere in the visible file.
lbl_enc = LabelEncoder()
y_train_enc = lbl_enc.fit_transform(y_train)
labels = lbl_enc.classes_
# NOTE(review): the docstring describes two outputs, but no `return` is visible.
# NOTE(review): the `try:` below presumably belongs to a separate tokenizing
# method whose `def` line is missing; `max_features`, `train` and `test` are not
# defined in any visible scope, and no `except`/`return` follows.
try:
""" Keras Tokenizer class object """
# Build a tokenizer limited to `max_features` words and fit it on `train`.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train)
# Convert both text collections to sequences using the fitted tokenizer.
train_data = tokenizer.texts_to_sequences(train)
test_data = tokenizer.texts_to_sequences(test)
# Tokenizer helper instance, constructed with max_features=6000.
KT = Keras_Tokenizer(max_features=6000)
class RNN_Bidirectional_lstm_Build_Pack():
    """Build, compile and fit an Embedding + bidirectional-LSTM Keras model.

    NOTE(review): the ``class`` line and the ``def`` line of ``build_rnn``
    were missing from the source. The class name is restored from the call
    ``RNN_Bidirectional_lstm_Build_Pack(...)`` and the ``build_rnn``
    signature from ``Rnn_Model.build_rnn(vocab_size, 1, 200)`` further down;
    confirm both against the original notebook.

    Args:
        input_length: Sequence length given to the Embedding layer.
        output_length: Embedding vector size (``output_dim`` of Embedding).
        vocab_size: Vocabulary size (``input_dim`` of Embedding).
        optimizer: Optimizer passed to ``compile``.
        loss: Loss passed to ``compile``.
        metrics: Metrics list passed to ``compile``.
        batch_size: Batch size passed to ``fit``.
        epochs: Number of epochs passed to ``fit``.
        verbose: Verbosity passed to ``fit``.
    """

    def __init__(self,
                 input_length,
                 output_length,
                 vocab_size,
                 optimizer,
                 loss,
                 metrics,
                 batch_size,
                 epochs,
                 verbose):
        # Store what the caller asked for. Every argument used to be
        # discarded in favour of constants (e.g. epochs was always 20 even
        # though the call site passes 10).
        self.input_length = input_length
        self.output_length = output_length
        self.vocab_size = vocab_size
        self.optimizer = optimizer
        self.loss = loss
        self.metrics = metrics
        self.batch_size = batch_size
        self.epochs = epochs
        self.verbose = verbose

    def build_rnn(self, vocab_size, output_dim, input_length):
        """Return the uncompiled ``Sequential`` model.

        Args:
            vocab_size: Accepted to match the call site. As in the visible
                body, the Embedding is sized from ``self.vocab_size``.
            output_dim: Number of units of the final sigmoid layer.
            input_length: Accepted to match the call site. As in the visible
                body, the Embedding uses ``self.input_length``.

        Returns:
            The assembled, not yet compiled, Keras model.
        """
        # NOTE(review): `keras` and `Sequential` are expected at module level;
        # only `from keras.models import Sequential` is visible in the file.
        model = Sequential([
            keras.layers.Embedding(self.vocab_size,
                                   output_dim=self.output_length,
                                   input_length=self.input_length),
            keras.layers.BatchNormalization(),
            keras.layers.Bidirectional(
                keras.layers.LSTM(256, return_sequences=True)),
            keras.layers.GlobalMaxPool1D(),
            keras.layers.Dense(225, activation='relu'),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(150, activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(95, activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(64, activation='relu'),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(34, activation='relu'),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(32, activation='relu'),
            keras.layers.Dense(output_dim, activation='sigmoid'),
        ])
        return model

    def Compile_and_Fit(self, rnn_model):
        """Compile *rnn_model* with the stored settings and fit it.

        Training data is read from the module-level names ``x_pad_train`` and
        ``y_train``, which must exist before this method is called.

        Args:
            rnn_model: Model object exposing ``compile`` and ``fit``.

        Returns:
            The same model object after fitting.
        """
        # The source had a `try:` here with no handler; errors now propagate.
        rnn_model.compile(optimizer=self.optimizer,
                          loss=self.loss,
                          metrics=self.metrics)
        rnn_model.fit(x_pad_train,
                      y_train,
                      batch_size=self.batch_size,
                      epochs=self.epochs,
                      verbose=self.verbose)
        return rnn_model
# Configure the model builder. The call is wrapped in parentheses so it can
# span lines; the bare `Rnn_Model =` followed by a newline was a syntax error.
Rnn_Model = RNN_Bidirectional_lstm_Build_Pack(
    200, 200, 33068, 'adam', 'binary_crossentropy', ['acc'], 256, 10, 1)

# NOTE(review): `vocab_size` is not assigned anywhere in the visible file.
rnn_model = Rnn_Model.build_rnn(vocab_size, 1, 200)
rnn_model.summary()
rnn_model = Rnn_Model.Compile_and_Fit(rnn_model)

y_preds = rnn_model.predict(x_pad_valid)
# The output layer is a sigmoid, so predictions are probabilities in [0, 1].
# Casting them straight to int truncates practically all of them to 0, so
# threshold at 0.5 first. The comparison is a no-op if the values are
# already 0/1.
y_preds = (y_preds > 0.5).astype('int32')

# NOTE(review): `pred_df` and `metrics` are not defined in the visible file;
# `pred_df` is presumably built from `y_preds` and `metrics` is presumably
# `sklearn.metrics` — confirm against the original notebook.
print(pred_df.shape)
pred_df.head()
pred_df.value_counts()
print(metrics.accuracy_score(y_test, pred_df))
print(metrics.confusion_matrix(y_test, pred_df))
print(metrics.classification_report(y_test, pred_df))

# Save the trained model in HDF5 format.
rnn_model.save("rnn_model.h5")
import pickle
# Serialize the `tokenizer` object to tokenizer.pickle using the highest
# pickle protocol available.
with open('tokenizer.pickle', 'wb') as out_file:
    pickle.dump(tokenizer, out_file, pickle.HIGHEST_PROTOCOL)