# NOTE: transcribed from a notebook export; site-navigation residue
# ("Download as ...", "You are on page ...") converted to this comment.

!pip install pennylane==0.

30
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive
%mkdir qlstm-vs-lstm-haate-speech
%cd qlstm-vs-lstm-haate-speech
#télécharger la base de données
import gdown
url = "https://drive.google.com/uc?id=1-6SO5YJZlPY_fUppQiXQSVlfZ33rJ4vn"
output = "labeled_data.csv"
gdown. download (url, output)
#télécharger le code implémentant LSTM et QLSTM
url = "https://drive.google.com/uc?id=1-6-eAiPVz-8OtnD5_ip8LKbpkwgs8jt6"
output = "Factory.py"
gdown. download (url, output)
# Import required libraries
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, random_split,TensorDataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pennylane as qml
from sklearn.preprocessing import LabelEncoder
from Factory import QLSTM
# Use the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The dataset is expected as a DataFrame with two columns:
#   'tweet' (the tweet text) and 'class' (the class label).
# Class labels must be numeric for training.
# (Fix: the original comment was hard-wrapped without '#', breaking syntax.)

# Load the data from the downloaded CSV file.
df_tweets = pd.read_csv('labeled_data.csv')

# Encode the class labels as consecutive integers (0..n_classes-1).
label_encoder = LabelEncoder()
df_tweets['class'] = label_encoder.fit_transform(df_tweets['class'])
import re    # regex: detect usernames, URLs, HTML entities
import nltk  # word tokenisation (split a sentence into words)
# (Fix: removed a duplicated "import nltk" line.)

# Fetch the NLTK resources needed for stopword removal and tokenisation.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords  # English stopword list
from sklearn.model_selection import train_test_split

## note: each cleaning function below processes one text at a time
stop_words = set(stopwords.words('english'))
# "rt" marks retweets in this dataset (noise), so treat it as a stopword too
stop_words.add("rt")

# remove html entity:


def remove_entity(raw_text):
entity_regex = r"&[^\s;]+;"
text = re.sub(entity_regex, "", raw_text)
return text
# Replace every "@username" mention with the generic token "user".
def change_user(raw_text):
    """Return *raw_text* with each @-mention replaced by the word "user"."""
    return re.sub(r"@([^ ]+)", "user", raw_text)
# Remove URLs from the text.
def remove_url(raw_text):
    """Return *raw_text* with web URLs deleted.

    Uses a Gruber-style URL pattern matching http(s)://, www.-prefixed and
    bare-domain URLs, including URLs containing balanced parentheses.
    (Fix: the original regex was shattered across several lines by the
    document export, which broke the string literal.)
    """
    url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    return re.sub(url_regex, '', raw_text)
# Strip punctuation/symbols that carry no signal for classification.
def remove_noise_symbols(raw_text):
    """Return *raw_text* with quotes, "!", backticks and ".." removed."""
    text = raw_text
    # Apply the removals in the same fixed order as the original pipeline.
    for noise in ('"', "'", "!", "`", ".."):
        text = text.replace(noise, '')
    return text
# Drop English stopwords (plus "rt") from the text.
def remove_stopwords(raw_text):
    """Return *raw_text* re-joined with tokens in ``stop_words`` removed.

    Tokenisation is done with ``nltk.word_tokenize``; the comparison is
    case-insensitive via ``tok.lower()``.
    """
    kept = [tok for tok in nltk.word_tokenize(raw_text)
            if tok.lower() not in stop_words]
    return " ".join(kept)
## Clean the whole dataset by applying every cleaning function above, in order.
def preprocess(datas):
    """Return a list of cleaned texts from the iterable *datas*.

    Pipeline order: @-mentions -> HTML entities -> URLs -> noise symbols ->
    stopwords (same order as the individual helpers were designed for).
    """
    cleaned = []
    for text in datas:
        text = change_user(text)           # "@xxx" -> "user"
        text = remove_entity(text)         # drop HTML entities
        text = remove_url(text)            # drop URLs
        text = remove_noise_symbols(text)  # drop quotes / "!" / "`" / ".."
        text = remove_stopwords(text)      # drop stopwords (and "rt")
        cleaned.append(text)
    return cleaned
# Clean the tweets in place.
df_tweets['tweet'] = preprocess(df_tweets['tweet'])

# BUG FIX: train_df/test_df were used below but never created. Add the
# train/test split here. (80/20 with a fixed seed — the original split
# parameters are not recoverable from this export; adjust if needed.)
train_df, test_df = train_test_split(df_tweets, test_size=0.2, random_state=42)
train_df.head()
test_df.head()

# Vectorise the tweets with TF-IDF. max_features=75 matches the models'
# input_size defined later.
vectorizer = TfidfVectorizer(max_features=75)
X_train = vectorizer.fit_transform(train_df['tweet']).toarray()
X_test = vectorizer.transform(test_df['tweet']).toarray()
y_train = train_df['class'].values
y_test = test_df['class'].values

# Standardise features; fit on the training set only to avoid test leakage.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Torch Dataset wrapping a feature matrix X and a label vector y as tensors.
class TweetsDataset(torch.utils.data.Dataset):
    """Dataset of (float32 feature vector, int64 label) pairs."""

    def __init__(self, X, y):
        # Convert once at construction so __getitem__ is cheap.
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
# Format the data as 3D tensors of shape (batch, seq_len=1, n_features)
X_train = torch.tensor(X_train, dtype=torch.float32)
X_train = torch.unsqueeze(X_train, dim=1)  # add a sequence dimension at index 1

X_test = torch.tensor(X_test, dtype=torch.float32)


X_test = torch.unsqueeze(X_test, dim=1)  # add a sequence dimension at index 1
y_train_tensor = torch.tensor(y_train)
y_test_tensor=torch.tensor(y_test)
# Build the datasets and batched data loaders (shuffle the training set only)
train_dataset = TensorDataset(X_train, y_train_tensor)
test_dataset = TensorDataset(X_test, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


test_loader = DataLoader(test_dataset, batch_size=32)
# Classical LSTM classifier: stacked LSTM followed by a linear output layer.
class LSTMModel(nn.Module):
    """Classify sequences with a multi-layer LSTM and a final linear layer."""

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # x: (batch_size, seq_len, input_size)
        batch = x.size(0)
        # Zero-initialised hidden and cell states, on the same device as x.
        state_shape = (self.num_layers, batch, self.hidden_size)
        h0 = torch.zeros(*state_shape).to(x.device)
        c0 = torch.zeros(*state_shape).to(x.device)
        # Run the LSTM, then classify from the last time step only.
        out, _ = self.lstm(x, (h0, c0))
        return self.fc(out[:, -1, :])
# Quantum LSTM classifier: QLSTM (from Factory.py) + linear output layer.
class QLSTMModel(nn.Module):
    """Classify sequences with a quantum LSTM and a final linear layer."""

    def __init__(self, input_size, hidden_size, n_qubits, n_qlayers, num_classes):
        super(QLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.n_qubits = n_qubits
        self.n_qlayers = n_qlayers
        self.qlstm = QLSTM(input_size, hidden_size, n_qubits, n_qlayers)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # The QLSTM manages its own recurrent state internally.
        out, _ = self.qlstm(x)
        # Classify from the last time step only.
        return self.fc(out[:, -1, :])
# One training epoch over train_loader.
def train(model, criterion, optimizer, train_loader):
    """Train *model* for one epoch; return (mean loss, accuracy).

    Uses the module-level ``device`` for model and batch placement.
    """
    model.to(device)
    model.train()
    running_loss, correct, total = 0.0, 0, 0
    for inputs, labels in tqdm(train_loader, desc="Training"):
        inputs, labels = inputs.to(device), labels.to(device)
        # Standard step: zero grads, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Accumulate metrics.
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    return running_loss / len(train_loader), correct / total
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# Evaluate the LSTM on the test set and plot its confusion matrix.
def evaluate_lstm(model, criterion, test_loader):
    """Return (mean test loss, accuracy) and display the LSTM confusion matrix.

    Runs under ``torch.no_grad()``; uses the module-level ``device``.
    """
    model.eval()
    running_loss, correct, total = 0.0, 0, 0
    predictions, ground_truths = [], []
    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc="Testing"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            running_loss += criterion(outputs, labels).item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
            predictions.extend(predicted.cpu().numpy())
            ground_truths.extend(labels.cpu().numpy())
    test_loss = running_loss / len(test_loader)
    test_accuracy = correct / total

    # Confusion matrix of true vs predicted labels, shown as a heatmap.
    cm = confusion_matrix(ground_truths, predictions)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix LSTM')
    plt.show()

    return test_loss, test_accuracy


# Evaluate the QLSTM on the test set and plot its confusion matrix.
def evaluate_qlstm(model, criterion, test_loader):
    """Return (mean test loss, accuracy) and display the QLSTM confusion matrix.

    Runs under ``torch.no_grad()``; uses the module-level ``device``.
    """
    model.eval()
    running_loss, correct, total = 0.0, 0, 0
    predictions, ground_truths = [], []
    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc="Testing"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            running_loss += criterion(outputs, labels).item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
            predictions.extend(predicted.cpu().numpy())
            ground_truths.extend(labels.cpu().numpy())
    test_loss = running_loss / len(test_loader)
    test_accuracy = correct / total

    # Confusion matrix of true vs predicted labels, shown as a heatmap.
    cm = confusion_matrix(ground_truths, predictions)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix QLSTM')
    plt.show()

    return test_loss, test_accuracy


# --- Model hyperparameters ---------------------------------------------------
input_size = 75    # number of TF-IDF features per tweet
hidden_size = 64   # LSTM hidden state size
num_layers = 4     # number of stacked LSTM layers
num_classes = len(label_encoder.classes_)  # number of output classes
learning_rate = 0.01
num_epochs = 7

# Build the classical LSTM model, loss and optimizer.
lstm_model = LSTMModel(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
lstm_optimizer = torch.optim.Adam(lstm_model.parameters(), lr=learning_rate)
print(lstm_model)

# Train and evaluate the LSTM, recording per-epoch metrics for plotting.
train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []
for epoch in range(num_epochs):
    lstm_train_loss, lstm_train_accuracy = train(
        lstm_model, criterion, lstm_optimizer, train_loader)
    lstm_test_loss, lstm_test_accuracy = evaluate_lstm(
        lstm_model, criterion, test_loader)
    train_losses.append(lstm_train_loss)
    train_accuracies.append(lstm_train_accuracy)
    test_losses.append(lstm_test_loss)
    test_accuracies.append(lstm_test_accuracy)

    print(f"Epoch {epoch + 1}/{num_epochs}:")
    # (Fix: this f-string was hard-wrapped by the export, breaking syntax.)
    print(f"LSTM - Train Loss: {lstm_train_loss:.4f}, "
          f"Train Acc: {lstm_train_accuracy:.4f}, "
          f"Test Loss: {lstm_test_loss:.4f}, "
          f"Test Acc: {lstm_test_accuracy:.4f}")

# Final evaluation pass (displays the confusion matrix once more).
evaluate_lstm(lstm_model, criterion, test_loader)
# Plot the LSTM loss curves (one point per epoch)
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Train Loss')
plt.plot(test_losses, label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('LSTM Training and Test Loss')
plt.legend()
plt.show()

# Plot the LSTM accuracy curves


plt.figure(figsize=(10, 5))
plt.plot(train_accuracies, label='Train Accuracy')
plt.plot(test_accuracies, label='Test Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('LSTM Training and Test Accuracy')
plt.legend()
plt.show()
# Build the quantum LSTM model, loss and optimizer.
qlstm_model = QLSTMModel(input_size, hidden_size, n_qubits=4, n_qlayers=1,
                         num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
qlstm_optimizer = torch.optim.Adam(qlstm_model.parameters(), lr=learning_rate)
print(qlstm_model)

# Train and evaluate the QLSTM, recording per-epoch metrics for plotting.
train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []
num_epochs = 7
for epoch in range(num_epochs):
    qlstm_train_loss, qlstm_train_accuracy = train(
        qlstm_model, criterion, qlstm_optimizer, train_loader)
    qlstm_test_loss, qlstm_test_accuracy = evaluate_qlstm(
        qlstm_model, criterion, test_loader)
    train_losses.append(qlstm_train_loss)
    train_accuracies.append(qlstm_train_accuracy)
    test_losses.append(qlstm_test_loss)
    test_accuracies.append(qlstm_test_accuracy)

    print(f"Epoch {epoch + 1}/{num_epochs}:")
    # (Fix: this f-string was hard-wrapped by the export, breaking syntax.)
    print(f"QLSTM - Train Loss: {qlstm_train_loss:.4f}, "
          f"Train Acc: {qlstm_train_accuracy:.4f}, "
          f"Test Loss: {qlstm_test_loss:.4f}, "
          f"Test Acc: {qlstm_test_accuracy:.4f}")
# Plot the QLSTM loss curves (one point per epoch)
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Train Loss')
plt.plot(test_losses, label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('QLSTM Training and Test Loss')
plt.legend()
plt.show()

# Plot the QLSTM accuracy curves


plt.figure(figsize=(10, 5))
plt.plot(train_accuracies, label='Train Accuracy')
plt.plot(test_accuracies, label='Test Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('QLSTM Training and Test Accuracy')
plt.legend()
plt.show()

# End of notebook export (trailing site-navigation text removed).