Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1 of 6

PENERAPAN SENTIMENT ANALYSIS

PADA REVIEW PENGGUNA APLIKASI SNAPCHAT


DENGAN VADER DAN ALGORITMA MACHINE LEARNING

Ade Andryani¹, Dedek Julian², Yoga Pratama³, Yulia Permata Sari⁴


Program Studi Magister Teknik Informatika
Universitas Bina Darma

Tahapan Sentiment Analysis

1. Download dataset dari Kaggle (https://www.kaggle.com/datasets/umarpervaiz/snapchat-google-playstore-reviews), kemudian upload ke Google Drive.

2. Buka google colab (https://colab.research.google.com/) dan buat file baru.

3. Masukkan baris perintah berikut untuk import library yang dibutuhkan.

!pip install pandas


!pip install nltk
!pip install sklearn
!pip install numpy
!pip install seaborn==0.10.1

from google.colab import drive


drive.mount('/content/drive')

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download("vader_lexicon")

import pandas as pd
import numpy as np

import re

4. Import dataset dari Google Drive untuk dimasukkan ke Google Colab, dan hapus kolom
yang tidak dibutuhkan; dataset juga dibatasi menjadi 5000 baris saja.

# Load the reviews CSV from Drive, drop incomplete rows and the columns
# the analysis does not use, then keep only the first 5000 reviews.
file_url = "/content/drive/MyDrive/Reviews.csv"
dataset = (
    pd.read_csv(file_url)
    .dropna()
    .drop(columns=['Date', 'Rating', 'Helpful'])
    .head(5000)
)
5. Melakukan cleaning dataset dengan library NLTK (mengubah semua huruf menjadi
huruf kecil, menghilangkan emoji, stopword, stemming) dan simpan ke dataframe baru.

# --- Step 5: clean every review text with NLTK ---
# Keep letters only (drops emoji/punctuation/digits), lowercase,
# remove English stopwords, and stem each remaining word.
row = len(dataset)
print('Total data didalam dataset :', row)

# Hoist loop invariants: one stemmer and one stopword set for all rows.
# The original rebuilt both on every iteration.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

cleanReview = []
# FIX: iterate the column positionally — after dropna() the index has
# gaps, so label-based dataset['Review Text'][i] can raise KeyError.
for text in dataset['Review Text'].iloc[:row]:
    review = re.sub('[^a-zA-Z]', ' ', str(text))
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    cleanReview.append(' '.join(review))

newDfReview = pd.DataFrame({'review': cleanReview})
dfNew = pd.concat([newDfReview], axis=1)
dfNew

6. Melakukan pembobotan nilai kata dalam setiap review, lalu melakukan penetapan
sentiment menggunakan function SentimentIntensityAnalyzer() dari nltk sentiment
vader.

# Score each cleaned review with VADER's compound score, then map the
# score to a label using the conventional thresholds
# (>= 0.05 positive, <= -0.05 negative, otherwise neutral).
sentiments = SentimentIntensityAnalyzer()
score = [sentiments.polarity_scores(text)["compound"] for text in dfNew['review']]


def _label(compound):
    # Translate one compound score into its sentiment class name.
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'


sentiment = [_label(c) for c in score]

7. Melihat hasil penetapan sentiment yang ditetapkan dengan VADER, dan melihat jumlah
data dari masing-masing sentiment tersebut.
# Join the cleaned reviews with their VADER labels, persist the result to
# CSV, and visualise the class balance as a pie chart.
newDfSentiment = pd.DataFrame({'sentiment': sentiment})
dfNew = pd.concat([newDfReview, newDfSentiment], axis=1)
dfNew
dfNew.to_csv("review_sentiment.csv", index=False)

pie_colors = ["skyblue", "lightyellow", "lightgreen"]
dfNew['sentiment'].value_counts().plot(
    kind='pie',
    autopct='%1.0f%%',
    colors=pie_colors,
)
print(newDfSentiment.value_counts())

8. Melihat kata paling sering muncul pada dataset review dalam bentuk visual.

# --- Step 8: plot the 30 most frequent words over all cleaned reviews ---
words = []
for text in cleanReview:
    # FIX: extend() instead of `words = words + [...]` — the original
    # copied the whole accumulator list every iteration (O(n^2)).
    words.extend(re.findall(r'\w+', text))

words_counts = Counter(words)

# (word, count) pairs sorted by descending frequency.
popular_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)
most_popular_wordList = [word for word, _ in popular_words]
most_popular_CountList = [count for _, count in popular_words]

plt.figure(figsize=(14, 12))
plot = sns.barplot(np.arange(30), most_popular_CountList[0:30])
plt.ylabel('Jumlah kata', fontsize=12)
plt.xticks(np.arange(30), most_popular_wordList[0:30], fontsize=12, rotation=40)
plt.title('Kata yang paling sering muncul di Review.', fontsize=12)
plt.show()

9. Melihat kata paling sering muncul pada dataset review untuk setiap sentiment (positive,
negative, dan neutral) dalam bentuk visual.
# --- Step 9: count popular-word occurrences per sentiment class ---
# BUG FIX: the original filled only `Positive`, leaving the Neutral and
# Negative plots empty, and used cleanReview.index(z), which returns the
# FIRST matching row for duplicate review texts (wrong sentiment lookup).
# enumerate() gives the correct row index, and all three classes are
# bucketed in one pass.
k = most_popular_wordList[0:30]
Positive = []
Neutral = []
Negative = []
buckets = {'positive': Positive, 'neutral': Neutral, 'negative': Negative}
for word in k:
    for idx, review in enumerate(cleanReview):
        if word in review:
            buckets[dfNew['sentiment'][idx]].append(word)

f, ax = plt.subplots(3, 1, figsize=(20, 22))
c1 = sns.countplot(Positive, ax=ax[0])
c2 = sns.countplot(Neutral, ax=ax[1])
c3 = sns.countplot(Negative, ax=ax[2])
ax[0].set_title("\nJumlah kemunculan kata popular \npada review dengan sentimen Positif", fontsize=14)
ax[1].set_title("\nJumlah kemunculan kata popular \npada review dengan sentimen Netral", fontsize=14)
ax[2].set_title("\nJumlah kemunculan kata popular \npada review dengan sentimen Negatif", fontsize=14)

10. Mempersiapkan dataset untuk training dan testing.

# --- Step 10: build bag-of-words features and split train/test 80/20 ---
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Vocabulary capped at `row` most frequent terms; X is a dense
# document-term count matrix, y the sentiment label column.
cv = CountVectorizer(max_features=row)
X = cv.fit_transform(dfNew['review']).toarray()
y = dfNew.iloc[:, 1].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

11. Melakukan training dan testing dataset dengan algoritma Random Forest.

# --- Step 11: train and evaluate a Random Forest classifier ---
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels

# FIX: the original passed n_estimators=row (= 5000 trees) — an accidental
# reuse of the dataset-size variable that makes training needlessly slow.
# 100 trees is the library default; random_state=0 matches the SVM step
# for reproducible results.
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)
rf.score(X_test, y_test)

# Per-class precision/recall/F1 rendered as a heatmap.
rfReport = classification_report(y_test, rf_pred, output_dict=True)
dfRF = pd.DataFrame(rfReport).transpose()

fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(dfRF, annot=True, cbar=False, fmt=".2f", linewidths=.1, cmap="Greens")
plt.title('Report Klasifikasi Random Forest')
plt.show()

# Confusion matrix with the class labels on both axes.
fig, ax = plt.subplots(figsize=(8, 5))
cmRF = confusion_matrix(y_test, rf_pred)
kelas = unique_labels(dfNew.iloc[:, 1].values)
sns.heatmap(cmRF, xticklabels=kelas, yticklabels=kelas, annot=True, fmt="d", cbar=False, cmap="Greens")
plt.title('Confusion Matrix Random Forest')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

12. Melakukan training dan testing dataset dengan algoritma SVM.

# --- Step 12: train and evaluate a linear-kernel SVM classifier ---
from sklearn.svm import SVC

svm = SVC(kernel='linear', random_state=0)
svm.fit(X_train, y_train)

svm_pred = svm.predict(X_test)
svm.score(X_test, y_test)

# Per-class precision/recall/F1 rendered as a heatmap.
svmReport = classification_report(y_test, svm_pred, output_dict=True)
dfSVM = pd.DataFrame(svmReport).transpose()

fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(dfSVM, annot=True, cbar=False, fmt=".2f", linewidths=.1, cmap="Blues")
plt.title('Report Klasifikasi Support Vector Machines')
plt.show()

# Confusion matrix with the class labels on both axes.
fig, ax = plt.subplots(figsize=(8, 5))
cmSVM = confusion_matrix(y_test, svm_pred)
kelas = unique_labels(dfNew.iloc[:, 1].values)
sns.heatmap(cmSVM, xticklabels=kelas, yticklabels=kelas, annot=True, fmt="d", cbar=False, cmap="Blues")
plt.title('Confusion Matrix SVM')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

You might also like