Sentiment Analysis Steps
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords           # used in step 5 for stopword removal
from nltk.stem.porter import PorterStemmer  # used in step 5 for stemming
nltk.download("vader_lexicon")
nltk.download("stopwords")
import pandas as pd
import numpy as np
import re
from collections import Counter    # used in step 8 for word counts
import matplotlib.pyplot as plt
import seaborn as sns
4. Import the dataset from Google Drive into Google Colab, drop the columns that are not needed, and limit the dataset to 5,000 rows.
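In Colab, Google Drive has to be mounted before the file path below is readable. A minimal sketch using the standard google.colab helper:
from google.colab import drive

# makes /content/drive/MyDrive/... visible to pd.read_csv below
drive.mount('/content/drive')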
file_url = "/content/drive/MyDrive/Reviews.csv"
dataset = pd.read_csv(file_url)
dataset = dataset.dropna()
dataset = dataset.drop(columns=['Date', 'Rating', 'Helpful'])
dataset = dataset.head(5000)
# reset the index so the positional loop in step 5 still works after dropna()
dataset = dataset.reset_index(drop=True)
5. Clean the dataset with the NLTK library (lowercase all letters, remove emoji and stopwords, apply stemming) and save the result to a new dataframe.
row = len(dataset)
print('Total rows in the dataset:', row)

cleanReview = []
ps = PorterStemmer()
stopWords = set(stopwords.words('english'))
for i in range(row):
    # keep letters only; this also strips emoji, digits, and punctuation
    review = re.sub('[^a-zA-Z]', ' ', str(dataset['Review Text'][i]))
    review = review.lower()
    review = review.split()
    # drop English stopwords and stem what remains
    review = [ps.stem(word) for word in review if word not in stopWords]
    review = ' '.join(review)
    cleanReview.append(review)

newDfReview = pd.DataFrame({'review': cleanReview})
dfNew = newDfReview.copy()
dfNew
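To make the cleaning concrete, here is the same pipeline applied to one made-up review (the sentence is purely illustrative; ps and stopWords are defined above):
sample = "I LOVED this dress!! Fits perfectly :)"
sample = re.sub('[^a-zA-Z]', ' ', sample).lower().split()
sample = ' '.join(ps.stem(w) for w in sample if w not in stopWords)
print(sample)  # stopwords like "i" and "this" are dropped; the remaining words are stemmed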
6. Weight the words in each review, then assign a sentiment label using the SentimentIntensityAnalyzer() function from NLTK's VADER module.
sentiments = SentimentIntensityAnalyzer()
score = [sentiments.polarity_scores(i)["compound"] for i in dfNew['review']]

sentiment = []
for i in score:
    if i >= 0.05:
        sentiment.append('positive')
    elif i <= -0.05:
        sentiment.append('negative')
    else:
        sentiment.append('neutral')
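For reference, polarity_scores() returns four values per text ('neg', 'neu', 'pos', and 'compound'); only the compound score, normalized to [-1, 1], is compared against the ±0.05 thresholds above:
# example on a made-up sentence; a clearly positive text yields a compound score well above 0.05
print(sentiments.polarity_scores("great quality and fast delivery"))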
7. Inspect the sentiment labels assigned by VADER, and count how many reviews fall into each sentiment class.
newDfSentiment = pd.DataFrame({'sentiment': sentiment})
dfNew = pd.concat([newDfReview, newDfSentiment], axis=1)
dfNew
dfNew.to_csv("review_sentiment.csv", index=False)
dfNew['sentiment'].value_counts().plot(kind='pie', autopct='%1.0f%%',
                                       colors=["skyblue", "lightyellow", "lightgreen"])
print(newDfSentiment.value_counts())
8. Visualize the words that appear most often in the review dataset.
words = []
for review in cleanReview:
    words = words + re.findall(r'\w+', review)
words_counts = Counter(words)

# split the 30 most common words into parallel word/count lists
most_popular = words_counts.most_common(30)
most_popular_wordList = [w for w, _ in most_popular]
most_popular_CountList = [c for _, c in most_popular]

plt.figure(figsize=(14, 12))
plot = sns.barplot(x=np.arange(30), y=most_popular_CountList)
plt.ylabel('Word count', fontsize=12)
plt.xticks(np.arange(30), most_popular_wordList, fontsize=12, rotation=40)
plt.title('Most frequent words in the reviews.', fontsize=12)
plt.show()
9. Visualize the words that appear most often in the review dataset for each sentiment (positive, negative, and neutral).
k = most_popular_wordList[0:30]
Positive = []
Neutral = []
Negative = []
for word in k:
    for idx, review in enumerate(cleanReview):
        if word in review:
            # use the row index directly; cleanReview.index() would return
            # the first duplicate and could mislabel repeated reviews
            label = dfNew['sentiment'][idx]
            if label == 'positive':
                Positive.append(word)
            elif label == 'neutral':
                Neutral.append(word)
            else:
                Negative.append(word)

f, ax = plt.subplots(3, 1, figsize=(20, 22))
c1 = sns.countplot(x=Positive, ax=ax[0])
c2 = sns.countplot(x=Neutral, ax=ax[1])
c3 = sns.countplot(x=Negative, ax=ax[2])
ax[0].set_title("\nOccurrences of popular words\nin reviews with positive sentiment", fontsize=14)
ax[1].set_title("\nOccurrences of popular words\nin reviews with neutral sentiment", fontsize=14)
ax[2].set_title("\nOccurrences of popular words\nin reviews with negative sentiment", fontsize=14)
plt.show()
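Step 10 does not appear in this excerpt, but the code in step 11 relies on X_test, y_test, rf, and svm, so it presumably covered feature extraction, the train/test split, and model fitting. A minimal sketch of such a setup, assuming TF-IDF features and scikit-learn's RandomForestClassifier and SVC (the vectorizer, split ratio, and all parameters here are assumptions, not the original's choices); it also pulls in the metric helpers used below:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels

# turn the cleaned reviews into TF-IDF features (assumed vectorizer)
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(dfNew['review'])
y = dfNew['sentiment']

# hold out 20% of the data for testing (the split ratio is an assumption)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# fit the two classifiers evaluated in the next steps
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
svm = SVC(random_state=0).fit(X_train, y_train)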
11. Train and test the dataset with the Random Forest algorithm.
rf_pred = rf.predict(X_test)
rf.score(X_test, y_test)
rfReport = classification_report(y_test, rf_pred, output_dict=True)
dfRF = pd.DataFrame(rfReport).transpose()

fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(dfRF, annot=True, cbar=False, fmt=".2f", linewidths=.1, cmap="Greens")
plt.title('Random Forest Classification Report')
plt.show()

fig, ax = plt.subplots(figsize=(8, 5))
cmRF = confusion_matrix(y_test, rf_pred)
kelas = unique_labels(dfNew.iloc[:, 1].values)
# plot the confusion matrix on the figure created above
sns.heatmap(cmRF, annot=True, fmt="d", xticklabels=kelas, yticklabels=kelas, cmap="Greens")
plt.title('Confusion Matrix Random Forest')
plt.show()
12. Train and test the dataset with the Support Vector Machines algorithm.
svm_pred = svm.predict(X_test)
svm.score(X_test, y_test)
svmReport = classification_report(y_test, svm_pred, output_dict=True)
dfSVM = pd.DataFrame(svmReport).transpose()

fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(dfSVM, annot=True, cbar=False, fmt=".2f", linewidths=.1, cmap="Blues")
plt.title('Support Vector Machines Classification Report')
plt.show()

fig, ax = plt.subplots(figsize=(8, 5))
cmSVM = confusion_matrix(y_test, svm_pred)
kelas = unique_labels(dfNew.iloc[:, 1].values)
# plot the confusion matrix on the figure created above
sns.heatmap(cmSVM, annot=True, fmt="d", xticklabels=kelas, yticklabels=kelas, cmap="Blues")
plt.title('Confusion Matrix Support Vector Machines')
plt.show()
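As a quick comparison (a small addition, not in the original), the two held-out accuracy scores can be printed side by side:
print("Random Forest accuracy:", rf.score(X_test, y_test))
print("SVM accuracy:", svm.score(X_test, y_test))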