Download as pdf or txt
Download as pdf or txt
You are on page 1 of 8

3-Confussion-Matrix-hasil-modelling-OK

November 15, 2022

1 TOPIC MODELLING BIGRAM-TRIGRAM


[128]: #Sumber acuan : https://medium.com/@listari.tari/
,→topic-modeling-menggunakan-latent-dirchlect-allocation-part-2-topic-modeling-with-gensim-c9f

[24]: import pandas as pd


from pprint import pprint
import seaborn as sns

[195]: #04-topik-per-judul-confus-mtr.csv
#03-topik-per-judul-confus-mtr.csv
#03-4cluster.csv
#05-topik-per-judul_b4.csv
#03-topik-per-judul_b-confus-mtr.csv
#03-topik-per-judul.csv
bln = '10'
#inputfile='dataset/'+bln+'-topik-per-judul-str.csv'
inputfile='dataset/'+bln+'-topik-per-judul.csv'
data = pd.read_csv(inputfile, sep=',', encoding='latin-1')

[196]: data.head()

[196]: Unnamed: 0 Document_No Dominant_Topic Topic_Perc_Contrib \


0 0 0 0 0.6966
1 1 1 1 0.7970
2 2 2 1 0.5937
3 3 3 0 0.7539
4 4 4 0 0.7891

Keywords \
0 uu_cipta, warga, omnibus_law, protokol_kesehat…
1 gempa_m, demo_omnibus, hari_ini, libur_panjang…
2 gempa_m, demo_omnibus, hari_ini, libur_panjang…
3 uu_cipta, warga, omnibus_law, protokol_kesehat…
4 uu_cipta, warga, omnibus_law, protokol_kesehat…

Text \
0 ['jokowi', 'tinjau', 'progres', 'wisata', 'pre…

1
1 ['update', 'covid-19', 'jatim:', '314', 'kasus…
2 ['azerbaijan', 'vs', 'armenia', 'perang,', 'ke…
3 ['perkumpulan', 'warga', 'minang', 'surabaya',…
4 ['azerbaijan', 'vs', 'armenia', 'perang,', 'ri…

Asal
0 jokowi tinjau progres wisata premium labuan ba…
1 update covid-19 jatim: 314 kasus positif baru,…
2 azerbaijan vs armenia perang, kemlu ri: semua …
3 perkumpulan warga minang surabaya dukung machf…
4 azerbaijan vs armenia perang, ri serukan genca…

2 COBA DIBIKIN CONFUSION MATRIX


[197]: from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.metrics import confusion_matrix

#lbltopic = data['Dominant_Topic'] #textpros = data['Text']

[198]: # This converts the list of words into space-separated strings


#text_list

#df['isinya'] = df['isinya'].apply(lambda x: ' '.join(x))


#textpros = data['Text'].apply(lambda x: ' '.join(x))

count_vect = CountVectorizer()
countsv = count_vect.fit_transform(data['Text'])

[199]: transformer = TfidfTransformer().fit(countsv)


countsv = transformer.transform(countsv)

[200]: X_train, X_test, y_train, y_test = train_test_split(countsv,␣


,→data['Dominant_Topic'], test_size=0.1, random_state=69)

[201]: print(countsv[:5])

(0, 13062) 0.2789470811683493


(0, 12416) 0.3302658628965039
(0, 9983) 0.3856380533145348
(0, 9933) 0.3952840996405798
(0, 6599) 0.3952840996405798
(0, 5356) 0.19697505432178475
(0, 4652) 0.33814725661240264
(0, 1289) 0.3602292195566924

2
(0, 1045) 0.26265835012301936
(1, 12789) 0.4865069079637143
(1, 12787) 0.13427113605737787
(1, 11823) 0.12535024990481136
(1, 10996) 0.1707753898017361
(1, 10993) 0.12355543101275517
(1, 9858) 0.09629203648599599
(1, 5643) 0.4047431962748549
(1, 5632) 0.08249875186455764
(1, 5243) 0.11305965105757178
(1, 2636) 0.08407602342699545
(1, 1437) 0.3285681803271481
(1, 625) 0.41761471252258714
(1, 277) 0.2002009357807956
(1, 269) 0.20880735626129357
(1, 132) 0.34547486946410433
(2, 13073) 0.31993340458739944
(2, 12914) 0.32661014029531127
(2, 11019) 0.3464965733082987
(2, 10419) 0.24891541956174817
(2, 9330) 0.32460798562589627
(2, 5935) 0.3870418329130274
(2, 1225) 0.34071978906273015
(2, 1099) 0.33434120611855406
(2, 874) 0.35482943783014903
(3, 12984) 0.21849274041960373
(3, 11664) 0.26269513938718436
(3, 9431) 0.49306188197186135
(3, 7944) 0.3938875695892951
(3, 7793) 0.49306188197186135
(3, 6997) 0.3693175439087316
(3, 3858) 0.32479113519810243
(4, 12914) 0.3146165664353413
(4, 11141) 0.3323189370063989
(4, 11049) 0.3417996125269308
(4, 10419) 0.23977490277711946
(4, 9330) 0.3126879336409208
(4, 4326) 0.39926813999704075
(4, 4325) 0.38422281169893363
(4, 1225) 0.32820808948113606
(4, 1099) 0.3220637368814121

[202]: model = MultinomialNB().fit(X_train, y_train)

[203]: predicted = model.predict(X_test)


skorcfm = np.mean(predicted == y_test)
print(skorcfm)

3
0.8265602322206096
fjmltop = open('dataset/'+bln+'333-skor-cnf-matrix.txt','w')
fjmltop.write(str(skorcfm))
fjmltop.close()

[204]: print(confusion_matrix(y_test, predicted))

[[372 33 48]
[ 52 375 42]
[ 36 28 392]]

[205]: %matplotlib inline


import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels

[206]: def plot_confusion_matrix(y_true, y_pred, classes,


normalize=False,
title=None,
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if not title:
if normalize:
title = 'Normalized confusion matrix'
else:
title = 'Confusion matrix, without normalization'

# Compute confusion matrix


cm = confusion_matrix(y_true, y_pred)
# Only use the labels that appear in the data
classes = classes[unique_labels(y_true, y_pred)]
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')

print(cm)

fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
yticks=np.arange(cm.shape[0]),
# ... and label them with the respective list entries

4
xticklabels=classes, yticklabels=classes,
title=title,
ylabel='True label',
xlabel='Predicted label')

# Rotate the tick labels and set their alignment.


plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")

# Loop over data dimensions and create text annotations.


fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
ax.text(j, i, format(cm[i, j], fmt),
ha="center", va="center",
color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()
return ax

[207]: np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix


#https://stackoverflow.com/questions/47755723/
,→creating-a-numpy-structure-scalar-instead-of-array

dff = np.array(('0','1','2'),dtype='U10') #--> Labeling ini diambil dari -->␣


,→print(df['sentimen']) diatas

#dff = data['Dominant_Topic']

plot_confusion_matrix(y_test, predicted, classes=dff,


title='Confusion matrix, without normalization')

# Plot normalized confusion matrix


#plot_confusion_matrix(y_test, y_pred, classes=class_names, normalize=True,
# title='Normalized confusion matrix')

plt.show()

Confusion matrix, without normalization


[[372 33 48]
[ 52 375 42]
[ 36 28 392]]

5
[208]: plot_confusion_matrix(y_test, predicted, classes=dff, normalize=True,
title='Normalized confusion matrix')
plt.show()

Normalized confusion matrix


[[0.82 0.07 0.11]
[0.11 0.8 0.09]
[0.08 0.06 0.86]]

6
[166]: #confusion_matrix(y_test, predicted)

[167]: #Sumber : https://towardsdatascience.com/understanding-the-confusion-matrix-and-how-to-implement-it-in-python-319202e0fe4d#ccfe

[179]: # Accuracy
from sklearn.metrics import accuracy_score
#accuracy_score(y_true, y_pred)
print("Accuracy Score : ",accuracy_score(y_test, predicted))

Accuracy Score : 0.8418568056648308

[169]: # Recall
from sklearn.metrics import recall_score
#recall_score(y_true, y_pred, average=None)
print(recall_score(y_test, predicted, average=None))

[0.83 0.86 0.84]

[170]: # Precision
from sklearn.metrics import precision_score
#precision_score(y_true, y_pred, average=None)
print(precision_score(y_test, predicted, average=None))

[0.84 0.84 0.85]

7
[171]: #Sumber : https://medium.com/@ksnugroho/confusion-matrix-untuk-evaluasi-model-pada-unsupervised-machine-learning-bc4b1ae9ae3f

[178]: from sklearn.metrics import classification_report


#print (classification_report(y_test, y_pred))
print ("Classification Report : \n",classification_report(y_test, predicted))

Classification Report :
precision recall f1-score support

0 0.84 0.83 0.83 436


1 0.84 0.86 0.85 438
2 0.85 0.84 0.84 397

accuracy 0.84 1271


macro avg 0.84 0.84 0.84 1271
weighted avg 0.84 0.84 0.84 1271

[ ]:

[ ]:

You might also like