Professional Documents
Culture Documents
lab7
lab7
if sentence:
sentences.append(sentence)
1
train_sentences, test_sentences = train_test_split(sentences, test_size=0.2,␣
↪random_state=42)
2
viterbi_matrix = np.zeros((num_tags, num_words))
backpointer_matrix = np.zeros((num_tags, num_words), dtype=int)
# Khởi tạo
for i, tag in enumerate(tags):
if words[0] in emission_probs[tag]:
viterbi_matrix[i, 0] = transition_probs['<START>'].get(tag, 0) *␣
↪emission_probs[tag].get(words[0], 0)
else:
viterbi_matrix[i, 0] = 0
# Viterbi
for t in range(1, num_words):
for i, tag in enumerate(tags):
max_tr_prob = max(viterbi_matrix[j, t-1] *␣
↪transition_probs[tags[j]].get(tag, 0) for j in range(num_tags))
for j in range(num_tags):
if viterbi_matrix[j, t-1] * transition_probs[tags[j]].get(tag,␣
↪0) == max_tr_prob:
backpointer_matrix[i, t] = j
break
# Tìm trạng thái có xác suất cao nhất ở thời điểm cuối cùng
best_path_pointer = np.argmax(viterbi_matrix[:, num_words-1])
best_path = [best_path_pointer]
predictions.append(predicted_tags)
return predictions
3
predictions = predict(test_sentences, transition_probs, emission_probs,␣
↪tag_counts)
[ ]: import pandas as pd
from collections import defaultdict
def read_data(file_path):
    """Read a POS-tagged corpus in `word/tag` line format.

    Each non-blank line holds one token as ``word/tag``; blank lines
    separate sentences.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A list of sentences, where each sentence is a list of
        ``(word, tag)`` tuples.
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        sentence = []
        for line in file:
            line = line.strip()
            if line:
                # Split on the LAST '/' so words that themselves contain
                # a slash (e.g. dates, fractions) keep it in the word part.
                last_slash_idx = line.rfind('/')
                if last_slash_idx == -1:
                    # Malformed line with no tag separator: skip it instead
                    # of silently emitting a bogus (word, tag) pair (the
                    # original code produced word=line[:-1], tag=line here).
                    continue
                word = line[:last_slash_idx]
                tag = line[last_slash_idx + 1:]
                sentence.append((word, tag))
            else:
                # Blank line ends the current sentence (ignore runs of blanks).
                if sentence:
                    sentences.append(sentence)
                sentence = []
        # Flush the final sentence if the file has no trailing blank line.
        if sentence:
            sentences.append(sentence)
    return sentences
4
test_sentences = read_data('/content/drive/MyDrive/lab3/vi_test.txt')
[ ]: def train_hmm(train_sentences):
transition_counts = defaultdict(lambda: defaultdict(int))
emission_counts = defaultdict(lambda: defaultdict(int))
tag_counts = defaultdict(int)
[ ]: import numpy as np
5
viterbi_matrix = np.zeros((num_tags, num_words))
backpointer_matrix = np.zeros((num_tags, num_words), dtype=int)
# Khởi tạo
for i, tag in enumerate(tags):
if words[0] in emission_probs[tag]:
viterbi_matrix[i, 0] = transition_probs['<START>'].get(tag, 0) *␣
↪emission_probs[tag].get(words[0], 0)
else:
viterbi_matrix[i, 0] = 0
# Viterbi
for t in range(1, num_words):
for i, tag in enumerate(tags):
max_tr_prob = max(viterbi_matrix[j, t-1] *␣
↪transition_probs[tags[j]].get(tag, 0) for j in range(num_tags))
for j in range(num_tags):
if viterbi_matrix[j, t-1] * transition_probs[tags[j]].get(tag,␣
↪0) == max_tr_prob:
backpointer_matrix[i, t] = j
break
# Tìm trạng thái có xác suất cao nhất ở thời điểm cuối cùng
best_path_pointer = np.argmax(viterbi_matrix[:, num_words-1])
best_path = [best_path_pointer]
predictions.append(list(zip(words, predicted_tags)))
return predictions
6
predictions = predict(test_sentences, transition_probs, emission_probs,␣
↪tag_counts)