lab7

lab7
June 17, 2024
[2]: import numpy as np

import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
# In [3]:
# Read the NER corpus from the CSV file, decoding it as latin-1.
# NOTE(review): the path presumably requires Google Drive to be mounted at
# /content/drive - confirm before running outside Colab.
df = pd.read_csv('/content/drive/MyDrive/lab3/NER-1.csv', encoding='latin-1')

# Build the vocabulary and the tag inventory as lists.
# Sorting (with key=str so that any non-string cell such as NaN cannot break
# the comparison) gives the same order on every run; a bare list(set(...))
# order changes between interpreter sessions.
words = sorted(set(df['Word'].values), key=str)
tags = sorted(set(df['Tag'].values), key=str)

print(f"Số lượng từ: {len(words)}")
print(f"Số lượng nhãn: {len(tags)}")
Số lượng từ: 35178

Số lượng nhãn: 18
# In [ ]:
# Group the token rows of `df` into sentences, each a list of (word, tag).
# A non-null 'Sentence #' cell marks the first token of a new sentence; rows
# whose 'Sentence #' is null continue the current sentence.
sentences = []
sentence = []
# Zipping the three columns avoids building a Series per row as iterrows does.
for sentence_id, word, tag in zip(df['Sentence #'], df['Word'], df['Tag']):
    if pd.isnull(sentence_id):
        sentence.append((word, tag))
    else:
        if sentence:
            sentences.append(sentence)
        sentence = [(word, tag)]
if sentence:
    # Flush the last sentence: no further marker row follows it.
    sentences.append(sentence)

# Split the sentences into training and test sets (80/20, fixed seed).
train_sentences, test_sentences = train_test_split(
    sentences, test_size=0.2, random_state=42)

print(f"Số lượng câu trong tập huấn luyện: {len(train_sentences)}")
print(f"Số lượng câu trong tập kiểm thử: {len(test_sentences)}")
Số lượng câu trong tập huấn luyện: 51604

Số lượng câu trong tập kiểm thử: 12901
[ ]: # Tính toán xác suất chuyển tiếp và phát xạ

def train_hmm(train_sentences):
    """Estimate HMM transition and emission probabilities by relative frequency.

    Args:
        train_sentences: iterable of sentences, each a sequence of
            ``(word, tag)`` pairs.

    Returns:
        A tuple ``(transition_probs, emission_probs, tag_counts)``:
          * ``transition_probs[prev_tag][tag]`` - probability of ``tag``
            following ``prev_tag``. ``'<START>'`` is used as the previous tag
            of the first token and ``'<END>'`` as the tag after the last one.
          * ``emission_probs[tag][word]`` - probability of ``word`` given ``tag``.
          * ``tag_counts[tag]`` - number of tokens carrying ``tag``.
    """
    transition_counts = defaultdict(lambda: defaultdict(int))
    emission_counts = defaultdict(lambda: defaultdict(int))
    tag_counts = defaultdict(int)

    for sentence in train_sentences:
        if not sentence:
            # An empty sentence would only add a spurious <START> -> <END>
            # transition, so it is ignored.
            continue
        prev_tag = '<START>'
        for word, tag in sentence:
            transition_counts[prev_tag][tag] += 1
            emission_counts[tag][word] += 1
            tag_counts[tag] += 1
            prev_tag = tag
        # Close the sentence once, after its last token.
        transition_counts[prev_tag]['<END>'] += 1

    # Convert the counts to probabilities by normalising each row.
    transition_probs = defaultdict(dict)
    emission_probs = defaultdict(dict)

    for prev_tag, next_tag_counts in transition_counts.items():
        total = sum(next_tag_counts.values())
        for tag, count in next_tag_counts.items():
            transition_probs[prev_tag][tag] = count / total

    for tag, word_counts in emission_counts.items():
        total = sum(word_counts.values())
        for word, count in word_counts.items():
            emission_probs[tag][word] = count / total

    return transition_probs, emission_probs, tag_counts
# Train the HMM on the training sentences; keeps the transition table, the
# emission table and the per-tag token counts for the decoding step.

transition_probs, emission_probs, tag_counts = train_hmm(train_sentences)
# In [ ]:
def viterbi(words, transition_probs, emission_probs, tag_counts):
    """Return the most likely tag sequence for ``words`` under the HMM.

    Decoding is done in log space so that long sentences do not underflow to
    zero. Probabilities missing from the tables (unseen word for a tag,
    unseen transition) are floored at a tiny constant instead of being
    treated as exactly zero, so a single unknown word no longer wipes out
    every path.

    Args:
        words: sequence of tokens to tag.
        transition_probs: mapping ``prev_tag -> {tag: probability}``; the
            key ``'<START>'`` holds the initial distribution.
        emission_probs: mapping ``tag -> {word: probability}``.
        tag_counts: mapping whose keys define the candidate tags (and their
            order, which also decides ties: the first tag wins).

    Returns:
        A list with one tag per word; ``[]`` when ``words`` or the tag
        inventory is empty.
    """
    tags = list(tag_counts.keys())
    num_tags = len(tags)
    num_words = len(words)
    if num_words == 0 or num_tags == 0:
        return []

    min_prob = 1e-12  # floor for probabilities absent from the tables

    def log_transitions_from(prev_tag):
        # .get() avoids inserting keys into a defaultdict passed by the caller.
        row = transition_probs.get(prev_tag, {})
        return np.log([max(row.get(tag, 0.0), min_prob) for tag in tags])

    def log_emissions_of(word):
        return np.log([max(emission_probs.get(tag, {}).get(word, 0.0), min_prob)
                       for tag in tags])

    # log_trans[j, i] = log P(tags[i] | tags[j])
    log_trans = np.vstack([log_transitions_from(tag) for tag in tags])

    viterbi_matrix = np.empty((num_tags, num_words))
    backpointer_matrix = np.zeros((num_tags, num_words), dtype=int)

    # Initialisation: start transition plus emission of the first word.
    viterbi_matrix[:, 0] = log_transitions_from('<START>') + log_emissions_of(words[0])

    # Recursion: for every tag keep the best-scoring predecessor.
    for t in range(1, num_words):
        # candidates[j, i] = score of reaching tags[i] at t coming from tags[j]
        candidates = viterbi_matrix[:, t - 1][:, None] + log_trans
        backpointer_matrix[:, t] = np.argmax(candidates, axis=0)
        viterbi_matrix[:, t] = candidates.max(axis=0) + log_emissions_of(words[t])

    # Best final state, then follow the back-pointers to the first word.
    best_path_pointer = int(np.argmax(viterbi_matrix[:, num_words - 1]))
    best_path = [best_path_pointer]
    for t in range(num_words - 1, 0, -1):
        best_path_pointer = int(backpointer_matrix[best_path_pointer, t])
        best_path.append(best_path_pointer)
    best_path.reverse()

    return [tags[index] for index in best_path]
[ ]: # Dự đoán trên tập kiểm thử

def predict(test_sentences, transition_probs, emission_probs, tag_counts):
    """Tag every sentence with ``viterbi`` and collect the tag sequences.

    Args:
        test_sentences: iterable of sentences, each a sequence of
            ``(word, tag)`` pairs; only the words are used.
        transition_probs, emission_probs, tag_counts: model tables, passed
            through unchanged to ``viterbi``.

    Returns:
        A list with one list of predicted tags per input sentence, in the
        same order. An empty sentence yields an empty list.
    """
    predictions = []
    for sentence in test_sentences:
        words = [word for word, _ in sentence]
        if not words:
            # Nothing to decode; keep the output aligned with the input.
            predictions.append([])
            continue
        predicted_tags = viterbi(words, transition_probs, emission_probs,
                                 tag_counts)
        predictions.append(predicted_tags)
    return predictions
# Tag every sentence of the test set with the trained model.
predictions = predict(test_sentences, transition_probs, emission_probs,
                      tag_counts)
# Tính toán độ chính xác

def calculate_accuracy(predictions, test_sentences):
    """Compute token-level accuracy of predicted tags against gold tags.

    Args:
        predictions: list of predicted tag lists, one per sentence.
        test_sentences: list of gold sentences, each a sequence of
            ``(word, tag)`` pairs, aligned with ``predictions``.

    Returns:
        ``correct / total`` over all compared tokens, or ``0.0`` when there
        is nothing to compare (instead of raising ``ZeroDivisionError``).
        Sentences and tokens are paired with ``zip``, so any surplus on
        either side is not counted.
    """
    correct = 0
    total = 0
    for pred, sent in zip(predictions, test_sentences):
        gold_tags = [tag for _, tag in sent]
        for predicted_tag, gold_tag in zip(pred, gold_tags):
            if predicted_tag == gold_tag:
                correct += 1
            total += 1
    if total == 0:
        return 0.0
    return correct / total
# Token-level accuracy of the predicted tags against the gold test tags.
accuracy = calculate_accuracy(predictions, test_sentences)

# Report the accuracy rounded to two decimals.
print(f"Độ chính xác: {accuracy:.2f}")
Độ chính xác: 0.95
[ ]: import pandas as pd
from collections import defaultdict
def read_data(file_path):
    """Read a tagged corpus with one ``word/tag`` token per line.

    Blank lines separate sentences. The tag is whatever follows the LAST
    ``/`` on the line, so words that themselves contain ``/`` are kept
    intact. Lines without any ``/`` carry no tag and are skipped.

    Args:
        file_path: path of a UTF-8 encoded text file.

    Returns:
        A list of sentences, each a list of ``(word, tag)`` tuples.
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        sentence = []
        for line in file:
            line = line.strip()
            if line:
                # Split on the last '/' of the line.
                word, separator, tag = line.rpartition('/')
                if not separator:
                    # No '/' at all: rfind-based slicing would produce a
                    # truncated word and the whole line as tag.
                    continue
                sentence.append((word, tag))
            else:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
        if sentence:
            # The file may end without a trailing blank line.
            sentences.append(sentence)
    return sentences
# Read the training and test corpora from file.
# NOTE(review): both paths presumably require Google Drive to be mounted at
# /content/drive - confirm before running outside Colab.
train_sentences = read_data('/content/drive/MyDrive/lab3/vi_train.txt')
test_sentences = read_data('/content/drive/MyDrive/lab3/vi_test.txt')

print(f"Số lượng câu trong tập huấn luyện: {len(train_sentences)}")
print(f"Số lượng câu trong tập kiểm thử: {len(test_sentences)}")
Số lượng câu trong tập huấn luyện: 8456

Số lượng câu trong tập kiểm thử: 1051
# In [ ]:
def train_hmm(train_sentences):
    """Estimate HMM transition and emission probabilities by relative frequency.

    NOTE: this redefines the ``train_hmm`` declared earlier in the notebook
    with the same contract; it is kept so this section can run on its own.

    Args:
        train_sentences: iterable of sentences, each a sequence of
            ``(word, tag)`` pairs.

    Returns:
        A tuple ``(transition_probs, emission_probs, tag_counts)``:
          * ``transition_probs[prev_tag][tag]`` - probability of ``tag``
            following ``prev_tag``. ``'<START>'`` is used as the previous tag
            of the first token and ``'<END>'`` as the tag after the last one.
          * ``emission_probs[tag][word]`` - probability of ``word`` given ``tag``.
          * ``tag_counts[tag]`` - number of tokens carrying ``tag``.
    """
    transition_counts = defaultdict(lambda: defaultdict(int))
    emission_counts = defaultdict(lambda: defaultdict(int))
    tag_counts = defaultdict(int)

    for sentence in train_sentences:
        if not sentence:
            # An empty sentence would only add a spurious <START> -> <END>
            # transition, so it is ignored.
            continue
        prev_tag = '<START>'
        for word, tag in sentence:
            transition_counts[prev_tag][tag] += 1
            emission_counts[tag][word] += 1
            tag_counts[tag] += 1
            prev_tag = tag
        # Close the sentence once, after its last token.
        transition_counts[prev_tag]['<END>'] += 1

    # Convert the counts to probabilities by normalising each row.
    transition_probs = defaultdict(dict)
    emission_probs = defaultdict(dict)

    for prev_tag, next_tag_counts in transition_counts.items():
        total = sum(next_tag_counts.values())
        for tag, count in next_tag_counts.items():
            transition_probs[prev_tag][tag] = count / total

    for tag, word_counts in emission_counts.items():
        total = sum(word_counts.values())
        for word, count in word_counts.items():
            emission_probs[tag][word] = count / total

    return transition_probs, emission_probs, tag_counts
# Train the HMM on the training sentences; keeps the transition table, the
# emission table and the per-tag token counts for the decoding step.

transition_probs, emission_probs, tag_counts = train_hmm(train_sentences)
[ ]: import numpy as np
def viterbi(words, transition_probs, emission_probs, tag_counts):
    """Return the most likely tag sequence for ``words`` under the HMM.

    Decoding is done in log space so that long sentences do not underflow to
    zero. Probabilities missing from the tables (unseen word for a tag,
    unseen transition) are floored at a tiny constant instead of being
    treated as exactly zero, so a single unknown word no longer wipes out
    every path.

    Args:
        words: sequence of tokens to tag.
        transition_probs: mapping ``prev_tag -> {tag: probability}``; the
            key ``'<START>'`` holds the initial distribution.
        emission_probs: mapping ``tag -> {word: probability}``.
        tag_counts: mapping whose keys define the candidate tags (and their
            order, which also decides ties: the first tag wins).

    Returns:
        A list with one tag per word; ``[]`` when ``words`` or the tag
        inventory is empty.
    """
    tags = list(tag_counts.keys())
    num_tags = len(tags)
    num_words = len(words)
    if num_words == 0 or num_tags == 0:
        return []

    min_prob = 1e-12  # floor for probabilities absent from the tables

    def log_transitions_from(prev_tag):
        # .get() avoids inserting keys into a defaultdict passed by the caller.
        row = transition_probs.get(prev_tag, {})
        return np.log([max(row.get(tag, 0.0), min_prob) for tag in tags])

    def log_emissions_of(word):
        return np.log([max(emission_probs.get(tag, {}).get(word, 0.0), min_prob)
                       for tag in tags])

    # log_trans[j, i] = log P(tags[i] | tags[j])
    log_trans = np.vstack([log_transitions_from(tag) for tag in tags])

    viterbi_matrix = np.empty((num_tags, num_words))
    backpointer_matrix = np.zeros((num_tags, num_words), dtype=int)

    # Initialisation: start transition plus emission of the first word.
    viterbi_matrix[:, 0] = log_transitions_from('<START>') + log_emissions_of(words[0])

    # Recursion: for every tag keep the best-scoring predecessor.
    for t in range(1, num_words):
        # candidates[j, i] = score of reaching tags[i] at t coming from tags[j]
        candidates = viterbi_matrix[:, t - 1][:, None] + log_trans
        backpointer_matrix[:, t] = np.argmax(candidates, axis=0)
        viterbi_matrix[:, t] = candidates.max(axis=0) + log_emissions_of(words[t])

    # Best final state, then follow the back-pointers to the first word.
    best_path_pointer = int(np.argmax(viterbi_matrix[:, num_words - 1]))
    best_path = [best_path_pointer]
    for t in range(num_words - 1, 0, -1):
        best_path_pointer = int(backpointer_matrix[best_path_pointer, t])
        best_path.append(best_path_pointer)
    best_path.reverse()

    return [tags[index] for index in best_path]
# In [ ]:
def predict(test_sentences, transition_probs, emission_probs, tag_counts):
    """Tag every sentence with ``viterbi`` and pair each word with its tag.

    Args:
        test_sentences: iterable of sentences, each a sequence of
            ``(word, tag)`` pairs; only the words are used.
        transition_probs, emission_probs, tag_counts: model tables, passed
            through unchanged to ``viterbi``.

    Returns:
        A list with one entry per input sentence, each a list of
        ``(word, predicted_tag)`` tuples. An empty sentence yields ``[]``.
    """
    predictions = []
    for sentence in test_sentences:
        words = [word for word, _ in sentence]
        if not words:
            # Nothing to decode; keep the output aligned with the input.
            predictions.append([])
            continue
        predicted_tags = viterbi(words, transition_probs, emission_probs,
                                 tag_counts)
        predictions.append(list(zip(words, predicted_tags)))
    return predictions
# Tag every sentence of the test set with the trained model.
predictions = predict(test_sentences, transition_probs, emission_probs,
                      tag_counts)
# In [ ]:
def calculate_accuracy(predictions, test_sentences):
    """Compute token-level accuracy of predicted tags against gold tags.

    Args:
        predictions: list of sentences, each a list of
            ``(word, predicted_tag)`` pairs.
        test_sentences: list of gold sentences, each a sequence of
            ``(word, tag)`` pairs, aligned with ``predictions``.

    Returns:
        ``correct / total`` over all compared tokens, or ``0.0`` when there
        is nothing to compare (instead of raising ``ZeroDivisionError``).
        Sentences and tokens are paired with ``zip``, so any surplus on
        either side is not counted.
    """
    correct = 0
    total = 0
    for pred, sent in zip(predictions, test_sentences):
        gold_tags = [tag for _, tag in sent]
        for (_, predicted_tag), gold_tag in zip(pred, gold_tags):
            if predicted_tag == gold_tag:
                correct += 1
            total += 1
    if total == 0:
        return 0.0
    return correct / total
# Token-level accuracy of the predicted tags against the gold test tags.
accuracy = calculate_accuracy(predictions, test_sentences)

# Report the accuracy rounded to two decimals.
print(f"Độ chính xác: {accuracy:.2f}")
Độ chính xác: 0.95

lab7

Uploaded by

Document Information

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

lab7

Uploaded by

Copyright:

Available Formats

lab7

June 17, 2024

[2]: import numpy as np

[3]: # Đọc dữ liệu từ file CSV

# Xây dựng từ điển từ và tập các nhãn

# Chuyển đổi từ điển và nhãn thành danh sách để dễ dàng xử lý

print(f"Số lượng từ: {len(words)}")

Số lượng từ: 35178

[ ]: # Chuyển đổi dữ liệu thành dạng danh sách các câu

for index, row in df.iterrows():

# Chia dữ liệu thành tập huấn luyện và kiểm thử

print(f"Số lượng câu trong tập huấn luyện: {len(train_sentences)}")

Số lượng câu trong tập huấn luyện: 51604

[ ]: # Tính toán xác suất chuyển tiếp và phát xạ

for sentence in train_sentences:

# Chuyển đổi counts thành xác suất

for prev_tag, tag_counts_dict in transition_counts.items():

for tag, word_counts_dict in emission_counts.items():

return transition_probs, emission_probs, tag_counts

# Huấn luyện HMM

[ ]: def viterbi(words, transition_probs, emission_probs, tag_counts):

viterbi_matrix[i, t] = max_tr_prob * emission_probs[tag].

# Truy ngược để tìm chuỗi trạng thái tốt nhất

best_tags = [tags[index] for index in best_path]

[ ]: # Dự đoán trên tập kiểm thử

# Tính toán độ chính xác

for pred, sent in zip(predictions, test_sentences):

return correct / total

accuracy = calculate_accuracy(predictions, test_sentences)

Độ chính xác: 0.95

# Đọc dữ liệu từ file

print(f"Số lượng câu trong tập huấn luyện: {len(train_sentences)}")

Số lượng câu trong tập huấn luyện: 8456

for sentence in train_sentences:

# Chuyển đổi counts thành xác suất

for prev_tag, tag_counts_dict in transition_counts.items():

for tag, word_counts_dict in emission_counts.items():

return transition_probs, emission_probs, tag_counts

# Huấn luyện HMM

def viterbi(words, transition_probs, emission_probs, tag_counts):

viterbi_matrix[i, t] = max_tr_prob * emission_probs[tag].

# Truy ngược để tìm chuỗi trạng thái tốt nhất

best_tags = [tags[index] for index in best_path]

[ ]: def predict(test_sentences, transition_probs, emission_probs, tag_counts):

[ ]: def calculate_accuracy(predictions, test_sentences):

for pred, sent in zip(predictions, test_sentences):

return correct / total

accuracy = calculate_accuracy(predictions, test_sentences)

Độ chính xác: 0.95

You might also like