"""
Train the model
- Note: you don't need to touch this function
since you can implement all of your training strategies in the NBModel class
"""
def train(train_dataset):
    """Build and return an NBModel fitted on *train_dataset*.

    All of the actual training logic lives inside NBModel, so this is
    just a thin constructor wrapper (no need to modify it).
    """
    model = NBModel(train_dataset)
    return model
"""
Inference with the model
- Note: you need to fill the ?? parts with the functions that you implement earlier, e.g., model.spam_cond_prob(word), etc
"""
def inference(model, test_dataset):
    """Classify every (content, is_spam) pair in *test_dataset* with the
    trained naive-Bayes *model*, print the accuracy, and return it.

    Decision rule: predict spam when
        P(S) * prod P(word|S)  >=  P(H) * prod P(word|H)
    where the product runs over the email's words that were seen during
    training (out-of-vocabulary words are ignored).

    Returns the accuracy as a float in [0.0, 1.0] (0.0 for an empty
    test set).
    """
    n_samples = len(test_dataset)
    if n_samples == 0:
        # Guard: the original division would raise ZeroDivisionError here.
        print("Accuracy {} ({} / {})\n".format(0.0, 0, 0))
        return 0.0

    # Priors are properties of the trained model: hoist them out of the loop.
    prior_spam = model.spam_prob()      # P(S)
    prior_ham = 1 - prior_spam          # P(H)

    n_correct = 0
    for content, is_spam in test_dataset:
        words_set = tokenize(content)   # the set of words in the tested email
        # NOTE(review): a long product of probabilities can underflow to 0.0;
        # log-space accumulation would be more robust, but the plain product
        # is kept to preserve the original tie-breaking behavior exactly.
        spam_score = prior_spam
        ham_score = prior_ham
        for word in words_set:
            if model.word_exists(word):     # skip out-of-vocabulary words
                spam_score *= model.spam_cond_prob(word)
                ham_score *= model.ham_cond_prob(word)
        # Ties go to spam (>=), matching the original decision rule.
        is_spam_prediction = spam_score >= ham_score
        if is_spam_prediction == is_spam:
            n_correct += 1

    accuracy = n_correct / n_samples
    print("Accuracy {} ({} / {})\n".format(accuracy, n_correct, n_samples))
    return accuracy
class NBModel:
    """Naive-Bayes spam/ham model.

    *train_dataset* is a sequence of (content, is_spam) pairs where
    is_spam is 1 for spam and 0 for ham. All expensive work
    (tokenization, vocabulary construction) is done once in __init__ so
    that the probability methods are cheap to call repeatedly during
    inference.
    """

    def __init__(self, train_dataset):
        """Store the dataset and precompute tokenizations and vocabulary."""
        self.train_dataset = train_dataset
        # All words seen anywhere in the training data, as produced by
        # tokenize_dataset; kept with its original type for compatibility.
        self.all_words_list = tokenize_dataset(train_dataset)
        # Set copy of the vocabulary: O(1) membership tests in word_exists
        # instead of an O(n) scan per lookup.
        self._vocab = set(self.all_words_list)
        # Tokenize each email exactly once so spam_cond_prob/ham_cond_prob
        # do not re-tokenize the entire dataset on every call.
        self._tokenized = [(tokenize(content), is_spam)
                           for content, is_spam in train_dataset]

    def word_exists(self, word):
        """Task 1: return True iff *word* appeared in the training dataset."""
        return word in self._vocab

    def spam_prob(self):
        """Task 2: return P(S) in [0.0, 1.0] — the fraction of training
        emails labeled spam. Returns 0.0 for an empty dataset (the
        original raised ZeroDivisionError)."""
        if not self.train_dataset:
            return 0.0
        n_spam = sum(1 for _, is_spam in self.train_dataset if is_spam == 1)
        return n_spam / len(self.train_dataset)

    def _cond_prob(self, word, label):
        """Return P(word | S=label): the fraction of training emails with
        the given label whose token set contains *word*.

        Counting word_in_label / n_label directly is equivalent to the
        ratio (word_in_label/total) / (n_label/total) used before, without
        the intermediate divisions. Returns 0 when no email has the label
        (denominator P(S=label) is 0).
        """
        n_label = 0
        n_with_word = 0
        for words, is_spam in self._tokenized:
            if is_spam == label:
                n_label += 1
                if word in words:
                    n_with_word += 1
        if n_label == 0:
            return 0
        return n_with_word / n_label

    def spam_cond_prob(self, word):
        """Task 3: return P(word | S=spam) in [0.0, 1.0]."""
        return self._cond_prob(word, 1)

    def ham_cond_prob(self, word):
        """Task 4: return P(word | S=ham) in [0.0, 1.0]."""
        return self._cond_prob(word, 0)

    # You can add other member functions and variables if needed.