Home · About Me · Blog · Guest
© 2025 Sejin Cha. All rights reserved.
Built with Next.js, deployed on Vercel

 
""" Train the model - Note: you don't need to touch this function since you can implement all of your training strategies in the NBModel class """ def train(train_dataset): return NBModel(train_dataset) """ Inference with the model - Note: you need to fill the ?? parts with the functions that you implement earlier, e.g., model.spam_cond_prob(word), etc """ def inference(model, test_dataset): n_correct = 0 for content, is_spam in test_dataset: words_set = tokenize(content) # The set of words in an tested email (i.e., inference) ######################################################## spam_prob = model.spam_prob() # You need to modify this line ham_prob = 1-model.spam_prob() # You need to modify this line is_spam_prob = spam_prob # initial value is_ham_prob = ham_prob for word in words_set: if model.word_exists(word): # Implement something here # Hint: # is_spam_prob *= ?? # is_ham_prob *= ?? is_spam_prob *= model.spam_cond_prob(word) is_ham_prob *= model.ham_cond_prob(word) pass spam_prob = is_spam_prob ham_prob = is_ham_prob ######################################################## # If the probability of spam is higher than that of ham, then we predict it as a spam is_spam_prediction = spam_prob >= ham_prob if is_spam_prediction == is_spam: n_correct += 1 n_samples = len(test_dataset) accuracy = n_correct / n_samples print("Accuracy {} ({} / {})\n".format(accuracy, n_correct, n_samples))
class NBModel:
    """Naive Bayes spam model estimated from (content, is_spam) training pairs.

    All counting is done once in __init__ so that the per-word probability
    queries issued during inference are O(#spam emails) / O(#ham emails)
    membership tests instead of re-tokenizing the whole dataset each call.
    """

    def __init__(self, train_dataset):
        self.train_dataset = train_dataset
        # All tokens seen in training (kept for backward compatibility).
        self.all_words_list = tokenize_dataset(train_dataset)
        # Set copy: O(1) membership in word_exists() instead of an O(n) list scan.
        self._vocab = set(self.all_words_list)
        # Tokenize every email exactly once, partitioned by label, so the
        # conditional-probability methods never re-tokenize the dataset.
        self._spam_token_sets = []
        self._ham_token_sets = []
        for content, is_spam in train_dataset:
            words = tokenize(content)
            if is_spam == 1:
                self._spam_token_sets.append(words)
            else:
                self._ham_token_sets.append(words)

    """ Task 1. return if a word was presented in the training dataset """
    def word_exists(self, word):
        # True iff the word appeared anywhere in the training dataset.
        return word in self._vocab

    """ Task 2. P(S) Return the probability (0.0~1.0) that an email is a spam """
    def spam_prob(self):
        total_rows = len(self.train_dataset)
        if total_rows == 0:
            return 0.0  # guard: empty dataset would raise ZeroDivisionError
        return len(self._spam_token_sets) / total_rows

    """ Task 3. P(word_i|S=spam) Return the conditional probability (0.0~1.0) that a spam email has the given word """
    def spam_cond_prob(self, word):
        # P(word | spam) = P(word AND spam) / P(spam), and since both
        # probabilities share the same denominator (total rows) this reduces
        # exactly to: (#spam emails containing word) / (#spam emails).
        spam_rows = len(self._spam_token_sets)
        if spam_rows == 0:
            return 0  # no spam examples -> conditional probability undefined
        word_in_spam_rows = sum(1 for words in self._spam_token_sets if word in words)
        return word_in_spam_rows / spam_rows

    """ Task 4. P(word_i|S=ham) Return the conditional probability (0.0~1.0) that a ham email has the given word """
    def ham_cond_prob(self, word):
        # Same simplification as spam_cond_prob, restricted to ham emails.
        ham_rows = len(self._ham_token_sets)
        if ham_rows == 0:
            return 0  # no ham examples -> conditional probability undefined
        word_in_ham_rows = sum(1 for words in self._ham_token_sets if word in words)
        return word_in_ham_rows / ham_rows

    """ You can add other member functions and variables if needed """