"""
Train the model
- Note: you don't need to touch this function
since you can implement all of your training strategies in the NBModel class
"""
def train(train_dataset):
    """Build and return an NBModel fitted on *train_dataset*.

    All of the actual training logic lives inside NBModel, so this is
    just a thin constructor wrapper (no need to modify it).
    """
    model = NBModel(train_dataset)
    return model
"""
Inference with the model
- Note: you need to fill the ?? parts with the functions that you implement earlier, e.g., model.spam_cond_prob(word), etc
"""
def inference(model, test_dataset):
    """Classify every (content, is_spam) pair in *test_dataset* with the
    trained naive-Bayes *model*, print the accuracy, and return it.

    Decision rule: predict spam when
        P(S) * prod P(word|S)  >=  P(H) * prod P(word|H)
    where the product runs over the email's words that were seen during
    training (out-of-vocabulary words are ignored).

    Returns the accuracy as a float in [0.0, 1.0] (0.0 for an empty
    test set).
    """
    n_samples = len(test_dataset)
    if n_samples == 0:
        # Guard: the original division would raise ZeroDivisionError here.
        print("Accuracy {} ({} / {})\n".format(0.0, 0, 0))
        return 0.0

    # Priors are properties of the trained model: hoist them out of the loop.
    prior_spam = model.spam_prob()      # P(S)
    prior_ham = 1 - prior_spam          # P(H)

    n_correct = 0
    for content, is_spam in test_dataset:
        words_set = tokenize(content)   # the set of words in the tested email
        # NOTE(review): a long product of probabilities can underflow to 0.0;
        # log-space accumulation would be more robust, but the plain product
        # is kept to preserve the original tie-breaking behavior exactly.
        spam_score = prior_spam
        ham_score = prior_ham
        for word in words_set:
            if model.word_exists(word):     # skip out-of-vocabulary words
                spam_score *= model.spam_cond_prob(word)
                ham_score *= model.ham_cond_prob(word)
        # Ties go to spam (>=), matching the original decision rule.
        is_spam_prediction = spam_score >= ham_score
        if is_spam_prediction == is_spam:
            n_correct += 1

    accuracy = n_correct / n_samples
    print("Accuracy {} ({} / {})\n".format(accuracy, n_correct, n_samples))
    return accuracy
class NBModel:
    """Naive-Bayes spam/ham model.

    *train_dataset* is a sequence of (content, is_spam) pairs where
    is_spam is 1 for spam and 0 for ham. All expensive work
    (tokenization, vocabulary construction) is done once in __init__ so
    that the probability methods are cheap to call repeatedly during
    inference.
    """

    def __init__(self, train_dataset):
        """Store the dataset and precompute tokenizations and vocabulary."""
        self.train_dataset = train_dataset
        # All words seen anywhere in the training data, as produced by
        # tokenize_dataset; kept with its original type for compatibility.
        self.all_words_list = tokenize_dataset(train_dataset)
        # Set copy of the vocabulary: O(1) membership tests in word_exists
        # instead of an O(n) scan per lookup.
        self._vocab = set(self.all_words_list)
        # Tokenize each email exactly once so spam_cond_prob/ham_cond_prob
        # do not re-tokenize the entire dataset on every call.
        self._tokenized = [(tokenize(content), is_spam)
                           for content, is_spam in train_dataset]

    def word_exists(self, word):
        """Task 1: return True iff *word* appeared in the training dataset."""
        return word in self._vocab

    def spam_prob(self):
        """Task 2: return P(S) in [0.0, 1.0] — the fraction of training
        emails labeled spam. Returns 0.0 for an empty dataset (the
        original raised ZeroDivisionError)."""
        if not self.train_dataset:
            return 0.0
        n_spam = sum(1 for _, is_spam in self.train_dataset if is_spam == 1)
        return n_spam / len(self.train_dataset)

    def _cond_prob(self, word, label):
        """Return P(word | S=label): the fraction of training emails with
        the given label whose token set contains *word*.

        Counting word_in_label / n_label directly is equivalent to the
        ratio (word_in_label/total) / (n_label/total) used before, without
        the intermediate divisions. Returns 0 when no email has the label
        (denominator P(S=label) is 0).
        """
        n_label = 0
        n_with_word = 0
        for words, is_spam in self._tokenized:
            if is_spam == label:
                n_label += 1
                if word in words:
                    n_with_word += 1
        if n_label == 0:
            return 0
        return n_with_word / n_label

    def spam_cond_prob(self, word):
        """Task 3: return P(word | S=spam) in [0.0, 1.0]."""
        return self._cond_prob(word, 1)

    def ham_cond_prob(self, word):
        """Task 4: return P(word | S=ham) in [0.0, 1.0]."""
        return self._cond_prob(word, 0)

    # You can add other member functions and variables if needed.