Multinomial Naive Bayes for Text Classification

Introduction to Information Retrieval - Manning

url: https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html

Imports

import pandas as pd #to read the input CSV files
import numpy as np #for the count and probability arrays
#to build the vocabulary and count tokens
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from math import log # to calculate the log posterior probability

Constants

It is good to set max_tokens higher than vocab_size = len(vocabulary), so that all the words in the vocabulary can be used for prediction. Otherwise, only the first max_tokens words of the vocabulary will be used.

max_tokens = 1000
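
A minimal sketch (with hypothetical toy sentences) of how max_tokens caps the vocabulary: a vectorizer adapted to text with more distinct words than max_tokens silently keeps only the most frequent ones, so counts for the rest are lost.

#hypothetical demo: 8 distinct words, but only 4 vocabulary slots (including the OOV slot)
demo_vec = TextVectorization(max_tokens=4, output_mode="count")
demo_vec.adapt(np.array(["the cat sat", "the dog ran", "a bird flew"]))
print(demo_vec.get_vocabulary()) #only the most frequent words survive, e.g. ['[UNK]', 'the', ...]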

Environment

Run only one of the two dataset configurations below; if both are run, the later assignments (AG News) win.

Naive Bayes dataset - Introduction to Information Retrieval - Manning

url: https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html

f_train = '../input/naive_bayes_demo_data_china/train.csv'
f_test = '../input/naive_bayes_demo_data_china/test.csv'
f_classes = '../input/naive_bayes_demo_data_china/classes.csv'
cols_train = ['class', 'title', 'body']
cols_test = ['class', 'title', 'body']
cols_classes = ['class_name']

AG News

url: https://github.com/mhjabreel/CharCnn_Keras/tree/master/data/ag_news_csv

f_train = '../input/ag_news_csv/train.csv'
f_test = '../input/ag_news_csv/test.csv'
f_classes = '../input/ag_news_csv/classes.txt'
cols_train = ['class', 'title', 'body']
cols_test = ['class', 'title', 'body']
cols_classes = ['class_name']

Data

df_train = pd.read_csv(f_train,
                       header=None)
df_train.columns = cols_train
df_test = pd.read_csv(f_test,
                      header=None)
df_test.columns = cols_test
df_classes = pd.read_csv(f_classes,
                         header=None)
df_classes.columns = cols_classes

Sanity check that the data loaded correctly

print(df_train.head(2))
print(df_test.head(2))
print(df_classes.head(2))

Helper Functions

"""
    return (vectorizer, list) the vocabulary of words that occur in the documents
    
    arguments:
    vec: vectorizer (of type tensorflow.keras.layers.experimental.preprocessing.TextVectorization)
    df_docs: dataframe with documents
    df_docs_colname: column name of the column having the documents
"""
def extract_vocabulary(vec, df_docs, doc_colname='body'):
    #NOTE: adapt() may emit a VisibleDeprecationWarning about creating an ndarray
    #from ragged nested sequences; see https://stackoverflow.com/questions/62679020/error-while-trying-to-save-npy-numpy-file
    vec.adapt(df_docs[doc_colname].values)
    return vec, vec.get_vocabulary()



"""
    return (number) the number of rows in the dataframe, each row corresponding to a document
    
    arguments:
    df_docs: dataframe with documents (can have columns in addition to the document)
"""
def count_docs(df_docs):
    return len(df_docs)



"""
    return (number) the number of rows in the dataframe belonging to class 'c'
    
    arguments:
    df_docs: dataframe with documents (should have a 'class' column in addition to the document)
    c: the class for which the document count is sought
    class_colname: column name of the column having class name/id
"""
def count_docs_in_class(df_docs, c, class_colname='class'):
    return count_docs(df_docs[df_docs[class_colname] == c])



"""
    return (string) the concatenated text of all documents of class c
    
    arguments:
    df_docs: dataframe with documents (should have a 'class' column in addition to the document)
    c: the class for which the documents are sought
    doc_colname: column name of the column having the documents
    class_colname: column name of the column having class name/id
"""
def concatenate_text_of_all_docs_in_class(df_docs, c, doc_colname='body', class_colname='class'):
    #for df[class == c] get the 'body' column, convert its type to list,
    # and concatenate all lists, separated by space.
    return ' '.join(df_docs[df_docs[class_colname] == c][doc_colname].to_list())
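
As a quick illustration with a hypothetical two-class dataframe, the helper just space-joins the bodies of the matching rows:

#hypothetical demo dataframe
df_demo = pd.DataFrame({'class': [1, 2, 1],
                        'body': ['Chinese Beijing', 'Tokyo Japan', 'Chinese Shanghai']})
print(concatenate_text_of_all_docs_in_class(df_demo, 1)) #-> 'Chinese Beijing Chinese Shanghai'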



'''
    return (numpy array) an array of token counts for the text,
    each element holding the count of the token at that index in the vocabulary.
    
    arguments:
    vec: output_mode="count" based vectorizer
        that was 'adapt'ed to the documents to learn the vocabulary.
    text: concatenated documents belonging to a certain class
'''
def count_tokens_of_all_terms(vec, text):
    counts = vec([[text]]) #a batch with a single document; output shape: (1, max_tokens)
    return counts[0].numpy() #the count vector of that single document
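
A small hedged example (toy data) of what the count vector looks like: there is one slot per vocabulary entry (padded up to max_tokens), with slot 0 reserved for out-of-vocabulary tokens.

#hypothetical demo: adapt on one tiny 'document', then count a new text
demo_vec = TextVectorization(max_tokens=10, output_mode="count")
demo_vec, demo_vocab = extract_vocabulary(
    demo_vec, pd.DataFrame({'body': ['Chinese Beijing Chinese']}), 'body')
print(demo_vocab) #learned vocabulary (whether the OOV token is listed varies by TF version)
print(count_tokens_of_all_terms(demo_vec, 'Chinese Chinese Tokyo'))
#expect a count of 2 for 'chinese' and 1 in the OOV slot for the unseen 'tokyo'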

Model

Training Algorithm

'''
    return
            vectorizer: the adapted vectorizer,
            
            prior: 1-d array of dim num_classes
            (prior probability of a document belonging to each class),
            
            cond_prob: 2-d array of dim vocab_size x num_classes
            (conditional probability of each token, given the class)
            
    arguments:
    df_classes: dataframe with class names
    df_docs: dataframe with documents (should have a 'class' column in addition to the document 'body' column;
                provide suitable column names otherwise, using the doc_colname and class_colname arguments)
'''
def train_multinomial_nb(df_classes, df_docs,
                         doc_colname='body', class_colname='class'):
    
    #extract vocabulary
    
    #create an instance of TextVectorization
    vectorizer = TextVectorization(max_tokens=max_tokens, output_mode="count")
    vectorizer, vocabulary = extract_vocabulary(vectorizer, df_docs, doc_colname)
    
    #data structures for prior and conditional probabilities
    
    num_classes = len(df_classes) #number of classes of documents
    prior = np.zeros((num_classes), dtype=float) #prior probability of document belonging to a class
    vocab_size = len(vocabulary)
    cond_prob = np.zeros((vocab_size, num_classes), dtype=float)

    #total number of documents, to calculate prior probability of document belonging to a class
    N = count_docs(df_docs)
    
    #for each class...
    for c in range(num_classes):
        cls = c+1 #c is 0-based index, but class is 1-based index

        #calculate prior probability of document belonging to class cls
        N_c = count_docs_in_class(df_docs, cls, class_colname) #number of documents of class cls
        prior[c] = N_c / N # (num_docs of class cls / total num_docs)

        #calculate likelihood: conditional probability of each token, given this class

        #1. concatenate documents of this class
        text_c = concatenate_text_of_all_docs_in_class(df_docs, cls, doc_colname, class_colname)
        #2. in this concatenated text, get counts for each token
        T_ct_all_t = count_tokens_of_all_terms(vectorizer, text_c)
        #3. calculate denominator of conditional probability: sigma(T_ct) + |V|
        den = T_ct_all_t.sum() + vocab_size
        #4. set the likelihood: conditional probability of each token, given this class
        for t in range(vocab_size):
            t_idx = t+1 #the count vector has an OOV slot at index 0, so shift the vocabulary index by 1
            cond_prob[t][c] = (T_ct_all_t[t_idx] + 1)/den #Laplace (add-one) smoothing
            
    return vectorizer, prior, cond_prob
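
To make the smoothing concrete, here are the numbers from the worked example in the IR book chapter linked above (the toy 'china' training set: three docs of class China totalling 8 tokens, one doc of class not-China, and a vocabulary of 6 terms), computed by hand; training on that dataset should reproduce them in prior and cond_prob.

#worked example from the IR book: 3 of the 4 training docs belong to class China
prior_china = 3 / 4
#class China has 8 tokens in total, the vocabulary has 6 distinct terms,
#and 'Chinese' occurs 5 times in class China
p_chinese_given_china = (5 + 1) / (8 + 6) #= 3/7, thanks to add-one smoothing
#'Tokyo' never occurs in class China, but smoothing keeps its probability non-zero
p_tokyo_given_china = (0 + 1) / (8 + 6) #= 1/14
print(prior_china, p_chinese_given_china, p_tokyo_given_china)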

Prediction Algorithm

'''
    return (integer) the (0-based) index of the class to which the document belongs
    
    arguments:
    df_classes: dataframe with class names
    vectorizer: the adapted vectorizer returned by train_multinomial_nb
    prior: 1-d array of dim num_classes (prior probability of a document belonging to each class)
    cond_prob: 2-d array of dim vocab_size x num_classes (conditional probability of each token, given the class)
    text: document whose class is to be predicted
'''
def apply_multinomial_nb(df_classes, vectorizer, prior, cond_prob, text):
    #count of each token
    W = count_tokens_of_all_terms(vectorizer, text)

    #vocabulary
    vocabulary = vectorizer.get_vocabulary()
    vocab_size = len(vocabulary)

    num_classes = len(df_classes) #number of classes of documents
    score = np.zeros((num_classes), dtype=float)

    #for each class, calculate the posterior probability

    #for each class...
    for c in range(num_classes):

        #for this class, add the log-prior probability to the score
        score[c] += log(prior[c], 10) #log to the base 10

        #for each term, add the log-likelihood to the score

        for t in range(vocab_size):
            t_idx = t+1 #the count vector has an OOV slot at index 0, so shift the vocabulary index by 1
            T_ct = int(W[t_idx]) #number of times this token appears in text (cast to int: the count vector is a float array)

            #for a term appearing multiple times, repeatedly add the log-likelihood to the score
            for j in range(T_ct):
                score[c] += log(cond_prob[t][c], 10) #log to the base 10
                
    return score.argmax()

Training

#train the model
vectorizer, prior, cond_prob = train_multinomial_nb(df_classes, df_train)
print('vocabulary length is:', len(vectorizer.get_vocabulary()))
print('vocabulary is:', vectorizer.get_vocabulary()[:10])
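
As a single-document sanity check (assuming the 'china' demo dataset from the IR book was the one loaded above), the book's test sentence should be assigned to class China, i.e. class index 0:

#the test document from the IR book's worked example
demo_text = 'Chinese Chinese Chinese Tokyo Japan'
demo_pred = apply_multinomial_nb(df_classes, vectorizer, prior, cond_prob, demo_text)
print('predicted class:', df_classes['class_name'][demo_pred]) #expect the China class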

Prediction

#classify the test documents and measure accuracy
N = min(len(df_test), 1000) #cap the number of test documents, to keep prediction time reasonable
count_correct, count_incorrect = 0, 0
for i in range(N):
    text = df_test['body'][i] #input document for classification
    pred_cls = apply_multinomial_nb(df_classes, vectorizer, prior, cond_prob, text)
    actual_cls = df_test['class'][i] - 1 #df_test['class'] is 1-based, but df_classes and pred_cls are 0-based
    #print('(predicted, actual):', df_classes['class_name'][pred_cls], df_classes['class_name'][actual_cls])
    if pred_cls == actual_cls:
        count_correct += 1
    else:
        count_incorrect += 1
print('Correct: ', count_correct, 'Incorrect: ', count_incorrect)
print('Percentage of correct predictions: ', (count_correct * 100)/(count_correct + count_incorrect))