import re
from collections import Counter, defaultdict
from math import log

non_alphanum = re.compile(r'[\W_]+')
flatten = lambda l: [item for sublist in l for item in sublist]


def process_word(word):
    """Lowercases a word and strips all non-alphanumeric characters."""
    return non_alphanum.sub('', word.lower())


def process_document(document):
    """
    Returns a tuple. The first element is the review split into an array
    of words. The second element is an int indicating whether the review
    was positive.
    """
    words = document.split()
    return ([process_word(word) for word in words[:-1]], int(words[-1]))


def format_document(document, vocab):
    """Formats a document as a row of 0/1 word-presence features followed
    by its sentiment label. `vocab` must be ordered consistently with the
    header row written by preprocess()."""
    return (
        ','.join(['1' if word in document[0] else '0' for word in vocab])
        + ',' + str(document[1])
    )


def preprocess(infilename, outfilename):
    """
    Returns a tuple in the form of (vocab, documents).

    Vocab: a set of words, e.g. { 'great', 'steak', 'gross', ... }
    Documents: a list of tuples, each containing the array of words in a
    document followed by the sentiment of the document, represented by a
    1 or 0, e.g. (['i', 'liked', 'the', 'food'], 1)
    """
    # Process input file
    with open(infilename, 'r') as infile:
        raw_documents = infile.read().splitlines()
    documents = [process_document(document) for document in raw_documents]
    vocab = {process_word(word)
             for document in raw_documents
             for word in document.split()[:-1]}
    vocab.discard('')

    # Format text for outfile and write. Sort the vocab once so the header
    # row and the per-document feature columns share the same order.
    sorted_vocab = sorted(vocab)
    formatted_vocab = ','.join(sorted_vocab)
    formatted_documents = '\n'.join(
        format_document(document, sorted_vocab) for document in documents)
    with open(outfilename, 'w') as outfile:
        outfile.write(formatted_vocab + '\n' + formatted_documents)

    return vocab, documents


def train_naive_bayes(vocab, documents, classes):
    logprior = {}
    loglikelihood = defaultdict(dict)
    big_doc = {}

    def get_logprior(documents, c):
        """Log of the fraction of training documents labeled with class c."""
        class_document_count = len([doc for doc in documents if doc[1] == c])
        total_document_count = len(documents)
        return log(class_document_count / total_document_count)

    def get_bigdoc(documents, c):
        """Concatenates the words of every document labeled with class c."""
        return flatten([doc[0] for doc in documents if doc[1] == c])

    for c in classes:
        logprior[c] = get_logprior(documents, c)
        big_doc[c] = get_bigdoc(documents, c)
        # Count every word once up front instead of calling list.count()
        # per vocab word, which would rescan big_doc[c] on each lookup.
        word_counts = Counter(big_doc[c])
        words_in_class_count = sum(word_counts[w] for w in vocab)
        for word in vocab:
            # Laplace (add-one) smoothing: the denominator adds len(vocab),
            # one for each word's extra count, so the smoothed likelihoods
            # of all vocab words in a class sum to 1.
            loglikelihood[word][c] = log(
                (word_counts[word] + 1) / (words_in_class_count + len(vocab)))

    return logprior, loglikelihood


def test_naive_bayes(document, logprior, loglikelihood, classes, vocab):
    # Filter out words not in the vocab
    words = [word for word in document[0] if word in vocab]

    prob = {}
    for c in classes:
        prob[c] = logprior[c]
        for word in words:
            prob[c] += loglikelihood[word][c]

    # Return the class with the highest log-probability
    return max(prob, key=prob.get)
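
# A minimal usage sketch, not part of the original script: it assumes a
# hypothetical input file 'reviews.txt' where each line is a review followed
# by a trailing 0/1 sentiment label, e.g. "Loved the steak 1", and a
# hypothetical output path 'preprocessed.txt' for the feature matrix.
if __name__ == '__main__':
    vocab, documents = preprocess('reviews.txt', 'preprocessed.txt')
    classes = [0, 1]
    logprior, loglikelihood = train_naive_bayes(vocab, documents, classes)

    # process_document() expects a trailing label, so append a dummy 0 when
    # classifying a new, unlabeled review.
    review = process_document('the food was gross 0')
    prediction = test_naive_bayes(review, logprior, loglikelihood,
                                  classes, vocab)
    print('predicted sentiment:', prediction)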