import re
from collections import defaultdict
from math import log

# Matches runs of non-alphanumeric characters (underscores count too).
non_alphanum = re.compile(r'[\W_]+')


def flatten(lists):
    """ Flattens a list of lists into a single flat list. """
    return [item for sublist in lists for item in sublist]


def process_word(word):
    """ Lowercases a word and strips all non-alphanumeric characters. """
    return non_alphanum.sub('', word.lower())


def process_document(document):
    """ Returns a tuple. The first element is the review split into an array
    of words; the second is an int indicating whether the review was positive. """
    words = document.split()
    return [process_word(word) for word in words[:-1]], int(words[-1])


def format_document(document, vocab):
    """ Encodes a document as comma-separated 0/1 word-presence flags, one per
    vocab word in the order given, followed by the document's label. """
    return (
        ','.join('1' if word in document[0] else '0' for word in vocab)
        + ',' + str(document[1])
    )
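

# A quick illustration of the row format (hypothetical three-word vocab):
#   format_document((['i', 'liked', 'the', 'food'], 1), ['food', 'great', 'liked'])
#   -> '1,0,1,1'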


def preprocess(infilename, outfilename):
    """ Returns a tuple in the form of (vocab, documents).
    Vocab: a set of words, e.g. { 'great', 'steak', 'gross', ... }
    Documents: a tuple containing the array of words in the document followed
    by the sentiment of the document, represented by a 1 or 0, e.g.
    (['i', 'liked', 'the', 'food'], 1) """

    # Process the input file.
    with open(infilename, 'r') as infile:
        raw_documents = infile.read().splitlines()
    documents = [process_document(document) for document in raw_documents]
    vocab = {process_word(word) for document in raw_documents for word in document.split()[:-1]}
    vocab.discard('')

    # Format the text for the outfile and write it. The rows must use the same
    # word order as the sorted header, so sort the vocab once and reuse it.
    sorted_vocab = sorted(vocab)
    formatted_vocab = ','.join(sorted_vocab)
    formatted_documents = '\n'.join(format_document(document, sorted_vocab) for document in documents)
    with open(outfilename, 'w') as outfile:
        outfile.write(formatted_vocab + '\n' + formatted_documents)

    return vocab, documents
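

# Illustrative sketch of the resulting outfile (hypothetical tiny vocab), with
# the header and every row sharing the same sorted column order:
#   food,great,liked
#   1,0,1,1
#   0,1,0,0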


def train_naive_bayes(vocab, documents, classes):
    logprior = {}
    loglikelihood = defaultdict(dict)
    big_doc = {}

    def get_logprior(documents, c):
        # Log of the fraction of training documents labeled with class c.
        class_document_count = len([doc for doc in documents if doc[1] == c])
        total_document_count = len(documents)
        return log(class_document_count / total_document_count)

    def get_bigdoc(documents, c):
        # Concatenate the words of every document labeled with class c.
        return flatten([doc[0] for doc in documents if doc[1] == c])

    for c in classes:
        logprior[c] = get_logprior(documents, c)
        big_doc[c] = get_bigdoc(documents, c)
        words_in_class_count = sum(big_doc[c].count(w) for w in vocab)

        for word in vocab:
            word_count = big_doc[c].count(word)
            # Add-one (Laplace) smoothing: one pseudo-count per vocab word,
            # so the denominator grows by len(vocab), not 1.
            loglikelihood[word][c] = log((word_count + 1) / (words_in_class_count + len(vocab)))

    return logprior, loglikelihood
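

# Sanity check on the smoothed likelihood (illustrative numbers only): if
# 'great' appears 3 times among 40 tokens of class-1 documents and the vocab
# holds 10 words, then loglikelihood['great'][1] == log((3 + 1) / (40 + 10)).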


def test_naive_bayes(document, logprior, loglikelihood, classes, vocab):
    # Filter out words not in the vocab.
    document = [word for word in document[0] if word in vocab]
    prob = {}

    # Score each class as its log prior plus the summed log likelihoods of
    # the document's words, then pick the highest-scoring class.
    for c in classes:
        prob[c] = logprior[c]
        for word in document:
            prob[c] += loglikelihood[word][c]

    return max(prob, key=prob.get)
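

# Minimal end-to-end sketch. The file names and the sample review line are
# hypothetical; the input is assumed to be one whitespace-tokenized review
# per line with a trailing 0/1 sentiment label.
if __name__ == '__main__':
    classes = [0, 1]
    vocab, documents = preprocess('reviews.txt', 'reviews_preprocessed.csv')
    logprior, loglikelihood = train_naive_bayes(vocab, documents, classes)
    sample = process_document('the steak was great 1')
    print('predicted class:', test_naive_bayes(sample, logprior, loglikelihood, classes, vocab))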