mirror of
https://github.com/caperren/school_archives.git
synced 2025-11-09 21:51:15 +00:00
Added missing classes from final year at OSU
This commit is contained in:
@@ -0,0 +1,89 @@
|
||||
import re
|
||||
from math import log
|
||||
from collections import defaultdict
|
||||
|
||||
# Matches runs of non-alphanumeric characters (underscore included), used to
# strip punctuation when normalizing words.
non_alphanum = re.compile(r'[\W_]+')
|
||||
def flatten(nested):
    """Flatten one level of nesting: [[a, b], [c]] -> [a, b, c]."""
    flat = []
    for sublist in nested:
        flat.extend(sublist)
    return flat
|
||||
|
||||
|
||||
def process_word(word):
    """Normalize a token: lowercase it and strip all non-alphanumeric
    characters (including underscores). May return '' for pure punctuation."""
    lowered = word.lower()
    return re.sub(r'[\W_]+', '', lowered)
|
||||
|
||||
|
||||
def process_document(document):
    """Split one raw review line into (words, sentiment).

    The line format is "<word> <word> ... <label>" where the trailing token
    is the 0/1 sentiment label.

    Returns a tuple:
      [0] list of normalized words (label token excluded),
      [1] int sentiment label.

    Assumes the line is non-empty; an empty line raises IndexError.
    """
    # Split once instead of twice (the original called document.split()
    # separately for the words and for the label).
    tokens = document.split()
    return (
        [process_word(token) for token in tokens[:-1]],
        int(tokens[-1])
    )
|
||||
|
||||
|
||||
def format_document(document, vocab):
    """Render one document as a CSV row of binary word-presence features
    followed by its sentiment label, e.g. "1,0,1,1".

    document: (word_list, label) tuple as produced by process_document.
    vocab: iterable of vocabulary words.

    The vocabulary is sorted here so the feature columns always line up with
    the sorted header row that preprocess() writes. Previously the raw set
    was iterated directly, producing columns in arbitrary set order that did
    not match the header.
    """
    words, label = document
    features = ['1' if word in words else '0' for word in sorted(vocab)]
    return ','.join(features) + ',' + str(label)
|
||||
|
||||
|
||||
def preprocess(infilename, outfilename):
    """Read raw reviews from infilename, write a binary feature CSV to
    outfilename, and return a tuple in the form of (vocab, documents).

    Vocab: a set of words, e.g. { 'great', 'steak', 'gross', ... }
    Documents: a tuple containing the array of words in the document followed
        by the sentiment of the document, represented by a 1 or 0, e.g.
        (['i', 'liked', 'the', 'food'], 1)
    """
    # Read and tokenize the input file; the context manager guarantees the
    # handle is closed even if processing raises.
    with open(infilename, 'r') as infile:
        raw_documents = infile.read().splitlines()

    documents = [process_document(document) for document in raw_documents]
    # Build vocab from the already-processed documents instead of
    # re-normalizing every raw token a second time.
    vocab = {word for words, _ in documents for word in words}
    vocab.discard('')  # pure-punctuation tokens normalize to ''

    # Sort the vocabulary ONCE and use the same ordering for both the header
    # and every row so the CSV columns line up. (Previously rows were built
    # by iterating the unordered set, misaligning them with the sorted
    # header.)
    sorted_vocab = sorted(vocab)
    with open(outfilename, 'w') as outfile:
        formatted_documents = '\n'.join(
            format_document(document, sorted_vocab) for document in documents
        )
        outfile.write(','.join(sorted_vocab) + '\n' + formatted_documents)

    return vocab, documents
|
||||
|
||||
|
||||
def train_naive_bayes(vocab, documents, classes):
    """Train a multinomial naive Bayes classifier with add-one smoothing.

    vocab: set of known words.
    documents: list of (word_list, class_label) tuples.
    classes: iterable of class labels; every class must appear in at least
        one document, otherwise log(0) raises ValueError.

    Returns (logprior, loglikelihood) where logprior[c] is the log prior
    probability of class c and loglikelihood[word][c] is log P(word | c).
    """
    logprior = {}
    loglikelihood = defaultdict(dict)
    total_document_count = len(documents)
    vocab_size = len(vocab)

    for c in classes:
        class_docs = [doc for doc in documents if doc[1] == c]
        logprior[c] = log(len(class_docs) / total_document_count)

        # Count every word occurrence for this class in a single pass. The
        # original re-scanned the concatenated class text with list.count()
        # once per vocab word — O(|vocab| * class size).
        counts = Counter(word for words, _ in class_docs for word in words)
        words_in_class_count = sum(counts[w] for w in vocab)

        # Add-one (Laplace) smoothing: the denominator must include |V| (one
        # pseudo-count per vocabulary word). The original added only 1, so
        # the smoothed distribution did not sum to 1.
        for word in vocab:
            loglikelihood[word][c] = log(
                (counts[word] + 1) / (words_in_class_count + vocab_size)
            )

    return logprior, loglikelihood
|
||||
|
||||
|
||||
def test_naive_bayes(document, logprior, loglikelihood, classes, vocab):
|
||||
# Filter words not in the vocab
|
||||
document = [word for word in document[0] if word in vocab]
|
||||
prob = {}
|
||||
|
||||
for c in classes:
|
||||
prob[c] = logprior[c]
|
||||
for word in document:
|
||||
prob[c] = prob[c] + loglikelihood[word][c]
|
||||
|
||||
return max(prob.keys(), key=(lambda key: prob[key]))
|
||||
Reference in New Issue
Block a user