Added missing classes from final year at OSU

2019-06-17 14:04:15 -07:00
parent 8fa1ffb1b0
commit c717a0316f
166 changed files with 653934 additions and 308 deletions


@@ -0,0 +1,23 @@
# assignment3 - Sentiment Analysis with Naive Bayes
- Rutger Farry & Simon Wu
- CS331
- Dr. Rebecca Hutchinson
- 8 June 2017
## Running the code
The program looks for two input files, named `trainingSet.txt` and `testSet.txt`, and writes its results to `output.txt`. If these files don't exist, the program will crash. Each line of an input file is a review followed by a sentiment label: `1` for positive, `0` for negative. The program is built in Python 3, so running it should be simple on most computers:
```bash
python3 main.py
```
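For illustration, a couple of made-up lines in that format (not from the actual dataset):
```
the food was great 1
the service was slow and rude 0
```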
## About
### Construction
The program is split into a library and an executable. All the computation and processing happens in `lib.py`, while the executable, `main.py`, simply calls the correct functions and provides a CLI.
The code has been tested for compatibility down to Python 3.3.2. It may work with older versions, but they are not supported. :warning: The script will not work on Python 2.
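If you'd rather drive the library directly, here is a minimal sketch (it assumes `trainingSet.txt` is in the working directory; the sample review is made up):
```python
from lib import preprocess, train_naive_bayes, test_naive_bayes

classes = [0, 1]
vocab, documents = preprocess('trainingSet.txt', 'preprocessed_train.txt')
prior, likelihood = train_naive_bayes(vocab, documents, classes)

# Classify a single (words, label) tuple; the label is ignored for prediction.
prediction = test_naive_bayes((['great', 'food'], 1), prior, likelihood, classes, vocab)
print(prediction)  # 1 = positive, 0 = negative
```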
### Accuracy
:chart: In our tests, the sentiment analyzer achieved **79.7% accuracy**. Some ideas for improving accuracy are:
1. Increasing our training data
2. Spell-correcting misspelled words
3. Trimming excess characters, so that, for example, `soooooooo good` is treated the same as `soo good` (see the sketch below).
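For the third idea, one way the trimming could work (a hypothetical `trim_runs` helper, not part of the current code):
```python
import re

def trim_runs(word):
    # Collapse any character repeated 3+ times down to 2 repeats,
    # so 'soooooooo' normalizes to 'soo'.
    return re.sub(r'(.)\1{2,}', r'\1\1', word)

print(trim_runs('soooooooo'))  # -> 'soo'
```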


@@ -0,0 +1,89 @@
import re
from math import log
from collections import defaultdict, Counter

# Matches runs of non-alphanumeric characters (including underscores).
non_alphanum = re.compile(r'[\W_]+')


def flatten(l):
    """Flatten a list of lists into a single list."""
    return [item for sublist in l for item in sublist]


def process_word(word):
    """Lowercase a word and strip all non-alphanumeric characters."""
    return non_alphanum.sub('', word.lower())


def process_document(document):
    """Returns a tuple. The first element is the review split into an array of words;
    the second element is an int indicating whether the review was positive."""
    words = document.split()
    return ([process_word(word) for word in words[:-1]], int(words[-1]))


def format_document(document, vocab):
    """Encode a document as comma-separated 0/1 word-presence features, followed by its label."""
    return (
        ','.join(['1' if word in document[0] else '0' for word in vocab])
        + ',' + str(document[1])
    )

def preprocess(infilename, outfilename):
    """Returns a tuple in the form of (vocab, documents).
    Vocab: a set of words, e.g. { 'great', 'steak', 'gross', ... }
    Documents: a tuple containing the array of words in the document followed
    by the sentiment of the document, represented by a 1 or 0, e.g.
    (['i', 'liked', 'the', 'food'], 1)"""
    # Read and process the input file.
    with open(infilename, 'r') as infile:
        raw_documents = infile.read().splitlines()
    documents = [process_document(document) for document in raw_documents]
    vocab = {process_word(word) for document in raw_documents for word in document.split()[:-1]}
    vocab.discard('')

    # Format the text for the outfile and write it. The rows must use the same
    # sorted word order as the header, so pass the sorted vocab to format_document.
    sorted_vocab = sorted(vocab)
    formatted_vocab = ','.join(sorted_vocab)
    formatted_documents = '\n'.join([format_document(document, sorted_vocab) for document in documents])
    with open(outfilename, 'w') as outfile:
        outfile.write(formatted_vocab + '\n' + formatted_documents)
    return vocab, documents

def train_naive_bayes(vocab, documents, classes):
    logprior = {}
    loglikelihood = defaultdict(dict)
    big_doc = {}

    def get_logprior(documents, c):
        class_document_count = len([doc for doc in documents if doc[1] == c])
        total_document_count = len(documents)
        return log(class_document_count / total_document_count)

    def get_bigdoc(documents, c):
        return flatten([doc[0] for doc in documents if doc[1] == c])

    for c in classes:
        logprior[c] = get_logprior(documents, c)
        big_doc[c] = get_bigdoc(documents, c)
        # Tally every word once instead of calling list.count() per vocab word.
        word_counts = Counter(big_doc[c])
        words_in_class_count = sum(word_counts[w] for w in vocab)
        for word in vocab:
            # Add-one (Laplace) smoothing: the denominator includes the
            # vocabulary size so the smoothed probabilities sum to 1.
            loglikelihood[word][c] = log((word_counts[word] + 1) / (words_in_class_count + len(vocab)))
    return logprior, loglikelihood

def test_naive_bayes(document, logprior, loglikelihood, classes, vocab):
    # Ignore words that never appeared in the training vocabulary.
    words = [word for word in document[0] if word in vocab]
    prob = {}
    for c in classes:
        prob[c] = logprior[c]
        for word in words:
            prob[c] += loglikelihood[word][c]
    # Return the class with the highest log-probability.
    return max(prob, key=prob.get)


@@ -0,0 +1,54 @@
import sys
from lib import preprocess, train_naive_bayes, test_naive_bayes


def print_test_results(results, test_docs):
    misclassified_docs = []
    for result, doc in zip(results, test_docs):
        if result != doc[1]:
            misclassified_docs.append(doc + (result,))
    for doc in misclassified_docs:
        print('misclassified {}: actual: {}, expected: {}'.format(doc[0], doc[2], doc[1]))

    correct = len(test_docs) - len(misclassified_docs)
    incorrect = len(misclassified_docs)
    accuracy = correct / len(test_docs)
    print("\n")
    print("****************************************")
    print("* RESULTS:")
    print("* Correct: {}".format(correct))
    print("* Incorrect: {}".format(incorrect))
    print("*")
    print("* Accuracy: {}".format(accuracy))
    print("****************************************")

def test(test_docs, prior, likelihood, classes, vocab):
    results = []
    for test_doc in test_docs:
        results.append(test_naive_bayes(test_doc, prior, likelihood, classes, vocab))
    print_test_results(results, test_docs)

# Only run the test harness when executed directly, not when imported as a module.
if __name__ == '__main__':
    classes = [0, 1]
    # Redirect all subsequent prints into output.txt.
    sys.stdout = open('output.txt', 'wt')

    # Preprocess both datasets once and train on the training data.
    vocab, documents = preprocess('trainingSet.txt', 'preprocessed_train.txt')
    _, test_documents = preprocess('testSet.txt', 'preprocessed_test.txt')
    prior, likelihood = train_naive_bayes(vocab, documents, classes)

    # Evaluate on the training data.
    print("\n")
    print("testing on training data")
    test(documents, prior, likelihood, classes, vocab)

    # Evaluate on the held-out test data.
    print("\n")
    print("testing on testing data")
    test(test_documents, prior, likelihood, classes, vocab)