mirror of
https://github.com/caperren/school_archives.git
synced 2025-11-09 21:51:15 +00:00
Added work from my other class repositories before deletion
This commit is contained in:
@@ -0,0 +1,84 @@
|
||||
# eventkwic.txt by Corwin Perren
|
||||
|
||||
###########################################
|
||||
########## Specification Writeup ##########
|
||||
###########################################
|
||||
My specification for the Kwic class is as follows. Once Kwic is constructed, it waits for a method call. If no text has
|
||||
been added to the implementation and either index or listPairs is called, they will return empty arrays. Once text is
|
||||
added using addText, that text is appended to a class variable as one long string. It is done this way to make sure that
|
||||
lines that may be broken up into multiple addText calls still work properly. Now that there is text to process, a call
|
||||
to index will process the whole string that's been stored and return the proper kwic index. If listPairs is called
|
||||
instead, it will internally run index, but not return the indexed data, so that the new text is processed. It then
|
||||
creates the pairs and returns the array. Index works semi-incrementally. Rather than keeping track of every line of text
|
||||
ever added to the class, index processes the new data into the array of tuples (again, just for the new input), appends
|
||||
it to the class's global array, and then re-sorts it. In this way, we avoid having to recompute the sentences that have
|
||||
already been processed, minus having to re-alphabetize. I chose to not re-compute the sentences each time there was a
|
||||
new call to addText as that seemed like a waste of cpu cycles. It's basically a trade off of having a call to index
|
||||
be fast, or having the call to addText be fast. One potential option I considered was to use multi-threading to
|
||||
continually process the input text for both indexing and listing pairs. While this would have greatly increased the
|
||||
complexity (especially where eventspec is concerned), it also would have resulted in a faster implementation.
|
||||
|
||||
I used the eventspec class and the kwic.fsm state machine to verify that all of the above processes happen correctly,
|
||||
including handling the extra constructor fields for periodsToBreaks and ignoreWords when they are used. By printing out
|
||||
the steps taken through processing, it is very easy to see the program take the correct steps. In the case that a
|
||||
catastrophic state logic error has occurred, the EventSpec class will stop execution with a trace statement, making
|
||||
it very easy to diagnose what incorrect step was taken and correct it. I decided not to place event calls where there
|
||||
was the potential for large loops to keep the log of steps taken clear and concise. Had I placed further logic to handle
|
||||
these loops, there could potentially be hundreds, thousands, or more logged steps that would make debugging difficult.
|
||||
In a sense, using EventSpec performs a similar function to unit testing: if you ever change your code in a way that
|
||||
causes it to skip a step, or produce fatally incorrect output, the class will let you know roughly where the changed
|
||||
code is, and therefore gives you a good idea about what went wrong.
|
||||
|
||||
#####################################
|
||||
########## Trace Output #1 ##########
|
||||
#####################################
|
||||
# Code run
|
||||
kc = Kwic(periodsToBreaks=True)
|
||||
kc.addText("This pair? is good.\n So is this pair and that pair")
|
||||
kc.index()
|
||||
kc.reset()
|
||||
kc.addText("This pair? is good.\n So is this pair and that pair")
|
||||
kc.listPairs()
|
||||
kc.print_eventspec_log()
|
||||
|
||||
# Trace Output
|
||||
STEP #0: callConstructor --> idle
|
||||
STEP #1: callAddText --> idle
|
||||
STEP #2: callIndex --> processIndex
|
||||
STEP #3: callProcessIndex --> checkIfText
|
||||
STEP #4: callSplitPeriods --> splitIntoTuples
|
||||
STEP #5: callSplitAsTuples --> fillCircular
|
||||
STEP #6: callFillCircular --> checkIgnoreOrAlpha
|
||||
STEP #7: callAlphabetize --> idle
|
||||
STEP #8: callReset --> idle
|
||||
STEP #9: callAddText --> idle
|
||||
STEP #10: callListPairs --> processListPairs
|
||||
STEP #11: callProcessListPairs --> listPairsIndexOrList
|
||||
STEP #12: callProcessIndex --> checkIfText
|
||||
STEP #13: callSplitPeriods --> splitIntoTuples
|
||||
STEP #14: callSplitAsTuples --> fillCircular
|
||||
STEP #15: callFillCircular --> checkIgnoreOrAlpha
|
||||
STEP #16: callAlphabetize --> idle
|
||||
STEP #17: callCreateListPairs --> idle
|
||||
|
||||
#####################################
|
||||
########## Trace Output #2 ##########
|
||||
#####################################
|
||||
# Code run
|
||||
kc = Kwic(ignoreWords=["and", "So"])
|
||||
kc.addText("This pair? is good.\n So is this pair and that pair")
|
||||
kc.listPairs()
|
||||
kc.print_eventspec_log()
|
||||
|
||||
# Trace Output
|
||||
STEP #0: callConstructor --> idle
|
||||
STEP #1: callAddText --> idle
|
||||
STEP #2: callListPairs --> processListPairs
|
||||
STEP #3: callProcessListPairs --> listPairsIndexOrList
|
||||
STEP #4: callProcessIndex --> checkIfText
|
||||
STEP #5: processNewlineSplit --> splitIntoTuples
|
||||
STEP #6: callSplitAsTuples --> fillCircular
|
||||
STEP #7: callFillCircular --> checkIgnoreOrAlpha
|
||||
STEP #8: callRemoveWords --> removingWords
|
||||
STEP #9: callAlphabetize --> idle
|
||||
STEP #10: callCreateListPairs --> idle
|
||||
@@ -0,0 +1,66 @@
|
||||
|
||||
class EventSpec(object):
    """Checks a stream of events against a finite state machine read from a file.

    Each call to event() moves the machine along the transition named by the
    event, records the step in the trace, and runs any action registered for
    that event with onEvent(). An event with no transition from the current
    state raises RuntimeError.

    Public attributes (set in __init__ and reset()):
        start    -- name of the begin state
        machine  -- dict mapping state name -> {event name: next state name}
        state    -- name of the current state
        trace    -- list of (event, resulting state) tuples, oldest first
        triggers -- dict mapping event name -> zero-argument callable
    """

    def readfsm(self, file):
        """Parse a finite state machine definition file.

        Lines are split on whitespace. Blank lines and lines whose first token
        starts with '#' are skipped. The recognised forms are:

            begin: <state>
            state: <state>
            <event> -> <state>

        A transition belongs to the most recent "state:" line.

        Returns:
            (beginState, fsm), where fsm maps each state name to its
            {event: next state} dict.

        Raises:
            SyntaxError: on an unrecognised line, a state defined twice, or a
                file with no "begin:" line.
        """
        fsm = {}
        beginState = None
        s = None
        t = {}
        with open(file) as f:
            for l in f:
                ls = l.split()
                if not ls or ls[0].startswith("#"):
                    continue
                if ls[0] == "state:" and len(ls) > 1:
                    if s is not None:
                        fsm[s] = t
                    s = ls[1]
                    # Checked when the state is opened so that a duplicate is
                    # caught even when it is the last state in the file.
                    if s in fsm:
                        raise SyntaxError("Cannot define state " + s + " twice")
                    t = {}
                elif len(ls) > 2 and ls[1] == "->":
                    t[ls[0]] = ls[2]
                elif ls[0] == "begin:" and len(ls) > 1:
                    beginState = ls[1]
                else:
                    # Also reached by lines too short to be any of the forms
                    # above, which would otherwise fail with IndexError.
                    raise SyntaxError(l + " is not a line in a finite state machine definition")
        if s is not None:
            fsm[s] = t
        if beginState is None:
            raise SyntaxError("No begin: line in finite state machine definition " + str(file))
        return (beginState, fsm)

    def __init__(self, file):
        """Load the machine from `file` and place it in its begin state."""
        (self.start, self.machine) = self.readfsm(file)
        self.state = self.start
        self.trace = []
        self.triggers = {}

    def onEvent(self, event, action):
        """Register `action` (called with no arguments) to run after `event`."""
        self.triggers[event] = action

    def reset(self):
        """Return to the begin state and clear the trace; triggers are kept."""
        self.state = self.start
        self.trace = []

    # NOTE: __init__ and reset() assign instance attributes named `trace` and
    # `state`, and an instance attribute takes precedence over a method of the
    # same name. On an instance, read `es.trace` / `es.state` directly; these
    # two definitions are kept only so class-level access keeps working.
    def trace(self):
        return self.trace

    def state(self):
        return self.state

    def printLog(self):
        """Print one numbered line per recorded step: event --> resulting state."""
        for i, (e, s) in enumerate(self.trace):
            # A single pre-formatted argument prints identically whether
            # `print` is the Python 2 statement or the Python 3 function.
            print(" STEP #%d: %s --> %s" % (i, e, s))

    def event(self, event):
        """Take the transition named `event` from the current state.

        Updates `state`, appends (event, new state) to `trace`, then runs the
        action registered for `event`, if any.

        Raises:
            RuntimeError: if the current state has no transition for `event`
                (or the current state is not defined in the machine). The
                state and trace are left unchanged in that case.
        """
        # Only the transition lookup is guarded, so a KeyError raised inside a
        # trigger action is not misreported as a disallowed transition.
        try:
            nextState = self.machine[self.state][event]
        except KeyError:
            raise RuntimeError("From state %s, transition %s is not allowed (trace: %s)"
                               % (self.state, event, self.trace))
        self.state = nextState
        self.trace.append((event, nextState))
        if event in self.triggers:
            self.triggers[event]()
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
begin: waitForStart
|
||||
|
||||
state: waitForStart
|
||||
callConstructor -> idle
|
||||
|
||||
state: idle
|
||||
callAddText -> idle
|
||||
callIndex -> processIndex
|
||||
callListPairs -> processListPairs
|
||||
callReset -> idle
|
||||
callCreateListPairs -> idle
|
||||
|
||||
state: processIndex
|
||||
callProcessIndex -> checkIfText
|
||||
|
||||
state: checkIfText
|
||||
callSplitPeriods -> splitIntoTuples
|
||||
processNewlineSplit -> splitIntoTuples
|
||||
processNoNewText -> idle
|
||||
|
||||
state: splitIntoTuples
|
||||
callSplitAsTuples -> fillCircular
|
||||
|
||||
state: fillCircular
|
||||
callFillCircular -> checkIgnoreOrAlpha
|
||||
|
||||
state: checkIgnoreOrAlpha
|
||||
callAlphabetize -> idle
|
||||
callRemoveWords -> removingWords
|
||||
|
||||
state: removingWords
|
||||
callAlphabetize -> idle
|
||||
|
||||
state: processListPairs
|
||||
callProcessListPairs -> listPairsIndexOrList
|
||||
|
||||
state: listPairsIndexOrList
|
||||
callProcessIndex -> checkIfText
|
||||
callCreateListPairs -> idle
|
||||
@@ -0,0 +1,270 @@
|
||||
# Created by Corwin Perren
|
||||
|
||||
######################################################
|
||||
########## Begin Kwic Class Implementation ###########
|
||||
######################################################
|
||||
class Kwic(object):
    """Incremental keyword-in-context (KWIC) indexer with a word-pair listing.

    Text is queued with addText() and only processed when index() or
    listPairs() is called. Every processing step is reported to an EventSpec
    instance loaded from "kwic.fsm", which raises RuntimeError if the steps
    occur in an order that state machine does not allow.
    """

    def __init__(self, ignoreWords=None, periodsToBreaks=False):
        """Create an empty indexer.

        Args:
            ignoreWords: optional iterable of words; circular shifts whose
                first word matches one of them (case-insensitively, ignoring
                surrounding ".:!?,") are left out of the index.
            periodsToBreaks: when true, sentences are split at periods (see
                __split_by_periods) instead of at newlines.
        """
        self.ignore_words = ignoreWords
        self.periods_to_breaks = periodsToBreaks

        # Every processed sentence as (list of words, sentence index).
        self.all_sentence_tuples = []

        self.kwic_output_array = []
        self.list_pairs_output_array = []

        # Text received through addText() that has not been processed yet.
        self.text_to_add = ""

        self.event_spec_instance = EventSpec("kwic.fsm")
        self.event_spec_instance.event("callConstructor")

    def addText(self, new_text):
        """Queue `new_text` (converted with str()) for the next processing pass."""
        self.event_spec_instance.event("callAddText")
        # Queued pieces are joined with a space, so a line split across two
        # calls is still treated as one line.
        self.text_to_add += " " + str(new_text)

    def index(self):
        """Process any queued text and return the KWIC index.

        Returns:
            A list of (words, sentence_index) tuples, one per circular shift
            of every sentence, sorted case-insensitively by their words.
        """
        self.event_spec_instance.event("callIndex")
        self.__process_kwic_index()
        return self.kwic_output_array

    def listPairs(self):
        """Process any queued text and return the co-occurring word pairs.

        Returns:
            A sorted list of ((first, second), count) for every pair of
            distinct words that appears together in more than one sentence;
            count is the number of such sentences.
        """
        self.event_spec_instance.event("callListPairs")
        self.__process_list_pairs()
        return self.list_pairs_output_array

    def reset(self):
        """Discard all processed and queued text."""
        self.event_spec_instance.event("callReset")
        self.all_sentence_tuples = []

        self.kwic_output_array = []
        self.list_pairs_output_array = []

        # Must stay a string: addText() appends to it with += and the
        # newline-split path calls .split() on it.
        self.text_to_add = ""

    def print_eventspec_log(self):
        """Print the steps recorded by the EventSpec instance."""
        self.event_spec_instance.printLog()

    def __process_kwic_index(self):
        """Fold the queued text into the index, then re-sort the whole index."""
        self.event_spec_instance.event("callProcessIndex")

        if not self.text_to_add:
            self.event_spec_instance.event("processNoNewText")
            return

        if self.periods_to_breaks:
            split_into_sentences = self.__split_by_periods(self.text_to_add)
        else:
            self.event_spec_instance.event("processNewlineSplit")
            split_into_sentences = self.text_to_add.split('\n')

        split_into_word_tuples = self.__split_by_word_as_tuples(split_into_sentences)
        self.all_sentence_tuples.extend(split_into_word_tuples)

        circular_shifted_data = self.__fill_with_circular_shifts_and_original(split_into_word_tuples)

        if self.ignore_words:
            circular_shifted_data = self.__remove_words(circular_shifted_data, self.ignore_words)

        self.kwic_output_array.extend(circular_shifted_data)
        self.text_to_add = ""

        self.kwic_output_array = self.__alphabetize_tuple_list(self.kwic_output_array)

    def __process_list_pairs(self):
        """Process queued text if there is any, then rebuild the pair list."""
        self.event_spec_instance.event("callProcessListPairs")
        if self.text_to_add:
            self.__process_kwic_index()

        self.list_pairs_output_array = self.__create_list_pairs(self.all_sentence_tuples)

    def __fill_with_circular_shifts_and_original(self, sentence_array):
        """Return every rotation of every sentence, keeping its sentence index."""
        self.event_spec_instance.event("callFillCircular")
        return [(self.__array_circular_shift(words, shift), sentence_index)
                for words, sentence_index in sentence_array
                for shift in range(len(words))]

    def __alphabetize_tuple_list(self, input_array):
        """Return a copy of `input_array` sorted case-insensitively by words."""
        self.event_spec_instance.event("callAlphabetize")
        return sorted(input_array, key=self.__alphabetized_key)

    def __create_list_pairs(self, input_array):
        """Count, per pair of distinct words, the sentences containing both.

        Words are compared after __sanitize_word(); words that sanitize to the
        empty string are ignored. Only pairs found in more than one sentence
        are returned, as ((first, second), count) with first < second.
        """
        self.event_spec_instance.event("callCreateListPairs")
        known_pairs = {}

        for sentence_array, _ in input_array:
            sanitized = (self.__sanitize_word(word) for word in sentence_array)
            # Unique and sorted, so each unordered pair is visited exactly once
            # per sentence and always as (smaller, larger).
            unique_words = sorted(set(word for word in sanitized if word))

            for position, first in enumerate(unique_words):
                for second in unique_words[position + 1:]:
                    known_pairs[(first, second)] = known_pairs.get((first, second), 0) + 1

        output_list = [(pair, count) for pair, count in known_pairs.items() if count > 1]
        output_list.sort(key=self.__alphabetized_key)
        return output_list

    def __split_by_periods(self, document):
        """Split `document` into sentences at sentence-ending periods.

        A period ends a sentence when it is the first or last character of the
        document, or when it follows a lowercase letter and is followed by
        whitespace. The period stays at the end of its sentence. Any other
        period is kept inside the sentence, and newlines become spaces.
        """
        self.event_spec_instance.event("callSplitPeriods")
        output_array = []
        current_sentence = []
        last_index = len(document) - 1

        for position, character in enumerate(document):
            if character == '\n':
                current_sentence.append(" ")
                continue

            current_sentence.append(character)
            if character != '.':
                continue

            # The two boundary tests come first so the neighbour lookups below
            # never index outside the document.
            if (position == 0) or (position == last_index) or \
                    (document[position - 1].islower() and document[position + 1].isspace()):
                output_array.append("".join(current_sentence))
                current_sentence = []

        if current_sentence:
            output_array.append("".join(current_sentence))
        return output_array

    def __split_by_word_as_tuples(self, sentence_array):
        """Split each sentence on spaces into (words, sentence_index).

        Numbering continues from the sentences already processed, so indices
        stay unique when text is indexed in several batches.
        """
        self.event_spec_instance.event("callSplitAsTuples")
        first_index = len(self.all_sentence_tuples)

        # A list comprehension (rather than filter) so the words are a real
        # list on every Python version; empty strings come from repeated spaces.
        return [([word for word in sentence.split(" ") if word], first_index + offset)
                for offset, sentence in enumerate(sentence_array)]

    def __array_circular_shift(self, input_array, rotate_val):
        """Return `input_array` rotated left by `rotate_val` positions."""
        return input_array[rotate_val:] + input_array[:rotate_val]

    def __alphabetized_key(self, input_data):
        """Sort key: the lowercased words of the first element of `input_data`."""
        return [word.lower() for word in input_data[0]]

    def __remove_words(self, input_array, words):
        """Drop shifts whose first word is one of `words`.

        The first word is lowercased and stripped of surrounding ".:!?,"
        before being compared with the lowercased `words`.
        """
        self.event_spec_instance.event("callRemoveWords")
        lowered_words = set(word.lower() for word in words)

        return [current_tuple for current_tuple in input_array
                if current_tuple[0][0].lower().strip(".:!?,") not in lowered_words]

    def __sanitize_word(self, input_word):
        """Return `input_word` lowercased with every ".,?!:" character removed."""
        return "".join(character for character in input_word.lower()
                       if character not in ".,?!:")

    def __return_ordered_words(self, word_one, word_two):
        """Return the two words as a tuple with the smaller one first."""
        if word_one < word_two:
            return word_one, word_two
        return word_two, word_one
|
||||
|
||||
|
||||
######################################################
|
||||
########## Begin Provided EventSpec Include ##########
|
||||
######################################################
|
||||
class EventSpec(object):
    """Checks a stream of events against a finite state machine read from a file.

    Each call to event() moves the machine along the transition named by the
    event, records the step in the trace, and runs any action registered for
    that event with onEvent(). An event with no transition from the current
    state raises RuntimeError.

    Public attributes (set in __init__ and reset()):
        start    -- name of the begin state
        machine  -- dict mapping state name -> {event name: next state name}
        state    -- name of the current state
        trace    -- list of (event, resulting state) tuples, oldest first
        triggers -- dict mapping event name -> zero-argument callable
    """

    def readfsm(self, file):
        """Parse a finite state machine definition file.

        Lines are split on whitespace. Blank lines and lines whose first token
        starts with '#' are skipped. The recognised forms are:

            begin: <state>
            state: <state>
            <event> -> <state>

        A transition belongs to the most recent "state:" line.

        Returns:
            (beginState, fsm), where fsm maps each state name to its
            {event: next state} dict.

        Raises:
            SyntaxError: on an unrecognised line, a state defined twice, or a
                file with no "begin:" line.
        """
        fsm = {}
        beginState = None
        s = None
        t = {}
        with open(file) as f:
            for l in f:
                ls = l.split()
                if not ls or ls[0].startswith("#"):
                    continue
                if ls[0] == "state:" and len(ls) > 1:
                    if s is not None:
                        fsm[s] = t
                    s = ls[1]
                    # Checked when the state is opened so that a duplicate is
                    # caught even when it is the last state in the file.
                    if s in fsm:
                        raise SyntaxError("Cannot define state " + s + " twice")
                    t = {}
                elif len(ls) > 2 and ls[1] == "->":
                    t[ls[0]] = ls[2]
                elif ls[0] == "begin:" and len(ls) > 1:
                    beginState = ls[1]
                else:
                    # Also reached by lines too short to be any of the forms
                    # above, which would otherwise fail with IndexError.
                    raise SyntaxError(l + " is not a line in a finite state machine definition")
        if s is not None:
            fsm[s] = t
        if beginState is None:
            raise SyntaxError("No begin: line in finite state machine definition " + str(file))
        return (beginState, fsm)

    def __init__(self, file):
        """Load the machine from `file` and place it in its begin state."""
        (self.start, self.machine) = self.readfsm(file)
        self.state = self.start
        self.trace = []
        self.triggers = {}

    def onEvent(self, event, action):
        """Register `action` (called with no arguments) to run after `event`."""
        self.triggers[event] = action

    def reset(self):
        """Return to the begin state and clear the trace; triggers are kept."""
        self.state = self.start
        self.trace = []

    # NOTE: __init__ and reset() assign instance attributes named `trace` and
    # `state`, and an instance attribute takes precedence over a method of the
    # same name. On an instance, read `es.trace` / `es.state` directly; these
    # two definitions are kept only so class-level access keeps working.
    def trace(self):
        return self.trace

    def state(self):
        return self.state

    def printLog(self):
        """Print one numbered line per recorded step: event --> resulting state."""
        for i, (e, s) in enumerate(self.trace):
            # A single pre-formatted argument prints identically whether
            # `print` is the Python 2 statement or the Python 3 function.
            print(" STEP #%d: %s --> %s" % (i, e, s))

    def event(self, event):
        """Take the transition named `event` from the current state.

        Updates `state`, appends (event, new state) to `trace`, then runs the
        action registered for `event`, if any.

        Raises:
            RuntimeError: if the current state has no transition for `event`
                (or the current state is not defined in the machine). The
                state and trace are left unchanged in that case.
        """
        # Only the transition lookup is guarded, so a KeyError raised inside a
        # trigger action is not misreported as a disallowed transition.
        try:
            nextState = self.machine[self.state][event]
        except KeyError:
            raise RuntimeError("From state %s, transition %s is not allowed (trace: %s)"
                               % (self.state, event, self.trace))
        self.state = nextState
        self.trace.append((event, nextState))
        if event in self.triggers:
            self.triggers[event]()
|
||||
@@ -0,0 +1,13 @@
|
||||
begin: start
|
||||
|
||||
state: start
|
||||
a -> infinibs
|
||||
b -> infinias
|
||||
|
||||
state: infinibs
|
||||
b -> infinibs
|
||||
a -> start
|
||||
|
||||
state: infinias
|
||||
a -> infinias
|
||||
b -> start
|
||||
@@ -0,0 +1,12 @@
|
||||
# For testing implementation
|
||||
from kwic import Kwic
|
||||
|
||||
# Expected KWIC index for the sample text used below: every circular shift of
# each line paired with its line number, in case-insensitive word order.
output_check = [
    (['and', 'that', 'pair', 'So', 'is', 'this', 'pair'], 1),
    (['good.', 'This', 'pair?', 'is'], 0),
    (['is', 'good.', 'This', 'pair?'], 0),
    (['is', 'this', 'pair', 'and', 'that', 'pair', 'So'], 1),
    (['pair', 'and', 'that', 'pair', 'So', 'is', 'this'], 1),
    (['pair', 'So', 'is', 'this', 'pair', 'and', 'that'], 1),
    (['pair?', 'is', 'good.', 'This'], 0),
    (['So', 'is', 'this', 'pair', 'and', 'that', 'pair'], 1),
    (['that', 'pair', 'So', 'is', 'this', 'pair', 'and'], 1),
    (['this', 'pair', 'and', 'that', 'pair', 'So', 'is'], 1),
    (['This', 'pair?', 'is', 'good.'], 0),
]

# Expected pair listing for the same text: each word pair with its count.
output_check_pairs = [
    (('is', 'pair'), 2),
    (('is', 'this'), 2),
    (('pair', 'this'), 2),
]

if __name__ == "__main__":
    kc = Kwic()
    kc.addText("This pair? is good.\n So is this pair and that pair")

    # Each line prints True when the result equals the expected data above.
    print(kc.listPairs() == output_check_pairs)
    print(kc.index() == output_check)

    kc.print_eventspec_log()
|
||||
@@ -0,0 +1,29 @@
|
||||
from assign3 import eventspec
|
||||
|
||||
# Feed a fixed sequence of events to the state machine, then print the steps.
es = eventspec.EventSpec("kwic.fsm")

for event_name in ("a", "b", "b", "b", "b", "a"):
    es.event(event_name)

es.printLog()


# Beatles code

def printHello():
    """Action registered for event "a"."""
    print("Hello")


def printGoodbye():
    """Action registered for event "b"."""
    print("Goodbye")


es.onEvent("a", printHello)
es.onEvent("b", printGoodbye)

# Same machine, now with an action running on every event.
for event_name in ("a", "a", "a", "b", "b", "a"):
    es.event(event_name)

es.printLog()
|
||||
Reference in New Issue
Block a user