mirror of
https://github.com/caperren/school_archives.git
synced 2025-11-09 13:41:13 +00:00
Added work from my other class repositories before deletion
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
def kwic(document, listPairs=False, ignoreWords=None):
    """Placeholder KWIC entry point that always reports an empty index.

    Args:
        document: Text to index; this stub never inspects it.
        listPairs: Accepted for interface compatibility; unused.
        ignoreWords: Accepted for interface compatibility; unused. Defaults
            to None so no mutable list is shared between calls.

    Returns:
        A fresh empty list on every call.
    """
    return []
|
||||
@@ -0,0 +1,6 @@
|
||||
from kwic import kwic
|
||||
|
||||
# An empty document exercises the "no data" path.
document = ""

if __name__ == "__main__":
    # With nothing to index, kwic must hand back an empty list.
    assert kwic(document) == []
|
||||
@@ -0,0 +1,5 @@
|
||||
def kwic(document, listPairs=False, ignoreWords=None):
    """Wrap a non-empty document in a single-element list.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; unused.
        ignoreWords: Accepted for interface compatibility; unused. Defaults
            to None so no mutable list is shared between calls.

    Returns:
        ``[]`` when *document* is empty (falsy), otherwise ``[document]``.
    """
    if not document:
        return []

    return [document]
|
||||
@@ -0,0 +1,8 @@
|
||||
from kwic import kwic
|
||||
|
||||
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."

if __name__ == "__main__":
    # No text in means nothing out.
    assert kwic(empty_document) == []

    # Any real text must yield a non-empty result.
    assert kwic(design_words_doc) != []
|
||||
@@ -0,0 +1,7 @@
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into lines and return them nested in an outer list.

    Args:
        document: Text to index.
        listPairs: Accepted but not used by this version.
        ignoreWords: Accepted but not used by this version.
        periodsToBreaks: Accepted but not used by this version.

    Returns:
        ``[]`` for an empty document, otherwise a one-element list whose
        only item is the list of lines from ``document.splitlines()``.
    """
    return [document.splitlines()] if document else []
|
||||
@@ -0,0 +1,12 @@
|
||||
from kwic import kwic
|
||||
|
||||
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"

if __name__ == "__main__":
    # No text in means nothing out.
    assert kwic(empty_document) == []

    # Any real text must yield a non-empty result.
    assert kwic(design_words_doc) != []

    # The first element holds the sentences: two lines in, two sentences out.
    assert len(kwic(design_words_doc)[0]) == 2

    # Four lines in, four sentences out.
    assert len(kwic(goodbye_buddy_doc)[0]) == 4
|
||||
@@ -0,0 +1,10 @@
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into sentences and return them nested in a list.

    Args:
        document: Text to index.
        listPairs: Accepted but not used by this version.
        ignoreWords: Accepted but not used by this version.
        periodsToBreaks: When true, cut on "." (the periods themselves are
            dropped by ``str.split``); otherwise cut on line boundaries.

    Returns:
        ``[]`` for an empty document, otherwise a one-element list whose
        only item is the list of sentence strings.
    """
    if not document:
        return []

    sentences = document.split(".") if periodsToBreaks else document.splitlines()
    return [sentences]
|
||||
@@ -0,0 +1,25 @@
|
||||
from kwic import kwic
|
||||
|
||||
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"

if __name__ == "__main__":
    # No text in means nothing out.
    assert kwic(empty_document) == []

    # Any real text must yield a non-empty result.
    assert kwic(design_words_doc) != []

    # Two newline-separated lines give two sentences in the first element.
    assert len(kwic(design_words_doc)[0]) == 2

    # Four newline-separated lines give four sentences.
    assert len(kwic(goodbye_buddy_doc)[0]) == 4

    # With no newlines and no period splitting, the text is one sentence.
    assert len(kwic(hello_buddy_periods)[0]) == 1

    # Splitting on periods instead breaks the same text into four pieces.
    assert len(kwic(hello_buddy_periods, periodsToBreaks=True)[0]) == 4
|
||||
@@ -0,0 +1,29 @@
|
||||
def split_by_periods(document):
    """Break *document* into sentences, cutting right after each period.

    Newline characters are discarded outright (no space is substituted), and
    every "." stays attached to the end of the sentence it closes. Any text
    left over after the last period is returned as a final entry.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order; empty when the
        document holds nothing besides newlines.
    """
    # Dropping newlines first lets a single split() find every boundary.
    pieces = document.replace("\n", "").split(".")

    # Every piece except the last was followed by a period; put it back.
    sentences = [piece + "." for piece in pieces[:-1]]

    # The last piece is whatever trailed the final period (possibly nothing).
    if pieces[-1]:
        sentences.append(pieces[-1])

    return sentences
|
||||
|
||||
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into a flat list of sentence strings.

    Args:
        document: Text to index.
        listPairs: Accepted but not used by this version.
        ignoreWords: Accepted but not used by this version.
        periodsToBreaks: When true, sentences are produced by
            ``split_by_periods``; otherwise by ``document.splitlines()``.

    Returns:
        ``[]`` for an empty document, otherwise the list of sentences.
    """
    if not document:
        return []

    if periodsToBreaks:
        return split_by_periods(document)

    return document.splitlines()
|
||||
@@ -0,0 +1,29 @@
|
||||
from kwic import kwic
|
||||
|
||||
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"

hello_buddy_periods_output = [
    "Hello there.",
    " Hello there, buddy.",
    " Hello and goodbye, buddy.",
    " Hello is like buddy Goodbye!",
]

if __name__ == "__main__":
    # No text in means nothing out.
    assert kwic(empty_document) == []

    # Any real text must yield a non-empty result.
    assert kwic(design_words_doc) != []

    # Two newline-separated lines give two sentences.
    assert len(kwic(design_words_doc)) == 2

    # Four newline-separated lines give four sentences.
    assert len(kwic(goodbye_buddy_doc)) == 4

    # With no newlines and no period splitting, the text comes back whole.
    assert kwic(hello_buddy_periods)[0] == hello_buddy_periods

    # Splitting on periods yields four sentences, and each sentence keeps
    # the period that ended it.
    assert kwic(hello_buddy_periods, periodsToBreaks=True) == hello_buddy_periods_output
|
||||
@@ -0,0 +1,44 @@
|
||||
def split_by_periods(document):
    """Break *document* into sentences, cutting right after each period.

    Newline characters are discarded outright (no space is substituted), and
    every "." stays attached to the end of the sentence it closes. Any text
    left over after the last period is returned as a final entry.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order; empty when the
        document holds nothing besides newlines.
    """
    # Dropping newlines first lets a single split() find every boundary.
    pieces = document.replace("\n", "").split(".")

    # Every piece except the last was followed by a period; put it back.
    sentences = [piece + "." for piece in pieces[:-1]]

    # The last piece is whatever trailed the final period (possibly nothing).
    if pieces[-1]:
        sentences.append(pieces[-1])

    return sentences
|
||||
|
||||
|
||||
def split_by_word_as_tuples(sentence_array):
    """Pair each sentence's words with the sentence's position.

    Sentences are split on single spaces and the empty strings produced by
    leading, trailing or repeated spaces are discarded. The words are stored
    in a real list (not a lazy ``filter`` object) so the result can be
    indexed, sliced and compared on Python 3 as well as Python 2.

    Args:
        sentence_array: Iterable of sentence strings.

    Returns:
        A list of ``(words, index)`` tuples, where *words* is the list of
        non-empty words and *index* is the sentence's zero-based position.
    """
    return [
        ([word for word in sentence.split(" ") if word], sentence_index)
        for sentence_index, sentence in enumerate(sentence_array)
    ]
|
||||
|
||||
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into sentences, then into indexed word lists.

    Args:
        document: Text to index.
        listPairs: Accepted but not used by this version.
        ignoreWords: Accepted but not used by this version.
        periodsToBreaks: When true, sentences come from
            ``split_by_periods``; otherwise from ``document.splitlines()``.

    Returns:
        ``[]`` for an empty document, otherwise whatever
        ``split_by_word_as_tuples`` returns for the sentences.
    """
    if not document:
        return []

    sentences = split_by_periods(document) if periodsToBreaks else document.splitlines()
    return split_by_word_as_tuples(sentences)
|
||||
@@ -0,0 +1,33 @@
|
||||
from kwic import kwic
|
||||
|
||||
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"

hello_buddy_periods_output = [(["Hello", "there.", "Hello", "there,", "buddy.", "Hello", "and", "goodbye,", "buddy.",
                                "Hello", "is", "like", "buddy", "Goodbye!"], 0)]

hello_buddy_word_tuples_output = [(["Hello", "there."], 0),
                                  (["Hello", "there,", "buddy."], 1),
                                  (["Hello", "and", "goodbye,", "buddy."], 2),
                                  (["Hello", "is", "like", "buddy", "Goodbye!"], 3)]

if __name__ == "__main__":
    # Ensure empty input gives empty output
    assert kwic(empty_document) == []

    # Ensure real input does not produce empty output
    assert kwic(design_words_doc) != []

    # Make sure it's broken into two lines
    assert len(kwic(design_words_doc)) == 2

    # Make sure it's broken into four lines
    assert len(kwic(goodbye_buddy_doc)) == 4

    # The result must match the expected word tuples exactly. any() would
    # pass as soon as a single element matched.
    assert kwic(hello_buddy_periods) == hello_buddy_periods_output

    # Same exact comparison when periods are used as sentence breaks.
    assert kwic(hello_buddy_periods, periodsToBreaks=True) == hello_buddy_word_tuples_output
|
||||
@@ -0,0 +1,61 @@
|
||||
def split_by_periods(document):
    """Break *document* into sentences, cutting right after each period.

    Newline characters are discarded outright (no space is substituted), and
    every "." stays attached to the end of the sentence it closes. Any text
    left over after the last period is returned as a final entry.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order; empty when the
        document holds nothing besides newlines.
    """
    # Dropping newlines first lets a single split() find every boundary.
    pieces = document.replace("\n", "").split(".")

    # Every piece except the last was followed by a period; put it back.
    sentences = [piece + "." for piece in pieces[:-1]]

    # The last piece is whatever trailed the final period (possibly nothing).
    if pieces[-1]:
        sentences.append(pieces[-1])

    return sentences
|
||||
|
||||
|
||||
def split_by_word_as_tuples(sentence_array):
    """Pair each sentence's words with the sentence's position.

    Sentences are split on single spaces and the empty strings produced by
    leading, trailing or repeated spaces are discarded. The words are stored
    in a real list (not a lazy ``filter`` object) so the result can be
    indexed, sliced and compared on Python 3 as well as Python 2.

    Args:
        sentence_array: Iterable of sentence strings.

    Returns:
        A list of ``(words, index)`` tuples, where *words* is the list of
        non-empty words and *index* is the sentence's zero-based position.
    """
    return [
        ([word for word in sentence.split(" ") if word], sentence_index)
        for sentence_index, sentence in enumerate(sentence_array)
    ]
|
||||
|
||||
|
||||
def array_circular_shift(input_array, rotate_val):
    """Return a copy of *input_array* rotated left by *rotate_val* positions.

    The rotation amount is reduced modulo the sequence length, so values
    larger than the length (or more negative than ``-len``) wrap around
    instead of silently returning the sequence unrotated. Negative values
    rotate to the right.

    Args:
        input_array: Sliceable sequence to rotate; it is not modified.
        rotate_val: Number of positions to rotate left.

    Returns:
        A new sequence holding the same items in rotated order.
    """
    if not input_array:
        # Nothing to rotate; also avoids a modulo-by-zero below.
        return input_array[:]

    offset = rotate_val % len(input_array)
    return input_array[offset:] + input_array[:offset]
|
||||
|
||||
|
||||
def fill_with_circular_shifts_and_original(sentence_array):
    """Expand every sentence into all of its circular word rotations.

    For each ``(words, index)`` entry, one output entry is produced per word:
    the word list rotated by 0, 1, 2, ... positions via
    ``array_circular_shift``, each paired with the same sentence index.
    Rotation 0 is the original word order.

    Args:
        sentence_array: Iterable of entries whose item 0 is a word list and
            whose item 1 is the sentence index.

    Returns:
        A list of ``(rotated_words, index)`` tuples, grouped by sentence in
        input order. Sentences with no words contribute nothing.
    """
    shifted_entries = []

    for entry in sentence_array:
        words = entry[0]
        sentence_index = entry[1]
        shifted_entries.extend(
            (array_circular_shift(words, offset), sentence_index)
            for offset, _word in enumerate(words)
        )

    return shifted_entries
|
||||
|
||||
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build the circularly shifted word index for a document.

    Args:
        document: Text to index.
        listPairs: Accepted but not used by this version.
        ignoreWords: Accepted but not used by this version.
        periodsToBreaks: When true, sentences come from
            ``split_by_periods``; otherwise from ``document.splitlines()``.

    Returns:
        A 2-tuple ``(shifts, pairs)``. *shifts* is the list returned by
        ``fill_with_circular_shifts_and_original``; *pairs* is always an
        empty list here. An empty document gives ``([], [])``.
    """
    if not document:
        return [], []

    sentences = split_by_periods(document) if periodsToBreaks else document.splitlines()
    word_tuples = split_by_word_as_tuples(sentences)

    return fill_with_circular_shifts_and_original(word_tuples), []
|
||||
@@ -0,0 +1,55 @@
|
||||
import kwic
|
||||
|
||||
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"

this_is_split_periods_circular_output = [
    (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
    (['is', 'something', 'split', 'by', 'periods.', 'This'], 0),
    (['something', 'split', 'by', 'periods.', 'This', 'is'], 0),
    (['split', 'by', 'periods.', 'This', 'is', 'something'], 0),
    (['by', 'periods.', 'This', 'is', 'something', 'split'], 0),
    (['periods.', 'This', 'is', 'something', 'split', 'by'], 0),
    (['Not', 'newlines.'], 1),
    (['newlines.', 'Not'], 1)
]

if __name__ == "__main__":
    # Ensure empty input gives empty output
    assert kwic.kwic(empty_document) == ([], [])

    # Ensure real input does not produce empty output
    assert kwic.kwic(design_words_doc) != ([], [])

    # Two sentences of three words each give 3 + 3 = 6 circular shifts
    assert len(kwic.kwic(design_words_doc)[0]) == 6

    # Four sentences of 2, 3, 4 and 5 words give 14 circular shifts
    assert len(kwic.kwic(goodbye_buddy_doc)[0]) == 14

    # Just realized I can check each individual function, so here is the check to split by periods.
    # Newlines are deleted, not replaced, so "Not \n newlines." keeps both surrounding spaces.
    assert (kwic.split_by_periods("This is something split \nby periods. Not \n newlines.") ==
            ["This is something split by periods.", " Not  newlines."])

    # This checks to make sure a sentence gets split into words properly
    assert (kwic.split_by_word_as_tuples(["This is something split by periods.", " Not newlines."]) ==
            [(['This', 'is', 'something', 'split', 'by', 'periods.'], 0), (['Not', 'newlines.'], 1)])

    # These check to make sure that the circular shift function is rotating properly
    assert (kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 0) ==
            ["One", "Two", "Three", "Four", "Five"])

    assert (kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 2) ==
            ["Three", "Four", "Five", "One", "Two"])

    # This checks to make sure that circularly shifted versions of all sentences are correct
    assert (kwic.fill_with_circular_shifts_and_original([(['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
                                                         (['Not', 'newlines.'], 1)]) ==
            this_is_split_periods_circular_output)

    # This gives a bit of a sanity check. It's making sure the circular shift works, and that the output is formatted
    # correctly...
    assert (kwic.kwic("This is something split \nby periods. Not \n newlines.", periodsToBreaks=True) ==
            (this_is_split_periods_circular_output, []))
|
||||
@@ -0,0 +1,64 @@
|
||||
def split_by_periods(document):
    """Break *document* into sentences, cutting right after each period.

    Newline characters are discarded outright (no space is substituted), and
    every "." stays attached to the end of the sentence it closes. Any text
    left over after the last period is returned as a final entry.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order; empty when the
        document holds nothing besides newlines.
    """
    # Dropping newlines first lets a single split() find every boundary.
    pieces = document.replace("\n", "").split(".")

    # Every piece except the last was followed by a period; put it back.
    sentences = [piece + "." for piece in pieces[:-1]]

    # The last piece is whatever trailed the final period (possibly nothing).
    if pieces[-1]:
        sentences.append(pieces[-1])

    return sentences
|
||||
|
||||
|
||||
def split_by_word_as_tuples(sentence_array):
    """Pair each sentence's words with the sentence's position.

    Sentences are split on single spaces and the empty strings produced by
    leading, trailing or repeated spaces are discarded. The words are stored
    in a real list (not a lazy ``filter`` object) so the result can be
    indexed, sliced and compared on Python 3 as well as Python 2.

    Args:
        sentence_array: Iterable of sentence strings.

    Returns:
        A list of ``(words, index)`` tuples, where *words* is the list of
        non-empty words and *index* is the sentence's zero-based position.
    """
    return [
        ([word for word in sentence.split(" ") if word], sentence_index)
        for sentence_index, sentence in enumerate(sentence_array)
    ]
|
||||
|
||||
|
||||
def array_circular_shift(input_array, rotate_val):
    """Return a copy of *input_array* rotated left by *rotate_val* positions.

    The rotation amount is reduced modulo the sequence length, so values
    larger than the length (or more negative than ``-len``) wrap around
    instead of silently returning the sequence unrotated. Negative values
    rotate to the right.

    Args:
        input_array: Sliceable sequence to rotate; it is not modified.
        rotate_val: Number of positions to rotate left.

    Returns:
        A new sequence holding the same items in rotated order.
    """
    if not input_array:
        # Nothing to rotate; also avoids a modulo-by-zero below.
        return input_array[:]

    offset = rotate_val % len(input_array)
    return input_array[offset:] + input_array[:offset]
|
||||
|
||||
|
||||
def fill_with_circular_shifts_and_original(sentence_array):
    """Expand every sentence into all of its circular word rotations.

    For each ``(words, index)`` entry, one output entry is produced per word:
    the word list rotated by 0, 1, 2, ... positions via
    ``array_circular_shift``, each paired with the same sentence index.
    Rotation 0 is the original word order.

    Args:
        sentence_array: Iterable of entries whose item 0 is a word list and
            whose item 1 is the sentence index.

    Returns:
        A list of ``(rotated_words, index)`` tuples, grouped by sentence in
        input order. Sentences with no words contribute nothing.
    """
    shifted_entries = []

    for entry in sentence_array:
        words = entry[0]
        sentence_index = entry[1]
        shifted_entries.extend(
            (array_circular_shift(words, offset), sentence_index)
            for offset, _word in enumerate(words)
        )

    return shifted_entries
|
||||
|
||||
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build the circularly shifted word index for a document.

    Args:
        document: Text to index.
        listPairs: When true, the result is a 2-tuple ``(shifts, pairs)``
            instead of the bare shift list. Pair detection is not
            implemented, so *pairs* is always an empty list.
        ignoreWords: Accepted but not used by this version.
        periodsToBreaks: When true, sentences come from
            ``split_by_periods``; otherwise the document is split on "\\n".

    Returns:
        The list returned by ``fill_with_circular_shifts_and_original``, or
        ``(that_list, [])`` when *listPairs* is true. An empty document
        gives ``[]`` or ``([], [])`` respectively, so the result shape
        depends only on *listPairs*.
    """
    if not document:
        # Mirror the shape used for real input so callers can unpack safely.
        return ([], []) if listPairs else []

    if periodsToBreaks:
        sentences = split_by_periods(document)
    else:
        sentences = document.split('\n')

    shifts = fill_with_circular_shifts_and_original(split_by_word_as_tuples(sentences))

    return (shifts, []) if listPairs else shifts
|
||||
@@ -0,0 +1,55 @@
|
||||
import kwic
|
||||
|
||||
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"

this_is_split_periods_circular_output = [
    (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
    (['is', 'something', 'split', 'by', 'periods.', 'This'], 0),
    (['something', 'split', 'by', 'periods.', 'This', 'is'], 0),
    (['split', 'by', 'periods.', 'This', 'is', 'something'], 0),
    (['by', 'periods.', 'This', 'is', 'something', 'split'], 0),
    (['periods.', 'This', 'is', 'something', 'split', 'by'], 0),
    (['Not', 'newlines.'], 1),
    (['newlines.', 'Not'], 1)
]

if __name__ == "__main__":
    # Ensure empty input gives empty output
    assert kwic.kwic(empty_document) == []

    # Ensure real input does not produce empty output
    assert kwic.kwic(design_words_doc) != []

    # Two sentences of three words each give 3 + 3 = 6 circular shifts
    assert len(kwic.kwic(design_words_doc)) == 6

    # Four sentences of 2, 3, 4 and 5 words give 14 circular shifts
    assert len(kwic.kwic(goodbye_buddy_doc)) == 14

    # Just realized I can check each individual function, so here is the check to split by periods.
    # Newlines are deleted, not replaced, so "Not \n newlines." keeps both surrounding spaces.
    assert (kwic.split_by_periods("This is something split \nby periods. Not \n newlines.") ==
            ["This is something split by periods.", " Not  newlines."])

    # This checks to make sure a sentence gets split into words properly
    assert (kwic.split_by_word_as_tuples(["This is something split by periods.", " Not newlines."]) ==
            [(['This', 'is', 'something', 'split', 'by', 'periods.'], 0), (['Not', 'newlines.'], 1)])

    # These check to make sure that the circular shift function is rotating properly
    assert (kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 0) ==
            ["One", "Two", "Three", "Four", "Five"])

    assert (kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 2) ==
            ["Three", "Four", "Five", "One", "Two"])

    # This checks to make sure that circularly shifted versions of all sentences are correct
    assert (kwic.fill_with_circular_shifts_and_original([(['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
                                                         (['Not', 'newlines.'], 1)]) ==
            this_is_split_periods_circular_output)

    # This gives a bit of a sanity check. It's making sure the circular shift works, and that the output is formatted
    # correctly...
    assert (kwic.kwic("This is something split \nby periods. Not \n newlines.", periodsToBreaks=True) ==
            this_is_split_periods_circular_output)
|
||||
@@ -0,0 +1,64 @@
|
||||
def split_by_periods(document):
    """Break *document* into sentences, cutting right after each period.

    Newline characters are discarded outright (no space is substituted), and
    every "." stays attached to the end of the sentence it closes. Any text
    left over after the last period is returned as a final entry.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order; empty when the
        document holds nothing besides newlines.
    """
    # Dropping newlines first lets a single split() find every boundary.
    pieces = document.replace("\n", "").split(".")

    # Every piece except the last was followed by a period; put it back.
    sentences = [piece + "." for piece in pieces[:-1]]

    # The last piece is whatever trailed the final period (possibly nothing).
    if pieces[-1]:
        sentences.append(pieces[-1])

    return sentences
|
||||
|
||||
|
||||
def split_by_word_as_tuples(sentence_array):
    """Pair each sentence's words with the sentence's position.

    Sentences are split on single spaces and the empty strings produced by
    leading, trailing or repeated spaces are discarded. The words are stored
    in a real list (not a lazy ``filter`` object) so the result can be
    indexed, sliced and compared on Python 3 as well as Python 2.

    Args:
        sentence_array: Iterable of sentence strings.

    Returns:
        A list of ``(words, index)`` tuples, where *words* is the list of
        non-empty words and *index* is the sentence's zero-based position.
    """
    return [
        ([word for word in sentence.split(" ") if word], sentence_index)
        for sentence_index, sentence in enumerate(sentence_array)
    ]
|
||||
|
||||
|
||||
def array_circular_shift(input_array, rotate_val):
    """Return a copy of *input_array* rotated left by *rotate_val* positions.

    The rotation amount is reduced modulo the sequence length, so values
    larger than the length (or more negative than ``-len``) wrap around
    instead of silently returning the sequence unrotated. Negative values
    rotate to the right.

    Args:
        input_array: Sliceable sequence to rotate; it is not modified.
        rotate_val: Number of positions to rotate left.

    Returns:
        A new sequence holding the same items in rotated order.
    """
    if not input_array:
        # Nothing to rotate; also avoids a modulo-by-zero below.
        return input_array[:]

    offset = rotate_val % len(input_array)
    return input_array[offset:] + input_array[:offset]
|
||||
|
||||
|
||||
def fill_with_circular_shifts_and_original(sentence_array):
    """Expand every sentence into all of its circular word rotations.

    For each ``(words, index)`` entry, one output entry is produced per word:
    the word list rotated by 0, 1, 2, ... positions via
    ``array_circular_shift``, each paired with the same sentence index.
    Rotation 0 is the original word order.

    Args:
        sentence_array: Iterable of entries whose item 0 is a word list and
            whose item 1 is the sentence index.

    Returns:
        A list of ``(rotated_words, index)`` tuples, grouped by sentence in
        input order. Sentences with no words contribute nothing.
    """
    shifted_entries = []

    for entry in sentence_array:
        words = entry[0]
        sentence_index = entry[1]
        shifted_entries.extend(
            (array_circular_shift(words, offset), sentence_index)
            for offset, _word in enumerate(words)
        )

    return shifted_entries
|
||||
|
||||
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build the circularly shifted word index for a document.

    Args:
        document: Text to index.
        listPairs: When true, the result is a 2-tuple ``(shifts, pairs)``
            instead of the bare shift list. Pair detection is not
            implemented, so *pairs* is always an empty list.
        ignoreWords: Accepted but not used by this version.
        periodsToBreaks: When true, sentences come from
            ``split_by_periods``; otherwise from ``document.splitlines()``.

    Returns:
        The list returned by ``fill_with_circular_shifts_and_original``, or
        ``(that_list, [])`` when *listPairs* is true. An empty document
        gives ``[]`` or ``([], [])`` respectively, so the result shape
        depends only on *listPairs*.
    """
    if not document:
        # Mirror the shape used for real input so callers can unpack safely.
        return ([], []) if listPairs else []

    if periodsToBreaks:
        sentences = split_by_periods(document)
    else:
        sentences = document.splitlines()

    shifts = fill_with_circular_shifts_and_original(split_by_word_as_tuples(sentences))

    return (shifts, []) if listPairs else shifts
|
||||
@@ -0,0 +1,31 @@
|
||||
1. How did you decide what to test first? Would your final code change significantly if you changed the order of tests?
|
||||
|
||||
The process for deciding what to test mostly had to do with a logical breakdown of the core parts of the code so that
|
||||
things that relied on others to work were done after those pre-requisites. So, for example, before we could do anything,
|
||||
first the document had to be broken down into sentences, then words. In that part, you could also change whether or not
|
||||
the sentence breaks were handled with newlines or periods. Afterwards, you could then deal with rearranging the words as
|
||||
necessary, determining pairs, and then excluding any words that were given as arguments. I don't feel like the
|
||||
code would have been massively different if it'd been done in reverse, but it definitely would have been more difficult
|
||||
to write. On top of that, you'd still have to think all the way through how the code would need to function, regardless
|
||||
of the order in which you wrote it, in order to know what needed to be written in the first place. Also, if you
|
||||
literally reversed the order of the tests, it could lead to you skipping the whole point of using tests (if you did it
|
||||
incorrectly). Again, for an example, if you wrote a test that produced the final output of the program as the initial
|
||||
input and expected a correct output, you'd either have to hard code many many values in to the resulting code while you
|
||||
implemented them for real, or you'd end up writing all necessary code at once, which would defeat the purpose.
|
||||
|
||||
2. What did you think of test driven development, for this problem? What are the strengths and weaknesses of the
|
||||
approach? Does it encourage/discourage certain kinds of program designs?
|
||||
|
||||
I'm torn about it. I feel that for this particular problem, it may have been overkill and actually hampered design
|
||||
progress at times. The main issue was that for each major step, the output of the code often changed enough that many
|
||||
of the tests had to be modified or rewritten to match. The definite strengths of this approach are that you can easily
|
||||
tell when you change something that breaks the functionality of your code, and it will immediately produce an exception.
|
||||
On the flip side, it does take much longer to write functional code, depending on what exactly you're writing. I would
|
||||
argue that the larger the application being written, the more important it would be to do this kind of testing.
|
||||
Without it, you might be digging through hundreds of thousands to millions of lines of code to figure out what is
|
||||
breaking. I can definitely appreciate the fact that taking the time to write the tests helps guarantee that it's doing
|
||||
what you want, but I feel like it would be more useful in other programming contexts, like the one just mentioned.
|
||||
I'd say this style of coding encourages program designs that have a consistent output even if the underlying code
|
||||
changes often. For example, I feel like it'd be ideal for developing an API where once a standard has been created,
|
||||
that API should function identically for a long time, even if the backend is changing constantly. In general, this way
|
||||
of programming also encourages full coverage of edge cases, as you'll easily and quickly be able to test for them.
|
||||
@@ -0,0 +1,2 @@
|
||||
def kwic(document, listPairs=False, ignoreWords=None):
    """Placeholder KWIC entry point that always reports an empty index.

    Args:
        document: Text to index; this stub never inspects it.
        listPairs: Accepted for interface compatibility; unused.
        ignoreWords: Accepted for interface compatibility; unused. Defaults
            to None so no mutable list is shared between calls.

    Returns:
        A fresh empty list on every call.
    """
    return []
|
||||
@@ -0,0 +1,5 @@
|
||||
def kwic(document, listPairs=False, ignoreWords=None):
    """Wrap a non-empty document in a single-element list.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; unused.
        ignoreWords: Accepted for interface compatibility; unused. Defaults
            to None so no mutable list is shared between calls.

    Returns:
        ``[]`` when *document* is empty (falsy), otherwise ``[document]``.
    """
    if not document:
        return []

    return [document]
|
||||
@@ -0,0 +1,7 @@
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into lines and return them nested in an outer list.

    Args:
        document: Text to index.
        listPairs: Accepted but not used by this version.
        ignoreWords: Accepted but not used by this version.
        periodsToBreaks: Accepted but not used by this version.

    Returns:
        ``[]`` for an empty document, otherwise a one-element list whose
        only item is the list of lines from ``document.splitlines()``.
    """
    return [document.splitlines()] if document else []
|
||||
@@ -0,0 +1,10 @@
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into sentences and return them nested in a list.

    Args:
        document: Text to index.
        listPairs: Accepted but not used by this version.
        ignoreWords: Accepted but not used by this version.
        periodsToBreaks: When true, cut on "." (the periods themselves are
            dropped by ``str.split``); otherwise cut on line boundaries.

    Returns:
        ``[]`` for an empty document, otherwise a one-element list whose
        only item is the list of sentence strings.
    """
    if not document:
        return []

    sentences = document.split(".") if periodsToBreaks else document.splitlines()
    return [sentences]
|
||||
@@ -0,0 +1,29 @@
|
||||
def split_by_periods(document):
    """Break *document* into sentences, cutting right after each period.

    Newline characters are discarded outright (no space is substituted), and
    every "." stays attached to the end of the sentence it closes. Any text
    left over after the last period is returned as a final entry.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order; empty when the
        document holds nothing besides newlines.
    """
    # Dropping newlines first lets a single split() find every boundary.
    pieces = document.replace("\n", "").split(".")

    # Every piece except the last was followed by a period; put it back.
    sentences = [piece + "." for piece in pieces[:-1]]

    # The last piece is whatever trailed the final period (possibly nothing).
    if pieces[-1]:
        sentences.append(pieces[-1])

    return sentences
|
||||
|
||||
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into a flat list of sentence strings.

    Args:
        document: Text to index.
        listPairs: Accepted but not used by this version.
        ignoreWords: Accepted but not used by this version.
        periodsToBreaks: When true, sentences are produced by
            ``split_by_periods``; otherwise by ``document.splitlines()``.

    Returns:
        ``[]`` for an empty document, otherwise the list of sentences.
    """
    if not document:
        return []

    if periodsToBreaks:
        return split_by_periods(document)

    return document.splitlines()
|
||||
@@ -0,0 +1,44 @@
|
||||
def split_by_periods(document):
    """Break *document* into sentences, cutting right after each period.

    Newline characters are discarded outright (no space is substituted), and
    every "." stays attached to the end of the sentence it closes. Any text
    left over after the last period is returned as a final entry.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order; empty when the
        document holds nothing besides newlines.
    """
    # Dropping newlines first lets a single split() find every boundary.
    pieces = document.replace("\n", "").split(".")

    # Every piece except the last was followed by a period; put it back.
    sentences = [piece + "." for piece in pieces[:-1]]

    # The last piece is whatever trailed the final period (possibly nothing).
    if pieces[-1]:
        sentences.append(pieces[-1])

    return sentences
|
||||
|
||||
|
||||
def split_by_word_as_tuples(sentence_array):
    """Pair each sentence's words with the sentence's position.

    Sentences are split on single spaces and the empty strings produced by
    leading, trailing or repeated spaces are discarded. The words are stored
    in a real list (not a lazy ``filter`` object) so the result can be
    indexed, sliced and compared on Python 3 as well as Python 2.

    Args:
        sentence_array: Iterable of sentence strings.

    Returns:
        A list of ``(words, index)`` tuples, where *words* is the list of
        non-empty words and *index* is the sentence's zero-based position.
    """
    return [
        ([word for word in sentence.split(" ") if word], sentence_index)
        for sentence_index, sentence in enumerate(sentence_array)
    ]
|
||||
|
||||
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into sentences, then into indexed word lists.

    Args:
        document: Text to index.
        listPairs: Accepted but not used by this version.
        ignoreWords: Accepted but not used by this version.
        periodsToBreaks: When true, sentences come from
            ``split_by_periods``; otherwise from ``document.splitlines()``.

    Returns:
        ``[]`` for an empty document, otherwise whatever
        ``split_by_word_as_tuples`` returns for the sentences.
    """
    if not document:
        return []

    sentences = split_by_periods(document) if periodsToBreaks else document.splitlines()
    return split_by_word_as_tuples(sentences)
|
||||
@@ -0,0 +1,61 @@
|
||||
def split_by_periods(document):
    """Break *document* into sentences, cutting right after each period.

    Newline characters are discarded outright (no space is substituted), and
    every "." stays attached to the end of the sentence it closes. Any text
    left over after the last period is returned as a final entry.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order; empty when the
        document holds nothing besides newlines.
    """
    # Dropping newlines first lets a single split() find every boundary.
    pieces = document.replace("\n", "").split(".")

    # Every piece except the last was followed by a period; put it back.
    sentences = [piece + "." for piece in pieces[:-1]]

    # The last piece is whatever trailed the final period (possibly nothing).
    if pieces[-1]:
        sentences.append(pieces[-1])

    return sentences
|
||||
|
||||
|
||||
def split_by_word_as_tuples(sentence_array):
    """Pair each sentence's words with the sentence's position.

    Sentences are split on single spaces and the empty strings produced by
    leading, trailing or repeated spaces are discarded. The words are stored
    in a real list (not a lazy ``filter`` object) so the result can be
    indexed, sliced and compared on Python 3 as well as Python 2.

    Args:
        sentence_array: Iterable of sentence strings.

    Returns:
        A list of ``(words, index)`` tuples, where *words* is the list of
        non-empty words and *index* is the sentence's zero-based position.
    """
    return [
        ([word for word in sentence.split(" ") if word], sentence_index)
        for sentence_index, sentence in enumerate(sentence_array)
    ]
|
||||
|
||||
|
||||
def array_circular_shift(input_array, rotate_val):
    """Return a copy of *input_array* rotated left by *rotate_val* positions.

    The rotation amount is reduced modulo the sequence length, so values
    larger than the length (or more negative than ``-len``) wrap around
    instead of silently returning the sequence unrotated. Negative values
    rotate to the right.

    Args:
        input_array: Sliceable sequence to rotate; it is not modified.
        rotate_val: Number of positions to rotate left.

    Returns:
        A new sequence holding the same items in rotated order.
    """
    if not input_array:
        # Nothing to rotate; also avoids a modulo-by-zero below.
        return input_array[:]

    offset = rotate_val % len(input_array)
    return input_array[offset:] + input_array[:offset]
|
||||
|
||||
|
||||
def fill_with_circular_shifts_and_original(sentence_array):
    """Expand every sentence into all of its circular word rotations.

    For each ``(words, index)`` entry, one output entry is produced per word:
    the word list rotated by 0, 1, 2, ... positions via
    ``array_circular_shift``, each paired with the same sentence index.
    Rotation 0 is the original word order.

    Args:
        sentence_array: Iterable of entries whose item 0 is a word list and
            whose item 1 is the sentence index.

    Returns:
        A list of ``(rotated_words, index)`` tuples, grouped by sentence in
        input order. Sentences with no words contribute nothing.
    """
    shifted_entries = []

    for entry in sentence_array:
        words = entry[0]
        sentence_index = entry[1]
        shifted_entries.extend(
            (array_circular_shift(words, offset), sentence_index)
            for offset, _word in enumerate(words)
        )

    return shifted_entries
|
||||
|
||||
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build the circularly shifted word index for a document.

    Args:
        document: Text to index.
        listPairs: Accepted but not used by this version.
        ignoreWords: Accepted but not used by this version.
        periodsToBreaks: When true, sentences come from
            ``split_by_periods``; otherwise from ``document.splitlines()``.

    Returns:
        A 2-tuple ``(shifts, pairs)``. *shifts* is the list returned by
        ``fill_with_circular_shifts_and_original``; *pairs* is always an
        empty list here. An empty document gives ``([], [])``.
    """
    if not document:
        return [], []

    sentences = split_by_periods(document) if periodsToBreaks else document.splitlines()
    word_tuples = split_by_word_as_tuples(sentences)

    return fill_with_circular_shifts_and_original(word_tuples), []
|
||||
Binary file not shown.
@@ -0,0 +1,55 @@
|
||||
import kwic
|
||||
|
||||
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"

this_is_split_periods_circular_output = [
    (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
    (['is', 'something', 'split', 'by', 'periods.', 'This'], 0),
    (['something', 'split', 'by', 'periods.', 'This', 'is'], 0),
    (['split', 'by', 'periods.', 'This', 'is', 'something'], 0),
    (['by', 'periods.', 'This', 'is', 'something', 'split'], 0),
    (['periods.', 'This', 'is', 'something', 'split', 'by'], 0),
    (['Not', 'newlines.'], 1),
    (['newlines.', 'Not'], 1)
]

if __name__ == "__main__":
    # Ensure empty input gives empty output
    assert kwic.kwic(empty_document) == []

    # Ensure real input does not produce empty output
    assert kwic.kwic(design_words_doc) != []

    # Two sentences of three words each give 3 + 3 = 6 circular shifts
    assert len(kwic.kwic(design_words_doc)) == 6

    # Four sentences of 2, 3, 4 and 5 words give 14 circular shifts
    assert len(kwic.kwic(goodbye_buddy_doc)) == 14

    # Just realized I can check each individual function, so here is the check to split by periods.
    # Newlines are deleted, not replaced, so "Not \n newlines." keeps both surrounding spaces.
    assert (kwic.split_by_periods("This is something split \nby periods. Not \n newlines.") ==
            ["This is something split by periods.", " Not  newlines."])

    # This checks to make sure a sentence gets split into words properly
    assert (kwic.split_by_word_as_tuples(["This is something split by periods.", " Not newlines."]) ==
            [(['This', 'is', 'something', 'split', 'by', 'periods.'], 0), (['Not', 'newlines.'], 1)])

    # These check to make sure that the circular shift function is rotating properly
    assert (kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 0) ==
            ["One", "Two", "Three", "Four", "Five"])

    assert (kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 2) ==
            ["Three", "Four", "Five", "One", "Two"])

    # This checks to make sure that circularly shifted versions of all sentences are correct
    assert (kwic.fill_with_circular_shifts_and_original([(['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
                                                         (['Not', 'newlines.'], 1)]) ==
            this_is_split_periods_circular_output)

    # This gives a bit of a sanity check. It's making sure the circular shift works, and that the output is formatted
    # correctly...
    assert (kwic.kwic("This is something split \nby periods. Not \n newlines.", periodsToBreaks=True) ==
            this_is_split_periods_circular_output)
|
||||
@@ -0,0 +1,6 @@
|
||||
from kwic import kwic
|
||||
|
||||
# An empty document exercises the "no data" path.
document = ""

if __name__ == "__main__":
    # With nothing to index, kwic must hand back an empty list.
    assert kwic(document) == []
|
||||
@@ -0,0 +1,8 @@
|
||||
from kwic import kwic
|
||||
|
||||
empty_document = ""
|
||||
design_words_doc = "Design is hard.\nLet's just implement."
|
||||
|
||||
if __name__ == "__main__":
|
||||
assert(kwic(empty_document) == []) # Ensure empty input gives empty output
|
||||
assert(kwic(design_words_doc) != []) # Ensure real input does not produce empty output
|
||||
@@ -0,0 +1,12 @@
|
||||
from kwic import kwic
|
||||
|
||||
empty_document = ""
|
||||
design_words_doc = "Design is hard.\nLet's just implement."
|
||||
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
|
||||
|
||||
if __name__ == "__main__":
|
||||
assert(kwic(empty_document) == []) # Ensure empty input gives empty output
|
||||
assert(kwic(design_words_doc) != []) # Ensure real input does not produce empty output
|
||||
|
||||
assert(len(kwic(design_words_doc)[0]) == 2) # [(), ()] Make sure it's broken into two lines
|
||||
assert(len(kwic(goodbye_buddy_doc)[0]) == 4) # [(), ()] Make sure it's broken into four lines
|
||||
@@ -0,0 +1,25 @@
|
||||
from kwic import kwic
|
||||
|
||||
empty_document = ""
|
||||
design_words_doc = "Design is hard.\nLet's just implement."
|
||||
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
|
||||
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Ensure empty input gives empty output
|
||||
assert(kwic(empty_document) == [])
|
||||
|
||||
# Ensure real input does not produce empty output
|
||||
assert(kwic(design_words_doc) != [])
|
||||
|
||||
# Make sure it's broken into two lines
|
||||
assert(len(kwic(design_words_doc)[0]) == 2)
|
||||
|
||||
# Make sure it's broken into four line
|
||||
assert(len(kwic(goodbye_buddy_doc)[0]) == 4)
|
||||
|
||||
# Make sure line with just periods only shows up as one normally
|
||||
assert(len(kwic(hello_buddy_periods)[0]) == 1)
|
||||
|
||||
# Make sure it's broken into four lines once it's broken by periods instead
|
||||
assert(len(kwic(hello_buddy_periods, periodsToBreaks=True)[0]) == 4)
|
||||
@@ -0,0 +1,29 @@
|
||||
from kwic import kwic
|
||||
|
||||
empty_document = ""
|
||||
design_words_doc = "Design is hard.\nLet's just implement."
|
||||
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
|
||||
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
|
||||
|
||||
hello_buddy_periods_output = ["Hello there.", " Hello there, buddy.", " Hello and goodbye, buddy.",
|
||||
" Hello is like buddy Goodbye!"]
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Ensure empty input gives empty output
|
||||
assert(kwic(empty_document) == [])
|
||||
|
||||
# Ensure real input does not produce empty output
|
||||
assert(kwic(design_words_doc) != [])
|
||||
|
||||
# Make sure it's broken into two lines
|
||||
assert(len(kwic(design_words_doc)) == 2)
|
||||
|
||||
# Make sure it's broken into four line
|
||||
assert(len(kwic(goodbye_buddy_doc)) == 4)
|
||||
|
||||
# Make sure line with just periods shows up as itself
|
||||
assert(kwic(hello_buddy_periods)[0] == hello_buddy_periods)
|
||||
|
||||
# Make sure it's broken into four lines once it's broken by periods instead
|
||||
# Also, this time it keeps the ending period like it's supposed to
|
||||
assert(kwic(hello_buddy_periods, periodsToBreaks=True) == hello_buddy_periods_output)
|
||||
@@ -0,0 +1,33 @@
|
||||
from kwic import kwic
|
||||
|
||||
empty_document = ""
|
||||
design_words_doc = "Design is hard.\nLet's just implement."
|
||||
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
|
||||
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
|
||||
|
||||
hello_buddy_periods_output = [(["Hello", "there.", "Hello", "there,", "buddy.", "Hello", "and", "goodbye,", "buddy.",
|
||||
"Hello", "is", "like", "buddy", "Goodbye!"], 0)]
|
||||
|
||||
hello_buddy_word_tuples_output = [(["Hello", "there."], 0),
|
||||
(["Hello", "there,", "buddy."], 1),
|
||||
(["Hello", "and", "goodbye,", "buddy."], 2),
|
||||
(["Hello", "is", "like", "buddy", "Goodbye!"], 3)]
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Ensure empty input gives empty output
|
||||
assert(kwic(empty_document) == [])
|
||||
|
||||
# Ensure real input does not produce empty output
|
||||
assert(kwic(design_words_doc) != [])
|
||||
|
||||
# Make sure it's broken into two lines
|
||||
assert(len(kwic(design_words_doc)) == 2)
|
||||
|
||||
# Make sure it's broken into four line
|
||||
assert(len(kwic(goodbye_buddy_doc)) == 4)
|
||||
|
||||
# Make sure array contains same elements from kwic vs test output
|
||||
assert(any(x in kwic(hello_buddy_periods) for x in hello_buddy_periods_output))
|
||||
|
||||
# Make sure arrays contain the same elements from kwic vs test, even with periods for breaks...
|
||||
assert(any(x in kwic(hello_buddy_periods, periodsToBreaks=True) for x in hello_buddy_word_tuples_output))
|
||||
@@ -0,0 +1,55 @@
|
||||
import kwic
|
||||
|
||||
empty_document = ""
|
||||
design_words_doc = "Design is hard.\nLet's just implement."
|
||||
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
|
||||
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
|
||||
|
||||
this_is_split_periods_circular_output = [
|
||||
(['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
|
||||
(['is', 'something', 'split', 'by', 'periods.', 'This'], 0),
|
||||
(['something', 'split', 'by', 'periods.', 'This', 'is'], 0),
|
||||
(['split', 'by', 'periods.', 'This', 'is', 'something'], 0),
|
||||
(['by', 'periods.', 'This', 'is', 'something', 'split'], 0),
|
||||
(['periods.', 'This', 'is', 'something', 'split', 'by'], 0),
|
||||
(['Not', 'newlines.'], 1),
|
||||
(['newlines.', 'Not'], 1)
|
||||
]
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Ensure empty input gives empty output
|
||||
assert(kwic.kwic(empty_document) == ([], []))
|
||||
|
||||
# Ensure real input does not produce empty output
|
||||
assert(kwic.kwic(design_words_doc) != ([], []))
|
||||
|
||||
# Make sure it's broken into two lines
|
||||
assert(len(kwic.kwic(design_words_doc)[0]) == 6)
|
||||
|
||||
# Make sure it's broken into four line
|
||||
assert(len(kwic.kwic(goodbye_buddy_doc)[0]) == 14)
|
||||
|
||||
# Just realized I can check each individual function, so here is the check to split by periods
|
||||
assert(kwic.split_by_periods("This is something split \nby periods. Not \n newlines.") ==
|
||||
["This is something split by periods.", " Not newlines."])
|
||||
|
||||
# This checks to make sure a sentence gets split into words properly
|
||||
assert(kwic.split_by_word_as_tuples(["This is something split by periods.", " Not newlines."]) ==
|
||||
[(['This', 'is', 'something', 'split', 'by', 'periods.'], 0), (['Not', 'newlines.'], 1)])
|
||||
|
||||
# These check to make sure that the circular shift function is rotating properly
|
||||
assert(kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 0) ==
|
||||
["One", "Two", "Three", "Four", "Five"])
|
||||
|
||||
assert (kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 2) ==
|
||||
["Three", "Four", "Five", "One", "Two"])
|
||||
|
||||
# This checks to make sure than circularly shifted versions of all sentences are correct
|
||||
assert(kwic.fill_with_circular_shifts_and_original([(['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
|
||||
(['Not', 'newlines.'], 1)]) ==
|
||||
this_is_split_periods_circular_output)
|
||||
|
||||
# This gives a bit of a sanity check. It's making sure the circular shift works, and that the output is formatted
|
||||
# correctly...
|
||||
assert(kwic.kwic("This is something split \nby periods. Not \n newlines.", periodsToBreaks=True) ==
|
||||
(this_is_split_periods_circular_output, []))
|
||||
@@ -0,0 +1,55 @@
|
||||
import kwic
|
||||
|
||||
empty_document = ""
|
||||
design_words_doc = "Design is hard.\nLet's just implement."
|
||||
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
|
||||
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
|
||||
|
||||
this_is_split_periods_circular_output = [
|
||||
(['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
|
||||
(['is', 'something', 'split', 'by', 'periods.', 'This'], 0),
|
||||
(['something', 'split', 'by', 'periods.', 'This', 'is'], 0),
|
||||
(['split', 'by', 'periods.', 'This', 'is', 'something'], 0),
|
||||
(['by', 'periods.', 'This', 'is', 'something', 'split'], 0),
|
||||
(['periods.', 'This', 'is', 'something', 'split', 'by'], 0),
|
||||
(['Not', 'newlines.'], 1),
|
||||
(['newlines.', 'Not'], 1)
|
||||
]
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Ensure empty input gives empty output
|
||||
assert(kwic.kwic(empty_document) == ([]))
|
||||
|
||||
# Ensure real input does not produce empty output
|
||||
assert(kwic.kwic(design_words_doc) != ([]))
|
||||
|
||||
# Make sure the two input lines produce six circular shifts in total
|
||||
assert(len(kwic.kwic(design_words_doc)) == 6)
|
||||
|
||||
# Make sure the four input lines produce fourteen circular shifts in total
|
||||
assert(len(kwic.kwic(goodbye_buddy_doc)) == 14)
|
||||
|
||||
# Just realized I can check each individual function, so here is the check to split by periods
|
||||
assert(kwic.split_by_periods("This is something split \nby periods. Not \n newlines.") ==
|
||||
["This is something split by periods.", " Not newlines."])
|
||||
|
||||
# This checks to make sure a sentence gets split into words properly
|
||||
assert(kwic.split_by_word_as_tuples(["This is something split by periods.", " Not newlines."]) ==
|
||||
[(['This', 'is', 'something', 'split', 'by', 'periods.'], 0), (['Not', 'newlines.'], 1)])
|
||||
|
||||
# These check to make sure that the circular shift function is rotating properly
|
||||
assert(kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 0) ==
|
||||
["One", "Two", "Three", "Four", "Five"])
|
||||
|
||||
assert (kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 2) ==
|
||||
["Three", "Four", "Five", "One", "Two"])
|
||||
|
||||
# This checks to make sure that the circularly shifted versions of all sentences are correct
|
||||
assert(kwic.fill_with_circular_shifts_and_original([(['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
|
||||
(['Not', 'newlines.'], 1)]) ==
|
||||
this_is_split_periods_circular_output)
|
||||
|
||||
# This gives a bit of a sanity check. It's making sure the circular shift works, and that the output is formatted
|
||||
# correctly...
|
||||
assert(kwic.kwic("This is something split \nby periods. Not \n newlines.", periodsToBreaks=True) ==
|
||||
this_is_split_periods_circular_output)
|
||||
@@ -0,0 +1,6 @@
|
||||
1) ignoreWords only applies to indexing (the shifts) -- it has no impact on listPairs.
|
||||
2) In listPairs, a "pair" is two (different) words that appear together in a "line" -- they may not be next to each other, they are just in the same line.
|
||||
3) The list of pairs should have the structure of a list of tuples of tuples (not lists). I'll fix the bad example in the assignment.
|
||||
4) Submit files as a flat folder structure!!! kwic0.py testkwic0.py
|
||||
5) Use assert to test. It must throw an uncaught exception.
|
||||
6) The test testkwicN should pass kwicN, but not if you ran it against a newer
|
||||
@@ -0,0 +1,10 @@
|
||||
from mykwic import *
|
||||
from pprint import pprint
|
||||
|
||||
for l in open("tocheck.txt"):
|
||||
print "="*50
|
||||
input = l[:-1]
|
||||
print "INPUT:",input
|
||||
v = eval("kwic("+input+")")
|
||||
print "OUTPUT:"
|
||||
pprint(v)  # actually pretty-print the evaluated result; a bare `pprint` is a no-op expression
|
||||
@@ -0,0 +1,326 @@
|
||||
==================================================
|
||||
INPUT: "Design is hard.\nLet's just implement."
|
||||
OUTPUT:
|
||||
[(['Design', 'is', 'hard.'], 0),
|
||||
(['hard.', 'Design', 'is'], 0),
|
||||
(['implement.', "Let's", 'just'], 1),
|
||||
(['is', 'hard.', 'Design'], 0),
|
||||
(['just', 'implement.', "Let's"], 1),
|
||||
(["Let's", 'just', 'implement.'], 1)]
|
||||
|
||||
==================================================
|
||||
INPUT: "Design is hard.\nLet's just implement.", ignoreWords=["is"]
|
||||
OUTPUT:
|
||||
[(['Design', 'is', 'hard.'], 0),
|
||||
(['hard.', 'Design', 'is'], 0),
|
||||
(['implement.', "Let's", 'just'], 1),
|
||||
(['just', 'implement.', "Let's"], 1),
|
||||
(["Let's", 'just', 'implement.'], 1)]
|
||||
|
||||
==================================================
|
||||
INPUT: "Design is hard.\nLet's just implement.", ignoreWords=["is"], listPairs=True
|
||||
OUTPUT:
|
||||
([(['Design', 'is', 'hard.'], 0),
|
||||
(['hard.', 'Design', 'is'], 0),
|
||||
(['implement.', "Let's", 'just'], 1),
|
||||
(['just', 'implement.', "Let's"], 1),
|
||||
(["Let's", 'just', 'implement.'], 1)],
|
||||
[])
|
||||
|
||||
==================================================
|
||||
INPUT: "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!", listPairs=True
|
||||
OUTPUT:
|
||||
([(['and', 'goodbye,', 'buddy.', 'Hello'], 2),
|
||||
(['buddy', 'Goodbye!', 'Hello', 'is', 'like'], 3),
|
||||
(['buddy.', 'Hello', 'and', 'goodbye,'], 2),
|
||||
(['buddy.', 'Hello', 'there,'], 1),
|
||||
(['Goodbye!', 'Hello', 'is', 'like', 'buddy'], 3),
|
||||
(['goodbye,', 'buddy.', 'Hello', 'and'], 2),
|
||||
(['Hello', 'and', 'goodbye,', 'buddy.'], 2),
|
||||
(['Hello', 'is', 'like', 'buddy', 'Goodbye!'], 3),
|
||||
(['Hello', 'there,', 'buddy.'], 1),
|
||||
(['Hello', 'there.'], 0),
|
||||
(['is', 'like', 'buddy', 'Goodbye!', 'Hello'], 3),
|
||||
(['like', 'buddy', 'Goodbye!', 'Hello', 'is'], 3),
|
||||
(['there,', 'buddy.', 'Hello'], 1),
|
||||
(['there.', 'Hello'], 0)],
|
||||
[(('buddy', 'goodbye'), 2),
|
||||
(('buddy', 'hello'), 3),
|
||||
(('goodbye', 'hello'), 2),
|
||||
(('hello', 'there'), 2)])
|
||||
|
||||
==================================================
|
||||
INPUT: "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!", listPairs=True, periodsToBreaks=True
|
||||
OUTPUT:
|
||||
([(['and', 'goodbye,', 'buddy.', 'Hello'], 2),
|
||||
(['buddy', 'Goodbye!', 'Hello', 'is', 'like'], 3),
|
||||
(['buddy.', 'Hello', 'and', 'goodbye,'], 2),
|
||||
(['buddy.', 'Hello', 'there,'], 1),
|
||||
(['Goodbye!', 'Hello', 'is', 'like', 'buddy'], 3),
|
||||
(['goodbye,', 'buddy.', 'Hello', 'and'], 2),
|
||||
(['Hello', 'and', 'goodbye,', 'buddy.'], 2),
|
||||
(['Hello', 'is', 'like', 'buddy', 'Goodbye!'], 3),
|
||||
(['Hello', 'there,', 'buddy.'], 1),
|
||||
(['Hello', 'there.'], 0),
|
||||
(['is', 'like', 'buddy', 'Goodbye!', 'Hello'], 3),
|
||||
(['like', 'buddy', 'Goodbye!', 'Hello', 'is'], 3),
|
||||
(['there,', 'buddy.', 'Hello'], 1),
|
||||
(['there.', 'Hello'], 0)],
|
||||
[(('buddy', 'goodbye'), 2),
|
||||
(('buddy', 'hello'), 3),
|
||||
(('goodbye', 'hello'), 2),
|
||||
(('hello', 'there'), 2)])
|
||||
|
||||
==================================================
|
||||
INPUT: ". . a"
|
||||
OUTPUT:
|
||||
[(['.', '.', 'a'], 0), (['.', 'a', '.'], 0), (['a', '.', '.'], 0)]
|
||||
|
||||
==================================================
|
||||
INPUT: ". . a", periodsToBreaks=True
|
||||
OUTPUT:
|
||||
[(['.', '.', 'a'], 0), (['.', 'a', '.'], 0), (['a', '.', '.'], 0)]
|
||||
|
||||
==================================================
|
||||
INPUT: ". A B\n. A B C\n. A B C D", listPairs=True
|
||||
OUTPUT:
|
||||
([(['.', 'A', 'B'], 0),
|
||||
(['.', 'A', 'B', 'C'], 1),
|
||||
(['.', 'A', 'B', 'C', 'D'], 2),
|
||||
(['A', 'B', '.'], 0),
|
||||
(['A', 'B', 'C', '.'], 1),
|
||||
(['A', 'B', 'C', 'D', '.'], 2),
|
||||
(['B', '.', 'A'], 0),
|
||||
(['B', 'C', '.', 'A'], 1),
|
||||
(['B', 'C', 'D', '.', 'A'], 2),
|
||||
(['C', '.', 'A', 'B'], 1),
|
||||
(['C', 'D', '.', 'A', 'B'], 2),
|
||||
(['D', '.', 'A', 'B', 'C'], 2)],
|
||||
[(('a', 'b'), 3), (('a', 'c'), 2), (('b', 'c'), 2)])
|
||||
|
||||
==================================================
|
||||
INPUT: "Hello world. This is a test\nhopefully it turns out okay", periodsToBreaks = True
|
||||
OUTPUT:
|
||||
[(['a', 'test', 'hopefully', 'it', 'turns', 'out', 'okay', 'This', 'is'], 1),
|
||||
(['Hello', 'world.'], 0),
|
||||
(['hopefully', 'it', 'turns', 'out', 'okay', 'This', 'is', 'a', 'test'], 1),
|
||||
(['is', 'a', 'test', 'hopefully', 'it', 'turns', 'out', 'okay', 'This'], 1),
|
||||
(['it', 'turns', 'out', 'okay', 'This', 'is', 'a', 'test', 'hopefully'], 1),
|
||||
(['okay', 'This', 'is', 'a', 'test', 'hopefully', 'it', 'turns', 'out'], 1),
|
||||
(['out', 'okay', 'This', 'is', 'a', 'test', 'hopefully', 'it', 'turns'], 1),
|
||||
(['test', 'hopefully', 'it', 'turns', 'out', 'okay', 'This', 'is', 'a'], 1),
|
||||
(['This', 'is', 'a', 'test', 'hopefully', 'it', 'turns', 'out', 'okay'], 1),
|
||||
(['turns', 'out', 'okay', 'This', 'is', 'a', 'test', 'hopefully', 'it'], 1),
|
||||
(['world.', 'Hello'], 0)]
|
||||
|
||||
==================================================
|
||||
INPUT: "It's very nice to be footloose. \nWith just a toothbrush and a comb.\n"
|
||||
OUTPUT:
|
||||
[(['a', 'comb.', 'With', 'just', 'a', 'toothbrush', 'and'], 1),
|
||||
(['a', 'toothbrush', 'and', 'a', 'comb.', 'With', 'just'], 1),
|
||||
(['and', 'a', 'comb.', 'With', 'just', 'a', 'toothbrush'], 1),
|
||||
(['be', 'footloose.', "It's", 'very', 'nice', 'to'], 0),
|
||||
(['comb.', 'With', 'just', 'a', 'toothbrush', 'and', 'a'], 1),
|
||||
(['footloose.', "It's", 'very', 'nice', 'to', 'be'], 0),
|
||||
(["It's", 'very', 'nice', 'to', 'be', 'footloose.'], 0),
|
||||
(['just', 'a', 'toothbrush', 'and', 'a', 'comb.', 'With'], 1),
|
||||
(['nice', 'to', 'be', 'footloose.', "It's", 'very'], 0),
|
||||
(['to', 'be', 'footloose.', "It's", 'very', 'nice'], 0),
|
||||
(['toothbrush', 'and', 'a', 'comb.', 'With', 'just', 'a'], 1),
|
||||
(['very', 'nice', 'to', 'be', 'footloose.', "It's"], 0),
|
||||
(['With', 'just', 'a', 'toothbrush', 'and', 'a', 'comb.'], 1)]
|
||||
|
||||
==================================================
|
||||
INPUT: "It's very nice to be footloose. \nWith just a toothbrush and a comb.\n", periodsToBreaks=True
|
||||
OUTPUT:
|
||||
[(['a', 'comb.', 'With', 'just', 'a', 'toothbrush', 'and'], 1),
|
||||
(['a', 'toothbrush', 'and', 'a', 'comb.', 'With', 'just'], 1),
|
||||
(['and', 'a', 'comb.', 'With', 'just', 'a', 'toothbrush'], 1),
|
||||
(['be', 'footloose.', "It's", 'very', 'nice', 'to'], 0),
|
||||
(['comb.', 'With', 'just', 'a', 'toothbrush', 'and', 'a'], 1),
|
||||
(['footloose.', "It's", 'very', 'nice', 'to', 'be'], 0),
|
||||
(["It's", 'very', 'nice', 'to', 'be', 'footloose.'], 0),
|
||||
(['just', 'a', 'toothbrush', 'and', 'a', 'comb.', 'With'], 1),
|
||||
(['nice', 'to', 'be', 'footloose.', "It's", 'very'], 0),
|
||||
(['to', 'be', 'footloose.', "It's", 'very', 'nice'], 0),
|
||||
(['toothbrush', 'and', 'a', 'comb.', 'With', 'just', 'a'], 1),
|
||||
(['very', 'nice', 'to', 'be', 'footloose.', "It's"], 0),
|
||||
(['With', 'just', 'a', 'toothbrush', 'and', 'a', 'comb.'], 1)]
|
||||
|
||||
==================================================
|
||||
INPUT: "hello here, hello there, hello everywhere",listPairs = True
|
||||
OUTPUT:
|
||||
([(['everywhere', 'hello', 'here,', 'hello', 'there,', 'hello'], 0),
|
||||
(['hello', 'everywhere', 'hello', 'here,', 'hello', 'there,'], 0),
|
||||
(['hello', 'here,', 'hello', 'there,', 'hello', 'everywhere'], 0),
|
||||
(['hello', 'there,', 'hello', 'everywhere', 'hello', 'here,'], 0),
|
||||
(['here,', 'hello', 'there,', 'hello', 'everywhere', 'hello'], 0),
|
||||
(['there,', 'hello', 'everywhere', 'hello', 'here,', 'hello'], 0)],
|
||||
[])
|
||||
|
||||
==================================================
|
||||
INPUT: "hello here\nhello here again\nhello again", listPairs=True
|
||||
OUTPUT:
|
||||
([(['again', 'hello'], 2),
|
||||
(['again', 'hello', 'here'], 1),
|
||||
(['hello', 'again'], 2),
|
||||
(['hello', 'here'], 0),
|
||||
(['hello', 'here', 'again'], 1),
|
||||
(['here', 'again', 'hello'], 1),
|
||||
(['here', 'hello'], 0)],
|
||||
[(('again', 'hello'), 2), (('hello', 'here'), 2)])
|
||||
|
||||
==================================================
|
||||
INPUT: "hello hello hello\nhello hello", listPairs=True
|
||||
OUTPUT:
|
||||
([(['hello', 'hello'], 1),
|
||||
(['hello', 'hello'], 1),
|
||||
(['hello', 'hello', 'hello'], 0),
|
||||
(['hello', 'hello', 'hello'], 0),
|
||||
(['hello', 'hello', 'hello'], 0)],
|
||||
[])
|
||||
|
||||
==================================================
|
||||
INPUT: "to be or not to be", listPairs=True
|
||||
OUTPUT:
|
||||
([(['be', 'or', 'not', 'to', 'be', 'to'], 0),
|
||||
(['be', 'to', 'be', 'or', 'not', 'to'], 0),
|
||||
(['not', 'to', 'be', 'to', 'be', 'or'], 0),
|
||||
(['or', 'not', 'to', 'be', 'to', 'be'], 0),
|
||||
(['to', 'be', 'or', 'not', 'to', 'be'], 0),
|
||||
(['to', 'be', 'to', 'be', 'or', 'not'], 0)],
|
||||
[])
|
||||
|
||||
==================================================
|
||||
INPUT: ". A B\n. A B C\n. A B C D", listPairs=True
|
||||
OUTPUT:
|
||||
([(['.', 'A', 'B'], 0),
|
||||
(['.', 'A', 'B', 'C'], 1),
|
||||
(['.', 'A', 'B', 'C', 'D'], 2),
|
||||
(['A', 'B', '.'], 0),
|
||||
(['A', 'B', 'C', '.'], 1),
|
||||
(['A', 'B', 'C', 'D', '.'], 2),
|
||||
(['B', '.', 'A'], 0),
|
||||
(['B', 'C', '.', 'A'], 1),
|
||||
(['B', 'C', 'D', '.', 'A'], 2),
|
||||
(['C', '.', 'A', 'B'], 1),
|
||||
(['C', 'D', '.', 'A', 'B'], 2),
|
||||
(['D', '.', 'A', 'B', 'C'], 2)],
|
||||
[(('a', 'b'), 3), (('a', 'c'), 2), (('b', 'c'), 2)])
|
||||
|
||||
==================================================
|
||||
INPUT: "a bad\ncat barks."
|
||||
OUTPUT:
|
||||
[(['a', 'bad'], 0),
|
||||
(['bad', 'a'], 0),
|
||||
(['barks.', 'cat'], 1),
|
||||
(['cat', 'barks.'], 1)]
|
||||
|
||||
==================================================
|
||||
INPUT: "This is not a sentence.\nNeither is this.",ignoreWords=["is."]
|
||||
OUTPUT:
|
||||
[(['a', 'sentence.', 'This', 'is', 'not'], 0),
|
||||
(['is', 'not', 'a', 'sentence.', 'This'], 0),
|
||||
(['is', 'this.', 'Neither'], 1),
|
||||
(['Neither', 'is', 'this.'], 1),
|
||||
(['not', 'a', 'sentence.', 'This', 'is'], 0),
|
||||
(['sentence.', 'This', 'is', 'not', 'a'], 0),
|
||||
(['This', 'is', 'not', 'a', 'sentence.'], 0),
|
||||
(['this.', 'Neither', 'is'], 1)]
|
||||
|
||||
==================================================
|
||||
INPUT: "This is not a sentence.\nNeither is this.",ignoreWords=["is"]
|
||||
OUTPUT:
|
||||
[(['a', 'sentence.', 'This', 'is', 'not'], 0),
|
||||
(['Neither', 'is', 'this.'], 1),
|
||||
(['not', 'a', 'sentence.', 'This', 'is'], 0),
|
||||
(['sentence.', 'This', 'is', 'not', 'a'], 0),
|
||||
(['This', 'is', 'not', 'a', 'sentence.'], 0),
|
||||
(['this.', 'Neither', 'is'], 1)]
|
||||
|
||||
==================================================
|
||||
INPUT: "hello hello\nhello hello"
|
||||
OUTPUT:
|
||||
[(['hello', 'hello'], 0),
|
||||
(['hello', 'hello'], 0),
|
||||
(['hello', 'hello'], 1),
|
||||
(['hello', 'hello'], 1)]
|
||||
|
||||
==================================================
|
||||
INPUT: "#!good morning", ignoreWords = ['!good']
|
||||
OUTPUT:
|
||||
[(['#!good', 'morning'], 0), (['morning', '#!good'], 0)]
|
||||
|
||||
==================================================
|
||||
INPUT: "go!od morning-!", ignoreWords = ['good']
|
||||
OUTPUT:
|
||||
[(['morning-!', 'go!od'], 0)]
|
||||
|
||||
==================================================
|
||||
INPUT: "#!good morning-!", ignoreWords = ['!GoOd']
|
||||
OUTPUT:
|
||||
[(['#!good', 'morning-!'], 0), (['morning-!', '#!good'], 0)]
|
||||
|
||||
==================================================
|
||||
INPUT: "?!good morning-!", ignoreWords = ['!GoOd']
|
||||
OUTPUT:
|
||||
[(['?!good', 'morning-!'], 0), (['morning-!', '?!good'], 0)]
|
||||
|
||||
==================================================
|
||||
INPUT: "?!go!!!od morning-!", ignoreWords = ['!GoOd']
|
||||
OUTPUT:
|
||||
[(['?!go!!!od', 'morning-!'], 0), (['morning-!', '?!go!!!od'], 0)]
|
||||
|
||||
==================================================
|
||||
INPUT: 'This pair? is good.\n So is this pair and that pair',listPairs=True
|
||||
OUTPUT:
|
||||
([(['and', 'that', 'pair', 'So', 'is', 'this', 'pair'], 1),
|
||||
(['good.', 'This', 'pair?', 'is'], 0),
|
||||
(['is', 'good.', 'This', 'pair?'], 0),
|
||||
(['is', 'this', 'pair', 'and', 'that', 'pair', 'So'], 1),
|
||||
(['pair', 'and', 'that', 'pair', 'So', 'is', 'this'], 1),
|
||||
(['pair', 'So', 'is', 'this', 'pair', 'and', 'that'], 1),
|
||||
(['pair?', 'is', 'good.', 'This'], 0),
|
||||
(['So', 'is', 'this', 'pair', 'and', 'that', 'pair'], 1),
|
||||
(['that', 'pair', 'So', 'is', 'this', 'pair', 'and'], 1),
|
||||
(['this', 'pair', 'and', 'that', 'pair', 'So', 'is'], 1),
|
||||
(['This', 'pair?', 'is', 'good.'], 0)],
|
||||
[(('is', 'pair'), 2), (('is', 'this'), 2), (('pair', 'this'), 2)])
|
||||
|
||||
==================================================
|
||||
INPUT: "CS is cool"
|
||||
OUTPUT:
|
||||
[(['cool', 'CS', 'is'], 0),
|
||||
(['CS', 'is', 'cool'], 0),
|
||||
(['is', 'cool', 'CS'], 0)]
|
||||
|
||||
==================================================
|
||||
INPUT: "a b\na b c\na b c d", listPairs=True
|
||||
OUTPUT:
|
||||
([(['a', 'b'], 0),
|
||||
(['a', 'b', 'c'], 1),
|
||||
(['a', 'b', 'c', 'd'], 2),
|
||||
(['b', 'a'], 0),
|
||||
(['b', 'c', 'a'], 1),
|
||||
(['b', 'c', 'd', 'a'], 2),
|
||||
(['c', 'a', 'b'], 1),
|
||||
(['c', 'd', 'a', 'b'], 2),
|
||||
(['d', 'a', 'b', 'c'], 2)],
|
||||
[(('a', 'b'), 3), (('a', 'c'), 2), (('b', 'c'), 2)])
|
||||
|
||||
==================================================
|
||||
INPUT: 'This pair? is good.\n So is this pair and that pair', listPairs=True
|
||||
OUTPUT:
|
||||
([(['and', 'that', 'pair', 'So', 'is', 'this', 'pair'], 1),
|
||||
(['good.', 'This', 'pair?', 'is'], 0),
|
||||
(['is', 'good.', 'This', 'pair?'], 0),
|
||||
(['is', 'this', 'pair', 'and', 'that', 'pair', 'So'], 1),
|
||||
(['pair', 'and', 'that', 'pair', 'So', 'is', 'this'], 1),
|
||||
(['pair', 'So', 'is', 'this', 'pair', 'and', 'that'], 1),
|
||||
(['pair?', 'is', 'good.', 'This'], 0),
|
||||
(['So', 'is', 'this', 'pair', 'and', 'that', 'pair'], 1),
|
||||
(['that', 'pair', 'So', 'is', 'this', 'pair', 'and'], 1),
|
||||
(['this', 'pair', 'and', 'that', 'pair', 'So', 'is'], 1),
|
||||
(['This', 'pair?', 'is', 'good.'], 0)],
|
||||
[(('is', 'pair'), 2), (('is', 'this'), 2), (('pair', 'this'), 2)])
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
"Design is hard.\nLet's just implement."
|
||||
"Design is hard.\nLet's just implement.", ignoreWords=["is"]
|
||||
"Design is hard.\nLet's just implement.", ignoreWords=["is"], listPairs=True
|
||||
"Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!", listPairs=True
|
||||
"Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!", listPairs=True, periodsToBreaks=True
|
||||
". . a"
|
||||
". . a", periodsToBreaks=True
|
||||
". A B\n. A B C\n. A B C D", listPairs=True
|
||||
"Hello world. This is a test\nhopefully it turns out okay", periodsToBreaks = True
|
||||
"It's very nice to be footloose. \nWith just a toothbrush and a comb.\n"
|
||||
"It's very nice to be footloose. \nWith just a toothbrush and a comb.\n", periodsToBreaks=True
|
||||
"hello here, hello there, hello everywhere",listPairs = True
|
||||
"hello here\nhello here again\nhello again", listPairs=True
|
||||
"hello hello hello\nhello hello", listPairs=True
|
||||
"to be or not to be", listPairs=True
|
||||
". A B\n. A B C\n. A B C D", listPairs=True
|
||||
"a bad\ncat barks."
|
||||
"This is not a sentence.\nNeither is this.",ignoreWords=["is."]
|
||||
"This is not a sentence.\nNeither is this.",ignoreWords=["is"]
|
||||
"hello hello\nhello hello"
|
||||
"#!good morning", ignoreWords = ['!good']
|
||||
"go!od morning-!", ignoreWords = ['good']
|
||||
"#!good morning-!", ignoreWords = ['!GoOd']
|
||||
"?!good morning-!", ignoreWords = ['!GoOd']
|
||||
"?!go!!!od morning-!", ignoreWords = ['!GoOd']
|
||||
'This pair? is good.\n So is this pair and that pair',listPairs=True
|
||||
"CS is cool"
|
||||
"a b\na b c\na b c d", listPairs=True
|
||||
'This pair? is good.\n So is this pair and that pair', listPairs=True
|
||||
Binary file not shown.
@@ -0,0 +1,26 @@
|
||||
From an architectural point of view, very little is different between the fast version of kwic and the baseline
|
||||
implementation. However, the one architectural difference that is there is significant in terms of speed, the point of
|
||||
creating this version in the first place. In the baseline kwic, the code acts almost entirely like a single black box
|
||||
block. Data in to data out, all done in pure python. The fast version's biggest gains came from changing a core aspect
|
||||
of how some of the loops were being done, in that they are now handled by compiled C code rather than running in
|
||||
native python. This change was the use of the map function, an alternative to the traditional loop in python.
|
||||
While this is an architecturally significant change, as we're now expanding the number of boxes AND languages, it also
|
||||
came with the speed benefits of running code in c. This speed jump was most noticeable when used in the function that is
|
||||
called as an argument "key" in the alphabetization sort calls. Another place where changes were made that helped
|
||||
increase speed, though didn't necessarily change the code architecturally, was in loops where inherited methods were
|
||||
called. So, for example, I originally had many loops that called "array_name.append()". This is slow because every time
|
||||
the interpreter comes across that line in each loop, it has to process "array_name" to determine whether it contains
|
||||
append, and what append is. By aliasing "array_name.append" as a new variable like so "aliased = array_name.append" the
|
||||
interpreter only has to perform that lookup once. Then, by replacing the calls in the loop with my alias, I save that
|
||||
lookup time for each iteration. While this didn't result in a massive increase like the use of map did, it did help
|
||||
enough to mention. A few other minor changes were also made to help shave a couple tenths of a second off here and
|
||||
there. There were many places where I was needlessly making copies of large arrays of data, simply to make code
|
||||
readability better, but which took both time and ram to accomplish. By streamlining the use of existing variables, I
|
||||
again managed to save little bits of time here and there. Now, once it came to enabling listPairs, the other very slow
|
||||
part of the code, I decided to change the way that the words were sanitized as that's where the most speed loss was
|
||||
found. In the original version, I used a join command on an empty string that stripped out unwanted characters
|
||||
in an essentially brute-force manner. For the fast version, I learned of a way to perform the same task using Python's
|
||||
built in translate function. This change nearly halved the time it took for just the listPairs section of the code to
|
||||
run. In order to test all of this, I used python's built in cProfile tool which times every user written function call,
|
||||
as well as aggregating the times of all of Python's built-in ones separately. This made it very easy to see what parts
|
||||
of the code needed to be focused on for speed increases.
|
||||
Binary file not shown.
@@ -0,0 +1,14 @@
|
||||
Compared to the original kwic, the testing version of kwic is quite different. The focus of this version was to pull out
|
||||
much of the code from the main into functions so that sensitive portions could be easily tested separately, as well as
|
||||
be changed more easily. From an architectural point of view, the original kwic is very much a traditional black box
|
||||
approach. Data and flags in, one tiny function call used for alphabetization (as I had trouble with it and needed to
|
||||
pull it out), and the final data came out. The testing version on the other hand is approximately 25% longer in terms of
|
||||
pure code length, and is split into ten separate functions rather than the two for the original kwic. By splitting the
|
||||
core features of the kwic system into these functions, it made testing the development of the code that much easier.
|
||||
Rather than having to run through all the code up to the point I wanted to test, I could simply only call the functions
|
||||
for features I was actively testing. Of course, adding all these extra function calls did affect performance slightly,
|
||||
though not as much as I was expecting. This version is only marginally slower than the baseline implementation. I also
|
||||
considered adding a flag to kwic that would enable debugging print statements, but I decided against it as (at least
|
||||
in my personal experience) adding tons of print statements isn't specifically always helpful. Generally, you only need
|
||||
printing for a particular section of code, which could easily be manually added now that the important parts of the code
|
||||
are broken out.
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,58 @@
|
||||
def shift(line):
    """Return every circular rotation of *line*, starting with *line* itself.

    Rotation ``i`` moves the first ``i`` items of *line* to its end, so the
    result has ``len(line)`` entries (none for an empty input).
    """
    # range() instead of the Python-2-only xrange(); the result is identical.
    return [line[i:] + line[:i] for i in range(len(line))]
|
||||
|
||||
def cleanWord(word):
    """Return *word* lower-cased with the characters ``. , ? ! :`` removed."""
    # Join explicitly: filter() on a string only returns a string on Python 2.
    return "".join([c for c in word.lower() if c not in ".,?!:"])
|
||||
|
||||
def ignorable(word, ignoreWords):
    """Tell whether *word* matches one of *ignoreWords*.

    *word* is normalised with ``cleanWord``; the ignore entries are only
    lower-cased before the comparison.
    """
    loweredIgnores = [entry.lower() for entry in ignoreWords]
    return cleanWord(word) in loweredIgnores
|
||||
|
||||
def splitBreaks(string, periodsToBreaks):
    """Split *string* into lines.

    With *periodsToBreaks* false the text is split on newlines only.
    Otherwise a new line starts at every space that directly follows a
    period which itself follows a lower-case ASCII letter; the space is kept
    as the first character of the new line and newlines are left untouched.
    """
    if not periodsToBreaks:
        return string.split("\n")

    breakChars = frozenset("abcdefghijklmnopqrstuvwxyz")
    lines = []
    lineStart = 0
    for i, c in enumerate(string):
        # i >= 2 guarantees both look-behind characters exist.
        if c == " " and i >= 2 and string[i - 1] == "." and string[i - 2] in breakChars:
            lines.append(string[lineStart:i])
            lineStart = i
    # Whatever follows the last break (possibly "") is always emitted.
    lines.append(string[lineStart:])
    return lines
|
||||
|
||||
|
||||
def kwic(string, ignoreWords=None, listPairs=False, periodsToBreaks=False):
    """Build a keyword-in-context index of *string*.

    Parameters:
        string: the document text.
        ignoreWords: words that may not start an index entry (compared via
            ``ignorable``); ``None`` means no words are ignored.
        listPairs: when true, also report word pairs that share a line in
            more than one line.
        periodsToBreaks: passed to ``splitBreaks`` to choose how lines are
            delimited.

    Returns:
        A sorted list of ``(rotated_words, line_number)`` tuples. With
        *listPairs* a 2-tuple is returned instead: that index plus a sorted
        list of ``((word1, word2), count)`` for every pair with count > 1.
    """
    # None instead of a shared mutable [] default.
    if ignoreWords is None:
        ignoreWords = []

    lines = splitBreaks(string, periodsToBreaks)
    # Real lists (not lazy map/filter objects): the data is iterated more
    # than once and indexed below.
    splitLines = [l.split() for l in lines]

    if listPairs:
        pairs = {}
        for words in splitLines:
            seen = set()  # pairs already counted for this line
            for wu1 in words:
                wc1 = cleanWord(wu1)
                if len(wc1) == 0:
                    continue
                for wu2 in words:
                    wc2 = cleanWord(wu2)
                    # wc1 < wc2 keeps one canonical ordering of each pair.
                    if wc1 < wc2 and (wc1, wc2) not in seen:
                        seen.add((wc1, wc2))
                        pairs[(wc1, wc2)] = pairs.get((wc1, wc2), 0) + 1

    shiftedLines = [[(rotation, i) for rotation in shift(words)]
                    for i, words in enumerate(splitLines)]
    flattenedLines = [entry for subList in shiftedLines for entry in subList]
    filteredLines = [entry for entry in flattenedLines
                     if not ignorable(entry[0][0], ignoreWords)]

    if not listPairs:
        return sorted(filteredLines,
                      key=lambda entry: ([cleanWord(w) for w in entry[0]], entry[1]))

    # NOTE(review): this branch sorts on lower-cased words while the branch
    # above sorts on cleanWord()-normalised words; kept as in the original.
    index = sorted(filteredLines,
                   key=lambda entry: ([w.lower() for w in entry[0]], entry[1]))
    repeatedPairs = [(wp, pairs[wp]) for wp in sorted(pairs) if pairs[wp] > 1]
    return (index, repeatedPairs)
|
||||
@@ -0,0 +1,20 @@
|
||||
- One version that has improved performance, and one to improve testability
|
||||
- Performance
|
||||
-- Faster by a good margin than the baseline version
|
||||
- Testability
|
||||
-- Make it easier to control and/or observe the behavior of the system
|
||||
-- Ideas
|
||||
--- Interfaces for controlling internal variables
|
||||
--- Copious assertions
|
||||
--- Limiting complexity
|
||||
--- Changes that make the code timing more consistent
|
||||
|
||||
|
||||
- Files (put in folder named perrenc361assign2.zip)
|
||||
-- kwic.py : baseline version
|
||||
-- fastkwic.py : better performance version
|
||||
-- fastarch.txt : describes changes in architectural terms that made faster, and how you tested this (400 words)
|
||||
-- fastarch.pdf : diagram of architecture
|
||||
-- testkwic.py : highly testable version of kwic
|
||||
-- testarch.txt : describes changes in architectural terms that made it more testable (200 words)
|
||||
-- testarch.pdf : diagram of architecture
|
||||
@@ -0,0 +1,102 @@
|
||||
def alphabetized_key(input_data):
    """Sort key: the lower-cased words of ``input_data[0]`` as a list."""
    # A list is required: a lazy map object cannot be compared while sorting.
    return [word.lower() for word in input_data[0]]
|
||||
|
||||
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build a keyword-in-context index of *document* (speed-oriented version).

    Parameters:
        document: the text to index.
        listPairs: when true, also return the word pairs that occur together
            in more than one sentence.
        ignoreWords: words that may not start an index entry; matched
            case-insensitively after stripping ``.:!?,`` from the ends of
            the entry's first word.
        periodsToBreaks: when true, sentences end at a period that is the
            first or last character of the document, or that follows a
            lower-case letter and precedes whitespace; newlines then count
            as spaces. Otherwise sentences are the newline-separated lines.

    Returns:
        A list of ``(rotated_words, sentence_index)`` tuples sorted with
        ``alphabetized_key``. With *listPairs* a 2-tuple of that list and a
        sorted list of ``((first, second), count)`` with count > 1.
    """
    if not document:
        return ([], []) if listPairs else []

    if periodsToBreaks:
        split_into_sentences = []
        temp_append = split_into_sentences.append
        temp_sentence = ""
        last_index = len(document) - 1

        for current_index, current_value in enumerate(document):
            if current_value == '.':
                # Always keep the period; previously a period that was not a
                # sentence break was dropped from the text altogether.
                temp_sentence += current_value
                # The index checks come first so the neighbour look-ups
                # below can never run off either end of the document.
                if (current_index == 0) or (current_index == last_index) or \
                        (document[current_index - 1].islower() and
                         document[current_index + 1].isspace()):
                    temp_append(temp_sentence)
                    temp_sentence = ""
            elif current_value == '\n':
                temp_sentence += " "
            else:
                temp_sentence += current_value

        if temp_sentence:
            temp_append(temp_sentence)
    else:
        split_into_sentences = document.split('\n')

    # Split on single spaces and discard the empty strings produced by
    # repeated spaces.
    split_into_word_tuples = [
        ([word for word in sentence.split(" ") if word], sentence_index)
        for sentence_index, sentence in enumerate(split_into_sentences)
    ]

    circular_shifted_data = []
    temp_append = circular_shifted_data.append  # bound once for the hot loop
    for words, sentence_index in split_into_word_tuples:
        for index in range(len(words)):
            temp_append((words[index:] + words[:index], sentence_index))

    if ignoreWords:
        lowered_input = set(word.lower() for word in ignoreWords)
        # Build a new list: calling list.remove() while iterating the same
        # list skipped the entry following every removal.
        circular_shifted_data = [
            current_tuple for current_tuple in circular_shifted_data
            if current_tuple[0][0].lower().strip(".:!?,") not in lowered_input
        ]

    alphabetized_data = sorted(circular_shifted_data, key=alphabetized_key)

    if not listPairs:
        return alphabetized_data

    known_pairs = {}
    for sentence_array, _ in split_into_word_tuples:
        # Sanitize every word once per sentence; the set also guarantees a
        # pair is counted at most once per sentence.
        sanitized = set()
        for word in sentence_array:
            word = word.lower()
            for unwanted in ".,?!:":
                word = word.replace(unwanted, "")
            if word:
                sanitized.add(word)

        ordered = sorted(sanitized)
        for position, first in enumerate(ordered):
            for second in ordered[position + 1:]:
                known_pairs[(first, second)] = known_pairs.get((first, second), 0) + 1

    output_list = [(pair, count) for pair, count in known_pairs.items() if count > 1]
    output_list.sort(key=alphabetized_key)

    return alphabetized_data, output_list
|
||||
@@ -0,0 +1,115 @@
|
||||
def alphabetized_key(input_data):
    """Sort key: the words in ``input_data[0]``, lower-cased, as a new list."""
    return [word.lower() for word in input_data[0]]
|
||||
|
||||
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build a keyword-in-context index of *document*.

    Parameters:
        document: the text to index.
        listPairs: when true, also return the word pairs that occur together
            in more than one sentence.
        ignoreWords: words that may not start an index entry; matched
            case-insensitively after stripping ``.:!?,`` from the ends of
            the entry's first word.
        periodsToBreaks: when true, sentences end at a period that is the
            first or last character of the document, or that follows a
            lower-case letter and precedes whitespace; newlines then count
            as spaces. Otherwise sentences are the newline-separated lines.

    Returns:
        A list of ``(rotated_words, sentence_index)`` tuples sorted with
        ``alphabetized_key``. With *listPairs* a 2-tuple of that list and a
        sorted list of ``((first, second), count)`` with count > 1.
    """
    if not document and not listPairs:
        return []
    elif not document and listPairs:
        return [], []

    # --- split the document into sentences ---
    if periodsToBreaks:
        split_into_sentences = []
        temp_sentence = ""
        document_length_zero_indexed = len(document) - 1

        for current_index, current_value in enumerate(document):
            if current_value == '\n':
                temp_sentence += " "
                continue

            temp_sentence += current_value
            if current_value != '.':
                continue

            # The period itself is always kept (it used to be dropped when
            # it did not end a sentence). The index checks come first so
            # the neighbour look-ups stay inside the document.
            if (current_index == 0) or (current_index == document_length_zero_indexed) or \
                    (document[current_index - 1].islower() and
                     document[current_index + 1].isspace()):
                split_into_sentences.append(temp_sentence)
                temp_sentence = ""

        if temp_sentence:
            split_into_sentences.append(temp_sentence)
    else:
        split_into_sentences = document.split('\n')

    # --- split each sentence into words, remembering its index ---
    split_into_word_tuples = []
    for index_incrementer, sentence in enumerate(split_into_sentences):
        # A real list (filter() is lazy on Python 3); empty strings come
        # from repeated spaces and are discarded.
        words_array = [word for word in sentence.split(" ") if word]
        split_into_word_tuples.append((words_array, index_incrementer))

    # --- every circular shift of every sentence ---
    circular_shifted_data = []
    for words_array, sentence_index in split_into_word_tuples:
        for index in range(len(words_array)):
            temp_array = words_array[index:] + words_array[:index]
            circular_shifted_data.append((temp_array, sentence_index))

    # --- drop shifts that start with an ignored word ---
    if ignoreWords:
        lowered_input = [word.lower() for word in ignoreWords]
        circular_shifted_data = [
            current_tuple for current_tuple in circular_shifted_data
            if current_tuple[0][0].lower().strip(".:!?,") not in lowered_input
        ]

    alphabetized_data = sorted(circular_shifted_data, key=alphabetized_key)

    if not listPairs:
        return alphabetized_data

    # --- count word pairs, at most once per sentence ---
    known_pairs = {}
    char_set = ".,?!:"

    for sentence_array, _ in split_into_word_tuples:
        # Sanitized once per word rather than inside the nested loops.
        sanitized = ["".join(char for char in word.lower() if char not in char_set)
                     for word in sentence_array]
        seen_in_sentence = set()

        for first in sanitized:
            for second in sanitized:
                # Keep one canonical ordering; "" sorts first, so this also
                # rejects any pair containing an empty word.
                if first == "" or not first < second:
                    continue
                if (first, second) in seen_in_sentence:
                    continue
                seen_in_sentence.add((first, second))
                known_pairs[(first, second)] = known_pairs.get((first, second), 0) + 1

    output_list = []
    for key in known_pairs:
        if known_pairs[key] > 1:
            output_list.append((key, known_pairs[key]))

    output_list.sort(key=alphabetized_key)

    return alphabetized_data, output_list
|
||||
@@ -0,0 +1,91 @@
|
||||
import time
|
||||
import kwic as kwic_original
|
||||
import fastkwic as kwic_fast
|
||||
import testkwic as kwic_test
|
||||
|
||||
# Number of timed runs per implementation.
num_tests = 1

# Per-implementation switches: whether to run it, and whether to print its output.
test_original_kwic = True
print_original_kwic = False

test_fast_kwic = True
print_fast_kwic = False

test_test_kwic = True
print_test_kwic = False

# Small sample documents for manual experiments.
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
letters_and_stuff = "It's very nice to be footloose. \nWith just a toothbrush and a comb.\n"

# Read the benchmark document in one call; the context manager closes the
# file even if reading fails.
with open("test_documents/chesterton_short.txt", "r") as open_file:
    file_as_string = open_file.read()

input_document = file_as_string


def _time_kwic(file_name, kwic_function, show_output):
    """Run *kwic_function* on ``input_document`` with listPairs=True.

    Returns ``(output, seconds)``. The clock is stopped before the optional
    output dump so that printing does not distort the measurement.
    """
    print("\nTesting " + file_name)
    start_time = time.time()
    output = kwic_function(input_document, listPairs=True)
    total = time.time() - start_time
    if show_output:
        print(output)
    print(file_name + " took " + str(total) + " seconds.")
    return output, total


def _print_average(label, times):
    """Print *label* followed by the mean of *times*; nothing if no runs were timed."""
    if times:
        print(label + str(sum(times) / float(len(times))))


if __name__ == "__main__":
    original_output = None
    fast_output = None
    test_output = None

    original_times = []
    fast_times = []
    test_times = []

    # To benchmark without pair listing, drop listPairs=True in _time_kwic.
    for i in range(num_tests):
        if test_original_kwic:
            original_output, total = _time_kwic("kwic.py", kwic_original.kwic, print_original_kwic)
            original_times.append(total)

        if test_fast_kwic:
            fast_output, total = _time_kwic("fastkwic.py", kwic_fast.kwic, print_fast_kwic)
            fast_times.append(total)

        if test_test_kwic:
            test_output, total = _time_kwic("testkwic.py", kwic_test.kwic, print_test_kwic)
            test_times.append(total)

    # NOTE(review): the original nesting of the lines below was not
    # recoverable; they are assumed to run once after all timed runs, which
    # is identical for num_tests = 1.
    print("\nOriginal == Fast: " + str(original_output == fast_output))
    print("Original == Test: " + str(original_output == test_output))
    print("Test == Fast: " + str(test_output == fast_output))
    print("\n\n")

    if test_original_kwic:
        _print_average("Original Avg: ", original_times)

    if test_fast_kwic:
        _print_average("Fast Avg: ", fast_times)

    if test_test_kwic:
        _print_average("Test Avg: ", test_times)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,149 @@
|
||||
def split_by_periods(document):
    """Split *document* into sentences at sentence-ending periods.

    A period ends a sentence when it is the first or last character of the
    document, or when it follows a lower-case letter and precedes
    whitespace. The period stays at the end of its sentence, newlines are
    replaced by spaces, and any text after the last break becomes a final
    sentence. Returns a list of strings (empty for an empty document).
    """
    output_array = []
    temp_sentence = ""
    document_length_zero_indexed = len(document) - 1

    for current_index, current_value in enumerate(document):
        if current_value == '\n':
            temp_sentence += " "
            continue

        # Every other character, including a period that is not a sentence
        # break (e.g. in "3.14"), is kept; such periods used to be dropped.
        temp_sentence += current_value
        if current_value != '.':
            continue

        # The index checks come first so the neighbour look-ups below can
        # never run off either end of the document.
        if (current_index == 0) or (current_index == document_length_zero_indexed) or \
                (document[current_index - 1].islower() and
                 document[current_index + 1].isspace()):
            output_array.append(temp_sentence)
            temp_sentence = ""

    if temp_sentence:
        output_array.append(temp_sentence)
    return output_array
|
||||
|
||||
|
||||
def split_by_word_as_tuples(sentence_array):
    """Turn each sentence into a ``(words, sentence_index)`` tuple.

    Sentences are split on single spaces and the empty strings produced by
    repeated spaces are discarded. ``words`` is a real list so callers can
    slice it (``filter`` would be a lazy iterator on Python 3).
    """
    return [([word for word in sentence.split(" ") if word], sentence_index)
            for sentence_index, sentence in enumerate(sentence_array)]
|
||||
|
||||
|
||||
def array_circular_shift(input_array, rotate_val):
    """Return a copy of *input_array* rotated left by *rotate_val* positions."""
    head = input_array[:rotate_val]
    tail = input_array[rotate_val:]
    return tail + head
|
||||
|
||||
|
||||
def fill_with_circular_shifts_and_original(sentence_array):
    """Expand each ``(words, index)`` tuple into all of its circular shifts.

    For every tuple, one ``(shifted_words, index)`` entry is produced per
    word, the unshifted sentence first; sentences without words contribute
    nothing.
    """
    return [
        (array_circular_shift(words, offset), sentence_index)
        for words, sentence_index in ((entry[0], entry[1]) for entry in sentence_array)
        for offset in range(len(words))
    ]
|
||||
|
||||
|
||||
def alphabetize_tuple_list(input_array):
    """Return a new list with *input_array* sorted by ``alphabetized_key``."""
    return sorted(input_array, key=alphabetized_key)
|
||||
|
||||
|
||||
def alphabetized_key(input_data):
    """Sort key for index entries: the lower-cased words of ``input_data[0]``."""
    return [word.lower() for word in input_data[0]]
|
||||
|
||||
|
||||
def remove_words(input_array, words):
    """Return the entries of *input_array* that do not start with an ignored word.

    An entry is dropped when its first word, lower-cased and with ``.:!?,``
    stripped from both ends, equals the lower-cased form of any of *words*.
    The order of the remaining entries is unchanged.
    """
    banned = frozenset(word.lower() for word in words)
    return [entry for entry in input_array
            if entry[0][0].lower().strip(".:!?,") not in banned]
|
||||
|
||||
|
||||
def create_list_pairs(input_array):
    """List the word pairs that occur together in more than one sentence.

    *input_array* is a list of ``(words, index)`` tuples. Words are
    normalised with ``sanitize_word``; a pair is two different non-empty
    words ordered by ``return_ordered_words`` and is counted at most once
    per sentence. Returns ``((first, second), count)`` tuples with
    count > 1, sorted with ``alphabetized_key``.
    """
    known_pairs = {}

    for sentence_array, _ in input_array:
        # Sanitize each word once instead of on every step of the nested loops.
        sanitized_words = [sanitize_word(word) for word in sentence_array]
        seen_in_sentence = set()

        for word_one in sanitized_words:
            for word_two in sanitized_words:
                pair = return_ordered_words(word_one, word_two)
                first, second = pair

                # "" sorts first, so checking `first` rejects any pair that
                # contains an empty word.
                if (first == second) or (first == "") or (pair in seen_in_sentence):
                    continue

                seen_in_sentence.add(pair)
                known_pairs[pair] = known_pairs.get(pair, 0) + 1

    output_list = [(pair, count) for pair, count in known_pairs.items() if count > 1]
    output_list.sort(key=alphabetized_key)

    return output_list
|
||||
|
||||
|
||||
def sanitize_word(input_word):
    """Return *input_word* lower-cased with every ``. , ? ! :`` removed."""
    cleaned = input_word.lower()
    for unwanted in ".,?!:":
        cleaned = cleaned.replace(unwanted, "")
    return cleaned
|
||||
|
||||
|
||||
def return_ordered_words(word_one, word_two):
    """Return the two words as a tuple with the smaller one first."""
    return (word_one, word_two) if word_one < word_two else (word_two, word_one)
|
||||
|
||||
|
||||
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build a keyword-in-context index of *document* from the helper stages.

    Sentences come from ``split_by_periods`` when *periodsToBreaks* is set,
    otherwise from splitting on newlines. They are split into words,
    expanded into circular shifts, filtered by *ignoreWords* (if any) and
    alphabetized. Returns the index, or ``(index, pairs)`` with the result
    of ``create_list_pairs`` when *listPairs* is set. An empty document
    yields ``[]`` or ``([], [])`` respectively.
    """
    if not document:
        return ([], []) if listPairs else []

    sentences = split_by_periods(document) if periodsToBreaks else document.split('\n')
    word_tuples = split_by_word_as_tuples(sentences)

    shifted = fill_with_circular_shifts_and_original(word_tuples)
    if ignoreWords:
        shifted = remove_words(shifted, ignoreWords)

    index = alphabetize_tuple_list(shifted)
    if not listPairs:
        return index

    # Pairs are computed from the unshifted sentences, not from the index.
    return index, create_list_pairs(word_tuples)
|
||||
|
||||
@@ -0,0 +1,84 @@
|
||||
# eventkwic.txt by Corwin Perren
|
||||
|
||||
###########################################
|
||||
########## Specification Writeup ##########
|
||||
###########################################
|
||||
My specification for the Kwic class is as follows. Once Kwic is constructed, it waits for a method call. If no text has
|
||||
been added to the implementation and either index or listPairs is called, they will return empty arrays. Once text is
|
||||
added using addText, that text is appended to a class variable as one long string. It is done this way to make sure that
|
||||
lines that may be broken up into multiple addText calls still work properly. Now that there is text to process, a call
|
||||
to index will process the whole string that's been stored and return the proper kwic index. If listPairs is called
|
||||
instead, it will internally run index, but not return the indexed data, so that the new text is processed. It then
|
||||
creates the pairs and returns the array. Index works semi-incrementally. Rather than keeping track of every line of text
|
||||
ever added to the class, index processes the new data into the array of tuples (again, just for the new input), appends
|
||||
it to the class's global array, and then re-sorts it. In this way, we avoid having to recompute the sentences that have
|
||||
already been processed, minus having to re-alphabetize. I chose to not re-compute the sentences each time there was a
|
||||
new call to addText as that seemed like a waste of cpu cycles. It's basically a trade off of having a call to index
|
||||
be fast, or having the call to addText be fast. One potential option I considered was to use multi-threading to
|
||||
continually process the input text for both indexing and listing pairs. While this would have greatly increased the
|
||||
complexity (especially where eventspec is concerned), it also would have resulted in a faster implementation.
|
||||
|
||||
I used the eventspec class and the kwic.fsm state machine to verify that all of the above processes happen correctly,
|
||||
including handling the extra constructor fields for periodsToBreaks and ignoreWords when they are used. By printing out
|
||||
the steps taken through processing, it is very easy to see the program take the correct steps. In the case that a
|
||||
catastrophic state logic error has occurred, the EventSpec class will stop execution with a trace statement, making
|
||||
it very easy to diagnose what incorrect step was taken and correct it. I decided not to place event calls where there
|
||||
was the potential for large loops to keep the log of steps taken clear and concise. Had I placed further logic to handle
|
||||
these loops, there could potentially be hundreds, thousands, or more logged steps that would make debugging difficult.
|
||||
In a sense, using EventSpec performs a similar function to unit testing as if you ever change your code in a way that
|
||||
causes it to skip a step, or produce fatally incorrect output, the class will let you know roughly where the changed
|
||||
code is, and therefore a good idea of what went wrong.
|
||||
|
||||
#####################################
|
||||
########## Trace Output #1 ##########
|
||||
#####################################
|
||||
# Code run
|
||||
kc = Kwic(periodsToBreaks=True)
|
||||
kc.addText("This pair? is good.\n So is this pair and that pair")
|
||||
kc.index()
|
||||
kc.reset()
|
||||
kc.addText("This pair? is good.\n So is this pair and that pair")
|
||||
kc.listPairs()
|
||||
kc.print_eventspec_log()
|
||||
|
||||
# Trace Output
|
||||
STEP #0: callConstructor --> idle
|
||||
STEP #1: callAddText --> idle
|
||||
STEP #2: callIndex --> processIndex
|
||||
STEP #3: callProcessIndex --> checkIfText
|
||||
STEP #4: callSplitPeriods --> splitIntoTuples
|
||||
STEP #5: callSplitAsTuples --> fillCircular
|
||||
STEP #6: callFillCircular --> checkIgnoreOrAlpha
|
||||
STEP #7: callAlphabetize --> idle
|
||||
STEP #8: callReset --> idle
|
||||
STEP #9: callAddText --> idle
|
||||
STEP #10: callListPairs --> processListPairs
|
||||
STEP #11: callProcessListPairs --> listPairsIndexOrList
|
||||
STEP #12: callProcessIndex --> checkIfText
|
||||
STEP #13: callSplitPeriods --> splitIntoTuples
|
||||
STEP #14: callSplitAsTuples --> fillCircular
|
||||
STEP #15: callFillCircular --> checkIgnoreOrAlpha
|
||||
STEP #16: callAlphabetize --> idle
|
||||
STEP #17: callCreateListPairs --> idle
|
||||
|
||||
#####################################
|
||||
########## Trace Output #2 ##########
|
||||
#####################################
|
||||
# Code run
|
||||
kc = Kwic(ignoreWords=["and", "So"])
|
||||
kc.addText("This pair? is good.\n So is this pair and that pair")
|
||||
kc.listPairs()
|
||||
kc.print_eventspec_log()
|
||||
|
||||
# Trace Output
|
||||
STEP #0: callConstructor --> idle
|
||||
STEP #1: callAddText --> idle
|
||||
STEP #2: callListPairs --> processListPairs
|
||||
STEP #3: callProcessListPairs --> listPairsIndexOrList
|
||||
STEP #4: callProcessIndex --> checkIfText
|
||||
STEP #5: processNewlineSplit --> splitIntoTuples
|
||||
STEP #6: callSplitAsTuples --> fillCircular
|
||||
STEP #7: callFillCircular --> checkIgnoreOrAlpha
|
||||
STEP #8: callRemoveWords --> removingWords
|
||||
STEP #9: callAlphabetize --> idle
|
||||
STEP #10: callCreateListPairs --> idle
|
||||
@@ -0,0 +1,66 @@
|
||||
|
||||
class EventSpec():
    """Check a stream of events against a finite state machine read from a file.

    The machine starts in the state named by the file's ``begin:`` line.
    ``event()`` follows one transition, records it in ``self.trace`` and
    raises RuntimeError when the transition is not defined for the current
    state.
    """

    def readfsm(self, file):
        """Parse the definition in *file* and return ``(begin_state, fsm)``.

        ``fsm`` maps each state name to a dict of ``event -> next state``.
        Blank lines and lines whose first token starts with ``#`` are
        skipped. Recognised lines are ``state: NAME``, ``EVENT -> NAME`` and
        ``begin: NAME``. Raises SyntaxError for any other line, for a state
        defined twice and for a file without a ``begin:`` line.
        """
        fsm = {}
        beginState = None
        with open(file) as f:
            s = None  # state currently being filled in
            t = {}    # its transitions
            for l in f:
                ls = l.split()
                if (ls == []) or (ls[0][0] == "#"):
                    continue
                # The length checks turn truncated lines into a SyntaxError
                # instead of an IndexError.
                if ls[0] == "state:" and len(ls) > 1:
                    if s is not None:
                        if s in fsm:
                            raise SyntaxError("Cannot define state " + s + " twice")
                        fsm[s] = t
                    s = ls[1]
                    t = {}
                elif len(ls) > 2 and ls[1] == "->":
                    t[ls[0]] = ls[2]
                elif ls[0] == "begin:" and len(ls) > 1:
                    beginState = ls[1]
                else:
                    raise SyntaxError(l + " is not a line in a finite state machine definition")
        if s is not None:
            # The last state needs the same duplicate check as the others.
            if s in fsm:
                raise SyntaxError("Cannot define state " + s + " twice")
            fsm[s] = t
        if beginState is None:
            raise SyntaxError("No begin: line in finite state machine definition " + str(file))
        return (beginState, fsm)

    def __init__(self, file):
        """Load the machine from *file* and enter its begin state."""
        (self.start, self.machine) = self.readfsm(file)
        self.state = self.start
        self.trace = []      # (event, resulting state) for every accepted event
        self.triggers = {}   # event name -> callable run after that event

    def onEvent(self, event, action):
        """Register *action* (called with no arguments) to run after *event*."""
        self.triggers[event] = action

    def reset(self):
        """Return to the begin state and clear the trace; triggers are kept."""
        self.state = self.start
        self.trace = []

    # NOTE(review): __init__ assigns instance attributes named ``trace`` and
    # ``state``, which shadow the two methods below on every instance; read
    # the attributes directly. The methods are kept so the class interface
    # is unchanged.
    def trace(self):
        return self.trace

    def state(self):
        return self.state

    def printLog(self):
        """Print one ``STEP #n: event --> state`` line per recorded event."""
        for i, (e, s) in enumerate(self.trace):
            print(" STEP #" + str(i) + ": " + str(e) + " --> " + str(s))

    def event(self, event):
        """Follow the transition for *event* from the current state.

        Raises RuntimeError if the current state has no such transition; the
        state and trace are left unchanged in that case. A trigger
        registered for the event runs after the state change.
        """
        # Only the transition look-up is guarded, so a KeyError raised inside
        # a trigger is not misreported as an illegal transition.
        try:
            nextState = self.machine[self.state][event]
        except KeyError:
            raise RuntimeError("From state " + self.state + ", transition " + event + " is not allowed (trace: " + str(self.trace) + ")")
        self.state = nextState
        self.trace.append((event, self.state))
        if event in self.triggers:
            self.triggers[event]()
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
begin: waitForStart
|
||||
|
||||
state: waitForStart
|
||||
callConstructor -> idle
|
||||
|
||||
state: idle
|
||||
callAddText -> idle
|
||||
callIndex -> processIndex
|
||||
callListPairs -> processListPairs
|
||||
callReset -> idle
|
||||
callCreateListPairs -> idle
|
||||
|
||||
state: processIndex
|
||||
callProcessIndex -> checkIfText
|
||||
|
||||
state: checkIfText
|
||||
callSplitPeriods -> splitIntoTuples
|
||||
processNewlineSplit -> splitIntoTuples
|
||||
processNoNewText -> idle
|
||||
|
||||
state: splitIntoTuples
|
||||
callSplitAsTuples -> fillCircular
|
||||
|
||||
state: fillCircular
|
||||
callFillCircular -> checkIgnoreOrAlpha
|
||||
|
||||
state: checkIgnoreOrAlpha
|
||||
callAlphabetize -> idle
|
||||
callRemoveWords -> removingWords
|
||||
|
||||
state: removingWords
|
||||
callAlphabetize -> idle
|
||||
|
||||
state: processListPairs
|
||||
callProcessListPairs -> listPairsIndexOrList
|
||||
|
||||
state: listPairsIndexOrList
|
||||
callProcessIndex -> checkIfText
|
||||
callCreateListPairs -> idle
|
||||
@@ -0,0 +1,270 @@
|
||||
# Created by Corwin Perren
|
||||
|
||||
######################################################
|
||||
########## Begin Kwic Class Implementation ###########
|
||||
######################################################
|
||||
class Kwic(object):
|
||||
def __init__(self, ignoreWords=None, periodsToBreaks=False):
|
||||
self.ignore_words = ignoreWords
|
||||
self.periods_to_breaks = periodsToBreaks
|
||||
|
||||
self.all_sentence_tuples = []
|
||||
|
||||
self.kwic_output_array = []
|
||||
self.list_pairs_output_array = []
|
||||
|
||||
self.text_to_add = ""
|
||||
|
||||
self.event_spec_instance = EventSpec("kwic.fsm")
|
||||
self.event_spec_instance.event("callConstructor")
|
||||
|
||||
def addText(self, new_text):
|
||||
self.event_spec_instance.event("callAddText")
|
||||
self.text_to_add += " " + str(new_text)
|
||||
|
||||
def index(self):
|
||||
self.event_spec_instance.event("callIndex")
|
||||
self.__process_kwic_index()
|
||||
return self.kwic_output_array
|
||||
|
||||
def listPairs(self):
|
||||
self.event_spec_instance.event("callListPairs")
|
||||
self.__process_list_pairs()
|
||||
return self.list_pairs_output_array
|
||||
|
||||
def reset(self):
    """Discard all pending text and all previously computed results."""
    self.event_spec_instance.event("callReset")
    self.all_sentence_tuples = []

    self.kwic_output_array = []
    self.list_pairs_output_array = []

    # Must be a string, as in __init__: addText() extends it with ``+=`` and
    # the index step calls ``.split('\n')`` on it. Resetting to a list made
    # the next addText() store single characters and broke newline splitting.
    self.text_to_add = ""
|
||||
|
||||
def print_eventspec_log(self):
|
||||
self.event_spec_instance.printLog()
|
||||
|
||||
def __process_kwic_index(self):
|
||||
self.event_spec_instance.event("callProcessIndex")
|
||||
if self.text_to_add:
|
||||
if self.periods_to_breaks:
|
||||
split_into_sentences = self.__split_by_periods(self.text_to_add)
|
||||
else:
|
||||
self.event_spec_instance.event("processNewlineSplit")
|
||||
split_into_sentences = self.text_to_add.split('\n')
|
||||
|
||||
split_into_word_tuples = self.__split_by_word_as_tuples(split_into_sentences)
|
||||
|
||||
for sentence_tuple in split_into_word_tuples:
|
||||
self.all_sentence_tuples.append(sentence_tuple)
|
||||
|
||||
circular_shifted_data = self.__fill_with_circular_shifts_and_original(split_into_word_tuples)
|
||||
|
||||
if self.ignore_words:
|
||||
circular_shifted_data = self.__remove_words(circular_shifted_data, self.ignore_words)
|
||||
|
||||
for current_tuple in circular_shifted_data:
|
||||
self.kwic_output_array.append(current_tuple)
|
||||
|
||||
self.text_to_add = ""
|
||||
|
||||
self.kwic_output_array = self.__alphabetize_tuple_list(self.kwic_output_array)
|
||||
else:
|
||||
self.event_spec_instance.event("processNoNewText")
|
||||
|
||||
def __process_list_pairs(self):
|
||||
self.event_spec_instance.event("callProcessListPairs")
|
||||
if self.text_to_add:
|
||||
self.__process_kwic_index()
|
||||
|
||||
self.list_pairs_output_array = self.__create_list_pairs(self.all_sentence_tuples)
|
||||
|
||||
def __fill_with_circular_shifts_and_original(self, sentence_array):
|
||||
self.event_spec_instance.event("callFillCircular")
|
||||
output_array = []
|
||||
|
||||
for current_tuple in sentence_array:
|
||||
for index, _ in enumerate(current_tuple[0]):
|
||||
output_array.append((self.__array_circular_shift(current_tuple[0], index), current_tuple[1]))
|
||||
|
||||
return output_array
|
||||
|
||||
def __alphabetize_tuple_list(self, input_array):
|
||||
self.event_spec_instance.event("callAlphabetize")
|
||||
sorted_array = sorted(input_array, key=self.__alphabetized_key)
|
||||
return sorted_array
|
||||
|
||||
def __create_list_pairs(self, input_array):
|
||||
self.event_spec_instance.event("callCreateListPairs")
|
||||
known_pairs = {}
|
||||
|
||||
for sentence_array, _ in input_array:
|
||||
seen_in_sentence = set([])
|
||||
|
||||
for first_word in sentence_array:
|
||||
for second_word in sentence_array:
|
||||
first, second = self.__return_ordered_words(self.__sanitize_word(first_word),
|
||||
self.__sanitize_word(second_word))
|
||||
|
||||
if (first == second) or (first == ""):
|
||||
continue
|
||||
|
||||
if (first, second) not in seen_in_sentence:
|
||||
seen_in_sentence.add((first, second))
|
||||
|
||||
if (first, second) in known_pairs:
|
||||
known_pairs[(first, second)] += 1
|
||||
else:
|
||||
known_pairs[(first, second)] = 1
|
||||
|
||||
output_list = []
|
||||
|
||||
for key in known_pairs:
|
||||
if known_pairs[key] > 1:
|
||||
output_list.append((key, known_pairs[key]))
|
||||
|
||||
output_list.sort(key=self.__alphabetized_key)
|
||||
|
||||
return output_list
|
||||
|
||||
def __split_by_periods(self, document):
|
||||
self.event_spec_instance.event("callSplitPeriods")
|
||||
output_array = []
|
||||
temp_sentence = ""
|
||||
document_length_zero_indexed = len(document) - 1
|
||||
for current_index, current_value in enumerate(document):
|
||||
if current_value == '.':
|
||||
if (current_index == 0) or (current_index == document_length_zero_indexed) or \
|
||||
(document[current_index - 1].islower() and (document[current_index + 1].isspace() or
|
||||
(document[current_index + 1] == '\n'))):
|
||||
temp_sentence += current_value
|
||||
output_array.append(temp_sentence)
|
||||
temp_sentence = ""
|
||||
else:
|
||||
if current_value != '\n':
|
||||
temp_sentence += current_value
|
||||
else:
|
||||
temp_sentence += " "
|
||||
|
||||
if temp_sentence:
|
||||
output_array.append(temp_sentence)
|
||||
return output_array
|
||||
|
||||
def __split_by_word_as_tuples(self, sentence_array):
|
||||
self.event_spec_instance.event("callSplitAsTuples")
|
||||
|
||||
output_array = []
|
||||
index_incrementer = 0
|
||||
|
||||
for sentence in sentence_array:
|
||||
words_array = sentence.split(" ")
|
||||
words_array = filter(None, words_array)
|
||||
output_array.append((words_array, index_incrementer))
|
||||
index_incrementer += 1
|
||||
|
||||
return output_array
|
||||
|
||||
def __array_circular_shift(self, input_array, rotate_val):
|
||||
output_array = input_array[rotate_val:] + input_array[:rotate_val]
|
||||
return output_array
|
||||
|
||||
def __alphabetized_key(self, input_data):
|
||||
output_array = []
|
||||
for word in input_data[0]:
|
||||
output_array.append(word.lower())
|
||||
return output_array
|
||||
|
||||
def __remove_words(self, input_array, words):
|
||||
self.event_spec_instance.event("callRemoveWords")
|
||||
|
||||
lowered_input = []
|
||||
output_array = []
|
||||
|
||||
for word in words:
|
||||
lowered_input.append(word.lower())
|
||||
|
||||
for current_tuple in input_array:
|
||||
if current_tuple[0][0].lower().strip(".:!?,") in lowered_input:
|
||||
pass
|
||||
else:
|
||||
output_array.append(current_tuple)
|
||||
|
||||
return output_array
|
||||
|
||||
def __sanitize_word(self, input_word):
|
||||
return input_word.lower().translate(None, ".,?!:")
|
||||
|
||||
def __return_ordered_words(self, word_one, word_two):
|
||||
|
||||
if word_one < word_two:
|
||||
return word_one, word_two
|
||||
else:
|
||||
return word_two, word_one
|
||||
|
||||
|
||||
######################################################
|
||||
########## Begin Provided EventSpec Include ##########
|
||||
######################################################
|
||||
class EventSpec():
    """Checks a stream of events against a finite state machine read from a file.

    Instance attributes:
      start    -- name of the begin state
      machine  -- dict mapping state name -> {event name: next state name}
      state    -- name of the current state
      trace    -- list of (event, resulting state) tuples seen since the last reset
      triggers -- dict mapping event name -> zero-argument callable
    """

    def readfsm(self, file):
        """Parse the FSM definition in ``file`` and return (begin state, fsm).

        Recognised lines (whitespace separated):
          ``begin: NAME``   -- names the begin state
          ``state: NAME``   -- starts the definition of a state
          ``EVENT -> NAME`` -- transition belonging to the current state
        Blank lines and lines whose first token starts with '#' are skipped.

        Raises SyntaxError for an unrecognised or incomplete line, for a
        state defined twice (including when the repeat is the last state in
        the file) and when no ``begin:`` line is present.
        """
        fsm = {}
        begin_state = None
        with open(file) as f:
            s = None
            t = {}
            for l in f:
                ls = l.split()
                if (ls == []) or (ls[0][0] == "#"):
                    continue
                if ls[0] == "state:" and len(ls) > 1:
                    if s is not None:
                        if s in fsm:
                            raise SyntaxError("Cannot define state " + s + " twice")
                        fsm[s] = t
                    s = ls[1]
                    t = {}
                elif len(ls) > 2 and ls[1] == "->":
                    t[ls[0]] = ls[2]
                elif ls[0] == "begin:" and len(ls) > 1:
                    begin_state = ls[1]
                else:
                    # Also reached for truncated lines such as a bare
                    # "state:" or "a ->", which would otherwise fail with
                    # an IndexError.
                    raise SyntaxError(l + " is not a line in a finite state machine definition")
        if s is not None:
            # The last state is only stored here, so it needs the same
            # duplicate check as the states stored inside the loop.
            if s in fsm:
                raise SyntaxError("Cannot define state " + s + " twice")
            fsm[s] = t
        if begin_state is None:
            raise SyntaxError("No begin state defined in finite state machine definition")
        return (begin_state, fsm)

    def __init__(self, file):
        """Load the state machine from ``file`` and start in its begin state."""
        (self.start, self.machine) = self.readfsm(file)
        self.state = self.start
        self.trace = []
        self.triggers = {}

    def onEvent(self, event, action):
        """Register ``action`` (called with no arguments) to run after ``event``."""
        self.triggers[event] = action

    def reset(self):
        """Return to the begin state and discard the recorded trace."""
        self.state = self.start
        self.trace = []

    # NOTE: __init__ and reset() assign instance attributes named ``trace``
    # and ``state``, which shadow the two methods below on every instance.
    # ``es.trace`` / ``es.state`` therefore yield the data directly and
    # ``es.trace()`` / ``es.state()`` are not callable; read the attributes.
    def trace(self):
        return self.trace

    def state(self):
        return self.state

    def printLog(self):
        """Print one line per recorded (event, resulting state) step."""
        for i, (e, s) in enumerate(self.trace):
            # A single pre-joined string prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print(" STEP #" + str(i) + ": " + str(e) + " --> " + str(s))

    def event(self, event):
        """Advance the machine on ``event`` and run its trigger, if any.

        Raises RuntimeError when the current state has no transition for
        ``event``; state and trace are left unchanged in that case.
        Exceptions raised by the trigger itself propagate unchanged.
        """
        try:
            next_state = self.machine[self.state][event]
        except KeyError:
            raise RuntimeError("From state " + self.state + ", transition " + event + " is not allowed (trace: " + str(
                self.trace) + ")")

        self.state = next_state
        self.trace.append((event, next_state))
        # Kept outside the try block so a KeyError raised by a trigger is
        # not misreported as an illegal transition.
        if event in self.triggers:
            self.triggers[event]()
|
||||
@@ -0,0 +1,13 @@
|
||||
begin: start
|
||||
|
||||
state: start
|
||||
a -> infinibs
|
||||
b -> infinias
|
||||
|
||||
state: infinibs
|
||||
b -> infinibs
|
||||
a -> start
|
||||
|
||||
state: infinias
|
||||
a -> infinias
|
||||
b -> start
|
||||
@@ -0,0 +1,12 @@
|
||||
# For testing implementation
|
||||
from kwic import Kwic
|
||||
|
||||
# Expected KWIC index for the sample text: every circular shift of both
# sentences, sorted case-insensitively, each paired with its sentence number.
output_check = [
    (['and', 'that', 'pair', 'So', 'is', 'this', 'pair'], 1),
    (['good.', 'This', 'pair?', 'is'], 0),
    (['is', 'good.', 'This', 'pair?'], 0),
    (['is', 'this', 'pair', 'and', 'that', 'pair', 'So'], 1),
    (['pair', 'and', 'that', 'pair', 'So', 'is', 'this'], 1),
    (['pair', 'So', 'is', 'this', 'pair', 'and', 'that'], 1),
    (['pair?', 'is', 'good.', 'This'], 0),
    (['So', 'is', 'this', 'pair', 'and', 'that', 'pair'], 1),
    (['that', 'pair', 'So', 'is', 'this', 'pair', 'and'], 1),
    (['this', 'pair', 'and', 'that', 'pair', 'So', 'is'], 1),
    (['This', 'pair?', 'is', 'good.'], 0),
]

# Expected word pairs that appear together in both sentences.
output_check_pairs = [(('is', 'pair'), 2), (('is', 'this'), 2), (('pair', 'this'), 2)]

if __name__ == "__main__":
    kc = Kwic()
    kc.addText("This pair? is good.\n So is this pair and that pair")
    # Single-argument parenthesised print behaves the same as a Python 2
    # print statement and as the Python 3 print function.
    print(kc.listPairs() == output_check_pairs)
    print(kc.index() == output_check)
    kc.print_eventspec_log()
|
||||
@@ -0,0 +1,29 @@
|
||||
from assign3 import eventspec
|
||||
|
||||
# Drive the sample state machine through a legal event sequence and show
# the recorded steps.
es = eventspec.EventSpec("kwic.fsm")

for event_name in ("a", "b", "b", "b", "b", "a"):
    es.event(event_name)
es.printLog()


# Beatles code

def printHello():
    """Trigger action for event "a"."""
    # Single-argument parenthesised print works on Python 2 and Python 3.
    print("Hello")


def printGoodbye():
    """Trigger action for event "b"."""
    print("Goodbye")


es.onEvent("a", printHello)
es.onEvent("b", printGoodbye)

# Same machine, now with triggers attached; the trace keeps growing because
# reset() is never called between the two runs.
for event_name in ("a", "a", "a", "b", "b", "a"):
    es.event(event_name)
es.printLog()
|
||||
Reference in New Issue
Block a user