Added work from my other class repositories before deletion

This commit is contained in:
2017-11-29 10:28:24 -08:00
parent cb0b5f4d25
commit 5ea24c81b5
198 changed files with 739603 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
def kwic(document, listPairs=False, ignoreWords=None):
    """Placeholder KWIC entry point that always reports no results.

    Args:
        document: Text to index; not used yet.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
            Defaults to None instead of a shared mutable list.

    Returns:
        An empty list, whatever the input.
    """
    return []

View File

@@ -0,0 +1,6 @@
from kwic import kwic
# With no input text there is nothing to index.
document = ""

if __name__ == "__main__":
    # An empty document must yield an empty result.
    assert kwic(document) == []

View File

@@ -0,0 +1,5 @@
def kwic(document, listPairs=False, ignoreWords=None):
    """Return the document wrapped in a list, or nothing for empty input.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
            Defaults to None instead of a shared mutable list.

    Returns:
        An empty list when document is falsy, otherwise a one-item list
        holding the document itself.
    """
    return [document] if document else []

View File

@@ -0,0 +1,8 @@
from kwic import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."

if __name__ == "__main__":
    # No input text means no output.
    assert kwic(empty_document) == []
    # Real input must not come back empty.
    assert kwic(design_words_doc) != []

View File

@@ -0,0 +1,7 @@
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into lines and return them inside an outer list.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
        periodsToBreaks: Accepted for interface compatibility; not used yet.

    Returns:
        An empty list when document is falsy, otherwise a one-item list
        whose only element is the list of the document's lines.
    """
    return [document.splitlines()] if document else []

View File

@@ -0,0 +1,12 @@
from kwic import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"

if __name__ == "__main__":
    # No input text means no output.
    assert kwic(empty_document) == []
    # Real input must not come back empty.
    assert kwic(design_words_doc) != []
    # The first element holds the sentences: two lines in, two sentences out.
    assert len(kwic(design_words_doc)[0]) == 2
    # Four lines in, four sentences out.
    assert len(kwic(goodbye_buddy_doc)[0]) == 4

View File

@@ -0,0 +1,10 @@
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into sentences and return them inside an outer list.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
        periodsToBreaks: When true, split on "." (the periods are removed);
            otherwise split on line boundaries.

    Returns:
        An empty list when document is falsy, otherwise a one-item list
        whose only element is the list of sentence strings.
    """
    if not document:
        return []
    sentences = document.split(".") if periodsToBreaks else document.splitlines()
    return [sentences]

View File

@@ -0,0 +1,25 @@
from kwic import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"

if __name__ == "__main__":
    # No input text means no output.
    assert kwic(empty_document) == []
    # Real input must not come back empty.
    assert kwic(design_words_doc) != []
    # Two lines in, two sentences out.
    assert len(kwic(design_words_doc)[0]) == 2
    # Four lines in, four sentences out.
    assert len(kwic(goodbye_buddy_doc)[0]) == 4
    # Without period splitting, a single line stays a single sentence.
    assert len(kwic(hello_buddy_periods)[0]) == 1
    # With period splitting, the same line becomes four sentences.
    assert len(kwic(hello_buddy_periods, periodsToBreaks=True)[0]) == 4

View File

@@ -0,0 +1,29 @@
def split_by_periods(document):
    """Split a document into sentences that each end at a period.

    Newline characters are removed outright (no space is inserted in their
    place). Every sentence keeps its closing period and any whitespace that
    preceded its first word.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order. Text following the
        last period is appended as a final entry when it is non-empty.
    """
    # Let str.replace/str.split do the scanning instead of growing a string
    # one character at a time.
    pieces = document.replace("\n", "").split(".")
    # Every piece except the last was terminated by a period; restore it.
    sentences = [piece + "." for piece in pieces[:-1]]
    if pieces[-1]:
        sentences.append(pieces[-1])
    return sentences
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into a flat list of sentence strings.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
        periodsToBreaks: When true, sentences are produced by
            split_by_periods; otherwise each line is a sentence.

    Returns:
        An empty list when document is falsy, otherwise the list of
        sentence strings.
    """
    if not document:
        return []
    if periodsToBreaks:
        return split_by_periods(document)
    return document.splitlines()

View File

@@ -0,0 +1,29 @@
from kwic import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
# Period-split sentences keep their closing period and their leading space.
hello_buddy_periods_output = [
    "Hello there.",
    " Hello there, buddy.",
    " Hello and goodbye, buddy.",
    " Hello is like buddy Goodbye!",
]

if __name__ == "__main__":
    # No input text means no output.
    assert kwic(empty_document) == []
    # Real input must not come back empty.
    assert kwic(design_words_doc) != []
    # Two lines in, two sentences out.
    assert len(kwic(design_words_doc)) == 2
    # Four lines in, four sentences out.
    assert len(kwic(goodbye_buddy_doc)) == 4
    # Without period splitting, a single line comes back unchanged.
    assert kwic(hello_buddy_periods)[0] == hello_buddy_periods
    # With period splitting there are four sentences, each keeping its
    # terminating period.
    assert kwic(hello_buddy_periods, periodsToBreaks=True) == hello_buddy_periods_output

View File

@@ -0,0 +1,44 @@
def split_by_periods(document):
    """Split a document into sentences that each end at a period.

    Newline characters are removed outright (no space is inserted in their
    place). Every sentence keeps its closing period and any whitespace that
    preceded its first word.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order. Text following the
        last period is appended as a final entry when it is non-empty.
    """
    # Let str.replace/str.split do the scanning instead of growing a string
    # one character at a time.
    pieces = document.replace("\n", "").split(".")
    # Every piece except the last was terminated by a period; restore it.
    sentences = [piece + "." for piece in pieces[:-1]]
    if pieces[-1]:
        sentences.append(pieces[-1])
    return sentences
def split_by_word_as_tuples(sentence_array):
    """Break each sentence into its words and tag it with its position.

    Sentences are split on single space characters only; empty pieces
    produced by leading, trailing, or repeated spaces are discarded.

    Args:
        sentence_array: Iterable of sentence strings.

    Returns:
        A list of (words, index) tuples, where words is a list of the
        sentence's non-empty words and index is the sentence's zero-based
        position in sentence_array.
    """
    # Build a real list instead of calling filter(): on Python 3 filter()
    # returns a lazy iterator, which cannot be sliced or compared to a list.
    return [
        ([word for word in sentence.split(" ") if word], index)
        for index, sentence in enumerate(sentence_array)
    ]
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into sentences and each sentence into indexed words.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
        periodsToBreaks: When true, sentences are produced by
            split_by_periods; otherwise each line is a sentence.

    Returns:
        An empty list when document is falsy, otherwise whatever
        split_by_word_as_tuples returns for the sentences.
    """
    if not document:
        return []
    sentences = split_by_periods(document) if periodsToBreaks else document.splitlines()
    return split_by_word_as_tuples(sentences)

View File

@@ -0,0 +1,33 @@
from kwic import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
# Without period splitting the whole line is one sentence (index 0).
hello_buddy_periods_output = [
    (["Hello", "there.", "Hello", "there,", "buddy.", "Hello", "and", "goodbye,", "buddy.",
      "Hello", "is", "like", "buddy", "Goodbye!"], 0),
]
# With period splitting each sentence gets its own index.
hello_buddy_word_tuples_output = [
    (["Hello", "there."], 0),
    (["Hello", "there,", "buddy."], 1),
    (["Hello", "and", "goodbye,", "buddy."], 2),
    (["Hello", "is", "like", "buddy", "Goodbye!"], 3),
]

if __name__ == "__main__":
    # No input text means no output.
    assert kwic(empty_document) == []
    # Real input must not come back empty.
    assert kwic(design_words_doc) != []
    # Two lines in, two (words, index) entries out.
    assert len(kwic(design_words_doc)) == 2
    # Four lines in, four (words, index) entries out.
    assert len(kwic(goodbye_buddy_doc)) == 4
    # Compare whole results: any() would pass as soon as a single expected
    # entry appeared, which does not show the outputs hold the same elements.
    assert kwic(hello_buddy_periods) == hello_buddy_periods_output
    # Same full comparison when periods are used as sentence breaks.
    assert kwic(hello_buddy_periods, periodsToBreaks=True) == hello_buddy_word_tuples_output

View File

@@ -0,0 +1,61 @@
def split_by_periods(document):
    """Split a document into sentences that each end at a period.

    Newline characters are removed outright (no space is inserted in their
    place). Every sentence keeps its closing period and any whitespace that
    preceded its first word.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order. Text following the
        last period is appended as a final entry when it is non-empty.
    """
    # Let str.replace/str.split do the scanning instead of growing a string
    # one character at a time.
    pieces = document.replace("\n", "").split(".")
    # Every piece except the last was terminated by a period; restore it.
    sentences = [piece + "." for piece in pieces[:-1]]
    if pieces[-1]:
        sentences.append(pieces[-1])
    return sentences
def split_by_word_as_tuples(sentence_array):
    """Break each sentence into its words and tag it with its position.

    Sentences are split on single space characters only; empty pieces
    produced by leading, trailing, or repeated spaces are discarded.

    Args:
        sentence_array: Iterable of sentence strings.

    Returns:
        A list of (words, index) tuples, where words is a list of the
        sentence's non-empty words and index is the sentence's zero-based
        position in sentence_array.
    """
    # Build a real list instead of calling filter(): on Python 3 filter()
    # returns a lazy iterator, which cannot be sliced or compared to a list.
    return [
        ([word for word in sentence.split(" ") if word], index)
        for index, sentence in enumerate(sentence_array)
    ]
def array_circular_shift(input_array, rotate_val):
    """Return a copy of input_array with its first rotate_val items moved to the end.

    Args:
        input_array: A sliceable sequence that supports "+" concatenation.
        rotate_val: Slice position at which the sequence is cut.

    Returns:
        The part from rotate_val onward followed by the part before it.
    """
    leading = input_array[:rotate_val]
    trailing = input_array[rotate_val:]
    return trailing + leading
def fill_with_circular_shifts_and_original(sentence_array):
    """Expand every sentence into all of its circular word shifts.

    Args:
        sentence_array: Iterable of entries whose first item is a word list
            and whose second item is the sentence index.

    Returns:
        A list of (shifted_words, index) tuples: one shift per word of each
        entry, starting with the unshifted words, each carrying the index of
        the entry it came from.
    """
    return [
        (array_circular_shift(entry[0], shift), entry[1])
        for entry in sentence_array
        for shift, _ in enumerate(entry[0])
    ]
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build every circular word shift of every sentence in a document.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
        periodsToBreaks: When true, sentences are produced by
            split_by_periods; otherwise each line is a sentence.

    Returns:
        A 2-tuple whose first item is the list of (shifted_words,
        sentence_index) tuples and whose second item is an empty list.
        Falsy input gives ([], []).
    """
    if not document:
        return [], []
    sentences = split_by_periods(document) if periodsToBreaks else document.splitlines()
    shifts = fill_with_circular_shifts_and_original(split_by_word_as_tuples(sentences))
    return shifts, []

View File

@@ -0,0 +1,55 @@
import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
# Every circular shift of the two sentences used below, each tagged with the
# index of the sentence it came from.
this_is_split_periods_circular_output = [
    (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
    (['is', 'something', 'split', 'by', 'periods.', 'This'], 0),
    (['something', 'split', 'by', 'periods.', 'This', 'is'], 0),
    (['split', 'by', 'periods.', 'This', 'is', 'something'], 0),
    (['by', 'periods.', 'This', 'is', 'something', 'split'], 0),
    (['periods.', 'This', 'is', 'something', 'split', 'by'], 0),
    (['Not', 'newlines.'], 1),
    (['newlines.', 'Not'], 1)
]

if __name__ == "__main__":
    # Empty input gives an empty (shifts, pairs) result.
    assert kwic.kwic(empty_document) == ([], [])
    # Real input must not come back empty.
    assert kwic.kwic(design_words_doc) != ([], [])
    # Two lines of three words each give six circular shifts.
    assert len(kwic.kwic(design_words_doc)[0]) == 6
    # Four lines of 2, 3, 4 and 5 words give fourteen circular shifts.
    assert len(kwic.kwic(goodbye_buddy_doc)[0]) == 14
    # Each helper can be checked on its own. Newlines are dropped without
    # inserting a space, so the spaces on both sides of the second line break
    # survive as a double space.
    assert kwic.split_by_periods("This is something split \nby periods. Not \n newlines.") == [
        "This is something split by periods.",
        " Not  newlines.",
    ]
    # A sentence is split into its words and tagged with its index.
    assert kwic.split_by_word_as_tuples(["This is something split by periods.", " Not newlines."]) == [
        (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
        (['Not', 'newlines.'], 1),
    ]
    # The circular shift leaves the list alone for 0 and rotates left otherwise.
    assert kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 0) == [
        "One", "Two", "Three", "Four", "Five"]
    assert kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 2) == [
        "Three", "Four", "Five", "One", "Two"]
    # Every sentence is expanded into all of its circular shifts.
    assert kwic.fill_with_circular_shifts_and_original([
        (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
        (['Not', 'newlines.'], 1),
    ]) == this_is_split_periods_circular_output
    # End-to-end sanity check: the shifts are right and the output is wrapped
    # in the (shifts, pairs) tuple.
    assert kwic.kwic("This is something split \nby periods. Not \n newlines.", periodsToBreaks=True) == (
        this_is_split_periods_circular_output, [])

View File

@@ -0,0 +1,64 @@
def split_by_periods(document):
    """Split a document into sentences that each end at a period.

    Newline characters are removed outright (no space is inserted in their
    place). Every sentence keeps its closing period and any whitespace that
    preceded its first word.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order. Text following the
        last period is appended as a final entry when it is non-empty.
    """
    # Let str.replace/str.split do the scanning instead of growing a string
    # one character at a time.
    pieces = document.replace("\n", "").split(".")
    # Every piece except the last was terminated by a period; restore it.
    sentences = [piece + "." for piece in pieces[:-1]]
    if pieces[-1]:
        sentences.append(pieces[-1])
    return sentences
def split_by_word_as_tuples(sentence_array):
    """Break each sentence into its words and tag it with its position.

    Sentences are split on single space characters only; empty pieces
    produced by leading, trailing, or repeated spaces are discarded.

    Args:
        sentence_array: Iterable of sentence strings.

    Returns:
        A list of (words, index) tuples, where words is a list of the
        sentence's non-empty words and index is the sentence's zero-based
        position in sentence_array.
    """
    # Build a real list instead of calling filter(): on Python 3 filter()
    # returns a lazy iterator, which cannot be sliced or compared to a list.
    return [
        ([word for word in sentence.split(" ") if word], index)
        for index, sentence in enumerate(sentence_array)
    ]
def array_circular_shift(input_array, rotate_val):
    """Return a copy of input_array with its first rotate_val items moved to the end.

    Args:
        input_array: A sliceable sequence that supports "+" concatenation.
        rotate_val: Slice position at which the sequence is cut.

    Returns:
        The part from rotate_val onward followed by the part before it.
    """
    leading = input_array[:rotate_val]
    trailing = input_array[rotate_val:]
    return trailing + leading
def fill_with_circular_shifts_and_original(sentence_array):
    """Expand every sentence into all of its circular word shifts.

    Args:
        sentence_array: Iterable of entries whose first item is a word list
            and whose second item is the sentence index.

    Returns:
        A list of (shifted_words, index) tuples: one shift per word of each
        entry, starting with the unshifted words, each carrying the index of
        the entry it came from.
    """
    return [
        (array_circular_shift(entry[0], shift), entry[1])
        for entry in sentence_array
        for shift, _ in enumerate(entry[0])
    ]
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build every circular word shift of every sentence in a document.

    Args:
        document: Text to index.
        listPairs: When true, the shifts are returned as the first item of a
            2-tuple whose second item is an empty list.
        ignoreWords: Accepted for interface compatibility; not used yet.
        periodsToBreaks: When true, sentences are produced by
            split_by_periods; otherwise each line is a sentence.

    Returns:
        A list of (shifted_words, sentence_index) tuples, or (that_list, [])
        when listPairs is true. Falsy input gives the empty form of the
        same shape.
    """
    if not document:
        # Keep the return shape the same as for non-empty input so callers
        # that unpack the listPairs tuple do not break on empty documents.
        return ([], []) if listPairs else []
    if periodsToBreaks:
        sentences = split_by_periods(document)
    else:
        # splitlines() also handles "\r\n" endings, which split('\n') would
        # leave attached to the last word of each line.
        sentences = document.splitlines()
    shifts = fill_with_circular_shifts_and_original(split_by_word_as_tuples(sentences))
    return (shifts, []) if listPairs else shifts

View File

@@ -0,0 +1,55 @@
import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
# Every circular shift of the two sentences used below, each tagged with the
# index of the sentence it came from.
this_is_split_periods_circular_output = [
    (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
    (['is', 'something', 'split', 'by', 'periods.', 'This'], 0),
    (['something', 'split', 'by', 'periods.', 'This', 'is'], 0),
    (['split', 'by', 'periods.', 'This', 'is', 'something'], 0),
    (['by', 'periods.', 'This', 'is', 'something', 'split'], 0),
    (['periods.', 'This', 'is', 'something', 'split', 'by'], 0),
    (['Not', 'newlines.'], 1),
    (['newlines.', 'Not'], 1)
]

if __name__ == "__main__":
    # Empty input gives empty output.
    assert kwic.kwic(empty_document) == []
    # Real input must not come back empty.
    assert kwic.kwic(design_words_doc) != []
    # Two lines of three words each give six circular shifts.
    assert len(kwic.kwic(design_words_doc)) == 6
    # Four lines of 2, 3, 4 and 5 words give fourteen circular shifts.
    assert len(kwic.kwic(goodbye_buddy_doc)) == 14
    # Each helper can be checked on its own. Newlines are dropped without
    # inserting a space, so the spaces on both sides of the second line break
    # survive as a double space.
    assert kwic.split_by_periods("This is something split \nby periods. Not \n newlines.") == [
        "This is something split by periods.",
        " Not  newlines.",
    ]
    # A sentence is split into its words and tagged with its index.
    assert kwic.split_by_word_as_tuples(["This is something split by periods.", " Not newlines."]) == [
        (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
        (['Not', 'newlines.'], 1),
    ]
    # The circular shift leaves the list alone for 0 and rotates left otherwise.
    assert kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 0) == [
        "One", "Two", "Three", "Four", "Five"]
    assert kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 2) == [
        "Three", "Four", "Five", "One", "Two"]
    # Every sentence is expanded into all of its circular shifts.
    assert kwic.fill_with_circular_shifts_and_original([
        (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
        (['Not', 'newlines.'], 1),
    ]) == this_is_split_periods_circular_output
    # End-to-end sanity check: the shifts are right and the output is a
    # plain list of shifts.
    assert kwic.kwic("This is something split \nby periods. Not \n newlines.", periodsToBreaks=True) == (
        this_is_split_periods_circular_output)

View File

@@ -0,0 +1,64 @@
def split_by_periods(document):
    """Split a document into sentences that each end at a period.

    Newline characters are removed outright (no space is inserted in their
    place). Every sentence keeps its closing period and any whitespace that
    preceded its first word.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order. Text following the
        last period is appended as a final entry when it is non-empty.
    """
    # Let str.replace/str.split do the scanning instead of growing a string
    # one character at a time.
    pieces = document.replace("\n", "").split(".")
    # Every piece except the last was terminated by a period; restore it.
    sentences = [piece + "." for piece in pieces[:-1]]
    if pieces[-1]:
        sentences.append(pieces[-1])
    return sentences
def split_by_word_as_tuples(sentence_array):
    """Break each sentence into its words and tag it with its position.

    Sentences are split on single space characters only; empty pieces
    produced by leading, trailing, or repeated spaces are discarded.

    Args:
        sentence_array: Iterable of sentence strings.

    Returns:
        A list of (words, index) tuples, where words is a list of the
        sentence's non-empty words and index is the sentence's zero-based
        position in sentence_array.
    """
    # Build a real list instead of calling filter(): on Python 3 filter()
    # returns a lazy iterator, which cannot be sliced or compared to a list.
    return [
        ([word for word in sentence.split(" ") if word], index)
        for index, sentence in enumerate(sentence_array)
    ]
def array_circular_shift(input_array, rotate_val):
    """Return a copy of input_array with its first rotate_val items moved to the end.

    Args:
        input_array: A sliceable sequence that supports "+" concatenation.
        rotate_val: Slice position at which the sequence is cut.

    Returns:
        The part from rotate_val onward followed by the part before it.
    """
    leading = input_array[:rotate_val]
    trailing = input_array[rotate_val:]
    return trailing + leading
def fill_with_circular_shifts_and_original(sentence_array):
    """Expand every sentence into all of its circular word shifts.

    Args:
        sentence_array: Iterable of entries whose first item is a word list
            and whose second item is the sentence index.

    Returns:
        A list of (shifted_words, index) tuples: one shift per word of each
        entry, starting with the unshifted words, each carrying the index of
        the entry it came from.
    """
    return [
        (array_circular_shift(entry[0], shift), entry[1])
        for entry in sentence_array
        for shift, _ in enumerate(entry[0])
    ]
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build every circular word shift of every sentence in a document.

    Args:
        document: Text to index.
        listPairs: When true, the shifts are returned as the first item of a
            2-tuple whose second item is an empty list.
        ignoreWords: Accepted for interface compatibility; not used yet.
        periodsToBreaks: When true, sentences are produced by
            split_by_periods; otherwise each line is a sentence.

    Returns:
        A list of (shifted_words, sentence_index) tuples, or (that_list, [])
        when listPairs is true. Falsy input gives the empty form of the
        same shape.
    """
    if not document:
        # Keep the return shape the same as for non-empty input so callers
        # that unpack the listPairs tuple do not break on empty documents.
        return ([], []) if listPairs else []
    sentences = split_by_periods(document) if periodsToBreaks else document.splitlines()
    shifts = fill_with_circular_shifts_and_original(split_by_word_as_tuples(sentences))
    return (shifts, []) if listPairs else shifts

View File

@@ -0,0 +1,31 @@
1. How did you decide what to test first? Would your final code change significantly if you changed the order of tests?
The process for deciding what to test mostly had to do with a logical breakdown of the core parts of the code so that
things that relied on others to work were done after those pre-requisites. So, for example, before we could do anything
first the document had to be broken down into sentences, then words. In that part, you could also change whether or not
the sentence breaks were handled with newlines or periods. Afterwards, you could then deal with rearranging the words as
necessary, determining pairs, and then excluding any words that were given as arguments. I don't feel like the
code would have been massively different if it'd been done in reverse, but it definitely would have been more difficult
to write. On top of that, you'd still have to think all the way through how the code would need to function, regardless
of the order in which you wrote it, in order to know what needed to be written in the first place. Also, if you
literally reversed the order of the tests, it could lead to you skipping the whole point of using tests (if you did it
incorrectly). Again, for an example, if you wrote a test that produced the final output of the program as the initial
input and expected a correct output, you'd either have to hard code many many values in to the resulting code while you
implemented them for real, or you'd end up writing all necessary code at once, which would defeat the purpose.
2. What did you think of test driven development, for this problem? What are the strengths and weaknesses of the
approach? Does it encourage/discourage certain kinds of program designs?
I'm torn about it. I feel that for this particular problem, it may have been overkill and actually hampered design
progress at times. The main issue was that for each major step, the output of the code often changed enough that many
of the tests had to be modified or rewritten to match. The definite strengths of this approach are that you can easily
tell when you change something that breaks the functionality of your code, because the failing assertion immediately raises an exception.
On the flip side, it does take much longer to write functional code, depending on what exactly you're writing. I would
argue that the larger the application being written, the more important it would be to do this kind of testing.
Without it, you might be digging through hundreds of thousands to millions of lines of code to figure out what is
breaking. I can definitely appreciate the fact that taking the time to write the tests helps guarantee that it's doing
what you want, but I feel like it would be more useful in other programming contexts, like the one just mentioned.
I'd say this style of coding encourages program designs that have a consistent output even if the underlying code
changes often. For example, I feel like it'd be ideal for developing an API where once a standard has been created,
that API should function identically for a long time, even if the backend is changing constantly. In general, this way
of programming also encourages full coverage of edge cases, as you'll easily and quickly be able to test for them.

View File

@@ -0,0 +1,2 @@
def kwic(document, listPairs=False, ignoreWords=None):
    """Placeholder KWIC entry point that always reports no results.

    Args:
        document: Text to index; not used yet.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
            Defaults to None instead of a shared mutable list.

    Returns:
        An empty list, whatever the input.
    """
    return []

View File

@@ -0,0 +1,5 @@
def kwic(document, listPairs=False, ignoreWords=None):
    """Return the document wrapped in a list, or nothing for empty input.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
            Defaults to None instead of a shared mutable list.

    Returns:
        An empty list when document is falsy, otherwise a one-item list
        holding the document itself.
    """
    return [document] if document else []

View File

@@ -0,0 +1,7 @@
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into lines and return them inside an outer list.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
        periodsToBreaks: Accepted for interface compatibility; not used yet.

    Returns:
        An empty list when document is falsy, otherwise a one-item list
        whose only element is the list of the document's lines.
    """
    return [document.splitlines()] if document else []

View File

@@ -0,0 +1,10 @@
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into sentences and return them inside an outer list.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
        periodsToBreaks: When true, split on "." (the periods are removed);
            otherwise split on line boundaries.

    Returns:
        An empty list when document is falsy, otherwise a one-item list
        whose only element is the list of sentence strings.
    """
    if not document:
        return []
    sentences = document.split(".") if periodsToBreaks else document.splitlines()
    return [sentences]

View File

@@ -0,0 +1,29 @@
def split_by_periods(document):
    """Split a document into sentences that each end at a period.

    Newline characters are removed outright (no space is inserted in their
    place). Every sentence keeps its closing period and any whitespace that
    preceded its first word.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order. Text following the
        last period is appended as a final entry when it is non-empty.
    """
    # Let str.replace/str.split do the scanning instead of growing a string
    # one character at a time.
    pieces = document.replace("\n", "").split(".")
    # Every piece except the last was terminated by a period; restore it.
    sentences = [piece + "." for piece in pieces[:-1]]
    if pieces[-1]:
        sentences.append(pieces[-1])
    return sentences
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into a flat list of sentence strings.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
        periodsToBreaks: When true, sentences are produced by
            split_by_periods; otherwise each line is a sentence.

    Returns:
        An empty list when document is falsy, otherwise the list of
        sentence strings.
    """
    if not document:
        return []
    if periodsToBreaks:
        return split_by_periods(document)
    return document.splitlines()

View File

@@ -0,0 +1,44 @@
def split_by_periods(document):
    """Split a document into sentences that each end at a period.

    Newline characters are removed outright (no space is inserted in their
    place). Every sentence keeps its closing period and any whitespace that
    preceded its first word.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order. Text following the
        last period is appended as a final entry when it is non-empty.
    """
    # Let str.replace/str.split do the scanning instead of growing a string
    # one character at a time.
    pieces = document.replace("\n", "").split(".")
    # Every piece except the last was terminated by a period; restore it.
    sentences = [piece + "." for piece in pieces[:-1]]
    if pieces[-1]:
        sentences.append(pieces[-1])
    return sentences
def split_by_word_as_tuples(sentence_array):
    """Break each sentence into its words and tag it with its position.

    Sentences are split on single space characters only; empty pieces
    produced by leading, trailing, or repeated spaces are discarded.

    Args:
        sentence_array: Iterable of sentence strings.

    Returns:
        A list of (words, index) tuples, where words is a list of the
        sentence's non-empty words and index is the sentence's zero-based
        position in sentence_array.
    """
    # Build a real list instead of calling filter(): on Python 3 filter()
    # returns a lazy iterator, which cannot be sliced or compared to a list.
    return [
        ([word for word in sentence.split(" ") if word], index)
        for index, sentence in enumerate(sentence_array)
    ]
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Split a document into sentences and each sentence into indexed words.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
        periodsToBreaks: When true, sentences are produced by
            split_by_periods; otherwise each line is a sentence.

    Returns:
        An empty list when document is falsy, otherwise whatever
        split_by_word_as_tuples returns for the sentences.
    """
    if not document:
        return []
    sentences = split_by_periods(document) if periodsToBreaks else document.splitlines()
    return split_by_word_as_tuples(sentences)

View File

@@ -0,0 +1,61 @@
def split_by_periods(document):
    """Split a document into sentences that each end at a period.

    Newline characters are removed outright (no space is inserted in their
    place). Every sentence keeps its closing period and any whitespace that
    preceded its first word.

    Args:
        document: The text to split.

    Returns:
        A list of sentence strings in document order. Text following the
        last period is appended as a final entry when it is non-empty.
    """
    # Let str.replace/str.split do the scanning instead of growing a string
    # one character at a time.
    pieces = document.replace("\n", "").split(".")
    # Every piece except the last was terminated by a period; restore it.
    sentences = [piece + "." for piece in pieces[:-1]]
    if pieces[-1]:
        sentences.append(pieces[-1])
    return sentences
def split_by_word_as_tuples(sentence_array):
    """Break each sentence into its words and tag it with its position.

    Sentences are split on single space characters only; empty pieces
    produced by leading, trailing, or repeated spaces are discarded.

    Args:
        sentence_array: Iterable of sentence strings.

    Returns:
        A list of (words, index) tuples, where words is a list of the
        sentence's non-empty words and index is the sentence's zero-based
        position in sentence_array.
    """
    # Build a real list instead of calling filter(): on Python 3 filter()
    # returns a lazy iterator, which cannot be sliced or compared to a list.
    return [
        ([word for word in sentence.split(" ") if word], index)
        for index, sentence in enumerate(sentence_array)
    ]
def array_circular_shift(input_array, rotate_val):
    """Return a copy of input_array with its first rotate_val items moved to the end.

    Args:
        input_array: A sliceable sequence that supports "+" concatenation.
        rotate_val: Slice position at which the sequence is cut.

    Returns:
        The part from rotate_val onward followed by the part before it.
    """
    leading = input_array[:rotate_val]
    trailing = input_array[rotate_val:]
    return trailing + leading
def fill_with_circular_shifts_and_original(sentence_array):
    """Expand every sentence into all of its circular word shifts.

    Args:
        sentence_array: Iterable of entries whose first item is a word list
            and whose second item is the sentence index.

    Returns:
        A list of (shifted_words, index) tuples: one shift per word of each
        entry, starting with the unshifted words, each carrying the index of
        the entry it came from.
    """
    return [
        (array_circular_shift(entry[0], shift), entry[1])
        for entry in sentence_array
        for shift, _ in enumerate(entry[0])
    ]
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build every circular word shift of every sentence in a document.

    Args:
        document: Text to index.
        listPairs: Accepted for interface compatibility; not used yet.
        ignoreWords: Accepted for interface compatibility; not used yet.
        periodsToBreaks: When true, sentences are produced by
            split_by_periods; otherwise each line is a sentence.

    Returns:
        A 2-tuple whose first item is the list of (shifted_words,
        sentence_index) tuples and whose second item is an empty list.
        Falsy input gives ([], []).
    """
    if not document:
        return [], []
    sentences = split_by_periods(document) if periodsToBreaks else document.splitlines()
    shifts = fill_with_circular_shifts_and_original(split_by_word_as_tuples(sentences))
    return shifts, []

View File

@@ -0,0 +1,55 @@
import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
# Every circular shift of the two sentences used below, each tagged with the
# index of the sentence it came from.
this_is_split_periods_circular_output = [
    (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
    (['is', 'something', 'split', 'by', 'periods.', 'This'], 0),
    (['something', 'split', 'by', 'periods.', 'This', 'is'], 0),
    (['split', 'by', 'periods.', 'This', 'is', 'something'], 0),
    (['by', 'periods.', 'This', 'is', 'something', 'split'], 0),
    (['periods.', 'This', 'is', 'something', 'split', 'by'], 0),
    (['Not', 'newlines.'], 1),
    (['newlines.', 'Not'], 1)
]

if __name__ == "__main__":
    # Empty input gives empty output.
    assert kwic.kwic(empty_document) == []
    # Real input must not come back empty.
    assert kwic.kwic(design_words_doc) != []
    # Two lines of three words each give six circular shifts.
    assert len(kwic.kwic(design_words_doc)) == 6
    # Four lines of 2, 3, 4 and 5 words give fourteen circular shifts.
    assert len(kwic.kwic(goodbye_buddy_doc)) == 14
    # Each helper can be checked on its own. Newlines are dropped without
    # inserting a space, so the spaces on both sides of the second line break
    # survive as a double space.
    assert kwic.split_by_periods("This is something split \nby periods. Not \n newlines.") == [
        "This is something split by periods.",
        " Not  newlines.",
    ]
    # A sentence is split into its words and tagged with its index.
    assert kwic.split_by_word_as_tuples(["This is something split by periods.", " Not newlines."]) == [
        (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
        (['Not', 'newlines.'], 1),
    ]
    # The circular shift leaves the list alone for 0 and rotates left otherwise.
    assert kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 0) == [
        "One", "Two", "Three", "Four", "Five"]
    assert kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 2) == [
        "Three", "Four", "Five", "One", "Two"]
    # Every sentence is expanded into all of its circular shifts.
    assert kwic.fill_with_circular_shifts_and_original([
        (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
        (['Not', 'newlines.'], 1),
    ]) == this_is_split_periods_circular_output
    # End-to-end sanity check: the shifts are right and the output is a
    # plain list of shifts.
    assert kwic.kwic("This is something split \nby periods. Not \n newlines.", periodsToBreaks=True) == (
        this_is_split_periods_circular_output)

View File

@@ -0,0 +1,6 @@
from kwic import kwic
# With no input text there is nothing to index.
document = ""

if __name__ == "__main__":
    # An empty document must yield an empty result.
    assert kwic(document) == []

View File

@@ -0,0 +1,8 @@
from kwic import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."

if __name__ == "__main__":
    # No input text means no output.
    assert kwic(empty_document) == []
    # Real input must not come back empty.
    assert kwic(design_words_doc) != []

View File

@@ -0,0 +1,12 @@
from kwic import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"

if __name__ == "__main__":
    # No input text means no output.
    assert kwic(empty_document) == []
    # Real input must not come back empty.
    assert kwic(design_words_doc) != []
    # The first element holds the sentences: two lines in, two sentences out.
    assert len(kwic(design_words_doc)[0]) == 2
    # Four lines in, four sentences out.
    assert len(kwic(goodbye_buddy_doc)[0]) == 4

View File

@@ -0,0 +1,25 @@
from kwic import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"

if __name__ == "__main__":
    # No input text means no output.
    assert kwic(empty_document) == []
    # Real input must not come back empty.
    assert kwic(design_words_doc) != []
    # Two lines in, two sentences out.
    assert len(kwic(design_words_doc)[0]) == 2
    # Four lines in, four sentences out.
    assert len(kwic(goodbye_buddy_doc)[0]) == 4
    # Without period splitting, a single line stays a single sentence.
    assert len(kwic(hello_buddy_periods)[0]) == 1
    # With period splitting, the same line becomes four sentences.
    assert len(kwic(hello_buddy_periods, periodsToBreaks=True)[0]) == 4

View File

@@ -0,0 +1,29 @@
from kwic import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
# Period-split sentences keep their closing period and their leading space.
hello_buddy_periods_output = [
    "Hello there.",
    " Hello there, buddy.",
    " Hello and goodbye, buddy.",
    " Hello is like buddy Goodbye!",
]

if __name__ == "__main__":
    # No input text means no output.
    assert kwic(empty_document) == []
    # Real input must not come back empty.
    assert kwic(design_words_doc) != []
    # Two lines in, two sentences out.
    assert len(kwic(design_words_doc)) == 2
    # Four lines in, four sentences out.
    assert len(kwic(goodbye_buddy_doc)) == 4
    # Without period splitting, a single line comes back unchanged.
    assert kwic(hello_buddy_periods)[0] == hello_buddy_periods
    # With period splitting there are four sentences, each keeping its
    # terminating period.
    assert kwic(hello_buddy_periods, periodsToBreaks=True) == hello_buddy_periods_output

View File

@@ -0,0 +1,33 @@
from kwic import kwic
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
# Without period splitting the whole line is one sentence (index 0).
hello_buddy_periods_output = [
    (["Hello", "there.", "Hello", "there,", "buddy.", "Hello", "and", "goodbye,", "buddy.",
      "Hello", "is", "like", "buddy", "Goodbye!"], 0),
]
# With period splitting each sentence gets its own index.
hello_buddy_word_tuples_output = [
    (["Hello", "there."], 0),
    (["Hello", "there,", "buddy."], 1),
    (["Hello", "and", "goodbye,", "buddy."], 2),
    (["Hello", "is", "like", "buddy", "Goodbye!"], 3),
]

if __name__ == "__main__":
    # No input text means no output.
    assert kwic(empty_document) == []
    # Real input must not come back empty.
    assert kwic(design_words_doc) != []
    # Two lines in, two (words, index) entries out.
    assert len(kwic(design_words_doc)) == 2
    # Four lines in, four (words, index) entries out.
    assert len(kwic(goodbye_buddy_doc)) == 4
    # Compare whole results: any() would pass as soon as a single expected
    # entry appeared, which does not show the outputs hold the same elements.
    assert kwic(hello_buddy_periods) == hello_buddy_periods_output
    # Same full comparison when periods are used as sentence breaks.
    assert kwic(hello_buddy_periods, periodsToBreaks=True) == hello_buddy_word_tuples_output

View File

@@ -0,0 +1,55 @@
import kwic
# Input documents shared by the checks below.
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
# Expected circular shifts for the two-sentence sample used below: every
# rotation of each sentence's word list, paired with that sentence's index.
this_is_split_periods_circular_output = [
    (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
    (['is', 'something', 'split', 'by', 'periods.', 'This'], 0),
    (['something', 'split', 'by', 'periods.', 'This', 'is'], 0),
    (['split', 'by', 'periods.', 'This', 'is', 'something'], 0),
    (['by', 'periods.', 'This', 'is', 'something', 'split'], 0),
    (['periods.', 'This', 'is', 'something', 'split', 'by'], 0),
    (['Not', 'newlines.'], 1),
    (['newlines.', 'Not'], 1)
]
if __name__ == "__main__":
    # Ensure empty input gives an empty pair of lists
    assert(kwic.kwic(empty_document) == ([], []))
    # Ensure real input does not produce empty output
    assert(kwic.kwic(design_words_doc) != ([], []))
    # Two three-word lines: the first element of the result holds six entries
    # (presumably one per circular shift)
    assert(len(kwic.kwic(design_words_doc)[0]) == 6)
    # Four lines of 2, 3, 4 and 5 words: fourteen entries in total
    assert(len(kwic.kwic(goodbye_buddy_doc)[0]) == 14)
    # Just realized I can check each individual function, so here is the check to split by periods
    assert(kwic.split_by_periods("This is something split \nby periods. Not \n newlines.") ==
           ["This is something split by periods.", " Not newlines."])
    # This checks to make sure a sentence gets split into words properly
    assert(kwic.split_by_word_as_tuples(["This is something split by periods.", " Not newlines."]) ==
           [(['This', 'is', 'something', 'split', 'by', 'periods.'], 0), (['Not', 'newlines.'], 1)])
    # These check to make sure that the circular shift function is rotating properly
    assert(kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 0) ==
           ["One", "Two", "Three", "Four", "Five"])
    assert (kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 2) ==
            ["Three", "Four", "Five", "One", "Two"])
    # This checks to make sure that circularly shifted versions of all sentences are correct
    assert(kwic.fill_with_circular_shifts_and_original([(['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
                                                        (['Not', 'newlines.'], 1)]) ==
           this_is_split_periods_circular_output)
    # This gives a bit of a sanity check. It's making sure the circular shift works, and that the output is formatted
    # correctly: the shifts come first, followed by an empty second list.
    assert(kwic.kwic("This is something split \nby periods. Not \n newlines.", periodsToBreaks=True) ==
           (this_is_split_periods_circular_output, []))

View File

@@ -0,0 +1,55 @@
import kwic
# Input documents shared by the checks below.
empty_document = ""
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
# Expected circular shifts for the two-sentence sample used below: every
# rotation of each sentence's word list, paired with that sentence's index.
this_is_split_periods_circular_output = [
    (['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
    (['is', 'something', 'split', 'by', 'periods.', 'This'], 0),
    (['something', 'split', 'by', 'periods.', 'This', 'is'], 0),
    (['split', 'by', 'periods.', 'This', 'is', 'something'], 0),
    (['by', 'periods.', 'This', 'is', 'something', 'split'], 0),
    (['periods.', 'This', 'is', 'something', 'split', 'by'], 0),
    (['Not', 'newlines.'], 1),
    (['newlines.', 'Not'], 1)
]
if __name__ == "__main__":
    # Ensure empty input gives empty output (the parentheses around [] are
    # only grouping, so this compares against a plain empty list)
    assert(kwic.kwic(empty_document) == ([]))
    # Ensure real input does not produce empty output
    assert(kwic.kwic(design_words_doc) != ([]))
    # Two three-word lines: six entries (presumably one per circular shift)
    assert(len(kwic.kwic(design_words_doc)) == 6)
    # Four lines of 2, 3, 4 and 5 words: fourteen entries in total
    assert(len(kwic.kwic(goodbye_buddy_doc)) == 14)
    # Just realized I can check each individual function, so here is the check to split by periods
    assert(kwic.split_by_periods("This is something split \nby periods. Not \n newlines.") ==
           ["This is something split by periods.", " Not newlines."])
    # This checks to make sure a sentence gets split into words properly
    assert(kwic.split_by_word_as_tuples(["This is something split by periods.", " Not newlines."]) ==
           [(['This', 'is', 'something', 'split', 'by', 'periods.'], 0), (['Not', 'newlines.'], 1)])
    # These check to make sure that the circular shift function is rotating properly
    assert(kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 0) ==
           ["One", "Two", "Three", "Four", "Five"])
    assert (kwic.array_circular_shift(["One", "Two", "Three", "Four", "Five"], 2) ==
            ["Three", "Four", "Five", "One", "Two"])
    # This checks to make sure that circularly shifted versions of all sentences are correct
    assert(kwic.fill_with_circular_shifts_and_original([(['This', 'is', 'something', 'split', 'by', 'periods.'], 0),
                                                        (['Not', 'newlines.'], 1)]) ==
           this_is_split_periods_circular_output)
    # This gives a bit of a sanity check. It's making sure the circular shift works, and that the output is formatted
    # correctly: here the result is the list of shifts on its own.
    assert(kwic.kwic("This is something split \nby periods. Not \n newlines.", periodsToBreaks=True) ==
           this_is_split_periods_circular_output)

View File

@@ -0,0 +1,6 @@
1) ignoreWords only applies to indexing (the shifts) -- it has no impact on listPairs.
2) In listPairs, a "pair" is two (different) words that appear together in a "line" -- they may not be next to each other, they are just in the same line.
3) The list of pairs should have structure of list of tuple of tuple, I'll fix the bad example in the assignment. Not lists.
4) Submit files as a flat folder structure!!! kwic0.py testkwic0.py
5) Use assert to test. It must throw an uncaught exception.
6) The test testkwicN should pass kwicN, but not if you ran it against a newer

View File

@@ -0,0 +1,10 @@
from mykwic import *
from pprint import pprint
# Run every line of tocheck.txt as the argument list of a kwic() call and
# print the input followed by the pretty-printed result.
with open("tocheck.txt") as cases:
    for raw_line in cases:
        # Strip only the line terminator; slicing off the last character
        # would cut real text from a final line that has no newline.
        call_args = raw_line.rstrip("\n")
        if not call_args.strip():
            # A blank line would turn into a bare kwic() call; skip it.
            continue
        print("=" * 50)
        print("INPUT: " + call_args)
        # NOTE(review): eval() executes whatever the file contains. That is
        # needed here because each line carries keyword arguments, but the
        # file must only ever come from a trusted source.
        v = eval("kwic(" + call_args + ")")
        print("OUTPUT:")
        # The result has to be passed to pprint; naming the function alone
        # prints nothing.
        pprint(v)

View File

@@ -0,0 +1,326 @@
==================================================
INPUT: "Design is hard.\nLet's just implement."
OUTPUT:
[(['Design', 'is', 'hard.'], 0),
(['hard.', 'Design', 'is'], 0),
(['implement.', "Let's", 'just'], 1),
(['is', 'hard.', 'Design'], 0),
(['just', 'implement.', "Let's"], 1),
(["Let's", 'just', 'implement.'], 1)]
==================================================
INPUT: "Design is hard.\nLet's just implement.", ignoreWords=["is"]
OUTPUT:
[(['Design', 'is', 'hard.'], 0),
(['hard.', 'Design', 'is'], 0),
(['implement.', "Let's", 'just'], 1),
(['just', 'implement.', "Let's"], 1),
(["Let's", 'just', 'implement.'], 1)]
==================================================
INPUT: "Design is hard.\nLet's just implement.", ignoreWords=["is"], listPairs=True
OUTPUT:
([(['Design', 'is', 'hard.'], 0),
(['hard.', 'Design', 'is'], 0),
(['implement.', "Let's", 'just'], 1),
(['just', 'implement.', "Let's"], 1),
(["Let's", 'just', 'implement.'], 1)],
[])
==================================================
INPUT: "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!", listPairs=True
OUTPUT:
([(['and', 'goodbye,', 'buddy.', 'Hello'], 2),
(['buddy', 'Goodbye!', 'Hello', 'is', 'like'], 3),
(['buddy.', 'Hello', 'and', 'goodbye,'], 2),
(['buddy.', 'Hello', 'there,'], 1),
(['Goodbye!', 'Hello', 'is', 'like', 'buddy'], 3),
(['goodbye,', 'buddy.', 'Hello', 'and'], 2),
(['Hello', 'and', 'goodbye,', 'buddy.'], 2),
(['Hello', 'is', 'like', 'buddy', 'Goodbye!'], 3),
(['Hello', 'there,', 'buddy.'], 1),
(['Hello', 'there.'], 0),
(['is', 'like', 'buddy', 'Goodbye!', 'Hello'], 3),
(['like', 'buddy', 'Goodbye!', 'Hello', 'is'], 3),
(['there,', 'buddy.', 'Hello'], 1),
(['there.', 'Hello'], 0)],
[(('buddy', 'goodbye'), 2),
(('buddy', 'hello'), 3),
(('goodbye', 'hello'), 2),
(('hello', 'there'), 2)])
==================================================
INPUT: "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!", listPairs=True, periodsToBreaks=True
OUTPUT:
([(['and', 'goodbye,', 'buddy.', 'Hello'], 2),
(['buddy', 'Goodbye!', 'Hello', 'is', 'like'], 3),
(['buddy.', 'Hello', 'and', 'goodbye,'], 2),
(['buddy.', 'Hello', 'there,'], 1),
(['Goodbye!', 'Hello', 'is', 'like', 'buddy'], 3),
(['goodbye,', 'buddy.', 'Hello', 'and'], 2),
(['Hello', 'and', 'goodbye,', 'buddy.'], 2),
(['Hello', 'is', 'like', 'buddy', 'Goodbye!'], 3),
(['Hello', 'there,', 'buddy.'], 1),
(['Hello', 'there.'], 0),
(['is', 'like', 'buddy', 'Goodbye!', 'Hello'], 3),
(['like', 'buddy', 'Goodbye!', 'Hello', 'is'], 3),
(['there,', 'buddy.', 'Hello'], 1),
(['there.', 'Hello'], 0)],
[(('buddy', 'goodbye'), 2),
(('buddy', 'hello'), 3),
(('goodbye', 'hello'), 2),
(('hello', 'there'), 2)])
==================================================
INPUT: ". . a"
OUTPUT:
[(['.', '.', 'a'], 0), (['.', 'a', '.'], 0), (['a', '.', '.'], 0)]
==================================================
INPUT: ". . a", periodsToBreaks=True
OUTPUT:
[(['.', '.', 'a'], 0), (['.', 'a', '.'], 0), (['a', '.', '.'], 0)]
==================================================
INPUT: ". A B\n. A B C\n. A B C D", listPairs=True
OUTPUT:
([(['.', 'A', 'B'], 0),
(['.', 'A', 'B', 'C'], 1),
(['.', 'A', 'B', 'C', 'D'], 2),
(['A', 'B', '.'], 0),
(['A', 'B', 'C', '.'], 1),
(['A', 'B', 'C', 'D', '.'], 2),
(['B', '.', 'A'], 0),
(['B', 'C', '.', 'A'], 1),
(['B', 'C', 'D', '.', 'A'], 2),
(['C', '.', 'A', 'B'], 1),
(['C', 'D', '.', 'A', 'B'], 2),
(['D', '.', 'A', 'B', 'C'], 2)],
[(('a', 'b'), 3), (('a', 'c'), 2), (('b', 'c'), 2)])
==================================================
INPUT: "Hello world. This is a test\nhopefully it turns out okay", periodsToBreaks = True
OUTPUT:
[(['a', 'test', 'hopefully', 'it', 'turns', 'out', 'okay', 'This', 'is'], 1),
(['Hello', 'world.'], 0),
(['hopefully', 'it', 'turns', 'out', 'okay', 'This', 'is', 'a', 'test'], 1),
(['is', 'a', 'test', 'hopefully', 'it', 'turns', 'out', 'okay', 'This'], 1),
(['it', 'turns', 'out', 'okay', 'This', 'is', 'a', 'test', 'hopefully'], 1),
(['okay', 'This', 'is', 'a', 'test', 'hopefully', 'it', 'turns', 'out'], 1),
(['out', 'okay', 'This', 'is', 'a', 'test', 'hopefully', 'it', 'turns'], 1),
(['test', 'hopefully', 'it', 'turns', 'out', 'okay', 'This', 'is', 'a'], 1),
(['This', 'is', 'a', 'test', 'hopefully', 'it', 'turns', 'out', 'okay'], 1),
(['turns', 'out', 'okay', 'This', 'is', 'a', 'test', 'hopefully', 'it'], 1),
(['world.', 'Hello'], 0)]
==================================================
INPUT: "It's very nice to be footloose. \nWith just a toothbrush and a comb.\n"
OUTPUT:
[(['a', 'comb.', 'With', 'just', 'a', 'toothbrush', 'and'], 1),
(['a', 'toothbrush', 'and', 'a', 'comb.', 'With', 'just'], 1),
(['and', 'a', 'comb.', 'With', 'just', 'a', 'toothbrush'], 1),
(['be', 'footloose.', "It's", 'very', 'nice', 'to'], 0),
(['comb.', 'With', 'just', 'a', 'toothbrush', 'and', 'a'], 1),
(['footloose.', "It's", 'very', 'nice', 'to', 'be'], 0),
(["It's", 'very', 'nice', 'to', 'be', 'footloose.'], 0),
(['just', 'a', 'toothbrush', 'and', 'a', 'comb.', 'With'], 1),
(['nice', 'to', 'be', 'footloose.', "It's", 'very'], 0),
(['to', 'be', 'footloose.', "It's", 'very', 'nice'], 0),
(['toothbrush', 'and', 'a', 'comb.', 'With', 'just', 'a'], 1),
(['very', 'nice', 'to', 'be', 'footloose.', "It's"], 0),
(['With', 'just', 'a', 'toothbrush', 'and', 'a', 'comb.'], 1)]
==================================================
INPUT: "It's very nice to be footloose. \nWith just a toothbrush and a comb.\n", periodsToBreaks=True
OUTPUT:
[(['a', 'comb.', 'With', 'just', 'a', 'toothbrush', 'and'], 1),
(['a', 'toothbrush', 'and', 'a', 'comb.', 'With', 'just'], 1),
(['and', 'a', 'comb.', 'With', 'just', 'a', 'toothbrush'], 1),
(['be', 'footloose.', "It's", 'very', 'nice', 'to'], 0),
(['comb.', 'With', 'just', 'a', 'toothbrush', 'and', 'a'], 1),
(['footloose.', "It's", 'very', 'nice', 'to', 'be'], 0),
(["It's", 'very', 'nice', 'to', 'be', 'footloose.'], 0),
(['just', 'a', 'toothbrush', 'and', 'a', 'comb.', 'With'], 1),
(['nice', 'to', 'be', 'footloose.', "It's", 'very'], 0),
(['to', 'be', 'footloose.', "It's", 'very', 'nice'], 0),
(['toothbrush', 'and', 'a', 'comb.', 'With', 'just', 'a'], 1),
(['very', 'nice', 'to', 'be', 'footloose.', "It's"], 0),
(['With', 'just', 'a', 'toothbrush', 'and', 'a', 'comb.'], 1)]
==================================================
INPUT: "hello here, hello there, hello everywhere",listPairs = True
OUTPUT:
([(['everywhere', 'hello', 'here,', 'hello', 'there,', 'hello'], 0),
(['hello', 'everywhere', 'hello', 'here,', 'hello', 'there,'], 0),
(['hello', 'here,', 'hello', 'there,', 'hello', 'everywhere'], 0),
(['hello', 'there,', 'hello', 'everywhere', 'hello', 'here,'], 0),
(['here,', 'hello', 'there,', 'hello', 'everywhere', 'hello'], 0),
(['there,', 'hello', 'everywhere', 'hello', 'here,', 'hello'], 0)],
[])
==================================================
INPUT: "hello here\nhello here again\nhello again", listPairs=True
OUTPUT:
([(['again', 'hello'], 2),
(['again', 'hello', 'here'], 1),
(['hello', 'again'], 2),
(['hello', 'here'], 0),
(['hello', 'here', 'again'], 1),
(['here', 'again', 'hello'], 1),
(['here', 'hello'], 0)],
[(('again', 'hello'), 2), (('hello', 'here'), 2)])
==================================================
INPUT: "hello hello hello\nhello hello", listPairs=True
OUTPUT:
([(['hello', 'hello'], 1),
(['hello', 'hello'], 1),
(['hello', 'hello', 'hello'], 0),
(['hello', 'hello', 'hello'], 0),
(['hello', 'hello', 'hello'], 0)],
[])
==================================================
INPUT: "to be or not to be", listPairs=True
OUTPUT:
([(['be', 'or', 'not', 'to', 'be', 'to'], 0),
(['be', 'to', 'be', 'or', 'not', 'to'], 0),
(['not', 'to', 'be', 'to', 'be', 'or'], 0),
(['or', 'not', 'to', 'be', 'to', 'be'], 0),
(['to', 'be', 'or', 'not', 'to', 'be'], 0),
(['to', 'be', 'to', 'be', 'or', 'not'], 0)],
[])
==================================================
INPUT: ". A B\n. A B C\n. A B C D", listPairs=True
OUTPUT:
([(['.', 'A', 'B'], 0),
(['.', 'A', 'B', 'C'], 1),
(['.', 'A', 'B', 'C', 'D'], 2),
(['A', 'B', '.'], 0),
(['A', 'B', 'C', '.'], 1),
(['A', 'B', 'C', 'D', '.'], 2),
(['B', '.', 'A'], 0),
(['B', 'C', '.', 'A'], 1),
(['B', 'C', 'D', '.', 'A'], 2),
(['C', '.', 'A', 'B'], 1),
(['C', 'D', '.', 'A', 'B'], 2),
(['D', '.', 'A', 'B', 'C'], 2)],
[(('a', 'b'), 3), (('a', 'c'), 2), (('b', 'c'), 2)])
==================================================
INPUT: "a bad\ncat barks."
OUTPUT:
[(['a', 'bad'], 0),
(['bad', 'a'], 0),
(['barks.', 'cat'], 1),
(['cat', 'barks.'], 1)]
==================================================
INPUT: "This is not a sentence.\nNeither is this.",ignoreWords=["is."]
OUTPUT:
[(['a', 'sentence.', 'This', 'is', 'not'], 0),
(['is', 'not', 'a', 'sentence.', 'This'], 0),
(['is', 'this.', 'Neither'], 1),
(['Neither', 'is', 'this.'], 1),
(['not', 'a', 'sentence.', 'This', 'is'], 0),
(['sentence.', 'This', 'is', 'not', 'a'], 0),
(['This', 'is', 'not', 'a', 'sentence.'], 0),
(['this.', 'Neither', 'is'], 1)]
==================================================
INPUT: "This is not a sentence.\nNeither is this.",ignoreWords=["is"]
OUTPUT:
[(['a', 'sentence.', 'This', 'is', 'not'], 0),
(['Neither', 'is', 'this.'], 1),
(['not', 'a', 'sentence.', 'This', 'is'], 0),
(['sentence.', 'This', 'is', 'not', 'a'], 0),
(['This', 'is', 'not', 'a', 'sentence.'], 0),
(['this.', 'Neither', 'is'], 1)]
==================================================
INPUT: "hello hello\nhello hello"
OUTPUT:
[(['hello', 'hello'], 0),
(['hello', 'hello'], 0),
(['hello', 'hello'], 1),
(['hello', 'hello'], 1)]
==================================================
INPUT: "#!good morning", ignoreWords = ['!good']
OUTPUT:
[(['#!good', 'morning'], 0), (['morning', '#!good'], 0)]
==================================================
INPUT: "go!od morning-!", ignoreWords = ['good']
OUTPUT:
[(['morning-!', 'go!od'], 0)]
==================================================
INPUT: "#!good morning-!", ignoreWords = ['!GoOd']
OUTPUT:
[(['#!good', 'morning-!'], 0), (['morning-!', '#!good'], 0)]
==================================================
INPUT: "?!good morning-!", ignoreWords = ['!GoOd']
OUTPUT:
[(['?!good', 'morning-!'], 0), (['morning-!', '?!good'], 0)]
==================================================
INPUT: "?!go!!!od morning-!", ignoreWords = ['!GoOd']
OUTPUT:
[(['?!go!!!od', 'morning-!'], 0), (['morning-!', '?!go!!!od'], 0)]
==================================================
INPUT: 'This pair? is good.\n So is this pair and that pair',listPairs=True
OUTPUT:
([(['and', 'that', 'pair', 'So', 'is', 'this', 'pair'], 1),
(['good.', 'This', 'pair?', 'is'], 0),
(['is', 'good.', 'This', 'pair?'], 0),
(['is', 'this', 'pair', 'and', 'that', 'pair', 'So'], 1),
(['pair', 'and', 'that', 'pair', 'So', 'is', 'this'], 1),
(['pair', 'So', 'is', 'this', 'pair', 'and', 'that'], 1),
(['pair?', 'is', 'good.', 'This'], 0),
(['So', 'is', 'this', 'pair', 'and', 'that', 'pair'], 1),
(['that', 'pair', 'So', 'is', 'this', 'pair', 'and'], 1),
(['this', 'pair', 'and', 'that', 'pair', 'So', 'is'], 1),
(['This', 'pair?', 'is', 'good.'], 0)],
[(('is', 'pair'), 2), (('is', 'this'), 2), (('pair', 'this'), 2)])
==================================================
INPUT: "CS is cool"
OUTPUT:
[(['cool', 'CS', 'is'], 0),
(['CS', 'is', 'cool'], 0),
(['is', 'cool', 'CS'], 0)]
==================================================
INPUT: "a b\na b c\na b c d", listPairs=True
OUTPUT:
([(['a', 'b'], 0),
(['a', 'b', 'c'], 1),
(['a', 'b', 'c', 'd'], 2),
(['b', 'a'], 0),
(['b', 'c', 'a'], 1),
(['b', 'c', 'd', 'a'], 2),
(['c', 'a', 'b'], 1),
(['c', 'd', 'a', 'b'], 2),
(['d', 'a', 'b', 'c'], 2)],
[(('a', 'b'), 3), (('a', 'c'), 2), (('b', 'c'), 2)])
==================================================
INPUT: 'This pair? is good.\n So is this pair and that pair', listPairs=True
OUTPUT:
([(['and', 'that', 'pair', 'So', 'is', 'this', 'pair'], 1),
(['good.', 'This', 'pair?', 'is'], 0),
(['is', 'good.', 'This', 'pair?'], 0),
(['is', 'this', 'pair', 'and', 'that', 'pair', 'So'], 1),
(['pair', 'and', 'that', 'pair', 'So', 'is', 'this'], 1),
(['pair', 'So', 'is', 'this', 'pair', 'and', 'that'], 1),
(['pair?', 'is', 'good.', 'This'], 0),
(['So', 'is', 'this', 'pair', 'and', 'that', 'pair'], 1),
(['that', 'pair', 'So', 'is', 'this', 'pair', 'and'], 1),
(['this', 'pair', 'and', 'that', 'pair', 'So', 'is'], 1),
(['This', 'pair?', 'is', 'good.'], 0)],
[(('is', 'pair'), 2), (('is', 'this'), 2), (('pair', 'this'), 2)])

View File

@@ -0,0 +1,29 @@
"Design is hard.\nLet's just implement."
"Design is hard.\nLet's just implement.", ignoreWords=["is"]
"Design is hard.\nLet's just implement.", ignoreWords=["is"], listPairs=True
"Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!", listPairs=True
"Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!", listPairs=True, periodsToBreaks=True
". . a"
". . a", periodsToBreaks=True
". A B\n. A B C\n. A B C D", listPairs=True
"Hello world. This is a test\nhopefully it turns out okay", periodsToBreaks = True
"It's very nice to be footloose. \nWith just a toothbrush and a comb.\n"
"It's very nice to be footloose. \nWith just a toothbrush and a comb.\n", periodsToBreaks=True
"hello here, hello there, hello everywhere",listPairs = True
"hello here\nhello here again\nhello again", listPairs=True
"hello hello hello\nhello hello", listPairs=True
"to be or not to be", listPairs=True
". A B\n. A B C\n. A B C D", listPairs=True
"a bad\ncat barks."
"This is not a sentence.\nNeither is this.",ignoreWords=["is."]
"This is not a sentence.\nNeither is this.",ignoreWords=["is"]
"hello hello\nhello hello"
"#!good morning", ignoreWords = ['!good']
"go!od morning-!", ignoreWords = ['good']
"#!good morning-!", ignoreWords = ['!GoOd']
"?!good morning-!", ignoreWords = ['!GoOd']
"?!go!!!od morning-!", ignoreWords = ['!GoOd']
'This pair? is good.\n So is this pair and that pair',listPairs=True
"CS is cool"
"a b\na b c\na b c d", listPairs=True
'This pair? is good.\n So is this pair and that pair', listPairs=True

View File

@@ -0,0 +1,26 @@
From an architectural point of view, very little is different between the fast version of kwic and the baseline
implementation. However, the one architectural difference that is there is significant in terms of speed, the point of
creating this version in the first place. In the baseline kwic, the code acts almost entirely like a single black box
block. Data in to data out, all done in pure python. The fast version's biggest gains came from changing a core aspect
of how some of the loops were being done, in terms that they are now handled by compiled c code rather than running in
native python. This change was the use of the map function, an alternative to the traditional loop in python.
While this is an architecturally significant change, as we're now expanding the number of boxes AND languages, it also
came with the speed benefits of running code in c. This speed jump was most noticeable when used in the function that is
called as an argument "key" in the alphabetization sort calls. Another place where changes were made that helped
increase speed, though didn't necessarily change the code architecturally, was in loops where inherited methods were
called. So, for example, I originally had many loops that called "array_name.append()". This is slow because every time
the interpreter comes across that line in each loop, it has to process "array_name" to determine whether it contains
append, and what append is. By aliasing "array_name.append" as a new variable like so "aliased = array_name.append" the
interpreter only has to perform that lookup once. Then, by replacing the calls in the loop with my alias, I save that
lookup time for each iteration. While this didn't result in a massive increase like the use of map did, it did help
enough to mention. A few other minor changes were also made to help shave a couple tenths of a second off here and
there. There were many places where I was needlessly making copies of large arrays of data, simply to make code
readability better, but which took both time and ram to accomplish. By streamlining the use of existing variables, I
again managed to save little bits of time here and there. Now, once it came to enabling listPairs, the other very slow
part of the code, I decided to change the way that the words were sanitized as that's where the most speed loss was
found. In the original version, I used a join command on an empty string that stripped out unwanted characters,
essentially by brute force. For the fast version, I learned of a way to perform the same task using Python's
built-in translate function. This change nearly halved the time it took for just the listPairs section of the code to
run. In order to test all of this, I used Python's built-in cProfile tool, which times every user-written function call
and separately aggregates the times of all of Python's built-in ones. This made it very easy to see what parts
of the code needed to be focused on for speed increases.

View File

@@ -0,0 +1,14 @@
Compared to the original kwic, the testing version of kwic is quite different. The focus of this version was to pull out
much of the code from the main into functions so that sensitive portions could be easily tested separately, as well as
be changed more easily. From an architectural point of view, the original kwic is very much a traditional black box
approach. Data and flags in, one tiny function call used for alphabetization (as I had trouble with it and needed to
pull it out), and the final data came out. The testing version on the other hand is approximately 25% longer in terms of
pure code length, and is split into ten separate functions rather than the two for the original kwic. By splitting the
core features of the kwic system into these functions, it made testing the development of the code that much easier.
Rather than having to run through all the code up to the point I wanted to test, I could simply only call the functions
for features I was actively testing. Of course, adding all these extra function calls did affect performance slightly,
though not as much as I was expecting. This version is only marginally slower than the baseline implementation. I also
considered adding a flag to kwic that would enable debugging print statements, but I decided against it as (at least
in my personal experience) adding tons of print statements isn't always helpful. Generally, you only need
printing for a particular section of code, which could easily be manually added now that the important parts of the code
are broken out.

View File

@@ -0,0 +1,58 @@
def shift(line):
    """Return every circular rotation of ``line``.

    ``line`` is any sliceable sequence (kwic passes a list of words).
    Rotation ``i`` starts at element ``i`` and wraps around to the front, so
    the result holds ``len(line)`` sequences and an empty input gives ``[]``.
    """
    # range() instead of xrange() so the helper runs on Python 2 and 3.
    return [line[i:] + line[:i] for i in range(len(line))]
def cleanWord(word):
    """Return ``word`` lower-cased with the characters . , ? ! : removed.

    The punctuation is removed wherever it occurs, not only at the ends, so
    ``"go!od"`` becomes ``"good"``.  Other characters are left untouched.
    """
    # join() always produces a string.  filter() on a string only does so on
    # Python 2; on Python 3 it returns an iterator, which cannot be compared
    # or used as the dictionary key the callers build from this value.
    return "".join(c for c in word.lower() if c not in ".,?!:")
def ignorable(word, ignoreWords):
    """Tell whether ``word`` appears on the ignore list.

    ``word`` is normalised with ``cleanWord`` first.  The ignore entries are
    only lower-cased, so an entry that itself contains punctuation can never
    match a cleaned word.
    """
    cleaned = cleanWord(word)
    lowered_entries = [entry.lower() for entry in ignoreWords]
    return cleaned in lowered_entries
def splitBreaks(string, periodsToBreaks):
    """Split ``string`` into the lines that kwic indexes.

    When ``periodsToBreaks`` is false the text is split on newline
    characters.  Otherwise a new line starts at every space that directly
    follows a period which in turn directly follows a lower-case ASCII
    letter: the period ends the finished line and the space becomes the
    first character of the next one.  Newlines get no special treatment in
    that mode.  The result always has at least one entry (``[""]`` for an
    empty string).
    """
    if not periodsToBreaks:
        return string.split("\n")

    # A literal alphabet replaces map(chr, xrange(...)), which needs the
    # Python 2-only xrange.  Using a set keeps the membership test valid
    # while lastChar2 is still None.
    breakChars = set("abcdefghijklmnopqrstuvwxyz")
    lines = []
    current = []  # characters of the line being built, joined when it ends
    lastChar1 = None  # previous character
    lastChar2 = None  # character before the previous one
    for c in string:
        if c == " " and lastChar1 == "." and lastChar2 in breakChars:
            lines.append("".join(current))
            current = []
        current.append(c)
        lastChar2 = lastChar1
        lastChar1 = c
    lines.append("".join(current))
    return lines
def kwic(string, ignoreWords=None, listPairs=False, periodsToBreaks=False):
    """Build a keyword-in-context index of ``string``.

    The text is cut into lines by ``splitBreaks`` and each line into
    whitespace-separated words.  Every circular shift of every line is
    produced as a ``(words, line_number)`` tuple, shifts whose first word is
    ignorable are dropped, and the rest are returned sorted.

    Args:
        string: the document to index.
        ignoreWords: words that may not start an index entry (matched via
            ``ignorable``).  ``None`` means no words are ignored.
        listPairs: when true, also report word pairs that share a line in
            more than one line.
        periodsToBreaks: passed to ``splitBreaks`` to choose how lines are
            separated.

    Returns:
        The sorted list of ``(words, line_number)`` tuples, or, when
        ``listPairs`` is true, a tuple of that list and a sorted list of
        ``((word1, word2), count)`` entries with ``count > 1``.
    """
    # None instead of a shared mutable [] default.
    if ignoreWords is None:
        ignoreWords = []

    lines = splitBreaks(string, periodsToBreaks)
    splitLines = [l.split() for l in lines]

    if listPairs:
        pairs = {}
        for words in splitLines:
            # Clean each word once per line.  A pair is two different,
            # non-empty cleaned words in alphabetical order, and it counts
            # at most once per line however often the words repeat.
            unique = sorted(set(cleanWord(w) for w in words) - {""})
            for position, first in enumerate(unique):
                for second in unique[position + 1:]:
                    pairs[(first, second)] = pairs.get((first, second), 0) + 1

    # Lists rather than map()/filter() results, so the entries can be
    # indexed and sorted on Python 3 as well as Python 2.
    shiftedLines = [(rotation, i)
                    for i, words in enumerate(splitLines)
                    for rotation in shift(words)]
    filteredLines = [entry for entry in shiftedLines
                     if not ignorable(entry[0][0], ignoreWords)]

    if not listPairs:
        return sorted(filteredLines,
                      key=lambda entry: ([cleanWord(w) for w in entry[0]], entry[1]))

    # NOTE(review): with listPairs the index is ordered by lower-cased words
    # that keep their punctuation, while the branch above orders by cleaned
    # words.  Kept as found; confirm whether the difference is intended.
    repeatedPairs = [(wp, pairs[wp]) for wp in sorted(pairs) if pairs[wp] > 1]
    return (sorted(filteredLines,
                   key=lambda entry: ([w.lower() for w in entry[0]], entry[1])),
            repeatedPairs)

View File

@@ -0,0 +1,20 @@
- One version that has improved performance, and one to improve testability
- Performance
-- Faster by a good margin than the baseline version
- Testability
-- Make it easier to control and/or observe the behavior of the system
-- Ideas
--- Interfaces for controlling internal variables
--- Copious assertions
--- Limiting complexity
--- Changes that make the code timing more consistent
- Files (put in folder named perrenc361assign2.zip)
-- kwic.py : baseline version
-- fastkwic.py : better performance version
-- fastarch.txt : describes changes in architectural terms that made faster, and how you tested this (400 words)
-- fastarch.pdf : diagram of architecture
-- testkwic.py : highly testable version of kwic
-- testarch.txt : describes changes in architectural terms that made it more testable (200 words)
-- testarch.pdf : diagram of architecture

View File

@@ -0,0 +1,102 @@
def alphabetized_key(input_data):
    """Return the case-insensitive sort key for an ``(items, value)`` pair.

    Only the strings in ``input_data[0]`` are used, each lower-cased; the
    second element of the pair plays no part in the ordering.
    """
    # list(map(...)) keeps the loop inside the built-ins but yields a real
    # list, which sorted() can compare on Python 3 where map() returns an
    # iterator.
    return list(map(str.lower, input_data[0]))


def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build a keyword-in-context index of ``document``.

    Args:
        document: the text to index.
        listPairs: when true, also report pairs of words that share a
            sentence in more than one sentence.
        ignoreWords: words that may not start an index entry.  The first
            word of a shift is lower-cased and stripped of leading and
            trailing ``. : ! ? ,`` before the comparison.
        periodsToBreaks: when true, sentences end at periods instead of at
            newlines, and newlines are replaced by spaces.

    Returns:
        A list of ``(words, sentence_index)`` tuples, one per circular shift,
        sorted case-insensitively by their words.  With ``listPairs`` the
        result is a tuple of that list and a sorted list of
        ``((word1, word2), count)`` entries with ``count > 1``.  An empty
        document gives ``[]`` or ``([], [])`` respectively.
    """
    if not document:
        return ([], []) if listPairs else []

    if periodsToBreaks:
        split_into_sentences = []
        temp_sentence = ""
        last_index = len(document) - 1
        for current_index, current_value in enumerate(document):
            if current_value == '.':
                # The period is always kept.  Previously it was appended only
                # when it ended a sentence, so periods inside a token
                # ("v1.2") vanished from the text.
                temp_sentence += current_value
                # A period ends a sentence at either end of the document, or
                # between a lower-case letter and whitespace.  The "or" chain
                # short-circuits before an out-of-range index is read.
                if (current_index == 0) or (current_index == last_index) or \
                        (document[current_index - 1].islower() and
                         document[current_index + 1].isspace()):
                    split_into_sentences.append(temp_sentence)
                    temp_sentence = ""
            elif current_value == '\n':
                temp_sentence += " "
            else:
                temp_sentence += current_value
        if temp_sentence:
            split_into_sentences.append(temp_sentence)
    else:
        split_into_sentences = document.split('\n')

    # Words are separated by single spaces only; empty strings produced by
    # runs of spaces are discarded.
    split_into_word_tuples = []
    for sentence_index, sentence in enumerate(split_into_sentences):
        words = [word for word in sentence.split(" ") if word]
        split_into_word_tuples.append((words, sentence_index))

    circular_shifted_data = [
        (words[start:] + words[:start], sentence_index)
        for words, sentence_index in split_into_word_tuples
        for start in range(len(words))
    ]

    if ignoreWords:
        ignored = set(word.lower() for word in ignoreWords)
        # Build a new list.  Calling remove() on the list being iterated made
        # the loop skip the entry after each removal, so consecutive
        # ignorable shifts survived.
        circular_shifted_data = [
            entry for entry in circular_shifted_data
            if entry[0][0].lower().strip(".:!?,") not in ignored
        ]

    alphabetized_data = sorted(circular_shifted_data, key=alphabetized_key)
    if not listPairs:
        return alphabetized_data

    known_pairs = {}
    for words, _ in split_into_word_tuples:
        # Clean each word once per sentence.  A pair is two different,
        # non-empty cleaned words in alphabetical order, counted at most once
        # per sentence.
        cleaned = set("".join(char for char in word.lower() if char not in ".,?!:")
                      for word in words)
        cleaned.discard("")
        ordered = sorted(cleaned)
        for position, first in enumerate(ordered):
            for second in ordered[position + 1:]:
                known_pairs[(first, second)] = known_pairs.get((first, second), 0) + 1

    output_list = [(pair, count) for pair, count in known_pairs.items() if count > 1]
    output_list.sort(key=alphabetized_key)
    return alphabetized_data, output_list

View File

@@ -0,0 +1,115 @@
def alphabetized_key(input_data):
    """Return the case-insensitive sort key for an ``(items, value)`` pair.

    Only the strings in ``input_data[0]`` are used, each lower-cased; the
    second element of the pair plays no part in the ordering.
    """
    return [word.lower() for word in input_data[0]]


def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build a keyword-in-context index of ``document``.

    Args:
        document: the text to index.
        listPairs: when true, also report pairs of words that share a
            sentence in more than one sentence.
        ignoreWords: words that may not start an index entry.  The first
            word of a shift is lower-cased and stripped of leading and
            trailing ``. : ! ? ,`` before the comparison.
        periodsToBreaks: when true, sentences end at periods instead of at
            newlines, and newlines are replaced by spaces.

    Returns:
        A list of ``(words, sentence_index)`` tuples, one per circular shift,
        sorted case-insensitively by their words.  With ``listPairs`` the
        result is a tuple of that list and a sorted list of
        ``((word1, word2), count)`` entries with ``count > 1``.  An empty
        document gives ``[]`` or ``([], [])`` respectively.
    """
    if not document:
        return ([], []) if listPairs else []

    # Step 1: cut the document into sentences.
    if periodsToBreaks:
        split_into_sentences = []
        temp_sentence = ""
        document_length_zero_indexed = len(document) - 1
        for current_index, current_value in enumerate(document):
            if current_value == '.':
                # Keep every period.  It used to be appended only when it
                # closed a sentence, which deleted periods inside tokens such
                # as "v1.2".
                temp_sentence += current_value
                # Sentence end: first or last character of the document, or a
                # period between a lower-case letter and whitespace.  The
                # "or" chain stops before an out-of-range index is touched.
                if (current_index == 0) or (current_index == document_length_zero_indexed) or \
                        (document[current_index - 1].islower() and
                         document[current_index + 1].isspace()):
                    split_into_sentences.append(temp_sentence)
                    temp_sentence = ""
            elif current_value == '\n':
                temp_sentence += " "
            else:
                temp_sentence += current_value
        if temp_sentence:
            split_into_sentences.append(temp_sentence)
    else:
        split_into_sentences = document.split('\n')

    # Step 2: cut each sentence into words (single spaces only) and remember
    # which sentence the words came from.  A comprehension gives a real list
    # on Python 3, where filter() would return a one-shot iterator.
    split_into_word_tuples = []
    for index_incrementer, sentence in enumerate(split_into_sentences):
        words_array = [word for word in sentence.split(" ") if word]
        split_into_word_tuples.append((words_array, index_incrementer))

    # Step 3: every rotation of every sentence.
    circular_shifted_data = []
    for words_array, sentence_index in split_into_word_tuples:
        for index in range(len(words_array)):
            circular_shifted_data.append((words_array[index:] + words_array[:index], sentence_index))

    # Step 4: drop the rotations that start with an ignored word.
    if ignoreWords:
        lowered_input = [word.lower() for word in ignoreWords]
        circular_shifted_data = [
            current_tuple for current_tuple in circular_shifted_data
            if current_tuple[0][0].lower().strip(".:!?,") not in lowered_input
        ]

    # Step 5: alphabetize.
    alphabetized_data = sorted(circular_shifted_data, key=alphabetized_key)
    if not listPairs:
        return alphabetized_data

    # Step 6: count, per sentence, each pair of different non-empty cleaned
    # words.  Sorting the unique words first puts every pair in alphabetical
    # order and visits it exactly once per sentence.
    char_set = ".,?!:"
    known_pairs = {}
    for sentence_array, _ in split_into_word_tuples:
        cleaned_words = set("".join(char for char in word.lower() if char not in char_set)
                            for word in sentence_array)
        cleaned_words.discard("")
        ordered_words = sorted(cleaned_words)
        for position, first in enumerate(ordered_words):
            for second in ordered_words[position + 1:]:
                known_pairs[(first, second)] = known_pairs.get((first, second), 0) + 1

    output_list = [(key, known_pairs[key]) for key in known_pairs if known_pairs[key] > 1]
    output_list.sort(key=alphabetized_key)
    return alphabetized_data, output_list

View File

@@ -0,0 +1,91 @@
import time
import kwic as kwic_original
import fastkwic as kwic_fast
import testkwic as kwic_test
# Benchmark configuration: number of timing rounds, and for each
# implementation whether to run it and whether to echo its output.
num_tests = 1
test_original_kwic = True
print_original_kwic = False
test_fast_kwic = True
print_fast_kwic = False
test_test_kwic = True
print_test_kwic = False

# Small hand-written sample documents.  They are not referenced again in
# this script; the timing runs use ``input_document`` below.
design_words_doc = "Design is hard.\nLet's just implement."
goodbye_buddy_doc = "Hello there.\nHello there, buddy.\nHello and goodbye, buddy.\nHello is like buddy Goodbye!"
hello_buddy_periods = "Hello there. Hello there, buddy. Hello and goodbye, buddy. Hello is like buddy Goodbye!"
letters_and_stuff = "It's very nice to be footloose. \nWith just a toothbrush and a comb.\n"

# Load the benchmark corpus in a single read.  The context manager closes
# the handle even if reading raises, which the manual open()/close() pair
# did not guarantee.
with open("test_documents/chesterton_short.txt", "r") as open_file:
    file_as_string = open_file.read()
input_document = file_as_string
# Script entry point: time each enabled kwic implementation on
# input_document for num_tests rounds, then report whether the outputs agree
# and the average run time per implementation.
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether the equality report further down runs once per round or once after
# the loop -- confirm against the real file before restructuring.
if __name__ == "__main__":
# Last result of each implementation; stays None when that implementation
# is disabled, which the equality report below then compares as-is.
original_output = None
fast_output = None
test_output = None
# Elapsed seconds per round, one list per implementation.
original_times = []
fast_times = []
test_times = []
for i in range(num_tests):
if test_original_kwic:
print "\nTesting kwic.py"
start_time = time.time()
# original_output = kwic_original.kwic(input_document)
original_output = kwic_original.kwic(input_document, listPairs=True)
# NOTE(review): the elapsed time is taken after this optional print, so
# enabling print_original_kwic inflates the measurement.
if print_original_kwic:
print original_output
total = time.time() - start_time
original_times.append(total)
print "kwic.py took " + str(total) + " seconds."
if test_fast_kwic:
print "\nTesting fastkwic.py"
start_time = time.time()
# fast_output = kwic_fast.kwic(input_document)
fast_output = kwic_fast.kwic(input_document, listPairs=True)
# Same caveat as above: printing is included in the measured time.
if print_fast_kwic:
print fast_output
total = time.time() - start_time
fast_times.append(total)
print "fastkwic.py took " + str(total) + " seconds."
if test_test_kwic:
print "\nTesting testkwic.py"
start_time = time.time()
# test_output = kwic_test.kwic(input_document)
test_output = kwic_test.kwic(input_document, listPairs=True)
# Same caveat as above: printing is included in the measured time.
if print_test_kwic:
print test_output
total = time.time() - start_time
test_times.append(total)
print "testkwic.py took " + str(total) + " seconds."
# Pairwise equality of the (index, pairs) results from the three modules.
print "\nOriginal == Fast: " + str(original_output == fast_output)
print "Original == Test: " + str(original_output == test_output)
print "Test == Fast: " + str(test_output == fast_output)
print "\n\n"
# Average elapsed time per implementation.
# NOTE(review): each average divides by len(...), which is 0 when
# num_tests is 0 -- that case would raise ZeroDivisionError.
if test_original_kwic:
print "Original Avg: " + str(sum(original_times)/ float(len(original_times)))
if test_fast_kwic:
print "Fast Avg: " + str(sum(fast_times) / float(len(fast_times)))
if test_test_kwic:
print "Test Avg: " + str(sum(test_times) / float(len(test_times)))

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,149 @@
def split_by_periods(document):
    """Split *document* into sentences at sentence-ending periods.

    A period ends a sentence when it is the first or last character of the
    document, or when it follows a lowercase letter and is followed by
    whitespace.  The terminating period stays attached to its sentence and
    every newline is replaced by a single space.

    NOTE(review): a period that does not end a sentence (for example the one
    in "3.14") is dropped from the output entirely -- confirm this is
    intended.

    :param document: text to split.
    :return: list of sentence strings; trailing text with no terminating
        period becomes the final entry.
    """
    sentences = []
    pending = []
    last_position = len(document) - 1
    for position, character in enumerate(document):
        if character != '.':
            # Newlines are flattened to spaces so a sentence may span lines.
            pending.append(" " if character == '\n' else character)
            continue
        # The edge checks come first so the neighbour lookups below never
        # index outside the document.
        ends_sentence = (
            position == 0
            or position == last_position
            or (document[position - 1].islower()
                and document[position + 1].isspace())
        )
        if ends_sentence:
            pending.append(character)
            sentences.append("".join(pending))
            pending = []
    if pending:
        sentences.append("".join(pending))
    return sentences
def split_by_word_as_tuples(sentence_array):
    """Tokenise each sentence and tag it with its position in the input.

    Sentences are split on single space characters only; the empty strings
    produced by leading, trailing or repeated spaces are discarded.

    :param sentence_array: iterable of sentence strings.
    :return: list of ``(words, index)`` tuples, where ``words`` is a list of
        the non-empty tokens and ``index`` is the sentence's 0-based
        position in ``sentence_array``.
    """
    output_array = []
    for sentence_index, sentence in enumerate(sentence_array):
        # A list comprehension (rather than ``filter``) yields a real list
        # on both Python 2 and Python 3; the circular-shift step slices it.
        words = [word for word in sentence.split(" ") if word]
        output_array.append((words, sentence_index))
    return output_array
def array_circular_shift(input_array, rotate_val):
    """Return a copy of *input_array* rotated left by *rotate_val* places.

    :param input_array: sequence supporting slicing and ``+``.
    :param rotate_val: number of leading items to move to the end.
    :return: new sequence; the input is not modified.
    """
    moved_to_end = input_array[:rotate_val]
    new_front = input_array[rotate_val:]
    return new_front + moved_to_end
def fill_with_circular_shifts_and_original(sentence_array):
    """Expand every sentence into all of its circular shifts.

    :param sentence_array: iterable of ``(words, index)`` tuples.
    :return: list of ``(shifted_words, index)`` tuples: for each sentence,
        one entry per word, starting with the unshifted sentence.  A
        sentence with no words contributes nothing.
    """
    shifted_entries = []
    for words, sentence_index in sentence_array:
        for offset in range(len(words)):
            # Rotate left by ``offset``: the tail first, then the head.
            rotated = words[offset:] + words[:offset]
            shifted_entries.append((rotated, sentence_index))
    return shifted_entries
def alphabetize_tuple_list(input_array):
    """Return the entries of *input_array* as a new, sorted list.

    Ordering is delegated to ``alphabetized_key``; the input is left
    untouched.

    :param input_array: iterable of ``(words, index)`` tuples.
    :return: new sorted list of the same tuples.
    """
    ordered = list(input_array)
    ordered.sort(key=alphabetized_key)
    return ordered
def alphabetized_key(input_data):
    """Build a case-insensitive sort key for one index entry.

    :param input_data: sequence whose first item is an iterable of words,
        e.g. a ``(words, index)`` tuple.
    :return: list of the words, each lowercased.
    """
    return [word.lower() for word in input_data[0]]
def remove_words(input_array, words):
    """Drop index entries whose leading word is in the ignore list.

    The leading word is lowercased and stripped of leading/trailing
    ``.:!?,`` characters before being compared with the lowercased ignore
    words.

    :param input_array: iterable of ``(words, index)`` tuples; every word
        list must be non-empty.
    :param words: iterable of words to ignore (any case).
    :return: new list with the surviving entries in their original order.
    """
    ignored = set(word.lower() for word in words)
    return [
        entry for entry in input_array
        if entry[0][0].lower().strip(".:!?,") not in ignored
    ]
def create_list_pairs(input_array):
    """List the word pairs that occur together in more than one sentence.

    Words are normalised with ``sanitize_word``; words that normalise to the
    empty string are ignored.  A pair is counted at most once per sentence
    and is always stored with the smaller word first.

    :param input_array: iterable of ``(words, index)`` tuples.
    :return: list of ``((first, second), count)`` entries with ``count > 1``,
        ordered by ``alphabetized_key`` of the pair.
    """
    known_pairs = {}
    for sentence_array, _ in input_array:
        # Normalise each word once per sentence and collapse duplicates,
        # instead of re-sanitising both words for every ordered pair.
        unique_words = set(sanitize_word(word) for word in sentence_array)
        unique_words.discard("")
        ordered_words = sorted(unique_words)
        # Walking the sorted list pairs each word only with the larger
        # words after it, so every unordered pair is produced exactly once
        # and already in (smaller, larger) order.
        for position, first in enumerate(ordered_words):
            for second in ordered_words[position + 1:]:
                pair = (first, second)
                known_pairs[pair] = known_pairs.get(pair, 0) + 1
    output_list = [
        (pair, count) for pair, count in known_pairs.items() if count > 1
    ]
    output_list.sort(key=alphabetized_key)
    return output_list
def sanitize_word(input_word):
    """Lowercase *input_word* and delete every ``. , ? ! :`` character.

    :param input_word: word to normalise.
    :return: the normalised word (possibly the empty string).
    """
    cleaned = input_word.lower()
    for unwanted in ".,?!:":
        cleaned = cleaned.replace(unwanted, "")
    return cleaned
def return_ordered_words(word_one, word_two):
    """Return the two words as a tuple with the smaller one first.

    :param word_one: first word.
    :param word_two: second word.
    :return: ``(smaller, larger)`` according to ``<``.
    """
    return (word_one, word_two) if word_one < word_two else (word_two, word_one)
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build a keyword-in-context index for *document*.

    :param document: text to index.
    :param listPairs: when true, also return the word pairs produced by
        ``create_list_pairs``.
    :param ignoreWords: optional words; shifts that start with one of them
        are removed from the index.
    :param periodsToBreaks: when true, sentences are found with
        ``split_by_periods``; otherwise the text is split on newlines.
    :return: the sorted list of ``(shifted_words, sentence_index)`` tuples,
        or ``(index, pairs)`` when *listPairs* is true.  An empty document
        gives ``[]`` or ``([], [])`` respectively.
    """
    if not document:
        # Nothing to index: keep the return shape of the requested mode.
        return ([], []) if listPairs else []

    if periodsToBreaks:
        sentences = split_by_periods(document)
    else:
        sentences = document.split('\n')

    word_tuples = split_by_word_as_tuples(sentences)
    shifts = fill_with_circular_shifts_and_original(word_tuples)
    if ignoreWords:
        shifts = remove_words(shifts, ignoreWords)
    index = alphabetize_tuple_list(shifts)

    if not listPairs:
        return index
    # Pairs come from the unshifted sentences, so ignoreWords does not
    # affect them.
    return index, create_list_pairs(word_tuples)

View File

@@ -0,0 +1,84 @@
# eventkwic.txt by Corwin Perren
###########################################
########## Specification Writeup ##########
###########################################
My specification for the Kwic class is as follows. Once Kwic is constructed, it waits for a method call. If no text has
been added to the implementation and either index or listPairs is called, they will return empty arrays. Once text is
added using addText, that text is appended to a class variable as one long string. It is done this way to make sure that
lines that may be broken up into multiple addText calls still work properly. Now that there is text to process, a call
to index will process the whole string that's been stored and return the proper kwic index. If listPairs is called
instead, it will internally run index, but not return the indexed data, so that the new text is processed. It then
creates the pairs and returns the array. Index works semi-incrementally. Rather than keeping track of every line of text
ever added to the class, index processes the new data into the array of tuples (again, just for the new input), appends
it to the class's global array, and then re-sorts it. In this way, we avoid having to recompute the sentences that have
already been processed, apart from having to re-alphabetize. I chose not to recompute the sentences each time there was a
new call to addText, as that seemed like a waste of CPU cycles. It's basically a trade-off of having a call to index
be fast, or having the call to addText be fast. One potential option I considered was to use multi-threading to
continually process the input text for both indexing and listing pairs. While this would have greatly increased the
complexity (especially where eventspec is concerned), it also would have resulted in a faster implementation.
I used the eventspec class and the kwic.fsm state machine to verify that all of the above processes happen correctly,
including handling the extra constructor fields for periodsToBreaks and ignoreWords when they are used. By printing out
the steps taken through processing, it is very easy to see the program take the correct steps. In the case that a
catastrophic state logic error has occurred, the EventSpec class will stop execution with a trace statement, making
it very easy to diagnose what incorrect step was taken and correct it. I decided not to place event calls where there
was the potential for large loops to keep the log of steps taken clear and concise. Had I placed further logic to handle
these loops, there could potentially be hundreds, thousands, or more logged steps that would make debugging difficult.
In a sense, using EventSpec performs a similar function to unit testing: if you ever change your code in a way that
causes it to skip a step or produce fatally incorrect output, the class will let you know roughly where the changed
code is, and therefore give you a good idea about what went wrong.
#####################################
########## Trace Output #1 ##########
#####################################
# Code run
kc = Kwic(periodsToBreaks=True)
kc.addText("This pair? is good.\n So is this pair and that pair")
kc.index()
kc.reset()
kc.addText("This pair? is good.\n So is this pair and that pair")
kc.listPairs()
kc.print_eventspec_log()
# Trace Output
STEP #0: callConstructor --> idle
STEP #1: callAddText --> idle
STEP #2: callIndex --> processIndex
STEP #3: callProcessIndex --> checkIfText
STEP #4: callSplitPeriods --> splitIntoTuples
STEP #5: callSplitAsTuples --> fillCircular
STEP #6: callFillCircular --> checkIgnoreOrAlpha
STEP #7: callAlphabetize --> idle
STEP #8: callReset --> idle
STEP #9: callAddText --> idle
STEP #10: callListPairs --> processListPairs
STEP #11: callProcessListPairs --> listPairsIndexOrList
STEP #12: callProcessIndex --> checkIfText
STEP #13: callSplitPeriods --> splitIntoTuples
STEP #14: callSplitAsTuples --> fillCircular
STEP #15: callFillCircular --> checkIgnoreOrAlpha
STEP #16: callAlphabetize --> idle
STEP #17: callCreateListPairs --> idle
#####################################
########## Trace Output #2 ##########
#####################################
# Code run
kc = Kwic(ignoreWords=["and", "So"])
kc.addText("This pair? is good.\n So is this pair and that pair")
kc.listPairs()
kc.print_eventspec_log()
# Trace Output
STEP #0: callConstructor --> idle
STEP #1: callAddText --> idle
STEP #2: callListPairs --> processListPairs
STEP #3: callProcessListPairs --> listPairsIndexOrList
STEP #4: callProcessIndex --> checkIfText
STEP #5: processNewlineSplit --> splitIntoTuples
STEP #6: callSplitAsTuples --> fillCircular
STEP #7: callFillCircular --> checkIgnoreOrAlpha
STEP #8: callRemoveWords --> removingWords
STEP #9: callAlphabetize --> idle
STEP #10: callCreateListPairs --> idle

View File

@@ -0,0 +1,66 @@
class EventSpec(object):
    """Finite state machine that validates and records a sequence of events.

    The machine is loaded from a text file (see ``readfsm``).  Each call to
    ``event`` follows one transition, appends ``(event, new_state)`` to
    ``self.trace`` and then runs the callback registered for that event, if
    there is one.
    """

    def readfsm(self, file):
        """Parse a finite state machine definition file.

        Recognised lines (whitespace-separated tokens):

        * ``begin: NAME``    -- NAME is the start state.
        * ``state: NAME``    -- starts the definition of state NAME.
        * ``EVENT -> NAME``  -- in the state being defined, EVENT leads to
          state NAME.

        Blank lines and lines whose first token starts with ``#`` are
        skipped.  Transitions that appear before the first ``state:`` line
        are discarded.

        :param file: path of the definition file.
        :return: ``(begin_state, machine)`` where ``machine`` maps each
            state name to a ``{event: next_state}`` dict.
        :raises SyntaxError: on a malformed line, a state defined twice, or
            a file without a ``begin:`` line.
        """
        fsm = {}
        begin_state = None
        current_state = None
        transitions = {}

        def store_current_state():
            # Shared by the mid-file and end-of-file paths so a duplicate
            # definition of the *last* state is rejected as well.
            if current_state in fsm:
                raise SyntaxError("Cannot define state " + current_state + " twice")
            fsm[current_state] = transitions

        with open(file) as f:
            for l in f:
                ls = l.split()
                if not ls or ls[0][0] == "#":
                    continue
                # The length checks send short, malformed lines to the
                # SyntaxError branch instead of failing with IndexError.
                if ls[0] == "state:" and len(ls) >= 2:
                    if current_state is not None:
                        store_current_state()
                    current_state = ls[1]
                    transitions = {}
                elif len(ls) >= 3 and ls[1] == "->":
                    transitions[ls[0]] = ls[2]
                elif ls[0] == "begin:" and len(ls) >= 2:
                    begin_state = ls[1]
                else:
                    raise SyntaxError(l + " is not a line in a finite state machine definition")
            if current_state is not None:
                store_current_state()
        if begin_state is None:
            raise SyntaxError("No begin: line found in finite state machine definition")
        return (begin_state, fsm)

    def __init__(self, file):
        """Load the machine from *file* and position it at its start state."""
        (self.start, self.machine) = self.readfsm(file)
        self.state = self.start
        self.trace = []
        self.triggers = {}

    def onEvent(self, event, action):
        """Register *action* (a zero-argument callable) to run on *event*.

        A later registration for the same event replaces the earlier one.
        """
        self.triggers[event] = action

    def reset(self):
        """Return to the start state and clear the recorded trace.

        Registered callbacks are kept.
        """
        self.state = self.start
        self.trace = []

    # NOTE(review): the two accessors below are shadowed on every instance
    # by the ``trace`` / ``state`` attributes assigned in __init__ and
    # reset(), so ``instance.trace`` is the list and ``instance.state`` is
    # the state name, not these methods.  They are kept so the class
    # namespace is unchanged.
    def trace(self):
        return self.trace

    def state(self):
        return self.state

    def printLog(self):
        """Print one ``STEP #n: event --> state`` line per recorded step."""
        for i, (e, s) in enumerate(self.trace):
            # A single pre-formatted string prints the same text whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print(" STEP #%d: %s --> %s" % (i, e, s))

    def event(self, event):
        """Follow the transition for *event* from the current state.

        :param event: event name.
        :raises RuntimeError: if the current state has no transition for
            *event*; the state and trace are left unchanged.
        """
        # Only the table lookup is guarded, so a KeyError raised inside a
        # callback is not misreported as an illegal transition.
        try:
            next_state = self.machine[self.state][event]
        except KeyError:
            raise RuntimeError("From state " + self.state + ", transition " + event + " is not allowed (trace: " + str(self.trace) + ")")
        self.state = next_state
        self.trace.append((event, next_state))
        if event in self.triggers:
            self.triggers[event]()

View File

@@ -0,0 +1,39 @@
begin: waitForStart
state: waitForStart
callConstructor -> idle
state: idle
callAddText -> idle
callIndex -> processIndex
callListPairs -> processListPairs
callReset -> idle
callCreateListPairs -> idle
state: processIndex
callProcessIndex -> checkIfText
state: checkIfText
callSplitPeriods -> splitIntoTuples
processNewlineSplit -> splitIntoTuples
processNoNewText -> idle
state: splitIntoTuples
callSplitAsTuples -> fillCircular
state: fillCircular
callFillCircular -> checkIgnoreOrAlpha
state: checkIgnoreOrAlpha
callAlphabetize -> idle
callRemoveWords -> removingWords
state: removingWords
callAlphabetize -> idle
state: processListPairs
callProcessListPairs -> listPairsIndexOrList
state: listPairsIndexOrList
callProcessIndex -> checkIfText
callCreateListPairs -> idle

View File

@@ -0,0 +1,270 @@
# Created by Corwin Perren
######################################################
########## Begin Kwic Class Implementation ###########
######################################################
class Kwic(object):
    """Incremental keyword-in-context (KWIC) indexer traced by an EventSpec.

    Text is queued with ``addText`` and only processed when ``index`` or
    ``listPairs`` is called.  Every public call and each processing stage
    reports an event to the state machine loaded from ``kwic.fsm``, so the
    order of the ``event(...)`` calls in this class must match that machine.

    NOTE(review): sentence indices restart at 0 for every batch of queued
    text that gets processed, so entries from different batches can share an
    index -- confirm whether that is intended.
    """

    # Characters treated as punctuation when comparing words.
    _PUNCTUATION = ".,?!:"

    def __init__(self, ignoreWords=None, periodsToBreaks=False):
        """Create an empty indexer.

        :param ignoreWords: optional words; circular shifts that start with
            one of them are left out of the index.
        :param periodsToBreaks: when true, sentences are delimited by
            periods instead of newlines.
        """
        self.ignore_words = ignoreWords
        self.periods_to_breaks = periodsToBreaks
        self.all_sentence_tuples = []
        self.kwic_output_array = []
        self.list_pairs_output_array = []
        self.text_to_add = ""
        self.event_spec_instance = EventSpec("kwic.fsm")
        self.event_spec_instance.event("callConstructor")

    def addText(self, new_text):
        """Queue *new_text* (separated by a space) for the next processing."""
        self.event_spec_instance.event("callAddText")
        self.text_to_add += " " + str(new_text)

    def index(self):
        """Process any queued text and return the sorted KWIC index.

        :return: list of ``(shifted_words, sentence_index)`` tuples.
        """
        self.event_spec_instance.event("callIndex")
        self.__process_kwic_index()
        return self.kwic_output_array

    def listPairs(self):
        """Process any queued text and return the repeated word pairs.

        :return: list of ``((first, second), count)`` entries for pairs
            that occur together in more than one sentence.
        """
        self.event_spec_instance.event("callListPairs")
        self.__process_list_pairs()
        return self.list_pairs_output_array

    def reset(self):
        """Discard all queued text and all results computed so far."""
        self.event_spec_instance.event("callReset")
        self.all_sentence_tuples = []
        self.kwic_output_array = []
        self.list_pairs_output_array = []
        # Must be a string: addText() appends with ``+=`` and the newline
        # path calls ``.split``; resetting to a list broke both.
        self.text_to_add = ""

    def print_eventspec_log(self):
        """Print the state machine's recorded event trace."""
        self.event_spec_instance.printLog()

    def __process_kwic_index(self):
        """Fold the queued text into the index, then re-sort the index."""
        self.event_spec_instance.event("callProcessIndex")
        if not self.text_to_add:
            self.event_spec_instance.event("processNoNewText")
            return

        if self.periods_to_breaks:
            # __split_by_periods reports its own event.
            sentences = self.__split_by_periods(self.text_to_add)
        else:
            self.event_spec_instance.event("processNewlineSplit")
            sentences = self.text_to_add.split('\n')

        new_tuples = self.__split_by_word_as_tuples(sentences)
        self.all_sentence_tuples.extend(new_tuples)

        shifted = self.__fill_with_circular_shifts_and_original(new_tuples)
        if self.ignore_words:
            shifted = self.__remove_words(shifted, self.ignore_words)
        self.kwic_output_array.extend(shifted)

        self.text_to_add = ""
        self.kwic_output_array = self.__alphabetize_tuple_list(self.kwic_output_array)

    def __process_list_pairs(self):
        """Index any queued text, then rebuild the pair list."""
        self.event_spec_instance.event("callProcessListPairs")
        if self.text_to_add:
            self.__process_kwic_index()
        self.list_pairs_output_array = self.__create_list_pairs(self.all_sentence_tuples)

    def __fill_with_circular_shifts_and_original(self, sentence_array):
        """Return every circular shift of every ``(words, index)`` tuple."""
        self.event_spec_instance.event("callFillCircular")
        output_array = []
        for words, sentence_index in sentence_array:
            for offset in range(len(words)):
                output_array.append((self.__array_circular_shift(words, offset), sentence_index))
        return output_array

    def __alphabetize_tuple_list(self, input_array):
        """Return a new list of the entries sorted case-insensitively."""
        self.event_spec_instance.event("callAlphabetize")
        return sorted(input_array, key=self.__alphabetized_key)

    def __create_list_pairs(self, input_array):
        """Return ``((first, second), count)`` for pairs seen in 2+ sentences.

        Words are compared after ``__sanitize_word``; a pair counts at most
        once per sentence and is stored with the smaller word first.
        """
        self.event_spec_instance.event("callCreateListPairs")
        known_pairs = {}
        for sentence_array, _ in input_array:
            # Sanitise each word once per sentence rather than once per
            # ordered pair, and drop words that were only punctuation.
            unique_words = set(self.__sanitize_word(word) for word in sentence_array)
            unique_words.discard("")
            ordered_words = sorted(unique_words)
            # Pairing each word only with the larger words after it yields
            # every unordered pair once, already in (smaller, larger) order.
            for position, first in enumerate(ordered_words):
                for second in ordered_words[position + 1:]:
                    pair = (first, second)
                    known_pairs[pair] = known_pairs.get(pair, 0) + 1
        output_list = [(pair, count) for pair, count in known_pairs.items() if count > 1]
        output_list.sort(key=self.__alphabetized_key)
        return output_list

    def __split_by_periods(self, document):
        """Split *document* into sentences at sentence-ending periods.

        A period ends a sentence when it is the first or last character, or
        when it follows a lowercase letter and precedes whitespace.
        Newlines become spaces.

        NOTE(review): periods that do not end a sentence are dropped from
        the text -- confirm this is intended.
        """
        self.event_spec_instance.event("callSplitPeriods")
        output_array = []
        pending = []
        last_position = len(document) - 1
        for position, character in enumerate(document):
            if character != '.':
                pending.append(" " if character == '\n' else character)
                continue
            # Edge checks first so the neighbour lookups stay in range.
            if (position == 0 or position == last_position or
                    (document[position - 1].islower() and document[position + 1].isspace())):
                pending.append(character)
                output_array.append("".join(pending))
                pending = []
        if pending:
            output_array.append("".join(pending))
        return output_array

    def __split_by_word_as_tuples(self, sentence_array):
        """Return ``(words, index)`` for each sentence, split on spaces."""
        self.event_spec_instance.event("callSplitAsTuples")
        output_array = []
        for sentence_index, sentence in enumerate(sentence_array):
            # A comprehension gives a real list on Python 2 and 3 alike;
            # the circular shift slices it.
            words = [word for word in sentence.split(" ") if word]
            output_array.append((words, sentence_index))
        return output_array

    def __array_circular_shift(self, input_array, rotate_val):
        """Return *input_array* rotated left by *rotate_val* places."""
        return input_array[rotate_val:] + input_array[:rotate_val]

    def __alphabetized_key(self, input_data):
        """Sort key: the lowercased words of ``input_data[0]``."""
        return [word.lower() for word in input_data[0]]

    def __remove_words(self, input_array, words):
        """Drop entries whose first word, minus punctuation, is in *words*."""
        self.event_spec_instance.event("callRemoveWords")
        ignored = set(word.lower() for word in words)
        return [
            entry for entry in input_array
            if entry[0][0].lower().strip(self._PUNCTUATION) not in ignored
        ]

    def __sanitize_word(self, input_word):
        """Lowercase *input_word* and delete all punctuation characters."""
        # ``str.translate(None, chars)`` exists only on Python 2; this form
        # gives the same result on both major versions.
        return "".join(char for char in input_word.lower() if char not in self._PUNCTUATION)

    def __return_ordered_words(self, word_one, word_two):
        """Return the two words as a tuple with the smaller one first."""
        if word_one < word_two:
            return word_one, word_two
        return word_two, word_one
######################################################
########## Begin Provided EventSpec Include ##########
######################################################
class EventSpec(object):
    """Finite state machine that validates and records a sequence of events.

    The machine is loaded from a text file (see ``readfsm``).  Each call to
    ``event`` follows one transition, appends ``(event, new_state)`` to
    ``self.trace`` and then runs the callback registered for that event, if
    there is one.
    """

    def readfsm(self, file):
        """Parse a finite state machine definition file.

        Recognised lines (whitespace-separated tokens):

        * ``begin: NAME``    -- NAME is the start state.
        * ``state: NAME``    -- starts the definition of state NAME.
        * ``EVENT -> NAME``  -- in the state being defined, EVENT leads to
          state NAME.

        Blank lines and lines whose first token starts with ``#`` are
        skipped.  Transitions that appear before the first ``state:`` line
        are discarded.

        :param file: path of the definition file.
        :return: ``(begin_state, machine)`` where ``machine`` maps each
            state name to a ``{event: next_state}`` dict.
        :raises SyntaxError: on a malformed line, a state defined twice, or
            a file without a ``begin:`` line.
        """
        fsm = {}
        begin_state = None
        current_state = None
        transitions = {}

        def store_current_state():
            # Shared by the mid-file and end-of-file paths so a duplicate
            # definition of the *last* state is rejected as well.
            if current_state in fsm:
                raise SyntaxError("Cannot define state " + current_state + " twice")
            fsm[current_state] = transitions

        with open(file) as f:
            for l in f:
                ls = l.split()
                if not ls or ls[0][0] == "#":
                    continue
                # The length checks send short, malformed lines to the
                # SyntaxError branch instead of failing with IndexError.
                if ls[0] == "state:" and len(ls) >= 2:
                    if current_state is not None:
                        store_current_state()
                    current_state = ls[1]
                    transitions = {}
                elif len(ls) >= 3 and ls[1] == "->":
                    transitions[ls[0]] = ls[2]
                elif ls[0] == "begin:" and len(ls) >= 2:
                    begin_state = ls[1]
                else:
                    raise SyntaxError(l + " is not a line in a finite state machine definition")
            if current_state is not None:
                store_current_state()
        if begin_state is None:
            raise SyntaxError("No begin: line found in finite state machine definition")
        return (begin_state, fsm)

    def __init__(self, file):
        """Load the machine from *file* and position it at its start state."""
        (self.start, self.machine) = self.readfsm(file)
        self.state = self.start
        self.trace = []
        self.triggers = {}

    def onEvent(self, event, action):
        """Register *action* (a zero-argument callable) to run on *event*.

        A later registration for the same event replaces the earlier one.
        """
        self.triggers[event] = action

    def reset(self):
        """Return to the start state and clear the recorded trace.

        Registered callbacks are kept.
        """
        self.state = self.start
        self.trace = []

    # NOTE(review): the two accessors below are shadowed on every instance
    # by the ``trace`` / ``state`` attributes assigned in __init__ and
    # reset(), so ``instance.trace`` is the list and ``instance.state`` is
    # the state name, not these methods.  They are kept so the class
    # namespace is unchanged.
    def trace(self):
        return self.trace

    def state(self):
        return self.state

    def printLog(self):
        """Print one ``STEP #n: event --> state`` line per recorded step."""
        for i, (e, s) in enumerate(self.trace):
            # A single pre-formatted string prints the same text whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print(" STEP #%d: %s --> %s" % (i, e, s))

    def event(self, event):
        """Follow the transition for *event* from the current state.

        :param event: event name.
        :raises RuntimeError: if the current state has no transition for
            *event*; the state and trace are left unchanged.
        """
        # Only the table lookup is guarded, so a KeyError raised inside a
        # callback is not misreported as an illegal transition.
        try:
            next_state = self.machine[self.state][event]
        except KeyError:
            raise RuntimeError("From state " + self.state + ", transition " + event + " is not allowed (trace: " + str(
                self.trace) + ")")
        self.state = next_state
        self.trace.append((event, next_state))
        if event in self.triggers:
            self.triggers[event]()

View File

@@ -0,0 +1,13 @@
begin: start
state: start
a -> infinibs
b -> infinias
state: infinibs
b -> infinibs
a -> start
state: infinias
a -> infinias
b -> start

View File

@@ -0,0 +1,12 @@
# For testing implementation
from kwic import Kwic
# Expected result of Kwic.index() for the two-line sample text used below.
output_check = [
    (['and', 'that', 'pair', 'So', 'is', 'this', 'pair'], 1),
    (['good.', 'This', 'pair?', 'is'], 0),
    (['is', 'good.', 'This', 'pair?'], 0),
    (['is', 'this', 'pair', 'and', 'that', 'pair', 'So'], 1),
    (['pair', 'and', 'that', 'pair', 'So', 'is', 'this'], 1),
    (['pair', 'So', 'is', 'this', 'pair', 'and', 'that'], 1),
    (['pair?', 'is', 'good.', 'This'], 0),
    (['So', 'is', 'this', 'pair', 'and', 'that', 'pair'], 1),
    (['that', 'pair', 'So', 'is', 'this', 'pair', 'and'], 1),
    (['this', 'pair', 'and', 'that', 'pair', 'So', 'is'], 1),
    (['This', 'pair?', 'is', 'good.'], 0),
]

# Expected result of Kwic.listPairs() for the same text.
output_check_pairs = [
    (('is', 'pair'), 2),
    (('is', 'this'), 2),
    (('pair', 'this'), 2),
]

if __name__ == "__main__":
    kc = Kwic()
    kc.addText("This pair? is good.\n So is this pair and that pair")
    # listPairs() is called first, then index(); each line printed is True
    # when the result matches the expectation above.
    pairs_match = kc.listPairs() == output_check_pairs
    print(pairs_match)
    index_matches = kc.index() == output_check
    print(index_matches)
    kc.print_eventspec_log()

View File

@@ -0,0 +1,29 @@
from assign3 import eventspec
# Load the state machine and drive it through a fixed event sequence, then
# print the recorded trace.
es = eventspec.EventSpec("kwic.fsm")
for event_name in ("a", "b", "b", "b", "b", "a"):
    es.event(event_name)
es.printLog()
# Beatles code
def printHello():
    """Print ``Hello``; registered below as the callback for event "a"."""
    print("Hello")
def printGoodbye():
    """Print ``Goodbye``; registered below as the callback for event "b"."""
    print("Goodbye")
# Attach a callback to each event, replay a second event sequence so the
# callbacks fire, then print the accumulated trace.
es.onEvent("a", printHello)
es.onEvent("b", printGoodbye)
for event_name in ("a", "a", "a", "b", "b", "a"):
    es.event(event_name)
es.printLog()