Files

103 lines
3.5 KiB
Python

def alphabetized_key(input_data):
    """Return a case-insensitive sort key for a ``(words, extra)`` entry.

    ``input_data[0]`` must be an iterable of strings (for example the word
    list of a circular shift, or a ``(first, second)`` word pair); the second
    element of ``input_data`` is ignored.

    Returns a list of the lower-cased words. A concrete list is returned
    rather than ``map(...)`` because on Python 3 ``map`` yields a lazy
    iterator, and iterators cannot be compared with each other when used as
    sort keys.
    """
    return [word.lower() for word in input_data[0]]
def kwic(document, listPairs=False, ignoreWords=None, periodsToBreaks=False):
    """Build a keyword-in-context (KWIC) index for ``document``.

    The document is cut into sentences, each sentence into words, and every
    circular shift of every sentence is emitted and sorted case-insensitively.

    Parameters:
        document: the text to index.
        listPairs: when true, also return the word pairs that occur together
            in more than one sentence.
        ignoreWords: optional iterable of words; shifts whose first word
            (lower-cased, with leading/trailing ``.:!?,`` stripped) is in
            this collection are dropped. Matching is case-insensitive.
        periodsToBreaks: when true, sentences end at periods (see
            ``_kwic_split_on_periods``) and newlines count as spaces;
            otherwise every newline-separated line is one sentence.

    Returns:
        A list of ``(words, sentence_index)`` tuples, where ``words`` is a
        list holding one circular shift. If ``listPairs`` is true, a tuple
        ``(shifts, pairs)`` where ``pairs`` is a sorted list of
        ``((first, second), count)`` entries. An empty document yields ``[]``
        or ``([], [])`` respectively.
    """
    if not document:
        return ([], []) if listPairs else []

    if periodsToBreaks:
        sentences = _kwic_split_on_periods(document)
    else:
        sentences = document.split('\n')

    # Words are separated by single spaces only; the emptiness test drops the
    # blanks produced by runs of spaces. A real list is built (not a lazy
    # filter object) because the words are sliced and iterated several times.
    indexed_words = [
        ([word for word in sentence.split(" ") if word], sentence_index)
        for sentence_index, sentence in enumerate(sentences)
    ]

    shifts = [
        (words[start:] + words[:start], sentence_index)
        for words, sentence_index in indexed_words
        for start, _word in enumerate(words)
    ]

    if ignoreWords:
        ignored = set(word.lower() for word in ignoreWords)
        # Build a new list instead of removing entries while iterating:
        # in-place removal skips the element that follows each removed one.
        shifts = [
            shift for shift in shifts
            if shift[0][0].lower().strip(".:!?,") not in ignored
        ]

    # list.sort is stable, so shifts with equal keys keep document order.
    shifts.sort(key=_kwic_lowercase_key)

    if not listPairs:
        return shifts
    return shifts, _kwic_repeated_pairs(indexed_words)


def _kwic_lowercase_key(entry):
    """Sort key: the lower-cased words of a ``(words, index)`` entry, as a list."""
    return [word.lower() for word in entry[0]]


def _kwic_split_on_periods(document):
    """Split ``document`` into sentences at sentence-ending periods.

    A period ends a sentence when it is the first or last character of the
    document, or when it follows a lower-case letter and precedes whitespace.
    The period stays attached to the sentence it closes. Any other period
    (as in ``3.5`` or ``U.S.``) is kept as ordinary text. Newlines are
    replaced by spaces.
    """
    sentences = []
    current = []
    last_index = len(document) - 1
    for index, char in enumerate(document):
        if char == '\n':
            current.append(" ")
            continue
        current.append(char)
        if char != '.':
            continue
        # The edge checks come first so the neighbour lookups below never
        # run for the first or last character.
        if (index == 0 or index == last_index
                or (document[index - 1].islower()
                    and document[index + 1].isspace())):
            sentences.append("".join(current))
            current = []
    if current:
        sentences.append("".join(current))
    return sentences


def _kwic_repeated_pairs(indexed_words):
    """Return the word pairs that co-occur in more than one sentence.

    Words are lower-cased and have every ``.,?!:`` character removed; words
    that become empty are ignored. Each unordered pair of distinct words is
    counted at most once per sentence and reported as
    ``((first, second), count)`` with ``first < second``. Only pairs with a
    count above one are returned, sorted by pair.
    """
    punctuation = ".,?!:"
    sentence_counts = {}
    for words, _sentence_index in indexed_words:
        normalized = set(
            "".join(char for char in word.lower() if char not in punctuation)
            for word in words
        )
        normalized.discard("")
        ordered = sorted(normalized)
        # Walking the sorted unique words yields each first < second pair
        # exactly once per sentence.
        for position, first in enumerate(ordered):
            for second in ordered[position + 1:]:
                pair = (first, second)
                sentence_counts[pair] = sentence_counts.get(pair, 0) + 1
    # Pairs are unique and already lower-case, so plain tuple ordering is the
    # case-insensitive pair ordering.
    return sorted(
        (pair, count) for pair, count in sentence_counts.items() if count > 1
    )