mirror of
https://github.com/caperren/school_archives.git
synced 2025-11-09 21:51:15 +00:00
Added missing classes from final year at OSU
Binary file not shown.
@@ -0,0 +1,37 @@
##### To Run This Program #####

Run one of the lines below, replacing arguments as necessary:

./bloom_filter.py -d dictionary.txt -i input.txt -o output3.txt output5.txt

OR

python3 bloom_filter.py -d dictionary.txt -i input.txt -o output3.txt output5.txt

Both of the above commands have been tested with this program on the OSU Flip servers.
There is no guarantee that this will run on any computers other than these servers!
No makefile is needed for this program.

##### Answers to Questions #####

a.
The functions I chose were ripemd160, sha256, whirlpool, md5, and DSA. These are all cryptographic hashes. I chose
them because they are less likely to generate collisions than non-cryptographic ones (at the expense of being slower),
and because they are the ones built into the hashlib library for python3 on the flip servers, which guarantees
that the grader will be able to run the program without having to install or include additional libraries.
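
A quick way to confirm those names on a given machine (a sketch, not part of the submission; availability of
ripemd160, whirlpool, and DSA depends on the OpenSSL build behind hashlib):

    import hashlib

    chosen = ["ripemd160", "sha256", "whirlpool", "md5", "DSA"]
    for name in chosen:
        # algorithms_available lists every name accepted by hashlib.new()
        print(name, name in hashlib.algorithms_available)
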

b.
ripemd160: 0.000006914
sha256: 0.00001025
whirlpool: 0.00001121
md5: 0.00001025
DSA: 0.000009298

ripemd160 and DSA are the fastest, though not by much. They perform better because their algorithms compute the hash
more quickly than the others. It is also likely that their hash outputs are shorter than those of functions like
sha256, which are quite large.
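
The timing harness is not part of the submission, but numbers like these can be gathered with a sketch along these
lines (the sample input and trial count below are arbitrary placeholders):

    import hashlib
    import time

    names = ["ripemd160", "sha256", "whirlpool", "md5", "DSA"]
    sample = b"password123"  # any representative input
    trials = 100000

    for name in names:
        start = time.time()
        for _ in range(trials):
            hashlib.new(name, sample).hexdigest()
        # average seconds per hash over all trials
        print("%s: %.9f" % (name, (time.time() - start) / trials))
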

c.
The probability of false positives is 1%, as I set the hash bit array size to 5976456, which I calculated using a
dictionary size of 623518. The sizing equation for the bit array gives
m = -(n * ln(p)) / (ln(2))^2 = -(623518 * ln(0.01)) / (ln(2))^2, which is approximately 5976456.
The probability of false negatives is 0%. It is not possible to have a false negative with a Bloom filter.
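
The same calculation as a standalone sketch (not part of bloom_filter.py):

    import math

    n = 623518  # number of dictionary passwords
    p = 0.01    # target false positive probability

    # Bloom filter bit array sizing: m = -(n * ln p) / (ln 2)^2
    m = int(-(n * math.log(p)) / (math.log(2) ** 2))
    print(m)    # 5976456, the HASH_ARRAY_SIZE used in bloom_filter.py
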

d.
The rate of false positives can be reduced by increasing the number of storable positions in the hash bit array, or
by reducing the number of hash functions used in order to reduce collisions.

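To illustrate the first point with the standard approximation p = (1 - e^(-k*n/m))^k (again a sketch, not code from
the submission), doubling the bit array size sharply lowers the expected false positive rate of the five-hash filter:

    import math

    n, k = 623518, 5
    for m in (5976456, 2 * 5976456):
        p = (1 - math.exp(-k * n / m)) ** k
        print(m, round(p, 4))  # roughly 0.0111, then 0.0006
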
@@ -0,0 +1,125 @@
#!/usr/bin/env python3

# ##### Includes #####
# System includes
import sys
import getopt
import hashlib
import random
from time import time

# ##### Global Variables #####
USAGE_STRING = "usage: ./bloom_filter.py -d dictionary.txt -i input.txt -o output3.txt output5.txt"
NUM_ARGUMENTS_CORRECT = 7

HASH_ARRAY_SIZE = 5976456  # should be a 1% false positive rate


ARGUMENT_MAPPING = {
    "dictionary": 1,
    "input": 3,
    "three_hash": 5,
    "five_hash": 6
}

AVAILABLE_HASHES = [
    "ripemd160",
    "sha256",
    "whirlpool",
    "md5",
    "DSA"
]


# ##### Bloom Filter Class #####
class BloomFilter(object):
    def __init__(self, arguments):
        super(BloomFilter, self).__init__()

        if len(arguments) != NUM_ARGUMENTS_CORRECT:
            print(USAGE_STRING)
            sys.exit(2)

        self.dictionary_path = None
        self.input_file_path = None
        self.three_hash_output_path = None
        self.five_hash_output_path = None

        # Bit arrays (kept as position -> 0/1 dictionaries) for the three-hash
        # and five-hash filters
        self.three_hash_dictionary = {i: 0 for i in range(HASH_ARRAY_SIZE)}
        self.five_hash_dictionary = {i: 0 for i in range(HASH_ARRAY_SIZE)}

        self.dictionary_path = arguments[ARGUMENT_MAPPING["dictionary"]]
        self.input_file_path = arguments[ARGUMENT_MAPPING["input"]]
        self.three_hash_output_path = arguments[ARGUMENT_MAPPING["three_hash"]]
        self.five_hash_output_path = arguments[ARGUMENT_MAPPING["five_hash"]]

    def generate_filters(self):
        dictionary_file = open(self.dictionary_path, "r", encoding="latin-1")
        lines = dictionary_file.read().splitlines()

        print("Generating filter using \"%s\". This will take a few moments." % self.dictionary_path)

        # For every dictionary password, set the bit positions selected by each
        # hash in both the five-hash and three-hash filters
        for password in lines:
            clean_password = password.strip()
            for i in range(5):
                five_hasher = hashlib.new(AVAILABLE_HASHES[i])
                five_hasher.update(clean_password.encode())
                current_hash = int(five_hasher.hexdigest(), 16)

                self.five_hash_dictionary[current_hash % HASH_ARRAY_SIZE] = 1

            for i in range(3):
                three_hasher = hashlib.new(AVAILABLE_HASHES[i])
                three_hasher.update(clean_password.encode())
                current_hash = int(three_hasher.hexdigest(), 16)
                self.three_hash_dictionary[current_hash % HASH_ARRAY_SIZE] = 1

        print("Filter generation complete.")

        dictionary_file.close()

    def process_inputs_and_generate_outputs(self):
        input_file = open(self.input_file_path, "r", encoding="latin-1")
        lines = input_file.read().splitlines()

        output_file_three_hash = open(self.three_hash_output_path, "w")
        output_file_five_hash = open(self.five_hash_output_path, "w")

        print("Processing input file \"%s\" and writing outputs to \"%s\" and \"%s\"." %
              (self.input_file_path, self.three_hash_output_path, self.five_hash_output_path))

        # The first line of the input file is a count, so skip it
        for password in lines[1:]:
            in_set_three = True
            in_set_five = True

            clean_password = password.strip()
            for i in range(5):
                five_hasher = hashlib.new(AVAILABLE_HASHES[i])
                five_hasher.update(clean_password.encode())
                current_hash = int(five_hasher.hexdigest(), 16)

                # Any unset bit means the password is definitely not in the set
                if self.five_hash_dictionary[current_hash % HASH_ARRAY_SIZE] == 0:
                    in_set_five = False

            for i in range(3):
                three_hasher = hashlib.new(AVAILABLE_HASHES[i])
                three_hasher.update(clean_password.encode())
                current_hash = int(three_hasher.hexdigest(), 16)

                if self.three_hash_dictionary[current_hash % HASH_ARRAY_SIZE] == 0:
                    in_set_three = False

            output_file_three_hash.write("%s\n" % ("no" if not in_set_three else "maybe"))
            output_file_five_hash.write("%s\n" % ("no" if not in_set_five else "maybe"))

        print("Processing complete.")

        input_file.close()
        output_file_three_hash.close()
        output_file_five_hash.close()


# ##### Main #####
if __name__ == "__main__":
    bloom_filter = BloomFilter(sys.argv[1:])
    bloom_filter.generate_filters()
    bloom_filter.process_inputs_and_generate_outputs()

File diff suppressed because one or more lines are too long
@@ -0,0 +1,19 @@
17
*holly&ben
*homo*
*vanusa*
010605
010605
walton-dutch-luzon-post
012190
0121909334
maskflower
2,,{H99*X(
darry-bethel-cube-mess
masking
undersupplied
undersupplies
7^*.$?GC86
undersupply
9Ca5B>w8.Q}bhU=ss*sK
karl
@@ -0,0 +1,12 @@
no
maybe
no
no
no
maybe
no
maybe
maybe
maybe
no
maybe