moolb

Bloom filter for python3 with pluggable hasher backend
git clone git://git.defalsify.org/python-moolb.git
Log | Files | Refs | README | LICENSE

commit a5aedfb5514f9f668b4de0037cc9d8fd5a1818b4
parent 10b4c6cff5a0c4e8445f556dffc7d5f7bcef42e9
Author: nolash <dev@holbrook.no>
Date:   Fri, 30 Oct 2020 09:48:05 +0100

Add test, py repo structure

Diffstat:
A.gitignore | 1+
AREADME.md | 1+
Dmain.py | 68--------------------------------------------------------------------
Amoolb/__init__.py | 1+
Amoolb/moolb.py | 64++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asetup.cfg | 10++++++++++
Asetup.py | 20++++++++++++++++++++
Atests/test_basic.py | 32++++++++++++++++++++++++++++++++
8 files changed, 129 insertions(+), 68 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/README.md b/README.md @@ -0,0 +1 @@ +# Moolb - simple bloom filter with pluggable hash backend diff --git a/main.py b/main.py @@ -1,68 +0,0 @@ -import numpy -import hashlib -import logging -import math - -logging.basicConfig(level=logging.DEBUG) -logg = logging.getLogger() - - -# m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); - -class BloomFilter: - - def __init__(self, bits, rounds): - self.bits = bits - self.bytes = int(bits / 8) - if self.bytes * 8 != self.bits: - raise ValueError('Need byte boundary bit value') - self.rounds = rounds - self.filter = numpy.zeros(self.bytes, dtype=numpy.uint8) - self.hasher = self.set_hasher(self.__hash) - - - def set_hasher(self, hasher): - self.hasher = hasher - - - def add(self, b): - for i in range(self.rounds): - salt = str(i) - s = salt.encode('utf-8') - z = self.__hash(b, s) - r = int.from_bytes(z, byteorder='big') - m = r % self.bits - bytepos = math.floor(m / 8) - bitpos = m % 8 - self.filter[bytepos] |= 1 << bitpos - logg.debug('foo {} {}'.format(bytepos, bitpos)) - return m - - - def check(self, b): - for i in range(self.rounds): - salt = str(i) - s = salt.encode('utf-8') - z = self.__hash(b, s) - r = int.from_bytes(z, byteorder='big') - m = r % self.bits - bytepos = math.floor(m / 8) - bitpos = m % 8 - if not self.filter[bytepos] & 1 << bitpos: - return False - return True - - - def __hash(self, b, s): - logg.debug('hashing {} {}'.format(b.hex(), s.hex())) - h = hashlib.sha256() - h.update(b) - h.update(s) - return h.digest() - - - -f = BloomFilter(8192 * 8, 3) -f.add(b'1024') -print(f.check(b'1024')) -print(f.check(b'1023')) diff --git a/moolb/__init__.py b/moolb/__init__.py @@ -0,0 +1 @@ +from .moolb import Bloom diff --git a/moolb/moolb.py b/moolb/moolb.py @@ -0,0 +1,64 @@ +import numpy +import hashlib +import logging +import math + +logging.basicConfig(level=logging.DEBUG) +logg = logging.getLogger() + + +# m = ceil((n * log(p)) / log(1 / pow(2, log(2)))); + +class Bloom: + + def __init__(self, bits, rounds, hasher=None): + self.bits = bits + self.bytes = int(bits / 8) + if self.bytes * 8 != self.bits: + raise ValueError('Need byte boundary bit value') + self.rounds = rounds + self.filter = numpy.zeros(self.bytes, dtype=numpy.uint8) + if hasher == None: + logg.info('using default hasher (SHA256)') + hasher = self.__hash + self.hasher = self.set_hasher(hasher) + + + def set_hasher(self, hasher): + self.hasher = hasher + + + def add(self, b): + for i in range(self.rounds): + salt = str(i) + s = salt.encode('utf-8') + z = self.__hash(b, s) + r = int.from_bytes(z, byteorder='big') + m = r % self.bits + bytepos = math.floor(m / 8) + bitpos = m % 8 + self.filter[bytepos] |= 1 << bitpos + logg.debug('foo {} {}'.format(bytepos, bitpos)) + return m + + + def check(self, b): + for i in range(self.rounds): + salt = str(i) + s = salt.encode('utf-8') + z = self.__hash(b, s) + r = int.from_bytes(z, byteorder='big') + m = r % self.bits + bytepos = math.floor(m / 8) + bitpos = m % 8 + if not self.filter[bytepos] & 1 << bitpos: + return False + return True + + + def __hash(self, b, s): + logg.debug('hashing {} {}'.format(b.hex(), s.hex())) + h = hashlib.sha256() + h.update(b) + h.update(s) + return h.digest() diff --git a/setup.cfg b/setup.cfg @@ -0,0 +1,10 @@ +[metadata] +classifiers = + Programming Language :: Python :: 3 + Operating System :: OS Independent + Development Status :: 3 - Alpha + Intended Audience :: Developers + Topic :: Software Development :: Libraries + License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+) +license_files = + LICENSE.txt diff --git a/setup.py b/setup.py @@ -0,0 +1,20 @@ +from setuptools import setup + +f = open('./README.md', 'r') +long_description = f.read() +f.close() + +setup( + name='moolb', + version='0.0.1', + description='Simple bloom filter with pluggable hash backend', + author='Louis Holbrook', + author_email='dev@holbrook.no', + license='GPL3', + long_description=long_description, + long_description_content_type='text/markdown', + packages=[ + 'moolb', + ], + url='https://gitlab.com/nolash/python-moolb', + ) diff --git a/tests/test_basic.py b/tests/test_basic.py @@ -0,0 +1,32 @@ +import unittest +import hashlib + +from moolb import Bloom + + +def hashround(self, b, s): + logg.debug('sha1 hashing {} {}'.format(b.hex(), s.hex())) + h = hashlib.sha1() + h.update(b) + h.update(s) + return h.digest() + + +class Test(unittest.TestCase): + + def test_default(self): + f = Bloom(8192 * 8, 3) + f.add(b'1024') + self.assertTrue(f.check(b'1024')) + self.assertFalse(f.check(b'1023')) + + + def test_plug(self): + f = Bloom(8192 * 8, 3, hashround) + f.add(b'1024') + self.assertTrue(f.check(b'1024')) + self.assertFalse(f.check(b'1023')) + + +if __name__ == '__main__': + unittest.main()