moolb

Bloom filter for python3 with pluggable hasher backend
git clone git://git.defalsify.org/python-moolb.git
Info | Log | Files | Refs | README | LICENSE

moolb.py (2236B)


      1 import hashlib
      2 import logging
      3 import math
      4 
      5 logg = logging.getLogger(__name__)
      6 
      7 # m = ceil((n * log(p)) / log(1 / pow(2, log(2))));
      8 
      9 
     10 class Bloom:
     11 
     12     def __init__(self, bits, rounds, hasher=None, default_data=None):
     13         self.bits = bits
     14         self.bytes = int(bits / 8)
     15         if self.bytes * 8 != self.bits:
     16             raise ValueError('Need byte boundary bit value')
     17         self.rounds = rounds
     18         if hasher == None:
     19             logg.debug('moolb using default hasher (SHA256)')
     20             hasher = self.__hash
     21         self.hasher = self.set_hasher(hasher)
     22 
     23         self.filter = None
     24         if default_data != None:
     25             self.merge(default_data)
     26         else:
     27             self.filter = [0] * self.bytes
     28 
     29 
     30     def merge(self, filter_data):
     31         datalen = len(filter_data)
     32         if datalen != self.bytes:
     33             raise ValueError('expected byte array bit size {}, got {}'.format(self.bits, datalen * 8))
     34 
     35         if self.filter == None:
     36             self.filter = filter_data
     37         else:
     38             self.filter = list(map(lambda x, y: x | y, filter_data, self.filter))
     39 
     40 
     41     def set_hasher(self, hasher):
     42         self.hasher = hasher
     43 
     44 
     45     def add(self, b):
     46         for i in range(self.rounds):
     47             #salt = str(i)
     48             #s = salt.encode('utf-8')
     49             s = i.to_bytes(4, byteorder='big')
     50             z = self.__hash(b, s)
     51             r = int.from_bytes(z, byteorder='big')
     52             m = r % self.bits
     53             bytepos = math.floor(m / 8)
     54             bitpos = m % 8
     55             self.filter[bytepos] |= 1 << bitpos
     56         return m
     57 
     58 
     59     def check(self, b):
     60         for i in range(self.rounds):
     61             #salt = str(i)
     62             #s = salt.encode('utf-8')
     63             s = i.to_bytes(4, byteorder='big')
     64             z = self.__hash(b, s)
     65             r = int.from_bytes(z, byteorder='big')
     66             m = r % self.bits
     67             bytepos = math.floor(m / 8)
     68             bitpos = m % 8
     69             if not self.filter[bytepos] & 1 << bitpos:
     70                 return False
     71             return True
     72 
     73 
     74     def to_bytes(self):
     75         return bytes(self.filter)
     76 
     77 
     78     def __hash(self, b, s):
     79        h = hashlib.sha256()
     80        h.update(b)
     81        h.update(s)
     82        return h.digest()