BloomFilter.st
author Claus Gittinger <cg@exept.de>
Sat, 02 May 2020 21:40:13 +0200
changeset 5476 7355a4b11cb6
parent 4974 33b7c4e3a597
permissions -rw-r--r--
#FEATURE by cg class: Socket class added: #newTCPclientToHost:port:domain:domainOrder:withTimeout: changed: #newTCPclientToHost:port:domain:withTimeout:

"{ Encoding: utf8 }"

"
 COPYRIGHT (c) 2019 by Claus Gittinger
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic2' }"

"{ NameSpace: Smalltalk }"

Collection subclass:#BloomFilter
	instanceVariableNames:'bits hashingBlocks tally'
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Unordered'
!

!BloomFilter class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 2019 by Claus Gittinger
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    a BloomFilter (see eg. https://de.wikipedia.org/wiki/Bloomfilter)
    is a Set-like data structure which can quickly answer whether
    an element is definitely NOT in the set, but might generate false positives
    (i.e. it might answer true, although the element is not actually present).

    Bloom filters are used for tuning, when accesses (eg. to the disk or database)
    are expensive, and can be avoided in many cases (in the NOT present case),
    and false positives only lead to additional useless queries.

    Bloom filters cannot remove or enumerate elements. 
    The only useful operations are adding and inclusion testing (includes).

    For example, a database query (eg. 'is something present'), 
    might make use of a bloom filter, to avoid the query in many cases.

    Bloom filters false positive rate degrades as the number of elements is added,
    thus a good choice of its initial size is mandatory.
"
!

examples
"<<END
[exBegin]
    |filter set nFalsePositive|

    filter := BloomFilter new:10000.
    set := Set new:10000.
    
    "/ add all words from the Collection class's source code
    Collection sourceStream 
        linesDo:[:eachLine |
            eachLine 
               splitOn:[:ch | ch isSeparator]
               do:[:word | 
                  filter add:word.
                  set add:word.
               ].
        ];
        close.
    
    "/ check some which are known to be present
    set do:[:w |
        self assert:(filter includes:w)
    ].    
    "/ check for false positives
    nFalsePositive :=
        (1 to:1000) count:[:n |
            |randomWord isFalsePositive|

            isFalsePositive := false.
            randomWord := (1 to:6) collect:[:i | ($a to: $z) atRandom] as:String.
            (set includes:randomWord) ifFalse:[
                (filter includes:randomWord) ifTrue:[
                    Transcript showCR:'false positive: ',randomWord.
                    isFalsePositive := true.
                ].    
            ].
            isFalsePositive
        ].
    Transcript show:nFalsePositive; showCR:' false positives'.
    filter inspect.
[exEnd]
END>>"
! !

!BloomFilter class methodsFor:'instance creation'!

new:initialNumberOfBits
    ^ self 
        new:initialNumberOfBits 
        hashes:{
            [:s | s hash_fnv1a] .
            [:s | s hash_sdbm] .
        }

    "Created: / 10-02-2019 / 14:14:23 / Claus Gittinger"
!

new:initialNumberOfBits hashes:hashActionBlocks
    ^ super new 
        numberOfBits:initialNumberOfBits 
        hashes:hashActionBlocks

    "Created: / 10-02-2019 / 14:13:05 / Claus Gittinger"
! !

!BloomFilter methodsFor:'adding'!

add:aString
    "add the arguemnt, aString to the set"

    |mod isNew|

    mod := bits size.
    isNew := false.
    hashingBlocks do:[:each |
        |h idx|
        
        h := each value:aString.
        idx := (h \\ mod)+1.
        (bits at:idx) == 0 ifTrue:[ isNew := true ].
        bits at:idx put:1.
    ].
    isNew ifTrue:[
        tally := tally + 1
    ].

    "Created: / 10-02-2019 / 14:16:29 / Claus Gittinger"
! !

!BloomFilter methodsFor:'blocked'!

do:aBlock
    self shouldNotImplement

    "Created: / 10-02-2019 / 14:40:17 / Claus Gittinger"
!

remove:aString
    "raise an error - elements cannot be removed"

    self shouldNotImplement

    "Created: / 10-02-2019 / 14:18:14 / Claus Gittinger"
! !

!BloomFilter methodsFor:'initialization'!

numberOfBits:initialNumberOfBits hashes:hashingBlocksArg
    bits := BitArray new:(initialNumberOfBits nextPrime).
    hashingBlocks := hashingBlocksArg.
    tally := 0.

    "Created: / 10-02-2019 / 14:15:10 / Claus Gittinger"
! !

!BloomFilter methodsFor:'queries'!

includes:aString
    "returns false, if the arguemnt, aString is definitly NOT in the set.
     However, it may return true as a false positive"
     
    |mod|

    mod := bits size.
    hashingBlocks do:[:each |
        |h|
        
        h := each value:aString.
        (bits at:(h \\ mod)+1) == 0 ifTrue:[^ false].
    ].
    ^ true

    "Created: / 10-02-2019 / 14:16:59 / Claus Gittinger"
! !

!BloomFilter class methodsFor:'documentation'!

version_CVS
    ^ '$Header$'
! !