initial checkin
authorClaus Gittinger <cg@exept.de>
Sun, 10 Feb 2019 14:39:41 +0100
changeset 4800 a6cbf556cfd1
parent 4799 bf18e2754b1e
child 4801 08db5e0efb14
initial checkin
BloomFilter.st
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BloomFilter.st	Sun Feb 10 14:39:41 2019 +0100
@@ -0,0 +1,191 @@
+"
+ COPYRIGHT (c) 2019 by Claus Gittinger
+              All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice.   This software may not
+ be provided or otherwise made available to, or used by, any
+ other person.  No title to or ownership of the software is
+ hereby transferred.
+"
+"{ Package: 'stx:libbasic2' }"
+
+"{ NameSpace: Smalltalk }"
+
+Collection subclass:#BloomFilter
+	instanceVariableNames:'bits hashingBlocks tally'
+	classVariableNames:''
+	poolDictionaries:''
+	category:'Collections-Unordered'
+!
+
+!BloomFilter class methodsFor:'documentation'!
+
+copyright
+"
+ COPYRIGHT (c) 2019 by Claus Gittinger
+              All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice.   This software may not
+ be provided or otherwise made available to, or used by, any
+ other person.  No title to or ownership of the software is
+ hereby transferred.
+"
+!
+
+documentation
+"
+    a BloomFilter is a Set-like data structure which can quickly answer whether
+    an element is definitely NOT in the set, but might generate false positives
+    (i.e. it might answer true, although the element is not actually present).
+
+    Bloom filters are used for tuning, when accesses (eg. to the disk or database)
+    are expensive, and can be avoided in many cases (in the NOT present case),
+    and false positives only lead to additional useless queries.
+
+    Bloom filters cannot remove or enumerate elements. 
+    The only useful operations are adding and inclusion testing (includes).
+
+    For example, a database query (eg. 'is something present'), 
+    might make use of a bloom filter, to avoid the query in many cases.
+
+    Bloom filters false positive rate degrades as the number of elements is added,
+    thus a good choice of its initial size is mandatory.
+"
+!
+
+examples
+"<<END
+[exBegin]
+    |filter set nFalsePositive|
+
+    filter := BloomFilter new:10000.
+    set := Set new:10000.
+    
+    "/ add all words from the Collection class's source code
+    Collection sourceStream 
+        linesDo:[:eachLine |
+            eachLine 
+               splitOn:[:ch | ch isSeparator]
+               do:[:word | 
+                  filter add:word.
+                  set add:word.
+               ].
+        ];
+        close.
+    
+    "/ check some which are known to be present
+    set do:[:w |
+        self assert:(filter includes:w)
+    ].    
+    "/ check for false positives
+    nFalsePositive :=
+        (1 to:1000) count:[:n |
+            |randomWord isFalsePositive|
+
+            isFalsePositive := false.
+            randomWord := (1 to:6) collect:[:i | ($a to: $z) atRandom] as:String.
+            (set includes:randomWord) ifFalse:[
+                (filter includes:randomWord) ifTrue:[
+                    Transcript showCR:'false positive: ',randomWord.
+                    isFalsePositive := true.
+                ].    
+            ].
+            isFalsePositive
+        ].
+    Transcript show:nFalsePositive; showCR:' false positives'.
+    filter inspect.
+[exEnd]
+END>>"
+! !
+
+!BloomFilter class methodsFor:'instance creation'!
+
+new:initialNumberOfBits
+    ^ self 
+        new:initialNumberOfBits 
+        hashes:{
+            [:s | s hash_fnv1a] .
+            [:s | s hash_sdbm] .
+        }
+
+    "Created: / 10-02-2019 / 14:14:23 / Claus Gittinger"
+!
+
+new:initialNumberOfBits hashes:hashActionBlocks
+    ^ super new 
+        numberOfBits:initialNumberOfBits 
+        hashes:hashActionBlocks
+
+    "Created: / 10-02-2019 / 14:13:05 / Claus Gittinger"
+! !
+
+!BloomFilter methodsFor:'adding & removing'!
+
+add:aString
+    "add the arguemnt, aString to the set"
+
+    |mod isNew|
+
+    mod := bits size.
+    isNew := false.
+    hashingBlocks do:[:each |
+        |h idx|
+        
+        h := each value:aString.
+        idx := (h \\ mod)+1.
+        (bits at:idx) == 0 ifTrue:[ isNew := true ].
+        bits at:idx put:1.
+    ].
+    isNew ifTrue:[
+        tally := tally + 1
+    ].
+
+    "Created: / 10-02-2019 / 14:16:29 / Claus Gittinger"
+!
+
+includes:aString
+    "returns false, if the arguemnt, aString is definitly NOT in the set.
+     However, it may return true as a false positive"
+     
+    |mod|
+
+    mod := bits size.
+    hashingBlocks do:[:each |
+        |h|
+        
+        h := each value:aString.
+        (bits at:(h \\ mod)+1) == 0 ifTrue:[^ false].
+    ].
+    ^ true
+
+    "Created: / 10-02-2019 / 14:16:59 / Claus Gittinger"
+!
+
+remove:aString
+    "raise an error - elements cannot be removed"
+
+    self shouldNotImplement
+
+    "Created: / 10-02-2019 / 14:18:14 / Claus Gittinger"
+! !
+
+!BloomFilter methodsFor:'initialization'!
+
+numberOfBits:initialNumberOfBits hashes:hashingBlocksArg
+    bits := BitArray new:(initialNumberOfBits nextPrime).
+    hashingBlocks := hashingBlocksArg.
+    tally := 0.
+
+    "Created: / 10-02-2019 / 14:15:10 / Claus Gittinger"
+! !
+
+!BloomFilter class methodsFor:'documentation'!
+
+version_CVS
+    ^ '$Header$'
+! !
+