--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/BloomFilter.st Sun Feb 10 14:39:41 2019 +0100
@@ -0,0 +1,191 @@
+"
+ COPYRIGHT (c) 2019 by Claus Gittinger
+ All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice. This software may not
+ be provided or otherwise made available to, or used by, any
+ other person. No title to or ownership of the software is
+ hereby transferred.
+"
+"{ Package: 'stx:libbasic2' }"
+
+"{ NameSpace: Smalltalk }"
+
+Collection subclass:#BloomFilter
+ instanceVariableNames:'bits hashingBlocks tally'
+ classVariableNames:''
+ poolDictionaries:''
+ category:'Collections-Unordered'
+!
+
+!BloomFilter class methodsFor:'documentation'!
+
+copyright
+"
+ COPYRIGHT (c) 2019 by Claus Gittinger
+ All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice. This software may not
+ be provided or otherwise made available to, or used by, any
+ other person. No title to or ownership of the software is
+ hereby transferred.
+"
+!
+
+documentation
+"
+ a BloomFilter is a Set-like data structure which can quickly answer whether
+ an element is definitely NOT in the set, but might generate false positives
+ (i.e. it might answer true, although the element is not actually present).
+
+ Bloom filters are used for tuning, when accesses (eg. to the disk or database)
+ are expensive, and can be avoided in many cases (in the NOT present case),
+ and false positives only lead to additional useless queries.
+
+ Bloom filters cannot remove or enumerate elements.
+ The only useful operations are adding and inclusion testing (includes).
+
+ For example, a database query (eg. 'is something present'),
+ might make use of a bloom filter, to avoid the query in many cases.
+
+ Bloom filters false positive rate degrades as the number of elements is added,
+ thus a good choice of its initial size is mandatory.
+"
+!
+
+examples
+"<<END
+[exBegin]
+ |filter set nFalsePositive|
+
+ filter := BloomFilter new:10000.
+ set := Set new:10000.
+
+ "/ add all words from the Collection class's source code
+ Collection sourceStream
+ linesDo:[:eachLine |
+ eachLine
+ splitOn:[:ch | ch isSeparator]
+ do:[:word |
+ filter add:word.
+ set add:word.
+ ].
+ ];
+ close.
+
+ "/ check some which are known to be present
+ set do:[:w |
+ self assert:(filter includes:w)
+ ].
+ "/ check for false positives
+ nFalsePositive :=
+ (1 to:1000) count:[:n |
+ |randomWord isFalsePositive|
+
+ isFalsePositive := false.
+ randomWord := (1 to:6) collect:[:i | ($a to: $z) atRandom] as:String.
+ (set includes:randomWord) ifFalse:[
+ (filter includes:randomWord) ifTrue:[
+ Transcript showCR:'false positive: ',randomWord.
+ isFalsePositive := true.
+ ].
+ ].
+ isFalsePositive
+ ].
+ Transcript show:nFalsePositive; showCR:' false positives'.
+ filter inspect.
+[exEnd]
+END>>"
+! !
+
+!BloomFilter class methodsFor:'instance creation'!
+
+new:initialNumberOfBits
+ ^ self
+ new:initialNumberOfBits
+ hashes:{
+ [:s | s hash_fnv1a] .
+ [:s | s hash_sdbm] .
+ }
+
+ "Created: / 10-02-2019 / 14:14:23 / Claus Gittinger"
+!
+
+new:initialNumberOfBits hashes:hashActionBlocks
+ ^ super new
+ numberOfBits:initialNumberOfBits
+ hashes:hashActionBlocks
+
+ "Created: / 10-02-2019 / 14:13:05 / Claus Gittinger"
+! !
+
+!BloomFilter methodsFor:'adding & removing'!
+
+add:aString
+ "add the arguemnt, aString to the set"
+
+ |mod isNew|
+
+ mod := bits size.
+ isNew := false.
+ hashingBlocks do:[:each |
+ |h idx|
+
+ h := each value:aString.
+ idx := (h \\ mod)+1.
+ (bits at:idx) == 0 ifTrue:[ isNew := true ].
+ bits at:idx put:1.
+ ].
+ isNew ifTrue:[
+ tally := tally + 1
+ ].
+
+ "Created: / 10-02-2019 / 14:16:29 / Claus Gittinger"
+!
+
+includes:aString
+ "returns false, if the arguemnt, aString is definitly NOT in the set.
+ However, it may return true as a false positive"
+
+ |mod|
+
+ mod := bits size.
+ hashingBlocks do:[:each |
+ |h|
+
+ h := each value:aString.
+ (bits at:(h \\ mod)+1) == 0 ifTrue:[^ false].
+ ].
+ ^ true
+
+ "Created: / 10-02-2019 / 14:16:59 / Claus Gittinger"
+!
+
+remove:aString
+ "raise an error - elements cannot be removed"
+
+ self shouldNotImplement
+
+ "Created: / 10-02-2019 / 14:18:14 / Claus Gittinger"
+! !
+
+!BloomFilter methodsFor:'initialization'!
+
+numberOfBits:initialNumberOfBits hashes:hashingBlocksArg
+ bits := BitArray new:(initialNumberOfBits nextPrime).
+ hashingBlocks := hashingBlocksArg.
+ tally := 0.
+
+ "Created: / 10-02-2019 / 14:15:10 / Claus Gittinger"
+! !
+
+!BloomFilter class methodsFor:'documentation'!
+
+version_CVS
+ ^ '$Header$'
+! !
+