BloomFilter.st
author Claus Gittinger <cg@exept.de>
Sat, 02 May 2020 21:40:13 +0200
changeset 5476 7355a4b11cb6
parent 4974 33b7c4e3a597
permissions -rw-r--r--
#FEATURE by cg class: Socket class added: #newTCPclientToHost:port:domain:domainOrder:withTimeout: changed: #newTCPclientToHost:port:domain:withTimeout:
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
4801
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
     1
"{ Encoding: utf8 }"
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
     2
4800
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     3
"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     4
 COPYRIGHT (c) 2019 by Claus Gittinger
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     5
              All Rights Reserved
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     6
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     7
 This software is furnished under a license and may be used
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     8
 only in accordance with the terms of that license and with the
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
     9
 inclusion of the above copyright notice.   This software may not
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    10
 be provided or otherwise made available to, or used by, any
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    11
 other person.  No title to or ownership of the software is
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    12
 hereby transferred.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    13
"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    14
"{ Package: 'stx:libbasic2' }"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    15
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    16
"{ NameSpace: Smalltalk }"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    17
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    18
Collection subclass:#BloomFilter
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    19
	instanceVariableNames:'bits hashingBlocks tally'
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    20
	classVariableNames:''
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    21
	poolDictionaries:''
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    22
	category:'Collections-Unordered'
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    23
!
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    24
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    25
!BloomFilter class methodsFor:'documentation'!
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    26
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    27
copyright
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    28
"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    29
 COPYRIGHT (c) 2019 by Claus Gittinger
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    30
              All Rights Reserved
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    31
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    32
 This software is furnished under a license and may be used
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    33
 only in accordance with the terms of that license and with the
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    34
 inclusion of the above copyright notice.   This software may not
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    35
 be provided or otherwise made available to, or used by, any
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    36
 other person.  No title to or ownership of the software is
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    37
 hereby transferred.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    38
"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    39
!
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    40
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    41
documentation
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    42
"
4974
33b7c4e3a597 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4801
diff changeset
    43
    a BloomFilter (see eg. https://de.wikipedia.org/wiki/Bloomfilter)
33b7c4e3a597 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4801
diff changeset
    44
    is a Set-like data structure which can quickly answer whether
4800
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    45
    an element is definitely NOT in the set, but might generate false positives
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    46
    (i.e. it might answer true, although the element is not actually present).
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    47
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    48
    Bloom filters are used for tuning, when accesses (eg. to the disk or database)
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    49
    are expensive, and can be avoided in many cases (in the NOT present case),
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    50
    and false positives only lead to additional useless queries.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    51
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    52
    Bloom filters cannot remove or enumerate elements. 
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    53
    The only useful operations are adding and inclusion testing (includes).
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    54
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    55
    For example, a database query (eg. 'is something present'), 
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    56
    might make use of a bloom filter, to avoid the query in many cases.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    57
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    58
    Bloom filters false positive rate degrades as the number of elements is added,
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    59
    thus a good choice of its initial size is mandatory.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    60
"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    61
!
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    62
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    63
examples
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    64
"<<END
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    65
[exBegin]
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    66
    |filter set nFalsePositive|
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    67
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    68
    filter := BloomFilter new:10000.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    69
    set := Set new:10000.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    70
    
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    71
    "/ add all words from the Collection class's source code
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    72
    Collection sourceStream 
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    73
        linesDo:[:eachLine |
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    74
            eachLine 
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    75
               splitOn:[:ch | ch isSeparator]
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    76
               do:[:word | 
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    77
                  filter add:word.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    78
                  set add:word.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    79
               ].
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    80
        ];
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    81
        close.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    82
    
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    83
    "/ check some which are known to be present
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    84
    set do:[:w |
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    85
        self assert:(filter includes:w)
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    86
    ].    
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    87
    "/ check for false positives
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    88
    nFalsePositive :=
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    89
        (1 to:1000) count:[:n |
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    90
            |randomWord isFalsePositive|
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    91
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    92
            isFalsePositive := false.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    93
            randomWord := (1 to:6) collect:[:i | ($a to: $z) atRandom] as:String.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    94
            (set includes:randomWord) ifFalse:[
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    95
                (filter includes:randomWord) ifTrue:[
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    96
                    Transcript showCR:'false positive: ',randomWord.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    97
                    isFalsePositive := true.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    98
                ].    
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
    99
            ].
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   100
            isFalsePositive
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   101
        ].
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   102
    Transcript show:nFalsePositive; showCR:' false positives'.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   103
    filter inspect.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   104
[exEnd]
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   105
END>>"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   106
! !
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   107
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   108
!BloomFilter class methodsFor:'instance creation'!
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   109
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   110
new:initialNumberOfBits
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   111
    ^ self 
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   112
        new:initialNumberOfBits 
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   113
        hashes:{
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   114
            [:s | s hash_fnv1a] .
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   115
            [:s | s hash_sdbm] .
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   116
        }
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   117
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   118
    "Created: / 10-02-2019 / 14:14:23 / Claus Gittinger"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   119
!
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   120
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   121
new:initialNumberOfBits hashes:hashActionBlocks
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   122
    ^ super new 
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   123
        numberOfBits:initialNumberOfBits 
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   124
        hashes:hashActionBlocks
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   125
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   126
    "Created: / 10-02-2019 / 14:13:05 / Claus Gittinger"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   127
! !
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   128
4801
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   129
!BloomFilter methodsFor:'adding'!
4800
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   130
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   131
add:aString
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   132
    "add the arguemnt, aString to the set"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   133
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   134
    |mod isNew|
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   135
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   136
    mod := bits size.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   137
    isNew := false.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   138
    hashingBlocks do:[:each |
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   139
        |h idx|
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   140
        
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   141
        h := each value:aString.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   142
        idx := (h \\ mod)+1.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   143
        (bits at:idx) == 0 ifTrue:[ isNew := true ].
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   144
        bits at:idx put:1.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   145
    ].
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   146
    isNew ifTrue:[
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   147
        tally := tally + 1
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   148
    ].
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   149
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   150
    "Created: / 10-02-2019 / 14:16:29 / Claus Gittinger"
4801
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   151
! !
4800
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   152
4801
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   153
!BloomFilter methodsFor:'blocked'!
4800
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   154
4801
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   155
do:aBlock
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   156
    self shouldNotImplement
4800
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   157
4801
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   158
    "Created: / 10-02-2019 / 14:40:17 / Claus Gittinger"
4800
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   159
!
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   160
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   161
remove:aString
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   162
    "raise an error - elements cannot be removed"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   163
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   164
    self shouldNotImplement
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   165
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   166
    "Created: / 10-02-2019 / 14:18:14 / Claus Gittinger"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   167
! !
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   168
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   169
!BloomFilter methodsFor:'initialization'!
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   170
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   171
numberOfBits:initialNumberOfBits hashes:hashingBlocksArg
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   172
    bits := BitArray new:(initialNumberOfBits nextPrime).
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   173
    hashingBlocks := hashingBlocksArg.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   174
    tally := 0.
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   175
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   176
    "Created: / 10-02-2019 / 14:15:10 / Claus Gittinger"
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   177
! !
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   178
4801
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   179
!BloomFilter methodsFor:'queries'!
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   180
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   181
includes:aString
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   182
    "returns false, if the arguemnt, aString is definitly NOT in the set.
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   183
     However, it may return true as a false positive"
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   184
     
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   185
    |mod|
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   186
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   187
    mod := bits size.
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   188
    hashingBlocks do:[:each |
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   189
        |h|
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   190
        
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   191
        h := each value:aString.
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   192
        (bits at:(h \\ mod)+1) == 0 ifTrue:[^ false].
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   193
    ].
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   194
    ^ true
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   195
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   196
    "Created: / 10-02-2019 / 14:16:59 / Claus Gittinger"
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   197
! !
08db5e0efb14 #DOCUMENTATION by cg
Claus Gittinger <cg@exept.de>
parents: 4800
diff changeset
   198
4800
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   199
!BloomFilter class methodsFor:'documentation'!
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   200
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   201
version_CVS
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   202
    ^ '$Header$'
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   203
! !
a6cbf556cfd1 initial checkin
Claus Gittinger <cg@exept.de>
parents:
diff changeset
   204