4459
|
1 |
"{ Package: 'stx:libbasic2' }"
|
|
2 |
|
|
3 |
"{ NameSpace: Smalltalk }"
|
|
4 |
|
|
5 |
Object subclass:#FuzzyMatcher
    "pattern          - the search pattern as given
     lowercasePattern - the lowercased pattern (used for case-insensitive adjacency checks)
     indexes          - per pattern character, the position found in the most recently matched string"
    instanceVariableNames:'pattern lowercasePattern indexes'
    classVariableNames:''
    poolDictionaries:''
    category:'Collections-Text-Support'
|
|
10 |
!
|
|
11 |
|
|
12 |
!FuzzyMatcher class methodsFor:'documentation'!
|
|
13 |
|
|
14 |
documentation
|
|
15 |
"
|
4474
|
16 |
FuzzyMatcher is an approximate string matching algorithm that can determine if a string includes a given pattern.
|
4459
|
17 |
For example, the string 'axby' matches both the patterns 'ab' and 'ay', but not 'ba'.
|
4492
|
18 |
I.e. it matches if the searched string contains a sequence of chars, possibly interleaved with other chars,
|
|
19 |
which matches the given search pattern (every pattern character must be found, in order).
|
|
20 |
|
4459
|
21 |
The algorithm is based on lib_fts[1], and includes an optional scoring algorithm
|
|
22 |
that can be used to sort all the matches based on their similarity to the pattern.
|
4492
|
23 |
It is used (among others) in the Sublime Text editor.
|
4493
|
24 |
|
|
25 |
[caveat:]
|
|
26 |
although this works great for class searches,
|
|
27 |
it is strange that 'dabc' scores lower against 'abc' than 'adbc'
|
|
28 |
(dabc has a longer common subsequence without interruptions...)
|
|
29 |
|
4459
|
30 |
[see also:]
|
|
31 |
https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
|
|
32 |
https://github.com/forrestthewoods/lib_fts
|
|
33 |
|
|
34 |
"
|
4468
|
35 |
!
|
|
36 |
|
|
37 |
example
|
|
38 |
"
|
4493
|
39 |
|matcher|
|
|
40 |
|
|
41 |
matcher := FuzzyMatcher pattern:'abc'.
|
|
42 |
matcher
|
|
43 |
match:'somearbitrarysequence'
|
|
44 |
ifScored: [:score | Transcript show:('''somearbitrarysequence'' scores '); showCR:score].
|
|
45 |
|
|
46 |
matcher
|
|
47 |
match:'someabcd'
|
|
48 |
ifScored: [:score | Transcript show:('''someabcd'' scores '); showCR:score].
|
|
49 |
|
|
50 |
matcher
|
|
51 |
match:'abcd'
|
|
52 |
ifScored: [:score | Transcript show:('''abcd'' scores '); showCR:score].
|
|
53 |
|
|
54 |
matcher
|
|
55 |
match:'dabc'
|
|
56 |
ifScored: [:score | Transcript show:('''dabc'' scores '); showCR:score].
|
|
57 |
|
|
58 |
matcher
|
|
59 |
match:'adbc'
|
|
60 |
ifScored: [:score | Transcript show:('''adbc'' scores '); showCR:score].
|
|
61 |
|
|
62 |
matcher
|
|
63 |
match:'abc'
|
|
64 |
ifScored: [:score | Transcript show:('''abc'' scores '); showCR:score].
|
|
65 |
|
|
66 |
|
4468
|
67 |
|top lv list field patternHolder names|
|
|
68 |
|
|
69 |
patternHolder := '' asValue.
|
|
70 |
list := List new.
|
|
71 |
|
|
72 |
top := StandardSystemView new.
|
|
73 |
lv := ListView origin:(0.0@30) corner:(1.0@1.0) in:top.
|
|
74 |
lv model:list.
|
|
75 |
|
|
76 |
field := EditField origin:(0.0@0.0) corner:(1.0@30) in:top.
|
|
77 |
field model:patternHolder.
|
|
78 |
field immediateAccept:true.
|
|
79 |
|
|
80 |
names := Smalltalk allClasses collect:#name.
|
|
81 |
|
|
82 |
patternHolder
|
|
83 |
onChangeEvaluate:[
|
|
84 |
|matcher pattern matches|
|
|
85 |
|
|
86 |
pattern := patternHolder value.
|
|
87 |
pattern notEmpty ifTrue:[
|
|
88 |
matcher := FuzzyMatcher pattern:pattern.
|
|
89 |
|
|
90 |
matches := OrderedCollection new.
|
|
91 |
|
|
92 |
names do:[:eachClassName |
|
|
93 |
matcher
|
|
94 |
match:eachClassName
|
|
95 |
ifScored: [ :score | matches add: eachClassName -> score ]
|
|
96 |
].
|
4470
|
97 |
matches sort:[:a :b |
|
|
98 |
a value < b value
|
|
99 |
or:[ a value = b value and:[ a key > b key]]
|
|
100 |
].
|
4468
|
101 |
|
|
102 |
list removeAll.
|
|
103 |
list addAllReversed:(matches
|
|
104 |
collect:[:nameScoreAssoc |
|
|
105 |
'[%1] %2' bindWith:nameScoreAssoc value with:nameScoreAssoc key])
|
|
106 |
].
|
|
107 |
].
|
|
108 |
top open.
|
|
109 |
patternHolder value:'mph'.
|
|
110 |
"
|
4459
|
111 |
! !
|
|
112 |
|
|
113 |
!FuzzyMatcher class methodsFor:'instance creation'!
|
|
114 |
|
|
115 |
new
    "Answer a fresh instance which has been sent #initialize
     (i.e. with an empty pattern and no recorded match positions)."

    | instance |

    instance := self basicNew.
    ^ instance initialize
|
4475
|
119 |
!
|
4459
|
120 |
|
|
121 |
pattern: aString
    "Answer a new matcher which searches for aString."

    | matcher |

    matcher := self new.
    ^ matcher pattern: aString

    "
     (self pattern:'mrp') matches:'ButtonMorph'     -> true
     (self pattern:'mrp') matches:'ButtonMorh'      -> false (no 'p' after the 'r')
    "

    "Modified (comment): / 02-08-2017 / 15:57:07 / cg"
|
4459
|
133 |
! !
|
|
134 |
|
|
135 |
!FuzzyMatcher class methodsFor:'utilities api'!
|
|
136 |
|
|
137 |
allMatching: aPattern in: aCollection
    "Assumes that aCollection is a collection of Strings;
     answer those elements which fuzzy-match aPattern.
     The elements themselves are used as the strings to match against."

    ^ self
        allMatching: aPattern
        in: aCollection
        by: [ :element | element ]

    "
     self
        allMatching:'clu'
        in:(Smalltalk allClasses collect:#name)
    "

    "Modified (comment): / 02-08-2017 / 16:02:44 / cg"
|
4459
|
153 |
!
|
|
154 |
|
|
155 |
allMatching: aPattern in: aCollection by: aBlockReturningString
    "Answer the elements of aCollection which fuzzy-match aPattern.
     aBlockReturningString is evaluated with each element and must answer
     the string to be matched (can be used e.g. to filter classes by their name)."

    | fuzzy |

    fuzzy := self pattern: aPattern.

    ^ aCollection select: [ :element |
        | text |

        text := aBlockReturningString value: element.
        fuzzy matches: text
    ]

    "
     self
        allMatching:'clu'
        in:(Smalltalk allClasses)
        by:[:cls | cls name]
    "
    "
     self
        allMatching:'clu'
        in:(Smalltalk allClasses)
        by:#name
    "

    "Modified (comment): / 14-07-2017 / 12:21:40 / cg"
|
4459
|
180 |
!
|
|
181 |
|
|
182 |
allSortedByScoreMatching: aPattern in: aCollection
    "Assumes that aCollection is a collection of Strings;
     answer the matching strings, best score (highest similarity) first."

    | identity |

    identity := [ :element | element ].
    ^ self allSortedByScoreMatching: aPattern in: aCollection by: identity

    "
     self
        allSortedByScoreMatching:'clu'
        in:(Smalltalk allClasses collect:#name)
    "
    "
     self
        allSortedByScoreMatching:'nary'
        in:(Smalltalk allClasses collect:#name)
    "

    "Modified (comment): / 14-07-2017 / 12:22:14 / cg"
|
4459
|
200 |
!
|
|
201 |
|
|
202 |
allSortedByScoreMatching: aPattern in: aCollection by: aBlockReturningString
    "Answer the elements of aCollection which fuzzy-match aPattern,
     best score (highest similarity) first.
     aBlockReturningString is evaluated with each element and must answer
     the string to be matched (can be used e.g. to sort classes by name similarity).
     The scores themselves are dropped from the result."

    ^ (self
            allWithScoresSortedByScoreMatching: aPattern
            in: aCollection
            by: aBlockReturningString)
        collect: [ :scoreAndElement | scoreAndElement value ]

    "
     self
        allSortedByScoreMatching:''
        in:(Smalltalk allClasses)
        by:[:cls | cls name]
    "
    "
     self
        allSortedByScoreMatching:'nary'
        in:(Smalltalk allClasses)
        by:[:cls | cls name]
    "
    "
     self
        allSortedByScoreMatching:'nary'
        in:(Smalltalk allClasses)
        by:#name
    "

    "Modified: / 14-07-2017 / 12:43:14 / cg"
|
|
233 |
!
|
|
234 |
|
|
235 |
allWithScoresSortedByScoreMatching: aPattern in: aCollection by: aBlockReturningString
    "Answer an Array of associations (score -> element) for those elements
     of aCollection which fuzzy-match aPattern, highest score first.
     aBlockReturningString is evaluated with each element and must answer
     the string to be matched (can be used e.g. to sort classes by name similarity)."

    | fuzzy scored |

    fuzzy := self pattern: aPattern.
    scored := OrderedCollection new: aCollection size // 2.

    aCollection do: [ :element |
        | score |

        "/ a nil score means: no match
        score := fuzzy matchScoreOrNil: (aBlockReturningString value: element).
        score notNil ifTrue: [
            scored add: score -> element
        ]
    ].

    scored sort: [ :first :second | second key < first key ].
    ^ scored asArray

    "
     self
        allWithScoresSortedByScoreMatching:''
        in:(Smalltalk allClasses)
        by:[:cls | cls name]
    "
    "
     self
        allWithScoresSortedByScoreMatching:'OC'
        in:(Smalltalk allClasses)
        by:[:cls | cls name]
    "
    "
     self
        allWithScoresSortedByScoreMatching:'nary'
        in:(Smalltalk allClasses)
        by:[:cls | cls name]
    "
    "
     self
        allWithScoresSortedByScoreMatching:'nary'
        in:(Smalltalk allClasses)
        by:#name
    "

    "Created: / 14-07-2017 / 12:25:19 / cg"
|
4459
|
281 |
! !
|
|
282 |
|
|
283 |
!FuzzyMatcher methodsFor:'accessing'!
|
|
284 |
|
4477
|
285 |
indexes
    "Answer the array holding, for each pattern character, the position at which
     it was found in the most recently matched string.
     The array is overwritten by every match attempt; therefore it is
     only valid inside the match callback block."

    ^ indexes

    "Created: / 15-07-2017 / 14:57:10 / cg"
|
|
291 |
!
|
|
292 |
|
4459
|
293 |
pattern
    "Answer the search pattern as it was given (original case)."

    ^ pattern
|
|
296 |
!
|
|
297 |
|
|
298 |
pattern: aString
    "Set the search pattern. Also caches its lowercase version and allocates
     the array which records one match position per pattern character."

    lowercasePattern := aString asLowercase.
    indexes := Array new: aString size.
    pattern := aString.

    "Modified (format): / 14-07-2017 / 12:59:15 / cg"
|
4459
|
305 |
! !
|
|
306 |
|
4493
|
307 |
!FuzzyMatcher methodsFor:'api - comparing'!
|
4459
|
308 |
|
|
309 |
match: aString ifScored: aBlock
    "If aString matches the pattern, evaluate aBlock with the score as argument;
     otherwise do nothing."

    | score |

    score := self matchScoreOrNil: aString.
    score isNil ifTrue: [ ^ self ].

    aBlock value: score

    "Modified: / 02-08-2017 / 16:00:59 / cg"
|
|
320 |
!
|
|
321 |
|
|
322 |
matchScoreOrNil: aString
    "Return the score if there is a match; nil otherwise.
     The score is the sum of:
        - the score of the first matched character (see firstScore:at:),
        - the scores of all further matched characters (see score:at:patternAt:),
        - a bonus for runs of adjacent matches (see indexScore),
        - a penalty for every character of aString beyond the pattern's size.
     An empty pattern matches everything; every character of aString
     then counts as unmatched."

    | score |

    "/ use the penalty accessor here (instead of a hard-coded negation),
    "/ so that this stays consistent with the general formula below
    pattern ifEmpty: [ ^ aString size * self unmatchedLetterPenalty ].
    (self matches: aString) ifFalse: [ ^ nil ].

    "/ matches: has filled indexes with the position of each pattern character
    score := self firstScore: aString at: indexes first.

    2 to: pattern size do: [ :pix |
        score := score + (self score: aString at: (indexes at: pix) patternAt: pix)
    ].

    score := score + self indexScore + ((aString size - pattern size) * self unmatchedLetterPenalty).

    ^ score.

    "Created: / 02-08-2017 / 15:59:56 / cg"
|
4459
|
341 |
!
|
|
342 |
|
|
343 |
matches: aString
    "Return true if all pattern characters occur in aString in the given order
     (ignoring case, possibly with other characters in between); false otherwise.
     As a side effect, the position found for each pattern character is stored
     in indexes (only partially updated if the match fails)."

    | cursor |

    "/ a string shorter than the pattern can never match
    aString size < pattern size ifTrue: [ ^ false ].

    cursor := 0.
    1 to: pattern size do: [ :pix |
        "/ look for the next pattern character behind the previous hit
        cursor := aString
                    findString: (pattern at: pix) asString
                    startingAt: cursor + 1
                    caseSensitive: false.

        cursor == 0 ifTrue: [ ^ false ].
        indexes at: pix put: cursor.
    ].
    ^ true

    "Modified (format): / 02-08-2017 / 16:01:05 / cg"
|
4459
|
363 |
! !
|
|
364 |
|
|
365 |
!FuzzyMatcher methodsFor:'initialization'!
|
|
366 |
|
|
367 |
initialize
    "Set up an empty pattern (and its lowercase version) and an empty
     array of match positions."

    super initialize.

    pattern := ''.
    lowercasePattern := pattern.
    indexes := #().

    "Modified (format): / 14-07-2017 / 13:23:26 / cg"
|
4459
|
375 |
! !
|
|
376 |
|
|
377 |
!FuzzyMatcher methodsFor:'private'!
|
|
378 |
|
|
379 |
firstScore: aString at: anIndex
    "Answer the score contribution of the first pattern character,
     which was found at position anIndex of aString:
        - caseEqualBonus, if the character there equals the first pattern character (case sensitive),
        - plus firstLetterBonus, if it is the very first character of aString,
        - otherwise plus a penalty per preceding character, limited by maxLeadingLetterPenalty."

    | caseBonus positionScore |

    caseBonus := 0.
    (aString at: anIndex) = pattern first ifTrue: [
        caseBonus := self caseEqualBonus
    ].

    positionScore := anIndex = 1
        ifTrue: [ self firstLetterBonus ]
        ifFalse: [
            "/ penalties are negative; max: therefore limits how bad it can get
            ((anIndex - 1) * self leadingLetterPenalty) max: self maxLeadingLetterPenalty
        ].

    ^ caseBonus + positionScore
|
|
392 |
!
|
|
393 |
|
|
394 |
indexScore
    "Answer a bonus for runs of directly adjacent match positions in indexes.
     Within a run, the contribution grows by the factor adjacencyIncrease with
     every further adjacent position; a gap resets it. The sum is answered rounded."

    | total streak |

    total := 0.
    streak := 1.

    2 to: indexes size do: [ :pos |
        (indexes at: pos) = ((indexes at: pos - 1) + 1)
            ifTrue: [ streak := streak + (streak * self adjacencyIncrease) ]
            ifFalse: [ streak := 1 ].

        total := total + streak - 1
    ].

    ^ total rounded

    "Modified (format): / 14-07-2017 / 13:24:07 / cg"
|
4459
|
412 |
!
|
|
413 |
|
|
414 |
isSeparator: aCharacter
    "Answer true if aCharacter counts as a word separator for scoring,
     i.e. if it is an underscore or a colon."

    aCharacter = $_ ifTrue: [ ^ true ].
    ^ aCharacter = $:

    "Created: / 13-07-2017 / 13:30:34 / cg"
|
|
419 |
!
|
|
420 |
|
|
421 |
score: aString at: stringIndex patternAt: patternIndex
    "Answer the score contribution of the pattern character at patternIndex,
     which was found at stringIndex in aString. Both indices must be > 1,
     because the preceding character is inspected:
        - separatorBonus, if the preceding string character is a separator,
        - otherwise adjacencyBonus, if it equals the preceding pattern character
          ignoring case (plus adjacentCaseEqualBonus, if the case is equal too),
        - plus caseEqualBonus, if the matched character itself has the same case."

    | previous bonus |

    previous := aString at: stringIndex - 1.
    bonus := 0.

    (self isSeparator: previous)
        ifTrue: [
            bonus := self separatorBonus
        ]
        ifFalse: [
            previous asLowercase = (lowercasePattern at: patternIndex - 1) ifTrue: [
                bonus := self adjacencyBonus.
                previous = (pattern at: patternIndex - 1) ifTrue: [
                    bonus := bonus + self adjacentCaseEqualBonus
                ]
            ]
        ].

    (aString at: stringIndex) = (pattern at: patternIndex) ifTrue: [
        bonus := bonus + self caseEqualBonus
    ].

    ^ bonus

    "Modified: / 13-07-2017 / 13:30:57 / cg"
|
|
444 |
! !
|
|
445 |
|
|
446 |
!FuzzyMatcher methodsFor:'scoring-bonus'!
|
|
447 |
|
|
448 |
adjacencyBonus
    "Bonus for a matched character whose preceding string character equals
     the preceding pattern character (ignoring case)."

    ^ 5
|
|
451 |
!
|
|
452 |
|
|
453 |
adjacencyIncrease
    "Growth factor applied by indexScore for every further directly
     adjacent match position within a run."

    ^ 1.2
|
|
456 |
!
|
|
457 |
|
|
458 |
adjacentCaseEqualBonus
    "Extra bonus (on top of adjacencyBonus) if the preceding string character
     equals the preceding pattern character including its case."

    ^ 3
|
|
461 |
!
|
|
462 |
|
|
463 |
caseEqualBonus
    "Bonus for a matched character which equals the pattern character
     including its case."

    ^ 7
|
|
466 |
!
|
|
467 |
|
|
468 |
firstLetterBonus
    "Bonus if the first pattern character was found at the very first
     position of the matched string."

    ^ 12
|
|
471 |
!
|
|
472 |
|
|
473 |
separatorBonus
    "Bonus for a matched character which directly follows a separator
     (see isSeparator:)."

    ^ 5
|
|
476 |
! !
|
|
477 |
|
|
478 |
!FuzzyMatcher methodsFor:'scoring-penalty'!
|
|
479 |
|
|
480 |
leadingLetterPenalty
    "Penalty (negative) for each character of the matched string which
     precedes the first matched character."

    ^ -3
|
|
483 |
!
|
|
484 |
|
|
485 |
maxLeadingLetterPenalty
    "Limit (negative) for the accumulated leading letter penalty;
     the penalty never gets worse than this value."

    ^ -9
|
|
488 |
!
|
|
489 |
|
|
490 |
unmatchedLetterPenalty
    "Penalty (negative) for each character by which the matched string
     is longer than the pattern."

    ^ -1
|
|
493 |
! !
|
|
494 |
|
|
495 |
!FuzzyMatcher class methodsFor:'documentation'!
|
|
496 |
|
|
497 |
version
    "Answer the version string of this class's source.
     NOTE(review): '$Header$' looks like an unexpanded source-code-management keyword - confirm."

    ^ '$Header$'
|
|
499 |
!
|
|
500 |
|
|
501 |
version_CVS
    "Answer the CVS version string of this class's source.
     NOTE(review): '$Header$' looks like an unexpanded source-code-management keyword - confirm."

    ^ '$Header$'
|
|
503 |
! !
|
|
504 |
|