FuzzyMatcher.st
changeset 4493 3961346f8256
parent 4492 05def04efc34
child 5376 61b8a719febd
equal deleted inserted replaced
4492:05def04efc34 4493:3961346f8256
    19     which matches the given search pattern or part of it.
    19     which matches the given search pattern or part of it.
    20     
    20     
    21     The algorithm is based on lib_fts[1], and includes an optional scoring algorithm 
    21     The algorithm is based on lib_fts[1], and includes an optional scoring algorithm 
    22     that can be used to sort all the matches based on their similarity to the pattern.
    22     that can be used to sort all the matches based on their similarity to the pattern.
    23     It is used (among others) in the sublime text editor.
    23     It is used (among others) in the sublime text editor.
    24     
    24 
       
    25     [caveat:]
       
    26         although this works great for class searches,
       
    27         it is strange that 'dabc' scores lower against 'abc' than 'adbc'
       
    28         (dabc has a longer common subsequence without interruptions...)
       
    29 
    25     [see also:]
    30     [see also:]
    26         https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
    31         https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
    27         https://github.com/forrestthewoods/lib_fts
    32         https://github.com/forrestthewoods/lib_fts
    28 
    33 
    29 "
    34 "
    30 !
    35 !
    31 
    36 
    32 example
    37 example
    33 "
    38 "
       
    39     |matcher|
       
    40     
       
    41     matcher := FuzzyMatcher pattern:'abc'.
       
    42     matcher 
       
    43         match:'somearbitrarysequence'   
       
    44         ifScored: [:score | Transcript show:('''somearbitrarysequence'' scores '); showCR:score].
       
    45 
       
    46     matcher 
       
    47         match:'someabcd'   
       
    48         ifScored: [:score | Transcript show:('''someabcd'' scores '); showCR:score].
       
    49 
       
    50     matcher 
       
    51         match:'abcd'   
       
    52         ifScored: [:score | Transcript show:('''abcd'' scores '); showCR:score].
       
    53 
       
    54     matcher 
       
    55         match:'dabc'   
       
    56         ifScored: [:score | Transcript show:('''dabc'' scores '); showCR:score].
       
    57 
       
    58     matcher 
       
    59         match:'adbc'   
       
    60         ifScored: [:score | Transcript show:('''adbc'' scores '); showCR:score].
       
    61 
       
    62     matcher 
       
    63         match:'abc'   
       
    64         ifScored: [:score | Transcript show:('''abc'' scores '); showCR:score].
       
    65 
       
    66     
    34     |top lv list field patternHolder names|
    67     |top lv list field patternHolder names|
    35 
    68 
    36     patternHolder := '' asValue.
    69     patternHolder := '' asValue.
    37     list := List new.
    70     list := List new.
    38     
    71     
    84 
   117 
    85     ^ self basicNew initialize.
   118     ^ self basicNew initialize.
    86 !
   119 !
    87 
   120 
    88 pattern: aString
   121 pattern: aString
    89 
       
    90     ^ self new pattern: aString
   122     ^ self new pattern: aString
    91 
   123 
    92     "
   124     "
    93      (self pattern:'mrp') matches:'ButtonMorph'
   125      (self pattern:'mrp') matches:'ButtonMorph'
    94     "
   126      (self pattern:'mrp') matches:'ButtonMorh'
    95 
   127 
    96     "Modified (comment): / 14-07-2017 / 15:02:43 / cg"
   128      (self pattern:'mrp') matches:'ButtonMorph'
       
   129      (self pattern:'mrp') matches:'ButtonMorh'
       
   130     "
       
   131 
       
   132     "Modified (comment): / 02-08-2017 / 15:57:07 / cg"
    97 ! !
   133 ! !
    98 
   134 
    99 !FuzzyMatcher class methodsFor:'utilities api'!
   135 !FuzzyMatcher class methodsFor:'utilities api'!
   100 
   136 
   101 allMatching: aPattern in: aCollection
   137 allMatching: aPattern in: aCollection
   102     "Assumes that the collection is a collection of Strings"
   138     "Assumes that the collection is a collection of Strings;
       
   139      return all those which match"
   103 
   140 
   104     | matcher |
   141     | matcher |
   105 
   142 
   106     matcher := self pattern: aPattern.
   143     matcher := self pattern: aPattern.
   107 
       
   108     ^ aCollection select: [ :each | matcher matches: each ]
   144     ^ aCollection select: [ :each | matcher matches: each ]
   109 
   145 
   110     "
   146     "
   111      self 
   147      self 
   112         allMatching:'clu' 
   148         allMatching:'clu' 
   113         in:(Smalltalk allClasses collect:#name)
   149         in:(Smalltalk allClasses collect:#name)
   114     "
   150     "
   115 
   151 
   116     "Modified (comment): / 14-07-2017 / 12:19:05 / cg"
   152     "Modified (comment): / 02-08-2017 / 16:02:44 / cg"
   117 !
   153 !
   118 
   154 
   119 allMatching: aPattern in: aCollection by: aBlockReturningString
   155 allMatching: aPattern in: aCollection by: aBlockReturningString
   120     "selects matching elements from aCollection.
   156     "selects matching elements from aCollection.
   121      aBlockReturningString is applied to elements to get the string representation
   157      aBlockReturningString is applied to elements to get the string representation
   266         indexes := Array new: pattern size.
   302         indexes := Array new: pattern size.
   267 
   303 
   268     "Modified (format): / 14-07-2017 / 12:59:15 / cg"
   304     "Modified (format): / 14-07-2017 / 12:59:15 / cg"
   269 ! !
   305 ! !
   270 
   306 
   271 !FuzzyMatcher methodsFor:'comparing'!
   307 !FuzzyMatcher methodsFor:'api - comparing'!
   272 
   308 
   273 match: aString ifScored: aBlock
   309 match: aString ifScored: aBlock
   274         
   310     "If there is a match, evaluate aBlock, passing the score value"
   275         | score |
   311         
   276         
   312     | scoreOrNil |
   277         pattern ifEmpty: [ aBlock value: "0" aString size negated. ^ self ].
   313 
   278         (self matches: aString) ifFalse: [ ^ self ].
   314     scoreOrNil := self matchScoreOrNil: aString.
   279         
   315     scoreOrNil notNil ifTrue:[
   280         score := self firstScore: aString at: indexes first.
   316         aBlock value:scoreOrNil
   281         
   317     ].
   282         2 to: pattern size do: [ :pix | 
   318 
   283                 score := score + (self score: aString at: (indexes at: pix) patternAt: pix)
   319     "Modified: / 02-08-2017 / 16:00:59 / cg"
   284         ].
   320 !
   285                 
   321 
   286         score := score + self indexScore + ((aString size - pattern size) * self unmatchedLetterPenalty).
   322 matchScoreOrNil: aString
   287                 
   323     "return the scrore if there is a match; nil otherwise."
   288         aBlock value: score.
   324         
   289 
   325     | score |
   290     "Modified: / 14-07-2017 / 12:44:50 / cg"
   326 
       
   327     pattern ifEmpty: [ ^ (aString size negated) ].
       
   328     (self matches: aString) ifFalse: [ ^ nil ].
       
   329 
       
   330     score := self firstScore: aString at: indexes first.
       
   331 
       
   332     2 to: pattern size do: [ :pix | 
       
   333             score := score + (self score: aString at: (indexes at: pix) patternAt: pix)
       
   334     ].
       
   335 
       
   336     score := score + self indexScore + ((aString size - pattern size) * self unmatchedLetterPenalty).
       
   337 
       
   338     ^ score.
       
   339 
       
   340     "Created: / 02-08-2017 / 15:59:56 / cg"
   291 !
   341 !
   292 
   342 
   293 matches: aString
   343 matches: aString
   294 
   344     "return true if there is a match; false otherwise."
   295 	| idx |
   345      
   296 	
   346     | idx |
   297 	pattern size > aString size ifTrue: [ ^ false ].
   347 
   298 
   348     pattern size > aString size ifTrue: [ ^ false ].
   299 	idx := 0.
   349 
   300 	pattern withIndexDo: [ :each :i |
   350     idx := 0.
   301 		idx := aString 
   351     pattern withIndexDo: [ :each :i |
   302 			findString: each asString 
   352             idx := aString 
   303 			startingAt: idx + 1 
   353                     findString: each asString 
   304 			caseSensitive: false. 
   354                     startingAt: idx + 1 
   305 		
   355                     caseSensitive: false. 
   306 		idx == 0 ifTrue: [ ^ false ].
   356 
   307 		indexes at: i put: idx.
   357             idx == 0 ifTrue: [ ^ false ].
   308 	].
   358             indexes at: i put: idx.
   309 
   359     ].
   310 	^ true
   360     ^ true
       
   361 
       
   362     "Modified (format): / 02-08-2017 / 16:01:05 / cg"
   311 ! !
   363 ! !
   312 
   364 
   313 !FuzzyMatcher methodsFor:'initialization'!
   365 !FuzzyMatcher methodsFor:'initialization'!
   314 
   366 
   315 initialize
   367 initialize