FuzzyMatcher.st
author Claus Gittinger <cg@exept.de>
Fri, 14 Jul 2017 12:19:21 +0200
changeset 4474 98208d107b52
parent 4470 5825ccc0dabf
child 4475 2e19c5a7452a
permissions -rw-r--r--
#DOCUMENTATION by cg class: FuzzyMatcher class comment/format in: #allMatching:in: #allMatching:in:by: #allSortedByScoreMatching:in: #allSortedByScoreMatching:in:by: #documentation

"{ Package: 'stx:libbasic2' }"

"{ NameSpace: Smalltalk }"

Object subclass:#FuzzyMatcher
	instanceVariableNames:'pattern lowercasePattern indexes'
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Support'
!

!FuzzyMatcher class methodsFor:'documentation'!

documentation
"
    FuzzyMatcher is an approximate string matching algorithm that can determine if a string includes a given pattern.
    For example, the string 'axby' matches both the pattern 'ab' and, 'ay', but not 'ba'. 

    The algorithm is based on lib_fts[1], and includes an optional scoring algorithm 
    that can be used to sort all the matches based on their similarity to the pattern.
    It is used in the sublime text editor.
    
    [see also:]
        https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
        https://github.com/forrestthewoods/lib_fts

"
!

example
"
    |top lv list field patternHolder names|

    patternHolder := '' asValue.
    list := List new.
    
    top := StandardSystemView new.
    lv := ListView origin:(0.0@30) corner:(1.0@1.0) in:top.
    lv model:list.

    field := EditField origin:(0.0@0.0) corner:(1.0@30) in:top.
    field model:patternHolder.
    field immediateAccept:true.

    names := Smalltalk allClasses collect:#name.

    patternHolder 
        onChangeEvaluate:[
            |matcher pattern matches|

            pattern := patternHolder value.
            pattern notEmpty ifTrue:[
                matcher := FuzzyMatcher pattern:pattern.
                
                matches := OrderedCollection new.

                names do:[:eachClassName | 
                    matcher 
                        match:eachClassName
                        ifScored: [ :score | matches add: eachClassName -> score ] 
                ].
                matches sort:[:a :b |
                        a value < b value
                        or:[ a value = b value and:[ a key > b key]]
                ].

                list removeAll.
                list addAllReversed:(matches 
                                collect:[:nameScoreAssoc | 
                                    '[%1] %2' bindWith:nameScoreAssoc value with:nameScoreAssoc key])
            ].    
        ].    
    top open.
    patternHolder value:'mph'.
"
! !

!FuzzyMatcher class methodsFor:'instance creation'!

new
    "return an initialized instance"

    ^ self basicNew initialize.
! !

!FuzzyMatcher class methodsFor:'construction'!

pattern: aString

	^self new pattern: aString
! !

!FuzzyMatcher class methodsFor:'utilities api'!

allMatching: aPattern in: aCollection
    "Assumes that the collection is a collection of Strings"

    | matcher |

    matcher := self pattern: aPattern.

    ^ aCollection select: [ :each | matcher matches: each ]

    "
     self 
        allMatching:'clu' 
        in:(Smalltalk allClasses collect:#name)
    "

    "Modified (comment): / 14-07-2017 / 12:19:05 / cg"
!

allMatching: aPattern in: aCollection by: aBlockReturningString

    | matcher |

    matcher := self pattern: aPattern.

    ^ aCollection select: [ :each | matcher matches: (aBlockReturningString value: each) ]

        "
         self 
            allMatching:'clu' 
            in:(Smalltalk allClasses)
            by:[:cls | cls name]
        "
        "
         self 
            allMatching:'clu' 
            in:(Smalltalk allClasses)
            by:#name
        "

    "Modified (format): / 14-07-2017 / 12:18:59 / cg"
!

allSortedByScoreMatching: aPattern in: aCollection
    "Assumes that the collection is a collection of Strings"

    ^ self allSortedByScoreMatching: aPattern in: aCollection by: [ :each | each ]

    "
     self 
        allSortedByScoreMatching:'clu' 
        in:(Smalltalk allClasses collect:#name)
    "
    "
     self 
        allSortedByScoreMatching:'nary' 
        in:(Smalltalk allClasses collect:#name)
    "

    "Modified (comment): / 14-07-2017 / 12:18:54 / cg"
!

allSortedByScoreMatching: aPattern in: aCollection by: aBlockReturningString

    | matcher matches |

    aPattern isEmpty ifTrue: [ ^ aCollection asArray ].

    matcher := self pattern: aPattern.
    matches := OrderedCollection new: aCollection size // 2.

    aCollection do: [ :each | 
        matcher 
            match: (aBlockReturningString value: each) 
            ifScored: [ :score | matches add: score -> each ] 
    ].

    matches sort: [ :a :b | a key >= b key].
    ^ matches collect: [ :each | each value ] as: Array

    "
     self 
        allSortedByScoreMatching:'nary' 
        in:(Smalltalk allClasses)
        by:[:cls | cls name]
    "
    "
     self 
        allSortedByScoreMatching:'nary' 
        in:(Smalltalk allClasses)
        by:#name
    "

    "Modified (format): / 14-07-2017 / 12:18:48 / cg"
! !

!FuzzyMatcher methodsFor:'accessing'!

pattern

	^ pattern 
!

pattern: aString

	pattern := aString.
	lowercasePattern := pattern asLowercase.
	indexes := Array new: pattern size.
	
! !

!FuzzyMatcher methodsFor:'comparing'!

match: aString ifScored: aBlock
	
	| score |
	
	score := 0.	
		
	pattern ifEmpty: [ aBlock value: score. ^ self ].
	
	(self matches: aString) ifFalse: [ ^ self ].
	
	score := self firstScore: aString at: indexes first.
	
	2 to: pattern size do: [ :pix | 
		score := score + (self score: aString at: (indexes at: pix) patternAt: pix)
	].
		
	score := score + self indexScore + ((aString size - pattern size) * self unmatchedLetterPenalty).
		
	aBlock value: score.	
!

matches: aString

	| idx |
	
	pattern size > aString size ifTrue: [ ^ false ].

	idx := 0.
	pattern withIndexDo: [ :each :i |
		idx := aString 
			findString: each asString 
			startingAt: idx + 1 
			caseSensitive: false. 
		
		idx == 0 ifTrue: [ ^ false ].
		indexes at: i put: idx.
	].

	^ true
! !

!FuzzyMatcher methodsFor:'initialization'!

initialize

	super initialize.

	pattern := lowercasePattern := ''.
	indexes := #().
	
! !

!FuzzyMatcher methodsFor:'private'!

firstScore: aString at: anIndex

	| score |

	score := (aString at: anIndex) = pattern first 
		ifTrue: [ self caseEqualBonus ]
		ifFalse: [ 0 ].
	
	anIndex = 1 	ifTrue: [ ^ score + self firstLetterBonus ].
		
	score := score + (((anIndex - 1) * self leadingLetterPenalty) max: self maxLeadingLetterPenalty).
				
	^ score 
!

indexScore 

	| sum ramp |
	
	ramp := 1.
	sum := 0.
	
	1 to: indexes size - 1 do: [ :ix |
		ramp := (indexes at: ix) + 1 = (indexes at: ix + 1) 
			ifTrue: [ ramp + (ramp * self adjacencyIncrease) ]
			ifFalse: [ 1 ].		
		
		sum := sum + ramp - 1
	].
	
	^ sum rounded
!

isSeparator: aCharacter
        
        ^  aCharacter = $_ or: [ aCharacter = $: ]

    "Created: / 13-07-2017 / 13:30:34 / cg"
!

isSeperator: aCharacter
        <resource: #obsolete>
        ^ self isSeparator: aCharacter

    "Modified: / 13-07-2017 / 13:31:18 / cg"
!

score: aString at: stringIndex patternAt: patternIndex

        | score prev |
        
        prev := (aString at: stringIndex - 1).
        
        score := (self isSeparator: prev) 
                ifTrue: [ self separatorBonus ]
                ifFalse: [ (prev asLowercase = (lowercasePattern at: patternIndex - 1))
                        ifTrue: [ 
                                self adjacencyBonus + 
                                ((prev = (pattern at: patternIndex - 1)) ifTrue: [ self adjacentCaseEqualBonus ] ifFalse: [ 0 ]) 
                        ] 
                        ifFalse: [ 0 ] 
                ].
        
        (aString at: stringIndex) = (pattern at: patternIndex) ifTrue: [ 
                score := score + self caseEqualBonus.
        ].

        ^ score

    "Modified: / 13-07-2017 / 13:30:57 / cg"
! !

!FuzzyMatcher methodsFor:'scoring-bonus'!

adjacencyBonus

	^ 5
!

adjacencyIncrease

	^ 1.2
!

adjacentCaseEqualBonus

	^ 3
!

caseEqualBonus

	^ 7
!

firstLetterBonus

	^ 12
!

separatorBonus

	^ 5
! !

!FuzzyMatcher methodsFor:'scoring-penalty'!

leadingLetterPenalty

	^ -3
!

maxLeadingLetterPenalty

	^ -9
!

unmatchedLetterPenalty

	^ -1
! !

!FuzzyMatcher class methodsFor:'documentation'!

version
    ^ '$Header$'
!

version_CVS
    ^ '$Header$'
! !