FuzzyMatcher.st
author Claus Gittinger <cg@exept.de>
Thu, 13 Jul 2017 13:40:46 +0200
changeset 4459 cfbde361fe34
child 4468 4da827f70608
permissions -rw-r--r--
initial checkin

"{ Package: 'stx:libbasic2' }"

"{ NameSpace: Smalltalk }"

Object subclass:#FuzzyMatcher
	instanceVariableNames:'pattern lowercasePattern indexes'
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Support'
!

!FuzzyMatcher class methodsFor:'documentation'!

documentation
"
    FuzzyMatcher is an approximate string matching algroithm that can determine if a string includes a given pattern.
    For example, the string 'axby' matches both the pattern 'ab' and, 'ay', but not 'ba'. 

    The algorithm is based on lib_fts[1], and includes an optional scoring algorithm 
    that can be used to sort all the matches based on their similarity to the pattern.
    It is used in the sublime text editor.
    
    [see also:]
        https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
        https://github.com/forrestthewoods/lib_fts

"
! !

!FuzzyMatcher class methodsFor:'instance creation'!

new
    "return an initialized instance"

    ^ self basicNew initialize.
! !

!FuzzyMatcher class methodsFor:'construction'!

pattern: aString

	^self new pattern: aString
! !

!FuzzyMatcher class methodsFor:'utilities api'!

allMatching: aPattern in: aCollection
        "Assumes that the collection is a collection of Strings"

        | matcher |
        
        matcher := self pattern: aPattern.

        ^ aCollection select: [ :each | matcher matches: each ]

        "
         self 
            allMatching:'clu' 
            in:(Smalltalk allClasses collect:#name)
        "

    "Modified (comment): / 13-07-2017 / 13:29:11 / cg"
!

allMatching: aPattern in: aCollection by: aBlockReturningString

	| matcher |
	
	matcher := self pattern: aPattern.

	^ aCollection select: [ :each | matcher matches: (aBlockReturningString value: each) ]
!

allSortedByScoreMatching: aPattern in: aCollection
	"Assumes that the collection is a collection of Strings"
	
	^ self allSortedByScoreMatching: aPattern in: aCollection by: [ :each | each ]
!

allSortedByScoreMatching: aPattern in: aCollection by: aBlockReturningString

	| matcher matches |
	
	aPattern isEmpty ifTrue: [ ^ aCollection asArray ].
	
	matcher := self pattern: aPattern.
	matches := OrderedCollection new: aCollection size // 2.
	
	aCollection do: [ :each | 
		matcher 
			match: (aBlockReturningString value: each) 
			ifScored: [ :score | matches add: score -> each ] 
	].
	
	matches sort: [ :a :b | a key >= b key ].
	
	^ matches collect: [ :each | each value ] as: Array
	
! !

!FuzzyMatcher methodsFor:'accessing'!

pattern

	^ pattern 
!

pattern: aString

	pattern := aString.
	lowercasePattern := pattern asLowercase.
	indexes := Array new: pattern size.
	
! !

!FuzzyMatcher methodsFor:'comparing'!

match: aString ifScored: aBlock
	
	| score |
	
	score := 0.	
		
	pattern ifEmpty: [ aBlock value: score. ^ self ].
	
	(self matches: aString) ifFalse: [ ^ self ].
	
	score := self firstScore: aString at: indexes first.
	
	2 to: pattern size do: [ :pix | 
		score := score + (self score: aString at: (indexes at: pix) patternAt: pix)
	].
		
	score := score + self indexScore + ((aString size - pattern size) * self unmatchedLetterPenalty).
		
	aBlock value: score.	
!

matches: aString

	| idx |
	
	pattern size > aString size ifTrue: [ ^ false ].

	idx := 0.
	pattern withIndexDo: [ :each :i |
		idx := aString 
			findString: each asString 
			startingAt: idx + 1 
			caseSensitive: false. 
		
		idx == 0 ifTrue: [ ^ false ].
		indexes at: i put: idx.
	].

	^ true
! !

!FuzzyMatcher methodsFor:'initialization'!

initialize

	super initialize.

	pattern := lowercasePattern := ''.
	indexes := #().
	
! !

!FuzzyMatcher methodsFor:'private'!

firstScore: aString at: anIndex

	| score |

	score := (aString at: anIndex) = pattern first 
		ifTrue: [ self caseEqualBonus ]
		ifFalse: [ 0 ].
	
	anIndex = 1 	ifTrue: [ ^ score + self firstLetterBonus ].
		
	score := score + (((anIndex - 1) * self leadingLetterPenalty) max: self maxLeadingLetterPenalty).
				
	^ score 
!

indexScore 

	| sum ramp |
	
	ramp := 1.
	sum := 0.
	
	1 to: indexes size - 1 do: [ :ix |
		ramp := (indexes at: ix) + 1 = (indexes at: ix + 1) 
			ifTrue: [ ramp + (ramp * self adjacencyIncrease) ]
			ifFalse: [ 1 ].		
		
		sum := sum + ramp - 1
	].
	
	^ sum rounded
!

isSeparator: aCharacter
        
        ^  aCharacter = $_ or: [ aCharacter = $: ]

    "Created: / 13-07-2017 / 13:30:34 / cg"
!

isSeperator: aCharacter
        <resource: #obsolete>
        ^ self isSeparator: aCharacter

    "Modified: / 13-07-2017 / 13:31:18 / cg"
!

score: aString at: stringIndex patternAt: patternIndex

        | score prev |
        
        prev := (aString at: stringIndex - 1).
        
        score := (self isSeparator: prev) 
                ifTrue: [ self separatorBonus ]
                ifFalse: [ (prev asLowercase = (lowercasePattern at: patternIndex - 1))
                        ifTrue: [ 
                                self adjacencyBonus + 
                                ((prev = (pattern at: patternIndex - 1)) ifTrue: [ self adjacentCaseEqualBonus ] ifFalse: [ 0 ]) 
                        ] 
                        ifFalse: [ 0 ] 
                ].
        
        (aString at: stringIndex) = (pattern at: patternIndex) ifTrue: [ 
                score := score + self caseEqualBonus.
        ].

        ^ score

    "Modified: / 13-07-2017 / 13:30:57 / cg"
! !

!FuzzyMatcher methodsFor:'scoring-bonus'!

adjacencyBonus

	^ 5
!

adjacencyIncrease

	^ 1.2
!

adjacentCaseEqualBonus

	^ 3
!

caseEqualBonus

	^ 7
!

firstLetterBonus

	^ 12
!

separatorBonus

	^ 5
! !

!FuzzyMatcher methodsFor:'scoring-penalty'!

leadingLetterPenalty

	^ -3
!

maxLeadingLetterPenalty

	^ -9
!

unmatchedLetterPenalty

	^ -1
! !

!FuzzyMatcher class methodsFor:'documentation'!

version
    ^ '$Header$'
!

version_CVS
    ^ '$Header$'
! !