--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/FuzzyMatcher.st Thu Jul 13 13:40:46 2017 +0200
@@ -0,0 +1,302 @@
+"{ Package: 'stx:libbasic2' }"
+
+"{ NameSpace: Smalltalk }"
+
+Object subclass:#FuzzyMatcher
+ instanceVariableNames:'pattern lowercasePattern indexes'
+ classVariableNames:''
+ poolDictionaries:''
+ category:'Collections-Text-Support'
+!
+
+!FuzzyMatcher class methodsFor:'documentation'!
+
+documentation
+"
+ FuzzyMatcher is an approximate string matching algroithm that can determine if a string includes a given pattern.
+ For example, the string 'axby' matches both the pattern 'ab' and, 'ay', but not 'ba'.
+
+ The algorithm is based on lib_fts[1], and includes an optional scoring algorithm
+ that can be used to sort all the matches based on their similarity to the pattern.
+ It is used in the sublime text editor.
+
+ [see also:]
+ https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
+ https://github.com/forrestthewoods/lib_fts
+
+"
+! !
+
+!FuzzyMatcher class methodsFor:'instance creation'!
+
+new
+ "return an initialized instance"
+
+ ^ self basicNew initialize.
+! !
+
+!FuzzyMatcher class methodsFor:'construction'!
+
+pattern: aString
+
+ ^self new pattern: aString
+! !
+
+!FuzzyMatcher class methodsFor:'utilities api'!
+
+allMatching: aPattern in: aCollection
+ "Assumes that the collection is a collection of Strings"
+
+ | matcher |
+
+ matcher := self pattern: aPattern.
+
+ ^ aCollection select: [ :each | matcher matches: each ]
+
+ "
+ self
+ allMatching:'clu'
+ in:(Smalltalk allClasses collect:#name)
+ "
+
+ "Modified (comment): / 13-07-2017 / 13:29:11 / cg"
+!
+
+allMatching: aPattern in: aCollection by: aBlockReturningString
+
+ | matcher |
+
+ matcher := self pattern: aPattern.
+
+ ^ aCollection select: [ :each | matcher matches: (aBlockReturningString value: each) ]
+!
+
+allSortedByScoreMatching: aPattern in: aCollection
+ "Assumes that the collection is a collection of Strings"
+
+ ^ self allSortedByScoreMatching: aPattern in: aCollection by: [ :each | each ]
+!
+
+allSortedByScoreMatching: aPattern in: aCollection by: aBlockReturningString
+
+ | matcher matches |
+
+ aPattern isEmpty ifTrue: [ ^ aCollection asArray ].
+
+ matcher := self pattern: aPattern.
+ matches := OrderedCollection new: aCollection size // 2.
+
+ aCollection do: [ :each |
+ matcher
+ match: (aBlockReturningString value: each)
+ ifScored: [ :score | matches add: score -> each ]
+ ].
+
+ matches sort: [ :a :b | a key >= b key ].
+
+ ^ matches collect: [ :each | each value ] as: Array
+
+! !
+
+!FuzzyMatcher methodsFor:'accessing'!
+
+pattern
+
+ ^ pattern
+!
+
+pattern: aString
+
+ pattern := aString.
+ lowercasePattern := pattern asLowercase.
+ indexes := Array new: pattern size.
+
+! !
+
+!FuzzyMatcher methodsFor:'comparing'!
+
+match: aString ifScored: aBlock
+
+ | score |
+
+ score := 0.
+
+ pattern ifEmpty: [ aBlock value: score. ^ self ].
+
+ (self matches: aString) ifFalse: [ ^ self ].
+
+ score := self firstScore: aString at: indexes first.
+
+ 2 to: pattern size do: [ :pix |
+ score := score + (self score: aString at: (indexes at: pix) patternAt: pix)
+ ].
+
+ score := score + self indexScore + ((aString size - pattern size) * self unmatchedLetterPenalty).
+
+ aBlock value: score.
+!
+
+matches: aString
+
+ | idx |
+
+ pattern size > aString size ifTrue: [ ^ false ].
+
+ idx := 0.
+ pattern withIndexDo: [ :each :i |
+ idx := aString
+ findString: each asString
+ startingAt: idx + 1
+ caseSensitive: false.
+
+ idx == 0 ifTrue: [ ^ false ].
+ indexes at: i put: idx.
+ ].
+
+ ^ true
+! !
+
+!FuzzyMatcher methodsFor:'initialization'!
+
+initialize
+
+ super initialize.
+
+ pattern := lowercasePattern := ''.
+ indexes := #().
+
+! !
+
+!FuzzyMatcher methodsFor:'private'!
+
+firstScore: aString at: anIndex
+
+ | score |
+
+ score := (aString at: anIndex) = pattern first
+ ifTrue: [ self caseEqualBonus ]
+ ifFalse: [ 0 ].
+
+ anIndex = 1 ifTrue: [ ^ score + self firstLetterBonus ].
+
+ score := score + (((anIndex - 1) * self leadingLetterPenalty) max: self maxLeadingLetterPenalty).
+
+ ^ score
+!
+
+indexScore
+
+ | sum ramp |
+
+ ramp := 1.
+ sum := 0.
+
+ 1 to: indexes size - 1 do: [ :ix |
+ ramp := (indexes at: ix) + 1 = (indexes at: ix + 1)
+ ifTrue: [ ramp + (ramp * self adjacencyIncrease) ]
+ ifFalse: [ 1 ].
+
+ sum := sum + ramp - 1
+ ].
+
+ ^ sum rounded
+!
+
+isSeparator: aCharacter
+
+ ^ aCharacter = $_ or: [ aCharacter = $: ]
+
+ "Created: / 13-07-2017 / 13:30:34 / cg"
+!
+
+isSeperator: aCharacter
+ <resource: #obsolete>
+ ^ self isSeparator: aCharacter
+
+ "Modified: / 13-07-2017 / 13:31:18 / cg"
+!
+
+score: aString at: stringIndex patternAt: patternIndex
+
+ | score prev |
+
+ prev := (aString at: stringIndex - 1).
+
+ score := (self isSeparator: prev)
+ ifTrue: [ self separatorBonus ]
+ ifFalse: [ (prev asLowercase = (lowercasePattern at: patternIndex - 1))
+ ifTrue: [
+ self adjacencyBonus +
+ ((prev = (pattern at: patternIndex - 1)) ifTrue: [ self adjacentCaseEqualBonus ] ifFalse: [ 0 ])
+ ]
+ ifFalse: [ 0 ]
+ ].
+
+ (aString at: stringIndex) = (pattern at: patternIndex) ifTrue: [
+ score := score + self caseEqualBonus.
+ ].
+
+ ^ score
+
+ "Modified: / 13-07-2017 / 13:30:57 / cg"
+! !
+
+!FuzzyMatcher methodsFor:'scoring-bonus'!
+
+adjacencyBonus
+
+ ^ 5
+!
+
+adjacencyIncrease
+
+ ^ 1.2
+!
+
+adjacentCaseEqualBonus
+
+ ^ 3
+!
+
+caseEqualBonus
+
+ ^ 7
+!
+
+firstLetterBonus
+
+ ^ 12
+!
+
+separatorBonus
+
+ ^ 5
+! !
+
+!FuzzyMatcher methodsFor:'scoring-penalty'!
+
+leadingLetterPenalty
+
+ ^ -3
+!
+
+maxLeadingLetterPenalty
+
+ ^ -9
+!
+
+unmatchedLetterPenalty
+
+ ^ -1
+! !
+
+!FuzzyMatcher class methodsFor:'documentation'!
+
+version
+ ^ '$Header$'
+!
+
+version_CVS
+ ^ '$Header$'
+! !
+