FuzzyMatcher.st
changeset 4459 cfbde361fe34
child 4468 4da827f70608
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/FuzzyMatcher.st	Thu Jul 13 13:40:46 2017 +0200
@@ -0,0 +1,302 @@
+"{ Package: 'stx:libbasic2' }"
+
+"{ NameSpace: Smalltalk }"
+
+Object subclass:#FuzzyMatcher
+	instanceVariableNames:'pattern lowercasePattern indexes'
+	classVariableNames:''
+	poolDictionaries:''
+	category:'Collections-Text-Support'
+!
+
+!FuzzyMatcher class methodsFor:'documentation'!
+
+documentation
+"
+    FuzzyMatcher is an approximate string matching algroithm that can determine if a string includes a given pattern.
+    For example, the string 'axby' matches both the pattern 'ab' and, 'ay', but not 'ba'. 
+
+    The algorithm is based on lib_fts[1], and includes an optional scoring algorithm 
+    that can be used to sort all the matches based on their similarity to the pattern.
+    It is used in the sublime text editor.
+    
+    [see also:]
+        https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
+        https://github.com/forrestthewoods/lib_fts
+
+"
+! !
+
+!FuzzyMatcher class methodsFor:'instance creation'!
+
+new
+    "return an initialized instance"
+
+    ^ self basicNew initialize.
+! !
+
+!FuzzyMatcher class methodsFor:'construction'!
+
+pattern: aString
+
+	^self new pattern: aString
+! !
+
+!FuzzyMatcher class methodsFor:'utilities api'!
+
+allMatching: aPattern in: aCollection
+        "Assumes that the collection is a collection of Strings"
+
+        | matcher |
+        
+        matcher := self pattern: aPattern.
+
+        ^ aCollection select: [ :each | matcher matches: each ]
+
+        "
+         self 
+            allMatching:'clu' 
+            in:(Smalltalk allClasses collect:#name)
+        "
+
+    "Modified (comment): / 13-07-2017 / 13:29:11 / cg"
+!
+
+allMatching: aPattern in: aCollection by: aBlockReturningString
+
+	| matcher |
+	
+	matcher := self pattern: aPattern.
+
+	^ aCollection select: [ :each | matcher matches: (aBlockReturningString value: each) ]
+!
+
+allSortedByScoreMatching: aPattern in: aCollection
+	"Assumes that the collection is a collection of Strings"
+	
+	^ self allSortedByScoreMatching: aPattern in: aCollection by: [ :each | each ]
+!
+
+allSortedByScoreMatching: aPattern in: aCollection by: aBlockReturningString
+
+	| matcher matches |
+	
+	aPattern isEmpty ifTrue: [ ^ aCollection asArray ].
+	
+	matcher := self pattern: aPattern.
+	matches := OrderedCollection new: aCollection size // 2.
+	
+	aCollection do: [ :each | 
+		matcher 
+			match: (aBlockReturningString value: each) 
+			ifScored: [ :score | matches add: score -> each ] 
+	].
+	
+	matches sort: [ :a :b | a key >= b key ].
+	
+	^ matches collect: [ :each | each value ] as: Array
+	
+! !
+
+!FuzzyMatcher methodsFor:'accessing'!
+
+pattern
+
+	^ pattern 
+!
+
+pattern: aString
+
+	pattern := aString.
+	lowercasePattern := pattern asLowercase.
+	indexes := Array new: pattern size.
+	
+! !
+
+!FuzzyMatcher methodsFor:'comparing'!
+
+match: aString ifScored: aBlock
+	
+	| score |
+	
+	score := 0.	
+		
+	pattern ifEmpty: [ aBlock value: score. ^ self ].
+	
+	(self matches: aString) ifFalse: [ ^ self ].
+	
+	score := self firstScore: aString at: indexes first.
+	
+	2 to: pattern size do: [ :pix | 
+		score := score + (self score: aString at: (indexes at: pix) patternAt: pix)
+	].
+		
+	score := score + self indexScore + ((aString size - pattern size) * self unmatchedLetterPenalty).
+		
+	aBlock value: score.	
+!
+
+matches: aString
+
+	| idx |
+	
+	pattern size > aString size ifTrue: [ ^ false ].
+
+	idx := 0.
+	pattern withIndexDo: [ :each :i |
+		idx := aString 
+			findString: each asString 
+			startingAt: idx + 1 
+			caseSensitive: false. 
+		
+		idx == 0 ifTrue: [ ^ false ].
+		indexes at: i put: idx.
+	].
+
+	^ true
+! !
+
+!FuzzyMatcher methodsFor:'initialization'!
+
+initialize
+
+	super initialize.
+
+	pattern := lowercasePattern := ''.
+	indexes := #().
+	
+! !
+
+!FuzzyMatcher methodsFor:'private'!
+
+firstScore: aString at: anIndex
+
+	| score |
+
+	score := (aString at: anIndex) = pattern first 
+		ifTrue: [ self caseEqualBonus ]
+		ifFalse: [ 0 ].
+	
+	anIndex = 1 	ifTrue: [ ^ score + self firstLetterBonus ].
+		
+	score := score + (((anIndex - 1) * self leadingLetterPenalty) max: self maxLeadingLetterPenalty).
+				
+	^ score 
+!
+
+indexScore 
+
+	| sum ramp |
+	
+	ramp := 1.
+	sum := 0.
+	
+	1 to: indexes size - 1 do: [ :ix |
+		ramp := (indexes at: ix) + 1 = (indexes at: ix + 1) 
+			ifTrue: [ ramp + (ramp * self adjacencyIncrease) ]
+			ifFalse: [ 1 ].		
+		
+		sum := sum + ramp - 1
+	].
+	
+	^ sum rounded
+!
+
+isSeparator: aCharacter
+        
+        ^  aCharacter = $_ or: [ aCharacter = $: ]
+
+    "Created: / 13-07-2017 / 13:30:34 / cg"
+!
+
+isSeperator: aCharacter
+        <resource: #obsolete>
+        ^ self isSeparator: aCharacter
+
+    "Modified: / 13-07-2017 / 13:31:18 / cg"
+!
+
+score: aString at: stringIndex patternAt: patternIndex
+
+        | score prev |
+        
+        prev := (aString at: stringIndex - 1).
+        
+        score := (self isSeparator: prev) 
+                ifTrue: [ self separatorBonus ]
+                ifFalse: [ (prev asLowercase = (lowercasePattern at: patternIndex - 1))
+                        ifTrue: [ 
+                                self adjacencyBonus + 
+                                ((prev = (pattern at: patternIndex - 1)) ifTrue: [ self adjacentCaseEqualBonus ] ifFalse: [ 0 ]) 
+                        ] 
+                        ifFalse: [ 0 ] 
+                ].
+        
+        (aString at: stringIndex) = (pattern at: patternIndex) ifTrue: [ 
+                score := score + self caseEqualBonus.
+        ].
+
+        ^ score
+
+    "Modified: / 13-07-2017 / 13:30:57 / cg"
+! !
+
+!FuzzyMatcher methodsFor:'scoring-bonus'!
+
+adjacencyBonus
+
+	^ 5
+!
+
+adjacencyIncrease
+
+	^ 1.2
+!
+
+adjacentCaseEqualBonus
+
+	^ 3
+!
+
+caseEqualBonus
+
+	^ 7
+!
+
+firstLetterBonus
+
+	^ 12
+!
+
+separatorBonus
+
+	^ 5
+! !
+
+!FuzzyMatcher methodsFor:'scoring-penalty'!
+
+leadingLetterPenalty
+
+	^ -3
+!
+
+maxLeadingLetterPenalty
+
+	^ -9
+!
+
+unmatchedLetterPenalty
+
+	^ -1
+! !
+
+!FuzzyMatcher class methodsFor:'documentation'!
+
+version
+    ^ '$Header$'
+!
+
+version_CVS
+    ^ '$Header$'
+! !
+