FuzzyMatcher.st
author Claus Gittinger <cg@exept.de>
Sat, 02 May 2020 21:40:13 +0200
changeset 5476 7355a4b11cb6
parent 5376 61b8a719febd
permissions -rw-r--r--
#FEATURE by cg class: Socket class added: #newTCPclientToHost:port:domain:domainOrder:withTimeout: changed: #newTCPclientToHost:port:domain:withTimeout:

"{ Encoding: utf8 }"

"
 COPYRIGHT (c) 2018 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic2' }"

"{ NameSpace: Smalltalk }"

Object subclass:#FuzzyMatcher
	instanceVariableNames:'pattern lowercasePattern indexes'
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Support'
!

!FuzzyMatcher class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 2018 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    FuzzyMatcher is an approximate string matching algorithm that can determine if a string includes a given pattern.
    For example, the string 'axby' matches both the pattern 'ab' and, 'ay', but not 'ba'. 
    I.e. it matches if the searched string contains a sequence of chars, probably intermixed by other chars,
    which matches the given search pattern or part of it.
    
    The algorithm is based on lib_fts[1], and includes an optional scoring algorithm 
    that can be used to sort all the matches based on their similarity to the pattern.
    It is used (among others) in the sublime text editor.

    [caveat:]
        although this works great for class searches,
        it is strange that 'dabc' scores lower against 'abc' than 'adbc'
        (dabc has a longer common subsequence without interruptions...)

    [see also:]
        https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
        https://github.com/forrestthewoods/lib_fts

"
!

example
"
    |matcher|
    
    matcher := FuzzyMatcher pattern:'abc'.
    matcher 
        match:'somearbitrarysequence'   
        ifScored: [:score | Transcript show:('''somearbitrarysequence'' scores '); showCR:score].

    matcher 
        match:'someabcd'   
        ifScored: [:score | Transcript show:('''someabcd'' scores '); showCR:score].

    matcher 
        match:'abcd'   
        ifScored: [:score | Transcript show:('''abcd'' scores '); showCR:score].

    matcher 
        match:'dabc'   
        ifScored: [:score | Transcript show:('''dabc'' scores '); showCR:score].

    matcher 
        match:'adbc'   
        ifScored: [:score | Transcript show:('''adbc'' scores '); showCR:score].

    matcher 
        match:'abc'   
        ifScored: [:score | Transcript show:('''abc'' scores '); showCR:score].

    
    |top lv list field patternHolder names|

    patternHolder := '' asValue.
    list := List new.
    
    top := StandardSystemView new.
    lv := ListView origin:(0.0@30) corner:(1.0@1.0) in:top.
    lv model:list.

    field := EditField origin:(0.0@0.0) corner:(1.0@30) in:top.
    field model:patternHolder.
    field immediateAccept:true.

    names := Smalltalk allClasses collect:#name.

    patternHolder 
        onChangeEvaluate:[
            |matcher pattern matches|

            pattern := patternHolder value.
            pattern notEmpty ifTrue:[
                matcher := FuzzyMatcher pattern:pattern.
                
                matches := OrderedCollection new.

                names do:[:eachClassName | 
                    matcher 
                        match:eachClassName
                        ifScored: [ :score | matches add: eachClassName -> score ] 
                ].
                matches sort:[:a :b |
                        a value < b value
                        or:[ a value = b value and:[ a key > b key]]
                ].

                list removeAll.
                list addAllReversed:(matches 
                                collect:[:nameScoreAssoc | 
                                    '[%1] %2' bindWith:nameScoreAssoc value with:nameScoreAssoc key])
            ].    
        ].    
    top open.
    patternHolder value:'mph'.
"
! !

!FuzzyMatcher class methodsFor:'instance creation'!

new
    "return an initialized instance"

    ^ self basicNew initialize.
!

pattern: aString
    ^ self new pattern: aString

    "
     (self pattern:'mrp') matches:'ButtonMorph'
     (self pattern:'mrp') matches:'ButtonMorh'

     (self pattern:'mrp') matches:'ButtonMorph'
     (self pattern:'mrp') matches:'ButtonMorh'
    "

    "Modified (comment): / 02-08-2017 / 15:57:07 / cg"
! !

!FuzzyMatcher class methodsFor:'utilities api'!

allMatching: aPattern in: aCollection
    "Assumes that the collection is a collection of Strings;
     return all those which match"

    | matcher |

    matcher := self pattern: aPattern.
    ^ aCollection select: [ :each | matcher matches: each ]

    "
     self 
        allMatching:'clu' 
        in:(Smalltalk allClasses collect:#name)
    "

    "Modified (comment): / 02-08-2017 / 16:02:44 / cg"
!

allMatching: aPattern in: aCollection by: aBlockReturningString
    "selects matching elements from aCollection.
     aBlockReturningString is applied to elements to get the string representation
     (can be used eg. to sort classes)"
     
    | matcher |

    matcher := self pattern: aPattern.

    ^ aCollection select: [ :each | matcher matches: (aBlockReturningString value: each) ]

        "
         self 
            allMatching:'clu' 
            in:(Smalltalk allClasses)
            by:[:cls | cls name]
        "
        "
         self 
            allMatching:'clu' 
            in:(Smalltalk allClasses)
            by:#name
        "

    "Modified (comment): / 14-07-2017 / 12:21:40 / cg"
!

allSortedByScoreMatching: aPattern in: aCollection
    "Assumes that the collection is a collection of Strings;
     returns matching strings sorted by score (level of similarity)"

    ^ self allSortedByScoreMatching: aPattern in: aCollection by: [ :each | each ]

    "
     self 
        allSortedByScoreMatching:'clu' 
        in:(Smalltalk allClasses collect:#name)
    "
    "
     self 
        allSortedByScoreMatching:'nary' 
        in:(Smalltalk allClasses collect:#name)
    "

    "Modified (comment): / 14-07-2017 / 12:22:14 / cg"
!

allSortedByScoreMatching: aPattern in: aCollection by: aBlockReturningString
    "selects matching elements from aCollection.
     aBlockReturningString is applied to elements to get the string representation.
     Returns them sorted by score (i.e. similarity).
     (can be used eg. to sort classes)"

    | matchesAndScores |

    matchesAndScores := self allWithScoresSortedByScoreMatching: aPattern in: aCollection by: aBlockReturningString.
    ^ matchesAndScores collect: [ :each | each value ]

    "
     self 
        allSortedByScoreMatching:'' 
        in:(Smalltalk allClasses)
        by:[:cls | cls name]
    "
    "
     self 
        allSortedByScoreMatching:'nary' 
        in:(Smalltalk allClasses)
        by:[:cls | cls name]
    "
    "
     self 
        allSortedByScoreMatching:'nary' 
        in:(Smalltalk allClasses)
        by:#name
    "

    "Modified: / 14-07-2017 / 12:43:14 / cg"
!

allWithScoresSortedByScoreMatching: aPattern in: aCollection by: aBlockReturningString
    "selects matching elements from aCollection.
     aBlockReturningString is applied to elements to get the string representation.
     Returns them sorted by score (i.e. similarity) associated to their scores.
     (can be used eg. to sort classes)"

    |matcher matches|


    matcher := self pattern: aPattern.
    matches := OrderedCollection new: aCollection size // 2.

    aCollection do: [ :each | 
        matcher 
            match: (aBlockReturningString value: each) 
            ifScored: [ :score | matches add: score -> each ] 
    ].
    matches sort: [ :a :b | a key > b key].
    ^ matches asArray

    "
     self 
        allWithScoresSortedByScoreMatching:'' 
        in:(Smalltalk allClasses)
        by:[:cls | cls name]
    "
    "
     self 
        allWithScoresSortedByScoreMatching:'OC' 
        in:(Smalltalk allClasses)
        by:[:cls | cls name]
    "
    "
     self 
        allWithScoresSortedByScoreMatching:'nary' 
        in:(Smalltalk allClasses)
        by:[:cls | cls name]
    "
    "
     self 
        allWithScoresSortedByScoreMatching:'nary' 
        in:(Smalltalk allClasses)
        by:#name
    "

    "Created: / 14-07-2017 / 12:25:19 / cg"
! !

!FuzzyMatcher methodsFor:'accessing'!

indexes
    "only valid inside the match callback block"
    
    ^ indexes

    "Created: / 15-07-2017 / 14:57:10 / cg"
!

pattern

	^ pattern 
!

pattern: aString

        pattern := aString.
        lowercasePattern := pattern asLowercase.
        indexes := Array new: pattern size.

    "Modified (format): / 14-07-2017 / 12:59:15 / cg"
! !

!FuzzyMatcher methodsFor:'api - comparing'!

match: aString ifScored: aBlock
    "If there is a match, evaluate aBlock, passing the score value"
        
    | scoreOrNil |

    scoreOrNil := self matchScoreOrNil: aString.
    scoreOrNil notNil ifTrue:[
        aBlock value:scoreOrNil
    ].

    "Modified: / 02-08-2017 / 16:00:59 / cg"
!

matchScoreOrNil: aString
    "return the scrore if there is a match; nil otherwise."
        
    | score |

    pattern ifEmpty: [ ^ (aString size negated) ].
    (self matches: aString) ifFalse: [ ^ nil ].

    score := self firstScore: aString at: indexes first.

    2 to: pattern size do: [ :pix | 
            score := score + (self score: aString at: (indexes at: pix) patternAt: pix)
    ].

    score := score + self indexScore + ((aString size - pattern size) * self unmatchedLetterPenalty).

    ^ score.

    "Created: / 02-08-2017 / 15:59:56 / cg"
!

matches: aString
    "return true if there is a match; false otherwise."
     
    | idx |

    pattern size > aString size ifTrue: [ ^ false ].

    idx := 0.
    pattern withIndexDo: [ :each :i |
            idx := aString 
                    findString: each asString 
                    startingAt: idx + 1 
                    caseSensitive: false. 

            idx == 0 ifTrue: [ ^ false ].
            indexes at: i put: idx.
    ].
    ^ true

    "Modified (format): / 02-08-2017 / 16:01:05 / cg"
! !

!FuzzyMatcher methodsFor:'initialization'!

initialize

        super initialize.

        pattern := lowercasePattern := ''.
        indexes := #().

    "Modified (format): / 14-07-2017 / 13:23:26 / cg"
! !

!FuzzyMatcher methodsFor:'private'!

firstScore: aString at: anIndex

	| score |

	score := (aString at: anIndex) = pattern first 
		ifTrue: [ self caseEqualBonus ]
		ifFalse: [ 0 ].
	
	anIndex = 1 	ifTrue: [ ^ score + self firstLetterBonus ].
		
	score := score + (((anIndex - 1) * self leadingLetterPenalty) max: self maxLeadingLetterPenalty).
				
	^ score 
!

indexScore 

        | sum ramp |
        
        ramp := 1.
        sum := 0.
        
        1 to: indexes size - 1 do: [ :ix |
                ramp := (indexes at: ix) + 1 = (indexes at: ix + 1) 
                        ifTrue: [ ramp + (ramp * self adjacencyIncrease) ]
                        ifFalse: [ 1 ].                 
                
                sum := sum + ramp - 1
        ].
        
        ^ sum rounded

    "Modified (format): / 14-07-2017 / 13:24:07 / cg"
!

isSeparator: aCharacter
        
        ^  aCharacter = $_ or: [ aCharacter = $: ]

    "Created: / 13-07-2017 / 13:30:34 / cg"
!

score: aString at: stringIndex patternAt: patternIndex

        | score prev |
        
        prev := (aString at: stringIndex - 1).
        
        score := (self isSeparator: prev) 
                ifTrue: [ self separatorBonus ]
                ifFalse: [ (prev asLowercase = (lowercasePattern at: patternIndex - 1))
                        ifTrue: [ 
                                self adjacencyBonus + 
                                ((prev = (pattern at: patternIndex - 1)) ifTrue: [ self adjacentCaseEqualBonus ] ifFalse: [ 0 ]) 
                        ] 
                        ifFalse: [ 0 ] 
                ].
        
        (aString at: stringIndex) = (pattern at: patternIndex) ifTrue: [ 
                score := score + self caseEqualBonus.
        ].

        ^ score

    "Modified: / 13-07-2017 / 13:30:57 / cg"
! !

!FuzzyMatcher methodsFor:'scoring-bonus'!

adjacencyBonus

	^ 5
!

adjacencyIncrease

	^ 1.2
!

adjacentCaseEqualBonus

	^ 3
!

caseEqualBonus

	^ 7
!

firstLetterBonus

	^ 12
!

separatorBonus

	^ 5
! !

!FuzzyMatcher methodsFor:'scoring-penalty'!

leadingLetterPenalty

	^ -3
!

maxLeadingLetterPenalty

	^ -9
!

unmatchedLetterPenalty

	^ -1
! !

!FuzzyMatcher class methodsFor:'documentation'!

version
    ^ '$Header$'
!

version_CVS
    ^ '$Header$'
! !