StringUtilities.st
author Claus Gittinger <cg@exept.de>
Sat, 02 May 2020 21:40:13 +0200
changeset 5476 7355a4b11cb6
parent 4970 60474b8d086d
permissions -rw-r--r--
#FEATURE by cg class: Socket class added: #newTCPclientToHost:port:domain:domainOrder:withTimeout: changed: #newTCPclientToHost:port:domain:withTimeout:

"{ Encoding: utf8 }"

"
 COPYRIGHT (c) 1994 by Claus Gittinger
 COPYRIGHT (c) 2009 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic2' }"

"{ NameSpace: Smalltalk }"

Object subclass:#StringUtilities
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Support'
!

!StringUtilities class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 1994 by Claus Gittinger
 COPYRIGHT (c) 2009 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    some less often used algorithms have been moved to here to
    make libbasic more compact.
"
! !

!StringUtilities class methodsFor:'edit distance'!

editDistanceFrom:s1 to:s2 s:substWeight k:kbdTypoWeight c:caseWeight e:exchangeWeight i:insrtWeight
    "another, simpler editing distance between two strings. 
     See also: levenshtein"

    |editedS2 min d|

    s2 size > s1 size ifTrue:[
        ^ self editDistanceFrom:s2 to:s1 s:substWeight k:kbdTypoWeight c:caseWeight e:exchangeWeight i:insrtWeight
    ].
    s1 size > s2 size ifTrue:[
        0 to:s2 size do:[:pos |
            editedS2 := s2 copyWith:$# insertedAfterIndex:pos.
            
            d := self editDistanceFrom:s1 to:editedS2 s:substWeight k:kbdTypoWeight c:caseWeight e:exchangeWeight i:insrtWeight.
            min := (min ? d) min:d.
        ].
        ^ min + insrtWeight
    ].

    ^ (1 to:s1 size) sum:
        [:i | 
            ((s2 at:i) == $# or:[ (s1 at:i)=(s2 at:i)]) 
                ifTrue:[0] 
                ifFalse:[substWeight]
        ] 

    "
     'comptuer' levenshteinTo:'computer'      
     self editDistanceFrom:'comptuer' to:'computer' s:4 k:2 c:1 e:nil i:2        

     'computr' levenshteinTo:'computer'                                    
     self editDistanceFrom:'computr' to:'computer' s:4 k:2 c:1 e:nil i:2        
    "

    "Modified (format): / 09-08-2012 / 05:41:59 / cg"
!

isKey:k1 nextTo:k2
    "return true, if k1 and k2 are adjacent keys on the keyboard.
     This is used to specially priorize plausible typing errors of adjacent keys."

    ^ self isKey:k1 nextTo:k2 onKeyboard:(self keyboardLayout)

    "
     self isKey:$a nextTo:$a   
     self isKey:$a nextTo:$s   
     self isKey:$a nextTo:$q   
     self isKey:$a nextTo:$w    
     self isKey:$a nextTo:$y    
     self isKey:$a nextTo:$z    
     self isKey:$a nextTo:$x    
    "
!

isKey:k1 nextTo:k2 onKeyboard:keys
    "return true, if k1 and k2 are adjacent keys on the keyboard defined by keys"

    |row1 row2 col1 col2|

    row1 := keys findFirst:[:eachRow | col1 := eachRow indexOf:k1. col1 ~~ 0].
    row1 == 0 ifTrue:[^ false].
    row2 := keys findFirst:[:eachRow | col2 := eachRow indexOf:k2. col2 ~~ 0].
    row2 == 0 ifTrue:[^ false].

    ^ (row1-row2) abs <= 1 and:[(col1-col2) abs <= 1]

    "
     self isKey:$a nextTo:$q onKeyboard:(StringUtilities keyboardLayoutForLanguage:#de)
     self isKey:$a nextTo:$x onKeyboard:(StringUtilities keyboardLayoutForLanguage:#de)
    "
!

keyboardLayout
    "the keyboard layout (used with algorithms to find possible typing errors,
     for example: edit distance in levenshtein)"

    ^ self keyboardLayoutForLanguage:(UserPreferences current language)

    "
     self keyboardLayout
    "
!

keyboardLayoutForLanguage:lang
    "the keyboard layout (used with algorithms to find possible typing errors,
     for example: edit distance in levenshtein).
     CAVEAT: hard coded us- and german keyboards here - should go into resource file."

    "/ danish
    lang == #da ifTrue:[
        ^ #( 
               '1234567890-'
               '*qwertyuiopå'
               '**asdfghjklæø'
               '***zxcvbnm' ).
    ].
    lang == #no ifTrue:[
        ^ #( 
               '1234567890-'
               '*qwertyuiopå'
               '**asdfghjkløæ'
               '***zxcvbnm' ).
    ].

    (lang == #de or:[lang == #pl or:[lang == #cz]]) ifTrue:[
        ^ #( 
               '1234567890-'
               '*qwertzuiop'
               '**asdfghjkl:'
               '***yxcvbnm' ).
    ].
    lang == #hu ifTrue:[
        ^ #( 
               '1234567890-'                         
               '*qwertyuiopõú'
               '**asdfghjkléáũ'
               '**ízxcvbnm' ).
    ].

    lang == #pt ifTrue:[
        ^ #( 
               '1234567890-'
               '*qwertyuiop'
               '**asdfghjklç'
               '***zxcvbnm' ).
    ].
    lang == #es ifTrue:[
        ^ #( 
               '1234567890-'
               '*qwertyuiop'
               '**asdfghjklñ'
               '***zxcvbnm' ).
    ].
    (lang == #sv or:[lang == #fi]) ifTrue:[
        ^ #( 
               '1234567890-'
               '*qwertyuiopå'
               '**asdfghjklöä'
               '***zxcvbnm' ).
    ].
    lang == #fr ifTrue:[
        ^ #( 
               '1234567890'
               '*azertyuiop'
               '**qsdfghjklm'
               '***wxcvbn,' ).
    ].

    "/ fallback: english, dutch, italian
    ^ #( 
           '1234567890-'
           '*qwertyuiop'
           '**asdfghjkl:'
           '***zxcvbnm' ).

    "
     self keyboardLayoutForLanguage:#de 
    "

    "Modified (comment): / 09-08-2012 / 05:39:19 / cg"
!

levenshteinDistanceFrom:string1 to:string2 s:substWeight k:kbdTypoWeight c:caseWeight e:exchangeWeight i:insrtWeight d:deleteWeight
    "parametrized levenshtein.
     return the levenshtein distance of two strings;
     this value corrensponds to the number of replacements that have to be
     made to get string2 from string1. The smaller the returned number,
     tbe more similar are the two strings.

     The arguments are the costs for
        s:substitution,
        k:keyboard type (substitution),   if nil, s is used
        c:case-change,                    if nil, s is used
        i:insertion
        d:deletion
        e:exchange                        if nil, s*2 is used
     of a character.
     See IEEE transactions on Computers 1976 Pg 172 ff."

    |d  "delta matrix"
     len1 "{ Class: SmallInteger }"
     len2 "{ Class: SmallInteger }"
     dim  "{ Class: SmallInteger }"
     prevRow row col
     dimPlus1 "{ Class: SmallInteger }"
     min pp c1 c2|

    len1 := string1 size.
    len2 := string2 size.

    "create the help-matrix"

    dim := len1 max:len2.
    dimPlus1 := dim + 1.

    d := Array new:dimPlus1.
    1 to:dimPlus1 do:[:i |
        d at:i put:(Array new:dimPlus1)
    ].

    "init help-matrix"

    (d at:1) at:1 put:0.
    row := d at:1.
    1 to:dim do:[:j |
        row at:(j + 1) put:( (row at:j) + insrtWeight )
    ].

    1 to:dim do:[:i |
        (d at:(i + 1)) at:1 put:(  ((d at:i) at:1) + deleteWeight )
    ].

    1 to:len1 do:[:i |
        c1 := string1 at:i.
        1 to:len2 do:[:j |
            c2 := string2 at:j.
            (c1 == c2) ifTrue:[
                pp := 0
            ] ifFalse:[
                (c1 asLowercase == c2 asLowercase) ifTrue:[
                    pp := caseWeight
                ] ifFalse:[
                    pp := substWeight.
                    substWeight ~~ kbdTypoWeight ifTrue:[
                        (self isKey:c1 asLowercase nextTo:c2 asLowercase) ifTrue:[
                            pp := kbdTypoWeight ? substWeight.
                        ].
                    ].
                    "/ assmuing (;-) that an exchange is very common when typing...
                    exchangeWeight notNil ifTrue:[
                        (i < len1 and:[j < len2]) ifTrue:[
                            ((string1 at:i) = (string2 at:(j+1))
                            and:[ (string1 at:i+1) = (string2 at:j) ]) ifTrue:[
                                pp := exchangeWeight.
                            ].
                        ].
                        (i > 1 and:[j > 1]) ifTrue:[
                            ((string1 at:i) = (string2 at:(j-1))
                            and:[ (string1 at:i-1) = (string2 at:j) ]) ifTrue:[
                                pp := exchangeWeight.
                            ].
                        ].
                    ].
                ]
            ].

            prevRow := d at:i.
            row := d at:(i + 1).
            col := j + 1.
            min := (prevRow at:j) + pp.
            min := min min:( (row at:j) + insrtWeight).
            min := min min:( (prevRow at:col) + deleteWeight).

            row at:col put: min
        ]
    ].

    ^ (d at:(len1 + 1)) at:(len2 + 1)

    "
     'comptuer' levenshteinTo:'computer'       

     self levenshteinDistanceFrom:'comptuer' to:'computer' 
            s:4 k:2 c:1 e:nil i:2 d:6    
    "

    "Modified (comment): / 09-08-2012 / 05:40:08 / cg"
! !

!StringUtilities class methodsFor:'matching'!

stringMatchFunctionFor:aMultiPattern glob:searchForGlobPattern regex:searchForRegexPattern caseSensitive:searchIsCaseSensitive
    "generates a check function which - given a string - checks for a match.
     The match-pattern argument aMultiPattern
     may contain multiple patterns, 
     separated by ';' (for and-search) or '|' (for or-search).
     If the pattern is invalid, nil is returned and an information-notification
     is signalled"

    |combinator stringToSearchFor stringsToSearchFor checkFunction|

    stringToSearchFor := aMultiPattern.
    combinator := #conform:.
    (stringToSearchFor includes:$|) ifTrue:[
        stringsToSearchFor := stringToSearchFor asCollectionOfSubstringsSeparatedBy:$|.
        combinator := #contains:.
    ] ifFalse:[
        (stringToSearchFor includes:$;) ifTrue:[
            stringsToSearchFor := stringToSearchFor asCollectionOfSubstringsSeparatedBy:$;.
        ] ifFalse:[
            stringsToSearchFor := nil.
"/            lastStringToSearchFor := self stringOfCurrentSearchList.
"/            (lastStringToSearchFor notEmptyOrNil and:[stringToSearchFor startsWith:lastStringToSearchFor]) ifTrue:[
"/                searchRoot := AbstractBrowserItemPO new.
"/                self searchResultList do:[:each | searchRoot add:each treePO].
"/                self stringOfCurrentSearchList:stringToSearchFor.
"/                Transcript showCR:'fast'.
"/            ].
"/            self stringOfCurrentSearchList:stringToSearchFor.
        ]
    ].

    Error handle:[:ex |
        self notify:('Invalid pattern: ',ex description).
        ^ nil
    ] do:[
        |lcString shortString regexPattern regexPatterns|

        stringsToSearchFor isNil ifTrue:[
            "/ single string search

            (searchForGlobPattern and:[stringToSearchFor includesMatchCharacters]) ifTrue:[
                |matchScanArray|

                "/ verify if this is a valid pattern; if not, do not search
                matchScanArray := String matchScanArrayFrom:stringToSearchFor escapeCharacter:String matchEscapeCharacter.
                matchScanArray isNil ifTrue:[
                    self information:'Invalid search pattern'.
                    ^ nil.
                ].

                ((stringToSearchFor startsWith:$*) and:[(stringToSearchFor copyFrom:2) includesMatchCharacters not]) ifTrue:[
                    shortString := (stringToSearchFor copyFrom:2).
                    searchIsCaseSensitive ifTrue:[
                        checkFunction := [:s | s endsWith:shortString].
                    ] ifFalse:[
                        lcString := shortString asLowercase.
                        checkFunction := [:s | s asLowercase endsWith:lcString].
                    ]
                ] ifFalse:[
                    ((stringToSearchFor endsWith:$*) and:[(stringToSearchFor copyButLast:1) includesMatchCharacters not]) ifTrue:[
                        shortString := (stringToSearchFor copyButLast:1).
                        searchIsCaseSensitive ifTrue:[
                            checkFunction := [:s | s startsWith:shortString].
                        ] ifFalse:[
                            lcString := shortString asLowercase.
                            checkFunction := [:s | s asLowercase startsWith:lcString].
                        ]
                    ] ifFalse:[
                        regexPattern := searchIsCaseSensitive 
                                                ifTrue:[stringToSearchFor asRegex] 
                                                ifFalse:[stringToSearchFor asRegexIgnoringCase].
                        checkFunction := [:s | regexPattern hasMatchesIn:s].
                    ].
                ].
            ] ifFalse:[
                searchForRegexPattern ifTrue:[
                    regexPattern := searchIsCaseSensitive 
                                        ifTrue:[stringToSearchFor asRegex] 
                                        ifFalse:[stringToSearchFor asRegexIgnoringCase].
                    checkFunction := [:s | regexPattern hasMatchesIn:s].
                ] ifFalse:[
                    checkFunction :=
                        [:s | s includesString:stringToSearchFor caseSensitive:searchIsCaseSensitive]
                ]
            ].
        ] ifFalse:[
            "/ multi string search

            searchForGlobPattern ifTrue:[
                stringsToSearchFor do:[:eachStringToSearchFor |
                    |matchScanArray|

                    "/ verify if this is a valid pattern; if not, do not search
                    matchScanArray := String matchScanArrayFrom:eachStringToSearchFor escapeCharacter:String matchEscapeCharacter.
                    matchScanArray isNil ifTrue:[
                        self information:'Invalid search pattern'.
                        ^ nil.
                    ].
                ].

                checkFunction :=
                    [:s |
                        stringsToSearchFor perform:combinator with:[:eachStringToSearchFor |
                            s includesMatchString:eachStringToSearchFor caseSensitive:searchIsCaseSensitive
                        ]
                    ].
            ] ifFalse:[
                searchForRegexPattern ifTrue:[
                    regexPatterns := stringsToSearchFor collect:[:eachPatternString|
                                            searchIsCaseSensitive 
                                                ifTrue:[eachPatternString asRegex] 
                                                ifFalse:[eachPatternString asRegexIgnoringCase].
                                        ].
                    checkFunction := [:s | regexPattern hasMatchesIn:s].
                    checkFunction :=
                        [:s |
                            regexPatterns perform:combinator with:[:eachRegexPattern |
                                eachRegexPattern hasMatchesIn:s
                            ]
                        ].
                ] ifFalse:[
                    checkFunction :=
                        [:s |
                            stringsToSearchFor perform:combinator with:[:eachStringToSearchFor |
                                s includesString:eachStringToSearchFor caseSensitive:searchIsCaseSensitive
                            ]
                        ]
                ]
            ].
        ].
    ].
    ^ checkFunction

    "
     |fn|
     fn := self stringMatchFunctionFor:'aaa|bbb' glob:false regex:false caseSensitive:false.
     fn value:'   aaa   '.
     fn value:'   aa   '.
     fn value:'   bbb   '.
     fn value:'  aa bb   '.
    "
    
    "
     |fn|
     fn := self stringMatchFunctionFor:'aaa;bbb' glob:false regex:false caseSensitive:false.
     fn value:'   aaa   '.
     fn value:'   aa   '.
     fn value:'   bbb   '.
     fn value:'  aa bb   '.
     fn value:'  aaa bb   '.
     fn value:'  aaa bbb   '.
    "

    "
     |fn|
     fn := self stringMatchFunctionFor:'aa*;bb*' glob:true regex:false caseSensitive:false.
     fn value:'   aaa   '.
     fn value:'   aa   '.
     fn value:'   bbb   '.
     fn value:'  aa bb   '.
     fn value:'  aaa bb   '.
     fn value:'  aaa bbb   '.
    "

    "Created: / 02-05-2019 / 17:32:12 / Claus Gittinger"
    "Modified: / 30-05-2019 / 11:26:26 / Claus Gittinger"
! !

!StringUtilities class methodsFor:'queries'!

isUtilityClass
    ^ self == StringUtilities
! !

!StringUtilities class methodsFor:'documentation'!

version
    ^ '$Header$'
!

version_CVS
    ^ '$Header$'
! !