initial checkin
authorClaus Gittinger <cg@exept.de>
Sun, 16 Aug 2009 23:07:08 +0200
changeset 2214 9523a5ed7d8a
parent 2213 d465fa29df0e
child 2215 c1f072a78366
initial checkin
StringUtilities.st
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/StringUtilities.st	Sun Aug 16 23:07:08 2009 +0200
@@ -0,0 +1,308 @@
+"{ Encoding: utf8 }"
+
+"
+ COPYRIGHT (c) 1994 by Claus Gittinger
+ COPYRIGHT (c) 2009 by eXept Software AG
+              All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice.   This software may not
+ be provided or otherwise made available to, or used by, any
+ other person.  No title to or ownership of the software is
+ hereby transferred.
+"
+"{ Package: 'stx:libbasic2' }"
+
+Object subclass:#StringUtilities
+	instanceVariableNames:''
+	classVariableNames:''
+	poolDictionaries:''
+	category:'Collections-Text-Support'
+!
+
+!StringUtilities class methodsFor:'documentation'!
+
+copyright
+"
+ COPYRIGHT (c) 1994 by Claus Gittinger
+ COPYRIGHT (c) 2009 by eXept Software AG
+              All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice.   This software may not
+ be provided or otherwise made available to, or used by, any
+ other person.  No title to or ownership of the software is
+ hereby transferred.
+"
+!
+
+documentation
+"
+    some less often used algorithms have been moved to here to
+    make libbasic more compact.
+"
+! !
+
+!StringUtilities class methodsFor:'edit distance'!
+
+editDistanceFrom:s1 to:s2 s:substWeight k:kbdTypoWeight c:caseWeight e:exchangeWeight i:insrtWeight
+    |editedS2 min d|
+
+    s2 size > s1 size ifTrue:[
+        ^ self editDistanceFrom:s2 to:s1 s:substWeight k:kbdTypoWeight c:caseWeight e:exchangeWeight i:insrtWeight
+    ].
+    s1 size > s2 size ifTrue:[
+        0 to:s2 size do:[:pos |
+            editedS2 := (s2 copyTo:pos),$#,(s2 copyFrom:pos+1).
+            
+            d := self editDistanceFrom:s1 to:editedS2 s:substWeight k:kbdTypoWeight c:caseWeight e:exchangeWeight i:insrtWeight.
+            min := (min ? d) min:d.
+        ].
+        ^ min + insrtWeight
+    ].
+
+    ^ (1 to:s1 size) sum:[:i | ((s2 at:i) == $# or:[ (s1 at:i)=(s2 at:i)]) ifTrue:[0] ifFalse:[substWeight]] 
+
+    "
+     'comptuer' levenshteinTo:'computer'      
+     self editDistanceFrom:'comptuer' to:'computer' s:4 k:2 c:1 e:nil i:2        
+
+     'computr' levenshteinTo:'computer'                                    
+     self editDistanceFrom:'computr' to:'computer' s:4 k:2 c:1 e:nil i:2        
+    "
+!
+
+isKey:k1 nextTo:k2
+    "return true, if k1 and k2 are adjacent keys on the keyboard.
+     This is used to specially priorize plausible typing errors of adjacent keys."
+
+    ^ self isKey:k1 nextTo:k2 onKeyboard:(self keyboardLayout)
+
+    "
+     self isKey:$a nextTo:$a   
+     self isKey:$a nextTo:$s   
+     self isKey:$a nextTo:$q   
+     self isKey:$a nextTo:$w    
+     self isKey:$a nextTo:$y    
+     self isKey:$a nextTo:$z    
+     self isKey:$a nextTo:$x    
+    "
+!
+
+isKey:k1 nextTo:k2 onKeyboard:keys
+    "return true, if k1 and k2 are adjacent keys on the keyboard defined by keys"
+
+    |row1 row2 col1 col2|
+
+    row1 := keys findFirst:[:eachRow | col1 := eachRow indexOf:k1. col1 ~~ 0].
+    row1 == 0 ifTrue:[^ false].
+    row2 := keys findFirst:[:eachRow | col2 := eachRow indexOf:k2. col2 ~~ 0].
+    row2 == 0 ifTrue:[^ false].
+
+    ^ (row1-row2) abs <= 1 and:[(col1-col2) abs <= 1]
+
+    "
+     self isKey:$a nextTo:$q onKeyboard:(StringUtilities keyboardLayoutForLanguage:#de)
+     self isKey:$a nextTo:$x onKeyboard:(StringUtilities keyboardLayoutForLanguage:#de)
+    "
+!
+
+keyboardLayout
+    "the keyboard layout (used with algorithms to find possible typing errors,
+     for example: edit distance in levenshtein)"
+
+    ^ self keyboardLayoutForLanguage:(UserPreferences current language)
+
+    "
+     self keyboardLayout
+    "
+!
+
+keyboardLayoutForLanguage:lang
+    "the keyboard layout (used with algorithms to find possible typing errors,
+     for example: edit distance in levenshtein).
+     CAVEAT: hard coded us- and german keyboards here."
+
+    "/ danish
+    lang == #da ifTrue:[
+        ^ #( 
+               '1234567890-'
+               '*qwertyuiopå'
+               '**asdfghjklæø'
+               '***zxcvbnm' ).
+    ].
+    lang == #no ifTrue:[
+        ^ #( 
+               '1234567890-'
+               '*qwertyuiopå'
+               '**asdfghjkløæ'
+               '***zxcvbnm' ).
+    ].
+
+    (lang == #de or:[lang == #pl or:[lang == #cz]]) ifTrue:[
+        ^ #( 
+               '1234567890-'
+               '*qwertzuiop'
+               '**asdfghjkl:'
+               '***yxcvbnm' ).
+    ].
+    lang == #hu ifTrue:[
+        ^ #( 
+               '1234567890-'                         
+               '*qwertyuiopõú'
+               '**asdfghjkléáũ'
+               '**ízxcvbnm' ).
+    ].
+
+    lang == #pt ifTrue:[
+        ^ #( 
+               '1234567890-'
+               '*qwertyuiop'
+               '**asdfghjklç'
+               '***zxcvbnm' ).
+    ].
+    lang == #es ifTrue:[
+        ^ #( 
+               '1234567890-'
+               '*qwertyuiop'
+               '**asdfghjklñ'
+               '***zxcvbnm' ).
+    ].
+    (lang == #sv or:[lang == #fi]) ifTrue:[
+        ^ #( 
+               '1234567890-'
+               '*qwertyuiopå'
+               '**asdfghjklöä'
+               '***zxcvbnm' ).
+    ].
+    lang == #fr ifTrue:[
+        ^ #( 
+               '1234567890'
+               '*azertyuiop'
+               '**qsdfghjklm'
+               '***wxcvbn,' ).
+    ].
+
+    "/ fallback: english, dutch, italian
+    ^ #( 
+           '1234567890-'
+           '*qwertyuiop'
+           '**asdfghjkl:'
+           '***zxcvbnm' ).
+
+    "
+     self keyboardLayoutForLanguage:#de 
+    "
+!
+
+levenshteinDistanceFrom:string1 to:string2 s:substWeight k:kbdTypoWeight c:caseWeight e:exchangeWeight i:insrtWeight d:deleteWeight
+    "parametrized levenshtein.
+     return the levenshtein distance of two strings;
+     this value corrensponds to the number of replacements that have to be
+     made to get string2 from string1.
+
+     The arguments are the costs for
+        s:substitution,
+        k:keyboard type (substitution),   if nil, s is used
+        c:case-change,                    if nil, s is used
+        i:insertion
+        d:deletion
+        e:exchange                        if nil, s*2 is used
+     of a character.
+     See IEEE transactions on Computers 1976 Pg 172 ff."
+
+    |d  "delta matrix"
+     len1 "{ Class: SmallInteger }"
+     len2 "{ Class: SmallInteger }"
+     dim  "{ Class: SmallInteger }"
+     prevRow row col
+     dimPlus1 "{ Class: SmallInteger }"
+     min pp c1 c2|
+
+    len1 := string1 size.
+    len2 := string2 size.
+
+    "create the help-matrix"
+
+    dim := len1 max:len2.
+    dimPlus1 := dim + 1.
+
+    d := Array new:dimPlus1.
+    1 to:dimPlus1 do:[:i |
+        d at:i put:(Array new:dimPlus1)
+    ].
+
+    "init help-matrix"
+
+    (d at:1) at:1 put:0.
+    row := d at:1.
+    1 to:dim do:[:j |
+        row at:(j + 1) put:( (row at:j) + insrtWeight )
+    ].
+
+    1 to:dim do:[:i |
+        (d at:(i + 1)) at:1 put:(  ((d at:i) at:1) + deleteWeight )
+    ].
+
+    1 to:len1 do:[:i |
+        c1 := string1 at:i.
+        1 to:len2 do:[:j |
+            c2 := string2 at:j.
+            (c1 == c2) ifTrue:[
+                pp := 0
+            ] ifFalse:[
+                (c1 asLowercase == c2 asLowercase) ifTrue:[
+                    pp := caseWeight
+                ] ifFalse:[
+                    pp := substWeight.
+                    substWeight ~~ kbdTypoWeight ifTrue:[
+                        (self isKey:c1 asLowercase nextTo:c2 asLowercase) ifTrue:[
+                            pp := kbdTypoWeight.
+                        ].
+                    ].
+                    "/ assmuing (;-) that an exchange is very common when typing...
+                    exchangeWeight notNil ifTrue:[
+                        (i < len1 and:[j < len2]) ifTrue:[
+                            ((string1 at:i) = (string2 at:(j+1))
+                            and:[ (string1 at:i+1) = (string2 at:j) ]) ifTrue:[
+                                pp := exchangeWeight.
+                            ].
+                        ].
+                        (i > 1 and:[j > 1]) ifTrue:[
+                            ((string1 at:i) = (string2 at:(j-1))
+                            and:[ (string1 at:i-1) = (string2 at:j) ]) ifTrue:[
+                                pp := exchangeWeight.
+                            ].
+                        ].
+                    ].
+                ]
+            ].
+
+            prevRow := d at:i.
+            row := d at:(i + 1).
+            col := j + 1.
+            min := (prevRow at:j) + pp.
+            min := min min:( (row at:j) + insrtWeight).
+            min := min min:( (prevRow at:col) + deleteWeight).
+
+            row at:col put: min
+        ]
+    ].
+
+    ^ (d at:(len1 + 1)) at:(len2 + 1)
+
+    "
+     'comptuer' levenshteinTo:'computer'       
+
+     self levenshteinDistanceFrom:'comptuer' to:'computer' 
+            s:4 k:2 c:1 e:nil2 i:2 d:6    
+    "
+! !
+
+!StringUtilities class methodsFor:'documentation'!
+
+version
+    ^ '$Header: /cvs/stx/stx/libbasic2/StringUtilities.st,v 1.1 2009-08-16 21:07:08 cg Exp $'
+! !