better levenshtein
authorClaus Gittinger <cg@exept.de>
Fri, 23 May 2003 12:12:28 +0200
changeset 7318 cc7a8f38f696
parent 7317 461d6124fd63
child 7319 d2f3366cae79
better levenshtein
CharacterArray.st
--- a/CharacterArray.st	Thu May 22 21:41:38 2003 +0200
+++ b/CharacterArray.st	Fri May 23 12:12:28 2003 +0200
@@ -6163,21 +6163,33 @@
 
 levenshteinTo:aString
     "return the levenshtein distance to the argument, aString;
-     this value corrensponds to the number of replacements that have to be
+     this value corresponds to the number of replacements that have to be
      made to get aString from the receiver.
      See IEEE transactions on Computers 1976 Pg 172 ff."
 
     "
-     in the following, we assum that ommiting a character
+     in the following, we assume that ommiting a character
      is less of an error than inserting an extra character.
-     Therefore the different insertion (i) and deletion (d)
-     values.
-    "
-
-    ^ self levenshteinTo:aString s:4 c:1 i:2 d:6
-
-    "
-     'ocmprt' levenshteinTo:'computer'
+     Therefore the different insertion (i) and deletion (d) values.
+     s: substitution weight
+     c: case weight
+     i: insertion of extra character weight
+     d: delete of a character weight
+    "
+
+    ^ self levenshteinTo:aString s:4 k:2 c:1 i:2 d:6
+
+    "
+     'computer' levenshteinTo:'computer' 
+     'cOmputer' levenshteinTo:'computer'  
+     'cOmpuTer' levenshteinTo:'computer'  
+     'cimputer' levenshteinTo:'computer'   
+     'cumputer' levenshteinTo:'computer'   
+
+     'cmputer' levenshteinTo:'computer'   
+     'coomputer' levenshteinTo:'computer'  
+
+     'ocmprt' levenshteinTo:'computer'   
      'computer' levenshteinTo:'computer'
      'ocmputer' levenshteinTo:'computer'
      'cmputer' levenshteinTo:'computer'
@@ -6191,8 +6203,31 @@
      return the levenshtein distance to the argument, aString;
      this value corrensponds to the number of replacements that have to be
      made to get aString from the receiver.
-     The arguments are the costs for substitution, case-change, insertion and 
-     deletion of a character.
+     The arguments are the costs for 
+        substitution, case-change, insertion and deletion of a character.
+     See IEEE transactions on Computers 1976 Pg 172 ff"
+
+    ^ self
+        levenshteinTo:aString 
+        s:substWeight 
+        k:substWeight 
+        c:caseWeight 
+        i:insrtWeight 
+        d:deleteWeight
+!
+
+levenshteinTo:aString s:substWeight k:kbdTypoWeight c:caseWeight i:insrtWeight d:deleteWeight
+    "parametrized levenshtein. 
+     return the levenshtein distance to the argument, aString;
+     this value corrensponds to the number of replacements that have to be
+     made to get aString from the receiver.
+     The arguments are the costs for 
+        s:substitution, 
+        k:keyboard type (substitution), 
+        c:case-change, 
+        i:insertion 
+        d:deletion 
+     of a character.
      See IEEE transactions on Computers 1976 Pg 172 ff"
 
     |d  "delta matrix"
@@ -6213,7 +6248,7 @@
 
     d := Array new:dimPlus1.
     1 to:dimPlus1 do:[:i |
-	d at:i put:(Array new:dimPlus1)
+        d at:i put:(Array new:dimPlus1)
     ].
 
     "init help-matrix"
@@ -6221,34 +6256,39 @@
     (d at:1) at:1 put:0.
     row := d at:1.
     1 to:dim do:[:j |
-	row at:(j + 1) put:( (row at:j) + insrtWeight )
+        row at:(j + 1) put:( (row at:j) + insrtWeight )
     ].
 
     1 to:dim do:[:i |
-	 (d at:(i + 1)) at:1 put:(  ((d at:i) at:1) + deleteWeight )
+         (d at:(i + 1)) at:1 put:(  ((d at:i) at:1) + deleteWeight )
     ].
 
     1 to:len1 do:[:i |
-	c1 := self at:i.
-	1 to:len2 do:[:j |
-	    c2 := aString at:j.
-	    (c1 == c2) ifTrue:[
-		pp := 0
-	    ] ifFalse:[
-		(c1 asLowercase == c2 asLowercase) ifTrue:[
-		    pp := caseWeight
-		] ifFalse:[
-		    pp := substWeight
-		]
-	    ].
-	    prevRow := d at:i.
-	    row := d at:(i + 1).
-	    col := j + 1.
-	    min := (prevRow at:j) + pp.
-	    min := min min:( (row at:j) + insrtWeight).
-	    min := min min:( (prevRow at:col) + deleteWeight).
-	    row at:col put: min
-	]
+        c1 := self at:i.
+        1 to:len2 do:[:j |
+            c2 := aString at:j.
+            (c1 == c2) ifTrue:[
+                pp := 0
+            ] ifFalse:[
+                (c1 asLowercase == c2 asLowercase) ifTrue:[
+                    pp := caseWeight
+                ] ifFalse:[
+                    pp := substWeight.
+                    substWeight ~~ kbdTypoWeight ifTrue:[
+                        (DoWhatIMeanSupport isKey:c1 asLowercase nextTo:c2 asLowercase) ifTrue:[
+                            pp := kbdTypoWeight.
+                        ].
+                    ].
+                ]
+            ].
+            prevRow := d at:i.
+            row := d at:(i + 1).
+            col := j + 1.
+            min := (prevRow at:j) + pp.
+            min := min min:( (row at:j) + insrtWeight).
+            min := min min:( (prevRow at:col) + deleteWeight).
+            row at:col put: min
+        ]
     ].
 
     ^ (d at:(len1 + 1)) at:(len2 + 1)
@@ -6400,7 +6440,7 @@
 !CharacterArray class methodsFor:'documentation'!
 
 version
-    ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.248 2003-05-20 15:35:43 cg Exp $'
+    ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.249 2003-05-23 10:12:28 cg Exp $'
 ! !
 
 CharacterArray initialize!