Fix #levenshtein:... C-code.
authorStefan Vogel <sv@exept.de>
Thu, 18 Nov 2004 15:01:24 +0100
changeset 8647 2f59b57bcc4d
parent 8646 5b40b09db73b
child 8648 032f1b25d3a2
Fix #levenshtein:... C-code. Define the additional arg so that it is used.
String.st
--- a/String.st	Thu Nov 18 12:58:15 2004 +0100
+++ b/String.st	Thu Nov 18 15:01:24 2004 +0100
@@ -10,6 +10,8 @@
  hereby transferred.
 "
 
+'From Smalltalk/X, Version:5.2.5 on 18-11-2004 at 14:54:53'                     !
+
 "{ Package: 'stx:libbasic' }"
 
 CharacterArray variableByteSubclass:#String
@@ -45,6 +47,48 @@
 %}
 ! !
 
+!String primitiveFunctions!
+%{
+
+static int 
+nextOnKeyboard(char1, char2) {
+    /* compare two characters if they are next to each other on a (US-) keyboard */
+    static char *keys[] = { "1234567890-",
+                            "*qwertyuiop",
+                            "**asdfghjkl:",
+                            "***zxcvbnm",
+                            0 };
+    char **line1, **line2;
+    char *col1, *col2;
+    int diff;
+
+    for (line1 = keys; *line1 != 0; line1++) {
+        for (col1 = *line1; *col1 != 0 && *col1 != char1; col1++)
+            continue;
+    }
+    if (*col1 == 0)
+        return(0);
+
+    for (line2 = keys; *line2 != 0; line2++) {
+        for (col2 = *line2; *col2 != 0 && *col2 != char2; col2++)
+            continue;
+    }
+    if (*col2 == 0)
+        return(0);
+
+    diff = col1 - col2;
+    if (diff > 1 || diff < -1)
+        return(0);
+
+    diff = line1 - line2;
+    if (diff > 1 || diff < -1)
+        return(0);
+    return(1);
+}
+
+%}
+! !
+
 !String class methodsFor:'documentation'!
 
 copyright
@@ -3135,7 +3179,7 @@
     ^ self size == 0
 !
 
-levenshteinTo:aString s:substWeight c:caseWeight i:insrtWeight d:deleteWeight
+levenshteinTo:aString s:substWeight k:kbdTypoWeight c:caseWeight i:insrtWeight d:deleteWeight
     "parametrized levenshtein. arguments are the costs for
      substitution, case-change, insertion and deletion of a character."
 
@@ -3145,97 +3189,103 @@
      * this is very heavy used when correcting errors 
      * (all symbols are searched for best match) - therefore it must be fast
      */
-{
+
     unsigned short *data;
     int l1, l2;
     REGISTER int sz;
     unsigned char *s1, *s2;
     int v1, v2, v3, m;
     REGISTER unsigned short *dp;
-    REGISTER int delta;
+    REGISTER int rowDelta;
     REGISTER int j;
     int i;
-    int iW, cW, sW, dW;
+    int iW, cW, sW, kW, dW;
 #   define FASTSIZE 30  /* increase STACK if you increase this ... */
     unsigned short fastData[(FASTSIZE + 1) * (FASTSIZE + 1)];
 
-    if ((__isString(self) || __isSymbol(self))
-     && (__isString(aString) || __isSymbol(aString))
-     && __bothSmallInteger(insrtWeight, caseWeight)
-     && __bothSmallInteger(substWeight, deleteWeight)) {
-	iW = __intVal(insrtWeight);
-	cW = __intVal(caseWeight);
-	sW = __intVal(substWeight);
-	dW = __intVal(deleteWeight);
-	s1 = __stringVal(self);
-	s2 = __stringVal(aString);
-	l1 = strlen(s1);
-	l2 = strlen(s2);
-
-	sz = (l1 < l2) ? l2 : l1;
-	delta = sz + 1;
-	if (sz <= FASTSIZE) {
-	    data = fastData;
-	} else {
-	    /* add ifdef ALLOCA here ... */
-	    data = (unsigned short *)malloc(delta * delta * sizeof(short));
-	    if (! data) goto mallocFailed;
-	}
-
-	data[0] = 0;
-	dp = data+1;
-	for (j=1, dp=data+1; j<=sz; j++, dp++)
-	    *dp = *(dp-1) + iW;
-
-	for (i=1, dp=data+delta; i<=sz; i++, dp+=delta)
-	    *dp = *(dp-delta) + dW;
-
-	for (i=1; i<=l1; i++) {
-	    for (j=1; j<=l2; j++) {
-		dp = data + (i*delta) + j;
-		if (s1[i] != s2[j]) {
-		    if (tolower(s1[i]) == tolower(s2[j])) {
-			m = cW;
-		    } else {
-			m = sW;
-		    }
-		} else
-		    m = 0;
-
-		v2 = *(dp - 1) + iW;
-		v3 = *(dp - delta) + dW;
-		v1 = *(dp - delta - 1) + m;
-		if (v1 < v2)
-		    if (v1 < v3)
-			m = v1;
-		    else
-			m = v3;
-		else
-		    if (v2 < v3)
-			m = v2;
-		    else
-			m = v3;
-		*dp = m;
-	    }
-	}
-	m = data[l1 * delta + l2];
-	if (sz > FASTSIZE) 
-	    free(data);
-	RETURN ( __MKSMALLINT(m) );
+    if (__isStringLike(self) && __isStringLike(aString)
+        && __bothSmallInteger(insrtWeight, caseWeight)
+        && __bothSmallInteger(substWeight, deleteWeight)
+        && __isSmallInteger(kbdTypoWeight)
+    ) {
+        iW = __intVal(insrtWeight);
+        cW = __intVal(caseWeight);
+        sW = __intVal(substWeight);
+        kW = __intVal(kbdTypoWeight);
+        dW = __intVal(deleteWeight);
+        s1 = __stringVal(self);
+        s2 = __stringVal(aString);
+        l1 = strlen(s1);
+        l2 = strlen(s2);
+
+        sz = (l1 < l2) ? l2 : l1;
+        rowDelta = sz + 1;
+        if (sz <= FASTSIZE) {
+            data = fastData;
+        } else {
+            /* add ifdef ALLOCA here ... */
+            data = (unsigned short *)malloc(rowDelta * rowDelta * sizeof(short));
+            if (! data) goto mallocFailed;
+        }
+
+        data[0] = 0;
+        for (j=1, dp=data+1; j<=sz; j++, dp++)
+            *dp = dp[-1] + iW;
+
+        for (i=1, dp=data+rowDelta; i<=sz; i++, dp+=rowDelta)
+            *dp = dp[-rowDelta] + dW;
+
+        for (i=0; i<l1; i++) {
+            for (j=0; j<l2; j++) {
+                if (s1[i] == s2[j]) 
+                    m = 0;
+                else if (tolower(s1[i]) == tolower(s2[j])) 
+                    m = cW;
+                else if (sW != kW && nextOnKeyboard(tolower(s1[i]), tolower(s2[j])))
+                    m = kW;
+                else
+                    m = sW;
+
+                dp = data + ((i+1)*rowDelta) + j;
+                v2 = dp[0] + iW;
+                v1 = dp[-rowDelta] + m;
+                v3 = dp[-rowDelta+1] + dW;
+                if (v1 < v2) {
+                    if (v1 < v3)
+                        m = v1;
+                    else
+                        m = v3;
+                } else {
+                    if (v2 < v3)
+                        m = v2;
+                    else
+                        m = v3;
+                }
+                dp[1] = m;
+            }
+        }
+        m = data[l1*rowDelta + l2];
+        if (sz > FASTSIZE) 
+            free(data);
+        RETURN ( __mkSmallInteger(m) );
     }
 mallocFailed: ;
-}
 %}.
 
     ^ super levenshteinTo:aString 
-			s:substWeight c:caseWeight 
-			i:insrtWeight d:deleteWeight
-
-    "'ocmprt' levenshteinTo:'computer'
+                        s:substWeight k:kbdTypoWeight c:caseWeight 
+                        i:insrtWeight d:deleteWeight
+
+    "
+     'ocmprt' levenshteinTo:'computer'
      'computer' levenshteinTo:'computer'
      'ocmputer' levenshteinTo:'computer'
      'cmputer' levenshteinTo:'computer'
-     'Computer' levenshteinTo:'computer'"
+     'computer' levenshteinTo:'cmputer'
+     'computer' levenshteinTo:'vomputer'
+     'computer' levenshteinTo:'bomputer'
+     'Computer' levenshteinTo:'computer'
+    "
 !
 
 notEmpty
@@ -3372,5 +3422,5 @@
 !String class methodsFor:'documentation'!
 
 version
-    ^ '$Header: /cvs/stx/stx/libbasic/String.st,v 1.210 2004-08-03 19:29:04 penk Exp $'
+    ^ '$Header: /cvs/stx/stx/libbasic/String.st,v 1.211 2004-11-18 14:01:24 stefan Exp $'
 ! !