--- a/String.st Thu Nov 18 12:58:15 2004 +0100
+++ b/String.st Thu Nov 18 15:01:24 2004 +0100
@@ -10,6 +10,8 @@
hereby transferred.
"
+'From Smalltalk/X, Version:5.2.5 on 18-11-2004 at 14:54:53' !
+
"{ Package: 'stx:libbasic' }"
CharacterArray variableByteSubclass:#String
@@ -45,6 +47,48 @@
%}
! !
+!String primitiveFunctions!
+%{
+
+static int
+nextOnKeyboard(char1, char2) {
+ /* compare two characters if they are next to each other on a (US-) keyboard */
+ static char *keys[] = { "1234567890-",
+ "*qwertyuiop",
+ "**asdfghjkl:",
+ "***zxcvbnm",
+ 0 };
+ char **line1, **line2;
+ char *col1, *col2;
+ int diff;
+
+ for (line1 = keys; *line1 != 0; line1++) {
+ for (col1 = *line1; *col1 != 0 && *col1 != char1; col1++)
+ continue;
+ }
+ if (*col1 == 0)
+ return(0);
+
+ for (line2 = keys; *line2 != 0; line2++) {
+ for (col2 = *line2; *col2 != 0 && *col2 != char2; col2++)
+ continue;
+ }
+ if (*col2 == 0)
+ return(0);
+
+ diff = col1 - col2;
+ if (diff > 1 || diff < -1)
+ return(0);
+
+ diff = line1 - line2;
+ if (diff > 1 || diff < -1)
+ return(0);
+ return(1);
+}
+
+%}
+! !
+
!String class methodsFor:'documentation'!
copyright
@@ -3135,7 +3179,7 @@
^ self size == 0
!
-levenshteinTo:aString s:substWeight c:caseWeight i:insrtWeight d:deleteWeight
+levenshteinTo:aString s:substWeight k:kbdTypoWeight c:caseWeight i:insrtWeight d:deleteWeight
"parametrized levenshtein. arguments are the costs for
substitution, case-change, insertion and deletion of a character."
@@ -3145,97 +3189,103 @@
* this is very heavy used when correcting errors
* (all symbols are searched for best match) - therefore it must be fast
*/
-{
+
unsigned short *data;
int l1, l2;
REGISTER int sz;
unsigned char *s1, *s2;
int v1, v2, v3, m;
REGISTER unsigned short *dp;
- REGISTER int delta;
+ REGISTER int rowDelta;
REGISTER int j;
int i;
- int iW, cW, sW, dW;
+ int iW, cW, sW, kW, dW;
# define FASTSIZE 30 /* increase STACK if you increase this ... */
unsigned short fastData[(FASTSIZE + 1) * (FASTSIZE + 1)];
- if ((__isString(self) || __isSymbol(self))
- && (__isString(aString) || __isSymbol(aString))
- && __bothSmallInteger(insrtWeight, caseWeight)
- && __bothSmallInteger(substWeight, deleteWeight)) {
- iW = __intVal(insrtWeight);
- cW = __intVal(caseWeight);
- sW = __intVal(substWeight);
- dW = __intVal(deleteWeight);
- s1 = __stringVal(self);
- s2 = __stringVal(aString);
- l1 = strlen(s1);
- l2 = strlen(s2);
-
- sz = (l1 < l2) ? l2 : l1;
- delta = sz + 1;
- if (sz <= FASTSIZE) {
- data = fastData;
- } else {
- /* add ifdef ALLOCA here ... */
- data = (unsigned short *)malloc(delta * delta * sizeof(short));
- if (! data) goto mallocFailed;
- }
-
- data[0] = 0;
- dp = data+1;
- for (j=1, dp=data+1; j<=sz; j++, dp++)
- *dp = *(dp-1) + iW;
-
- for (i=1, dp=data+delta; i<=sz; i++, dp+=delta)
- *dp = *(dp-delta) + dW;
-
- for (i=1; i<=l1; i++) {
- for (j=1; j<=l2; j++) {
- dp = data + (i*delta) + j;
- if (s1[i] != s2[j]) {
- if (tolower(s1[i]) == tolower(s2[j])) {
- m = cW;
- } else {
- m = sW;
- }
- } else
- m = 0;
-
- v2 = *(dp - 1) + iW;
- v3 = *(dp - delta) + dW;
- v1 = *(dp - delta - 1) + m;
- if (v1 < v2)
- if (v1 < v3)
- m = v1;
- else
- m = v3;
- else
- if (v2 < v3)
- m = v2;
- else
- m = v3;
- *dp = m;
- }
- }
- m = data[l1 * delta + l2];
- if (sz > FASTSIZE)
- free(data);
- RETURN ( __MKSMALLINT(m) );
+ if (__isStringLike(self) && __isStringLike(aString)
+ && __bothSmallInteger(insrtWeight, caseWeight)
+ && __bothSmallInteger(substWeight, deleteWeight)
+ && __isSmallInteger(kbdTypoWeight)
+ ) {
+ iW = __intVal(insrtWeight);
+ cW = __intVal(caseWeight);
+ sW = __intVal(substWeight);
+ kW = __intVal(kbdTypoWeight);
+ dW = __intVal(deleteWeight);
+ s1 = __stringVal(self);
+ s2 = __stringVal(aString);
+ l1 = strlen(s1);
+ l2 = strlen(s2);
+
+ sz = (l1 < l2) ? l2 : l1;
+ rowDelta = sz + 1;
+ if (sz <= FASTSIZE) {
+ data = fastData;
+ } else {
+ /* add ifdef ALLOCA here ... */
+ data = (unsigned short *)malloc(rowDelta * rowDelta * sizeof(short));
+ if (! data) goto mallocFailed;
+ }
+
+ data[0] = 0;
+ for (j=1, dp=data+1; j<=sz; j++, dp++)
+ *dp = dp[-1] + iW;
+
+ for (i=1, dp=data+rowDelta; i<=sz; i++, dp+=rowDelta)
+ *dp = dp[-rowDelta] + dW;
+
+ for (i=0; i<l1; i++) {
+ for (j=0; j<l2; j++) {
+ if (s1[i] == s2[j])
+ m = 0;
+ else if (tolower(s1[i]) == tolower(s2[j]))
+ m = cW;
+ else if (sW != kW && nextOnKeyboard(tolower(s1[i]), tolower(s2[j])))
+ m = kW;
+ else
+ m = sW;
+
+ dp = data + ((i+1)*rowDelta) + j;
+ v2 = dp[0] + iW;
+ v1 = dp[-rowDelta] + m;
+ v3 = dp[-rowDelta+1] + dW;
+ if (v1 < v2) {
+ if (v1 < v3)
+ m = v1;
+ else
+ m = v3;
+ } else {
+ if (v2 < v3)
+ m = v2;
+ else
+ m = v3;
+ }
+ dp[1] = m;
+ }
+ }
+ m = data[l1*rowDelta + l2];
+ if (sz > FASTSIZE)
+ free(data);
+ RETURN ( __mkSmallInteger(m) );
}
mallocFailed: ;
-}
%}.
^ super levenshteinTo:aString
- s:substWeight c:caseWeight
- i:insrtWeight d:deleteWeight
-
- "'ocmprt' levenshteinTo:'computer'
+ s:substWeight k:kbdTypoWeight c:caseWeight
+ i:insrtWeight d:deleteWeight
+
+ "
+ 'ocmprt' levenshteinTo:'computer'
'computer' levenshteinTo:'computer'
'ocmputer' levenshteinTo:'computer'
'cmputer' levenshteinTo:'computer'
- 'Computer' levenshteinTo:'computer'"
+ 'computer' levenshteinTo:'cmputer'
+ 'computer' levenshteinTo:'vomputer'
+ 'computer' levenshteinTo:'bomputer'
+ 'Computer' levenshteinTo:'computer'
+ "
!
notEmpty
@@ -3372,5 +3422,5 @@
!String class methodsFor:'documentation'!
version
- ^ '$Header: /cvs/stx/stx/libbasic/String.st,v 1.210 2004-08-03 19:29:04 penk Exp $'
+ ^ '$Header: /cvs/stx/stx/libbasic/String.st,v 1.211 2004-11-18 14:01:24 stefan Exp $'
! !