"
COPYRIGHT (c) 1988-93 by Claus Gittinger
All Rights Reserved
This software is furnished under a license and may be used
only in accordance with the terms of that license and with the
inclusion of the above copyright notice. This software may not
be provided or otherwise made available to, or used by, any
other person. No title to or ownership of the software is
hereby transferred.
"
ByteArray subclass:#String
instanceVariableNames:''
classVariableNames:''
poolDictionaries:''
category:'Collections-Text'
!
String comment:'
COPYRIGHT (c) 1988-93 by Claus Gittinger
All Rights Reserved
%W% %E%
'!
%{
#include <stdio.h>
#include <ctype.h>
%}
!String class methodsFor:'documentation'!
documentation
"
Strings are ByteArrays storing Characters.
Strings are kind of kludgy: to allow for easy handling by c-functions,
there is always one 0-byte added at the end, which is not counted
in size. also, the at:put: method does not allow for storing 0-bytes.
(to do this, the basicAt:put: and basicNew: methods are redefined)
You cannot add any instvars to String, since the code (also in the run time
system & compiler) knows that strings have no named instvars. If you need
to, you have to create a subclass.
"
! !
!String class methodsFor:'instance creation'!
basicNew:anInteger
"return a new empty string with anInteger characters"
%{ /* NOCONTEXT */
OBJ newString;
REGISTER int len;
REGISTER unsigned char *cp;
REGISTER OBJ *op;
int nInstVars, instsize;
extern OBJ new();
if (_isSmallInteger(anInteger)) {
len = _intVal(anInteger);
if (len >= 0) {
nInstVars = _intVal(_ClassInstPtr(self)->c_ninstvars);
instsize = OHDR_SIZE + (nInstVars * sizeof(OBJ)) + len + 1;
PROTECT(self);
_qNew(newString, instsize, SENDER);
UNPROTECT(self);
_InstPtr(newString)->o_class = self;
if (nInstVars) {
#if defined(memset4)
memset4(_InstPtr(newString)->i_instvars, nil, nInstVars);
#else
# if defined(FAST_MEMSET) && ! defined(NEGATIVE_ADDRESSES)
/*
* knowing that nil is 0
*/
memset(_InstPtr(newString)->i_instvars, 0, instsize - OHDR_SIZE);
# else
op = _InstPtr(newString)->i_instvars;
do {
*op++ = nil;
} while (--nInstVars);
# endif
#endif
cp = _stringVal(newString) + (nInstVars * sizeof(OBJ));
} else
cp = _stringVal(newString);
#ifdef FAST_MEMSET
memset(cp, ' ', len);
*(cp + len) = '\0';
#else
while (len--)
*cp++ = ' ';
*cp = '\0';
#endif
RETURN (newString);
}
}
%}
.
^ (super basicNew:anInteger) atAllPut:(Character space)
!
new:anInteger
"same as basicNew - to avoid another send"
^ self basicNew:anInteger
!
basicNew
"return a new empty string"
^ self basicNew:0
!
new
"return a new empty string"
^ self basicNew:0
!
unititializedNew:anInteger
"redefine it back - strings must have a 0-byte at the end"
^ self basicNew:anInteger
!
fromString:aString
"return a copy of the argument, aString"
^ aString copyFrom:1 to:(aString size)
! !
!String methodsFor:'accessing'!
basicSize
"return the number of characters in myself"
%{ /* NOCONTEXT */
if ((_qClass(self) == String) || (_qClass(self) == Symbol)) {
RETURN ( _MKSMALLINT(_stringSize(self)) );
}
%}
.
^ super basicSize - 1
!
size
"return the number of characters in myself
- reimplemented here to avoid double send (size -> basicSize)"
%{ /* NOCONTEXT */
if ((_qClass(self) == String) || (_qClass(self) == Symbol)) {
RETURN ( _MKSMALLINT(_stringSize(self)) );
}
%}
.
^ super basicSize - 1
!
basicAt:index
"return the character at position index, an Integer
- reimplemented here since we return characters"
%{ /* NOCONTEXT */
REGISTER int indx;
int len;
if (_isSmallInteger(index)) {
indx = _intVal(index);
if (_qClass(self) != String)
indx += _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
len = _stringSize(self);
if ((indx > 0) && (indx <= len)) {
RETURN ( _MKCHARACTER(_stringVal(self)[indx-1] & 0xFF) );
}
}
%}
.
self subscriptBoundsError:index
!
basicAt:index put:aCharacter
"store the argument, aCharacter at position index, an Integer
- reimplemented here since we store characters"
%{ /* NOCONTEXT */
REGISTER int value, indx;
int len;
if (_isCharacter(aCharacter)) {
value = _intVal(_characterVal(aCharacter));
if (value && _isSmallInteger(index)) {
indx = _intVal(index);
if (_qClass(self) != String)
indx += _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
len = _stringSize(self);
if ((indx > 0) && (indx <= len)) {
_stringVal(self)[indx-1] = value;
RETURN ( aCharacter );
}
}
}
%}
.
(aCharacter isMemberOf:Character) ifFalse:[
self elementNotCharacter
] ifTrue:[
(aCharacter asciiValue == 0) ifTrue:[
self error:'0-character not allowed in strings'
] ifFalse:[
self subscriptBoundsError:index
]
]
! !
!String methodsFor:'converting'!
asUppercase
"return a copy of myself in uppercase letters"
|newStr
mySize "{ Class: SmallInteger }" |
mySize := self size.
newStr := self species new:mySize.
1 to:mySize do:[:i |
newStr at:i put:(self at:i) asUppercase
].
^newStr
!
asLowercase
"return a copy of myself in lowercase letters"
|newStr
mySize "{ Class: SmallInteger }" |
mySize := self size.
newStr := self species new:mySize.
1 to:mySize do:[:i |
newStr at:i put:(self at:i) asLowercase
].
^newStr
!
asString
"return myself - I am a string"
^ self
!
asSymbol
"return a unique symbol with name taken from myself.
The argument must be a String, subclass instances are not allowed."
%{
if (_qClass(self) == String) {
RETURN ( _MKSYMBOL(_stringVal(self), (OBJ *)0, __context) );
}
%}
.
self primitiveFailed
!
asText
"return a Text-object (collection of lines) from myself"
^ Text from:self
!
asNumber
"read a number from the receiver"
^ Number readFromString:self
"'123' asNumber"
"'123.567' asNumber"
"'(5/6)' asNumber"
!
asFilename
"return a Filename with pathname taken from the receiver"
^ Filename named:self
! !
!String methodsFor:'printing & storing'!
printOn:aStream
"print the receiver on aStream"
aStream nextPutAll:self
!
printString
"return a string for printing - thats myself"
^ self
!
print
"print the receiver on standard output - for debugging only"
%{ /* NOCONTEXT */
if (_qClass(self) == String) {
printf("%s", _stringVal(self));
RETURN (self);
}
%}
.
super print
!
printfPrintString:formatString
"non-portable but sometimes useful.
return a printed representation of the receiver
as specified by formatString, which is defined by printf.
No checking on overrunning the buffer - the result must be shorter than 8k chars"
%{ /* NOCONTEXT */
char buffer[8192];
char *cp;
if (_isString(formatString)) {
#ifdef THIS_CONTEXT
/* mhmh - sprintf seems to destroy thisContext (if its in a register) */
OBJ sav = __thisContext;
#endif
cp = (char *)_stringVal(self);
if (_qClass(self) != String)
cp += _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
sprintf(buffer, (char *)_stringVal(formatString), cp);
#ifdef THIS_CONTEXT
__thisContext = sav;
#endif
RETURN ( _MKSTRING(buffer COMMA_SND) );
}
%}
.
self primitiveFailed
"'hello' printfPrintString:'%%s -> %s'"
"'hello' printfPrintString:'%%10s -> %10s'"
"'hello' printfPrintString:'%%-10s -> %-10s'"
!
displayString
"return a string to display the receiver - use storeString to have
quotes around"
^ self storeString
!
storeString
"return a String for storing myself"
|s|
(self includes:$') ifTrue:[
s := ''''.
self do:[:thisChar |
(thisChar == $') ifTrue:[s := s , ''''].
s := s copyWith:thisChar
].
s := s , ''''.
^ s
].
^ '''' asString , self , '''' asString
!
storeOn:aStream
"put the storeString of myself on aStream"
aStream nextPut: $'.
(self includes:$') ifTrue:[
self do:[:thisChar |
(thisChar == $') ifTrue:[aStream nextPut:thisChar].
aStream nextPut:thisChar
]
] ifFalse:[
aStream nextPutAll:self
].
aStream nextPut:$'
! !
!String methodsFor:'comparing'!
hash
"return an integer useful as a hash-key"
%{ /* NOCONTEXT */
REGISTER int g, val;
REGISTER unsigned char *cp, *cp0;
int l;
cp = _stringVal(self);
l = _stringSize(self);
if (_qClass(self) != String) {
int n = _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
cp += n;
l -= n;
}
/*
* this is the dragon-book algorithm with a funny start
* value (to give short strings a number above 8192)
*/
val = 12345;
for (cp0 = cp, cp += l - 1; cp >= cp0; cp--) {
val = (val << 5) + (*cp & 0x1F);
if (g = (val & 0x3E000000))
val ^= g >> 25 /* 23 */ /* 25 */;
val &= 0x3FFFFFFF;
}
if (l) {
l |= 1;
val = (val * l) & 0x3FFFFFFF;
}
RETURN ( _MKSMALLINT(val) );
%}
!
<= something
"Compare the receiver with the argument and return true if the
receiver is less than or equal to the argument. Otherwise return false."
^ (self > something) not
!
< something
"Compare the receiver with the argument and return true if the
receiver is less than the argument. Otherwise return false."
^ (something > self)
!
>= something
"Compare the receiver with the argument and return true if the
receiver is greater than or equal to the argument.
Otherwise return false."
^ (something > self) not
!
> aString
"Compare the receiver with the argument and return true if the
receiver is greater than the argument. Otherwise return false."
%{ /* NOCONTEXT */
int len1, len2, cmp;
REGISTER OBJ s = aString;
char *cp1, *cp2;
if (_isNonNilObject(s)
&& ((_qClass(s) == String) || (_qClass(s) == Symbol) || (_qClass(s) == _qClass(self)))) {
cp1 = (char *) _stringVal(self);
len1 = _stringSize(self);
if (_qClass(self) != String) {
int n = _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
cp1 += n;
len1 -= n;
}
cp2 = (char *) _stringVal(s);
len2 = _stringSize(s);
if (_qClass(s) != String) {
int n = _intVal(_ClassInstPtr(_qClass(s))->c_ninstvars) * sizeof(OBJ);
cp2 += n;
len2 -= n;
}
if (len1 <= len2)
cmp = strncmp(cp1, cp2, len1);
else
cmp = strncmp(cp1, cp2, len2);
if (cmp > 0) {
RETURN ( true );
}
if ((cmp == 0) && (len1 > len2)) {
RETURN ( true );
}
RETURN ( false );
}
%}
.
^ super > aString
!
= aString
"Compare the receiver with the argument and return true if the
receiver is equal to the argument. Otherwise return false."
%{ /* NOCONTEXT */
int l1, l2;
REGISTER OBJ s = aString;
char *cp1, *cp2;
if (s == self) {
RETURN ( true );
}
if (! _isNonNilObject(s)) {
RETURN ( false );
}
if ((_qClass(s) == String) || (_qClass(s) == Symbol) || (_qClass(s) == _qClass(self))) {
cp1 = (char *) _stringVal(self);
l1 = _stringSize(self);
if (_qClass(self) != String) {
int n = _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
cp1 += n;
l1 -= n;
}
cp2 = (char *) _stringVal(s);
l2 = _stringSize(s);
if (_qClass(s) != String) {
int n = _intVal(_ClassInstPtr(_qClass(s))->c_ninstvars) * sizeof(OBJ);
cp2 += n;
l2 -= n;
}
if (l1 != l2) {
RETURN ( false );
}
RETURN ( (strncmp(cp1, cp2, l1) == 0) ? true : false );
}
%}
.
^ super = aString
!
~= aString
"Compare the receiver with the argument and return true if the
receiver is not equal to the argument. Otherwise return false."
%{ /* NOCONTEXT */
int l1, l2;
REGISTER OBJ s = aString;
char *cp1, *cp2;
if (s == self) {
RETURN ( false );
}
if (! _isNonNilObject(s)) {
RETURN ( true );
}
if ((_qClass(s) == String) || (_qClass(s) == Symbol) || (_qClass(s) == _qClass(self))) {
cp1 = (char *) _stringVal(self);
l1 = _stringSize(self);
if (_qClass(self) != String) {
int n = _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
cp1 += n;
l1 -= n;
}
cp2 = (char *) _stringVal(s);
l2 = _stringSize(s);
if (_qClass(s) != String) {
int n = _intVal(_ClassInstPtr(_qClass(s))->c_ninstvars) * sizeof(OBJ);
cp2 += n;
l2 -= n;
}
if (l1 != l2) {
RETURN ( true );
}
RETURN ( (strncmp(cp1, cp2, l1) == 0) ? false : true );
}
%}
.
^ super ~= aString
! !
!String methodsFor:'testing'!
occurrencesOf:aCharacter
"count the occurrences of the argument, aCharacter in myself
- reimplemented here for speed"
%{ /* NOCONTEXT */
REGISTER unsigned char *cp;
REGISTER int byteValue;
REGISTER int count;
if (_isCharacter(aCharacter)) {
count = 0;
byteValue = _intVal(_characterVal(aCharacter));
cp = _stringVal(self);
if (_qClass(self) != String)
cp += _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
while (*cp) {
if (*cp++ == byteValue) count++;
}
RETURN ( _MKSMALLINT(count) );
}
%}
.
^ 0
!
includes:aCharacter
"return true if the argument, aCharacter is included in the receiver
- reimplemented here for speed"
%{ /* NOCONTEXT */
REGISTER unsigned char *cp;
REGISTER int byteValue;
extern char *strchr();
if (_isCharacter(aCharacter)) {
byteValue = _intVal(_characterVal(aCharacter));
cp = _stringVal(self);
if (_qClass(self) != String)
cp += _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
#ifdef FAST_STRCHR
cp = (unsigned char *) strchr(cp, _intVal(_characterVal(aCharacter)));
if (cp) {
RETURN ( true );
}
#else
while (*cp) {
if (*cp == byteValue) {
RETURN ( true );
}
cp++;
}
#endif
}
%}
.
^ false
!
indexOf:aCharacter
"return the index of the first occurrences of the argument, aCharacter
in the receiver or 0 if not found - reimplemented here for speed."
%{ /* NOCONTEXT */
REGISTER unsigned char *cp;
#ifdef FAST_STRCHR
char *strchr();
#else
REGISTER int byteValue;
REGISTER int index;
#endif
if (_isCharacter(aCharacter)) {
cp = _stringVal(self);
if (_qClass(self) != String)
cp += _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
#ifdef FAST_STRCHR
cp = (unsigned char *) strchr(cp, _intVal(_characterVal(aCharacter)));
if (cp) {
RETURN ( _MKSMALLINT(cp - _stringVal(self) + 1) );
}
#else
byteValue = _intVal(_characterVal(aCharacter));
index = 1;
while (*cp) {
if (*cp++ == byteValue) {
RETURN ( _MKSMALLINT(index) );
}
index++;
}
#endif
}
%}
.
^ 0
!
indexOf:aCharacter startingAt:start
"return the index of the first occurrence of the argument, aCharacter
in myself starting at start, anInteger or 0 if not found;
- reimplemented here for speed"
%{ /* NOCONTEXT */
REGISTER unsigned char *cp;
REGISTER int index, byteValue;
#ifdef FAST_STRCHR
char *strchr();
#endif
int len;
if (_isSmallInteger(start)) {
if (_isCharacter(aCharacter)) {
byteValue = _intVal(_characterVal(aCharacter));
index = _intVal(start);
if (index <= 0)
index = 1;
if (_qClass(self) != String)
index += _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
len = _stringSize(self);
if (index <= len) {
cp = _stringVal(self) + index - 1;
#ifdef FAST_STRCHR
cp = (unsigned char *) strchr(cp, byteValue);
if (cp) {
RETURN ( _MKSMALLINT(cp - _stringVal(self) + 1) );
}
#else
while (*cp) {
if (*cp++ == byteValue) {
RETURN ( _MKSMALLINT(index) );
}
index++;
}
#endif
}
}
RETURN ( _MKSMALLINT(0) );
}
%}
.
^ super indexOf:aCharacter startingAt:start
!
indexOfSeparatorStartingAt:start
"return the index of the next separator character"
%{ /* NOCONTEXT */
REGISTER unsigned char *cp;
REGISTER char c;
int len, index;
index = _intVal(start);
if (index <= 0) {
index = 1;
}
if (_qClass(self) != String)
index += _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
len = _stringSize(self);
if (index > len) {
RETURN ( _MKSMALLINT(0) );
}
cp = _stringVal(self) + index - 1;
while (c = *cp++) {
#ifdef ASCII
if (c <= ' ')
#endif
if ((c == ' ') || (c == '\t') || (c == '\n')
|| (c == '\r') || (c == '\f')) {
RETURN ( _MKSMALLINT(cp - _stringVal(self)) );
}
}
%}
.
^ 0
!
includesMatchCharacters
"return true if the receiver includes any meta characters (i.e. $* or $#)
for match operations; false if not"
(self includes:$*) ifTrue:[^ true].
^ (self includes:$#)
!
from:matchStart to:matchStop match:aString from:start to:stop
"helper for match; return true if the characters from start to stop in
aString are matching the receivers characters from matchStart to matchStop.
The receiver may contain meta-match characters $* (to match any string)
or $# (to match any character)."
|matchChar mStart mStop sStart sStop mSize sSize index cont matchLast|
mStart := matchStart.
mStop := matchStop.
sStart := start.
sStop := stop.
[true] whileTrue:[
mSize := mStop - mStart + 1.
sSize := sStop - sStart + 1.
"empty strings match"
(mSize == 0) ifTrue:[^ (sSize == 0)].
matchChar := self at:mStart.
(matchChar == $#) ifTrue:[
"testString empty -> no match"
(sSize == 0) ifTrue:[^ false].
"# matches single character"
((sSize == 1) and:[mSize == 1]) ifTrue:[^ true].
"cut off 1st chars and continue"
mStart := mStart + 1.
sStart := sStart + 1
] ifFalse:[
(matchChar == $*) ifTrue:[
"testString empty -> we have a match"
(sSize == 0) ifTrue:[^ true].
"* matches anything"
(mSize == 1) ifTrue:[^ true].
"try to avoid some of the recursion by checking last
character and continue with shortened strings if possible"
cont := false.
(mStop >= mStart) ifTrue:[
matchLast := self at:mStop.
(matchLast ~~ $*) ifTrue:[
(matchLast == $#) ifTrue:[
cont := true
] ifFalse:[
(matchLast == (aString at:sStop)) ifTrue:[
cont := true
]
]
]
].
cont ifFalse:[
index := sStart.
[index <= sStop] whileTrue:[
(self from:(mStart + 1) to:mStop match:aString
from:index to:sStop) ifTrue:[
^ true
].
index := index + 1
].
^ false
].
mStop := mStop - 1.
sStop := sStop - 1
] ifFalse:[
"testString empty ?"
(sSize == 0) ifTrue:[^ false].
"first characters equal ?"
((aString at:sStart) ~~ matchChar) ifTrue:[^ false].
"avoid recursion if possible"
((sSize == mSize) and:[self = aString]) ifTrue:[^ true].
"cut off 1st chars and continue"
mStart := mStart + 1.
sStart := sStart + 1
]
]
]
!
match:aString
"return true if aString matches self, where self may contain meta-match
characters $* (to match any string) or $# (to match any character)."
^ self from:1 to:(self size) match:aString from:1 to:(aString size)
" '*ute*' match:'computer' "
" '*uter' match:'computer' "
" 'uter*' match:'computer' "
!
startsWith:aString
"return true, if the receiver starts with something, aString"
(aString isKindOf:String) ifFalse: [
(aString isMemberOf:Character) ifTrue:[
self isEmpty ifTrue:[^ false].
^ (self at:1) == aString
].
^ super startsWith:aString
].
%{
int len1, len2;
REGISTER unsigned char *src1, *src2;
REGISTER OBJ s = aString;
len1 = _qSize(self);
src1 = _stringVal(self);
if (_qClass(self) != String) {
int n = _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
len1 -= n;
src1 += n;
}
len2 = _qSize(s);
src2 = _stringVal(s);
if (_qClass(s) != String) {
int n = _intVal(_ClassInstPtr(_qClass(s))->c_ninstvars) * sizeof(OBJ);
len2 -= n;
src2 += n;
}
if (len1 < len2) {
RETURN ( false );
}
while (*src2)
if (*src2++ != *src1++) {
RETURN ( false );
}
%}
.
^ true
"'hello world' startsWith:'hello'"
"'hello world' startsWith:'hi'"
!
endsWith:aString
"return true, if the receiver end with something, aString"
(aString isKindOf:String) ifFalse: [
(aString isMemberOf:Character) ifTrue:[
self isEmpty ifTrue:[^ false].
^ (self at:(self size)) == aString
].
^ super endsWith:aString
].
%{
int len1, len2;
REGISTER unsigned char *src1, *src2;
REGISTER OBJ s = aString;
len1 = _qSize(self);
src1 = _stringVal(self);
if (_qClass(self) != String) {
int n = _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
len1 -= n;
src1 += n;
}
len2 = _qSize(s);
src2 = _stringVal(s);
if (_qClass(s) != String) {
int n = _intVal(_ClassInstPtr(_qClass(s))->c_ninstvars) * sizeof(OBJ);
len2 -= n;
src2 += n;
}
if (len1 < len2) {
RETURN ( false );
}
src1 = _stringVal(self) + len1 - len2;
src2 = _stringVal(aString);
while (*src2)
if (*src2++ != *src1++) {
RETURN ( false );
}
%}
.
^ true
"'hello world' endsWith:'world'"
"'hello world' endsWith:'earth'"
!
isBlank
"return true, if the receiver contains spaces only"
%{ /* NOCONTEXT */
REGISTER unsigned char *src;
src = _stringVal(self);
if (_qClass(self) != String)
src += _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
while (*src)
if (*src++ != ' ') {
RETURN ( false );
}
%}
.
^ true
!
countWords
"return the number of words, which are separated by separators"
|tally start stop mySize|
tally := 0.
start := 1.
mySize := self size.
[start <= mySize] whileTrue:[
(self at:start) isSeparator ifTrue:[
start := start + 1
] ifFalse:[
stop := self indexOfSeparatorStartingAt:start.
(stop == 0) ifTrue:[
stop := mySize + 1
].
tally := tally + 1.
start := stop
]
].
^ tally
!
asCollectionOfWords
"return a collection containing the words (separated by whitespace) of the receiver"
|words start stop mySize|
words := OrderedCollection new.
start := 1.
mySize := self size.
[start <= mySize] whileTrue:[
(self at:start) isSeparator ifTrue:[
start := start + 1
] ifFalse:[
stop := self indexOfSeparatorStartingAt:start.
stop == 0 ifTrue:[
words add:(self copyFrom:start to:mySize).
^ words
].
words add:(self copyFrom:start to:(stop - 1)).
start := stop
]
].
^ words
!
levenshteinTo:aString
"return the levenshtein distance to the argument, aString;
this value corrensponds to the number of replacements that have to be
made to get aString from the receiver.
see IEEE transactions on Computers 1976 Pg 172 ff."
^ self levenshteinTo:aString s:4 c:1 i:2 d:6
!
levenshteinTo:aString s:substWeight c:caseWeight i:insrtWeight d:deleteWeight
"parametrized levenshtein. arguments are the costs for
substitution, case-change, insertion and deletion of a character."
|d "delta matrix"
len1 len2 dim prevRow row col dimPlus1
min pp c1 c2|
%{ /* NOCONTEXT */
/*
* this is very heavy used when correcting errors
* (all symbols are searched for best match) - therefore it must be fast
*/
{
unsigned short *data;
int l1, l2;
REGISTER int sz;
unsigned char *s1, *s2;
int v1, v2, v3, m;
REGISTER unsigned short *dp;
REGISTER int delta;
REGISTER int j;
int i;
int iW, cW, sW, dW;
# define FASTSIZE 30
short fastData[(FASTSIZE + 1) * (FASTSIZE + 1)];
if ((_isString(self) || _isSymbol(self))
&& (_isString(aString) || _isSymbol(aString))
&& _isSmallInteger(insrtWeight) && _isSmallInteger(caseWeight)
&& _isSmallInteger(substWeight) && _isSmallInteger(deleteWeight)) {
iW = _intVal(insrtWeight);
cW = _intVal(caseWeight);
sW = _intVal(substWeight);
dW = _intVal(deleteWeight);
s1 = _stringVal(self);
s2 = _stringVal(aString);
l1 = strlen(s1);
l2 = strlen(s2);
sz = (l1 < l2) ? l2 : l1;
delta = sz + 1;
if (sz <= FASTSIZE) {
data = fastData;
} else {
/* add ifdef ALLOCA here ... */
data = (unsigned short *)malloc(delta * delta * sizeof(short));
}
data[0] = 0;
dp = data+1;
for (j=1, dp=data+1; j<=sz; j++, dp++)
*dp = *(dp-1) + iW;
for (i=1, dp=data+delta; i<=sz; i++, dp+=delta)
*dp = *(dp-delta) + dW;
for (i=1; i<=l1; i++) {
for (j=1; j<=l2; j++) {
dp = data + (i*delta) + j;
if (s1[i] != s2[j]) {
if (tolower(s1[i]) == tolower(s2[j])) {
m = cW;
} else {
m = sW;
}
} else
m = 0;
v2 = *(dp - 1) + iW;
v3 = *(dp - delta) + dW;
v1 = *(dp - delta - 1) + m;
if (v1 < v2)
if (v1 < v3)
m = v1;
else
m = v3;
else
if (v2 < v3)
m = v2;
else
m = v3;
*dp = m;
}
}
m = data[l1 * delta + l2];
if (sz > FASTSIZE)
free(data);
RETURN ( _MKSMALLINT(m) );
}
}
%}
.
len1 := self size.
len2 := aString size.
"create the help-matrix"
dim := len1 max:len2.
dimPlus1 := dim + 1.
d := Array new:dimPlus1.
1 to:dimPlus1 do:[:i |
d at:i put:(Array new:dimPlus1)
].
"init help-matrix"
(d at:1) at:1 put:0.
row := d at:1.
1 to:dim do:[:j |
row at:(j + 1) put:( (row at:j) + insrtWeight )
].
1 to:dim do:[:i |
(d at:(i + 1)) at:1 put:( ((d at:i) at:1) + deleteWeight )
].
1 to:len1 do:[:i |
c1 := self at:i.
1 to:len2 do:[:j |
c2 := aString at:j.
(c1 == c2) ifTrue:[
pp := 0
] ifFalse:[
(c1 asLowercase == c2 asLowercase) ifTrue:[
pp := caseWeight
] ifFalse:[
pp := substWeight
]
].
prevRow := d at:i.
row := d at:(i + 1).
col := j + 1.
min := (prevRow at:j) + pp.
min := min min:( (row at:j) + insrtWeight).
min := min min:( (prevRow at:col) + deleteWeight).
row at:col put: min
]
].
^ (d at:(len1 + 1)) at:(len2 + 1)
"'ocmprt' levenshteinTo:'computer'
'computer' levenshteinTo:'computer'
'ocmputer' levenshteinTo:'computer'
'cmputer' levenshteinTo:'computer'
'Computer' levenshteinTo:'computer'"
! !
!String methodsFor:'copying'!
shallowCopy
"return a copy of the receiver
- redefined for more speed"
^ self copyFrom:1
!
deepCopy
"return a copy of the receiver
- redefined for speed"
^ self copyFrom:1
!
, aString
"return the concatenation of myself and the argument, aString
- reimplemented here for speed"
|newString|
%{
int l1, l2;
char *cp1, *cp2;
REGISTER unsigned char *dstp;
REGISTER OBJ s = aString;
OBJ new();
if ((_qClass(s) == String) || (_qClass(s) == Symbol) || (_qClass(s) == _qClass(self))) {
cp1 = (char *) _stringVal(self);
l1 = _stringSize(self);
if (_qClass(self) != String) {
int n = _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
cp1 += n;
l1 -= n;
}
cp2 = (char *) _stringVal(s);
l2 = _stringSize(s);
if (_qClass(s) != String) {
int n = _intVal(_ClassInstPtr(_qClass(s))->c_ninstvars) * sizeof(OBJ);
cp2 += n;
l2 -= n;
}
_qNew(newString, OHDR_SIZE + l1 + l2 + 1, __context);
_InstPtr(newString)->o_class = String;
dstp = _stringVal(newString);
#ifdef FAST_MEMCPY
bcopy(cp1, dstp, l1);
bcopy(cp2, dstp + l1, l2+1);
#else
# ifdef FAST_STRCPY
strcpy(dstp, cp1);
strcpy(dstp + l1, cp2);
# else
while ((*dstp++ = *cp1++) != '\0') ;
dstp--;
while ((*dstp++ = *cp2++) != '\0') ;
# endif
#endif
RETURN ( newString );
}
%}
.
^ super , aString
!
concatenate:string1 and:string2
"return the concatenation of myself and the arguments, string1 and string2.
This is equivalent to self , string1 , string2
- generated by compiler when such a construct is detected"
|newString|
%{
int len1, len2, len3;
#if !defined(FAST_MEMCPY) && !defined(FAST_STRCPY)
REGISTER unsigned char *srcp;
#endif
REGISTER unsigned char *dstp;
OBJ new();
if ((_isString(self) || _isSymbol(self))
&& (_isString(string1) || _isSymbol(string1))
&& (_isString(string2) || _isSymbol(string2))) {
len1 = _stringSize(self);
len2 = _stringSize(string1);
len3 = _stringSize(string2);
_qNew(newString, OHDR_SIZE + len1 + len2 + len3 + 1, __context);
_InstPtr(newString)->o_class = String;
dstp = _stringVal(newString);
#ifdef FAST_MEMCPY
bcopy(_stringVal(self), dstp, len1);
bcopy(_stringVal(string1), dstp + len1, len2);
bcopy(_stringVal(string2), dstp + len1 + len2, len3+1);
#else
# ifdef FAST_STRCPY
strcpy(dstp, _stringVal(self));
strcpy(dstp + len1, _stringVal(string1));
strcpy(dstp + len1 + len2, _stringVal(string2));
# else
srcp = _stringVal(self);
while ((*dstp++ = *srcp++) != '\0') ;
dstp--;
srcp = _stringVal(string1);
while ((*dstp++ = *srcp++) != '\0') ;
dstp--;
srcp = _stringVal(string2);
while ((*dstp++ = *srcp++) != '\0') ;
# endif
#endif
RETURN ( newString );
}
%}
.
^ super , string1 , string2
!
concatenate:string1 and:string2 and:string3
"return the concatenation of myself and the string arguments.
This is equivalent to self , string1 , string2 , string3
- generated by compiler when such a construct is detected"
|newString|
%{
int len1, len2, len3, len4;
#if !defined(FAST_MEMCPY) && !defined(FAST_STRCPY)
REGISTER unsigned char *srcp;
#endif
REGISTER unsigned char *dstp;
OBJ new();
if ((_isString(self) || _isSymbol(self))
&& (_isString(string1) || _isSymbol(string1))
&& (_isString(string2) || _isSymbol(string2))
&& (_isString(string3) || _isSymbol(string3))) {
len1 = _stringSize(self);
len2 = _stringSize(string1);
len3 = _stringSize(string2);
len4 = _stringSize(string3);
_qNew(newString, OHDR_SIZE + len1 + len2 + len3 + len4 + 1, __context);
_InstPtr(newString)->o_class = String;
dstp = _stringVal(newString);
#ifdef FAST_MEMCPY
bcopy(_stringVal(self), dstp, len1);
bcopy(_stringVal(string1), dstp + len1, len2);
bcopy(_stringVal(string2), dstp + len1 + len2, len3);
bcopy(_stringVal(string3), dstp + len1 + len2 + len3, len4+1);
#else
# ifdef FAST_STRCPY
strcpy(dstp, _stringVal(self));
strcpy(dstp + len1, _stringVal(string1));
strcpy(dstp + len1 + len2, _stringVal(string2));
strcpy(dstp + len1 + len2 + len3, _stringVal(string3));
# else
srcp = _stringVal(self);
while ((*dstp++ = *srcp++) != '\0') ;
dstp--;
srcp = _stringVal(string1);
while ((*dstp++ = *srcp++) != '\0') ;
dstp--;
srcp = _stringVal(string2);
while ((*dstp++ = *srcp++) != '\0') ;
dstp--;
srcp = _stringVal(string3);
while ((*dstp++ = *srcp++) != '\0') ;
# endif
#endif
RETURN ( newString );
}
%}
.
^ super , string1 , string2 , string3
!
copyWith:aCharacter
"return the concatenation of myself and the argument, aCharacter
- reimplemented here for speed"
|newString|
(aCharacter isMemberOf:Character) ifFalse:[
^ super copyWith:aCharacter
].
%{
OBJ new();
int sz;
REGISTER unsigned char *dstp;
int offs;
sz = _qSize(self) + 1;
if (_qClass(self) != String) {
offs = _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
sz -= offs;
} else
offs = 0;
_qNew(newString, sz, __context);
_InstPtr(newString)->o_class = String;
dstp = _stringVal(newString);
#ifdef FAST_MEMCPY
sz = sz - OHDR_SIZE - 1 - 1;
bcopy(_stringVal(self) + offs, dstp, sz);
dstp += sz;
#else
# ifdef FAST_STRCPY
strcpy(dstp, _stringVal(self) + offs);
dstp += sz - OHDR_SIZE - 1 - 1;
# else
{
REGISTER unsigned char *srcp;
srcp = _stringVal(self) + offs;
while ((*dstp = *srcp++) != '\0')
dstp++;
}
# endif
#endif
*dstp++ = _intVal(_characterVal(aCharacter));
*dstp = '\0';
%}
.
^ newString
!
copyFrom:start to:stop
"return the substring starting at index start, anInteger and ending
at stop, anInteger.
- reimplemented here for speed"
|newString|
%{
OBJ new();
#if !defined(FAST_MEMCPY)
REGISTER unsigned char *srcp;
#endif
REGISTER unsigned char *dstp;
REGISTER int count;
int len, index1, index2;
if (_isSmallInteger(start) && _isSmallInteger(stop)) {
len = _stringSize(self);
index1 = _intVal(start);
index2 = _intVal(stop);
if ((index1 <= index2) && (index1 > 0)) {
if (_qClass(self) != String) {
int n = _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
index1 += n;
index2 += n;
}
if (index2 <= len) {
count = index2 - index1 + 1;
_qNew(newString, OHDR_SIZE+count+1, __context);
_InstPtr(newString)->o_class = String;
dstp = _stringVal(newString);
#ifdef FAST_MEMCPY
bcopy(_stringVal(self) + index1 - 1, dstp, count);
dstp[count] = '\0';
#else
srcp = _stringVal(self) + index1 - 1;
while (count--) {
*dstp++ = *srcp++;
}
*dstp = '\0';
#endif
RETURN ( newString );
}
}
}
%}
.
^ super copyFrom:start to:stop
!
copyFrom:start
"return the substring from start, anInteger to the end
- reimplemented here for speed"
|newString|
%{
OBJ new();
#if !defined(FAST_MEMCPY)
REGISTER unsigned char *srcp;
#endif
REGISTER unsigned char *dstp;
REGISTER int count;
int len, index1;
if (_isSmallInteger(start)) {
len = _stringSize(self);
index1 = _intVal(start);
if (index1 > 0) {
if (_qClass(self) != String) {
int n = _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
index1 += n;
}
if (index1 <= len) {
count = len - index1 + 1;
_qNew(newString, OHDR_SIZE+count+1, __context);
_InstPtr(newString)->o_class = String;
dstp = _stringVal(newString);
#ifdef FAST_MEMCPY
bcopy(_stringVal(self) + index1 - 1, dstp, count);
dstp[count] = '\0';
#else
srcp = _stringVal(self) + index1 - 1;
while (count--) {
*dstp++ = *srcp++;
}
*dstp = '\0';
#endif
RETURN ( newString );
}
}
}
%}
.
^ super copyFrom:start
! !
!String methodsFor:'filling and replacing'!
replaceFrom:start to:stop with:aString startingAt:repStart
"replace the characters starting at index start, anInteger and ending
at stop, anInteger with characters from aString starting at repStart.
- reimplemented here for speed"
%{ /* NOCONTEXT */
REGISTER unsigned char *srcp, *dstp;
REGISTER int count;
int len, index1, index2;
int repLen, repIndex;
if ((_isString(aString) || _isSymbol(aString))
&& _isString(self)
&& _isSmallInteger(start)
&& _isSmallInteger(stop)) {
len = _stringSize(self);
index1 = _intVal(start);
index2 = _intVal(stop);
count = index2 - index1 + 1;
if (count <= 0) {
RETURN (self);
}
if ((index2 <= len) && (index1 > 0)) {
repLen = _stringSize(aString);
repIndex = _intVal(repStart);
if ((repIndex > 0) && ((repIndex + count - 1) <= repLen)) {
srcp = _stringVal(aString) + repIndex - 1;
dstp = _stringVal(self) + index1 - 1;
if (aString == self) {
/* take care of overlapping copy */
if (srcp < dstp) {
/* must do a reverse copy */
srcp += count;
dstp += count;
while (count-- > 0) {
*--dstp = *--srcp;
}
RETURN (self);
}
}
#ifdef FAST_MEMCPY
bcopy(srcp, dstp, count);
#else
while (count-- > 0) {
*dstp++ = *srcp++;
}
#endif
RETURN (self);
}
}
}
%}
.
^ super replaceFrom:start to:stop with:aString startingAt:repStart
!
replaceAll:oldCharacter by:newCharacter
"replace all oldCharacters by newCharacter in the receiver"
%{ /* NOCONTEXT */
REGISTER unsigned char *srcp;
REGISTER unsigned oldVal, newVal;
if (_isCharacter(oldCharacter)
&& _isCharacter(newCharacter)
&& _isString(self)) {
srcp = (unsigned char *)_stringVal(self);
oldVal = _intVal(_characterVal(oldCharacter));
newVal = _intVal(_characterVal(newCharacter));
while (*srcp) {
if (*srcp == oldVal)
*srcp = newVal;
srcp++;
}
RETURN ( self );
}
%}
.
^ super replaceAll:oldCharacter by:newCharacter
!
reverse
"in-place reverse the characters of the string"
%{ /* NOCONTEXT */
REGISTER char c;
REGISTER unsigned char *hip, *lowp;
if (_isString(self)) {
lowp = _stringVal(self);
hip = lowp + _stringSize(self) - 1;
while (lowp < hip) {
c = *lowp;
*lowp = *hip;
*hip = c;
lowp++;
hip--;
}
RETURN ( self );
}
%}
.
^ super reverse
!
withCRs
"return a copy of the receiver, where
all \-characters are replaced by newline characters
- reimplemented here for speed"
|newString|
%{
OBJ new();
REGISTER char c;
REGISTER unsigned char *srcp, *dstp;
int len, offs;
len = _qSize(self);
if (_qClass(self) != String) {
offs = _intVal(_ClassInstPtr(_qClass(self))->c_ninstvars) * sizeof(OBJ);
len -= offs;
} else
offs = 0;
_qNew(newString, len, __context);
_InstPtr(newString)->o_class = String;
srcp = _stringVal(self) + offs;
dstp = _stringVal(newString);
while (c = *srcp++)
if (c == '\\')
*dstp++ = '\n';
else
*dstp++ = c;
*dstp++ = '\0';
RETURN ( newString );
%}
!
atAllPut:aCharacter
"replace all characters with aCharacter
- reimplemented here for speed"
%{ /* NOCONTEXT */
REGISTER int byteValue;
#ifndef FAST_MEMSET
REGISTER unsigned char *dst;
#endif
if (_isCharacter(aCharacter) && _isString(self)) {
byteValue = _intVal(_characterVal(aCharacter));
#ifdef FAST_MEMSET
memset(_stringVal(self), byteValue, _qSize(self) - OHDR_SIZE - 1);
#else
dst = _stringVal(self);
while (*dst != '\0')
*dst++ = byteValue;
#endif
RETURN ( self );
}
%}
.
^ super atAllPut:aCharacter
!
withoutSpaces
"return a copy of myself without leading and trailing spaces"
|startIndex endIndex blank|
startIndex := 1.
endIndex := self size.
blank := Character space.
[(startIndex < endIndex) and:[(self at:startIndex) == blank]] whileTrue:[
startIndex := startIndex + 1
].
[(endIndex > 1) and:[(self at:endIndex) == blank]] whileTrue:[
endIndex := endIndex - 1
].
startIndex > endIndex ifTrue:[
^ ''
].
((startIndex == 1) and:[endIndex == self size]) ifTrue:[
^ self
].
^ self copyFrom:startIndex to:endIndex
!
withoutSeparators
"return a copy of myself without leading and trailing whitespace"
|startIndex endIndex|
startIndex := 1.
endIndex := self size.
[(startIndex < endIndex) and:[(self at:startIndex) isSeparator]] whileTrue:[
startIndex := startIndex + 1
].
[(endIndex > 1) and:[(self at:endIndex) isSeparator]] whileTrue:[
endIndex := endIndex - 1
].
startIndex > endIndex ifTrue:[
^ ''
].
((startIndex == 1) and:[endIndex == self size]) ifTrue:[
^ self
].
^ self copyFrom:startIndex to:endIndex
! !
!String methodsFor:'queries'!
encoding
"assume iso8859 encoding"
^ #iso8859
!
knownAsSymbol
"return true, if there is a symbol with same characters in the
system - use to check for existance of a symbol without creating one"
%{ /* NOCONTEXT */
extern OBJ _KNOWNASSYMBOL();
RETURN ( _KNOWNASSYMBOL(_stringVal(self)) );
%}
! !