FourByteString.st
author Claus Gittinger <cg@exept.de>
Sat, 02 May 2020 21:40:13 +0200
changeset 5476 7355a4b11cb6
parent 5090 33d4825d883d
permissions -rw-r--r--
#FEATURE by cg class: Socket class added: #newTCPclientToHost:port:domain:domainOrder:withTimeout: changed: #newTCPclientToHost:port:domain:withTimeout:

"
 COPYRIGHT (c) 2004 by eXept Software AG
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic2' }"

"{ NameSpace: Smalltalk }"

"FourByteString is declared as a variableLongSubclass of CharacterArray;
 it adds no named instance variables, class variables or pool dictionaries."
CharacterArray variableLongSubclass:#FourByteString
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text'
!

!FourByteString class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 2004 by eXept Software AG
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    FourByteStrings are like Strings, but store 32 bits (4 bytes) per character
    (see #bitsPerCharacter and #bytesPerCharacter).
    Their integration into the system is not yet complete.

    [author:]
	Claus Gittinger

    [see also:]
	Text TwoByteString UnicodeEncodedString
	StringCollection
"
! !

!FourByteString class methodsFor:'initialization'!

initialize
    "class initialization (private):
     set the receiver's flags to the value answered by 'Behavior flagLongs'"

    self flags:Behavior flagLongs

    "
     FourByteString initialize
    "

    "Modified: 22.4.1996 / 16:14:14 / cg"
! !

!FourByteString class methodsFor:'instance creation'!

basicNew:anInteger
    "create an instance with anInteger elements via the inherited basicNew:,
     fill all of them with the space character and
     answer whatever atAllPut: answers for that new instance"

    |blank|

    blank := Character space.
    ^ (super basicNew:anInteger) atAllPut:blank

    "Modified: 26.2.1996 / 14:38:47 / cg"
!

uninitializedNew:anInteger
    "return a new string with room for anInteger characters.
     In contrast to #basicNew: as defined in this class, the elements are
     NOT filled with spaces here; the instance is returned exactly as
     the inherited basicNew: delivers it."

    ^ super basicNew:anInteger

    "
	self uninitializedNew:10
    "
! !

!FourByteString methodsFor:'accessing'!

basicAt:index
    "answer the element at position index as a Character.
     The inherited basicAt: delivers the stored numeric value, which is
     wrapped into a Character here (32-bit characters)."

    ^ Character value:(super basicAt:index)
!

basicAt:index put:aCharacter
    "store aCharacter at position index.
     The character's codePoint is what gets passed to the inherited
     basicAt:put: (32-bit characters are stored as numbers).
     Answers aCharacter, not the receiver."

    super basicAt:index put:aCharacter codePoint.
    ^ aCharacter
! !

!FourByteString methodsFor:'filling and replacing'!

from:start to:stop put:aCharacter
    "fill the elements start..stop (1-based, inclusive) of the receiver with aCharacter
     and return the receiver.
     - reimplemented here for speed

     The inline C code handles the case where aCharacter is a Character, start and stop
     are SmallIntegers with 0 < start <= stop <= size, and the receiver's class is
     Unicode32String or a class without named instance variables.
     Every other case is passed on to the inherited implementation."

%{  /* NOCONTEXT */

    // fprintf(stderr, "fill32...\n");
    if (__isCharacter(aCharacter)
     && __bothSmallInteger(start, stop)) {
	OBJ cls;
	/* INT instead of int: a SmallInteger can be wider than a C int, and a
	 * truncated index could slip through the range check below */
	INT len, index1, index2;

	len = __unicode32StringSize(self);
	index1 = __intVal(start);
	index2 = __intVal(stop);

	if (((cls = __qClass(self)) == Unicode32String)
	 || (__OBJS2BYTES__(__intVal(__ClassInstPtr(cls)->c_ninstvars)) == 0)) {
	    REGISTER unsigned INT charValue = __intVal(__characterVal(aCharacter));

	    /* charValue is compared at full width (no narrowing cast) */
	    if ((charValue <= 0x0FFFFFFF)
	     && (index1 <= index2)
	     && (index1 > 0)
	     && (index2 <= len)) {
		/* the destination pointer is only formed once the range is known to be valid */
		REGISTER unsigned int *dstp = __unicode32StringVal(self) + index1 - 1;
		REGISTER INT count = index2 - index1 + 1;

#if (__POINTER_SIZE__ == 8)
		{
		    /* two copies of the character packed into one 64-bit word */
		    unsigned INT v2;

		    v2 = (charValue << 32) | charValue;

		    /* fill unaligned part */
		    while ((count > 0) && (((unsigned INT)dstp & 7) != 0)) {
			*dstp++ = charValue;
			count--;
		    }

		    /* fill aligned part; every 64-bit store writes two characters */
		    // TODO: use SSE instructions, if possible
		    while (count >= 16) {
			((unsigned INT *)dstp)[0] = v2;
			((unsigned INT *)dstp)[1] = v2;
			((unsigned INT *)dstp)[2] = v2;
			((unsigned INT *)dstp)[3] = v2;
			((unsigned INT *)dstp)[4] = v2;
			((unsigned INT *)dstp)[5] = v2;
			((unsigned INT *)dstp)[6] = v2;
			((unsigned INT *)dstp)[7] = v2;
			dstp += 16;
			count -= 16;
		    }
		    if (count >= 8) {
			((unsigned INT *)dstp)[0] = v2;
			((unsigned INT *)dstp)[1] = v2;
			((unsigned INT *)dstp)[2] = v2;
			((unsigned INT *)dstp)[3] = v2;
			dstp += 8;
			count -= 8;
		    }
		    if (count >= 4) {
			((unsigned INT *)dstp)[0] = v2;
			((unsigned INT *)dstp)[1] = v2;
			dstp += 4;
			count -= 4;
		    }
		    if (count >= 2) {
			((unsigned INT *)dstp)[0] = v2;
			dstp += 2;
			count -= 2;
		    }
		    /* at most one character is left */
		    if (count > 0) {
			*dstp = charValue;
		    }
		}
#else // not 64bit
		while (count >= 8) {
		    dstp[0] = dstp[1] = dstp[2] = dstp[3] =
		    dstp[4] = dstp[5] = dstp[6] = dstp[7] = charValue;
		    dstp += 8;
		    count -= 8;
		}
		while (count--) {
		    *dstp++ = charValue;
		}
#endif /* 64bit */
		RETURN (self);
	    }
	}
    }
%}.
    "
     fall back in case of non-integer index or out-of-bound index/value;
     will eventually lead to an out-of-bound signal raise
    "
    ^ super from:start to:stop put:aCharacter

    "
     (Unicode32String new:10) from:1 to:10 put:$a
     (Unicode32String new:20) from:10 to:20 put:$b
     (Unicode32String new:20) from:1 to:10 put:$c
     (Unicode32String new:100) from:2 to:99 put:$c

     (Unicode32String new:10) from:0 to:9 put:$a
     (Unicode32String new:10) from:1 to:11 put:$a
    "

    "Created: / 26-03-2019 / 11:30:51 / Claus Gittinger"
    "Modified: / 27-03-2019 / 14:10:18 / Claus Gittinger"
!

replaceFrom:start to:stop with:aString startingAt:repStart
    "replace the characters starting at index start, anInteger and ending
     at stop, anInteger with characters from aString starting at repStart.
     Return the receiver.

     - reimplemented here for speed.
     The inline C code copies from 8-bit, 16-bit and 32-bit strings when all
     index arguments are SmallIntegers within bounds and the receiver's class is
     Unicode32String or a class without named instance variables.
     Every other case is passed on to the inherited implementation."

%{  /* NOCONTEXT */

#ifndef NO_PRIM_STRING
    if (__bothSmallInteger(start, stop)) {
	/* INT instead of int: a SmallInteger can be wider than a C int, and a
	 * truncated index could slip through the range checks below */
	INT len;
	INT index1 = __intVal(start);
	INT index2 = __intVal(stop);

	/* empty range: nothing to do (same condition as count <= 0) */
	if (index2 < index1) {
	     RETURN (self);
	}
	len = __unicode32StringSize(self);
	/* repStart must be a SmallInteger before its value may be extracted;
	 * anything else is left to the inherited implementation */
	if ((index2 <= len) && (index1 > 0) && __isSmallInteger(repStart)) {
	    INT count = index2 - index1 + 1;
	    INT repIndex = __intVal(repStart);
	    OBJ cls;

	    if (((cls = __qClass(self)) == Unicode32String)
	     || (__OBJS2BYTES__(__intVal(__ClassInstPtr(cls)->c_ninstvars)) == 0)) {
		if (__isStringLike(aString)) {
		    /* 8-bit source: widen each byte */
		    INT repLen = __stringSize(aString);
		    if ((repIndex > 0) && (count <= (repLen - repIndex + 1))) {
			REGISTER unsigned char *srcp = __stringVal(aString) + repIndex - 1;
			REGISTER unsigned int *dstp  = __unicode32StringVal(self) + index1 - 1;

			while (count-- > 0) {
			    *dstp++ = *srcp++;
			}
			RETURN (self);
		    }
		} else if (__isTwoByteString(aString) || __isUnicode16String(aString)) {
		    /* 16-bit source: widen each element */
		    INT repLen = __twoByteStringSize(aString);
		    if ((repIndex > 0) && (count <= (repLen - repIndex + 1))) {
			REGISTER unsigned short *srcp = __twoByteStringVal(aString) + repIndex - 1;
			REGISTER unsigned int *dstp  = __unicode32StringVal(self) + index1 - 1;

			while (count-- > 0) {
			    *dstp++ = *srcp++;
			}
			RETURN (self);
		    }
		} else if (__isUnicode32String(aString)) {
		    /* 32-bit source: plain element copy */
		    INT repLen = __unicode32StringSize(aString);
		    if ((repIndex > 0) && (count <= (repLen - repIndex + 1))) {
			REGISTER unsigned int *srcp  = __unicode32StringVal(aString) + repIndex - 1;
			REGISTER unsigned int *dstp = __unicode32StringVal(self) + index1 - 1;

			if (aString == self) {
			    /* take care of overlapping copy */
			    memmove(dstp, srcp, count*sizeof(int));
			    RETURN (self);
			}
			if (count > 5) {
			    memcpy(dstp, srcp, count*sizeof(int));
			} else {
			    while (count-- > 0) {
				*dstp++ = *srcp++;
			    }
			}
			RETURN (self);
		    }
		}
	    }
	}
    }
#endif
%}.
    "/ arrive here if any index arg is not a small integer or out of range,
    "/ or if the source is neither a string, a two-byte string nor a four-byte string.
    ^ super replaceFrom:start to:stop with:aString startingAt:repStart

    "
     'hello world' asUnicode32String replaceFrom:1 to:5 with:'123456' startingAt:2
     'hello world' asUnicode32String replaceFrom:1 to:5 with:'123456' asUnicode16String startingAt:2
     'hello world' asUnicode32String replaceFrom:1 to:5 with:'123456' asUnicode32String startingAt:2
     'hello world' asUnicode32String replaceFrom:1 to:0 with:'123456' startingAt:2
     'hello' asUnicode32String replaceFrom:1 to:6 with:'123456' startingAt:2
     'hello world' asUnicode32String replaceFrom:1 to:1 with:'123456' startingAt:2
    "

    "Created: / 26-03-2019 / 12:10:26 / Claus Gittinger"
    "Modified: / 27-03-2019 / 14:11:27 / Claus Gittinger"
! !

!FourByteString methodsFor:'queries'!

bitsPerCharacter
    "return the number of bits used to store each character.
     Here, 32 is returned (each character occupies four bytes)."

    ^ 32
!

bytesPerCharacter
    "return the number of bytes used to store each character.
     Here, 4 is returned (each character occupies 32 bits)."

    ^ 4
!

isWideString
    "return true, because the receiver needs more than one byte per character
     (four bytes each, see #bytesPerCharacter)"

    ^ true
!

occurrencesOf:aCharacter
    "count the occurrences of the argument, aCharacter in myself
      - reimplemented here for speed.

     The inline C code only answers a count if aCharacter is a Character and the
     receiver's class is Unicode32String or a class without named instance variables.
     Every other case is passed on to the inherited implementation."

%{  /* NOCONTEXT */

    if (__isCharacter(aCharacter)) {
	REGISTER unsigned charValue = __intVal(__characterVal(aCharacter));

	if (charValue <= 0x3FFFFFFF) {
	    OBJ cls;

	    if (((cls = __qClass(self)) == Unicode32String)
	     || (__OBJS2BYTES__(__intVal(__ClassInstPtr(cls)->c_ninstvars)) == 0)) {
		REGISTER unsigned int *cp = __unicode32StringVal(self);
		REGISTER INT limit = __unicode32StringSize(self);
		REGISTER INT count = 0;

		/* loop unrolled and software-pipelined
		 * (gives 30-40% speedup on Intel-DUO using borland bcc55)
		 */
		while (limit >= 4) {
		    register unsigned int c1, c2;

		    c1 = cp[0];
		    limit -= 4;
		    c2 = cp[1];
		    if (c1 == charValue) count++;
		    c1 = cp[2];
		    if (c2 == charValue) count++;
		    c2 = cp[3];
		    if (c1 == charValue) count++;
		    cp += 4;
		    if (c2 == charValue) count++;
		}
		/* remaining 0..3 elements */
		while (limit > 0) {
		    register unsigned int c;

		    c = cp[0];
		    limit--;
		    if (c == charValue) count++;
		    cp++;
		}
		/* the count is only answered when the elements were actually scanned */
		RETURN ( __mkSmallInteger(count) );
	    }
	    /* receiver's class has named instance variables: nothing was scanned,
	     * so do not answer a zero count here - fall through to the inherited code */
	}
    }
%}.
    ^ super occurrencesOf:aCharacter

    "
     'hello world' asUnicode32String occurrencesOf:$a
     'hello world' asUnicode32String occurrencesOf:$w
     'hello world' asUnicode32String occurrencesOf:$l
     'hello world' asUnicode32String occurrencesOf:$x
     'hello world' asUnicode32String occurrencesOf:1

     Time millisecondsToRun:[
	|s|

	s := 'abcdefghijklmn' asUnicode32String.
	1000000 timesRepeat:[ s occurrencesOf:$x ]
     ]. 60 60 60 70 (untuned: 690 760 670)
    "

    "Created: / 27-03-2019 / 14:13:43 / Claus Gittinger"
! !

!FourByteString methodsFor:'testing'!

isSingleByteCollection
    "return true, if the receiver has access methods for bytes;
     i.e. #at: and #at:put: access a byte and are equivalent to #byteAt: and #byteAt:put:
     and #replaceFrom:to: is equivalent to #replaceBytesFrom:to:.
     False is returned here, since #at: returns 4-byte characters and not bytes
      - the method is redefined from UninterpretedBytes."

    ^ false

    "Created: / 30-08-2017 / 23:31:02 / cg"
! !

!FourByteString class methodsFor:'documentation'!

version
    "return the version header string of this source file.
     NOTE(review): the literal is presumably a keyword which gets expanded
     by the source code management system - confirm"

    ^ '$Header$'
!

version_CVS
    "return the version header string of this source file; same literal as in #version.
     NOTE(review): the literal is presumably a keyword which gets expanded
     by CVS - confirm"

    ^ '$Header$'
! !


"send #initialize to the class when this file is loaded"
FourByteString initialize!