TwoByteString.st
author Claus Gittinger <cg@exept.de>
Tue, 09 Jul 2019 20:55:17 +0200
changeset 24417 03b083548da2
parent 23984 ac00c411f7f6
permissions -rw-r--r--
#REFACTORING by exept class: Smalltalk class changed: #recursiveInstallAutoloadedClassesFrom:rememberIn:maxLevels:noAutoload:packageTop:showSplashInLevels: Transcript showCR:(... bindWith:...) -> Transcript showCR:... with:...

"{ Encoding: utf8 }"

"
 COPYRIGHT (c) 1993 by Claus Gittinger
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic' }"

"{ NameSpace: Smalltalk }"

"Define TwoByteString as a subclass of CharacterArray, created via
 variableWordSubclass: (word-indexed instances), with no named instance
 variables, no class variables and no pool dictionaries,
 in the category 'Collections-Text'."
CharacterArray variableWordSubclass:#TwoByteString
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text'
!

!TwoByteString class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 1993 by Claus Gittinger
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    TwoByteStrings are like strings, but store 16 bits per character.
    Their integration into the system is not yet complete.

    [author:]
        Claus Gittinger

    [see also:]
        Text JISEncodedString
        StringCollection
"
! !

!TwoByteString class methodsFor:'initialization'!

initialize
    "class-side setup (private):
     set the flags of this class to the value answered by Behavior flagWords"

    |wordFlags|

    wordFlags := Behavior flagWords.
    self flags:wordFlags

    "
     TwoByteString initialize
    "

    "Modified: 22.4.1996 / 16:14:14 / cg"
! !

!TwoByteString class methodsFor:'instance creation'!

basicNew:anInteger
    "return a new string with room for anInteger characters,
     every element preset to the space character"

    |blank|

    blank := Character space.
    ^ (super basicNew:anInteger) atAllPut:blank

    "Modified: / 26-02-1996 / 14:38:47 / cg"
    "Modified (comment): / 22-11-2017 / 21:32:49 / cg"
!

uninitializedNew:anInteger
    "return a new string with room for anInteger characters.
     In contrast to #basicNew: as redefined in this class, the elements are
     NOT filled with spaces here; the contents are whatever the inherited
     basicNew: leaves in the new instance."

    ^ super basicNew:anInteger

    "
        self uninitializedNew:10
    "
! !

!TwoByteString methodsFor:'accessing'!

basicAt:index
    "answer the element stored at position index (an Integer) as a Character.
     - redefined here, because the inherited accessor delivers the raw
       16-bit value, which is wrapped into a Character here"

    ^ Character value:(super basicAt:index)

    "Modified: 26.2.1996 / 17:02:16 / cg"
!

basicAt:index put:aCharacter
    "store aCharacter at position index (an Integer) and answer aCharacter
     (not the receiver - sigh).
     - redefined here, because the code point of the character is what gets
       stored into the 16-bit element"

    |code|

    code := aCharacter codePoint.
    super basicAt:index put:code.
    ^ aCharacter

    "Modified: 19.4.1996 / 11:16:22 / cg"
!

unsignedShortAt:index
    "return the raw element at position index (an Integer), exactly as
     delivered by the inherited basicAt: - i.e. without wrapping it into
     a Character, as the basicAt: redefined in this class does"

    ^ super basicAt:index.
! !

!TwoByteString methodsFor:'filling and replacing'!

from:start to:stop put:aCharacter
    "fill the elements start..stop (both inclusive) of the receiver with
     aCharacter and return the receiver.
     - reimplemented here for speed.

     The inline C code handles the common case only:
     SmallInteger indices within the receiver's bounds, a character with
     a code of at most 16rFFFF and a receiver without named instance variables.
     Everything else is passed on to the inherited implementation."

%{  /* NOCONTEXT */

    REGISTER unsigned short *dstp;
    REGISTER INT count;
    REGISTER int charValue;

    if (__isCharacter(aCharacter)
     && __bothSmallInteger(start, stop)) {
        /* INT (not int): a SmallInteger may need more than 32 bits on a
         * 64-bit system; truncating it could turn an out-of-bounds index
         * into a seemingly valid one */
        INT len, index1, index2;
        OBJ cls;

        len = __twoByteStringSize(self);
        index1 = __intVal(start);
        index2 = __intVal(stop);

        if (((cls = __qClass(self)) == Unicode16String)
         || (__OBJS2BYTES__(__intVal(__ClassInstPtr(cls)->c_ninstvars)) == 0)) {

            charValue = __intVal(__characterVal(aCharacter));
            if (((unsigned)charValue <= 0xFFFF)
             && (index1 <= index2)
             && (index1 > 0)) {
                if (index2 <= len) {
                    /* only compute the destination after the bounds check */
                    dstp = __twoByteStringVal(self) + index1 - 1;
                    count = index2 - index1 + 1;

#if (__POINTER_SIZE__ == 8)
                    {
                        /* four copies of the 16-bit code in one 64-bit word.
                         * This must be built with unsigned 64-bit arithmetic:
                         * in signed int arithmetic, a code >= 0x8000 shifted
                         * left by 16 becomes negative and gets sign-extended,
                         * which filled the upper half of the word with 0xFFFF */
                        unsigned INT v4;

                        v4 = (unsigned INT)charValue;
                        v4 = (v4 << 16) | v4;
                        v4 = (v4 << 32) | v4;

                        /* fill unaligned part */
                        while ((count > 0) && (((unsigned INT)dstp & 7) != 0)) {
                            *dstp++ = charValue;
                            count--;
                        }

                        /* fill aligned part, four elements per store */
                        while (count >= 4) {
                            ((unsigned INT *)dstp)[0] = v4;
                            dstp += 4;
                            count -= 4;
                        }

                        /* fill rest */
                        while (count > 0) {
                            *dstp++ = charValue;
                            count--;
                        }
                        RETURN (self);
                    }
#endif /* 64bit */

                    /* not reached on 64-bit systems (returned above) */
                    while (count >= 8) {
                        dstp[0] = dstp[1] = dstp[2] = dstp[3] =
                        dstp[4] = dstp[5] = dstp[6] = dstp[7] = charValue;
                        dstp += 8;
                        count -= 8;
                    }
                    while (count > 0) {
                        *dstp++ = charValue;
                        count--;
                    }
                    RETURN (self);
                }
            }
        }
    }
%}.
    "
     fall back in case of non-integer index or out-of-bound index/value;
     will eventually lead to an out-of-bound signal raise
    "
    ^ super from:start to:stop put:aCharacter

    "
     (Unicode16String new:10) from:1 to:10 put:$a
     (Unicode16String new:20) from:10 to:20 put:$b
     (Unicode16String new:20) from:1 to:10 put:$c
     (Unicode16String new:100) from:2 to:99 put:$c

     codes with the high bit set (must not be stored as 16rFFFF):
     ((Unicode16String new:20) from:1 to:20 put:(Character value:16r8000)) asArray
     ((Unicode16String new:20) from:2 to:19 put:(Character value:16rABCD)) asArray

     out of bounds (handled by the fallback):
     (Unicode16String new:10) from:0 to:9 put:$a
     (Unicode16String new:10) from:1 to:11 put:$a
    "

    "Created: / 26-03-2019 / 11:20:14 / Claus Gittinger"
    "Modified: / 27-03-2019 / 14:05:10 / Claus Gittinger"
!

replaceFrom:start to:stop with:aString startingAt:repStart
    "replace the characters starting at index start, anInteger and ending
     at stop, anInteger with characters from aString starting at repStart.
     Return the receiver.

     - reimplemented here for speed.

     The inline C code handles the common case only:
     SmallInteger start, stop and repStart, all within bounds, a receiver
     without named instance variables, and a source which is either a
     single-byte string or a two-byte string.
     An empty range (stop < start) returns the receiver unchanged.
     Everything else is passed on to the inherited implementation."

%{  /* NOCONTEXT */

#ifndef NO_PRIM_STRING
    if (__bothSmallInteger(start, stop)) {
        /* INT (not int): a SmallInteger may need more than 32 bits on a
         * 64-bit system; truncating it could turn an out-of-bounds index
         * into a seemingly valid one */
        INT len;
        INT index1 = __intVal(start);
        INT index2 = __intVal(stop);

        if (index2 < index1) {
             /* empty range - nothing to replace */
             RETURN (self);
        }
        len = __twoByteStringSize(self);
        /* repStart must be checked as well: __intVal() of a non-SmallInteger
         * yields a meaningless number, which could pass the range checks
         * below and make the copy read from an arbitrary source offset */
        if (__isSmallInteger(repStart) && (index2 <= len) && (index1 > 0)) {
            INT count = index2 - index1 + 1;
            INT repIndex = __intVal(repStart);
            OBJ cls;

            if (((cls = __qClass(self)) == Unicode16String)
             || (__OBJS2BYTES__(__intVal(__ClassInstPtr(cls)->c_ninstvars)) == 0)) {
                if (__isStringLike(aString)) {
                    /* single-byte source: widen each byte to 16 bits */
                    INT repLen = __stringSize(aString);
                    if ((repIndex > 0) && ((repIndex + count - 1) <= repLen)) {
                        REGISTER unsigned char *srcp = __stringVal(aString) + repIndex - 1;
                        REGISTER unsigned short *dstp  = __twoByteStringVal(self) + index1 - 1;

                        while (count-- > 0) {
                            *dstp++ = *srcp++;
                        }
                        RETURN (self);
                    }
                } else  if (__isTwoByteString(aString) || __isUnicode16String(aString)) {
                    INT repLen = __twoByteStringSize(aString);
                    if ((repIndex > 0) && ((repIndex + count - 1) <= repLen)) {
                        REGISTER unsigned short *srcp  = __twoByteStringVal(aString) + repIndex - 1;
                        REGISTER unsigned short *dstp = __twoByteStringVal(self) + index1 - 1;

                        if (aString == self) {
                            /* take care of overlapping copy */
                            memmove(dstp, srcp, count*sizeof(short));
                            RETURN (self);
                        }
                        if (count > 5) {
                            memcpy(dstp, srcp, count*sizeof(short));
                        } else {
                            while (count-- > 0) {
                                *dstp++ = *srcp++;
                            }
                        }
                        RETURN (self);
                    }
                }
            }
        }
    }
#endif
%}.
    "/ arrive here if any index arg is not a SmallInteger or out of range,
    "/ or the source is neither a string nor a two-byte string.
    ^ super replaceFrom:start to:stop with:aString startingAt:repStart

    "
     'hello world' asUnicode16String replaceFrom:1 to:5 with:'123456' startingAt:2
     'hello world' asUnicode16String replaceFrom:1 to:5 with:'123456' asUnicode16String startingAt:2
     'hello world' asUnicode16String replaceFrom:1 to:0 with:'123456' startingAt:2
     'hello' asUnicode16String replaceFrom:1 to:6 with:'123456' startingAt:2
     'hello world' asUnicode16String replaceFrom:1 to:1 with:'123456' startingAt:2
     'hello world' asUnicode16String replaceFrom:1 to:5 with:'123456' startingAt:nil
    "

    "Modified: / 27-03-2019 / 14:03:27 / Claus Gittinger"
! !

!TwoByteString methodsFor:'queries'!

bitsPerCharacter
    "return the number of bits used to store each character.
     Here, the constant 16 is returned (double byte characters)."

    ^ 16

    "Modified: 20.4.1996 / 23:08:38 / cg"
!

bytesPerCharacter
    "return the number of bytes used to store each character.
     Here, the constant 2 is returned (double byte characters)."

    ^ 2
!

characterSize
    "answer the size in bits of my largest character (actually only 7, 8 or 16):
        16 - if any element has a bit set in its upper byte (mask 16rFF00),
         8 - otherwise, if any element has a bit set in mask 16rFF80
             (i.e. a code above 127),
         7 - otherwise (also for an empty string).

     The C code scans in up to three stages of decreasing width: four elements
     per step as one 64-bit word (only compiled on 64-bit systems), then two
     elements per step as one 32-bit word, then single elements.
     Each stage first searches for the first code above 127; once one has
     been seen, the scan continues from that position (in the same and all
     following stages) looking only for codes above 255.
     For classes other than Unicode16String, the scan start is advanced by
     the size of the named instance variables (presumably these precede
     the characters in the object - to be confirmed against the object layout)."

%{  /* NOCONTEXT */

    REGISTER unsigned short *sp = __twoByteStringVal(self);
    REGISTER unsigned short *last = sp + __twoByteStringSize(self);
    OBJ cls = __qClass(self);
    int has8BitChars = 0;

    if (cls != Unicode16String) {
        sp += __OBJS2BYTES__(__intVal(__ClassInstPtr(cls)->c_ninstvars)) / 2;
    }

#if __POINTER_SIZE__ == 8
    if (sizeof(unsigned INT) == 8) {
        if (!has8BitChars) {
            for ( ; (sp+4) <= last; sp += 4) {
                if (*(unsigned INT *)sp & 0xFF80FF80FF80FF80) {
                    /* there are at least 8-bit chars - check for more */
                    has8BitChars = 1;
                    break;
                }
            }
        }
        for ( ; (sp+4) <= last; sp += 4) {
            if (*(unsigned INT *)sp & 0xFF00FF00FF00FF00) {
                RETURN(__mkSmallInteger(16));
            }
        }
    }
#endif
    if (sizeof(unsigned int) == 4) {
        if (!has8BitChars) {
            for ( ; (sp+2) <= last; sp += 2) {
                if (*(unsigned int *)sp & 0xFF80FF80) {
                    /* there are at least 8-bit chars - check for more */
                    has8BitChars = 1;
                    break;
                }
            }
        }
        for ( ; (sp+2) <= last; sp += 2) {
            if (*(unsigned int *)sp & 0xFF00FF00) {
                RETURN(__mkSmallInteger(16));
            }
        }
    }
    if (!has8BitChars) {
        for ( ; sp < last; sp++) {
            if (*sp & 0xFF80) {
                /* there are at least 8-bit chars - check for more */
                has8BitChars = 1;
                break;
            }
        }
    }
    for ( ; sp < last; sp++) {
        if (*sp & 0xFF00) {
            RETURN(__mkSmallInteger(16));
        }
    }
    RETURN (__mkSmallInteger(has8BitChars ? 8 : 7));
%}.

    "
     'hello world' asUnicode16String characterSize
     'hello worldüäö' asUnicode16String characterSize
     'a' asUnicode16String characterSize
     'ü' asUnicode16String characterSize
     'aa' asUnicode16String characterSize
     'aü' asUnicode16String characterSize
     'aaa' asUnicode16String characterSize
     'aaü' asUnicode16String characterSize
     'aaaü' asUnicode16String characterSize
     'aaaa' asUnicode16String characterSize
     'aaaaü' asUnicode16String characterSize
    "

    "Modified: / 27-03-2019 / 14:06:56 / Claus Gittinger"
!

containsNon7BitAscii
    "return true, if the receiver contains any 8-bit or wider character,
     i.e. any element with a bit set in mask 16rFF80 (a code above 127),
     which means that it is not pure 7-bit ascii.
     Returns false otherwise (also for an empty string).

     The C code scans four elements per step as one 64-bit word (only
     compiled on 64-bit systems), then two elements per step as one 32-bit
     word, then the remaining single elements.
     For classes other than Unicode16String, the scan start is advanced by
     the size of the named instance variables (presumably these precede
     the characters in the object - to be confirmed against the object layout)."

%{  /* NOCONTEXT */

    REGISTER unsigned short *sp = __twoByteStringVal(self);
    REGISTER unsigned short *last = sp + __twoByteStringSize(self);
    OBJ cls = __qClass(self);

    if (cls != Unicode16String) {
        sp += __OBJS2BYTES__(__intVal(__ClassInstPtr(cls)->c_ninstvars)) / 2;
    }
#if __POINTER_SIZE__ == 8
    if (sizeof(unsigned INT) == 8) {
        for ( ; (sp+4) <= last; sp += 4) {
            if (*(unsigned INT *)sp & 0xFF80FF80FF80FF80) {
                RETURN ( true );
            }
        }
    }
#endif
    if (sizeof(unsigned int) == 4) {
        for ( ; (sp+2) <= last; sp += 2) {
            if (*(unsigned int *)sp & 0xFF80FF80) {
                RETURN ( true );
            }
        }
    }
    for ( ; sp < last; sp++) {
        if (*sp & 0xFF80) {
            RETURN ( true );
        }
    }
    RETURN (false);
%}.

    "
     'hello world' asUnicode16String containsNon7BitAscii
     'hello worldüäö' asUnicode16String containsNon7BitAscii
     'ü' asUnicode16String containsNon7BitAscii
     'aü' asUnicode16String containsNon7BitAscii
     'aaü' asUnicode16String containsNon7BitAscii
     'aaaü' asUnicode16String containsNon7BitAscii
     'aaaaü' asUnicode16String containsNon7BitAscii
     'aaaaa' asUnicode16String containsNon7BitAscii
    "

    "Modified: / 27-03-2019 / 14:07:10 / Claus Gittinger"
!

isWideString
    "return true, if I require more than one byte per character.
     Always true here (see #bytesPerCharacter, which returns 2)."

    ^ true
!

occurrencesOf:aCharacter
    "count the occurrences of the argument, aCharacter in myself
      - reimplemented here for speed.

     The inline C code handles the common case only: a Character argument
     with a code of at most 16rFFFF and a receiver without named instance
     variables. Everything else (non-character argument, wider character,
     subclass with named instance variables) is passed on to the inherited
     implementation."

%{  /* NOCONTEXT */

    REGISTER unsigned charValue;
    REGISTER INT count, limit;

    if (__isCharacter(aCharacter)) {
        charValue = __intVal(__characterVal(aCharacter));
        if (charValue <= 0xFFFF) {
            OBJ cls;

            if (((cls = __qClass(self)) == Unicode16String)
             || (__OBJS2BYTES__(__intVal(__ClassInstPtr(cls)->c_ninstvars)) == 0)) {
                REGISTER unsigned short* cp;

                cp = __unicode16StringVal(self);
                limit = __unicode16StringSize(self);
                count = 0;

                /* loop unrolled and software-pipelined
                 * (gives 30-40% speedup on Intel-DUO using borland bcc55)
                 */
                while (limit >= 4) {
                    register unsigned short c1, c2;

                    c1 = cp[0];
                    limit -= 4;
                    c2 = cp[1];
                    if (c1 == charValue) count++;
                    c1 = cp[2];
                    if (c2 == charValue) count++;
                    c2 = cp[3];
                    if (c1 == charValue) count++;
                    cp += 4;
                    if (c2 == charValue) count++;
                }
                while (limit > 0) {
                    register unsigned short c;

                    c = cp[0];
                    limit--;
                    if (c == charValue) count++;
                    cp++;
                }
                /* the result is only valid if the loops above were executed;
                 * returning outside this branch (as was done before) answered
                 * 0 for every subclass with named instance variables,
                 * instead of falling back to the inherited implementation */
                RETURN ( __mkSmallInteger(count) );
            }
        }
    }
%}.
    ^ super occurrencesOf:aCharacter

    "
     'hello world' asUnicode16String occurrencesOf:$a
     'hello world' asUnicode16String occurrencesOf:$w
     'hello world' asUnicode16String occurrencesOf:$l
     'hello world' asUnicode16String occurrencesOf:$x
     'hello world' asUnicode16String occurrencesOf:1
     Time millisecondsToRun:[
        |s|

        s := 'abcdefghijklmn' asUnicode16String.
        1000000 timesRepeat:[ s occurrencesOf:$x ]
     ]. 60 60 60 70 (untuned: 670 710 740)
    "

    "Created: / 27-03-2019 / 13:58:55 / Claus Gittinger"
! !

!TwoByteString methodsFor:'testing'!

isSingleByteCollection
    "return true, if the receiver has access methods for bytes;
     i.e. #at: and #at:put: access a byte and are equivalent to #byteAt: and #byteAt:put:,
     and #replaceFrom:to: is equivalent to #replaceBytesFrom:to:.
     False is returned here, since at: returns 2-byte characters and not bytes
      - the method is redefined from UninterpretedBytes."

    ^ false

    "Created: / 30-08-2017 / 23:30:36 / cg"
! !

!TwoByteString class methodsFor:'documentation'!

version
    "return the version string of this class's source;
     here the literal $Header$ keyword string (presumably a placeholder
     to be expanded by the source code management system)"

    ^ '$Header$'
!

version_CVS
    "return the CVS version string of this class's source;
     here the literal $Header$ keyword string (presumably a placeholder
     to be expanded by CVS)"

    ^ '$Header$'
! !


"send #initialize to the class when this chunk is evaluated (see the class-side initialize method)"
TwoByteString initialize!