TwoByteString.st
author Jan Vrany <jan.vrany@fit.cvut.cz>
Wed, 17 Jun 2015 06:22:00 +0100
branchjv
changeset 18487 8735bd9eee2f
parent 18120 e3a375d5f6a8
child 18608 7d521f25267c
permissions -rw-r--r--
Use inlined FNV1a hash for String ...and do not use __symbolHash(). Although currently the VM also uses FNV1a hash for Symbols, the __symbolHash() does not handle properly character with codepoint 0 (because '\0' is used as a string terminator). This causes problems with Unicode16/32Strigs whose version of FNV1a hash is using object size from header to determine string's end. Added Symbol>>hash that actually *uses* the __symbolHash() to make sure it's hash is the the same as used bu the VM. Symbols with zeroes are rare and there's no Unicode16/32Symbol. This commit fixes issue #65.

"{ Encoding: utf8 }"

"
 COPYRIGHT (c) 1993 by Claus Gittinger
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic' }"

"{ NameSpace: Smalltalk }"

CharacterArray variableWordSubclass:#TwoByteString
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text'
!

!TwoByteString class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 1993 by Claus Gittinger
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    TwoByteStrings are like strings, but storing 16bits per character.
    The integration of them into the system is not completed ....

    [author:]
        Claus Gittinger

    [see also:]
        Text JISEncodedString
        StringCollection
"
! !

!TwoByteString class methodsFor:'initialization'!

initialize
    "initialize the class - private"

    self flags:(Behavior flagWords)

    "
     TwoByteString initialize
    "

    "Modified: 22.4.1996 / 16:14:14 / cg"
! !

!TwoByteString class methodsFor:'instance creation'!

basicNew:anInteger
    "return a new empty string with anInteger characters"

    ^ (super basicNew:anInteger) atAllPut:(Character space)

    "Modified: 26.2.1996 / 14:38:47 / cg"
!

uninitializedNew:anInteger
    "return a new empty string with anInteger characters"

    ^ super basicNew:anInteger

    "
        self uninitializedNew:10
    "
! !

!TwoByteString methodsFor:'accessing'!

basicAt:index
    "return the character at position index, an Integer
     - reimplemented here since we return 16-bit characters"

    |val|

    val := super basicAt:index.
    ^ Character value:val

    "Modified: 26.2.1996 / 17:02:16 / cg"
!

basicAt:index put:aCharacter
    "store the argument, aCharacter at position index, an Integer.
     Returns aCharacter (sigh).
     - reimplemented here since we store 16-bit characters"

    super basicAt:index put:aCharacter codePoint.
    ^ aCharacter

    "Modified: 19.4.1996 / 11:16:22 / cg"
!

unsignedShortAt:index
    "return the short at position index, an Integer"

    ^ super basicAt:index.
! !

!TwoByteString methodsFor:'encoding'!

utf8Encoded
    "Return my UTF-8 representation as a new String"

    self contains8BitCharacters ifTrue:[
        ^ self basicUtf8Encoded.
    ].

    ^ self asSingleByteString.


    "
        'abcdef' asUnicode16String utf8Encoded
        'abcdefäöü' asUnicode16String utf8Encoded
    "
!

utf8EncodedOn:aStream
    "write to aStream in utf8 encoding"

    self contains8BitCharacters ifTrue:[
        aStream nextPutAllUtf8:self.
    ] ifFalse:[
        |sz "{Class: SmallInteger}"|

        sz := self size.
        1 to:sz do:[:idx|
            aStream nextPut:(self basicAt:idx).
        ].
    ].

    "
        |s|
        s := '' writeStream.
        'abcdef' asUnicode16String utf8EncodedOn:s.
        s contents
    "

    "
        |s|
        s := '' writeStream.
        'abcdefäöü' asUnicode16String utf8EncodedOn:s.
        s contents
    "
! !

!TwoByteString methodsFor:'filling and replacing'!

replaceFrom:start to:stop with:aString startingAt:repStart
    "replace the characters starting at index start, anInteger and ending
     at stop, anInteger with characters from aString starting at repStart.
     Return the receiver.

     - reimplemented here for speed"

%{  /* NOCONTEXT */

#ifndef NO_PRIM_STRING
    if (__bothSmallInteger(start, stop)) {
        REGISTER int count;
        int len, index1, index2;

        index1 = __intVal(start);
        index2 = __intVal(stop);
        count = index2 - index1 + 1;
        if (count <= 0) {
             RETURN (self);
        }
        len = __twoByteStringSize(self);
        if ((index2 <= len) && (index1 > 0)) {
            int repLen, repIndex;

            repIndex = __intVal(repStart);

            if (__isStringLike(aString)) {
                repLen = __stringSize(aString);
                if ((repIndex > 0) && ((repIndex + count - 1) <= repLen)) {
                    REGISTER unsigned char *srcp;
                    REGISTER unsigned short *dstp;

                    srcp = __stringVal(aString) + repIndex - 1;
                    dstp = __twoByteStringVal(self) + index1 - 1;
                    while (count-- > 0) {
                        *dstp++ = *srcp++;
                    }
                    RETURN (self);
                }
            } else  if (__isTwoByteString(aString) || __isUnicode16String(aString)) {
                repLen = __twoByteStringSize(aString);
                if ((repIndex > 0) && ((repIndex + count - 1) <= repLen)) {
                    REGISTER unsigned short *srcp;
                    REGISTER unsigned short *dstp;

                    srcp = __twoByteStringVal(aString) + repIndex - 1;
                    dstp = __twoByteStringVal(self) + index1 - 1;
                    if (aString == self) {
                        /* take care of overlapping copy */
                        if (srcp < dstp) {
                            /* must do a reverse copy */
                            srcp += count;
                            dstp += count;
                            while (count-- > 0) {
                                *--dstp = *--srcp;
                            }
                            RETURN (self);
                        }
                    }
                    if (count > 5) {
                        memcpy(dstp, srcp, count*sizeof(short));
                    } else {
                        while (count-- > 0) {
                            *dstp++ = *srcp++;
                        }
                    }
                    RETURN (self);
                }
            }
        }
    }
#endif
%}.
    "/ arrive here if any index arg is out o range, or the source is neither a string,
    "/ nor a two-byte string.
    ^ super replaceFrom:start to:stop with:aString startingAt:repStart

    "
     'hello world' asUnicode16String replaceFrom:1 to:5 with:'123456' startingAt:2
     'hello world' asUnicode16String replaceFrom:1 to:5 with:'123456' asUnicode16String startingAt:2
     'hello world' asUnicode16String replaceFrom:1 to:0 with:'123456' startingAt:2
     'hello' asUnicode16String replaceFrom:1 to:6 with:'123456' startingAt:2
     'hello world' asUnicode16String replaceFrom:1 to:1 with:'123456' startingAt:2
    "
! !

!TwoByteString methodsFor:'queries'!

bitsPerCharacter
    "return the number of bits each character has.
     Here, 16 is returned (storing double byte characters)."

    ^ 16

    "Modified: 20.4.1996 / 23:08:38 / cg"
!

contains8BitCharacters
    "return true, if the underlying string contains 8BitCharacters (or widers)
     (i.e. if it is non-ascii)"

%{  /* NOCONTEXT */

    REGISTER unsigned short *sp, *last;
    OBJ cls;

    sp = __twoByteStringVal(self);
    last = sp + __twoByteStringSize(self);
    if ((cls = __qClass(self)) != TwoByteString && cls != Unicode16String) {
        sp += __OBJS2BYTES__(__intVal(__ClassInstPtr(cls)->c_ninstvars)) / 2;
    }
#if __POINTER_SIZE__ == 8
    /* assume sizeof(long) == 4
     * if __POINTER_SIZE__ == 4
     */
    if (sizeof(long) == 8) {
        while ((sp+4) <= last) {
            if (*(unsigned long *)sp & 0xFF80FF80FF80FF80) {
                RETURN ( true );
            }
            sp += 4;
        }
    }
#endif
    if (sizeof(int) == 4) {
        while ((sp+2) <= last) {
            if (*(unsigned int *)sp & 0xFF80FF80) {
                RETURN ( true );
            }
            sp += 2;
        }
    }
    while (sp <= last) {
        if (*sp & 0xFF80) {
            RETURN ( true );
        }
        sp++;
    }
    RETURN (false);
%}.

    "
     'hello world' asUnicode16String contains8BitCharacters
     'hello worldüäö' asUnicode16String contains8BitCharacters
     'ü' asUnicode16String contains8BitCharacters
     'aü' asUnicode16String contains8BitCharacters
     'aaü' asUnicode16String contains8BitCharacters
     'aaaü' asUnicode16String contains8BitCharacters
     'aaaaü' asUnicode16String contains8BitCharacters
    "
!

isWideString
    ^ true
! !

!TwoByteString class methodsFor:'documentation'!

version
    ^ '$Header: /cvs/stx/stx/libbasic/TwoByteString.st,v 1.36 2015-03-14 21:30:22 stefan Exp $'
! !


TwoByteString initialize!