CharacterEncoderImplementations__ISO10646_to_UTF16BE.st
author Claus Gittinger <cg@exept.de>
Thu, 07 Jul 2005 19:36:41 +0200
changeset 8903 4e15c297fadc
child 9325 a4c635a6f8eb
permissions -rw-r--r--
initial checkin

"
 COPYRIGHT (c) 2005 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"

"{ Package: 'stx:libbasic' }"

"{ NameSpace: CharacterEncoderImplementations }"

"Encoder/decoder for big-endian UTF-16; a TwoByteEncoder subclass which declares
 no instance or class variables of its own."
TwoByteEncoder subclass:#ISO10646_to_UTF16BE
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Encodings'
!

!ISO10646_to_UTF16BE class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 2005 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    Encodes/decodes UTF16 BigEndian (big-end-first, i.e. the most significant
    byte of every 16bit code unit comes first).

    encodeString:
        converts a unicode string into a ByteArray with two bytes per 16bit
        code unit. Code points above 16rFFFF are written as a surrogate pair
        (two code units). Code points in D800..DFFF or above 16r10FFFF cannot
        be represented and are reported via #error:.

    decodeString:
        converts a ByteArray (or a collection of 16bit values) back into a string;
        surrogate pairs are combined into a single character.
        The result is a String, a Unicode16String or a Unicode32String,
        depending on the characters found.

    decode: / encode:
        are not supported (shouldNotImplement); only whole strings can be converted.
"
!

examples
"
  Encoding (unicode to utf16BE):
     ISO10646_to_UTF16BE encodeString:'hello'.


  Decoding (utf16BE to unicode); round trip through the encoder:
     |t|

     t := ISO10646_to_UTF16BE encodeString:'hello'.
     ISO10646_to_UTF16BE decodeString:t.
"
! !

!ISO10646_to_UTF16BE methodsFor:'encoding & decoding'!

decode:aCode
    "Not supported by this encoder: always signals shouldNotImplement.
     Only whole strings can be converted; see decodeString:."

    self shouldNotImplement "/ no single byte conversion possible
!

decodeString:aStringOrByteCollection
    "given a byteArray (2 bytes per code unit, most significant byte first)
     or a collection of 16bit values (unsignedShortArray, two-byte string) in UTF16 encoding,
     return a new string containing the same characters, in 8, 16 or 32bit encoding.
     Returns either a normal String, a Unicode16String or a Unicode32String instance:
     a String if all values are <= 16rFF, a Unicode32String if any high surrogate
     is present, a Unicode16String otherwise.
     Only useful, when reading from external sources.

     A surrogate pair (D800..DBFF followed by DC00..DFFF) is combined into one
     character in the range 16r10000..16r10FFFF.
     A high surrogate which is not followed by a low surrogate (wrong value, or
     end of input) is reported via #error:. If that error is proceeded, a truncated
     pair yields the lone high surrogate value, and a pair with a wrong second half
     is combined arithmetically anyway.
     A low surrogate without a preceding high surrogate is passed through unchanged."

    |sz nBitsRequired s newString idx bitsPerElementIn nextIn
     codeIn codeIn1 codeIn2|

    "/ how wide are the elements of the incoming collection ?
    aStringOrByteCollection isByteArray ifTrue:[
        bitsPerElementIn := 8.
    ] ifFalse:[
        aStringOrByteCollection isString ifTrue:[
            bitsPerElementIn := aStringOrByteCollection bitsPerCharacter.
        ] ifFalse:[
            bitsPerElementIn := 16.
        ].
    ].

    "/ nextIn fetches the next 16bit code unit from the stream currently held in s
    "/ (s is assigned below - once for each of the two passes).
    bitsPerElementIn == 8 ifTrue:[
        nextIn := [self nextTwoByteValueFrom:s].
    ] ifFalse:[
        nextIn := [s next].
    ].

    "/ pass 1: count the characters and find out how wide the result must be.
    nBitsRequired := 8.
    sz := 0.
    s := aStringOrByteCollection readStream.
    [s atEnd] whileFalse:[
        codeIn := nextIn value.
        sz := sz + 1.

        codeIn > 16rFF ifTrue:[
            nBitsRequired := nBitsRequired max:16.
            (codeIn between:16rD800 and:16rDBFF) ifTrue:[
                nBitsRequired := 32.
                "/ skip the second half of the pair (it is validated in pass 2);
                "/ do not read beyond the end if the pair is truncated.
                s atEnd ifFalse:[ nextIn value ].
            ].
        ]
    ].

    nBitsRequired == 8 ifTrue:[
        newString := String uninitializedNew:sz
    ] ifFalse:[
        nBitsRequired <= 16 ifTrue:[
            newString := Unicode16String new:sz
        ] ifFalse:[
            newString := Unicode32String new:sz
        ]
    ].

    "/ pass 2: decode into the new string; consumes the input exactly as pass 1 did.
    s := aStringOrByteCollection readStream.
    idx := 1.
    [s atEnd] whileFalse:[
        codeIn := nextIn value.
        (codeIn between:16rD800 and:16rDBFF) ifTrue:[
            s atEnd ifTrue:[
                "/ if proceeded, the lone high surrogate is stored as is
                self error:'truncated surrogate pair in utf16Decode'.
            ] ifFalse:[
                codeIn1 := codeIn.
                codeIn2 := nextIn value.
                (codeIn2 between:16rDC00 and:16rDFFF) ifFalse:[
                    self error:'invalid low surrogate in utf16Decode'.
                ].
                "/ 10 bits from each half, offset by 16r10000
                codeIn := ((codeIn1 - 16rD800) bitShift:10)
                          +
                          (codeIn2 - 16rDC00)
                          + 16r00010000.
            ].
        ].
        newString at:idx put:(Character value:codeIn).
        idx := idx + 1.
    ].
    ^ newString

    "
     self new decodeString:#[ 16r00 16r42 ]            
     self new decodeString:#[ 16r01 16r42 ]            
     self new decodeString:#[ 16r00 16r48
                              16r00 16r69  
                              16rD8 16r00  
                              16rDC 16r00  
                              16r00 16r21  
                              16r00 16r21  
                            ]            

     self new decodeString:#( 16r0048
                              16r0069  
                              16rD800  
                              16rDC00  
                              16r0021  
                              16r0021  
                            )

    error cases:
     self new decodeString:#[ 16rD8 16r00 ]
     self new decodeString:#[ 16rD8 16r00 16r00 16r41 ]
    "
!

encode:aCode
    "Not supported by this encoder: always signals shouldNotImplement.
     Only whole strings can be converted; see encodeString:."

    self shouldNotImplement "/ no single byte conversion possible
!

encodeString:aUnicodeString
    "return the UTF-16 (big endian) representation of aUnicodeString as a ByteArray.
     Every character up to 16rFFFF is written as one 16bit code unit (2 bytes);
     characters in 16r10000..16r10FFFF are written as a surrogate pair (4 bytes).
     Code points in D800..DFFF (reserved for surrogates) and code points above
     16r10FFFF cannot be represented; they are reported via #error:
     and - if that error is proceeded - skipped.
     The resulting bytes are only useful to be stored on some external file,
     not for being used inside ST/X."

    |s|

    "/ every character needs at least two bytes; the stream grows for surrogate pairs
    s := WriteStream on:(ByteArray uninitializedNew:(aUnicodeString size * 2)).
    aUnicodeString do:[:eachCharacter |
        |codePoint t|

        codePoint := eachCharacter codePoint.
        (codePoint between:16rD800 and:16rDFFF) ifTrue:[
            "/ unrepresentable: D800..DFFF
            self error:'unrepresentable value (D800..DFFF) in utf16Encode'.
        ] ifFalse:[
            codePoint <= 16rFFFF ifTrue:[
                self nextPutTwoByteValue:codePoint to:s.
            ] ifFalse:[
                codePoint > 16r10FFFF ifTrue:[
                    "/ unrepresentable: 110000 and above
                    self error:'unrepresentable value (> 10FFFF) in utf16Encode'.
                ] ifFalse:[
                    "/ 20 bits remain after subtracting 16r10000:
                    "/ the upper 10 go into the high surrogate, the lower 10 into the low surrogate
                    t := codePoint - 16r00010000.
                    self nextPutTwoByteValue:((t bitShift:-10) + 16rD800) to:s.
                    self nextPutTwoByteValue:((t bitAnd:16r3FF) + 16rDC00) to:s.
                ].
            ].
        ].
    ].

    ^ s contents

    "
     (self encodeString:'hello')                                         #[0 104 0 101 0 108 0 108 0 111]
     (self encodeString:(Character value:16r40) asString)                #[0 64]
     (self encodeString:(Character value:16rFF) asString)                #[0 255]
     (self encodeString:(Character value:16r100) asString)               #[1 0]
     (self encodeString:(Character value:16r1000) asString)              #[16 0]
     (self encodeString:(Character value:16r2000) asString)              #[32 0]
     (self encodeString:(Character value:16r4000) asString)              #[64 0]
     (self encodeString:(Character value:16r8000) asString)              #[128 0]
     (self encodeString:(Character value:16rD7FF) asString)              #[215 255]
     (self encodeString:(Character value:16rE000) asString)              #[224 0]
     (self encodeString:(Character value:16rFFFF) asString)              #[255 255]
     (self encodeString:(Character value:16r10000) asString)             #[216 0 220 0]
     (self encodeString:(Character value:16r10FFF) asString)             #[216 3 223 255]
     (self encodeString:(Character value:16r1FFFF) asString)             #[216 63 223 255]
     (self encodeString:(Character value:16r10FFFF) asString)            #[219 255 223 255]             
    error cases:
     (self encodeString:(Character value:16rD800) asString) 
     (self encodeString:(Character value:16rD801) asString) 
     (self encodeString:(Character value:16rDFFF) asString) 
     (self encodeString:(Character value:16r110000) asString)   
    "
! !

!ISO10646_to_UTF16BE methodsFor:'private'!

nextPutTwoByteValue:anInteger to:aStream
    "Write anInteger onto aStream as a two-byte value, asking the stream for
     MSB-first order (MSB:true), which is the big-endian layout of this encoding."

    aStream nextPutShort:anInteger MSB:true
!

nextTwoByteValueFrom:aStream
    "Read and answer the next unsigned two-byte value from aStream, asking the stream
     for MSB-first order (MSB:true), which is the big-endian layout of this encoding."

    ^ aStream nextUnsignedShortMSB:true
! !

!ISO10646_to_UTF16BE methodsFor:'queries'!

nameOfEncoding
    "Answer the symbolic name of the encoding implemented by this class.
     This class encodes/decodes big-endian UTF-16, so the name is utf16be
     (the symbol utf8be answered before named an encoding this class does not implement)."

    ^ #'utf16be'
! !

!ISO10646_to_UTF16BE class methodsFor:'documentation'!

version
    "Answer the revision-control header string of this source file."

    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF16BE.st,v 1.1 2005-07-07 17:36:41 cg Exp $'
! !