CharacterEncoderImplementations__ISO10646_to_UTF16BE.st
changeset 8903 4e15c297fadc
child 9325 a4c635a6f8eb
equal deleted inserted replaced
8902:e7fa109f9cbe 8903:4e15c297fadc
       
     1 "
       
     2  COPYRIGHT (c) 2005 by eXept Software AG
       
     3               All Rights Reserved
       
     4 
       
     5  This software is furnished under a license and may be used
       
     6  only in accordance with the terms of that license and with the
       
     7  inclusion of the above copyright notice.   This software may not
       
     8  be provided or otherwise made available to, or used by, any
       
     9  other person.  No title to or ownership of the software is
       
    10  hereby transferred.
       
    11 "
       
    12 
       
    13 "{ Package: 'stx:libbasic' }"
       
    14 
       
    15 "{ NameSpace: CharacterEncoderImplementations }"
       
     16 "Encoder converting between Unicode (ISO 10646) strings and UTF-16 big-endian data; a TwoByteEncoder subclass that declares no instance or class variables of its own."
       
     17 TwoByteEncoder subclass:#ISO10646_to_UTF16BE
       
     18 	instanceVariableNames:''
       
     19 	classVariableNames:''
       
     20 	poolDictionaries:''
       
     21 	category:'Collections-Text-Encodings'
       
     22 !
       
    23 
       
    24 !ISO10646_to_UTF16BE class methodsFor:'documentation'!
       
    25 
       
    26 copyright
       
    27 "
       
    28  COPYRIGHT (c) 2005 by eXept Software AG
       
    29               All Rights Reserved
       
    30 
       
    31  This software is furnished under a license and may be used
       
    32  only in accordance with the terms of that license and with the
       
    33  inclusion of the above copyright notice.   This software may not
       
    34  be provided or otherwise made available to, or used by, any
       
    35  other person.  No title to or ownership of the software is
       
    36  hereby transferred.
       
    37 "
       
    38 !
       
    39 
       
     40 documentation
       
     41 "
       
     42     Encodes Unicode (ISO 10646) strings to, and decodes them from, UTF-16 big-endian (most significant byte first).
       
     43 "
       
     44 !
       
    45 
       
     46 examples
       
     47 "
       
     48   Encoding (unicode to utf16BE):
       
     49      ISO10646_to_UTF16BE encodeString:'hello'.
       
     50 
       
     51 
       
     52   Decoding (utf16BE to unicode):
       
     53      |t|
       
     54      (round trip: encode first, then decode the resulting bytes)
       
     55      t := ISO10646_to_UTF16BE encodeString:'hello'.
       
     56      ISO10646_to_UTF16BE decodeString:t.
       
     57 "
       
     58 ! !
       
    59 
       
    60 !ISO10646_to_UTF16BE methodsFor:'encoding & decoding'!
       
    61 
       
     62 decode:aCode
       
     63     self shouldNotImplement "/ unsupported: UTF-16 cannot be decoded one code at a time (a character may span two 16-bit units) - use decodeString:
       
     64 !
       
    65 
       
     66 decodeString:aStringOrByteCollection
       
     67     "Decode UTF-16 data into a new string. A ByteArray, or a String with 8 bits per character, is read as
       
     68      two bytes per 16-bit unit (see nextTwoByteValueFrom:); any other argument is read one element per 16-bit unit (e.g. an unsigned-short array).
       
     69      Returns a String, a Unicode16String or a Unicode32String - the narrowest of these that holds all decoded characters.
       
     70      Only useful when reading from external sources.
       
     71      A unit in 16rD800..16rDBFF is combined with the unit following it into a single character. NOTE(review): that following unit is not checked to be a low surrogate (16rDC00..16rDFFF), and input that ends in the middle of a pair is not handled here."
       
     72 
       
     73     |sz nBitsRequired s newString idx bitsPerElementIn nextIn
       
     74      codeIn codeIn1 codeIn2|
       
     75     "determine how many bits each element of the input holds"
       
     76     aStringOrByteCollection isByteArray ifTrue:[
       
     77         bitsPerElementIn := 8.
       
     78     ] ifFalse:[
       
     79         aStringOrByteCollection isString ifTrue:[
       
     80             bitsPerElementIn := aStringOrByteCollection bitsPerCharacter.
       
     81         ] ifFalse:[
       
     82             bitsPerElementIn := 16.
       
     83         ].
       
     84     ].
       
     85     "8-bit input is read two bytes at a time, anything else one element at a time; both blocks refer to s, which is assigned before they are first evaluated"
       
     86     bitsPerElementIn == 8 ifTrue:[
       
     87         nextIn := [self nextTwoByteValueFrom:s].
       
     88     ] ifFalse:[
       
     89         nextIn := [s next].
       
     90     ].
       
     91     "pass 1: count the decoded characters (sz) and find the string width needed to hold them"
       
     92     nBitsRequired := 8.
       
     93     sz := 0.
       
     94     s := aStringOrByteCollection readStream.
       
     95     [s atEnd] whileFalse:[
       
     96         codeIn := nextIn value.
       
     97         sz := sz + 1.
       
     98 
       
     99         codeIn <= 16rFF ifTrue:[ "fits into 8 bits - nothing to do"
       
    100         ] ifFalse:[
       
    101             nBitsRequired := nBitsRequired max:16.
       
    102             (codeIn between:16rD800 and:16rDBFF) ifTrue:[ "first unit of a pair: a second unit follows"
       
    103                 nBitsRequired := 32.
       
    104                 codeIn2 := nextIn value. "consume the second unit, so that the pair is counted as one character"
       
    105             ].
       
    106         ]
       
    107     ].
       
    108     "allocate the narrowest string class that can hold every decoded character"
       
    109     nBitsRequired == 8 ifTrue:[
       
    110         newString := String uninitializedNew:sz
       
    111     ] ifFalse:[
       
    112         nBitsRequired <= 16 ifTrue:[
       
    113             newString := Unicode16String new:sz
       
    114         ] ifFalse:[
       
    115             newString := Unicode32String new:sz
       
    116         ]
       
    117     ].
       
    118     "pass 2: read the input again from the start and store the decoded characters"
       
    119     s := aStringOrByteCollection readStream.
       
    120     idx := 1.
       
    121     [s atEnd] whileFalse:[
       
    122         codeIn := nextIn value.
       
    123         codeIn <= 16rFF ifTrue:[
       
    124         ] ifFalse:[
       
    125             nBitsRequired := nBitsRequired max:16. "NOTE(review): the string is already allocated; this assignment and the one two lines below do not affect the result"
       
    126             (codeIn between:16rD800 and:16rDBFF) ifTrue:[
       
    127                 nBitsRequired := 32.
       
    128                 codeIn1 := codeIn. "combine both units of the pair into one code point, offset by 16r10000"
       
    129                 codeIn2 := nextIn value.
       
    130                 codeIn := ((codeIn1 - 16rD800) bitShift:10)
       
    131                           +
       
    132                           (codeIn2 - 16rDC00)
       
    133                           + 16r00010000.
       
    134             ].
       
    135         ].
       
    136         newString at:idx put:(Character value:codeIn).
       
    137         idx := idx + 1.
       
    138     ].
       
    139     ^ newString
       
    140 
       
    141     "
       
    142      self new decodeString:#[ 16r00 16r42 ]            
       
    143      self new decodeString:#[ 16r01 16r42 ]            
       
    144      self new decodeString:#[ 16r00 16r48
       
    145                               16r00 16r69  
       
    146                               16rD8 16r00  
       
    147                               16rDC 16r00  
       
    148                               16r00 16r21  
       
    149                               16r00 16r21  
       
    150                             ]            
       
    151 
       
    152      self new decodeString:#( 16r0048
       
    153                               16r0069  
       
    154                               16rD800  
       
    155                               16rDC00  
       
    156                               16r0021  
       
    157                               16r0021  
       
    158                             )
       
    159     "
       
    160 !
       
   161 
       
    162 encode:aCode
       
    163     self shouldNotImplement "/ unsupported: UTF-16 cannot be encoded one code at a time (a character may need two 16-bit units) - use encodeString:
       
    164 !
       
   165 
       
   166 encodeString:aUnicodeString
       
   167     "return the UTF-16 representation of a aUnicodeString.
       
   168      The resulting string is only useful to be stored on some external file,
       
   169      not for being used inside ST/X."
       
   170 
       
   171     |s|
       
   172 
       
   173     s := WriteStream on:(ByteArray uninitializedNew:aUnicodeString size).
       
   174     aUnicodeString do:[:eachCharacter |
       
   175         |codePoint t hi low|
       
   176 
       
   177         codePoint := eachCharacter codePoint.
       
   178         (codePoint <= 16rFFFF) ifTrue:[
       
   179             ((codePoint <= 16rD7FF) or:[ codePoint between:16rE000 and:16rFFFF]) ifTrue:[
       
   180                 self nextPutTwoByteValue:codePoint to:s.
       
   181             ] ifFalse:[
       
   182                 "/ unrepresentable: D800..DFFFF
       
   183                 self error:'unrepresentable value (D800..DFFFF) in utf16Encode'.
       
   184             ].
       
   185         ] ifFalse:[
       
   186             t := codePoint - 16r00010000.
       
   187             hi := t bitShift:-10.
       
   188             low := t bitAnd:16r3FF.
       
   189             hi > 16r3FF ifTrue:[
       
   190                 "/ unrepresentable: above 110000
       
   191                 self error:'unrepresentable value (> 10FFFF) in utf16Encode'.
       
   192             ].
       
   193             self nextPutTwoByteValue:(hi + 16rD800) to:s.
       
   194             self nextPutTwoByteValue:(low + 16rDC00) to:s.
       
   195         ].
       
   196     ].
       
   197 
       
   198     ^ s contents
       
   199 
       
   200     "
       
   201      (self encodeString:'hello')                                         #[0 104 0 101 0 108 0 108 0 111]
       
   202      (self encodeString:(Character value:16r40) asString)                #[0 64]
       
   203      (self encodeString:(Character value:16rFF) asString)                #[0 255]
       
   204      (self encodeString:(Character value:16r100) asString)               #[1 0]
       
   205      (self encodeString:(Character value:16r1000) asString)              #[16 0]
       
   206      (self encodeString:(Character value:16r2000) asString)              #[32 0]
       
   207      (self encodeString:(Character value:16r4000) asString)              #[64 0]
       
   208      (self encodeString:(Character value:16r8000) asString)              #[128 0]
       
   209      (self encodeString:(Character value:16rD7FF) asString)              #[215 255]
       
   210      (self encodeString:(Character value:16rE000) asString)              #[224 0]
       
   211      (self encodeString:(Character value:16rFFFF) asString)              #[255 255]
       
   212      (self encodeString:(Character value:16r10000) asString)             #[216 64 220 0]
       
   213      (self encodeString:(Character value:16r10FFF) asString)             #[216 67 223 255]
       
   214      (self encodeString:(Character value:16r1FFFF) asString)             #[216 127 223 255]
       
   215      (self encodeString:(Character value:16r10FFFF) asString)            #[219 255 223 255]             
       
   216     error cases:
       
   217      (self encodeString:(Character value:16rD800) asString) 
       
   218      (self encodeString:(Character value:16rD801) asString) 
       
   219      (self encodeString:(Character value:16rDFFF) asString) 
       
   220      (self encodeString:(Character value:16r110000) asString)   
       
   221     "
       
   222 ! !
       
   223 
       
   224 !ISO10646_to_UTF16BE methodsFor:'private'!
       
   225 
       
    226 nextPutTwoByteValue:anInteger to:aStream "write anInteger to aStream as a two-byte value, most significant byte first"
       
    227     aStream nextPutShort:anInteger MSB:true
       
    228 !
       
   229 
       
    230 nextTwoByteValueFrom:aStream "read the next two bytes from aStream and answer them as one unsigned value, most significant byte first"
       
    231     ^ aStream nextUnsignedShortMSB:true
       
    232 ! !
       
   233 
       
   234 !ISO10646_to_UTF16BE methodsFor:'queries'!
       
   235 
       
   236 nameOfEncoding
       
   237     ^ #'utf8be'
       
   238 ! !
       
   239 
       
   240 !ISO10646_to_UTF16BE class methodsFor:'documentation'!
       
   241 
       
    242 version "answer the CVS revision header string recorded for this source file"
       
    243     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF16BE.st,v 1.1 2005-07-07 17:36:41 cg Exp $'
       
    244 ! !