CharacterEncoderImplementations__ISO10646_to_UTF8.st
changeset 8081 b468050174a9
child 8103 794d8e3f11d8
equal deleted inserted replaced
8080:db22e5dcf518 8081:b468050174a9
       
     1 "{ Encoding: utf8 }"
       
     2 
       
     3 "{ Package: 'stx:libbasic' }"
       
     4 
       
     5 "{ NameSpace: CharacterEncoderImplementations }"
       
     6 
       
     7 TwoByteEncoder subclass:#ISO10646_to_UTF8    "coder between unicode strings and utf8 encoded strings (see encodeString: / decodeString:)"
       
     8 	instanceVariableNames:''
       
     9 	classVariableNames:''
       
    10 	poolDictionaries:''
       
    11 	category:'Collections-Text-Encodings'
       
    12 !
       
    13 
       
    14 !ISO10646_to_UTF8 class methodsFor:'documentation'!
       
    15 
       
    16 examples
       
    17 "
       
    18   Encoding (unicode to utf8); a string with 7bit ascii characters only comes back unchanged:
       
    19      ISO10646_to_UTF8 encodeString:'hello'. 
       
    20      ISO10646_to_UTF8 encodeString:'ÄÖÜ'. 
       
    21 
       
    22   Decoding (utf8 to unicode); the round trip should give back the original characters:
       
    23 
       
    24      |t|
       
    25 
       
    26      t := ISO10646_to_UTF8 encodeString:'ÄÖÜ'.
       
    27      ISO10646_to_UTF8 decodeString:t.    
       
    28 
       
    29 "
       
    30 ! !
       
    31 
       
    32 !ISO10646_to_UTF8 class methodsFor:'queries'!
       
    33 
       
    34 nameOfEncoding
       
    35     "answer the name of the encoding I convert to and from, as a symbol"
       
    36 
       
    37     ^ #utf8
       
    38 !
       
    39 
       
    40 namesOfEncoding
       
    41     "answer the name strings of my encoding: 'utf8' and its alternative spelling 'utf-8'"
       
    42 
       
    43     ^ #( 'utf8' 'utf-8' )
       
    44 ! !
       
    45 
       
    46 !ISO10646_to_UTF8 methodsFor:'encoding & decoding'!
       
    47 
       
    48 decode:aCode
       
    49     self shouldNotImplement "/ utf8 uses multi-byte sequences: no single byte conversion possible - use decodeString:
       
    50 !
       
    51 
       
    52 decodeString:aStringOrByteCollection
       
    53     "given a string or byte collection in UTF8 encoding,
       
    54      return a new string containing the same characters, in 8, 16 or 32bit encoding.
       
    55      Returns either a normal String, a Unicode16String or a Unicode32String instance;
       
    56      a String argument consisting of 7bit ascii only is returned as is (not copied).
       
    57      Only useful, when reading from external sources. Handles up-to 30bit characters;
       
    58      truncated, overlong, invalid and out-of-range sequences raise a DecodingError.
       
    59      Pass 1 validates the input and finds size and character width, pass 2 decodes.
       
    60      If you work a lot with utf8 encoded textFiles, this is a first-class candidate for a primitive."
       
    61 
       
    62     |sz anyAbove7BitAscii nBitsRequired 
       
    63      ascii "{ Class: SmallInteger }"
       
    64      byte  "{ Class: SmallInteger }"
       
    65      s newString idx next6Bits last6Bits
       
    66      errorReporter|
       
    67     "/ errorReporter: raises a DecodingError for the original input with the given message
       
    68     errorReporter := [:msg | DecodingError raiseWith:aStringOrByteCollection errorString:msg].
       
    69     "/ next6Bits (pass 1): appends the 6 payload bits of the next continuation byte to ascii
       
    70     next6Bits := [
       
    71                     |byte|
       
    72 
       
    73                     byte := s nextByte.
       
    74                     byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
       
    75                     ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
       
    76                  ].
       
    77     "/ last6Bits: same for the final byte of a 6-byte sequence; additionally enforces the 30bit limit
       
    78     last6Bits := [
       
    79                     |byte a|
       
    80 
       
    81                     byte := s nextByte.
       
    82                     byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
       
    83                     a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
       
    84                     (a > 16r3FFFFFFF) ifTrue:[
       
    85                         "/ ST/X can only represent 30 bit unicode characters.
       
    86                         errorReporter value:'unicode character out of range'.
       
    87                         a := 16r3FFFFFFF.
       
    88                     ].
       
    89                     ascii := a.
       
    90                  ].
       
    91     "/ pass 1: validate, count the characters (sz) and determine the required character width
       
    92     nBitsRequired := 8.
       
    93     anyAbove7BitAscii := false.    
       
    94     sz := 0.
       
    95     s := aStringOrByteCollection readStream.
       
    96     [s atEnd] whileFalse:[
       
    97         byte := ascii := s nextByte.
       
    98         (byte bitAnd:16r80) ~~ 0 ifTrue:[
       
    99             anyAbove7BitAscii := true.    
       
   100             (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
       
   101                 "/ 80 .. 7FF
       
   102                 ascii := (byte bitAnd:2r00011111).
       
   103                 next6Bits value.
       
   104                 ascii > 16rFF ifTrue:[
       
   105                     nBitsRequired := nBitsRequired max:16
       
   106                 ].
       
   107                 "/ a strict utf8 decoder does not allow overlong sequences
       
   108                 ascii < 16r80 ifTrue:[
       
   109                     errorReporter value:'overlong utf8 sequence'
       
   110                 ].
       
   111             ] ifFalse:[
       
   112                 (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
       
   113                     "/ 800 .. FFFF
       
   114                     ascii := (byte bitAnd:2r00001111).
       
   115                     next6Bits value.
       
   116                     next6Bits value.
       
   117                     ascii > 16rFF ifTrue:[
       
   118                         nBitsRequired := nBitsRequired max:16
       
   119                     ].
       
   120                     ascii < 16r800 ifTrue:[
       
   121                         errorReporter value:'overlong utf8 sequence'
       
   122                     ].
       
   123                 ] ifFalse:[
       
   124                     (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
       
   125                         "/ 10000 .. 1FFFFF
       
   126                         ascii := (byte bitAnd:2r00000111).
       
   127                         next6Bits value.
       
   128                         next6Bits value.
       
   129                         next6Bits value.
       
   130                         ascii > 16rFF ifTrue:[
       
   131                             ascii > 16rFFFF ifTrue:[
       
   132                                 nBitsRequired := nBitsRequired max:32
       
   133                             ] ifFalse:[
       
   134                                 nBitsRequired := nBitsRequired max:16
       
   135                             ]
       
   136                         ].
       
   137                         ascii < 16r10000 ifTrue:[
       
   138                             errorReporter value:'overlong utf8 sequence'
       
   139                         ].
       
   140                     ] ifFalse:[
       
   141                         (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
       
   142                             "/ 200000 .. 3FFFFFF
       
   143                             ascii := (byte bitAnd:2r00000011).
       
   144                             next6Bits value.
       
   145                             next6Bits value.
       
   146                             next6Bits value.
       
   147                             next6Bits value.
       
   148                             ascii > 16rFF ifTrue:[
       
   149                                 ascii > 16rFFFF ifTrue:[
       
   150                                     nBitsRequired := nBitsRequired max:32
       
   151                                 ] ifFalse:[
       
   152                                     nBitsRequired := nBitsRequired max:16
       
   153                                 ]
       
   154                             ].
       
   155                             ascii < 16r200000 ifTrue:[    "/ lower bound of the 5-byte range, in hex like all other limits
       
   156                                 errorReporter value:'overlong utf8 sequence'
       
   157                             ].
       
   158                         ] ifFalse:[
       
   159                             (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
       
   160                                 "/ 4000000 .. 7FFFFFFF
       
   161                                 ascii := (byte bitAnd:2r00000001).
       
   162                                 next6Bits value.
       
   163                                 next6Bits value.
       
   164                                 next6Bits value.
       
   165                                 next6Bits value.
       
   166                                 last6Bits value.
       
   167                                 ascii > 16rFF ifTrue:[
       
   168                                     ascii > 16rFFFF ifTrue:[
       
   169                                         nBitsRequired := nBitsRequired max:32
       
   170                                     ] ifFalse:[
       
   171                                         nBitsRequired := nBitsRequired max:16
       
   172                                     ]
       
   173                                 ].
       
   174                                 ascii < 16r4000000 ifTrue:[
       
   175                                     errorReporter value:'overlong utf8 sequence'
       
   176                                 ].
       
   177                             ] ifFalse:[
       
   178                                 errorReporter value:'invalid utf8 encoding'
       
   179                             ]
       
   180                         ]
       
   181                     ]
       
   182                 ]
       
   183             ].
       
   184         ].
       
   185         sz := sz + 1.
       
   186     ].
       
   187     nBitsRequired == 8 ifTrue:[
       
   188         anyAbove7BitAscii ifFalse:[
       
   189             "/ can return the original string
       
   190             aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
       
   191         ].
       
   192         newString := String uninitializedNew:sz
       
   193     ] ifFalse:[
       
   194         nBitsRequired <= 16 ifTrue:[
       
   195             newString := Unicode16String new:sz
       
   196         ] ifFalse:[
       
   197             newString := Unicode32String new:sz
       
   198         ]
       
   199     ].
       
   200     "/ pass 2 needs no end-of-input check in next6Bits: pass 1 has already returned for truncated input
       
   201     next6Bits := [
       
   202                     |byte|
       
   203 
       
   204                     byte := s nextByte.
       
   205                     ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
       
   206                  ].
       
   207     "/ pass 2: read the input again, this time storing the decoded characters into newString
       
   208     s := aStringOrByteCollection readStream.
       
   209     idx := 1.
       
   210     [s atEnd] whileFalse:[
       
   211         byte := ascii := s nextByte.
       
   212         (byte bitAnd:2r10000000) ~~ 0 ifTrue:[
       
   213             (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
       
   214                 ascii := (byte bitAnd:2r00011111).
       
   215                 next6Bits value.
       
   216             ] ifFalse:[
       
   217                 (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
       
   218                     ascii := (byte bitAnd:2r00001111).
       
   219                     next6Bits value.
       
   220                     next6Bits value.
       
   221                 ] ifFalse:[
       
   222                     (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
       
   223                         ascii := (byte bitAnd:2r00000111).
       
   224                         next6Bits value.
       
   225                         next6Bits value.
       
   226                         next6Bits value.
       
   227                     ] ifFalse:[
       
   228                         (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
       
   229                             ascii := (byte bitAnd:2r00000011).
       
   230                             next6Bits value.
       
   231                             next6Bits value.
       
   232                             next6Bits value.
       
   233                             next6Bits value.
       
   234                         ] ifFalse:[
       
   235                             (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
       
   236                                 ascii := (byte bitAnd:2r00000001).
       
   237                                 next6Bits value.
       
   238                                 next6Bits value.
       
   239                                 next6Bits value.
       
   240                                 next6Bits value.
       
   241                                 last6Bits value.
       
   242                             ]
       
   243                         ]
       
   244                     ]
       
   245                 ]
       
   246             ].
       
   247         ].
       
   248         newString at:idx put:(Character value:ascii).
       
   249         idx := idx + 1.
       
   250     ].
       
   251     ^ newString
       
   252 
       
   253     "
       
   254      CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]      
       
   255      CharacterArray fromUTF8Bytes:#[ 16rC1 16r02 ]       
       
   256      CharacterArray fromUTF8Bytes:#[ 16rE0 16r81 16r02 ]      
       
   257      CharacterArray fromUTF8Bytes:#[ 16rEF 16rBF 16rBF ]  
       
   258 
       
   259    rfc2279 examples:
       
   260      CharacterArray fromUTF8Bytes:#[ 16r41 16rE2 16r89 16rA2 16rCE 16r91 16r2E ]           
       
   261      CharacterArray fromUTF8Bytes:#[ 16rED 16r95 16r9C 16rEA 16rB5 16rAD 16rEC 16r96 16rB4 ]      
       
   262      CharacterArray fromUTF8Bytes:#[ 16rE6 16r97 16rA5 16rE6 16r9C 16rAC 16rE8 16rAA 16r9E ]      
       
   263 
       
   264    invalid:
       
   265      CharacterArray fromUTF8Bytes:#[ 16rC0 16r80 ]      
       
   266      CharacterArray fromUTF8Bytes:#[ 16rE0 16r80 16r80 ]      
       
   267     "
       
   268 !
       
   269 
       
   270 encode:aCode
       
   271     self shouldNotImplement "/ utf8 uses multi-byte sequences: no single byte conversion possible - use encodeString:
       
   272 !
       
   273 
       
   274 encodeString:aUnicodeString
       
   275     "answer the UTF-8 representation of aUnicodeString, as a String holding one byte per element.
       
   276      Such a string is only meant to be stored on some external file,
       
   277      not for being used inside ST/X.
       
   278      A string containing only 7bit ascii characters is answered as is (no copy is made).
       
   279      If you work a lot with utf8 encoded textFiles, 
       
   280      this is a first-class candidate for a primitive."
       
   281 
       
   282     |out sawNonAscii leadByte tailByte|
       
   283 
       
   284     "/ leadByte: first byte of a sequence - the bits above nTail trailing bytes, or-ed with the length tag
       
   285     leadByte := [:cp :nTail :tag | Character value:((cp bitShift:(-6 * nTail)) bitOr:tag)].
       
   286     "/ tailByte: the n-th trailing byte counted from the end (n = 1 is the last one), as 10xxxxxx
       
   287     tailByte := [:cp :n | Character value:(((cp bitShift:(-6 * (n - 1))) bitAnd:16r3F) bitOr:2r10000000)].
       
   288 
       
   289     sawNonAscii := false.
       
   290     out := (String uninitializedNew:aUnicodeString size) writeStream.
       
   291     aUnicodeString do:[:ch |
       
   292         |cp|
       
   293 
       
   294         cp := ch asciiValue.
       
   295         cp <= 16r7F ifTrue:[
       
   296             out nextPut:ch.
       
   297         ] ifFalse:[
       
   298             sawNonAscii := true.
       
   299             cp <= 16r7FF ifTrue:[
       
   300                 "/ 80 .. 7FF: 110xxxxx plus 1 trailing byte
       
   301                 out nextPut:(leadByte value:cp value:1 value:2r11000000).
       
   302                 out nextPut:(tailByte value:cp value:1).
       
   303             ] ifFalse:[
       
   304                 cp <= 16rFFFF ifTrue:[
       
   305                     "/ 800 .. FFFF: 1110xxxx plus 2 trailing bytes
       
   306                     out nextPut:(leadByte value:cp value:2 value:2r11100000).
       
   307                     2 to:1 by:-1 do:[:n | out nextPut:(tailByte value:cp value:n)].
       
   308                 ] ifFalse:[
       
   309                     cp <= 16r1FFFFF ifTrue:[
       
   310                         "/ 10000 .. 1FFFFF: 11110xxx plus 3 trailing bytes
       
   311                         out nextPut:(leadByte value:cp value:3 value:2r11110000).
       
   312                         3 to:1 by:-1 do:[:n | out nextPut:(tailByte value:cp value:n)].
       
   313                     ] ifFalse:[
       
   314                         cp <= 16r3FFFFFF ifTrue:[
       
   315                             "/ 200000 .. 3FFFFFF: 111110xx plus 4 trailing bytes
       
   316                             out nextPut:(leadByte value:cp value:4 value:2r11111000).
       
   317                             4 to:1 by:-1 do:[:n | out nextPut:(tailByte value:cp value:n)].
       
   318                         ] ifFalse:[
       
   319                             cp <= 16r7FFFFFFF ifTrue:[
       
   320                                 "/ 4000000 .. 7FFFFFFF: 1111110x plus 5 trailing bytes
       
   321                                 out nextPut:(leadByte value:cp value:5 value:2r11111100).
       
   322                                 5 to:1 by:-1 do:[:n | out nextPut:(tailByte value:cp value:n)].
       
   323                             ] ifFalse:[
       
   324                                 "/ cannot happen - we only support up to 30 bit characters
       
   325                                 self error:'ascii value > 31bit in utf8Encode'.
       
   326                             ]
       
   327                         ].
       
   328                     ].
       
   329                 ].
       
   330             ].
       
   331         ].
       
   332     ].
       
   333 
       
   334     sawNonAscii ifFalse:[^ aUnicodeString].   "/ pure 7bit ascii - no new string needed
       
   335     ^ out contents
       
   336 
       
   337     "
       
   338      (self encodeString:'hello') asByteArray                             #[104 101 108 108 111]
       
   339      (self encodeString:(Character value:16r40) asString) asByteArray    #[64]
       
   340      (self encodeString:(Character value:16r7F) asString) asByteArray    #[127]
       
   341      (self encodeString:(Character value:16r80) asString) asByteArray    #[194 128]
       
   342      (self encodeString:(Character value:16rFF) asString) asByteArray    #[195 191] 
       
   343      (self encodeString:(Character value:16r100) asString) asByteArray   #[196 128]  
       
   344      (self encodeString:(Character value:16r200) asString) asByteArray   #[200 128]  
       
   345      (self encodeString:(Character value:16r400) asString) asByteArray   #[208 128]  
       
   346      (self encodeString:(Character value:16r800) asString) asByteArray   #[224 160 128]  
       
   347      (self encodeString:(Character value:16r1000) asString) asByteArray  #[225 128 128]   
       
   348      (self encodeString:(Character value:16r2000) asString) asByteArray  #[226 128 128]   
       
   349      (self encodeString:(Character value:16r4000) asString) asByteArray  #[228 128 128]   
       
   350      (self encodeString:(Character value:16r8000) asString) asByteArray  #[232 128 128]   
       
   351      (self encodeString:(Character value:16rFFFF) asString) asByteArray  #[239 191 191]   
       
   352     "
       
   353 ! !
       
   354 
       
   355 !ISO10646_to_UTF8 class methodsFor:'documentation'!
       
   356 
       
   357 version    "answer the CVS header string (path, revision, date, author) of this source file"
       
   358     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8.st,v 1.1 2004-03-05 17:18:03 cg Exp $'
       
   359 ! !