CharacterEncoderImplementations__ISO10646_to_UTF8.st
changeset 18625 37d697b9bf8d
parent 18604 54caf7b64994
child 18630 a74d669db937
child 19838 a6ca726d596c
equal deleted inserted replaced
18624:1f113cce940e 18625:37d697b9bf8d
     1 "{ Encoding: utf8 }"
       
     2 
       
     3 "
     1 "
     4  COPYRIGHT (c) 2004 by eXept Software AG
     2  COPYRIGHT (c) 2004 by eXept Software AG
     5 	      All Rights Reserved
     3 	      All Rights Reserved
     6 
     4 
     7  This software is furnished under a license and may be used
     5  This software is furnished under a license and may be used
    52 
    50 
    53 
    51 
    54   Decoding (utf8 to unicode):
    52   Decoding (utf8 to unicode):
    55      |t|
    53      |t|
    56 
    54 
    57      t := ISO10646_to_UTF8 encodeString:'Helloœ'.
    55      t := ISO10646_to_UTF8 encodeString:'Helloœ'.
    58      ISO10646_to_UTF8 decodeString:t.
    56      ISO10646_to_UTF8 decodeString:t.
    59 "
    57 "
    60 ! !
    58 ! !
    61 
    59 
    62 !ISO10646_to_UTF8 class methodsFor:'instance creation'!
    60 !ISO10646_to_UTF8 class methodsFor:'instance creation'!
   346      not for being used inside ST/X.
   344      not for being used inside ST/X.
   347 
   345 
   348      If you work a lot with utf8 encoded textFiles,
   346      If you work a lot with utf8 encoded textFiles,
   349      this is a first-class candidate for a primitive."
   347      this is a first-class candidate for a primitive."
   350 
   348 
   351     |s|
   349     |s
       
   350      stringSize "{ Class: SmallInteger }"|
   352 
   351 
   353     "/ avoid creation of new strings if possible
   352     "/ avoid creation of new strings if possible
   354     aUnicodeString containsNon7BitAscii ifFalse:[
   353     aUnicodeString containsNon7BitAscii ifFalse:[
   355         ^ aUnicodeString asSingleByteString
   354         ^ aUnicodeString asSingleByteString
   356     ].
   355     ].
   357 
   356 
   358     s := WriteStream on:(String uninitializedNew:(aUnicodeString size * 3 // 2)).
   357     stringSize := aUnicodeString size.
   359     aUnicodeString do:[:eachCharacter |
   358     s := WriteStream on:(String uninitializedNew:(stringSize * 3 // 2)).
   360         |codePoint "{Class: SmallInteger }" b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
   359     1 to:stringSize do:[:idx |
   361 
   360         |character codePoint "{Class: SmallInteger }" b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
   362         codePoint := eachCharacter codePoint.
   361 
       
   362         character := aUnicodeString at:idx.
       
   363         codePoint := character codePoint.
   363         codePoint <= 16r7F ifTrue:[
   364         codePoint <= 16r7F ifTrue:[
   364             s nextPut:eachCharacter.
   365             s nextPut:character.
   365         ] ifFalse:[
   366         ] ifFalse:[
   366             b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
   367             b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
   367             v := codePoint bitShift:-6.
   368             v := codePoint bitShift:-6.
   368             v <= 16r1F ifTrue:[
   369             v <= 16r1F ifTrue:[
   369                 s nextPut:(Character value:(v bitOr:2r11000000)).
   370                 s nextPut:(Character value:(v bitOr:2r11000000)).
   392                             v <= 16r01 ifTrue:[
   393                             v <= 16r01 ifTrue:[
   393                                 s nextPut:(Character value:(v bitOr:2r11111100)).
   394                                 s nextPut:(Character value:(v bitOr:2r11111100)).
   394                                 s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
   395                                 s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
   395                             ] ifFalse:[
   396                             ] ifFalse:[
   396                                 "/ cannot happen - we only support up to 30 bit characters
   397                                 "/ cannot happen - we only support up to 30 bit characters
   397                                 EncodingError raiseWith:eachCharacter errorString:'codePoint > 31bit in #utf8Encode'.
   398                                 EncodingError raiseWith:character errorString:'codePoint > 31bit in #utf8Encode'.
   398                             ]
   399                             ]
   399                         ].
   400                         ].
   400                     ].
   401                     ].
   401                 ].
   402                 ].
   402             ].
   403             ].