CharacterEncoderImplementations__ISO10646_to_UTF8.st
changeset 8460 f4d333135e1d
parent 8411 44509c4f92f0
child 8773 267612096a52
equal deleted inserted replaced
8459:1f59b17291a3 8460:f4d333135e1d
    71      ascii "{ Class: SmallInteger }"
    71      ascii "{ Class: SmallInteger }"
    72      byte  "{ Class: SmallInteger }"
    72      byte  "{ Class: SmallInteger }"
    73      s newString idx next6Bits last6Bits
    73      s newString idx next6Bits last6Bits
    74      errorReporter|
    74      errorReporter|
    75 
    75 
       
    76     "/ avoid creation of new strings
       
    77     aStringOrByteCollection isString ifTrue:[
       
    78         aStringOrByteCollection contains8BitCharacters ifFalse:[^ aStringOrByteCollection].
       
    79     ].
       
    80 
    76     errorReporter := [:msg | DecodingError raiseWith:aStringOrByteCollection errorString:msg].
    81     errorReporter := [:msg | DecodingError raiseWith:aStringOrByteCollection errorString:msg].
    77 
    82 
    78     next6Bits := [
    83     next6Bits := [
    79 		    | byte |
    84                     | byte |
    80 
    85 
    81 		    byte := s nextByte.
    86                     byte := s nextByte.
    82 		    byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
    87                     byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
    83 		    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
    88                     ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
    84 		    (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
    89                     (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
    85 			^ errorReporter value:'illegal followbyte'.].
    90                         ^ errorReporter value:'illegal followbyte'.].
    86 		 ].
    91                  ].
    87 
    92 
    88     last6Bits := [
    93     last6Bits := [
    89 		    | a byte |
    94                     | a byte |
    90 
    95 
    91 		    byte := s nextByte.
    96                     byte := s nextByte.
    92 		    byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
    97                     byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
    93 		    a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
    98                     a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
    94 		    (a > 16r3FFFFFFF) ifTrue:[
    99                     (a > 16r3FFFFFFF) ifTrue:[
    95 			"/ ST/X can only represent 30 bit unicode characters.
   100                         "/ ST/X can only represent 30 bit unicode characters.
    96 			errorReporter value:'unicode character out of range'.
   101                         errorReporter value:'unicode character out of range'.
    97 			a := 16r3FFFFFFF.
   102                         a := 16r3FFFFFFF.
    98 		    ].
   103                     ].
    99 		    ascii := a.
   104                     ascii := a.
   100 		    (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
   105                     (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
   101 			^ errorReporter value:'illegal followbyte'.].
   106                         ^ errorReporter value:'illegal followbyte'.].
   102 		 ].
   107                  ].
   103 
   108 
   104     nBitsRequired := 8.
   109     nBitsRequired := 8.
   105     anyAbove7BitAscii := false.
   110     anyAbove7BitAscii := false.
   106     sz := 0.
   111     sz := 0.
   107     s := aStringOrByteCollection readStream.
   112     s := aStringOrByteCollection readStream.
   108     [s atEnd] whileFalse:[
   113     [s atEnd] whileFalse:[
   109 	byte := ascii := s nextByte.
   114         byte := ascii := s nextByte.
   110 	(byte bitAnd:16r80) ~~ 0 ifTrue:[
   115         (byte bitAnd:16r80) ~~ 0 ifTrue:[
   111 	    anyAbove7BitAscii := true.
   116             anyAbove7BitAscii := true.
   112 	    (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
   117             (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
   113 		"/ 80 .. 7FF
   118                 "/ 80 .. 7FF
   114 		ascii := (byte bitAnd:2r00011111).
   119                 ascii := (byte bitAnd:2r00011111).
   115 		next6Bits value.
   120                 next6Bits value.
   116 		ascii > 16rFF ifTrue:[
   121                 ascii > 16rFF ifTrue:[
   117 		    nBitsRequired := nBitsRequired max:16
   122                     nBitsRequired := nBitsRequired max:16
   118 		].
   123                 ].
   119 		"/ a strict utf8 decoder does not allow overlong sequences
   124                 "/ a strict utf8 decoder does not allow overlong sequences
   120 		ascii < 16r80 ifTrue:[
   125                 ascii < 16r80 ifTrue:[
   121 		    errorReporter value:'overlong utf8 sequence'
   126                     errorReporter value:'overlong utf8 sequence'
   122 		].
   127                 ].
   123 	    ] ifFalse:[
   128             ] ifFalse:[
   124 		(byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
   129                 (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
   125 		    "/ 800 .. FFFF
   130                     "/ 800 .. FFFF
   126 		    ascii := (byte bitAnd:2r00001111).
   131                     ascii := (byte bitAnd:2r00001111).
   127 		    next6Bits value.
   132                     next6Bits value.
   128 		    next6Bits value.
   133                     next6Bits value.
   129 		    ascii > 16rFF ifTrue:[
   134                     ascii > 16rFF ifTrue:[
   130 			nBitsRequired := nBitsRequired max:16
   135                         nBitsRequired := nBitsRequired max:16
   131 		    ].
   136                     ].
   132 		    ascii < 16r800 ifTrue:[
   137                     ascii < 16r800 ifTrue:[
   133 			errorReporter value:'overlong utf8 sequence'
   138                         errorReporter value:'overlong utf8 sequence'
   134 		    ].
   139                     ].
   135 		] ifFalse:[
   140                 ] ifFalse:[
   136 		    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
   141                     (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
   137 			"/ 10000 .. 1FFFFF
   142                         "/ 10000 .. 1FFFFF
   138 			ascii := (byte bitAnd:2r00000111).
   143                         ascii := (byte bitAnd:2r00000111).
   139 			next6Bits value.
   144                         next6Bits value.
   140 			next6Bits value.
   145                         next6Bits value.
   141 			next6Bits value.
   146                         next6Bits value.
   142 			ascii > 16rFF ifTrue:[
   147                         ascii > 16rFF ifTrue:[
   143 			    ascii > 16rFFFF ifTrue:[
   148                             ascii > 16rFFFF ifTrue:[
   144 				nBitsRequired := nBitsRequired max:32
   149                                 nBitsRequired := nBitsRequired max:32
   145 			    ] ifFalse:[
   150                             ] ifFalse:[
   146 				nBitsRequired := nBitsRequired max:16
   151                                 nBitsRequired := nBitsRequired max:16
   147 			    ]
   152                             ]
   148 			].
   153                         ].
   149 			ascii < 16r10000 ifTrue:[
   154                         ascii < 16r10000 ifTrue:[
   150 			    errorReporter value:'overlong utf8 sequence'
   155                             errorReporter value:'overlong utf8 sequence'
   151 			].
   156                         ].
   152 		    ] ifFalse:[
   157                     ] ifFalse:[
   153 			(byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
   158                         (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
   154 			    "/ 200000 .. 3FFFFFF
   159                             "/ 200000 .. 3FFFFFF
   155 			    ascii := (byte bitAnd:2r00000011).
   160                             ascii := (byte bitAnd:2r00000011).
   156 			    next6Bits value.
   161                             next6Bits value.
   157 			    next6Bits value.
   162                             next6Bits value.
   158 			    next6Bits value.
   163                             next6Bits value.
   159 			    next6Bits value.
   164                             next6Bits value.
   160 			    ascii > 16rFF ifTrue:[
   165                             ascii > 16rFF ifTrue:[
   161 				ascii > 16rFFFF ifTrue:[
   166                                 ascii > 16rFFFF ifTrue:[
   162 				    nBitsRequired := nBitsRequired max:32
   167                                     nBitsRequired := nBitsRequired max:32
   163 				] ifFalse:[
   168                                 ] ifFalse:[
   164 				    nBitsRequired := nBitsRequired max:16
   169                                     nBitsRequired := nBitsRequired max:16
   165 				]
   170                                 ]
   166 			    ].
   171                             ].
   167 			    ascii < 200000 ifTrue:[
   172                             ascii < 200000 ifTrue:[
   168 				errorReporter value:'overlong utf8 sequence'
   173                                 errorReporter value:'overlong utf8 sequence'
   169 			    ].
   174                             ].
   170 			] ifFalse:[
   175                         ] ifFalse:[
   171 			    (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
   176                             (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
   172 				"/ 4000000 .. 7FFFFFFF
   177                                 "/ 4000000 .. 7FFFFFFF
   173 				ascii := (byte bitAnd:2r00000001).
   178                                 ascii := (byte bitAnd:2r00000001).
   174 				next6Bits value.
   179                                 next6Bits value.
   175 				next6Bits value.
   180                                 next6Bits value.
   176 				next6Bits value.
   181                                 next6Bits value.
   177 				next6Bits value.
   182                                 next6Bits value.
   178 				last6Bits value.
   183                                 last6Bits value.
   179 				ascii > 16rFF ifTrue:[
   184                                 ascii > 16rFF ifTrue:[
   180 				    ascii > 16rFFFF ifTrue:[
   185                                     ascii > 16rFFFF ifTrue:[
   181 					nBitsRequired := nBitsRequired max:32
   186                                         nBitsRequired := nBitsRequired max:32
   182 				    ] ifFalse:[
   187                                     ] ifFalse:[
   183 					nBitsRequired := nBitsRequired max:16
   188                                         nBitsRequired := nBitsRequired max:16
   184 				    ]
   189                                     ]
   185 				].
   190                                 ].
   186 				ascii < 16r4000000 ifTrue:[
   191                                 ascii < 16r4000000 ifTrue:[
   187 				    errorReporter value:'overlong utf8 sequence'
   192                                     errorReporter value:'overlong utf8 sequence'
   188 				].
   193                                 ].
   189 			    ] ifFalse:[
   194                             ] ifFalse:[
   190 				errorReporter value:'invalid utf8 encoding'
   195                                 errorReporter value:'invalid utf8 encoding'
   191 			    ]
   196                             ]
   192 			]
   197                         ]
   193 		    ]
   198                     ]
   194 		]
   199                 ]
   195 	    ].
   200             ].
   196 	].
   201         ].
   197 	sz := sz + 1.
   202         sz := sz + 1.
   198     ].
   203     ].
   199     nBitsRequired == 8 ifTrue:[
   204     nBitsRequired == 8 ifTrue:[
   200 	anyAbove7BitAscii ifFalse:[
   205         anyAbove7BitAscii ifFalse:[
   201 	    "/ can return the original string
   206             "/ can return the original string
   202 	    aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
   207             aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
   203 	].
   208         ].
   204 	newString := String uninitializedNew:sz
   209         newString := String uninitializedNew:sz
   205     ] ifFalse:[
   210     ] ifFalse:[
   206 	nBitsRequired <= 16 ifTrue:[
   211         nBitsRequired <= 16 ifTrue:[
   207 	    newString := Unicode16String new:sz
   212             newString := Unicode16String new:sz
   208 	] ifFalse:[
   213         ] ifFalse:[
   209 	    newString := Unicode32String new:sz
   214             newString := Unicode32String new:sz
   210 	]
   215         ]
   211     ].
   216     ].
   212 
   217 
   213     next6Bits := [
   218     next6Bits := [
   214 		    |byte|
   219                     |byte|
   215 
   220 
   216 		    byte := s nextByte.
   221                     byte := s nextByte.
   217 		    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
   222                     ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
   218 		 ].
   223                  ].
   219 
   224 
   220     s := aStringOrByteCollection readStream.
   225     s := aStringOrByteCollection readStream.
   221     idx := 1.
   226     idx := 1.
   222     [s atEnd] whileFalse:[
   227     [s atEnd] whileFalse:[
   223 	byte := ascii := s nextByte.
   228         byte := ascii := s nextByte.
   224 	(byte bitAnd:2r10000000) ~~ 0 ifTrue:[
   229         (byte bitAnd:2r10000000) ~~ 0 ifTrue:[
   225 	    (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
   230             (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
   226 		ascii := (byte bitAnd:2r00011111).
   231                 ascii := (byte bitAnd:2r00011111).
   227 		next6Bits value.
   232                 next6Bits value.
   228 	    ] ifFalse:[
   233             ] ifFalse:[
   229 		(byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
   234                 (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
   230 		    ascii := (byte bitAnd:2r00001111).
   235                     ascii := (byte bitAnd:2r00001111).
   231 		    next6Bits value.
   236                     next6Bits value.
   232 		    next6Bits value.
   237                     next6Bits value.
   233 		] ifFalse:[
   238                 ] ifFalse:[
   234 		    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
   239                     (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
   235 			ascii := (byte bitAnd:2r00000111).
   240                         ascii := (byte bitAnd:2r00000111).
   236 			next6Bits value.
   241                         next6Bits value.
   237 			next6Bits value.
   242                         next6Bits value.
   238 			next6Bits value.
   243                         next6Bits value.
   239 		    ] ifFalse:[
   244                     ] ifFalse:[
   240 			(byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
   245                         (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
   241 			    ascii := (byte bitAnd:2r00000011).
   246                             ascii := (byte bitAnd:2r00000011).
   242 			    next6Bits value.
   247                             next6Bits value.
   243 			    next6Bits value.
   248                             next6Bits value.
   244 			    next6Bits value.
   249                             next6Bits value.
   245 			    next6Bits value.
   250                             next6Bits value.
   246 			] ifFalse:[
   251                         ] ifFalse:[
   247 			    (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
   252                             (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
   248 				ascii := (byte bitAnd:2r00000001).
   253                                 ascii := (byte bitAnd:2r00000001).
   249 				next6Bits value.
   254                                 next6Bits value.
   250 				next6Bits value.
   255                                 next6Bits value.
   251 				next6Bits value.
   256                                 next6Bits value.
   252 				next6Bits value.
   257                                 next6Bits value.
   253 				last6Bits value.
   258                                 last6Bits value.
   254 			    ]
   259                             ]
   255 			]
   260                         ]
   256 		    ]
   261                     ]
   257 		]
   262                 ]
   258 	    ].
   263             ].
   259 	].
   264         ].
   260 	newString at:idx put:(Character value:ascii).
   265         newString at:idx put:(Character value:ascii).
   261 	idx := idx + 1.
   266         idx := idx + 1.
   262     ].
   267     ].
   263     ^ newString
   268     ^ newString
   264 
   269 
   265     "
   270     "
   266      CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]
   271      CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]
   289      not for being used inside ST/X.
   294      not for being used inside ST/X.
   290 
   295 
   291      If you work a lot with utf8 encoded textFiles,
   296      If you work a lot with utf8 encoded textFiles,
   292      this is a first-class candidate for a primitive."
   297      this is a first-class candidate for a primitive."
   293 
   298 
   294     |s anyAbove7BitAscii|
   299     |s|
   295 
   300 
   296     anyAbove7BitAscii := false.
   301     "/ avoid creation of new strings
       
   302     aUnicodeString contains8BitCharacters ifFalse:[^ aUnicodeString].
       
   303 
   297     s := WriteStream on:(String uninitializedNew:aUnicodeString size).
   304     s := WriteStream on:(String uninitializedNew:aUnicodeString size).
   298     aUnicodeString do:[:eachCharacter |
   305     aUnicodeString do:[:eachCharacter |
   299 	|codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
   306         |codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
   300 
   307 
   301 	codePoint := eachCharacter codePoint.
   308         codePoint := eachCharacter codePoint.
   302 	codePoint <= 16r7F ifTrue:[
   309         codePoint <= 16r7F ifTrue:[
   303 	    s nextPut:eachCharacter.
   310             s nextPut:eachCharacter.
   304 	] ifFalse:[
   311         ] ifFalse:[
   305 	    anyAbove7BitAscii := true.
   312             b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
   306 	    b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
   313             v := codePoint bitShift:-6.
   307 	    v := codePoint bitShift:-6.
   314             v <= 16r1F ifTrue:[
   308 	    v <= 16r1F ifTrue:[
   315                 s nextPut:(Character value:(v bitOr:2r11000000)).
   309 		s nextPut:(Character value:(v bitOr:2r11000000)).
   316                 s nextPut:b1.
   310 		s nextPut:b1.
   317             ] ifFalse:[
   311 	    ] ifFalse:[
   318                 b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
   312 		b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
   319                 v := v bitShift:-6.
   313 		v := v bitShift:-6.
   320                 v <= 16r0F ifTrue:[
   314 		v <= 16r0F ifTrue:[
   321                     s nextPut:(Character value:(v bitOr:2r11100000)).
   315 		    s nextPut:(Character value:(v bitOr:2r11100000)).
   322                     s nextPut:b2; nextPut:b1.
   316 		    s nextPut:b2; nextPut:b1.
   323                 ] ifFalse:[
   317 		] ifFalse:[
   324                     b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
   318 		    b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
   325                     v := v bitShift:-6.
   319 		    v := v bitShift:-6.
   326                     v <= 16r07 ifTrue:[
   320 		    v <= 16r07 ifTrue:[
   327                         s nextPut:(Character value:(v bitOr:2r11110000)).
   321 			s nextPut:(Character value:(v bitOr:2r11110000)).
   328                         s nextPut:b3; nextPut:b2; nextPut:b1.
   322 			s nextPut:b3; nextPut:b2; nextPut:b1.
   329                     ] ifFalse:[
   323 		    ] ifFalse:[
   330                         b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
   324 			b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
   331                         v := v bitShift:-6.
   325 			v := v bitShift:-6.
   332                         v <= 16r03 ifTrue:[
   326 			v <= 16r03 ifTrue:[
   333                             s nextPut:(Character value:(v bitOr:2r11111000)).
   327 			    s nextPut:(Character value:(v bitOr:2r11111000)).
   334                             s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
   328 			    s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
   335                         ] ifFalse:[
   329 			] ifFalse:[
   336                             b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
   330 			    b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
   337                             v := v bitShift:-6.
   331 			    v := v bitShift:-6.
   338                             v <= 16r01 ifTrue:[
   332 			    v <= 16r01 ifTrue:[
   339                                 s nextPut:(Character value:(v bitOr:2r11111100)).
   333 				s nextPut:(Character value:(v bitOr:2r11111100)).
   340                                 s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
   334 				s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
   341                             ] ifFalse:[
   335 			    ] ifFalse:[
   342                                 "/ cannot happen - we only support up to 30 bit characters
   336 				"/ cannot happen - we only support up to 30 bit characters
   343                                 self error:'ascii value > 31bit in utf8Encode'.
   337 				self error:'ascii value > 31bit in utf8Encode'.
   344                             ]
   338 			    ]
   345                         ].
   339 			].
   346                     ].
   340 		    ].
   347                 ].
   341 		].
   348             ].
   342 	    ].
   349         ].
   343 	].
   350     ].
   344     ].
   351 
   345 
       
   346     anyAbove7BitAscii ifFalse:[^ aUnicodeString].   "/ avoid creation of new strings
       
   347     ^ s contents
   352     ^ s contents
   348 
   353 
   349     "
   354     "
   350      (self encodeString:'hello') asByteArray                             #[104 101 108 108 111]
   355      (self encodeString:'hello') asByteArray                             #[104 101 108 108 111]
   351      (self encodeString:(Character value:16r40) asString) asByteArray    #[64]
   356      (self encodeString:(Character value:16r40) asString) asByteArray    #[64]
   371 ! !
   376 ! !
   372 
   377 
   373 !ISO10646_to_UTF8 class methodsFor:'documentation'!
   378 !ISO10646_to_UTF8 class methodsFor:'documentation'!
   374 
   379 
   375 version
   380 version
   376     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8.st,v 1.9 2004-06-16 18:52:26 ca Exp $'
   381     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8.st,v 1.10 2004-08-03 20:08:14 penk Exp $'
   377 ! !
   382 ! !