71 ascii "{ Class: SmallInteger }" |
71 ascii "{ Class: SmallInteger }" |
72 byte "{ Class: SmallInteger }" |
72 byte "{ Class: SmallInteger }" |
73 s newString idx next6Bits last6Bits |
73 s newString idx next6Bits last6Bits |
74 errorReporter| |
74 errorReporter| |
75 |
75 |
|
76 "/ avoid creation of new strings |
|
77 aStringOrByteCollection isString ifTrue:[ |
|
78 aStringOrByteCollection contains8BitCharacters ifFalse:[^ aStringOrByteCollection]. |
|
79 ]. |
|
80 |
76 errorReporter := [:msg | DecodingError raiseWith:aStringOrByteCollection errorString:msg]. |
81 errorReporter := [:msg | DecodingError raiseWith:aStringOrByteCollection errorString:msg]. |
77 |
82 |
78 next6Bits := [ |
83 next6Bits := [ |
79 | byte | |
84 | byte | |
80 |
85 |
81 byte := s nextByte. |
86 byte := s nextByte. |
82 byte isNil ifTrue:[^ errorReporter value:'short utf8 string']. |
87 byte isNil ifTrue:[^ errorReporter value:'short utf8 string']. |
83 ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111). |
88 ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111). |
84 (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[ |
89 (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[ |
85 ^ errorReporter value:'illegal followbyte'.]. |
90 ^ errorReporter value:'illegal followbyte'.]. |
86 ]. |
91 ]. |
87 |
92 |
88 last6Bits := [ |
93 last6Bits := [ |
89 | a byte | |
94 | a byte | |
90 |
95 |
91 byte := s nextByte. |
96 byte := s nextByte. |
92 byte isNil ifTrue:[^ errorReporter value:'short utf8 string']. |
97 byte isNil ifTrue:[^ errorReporter value:'short utf8 string']. |
93 a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111). |
98 a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111). |
94 (a > 16r3FFFFFFF) ifTrue:[ |
99 (a > 16r3FFFFFFF) ifTrue:[ |
95 "/ ST/X can only represent 30 bit unicode characters. |
100 "/ ST/X can only represent 30 bit unicode characters. |
96 errorReporter value:'unicode character out of range'. |
101 errorReporter value:'unicode character out of range'. |
97 a := 16r3FFFFFFF. |
102 a := 16r3FFFFFFF. |
98 ]. |
103 ]. |
99 ascii := a. |
104 ascii := a. |
100 (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[ |
105 (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[ |
101 ^ errorReporter value:'illegal followbyte'.]. |
106 ^ errorReporter value:'illegal followbyte'.]. |
102 ]. |
107 ]. |
103 |
108 |
104 nBitsRequired := 8. |
109 nBitsRequired := 8. |
105 anyAbove7BitAscii := false. |
110 anyAbove7BitAscii := false. |
106 sz := 0. |
111 sz := 0. |
107 s := aStringOrByteCollection readStream. |
112 s := aStringOrByteCollection readStream. |
108 [s atEnd] whileFalse:[ |
113 [s atEnd] whileFalse:[ |
109 byte := ascii := s nextByte. |
114 byte := ascii := s nextByte. |
110 (byte bitAnd:16r80) ~~ 0 ifTrue:[ |
115 (byte bitAnd:16r80) ~~ 0 ifTrue:[ |
111 anyAbove7BitAscii := true. |
116 anyAbove7BitAscii := true. |
112 (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[ |
117 (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[ |
113 "/ 80 .. 7FF |
118 "/ 80 .. 7FF |
114 ascii := (byte bitAnd:2r00011111). |
119 ascii := (byte bitAnd:2r00011111). |
115 next6Bits value. |
120 next6Bits value. |
116 ascii > 16rFF ifTrue:[ |
121 ascii > 16rFF ifTrue:[ |
117 nBitsRequired := nBitsRequired max:16 |
122 nBitsRequired := nBitsRequired max:16 |
118 ]. |
123 ]. |
119 "/ a strict utf8 decoder does not allow overlong sequences |
124 "/ a strict utf8 decoder does not allow overlong sequences |
120 ascii < 16r80 ifTrue:[ |
125 ascii < 16r80 ifTrue:[ |
121 errorReporter value:'overlong utf8 sequence' |
126 errorReporter value:'overlong utf8 sequence' |
122 ]. |
127 ]. |
123 ] ifFalse:[ |
128 ] ifFalse:[ |
124 (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[ |
129 (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[ |
125 "/ 800 .. FFFF |
130 "/ 800 .. FFFF |
126 ascii := (byte bitAnd:2r00001111). |
131 ascii := (byte bitAnd:2r00001111). |
127 next6Bits value. |
132 next6Bits value. |
128 next6Bits value. |
133 next6Bits value. |
129 ascii > 16rFF ifTrue:[ |
134 ascii > 16rFF ifTrue:[ |
130 nBitsRequired := nBitsRequired max:16 |
135 nBitsRequired := nBitsRequired max:16 |
131 ]. |
136 ]. |
132 ascii < 16r800 ifTrue:[ |
137 ascii < 16r800 ifTrue:[ |
133 errorReporter value:'overlong utf8 sequence' |
138 errorReporter value:'overlong utf8 sequence' |
134 ]. |
139 ]. |
135 ] ifFalse:[ |
140 ] ifFalse:[ |
136 (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[ |
141 (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[ |
137 "/ 10000 .. 1FFFFF |
142 "/ 10000 .. 1FFFFF |
138 ascii := (byte bitAnd:2r00000111). |
143 ascii := (byte bitAnd:2r00000111). |
139 next6Bits value. |
144 next6Bits value. |
140 next6Bits value. |
145 next6Bits value. |
141 next6Bits value. |
146 next6Bits value. |
142 ascii > 16rFF ifTrue:[ |
147 ascii > 16rFF ifTrue:[ |
143 ascii > 16rFFFF ifTrue:[ |
148 ascii > 16rFFFF ifTrue:[ |
144 nBitsRequired := nBitsRequired max:32 |
149 nBitsRequired := nBitsRequired max:32 |
145 ] ifFalse:[ |
150 ] ifFalse:[ |
146 nBitsRequired := nBitsRequired max:16 |
151 nBitsRequired := nBitsRequired max:16 |
147 ] |
152 ] |
148 ]. |
153 ]. |
149 ascii < 16r10000 ifTrue:[ |
154 ascii < 16r10000 ifTrue:[ |
150 errorReporter value:'overlong utf8 sequence' |
155 errorReporter value:'overlong utf8 sequence' |
151 ]. |
156 ]. |
152 ] ifFalse:[ |
157 ] ifFalse:[ |
153 (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[ |
158 (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[ |
154 "/ 200000 .. 3FFFFFF |
159 "/ 200000 .. 3FFFFFF |
155 ascii := (byte bitAnd:2r00000011). |
160 ascii := (byte bitAnd:2r00000011). |
156 next6Bits value. |
161 next6Bits value. |
157 next6Bits value. |
162 next6Bits value. |
158 next6Bits value. |
163 next6Bits value. |
159 next6Bits value. |
164 next6Bits value. |
160 ascii > 16rFF ifTrue:[ |
165 ascii > 16rFF ifTrue:[ |
161 ascii > 16rFFFF ifTrue:[ |
166 ascii > 16rFFFF ifTrue:[ |
162 nBitsRequired := nBitsRequired max:32 |
167 nBitsRequired := nBitsRequired max:32 |
163 ] ifFalse:[ |
168 ] ifFalse:[ |
164 nBitsRequired := nBitsRequired max:16 |
169 nBitsRequired := nBitsRequired max:16 |
165 ] |
170 ] |
166 ]. |
171 ]. |
167 ascii < 200000 ifTrue:[ |
172 ascii < 200000 ifTrue:[ |
168 errorReporter value:'overlong utf8 sequence' |
173 errorReporter value:'overlong utf8 sequence' |
169 ]. |
174 ]. |
170 ] ifFalse:[ |
175 ] ifFalse:[ |
171 (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[ |
176 (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[ |
172 "/ 4000000 .. 7FFFFFFF |
177 "/ 4000000 .. 7FFFFFFF |
173 ascii := (byte bitAnd:2r00000001). |
178 ascii := (byte bitAnd:2r00000001). |
174 next6Bits value. |
179 next6Bits value. |
175 next6Bits value. |
180 next6Bits value. |
176 next6Bits value. |
181 next6Bits value. |
177 next6Bits value. |
182 next6Bits value. |
178 last6Bits value. |
183 last6Bits value. |
179 ascii > 16rFF ifTrue:[ |
184 ascii > 16rFF ifTrue:[ |
180 ascii > 16rFFFF ifTrue:[ |
185 ascii > 16rFFFF ifTrue:[ |
181 nBitsRequired := nBitsRequired max:32 |
186 nBitsRequired := nBitsRequired max:32 |
182 ] ifFalse:[ |
187 ] ifFalse:[ |
183 nBitsRequired := nBitsRequired max:16 |
188 nBitsRequired := nBitsRequired max:16 |
184 ] |
189 ] |
185 ]. |
190 ]. |
186 ascii < 16r4000000 ifTrue:[ |
191 ascii < 16r4000000 ifTrue:[ |
187 errorReporter value:'overlong utf8 sequence' |
192 errorReporter value:'overlong utf8 sequence' |
188 ]. |
193 ]. |
189 ] ifFalse:[ |
194 ] ifFalse:[ |
190 errorReporter value:'invalid utf8 encoding' |
195 errorReporter value:'invalid utf8 encoding' |
191 ] |
196 ] |
192 ] |
197 ] |
193 ] |
198 ] |
194 ] |
199 ] |
195 ]. |
200 ]. |
196 ]. |
201 ]. |
197 sz := sz + 1. |
202 sz := sz + 1. |
198 ]. |
203 ]. |
199 nBitsRequired == 8 ifTrue:[ |
204 nBitsRequired == 8 ifTrue:[ |
200 anyAbove7BitAscii ifFalse:[ |
205 anyAbove7BitAscii ifFalse:[ |
201 "/ can return the original string |
206 "/ can return the original string |
202 aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection]. |
207 aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection]. |
203 ]. |
208 ]. |
204 newString := String uninitializedNew:sz |
209 newString := String uninitializedNew:sz |
205 ] ifFalse:[ |
210 ] ifFalse:[ |
206 nBitsRequired <= 16 ifTrue:[ |
211 nBitsRequired <= 16 ifTrue:[ |
207 newString := Unicode16String new:sz |
212 newString := Unicode16String new:sz |
208 ] ifFalse:[ |
213 ] ifFalse:[ |
209 newString := Unicode32String new:sz |
214 newString := Unicode32String new:sz |
210 ] |
215 ] |
211 ]. |
216 ]. |
212 |
217 |
213 next6Bits := [ |
218 next6Bits := [ |
214 |byte| |
219 |byte| |
215 |
220 |
216 byte := s nextByte. |
221 byte := s nextByte. |
217 ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111). |
222 ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111). |
218 ]. |
223 ]. |
219 |
224 |
220 s := aStringOrByteCollection readStream. |
225 s := aStringOrByteCollection readStream. |
221 idx := 1. |
226 idx := 1. |
222 [s atEnd] whileFalse:[ |
227 [s atEnd] whileFalse:[ |
223 byte := ascii := s nextByte. |
228 byte := ascii := s nextByte. |
224 (byte bitAnd:2r10000000) ~~ 0 ifTrue:[ |
229 (byte bitAnd:2r10000000) ~~ 0 ifTrue:[ |
225 (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[ |
230 (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[ |
226 ascii := (byte bitAnd:2r00011111). |
231 ascii := (byte bitAnd:2r00011111). |
227 next6Bits value. |
232 next6Bits value. |
228 ] ifFalse:[ |
233 ] ifFalse:[ |
229 (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[ |
234 (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[ |
230 ascii := (byte bitAnd:2r00001111). |
235 ascii := (byte bitAnd:2r00001111). |
231 next6Bits value. |
236 next6Bits value. |
232 next6Bits value. |
237 next6Bits value. |
233 ] ifFalse:[ |
238 ] ifFalse:[ |
234 (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[ |
239 (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[ |
235 ascii := (byte bitAnd:2r00000111). |
240 ascii := (byte bitAnd:2r00000111). |
236 next6Bits value. |
241 next6Bits value. |
237 next6Bits value. |
242 next6Bits value. |
238 next6Bits value. |
243 next6Bits value. |
239 ] ifFalse:[ |
244 ] ifFalse:[ |
240 (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[ |
245 (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[ |
241 ascii := (byte bitAnd:2r00000011). |
246 ascii := (byte bitAnd:2r00000011). |
242 next6Bits value. |
247 next6Bits value. |
243 next6Bits value. |
248 next6Bits value. |
244 next6Bits value. |
249 next6Bits value. |
245 next6Bits value. |
250 next6Bits value. |
246 ] ifFalse:[ |
251 ] ifFalse:[ |
247 (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[ |
252 (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[ |
248 ascii := (byte bitAnd:2r00000001). |
253 ascii := (byte bitAnd:2r00000001). |
249 next6Bits value. |
254 next6Bits value. |
250 next6Bits value. |
255 next6Bits value. |
251 next6Bits value. |
256 next6Bits value. |
252 next6Bits value. |
257 next6Bits value. |
253 last6Bits value. |
258 last6Bits value. |
254 ] |
259 ] |
255 ] |
260 ] |
256 ] |
261 ] |
257 ] |
262 ] |
258 ]. |
263 ]. |
259 ]. |
264 ]. |
260 newString at:idx put:(Character value:ascii). |
265 newString at:idx put:(Character value:ascii). |
261 idx := idx + 1. |
266 idx := idx + 1. |
262 ]. |
267 ]. |
263 ^ newString |
268 ^ newString |
264 |
269 |
265 " |
270 " |
266 CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ] |
271 CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ] |
289 not for being used inside ST/X. |
294 not for being used inside ST/X. |
290 |
295 |
291 If you work a lot with utf8 encoded textFiles, |
296 If you work a lot with utf8 encoded textFiles, |
292 this is a first-class candidate for a primitive." |
297 this is a first-class candidate for a primitive." |
293 |
298 |
294 |s anyAbove7BitAscii| |
299 |s| |
295 |
300 |
296 anyAbove7BitAscii := false. |
301 "/ avoid creation of new strings |
|
302 aUnicodeString contains8BitCharacters ifFalse:[^ aUnicodeString]. |
|
303 |
297 s := WriteStream on:(String uninitializedNew:aUnicodeString size). |
304 s := WriteStream on:(String uninitializedNew:aUnicodeString size). |
298 aUnicodeString do:[:eachCharacter | |
305 aUnicodeString do:[:eachCharacter | |
299 |codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"| |
306 |codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"| |
300 |
307 |
301 codePoint := eachCharacter codePoint. |
308 codePoint := eachCharacter codePoint. |
302 codePoint <= 16r7F ifTrue:[ |
309 codePoint <= 16r7F ifTrue:[ |
303 s nextPut:eachCharacter. |
310 s nextPut:eachCharacter. |
304 ] ifFalse:[ |
311 ] ifFalse:[ |
305 anyAbove7BitAscii := true. |
312 b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000). |
306 b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000). |
313 v := codePoint bitShift:-6. |
307 v := codePoint bitShift:-6. |
314 v <= 16r1F ifTrue:[ |
308 v <= 16r1F ifTrue:[ |
315 s nextPut:(Character value:(v bitOr:2r11000000)). |
309 s nextPut:(Character value:(v bitOr:2r11000000)). |
316 s nextPut:b1. |
310 s nextPut:b1. |
317 ] ifFalse:[ |
311 ] ifFalse:[ |
318 b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000). |
312 b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000). |
319 v := v bitShift:-6. |
313 v := v bitShift:-6. |
320 v <= 16r0F ifTrue:[ |
314 v <= 16r0F ifTrue:[ |
321 s nextPut:(Character value:(v bitOr:2r11100000)). |
315 s nextPut:(Character value:(v bitOr:2r11100000)). |
322 s nextPut:b2; nextPut:b1. |
316 s nextPut:b2; nextPut:b1. |
323 ] ifFalse:[ |
317 ] ifFalse:[ |
324 b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000). |
318 b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000). |
325 v := v bitShift:-6. |
319 v := v bitShift:-6. |
326 v <= 16r07 ifTrue:[ |
320 v <= 16r07 ifTrue:[ |
327 s nextPut:(Character value:(v bitOr:2r11110000)). |
321 s nextPut:(Character value:(v bitOr:2r11110000)). |
328 s nextPut:b3; nextPut:b2; nextPut:b1. |
322 s nextPut:b3; nextPut:b2; nextPut:b1. |
329 ] ifFalse:[ |
323 ] ifFalse:[ |
330 b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000). |
324 b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000). |
331 v := v bitShift:-6. |
325 v := v bitShift:-6. |
332 v <= 16r03 ifTrue:[ |
326 v <= 16r03 ifTrue:[ |
333 s nextPut:(Character value:(v bitOr:2r11111000)). |
327 s nextPut:(Character value:(v bitOr:2r11111000)). |
334 s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1. |
328 s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1. |
335 ] ifFalse:[ |
329 ] ifFalse:[ |
336 b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000). |
330 b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000). |
337 v := v bitShift:-6. |
331 v := v bitShift:-6. |
338 v <= 16r01 ifTrue:[ |
332 v <= 16r01 ifTrue:[ |
339 s nextPut:(Character value:(v bitOr:2r11111100)). |
333 s nextPut:(Character value:(v bitOr:2r11111100)). |
340 s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1. |
334 s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1. |
341 ] ifFalse:[ |
335 ] ifFalse:[ |
342 "/ cannot happen - we only support up to 30 bit characters |
336 "/ cannot happen - we only support up to 30 bit characters |
343 self error:'ascii value > 31bit in utf8Encode'. |
337 self error:'ascii value > 31bit in utf8Encode'. |
344 ] |
338 ] |
345 ]. |
339 ]. |
346 ]. |
340 ]. |
347 ]. |
341 ]. |
348 ]. |
342 ]. |
349 ]. |
343 ]. |
350 ]. |
344 ]. |
351 |
345 |
|
346 anyAbove7BitAscii ifFalse:[^ aUnicodeString]. "/ avoid creation of new strings |
|
347 ^ s contents |
352 ^ s contents |
348 |
353 |
349 " |
354 " |
350 (self encodeString:'hello') asByteArray #[104 101 108 108 111] |
355 (self encodeString:'hello') asByteArray #[104 101 108 108 111] |
351 (self encodeString:(Character value:16r40) asString) asByteArray #[64] |
356 (self encodeString:(Character value:16r40) asString) asByteArray #[64] |