|
1 "{ Encoding: utf8 }" |
|
2 |
|
3 "{ Package: 'stx:libbasic' }" |
|
4 |
|
5 "{ NameSpace: CharacterEncoderImplementations }" |
|
6 |
|
"Class definition: ISO10646_to_UTF8 is a subclass of TwoByteEncoder which converts between unicode (ISO 10646) strings and their utf8 representation; it declares no instance variables, class variables or pool dictionaries of its own." |
|
7 TwoByteEncoder subclass:#ISO10646_to_UTF8 |
|
8 instanceVariableNames:'' |
|
9 classVariableNames:'' |
|
10 poolDictionaries:'' |
|
11 category:'Collections-Text-Encodings' |
|
12 ! |
|
13 |
|
14 !ISO10646_to_UTF8 class methodsFor:'documentation'! |
|
15 |
|
16 examples |
|
17 " |
|
18 Encoding (unicode to utf8) |
|
19 ISO10646_to_UTF8 encodeString:'hello'. |
|
20 ISO10646_to_UTF8 encodeString:'ÄÖÜ'. |
|
21 |
|
22 Decoding (utf8 to unicode): |
|
23 |
|
24 |t| |
|
25 |
|
26 t := ISO10646_to_UTF8 encodeString:'ÄÖÜ'. |
|
27 ISO10646_to_UTF8 decodeString:t. |
|
28 NOTE(review): the sample literal in the two encodeString: examples was mis-encoded in this copy of the file; 'ÄÖÜ' is a presumed reconstruction - confirm against the repository. Any string with a character above 16r7F exercises the multi-byte path. |
|
29 " |
|
30 ! ! |
|
31 |
|
32 !ISO10646_to_UTF8 class methodsFor:'queries'! |
|
33 |
|
34 nameOfEncoding |
|
35 "Answer the name of this encoding, the symbol #utf8. (The encoder itself converts utf8 into unicode and vice versa.)" |
|
36 |
|
37 ^ #'utf8' |
|
38 ! |
|
39 |
|
40 namesOfEncoding |
|
41 "Answer a literal array with the names under which this encoding is known: 'utf8' and 'utf-8'. (The encoder itself converts utf8 into unicode and vice versa.)" |
|
42 |
|
43 ^ #( 'utf8' 'utf-8' ) |
|
44 ! ! |
|
45 |
|
46 !ISO10646_to_UTF8 methodsFor:'encoding & decoding'! |
|
47 |
|
48 decode:aCode |
|
"Deliberately unsupported (sends shouldNotImplement): utf8 is a multi-byte encoding, so a single code cannot be decoded on its own. Whole strings are decoded by decodeString:." |
|
49 self shouldNotImplement "/ no single byte conversion possible |
|
50 ! |
|
51 |
|
decodeString:aStringOrByteCollection
    "Decode utf8 encoded data into a string of unicode characters.

     Argument:
        aStringOrByteCollection - the utf8 data; it is read byte-wise
                                  through #readStream / #nextByte.
     Returns:
        the argument itself, if it is a String which contains nothing but
        7bit ascii; otherwise a new String, Unicode16String or
        Unicode32String - the narrowest one which can hold every decoded
        character.
     Errors:
        a DecodingError is raised (with the argument as parameter) for
            - a sequence which is cut off by the end of the input,
            - an overlong sequence (a strict utf8 decoder does not allow those),
            - a lead byte which starts no 2..6 byte sequence,
            - a character which does not fit into 30 bits.
        Continuation bytes are NOT checked for the 10xxxxxx bit pattern;
        only their low 6 bits are used.

     The work is done in two passes over the input: the first validates,
     counts the characters and finds the widest one; the second fills the
     result string.

     Only useful, when reading from external sources.
     This only handles up-to 30bit characters.

     If you work a lot with utf8 encoded textFiles,
     this is a first-class candidate for a primitive."

    |sz anyAbove7BitAscii nBitsRequired
     ascii "{ Class: SmallInteger }"
     byte  "{ Class: SmallInteger }"
     s newString idx next6Bits last6Bits noteCharacter
     errorReporter|

    errorReporter := [:msg | DecodingError raiseWith:aStringOrByteCollection errorString:msg].

    "/ pass 1 helper: shift the 6 payload bits of the next continuation byte into ascii;
    "/ leaves the method if the input ends in the middle of a sequence.
    next6Bits := [
        |continuation|

        continuation := s nextByte.
        continuation isNil ifTrue:[^ errorReporter value:'short utf8 string'].
        ascii := (ascii bitShift:6) bitOr:(continuation bitAnd:2r00111111).
    ].

    "/ same as next6Bits, but for the final byte of a 6 byte sequence (used in both passes):
    "/ the result may exceed what a character can hold, so it is range-checked and clamped.
    last6Bits := [
        |continuation a|

        continuation := s nextByte.
        continuation isNil ifTrue:[^ errorReporter value:'short utf8 string'].
        a := (ascii bitShift:6) bitOr:(continuation bitAnd:2r00111111).
        (a > 16r3FFFFFFF) ifTrue:[
            "/ ST/X can only represent 30 bit unicode characters.
            errorReporter value:'unicode character out of range'.
            a := 16r3FFFFFFF.
        ].
        ascii := a.
    ].

    "/ pass 1 helper, evaluated once per completed multi-byte sequence:
    "/ remember the widest character seen so far, then reject the sequence if the
    "/ decoded value is below the smallest value which needs that many bytes.
    noteCharacter := [:smallestValueOfThisLength |
        ascii > 16rFF ifTrue:[
            nBitsRequired := nBitsRequired max:(ascii > 16rFFFF ifTrue:[32] ifFalse:[16])
        ].
        "/ a strict utf8 decoder does not allow overlong sequences
        ascii < smallestValueOfThisLength ifTrue:[
            errorReporter value:'overlong utf8 sequence'
        ].
    ].

    "/ pass 1: validate, count characters (sz) and determine the required character width
    nBitsRequired := 8.
    anyAbove7BitAscii := false.
    sz := 0.
    s := aStringOrByteCollection readStream.
    [s atEnd] whileFalse:[
        byte := ascii := s nextByte.
        (byte bitAnd:16r80) ~~ 0 ifTrue:[
            anyAbove7BitAscii := true.
            (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
                "/ 110xxxxx - 2 bytes: 80 .. 7FF
                ascii := (byte bitAnd:2r00011111).
                next6Bits value.
                noteCharacter value:16r80.
            ] ifFalse:[
                (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
                    "/ 1110xxxx - 3 bytes: 800 .. FFFF
                    ascii := (byte bitAnd:2r00001111).
                    next6Bits value.
                    next6Bits value.
                    noteCharacter value:16r800.
                ] ifFalse:[
                    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
                        "/ 11110xxx - 4 bytes: 10000 .. 1FFFFF
                        ascii := (byte bitAnd:2r00000111).
                        next6Bits value.
                        next6Bits value.
                        next6Bits value.
                        noteCharacter value:16r10000.
                    ] ifFalse:[
                        (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
                            "/ 111110xx - 5 bytes: 200000 .. 3FFFFFF
                            ascii := (byte bitAnd:2r00000011).
                            next6Bits value.
                            next6Bits value.
                            next6Bits value.
                            next6Bits value.
                            "/ the bound is hexadecimal, like the range above
                            "/ (it used to be written as decimal 200000, which let
                            "/ overlong encodings of 200000 .. 16r1FFFFF slip through)
                            noteCharacter value:16r200000.
                        ] ifFalse:[
                            (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
                                "/ 1111110x - 6 bytes: 4000000 .. 7FFFFFFF
                                ascii := (byte bitAnd:2r00000001).
                                next6Bits value.
                                next6Bits value.
                                next6Bits value.
                                next6Bits value.
                                last6Bits value.
                                noteCharacter value:16r4000000.
                            ] ifFalse:[
                                errorReporter value:'invalid utf8 encoding'
                            ]
                        ]
                    ]
                ]
            ].
        ].
        sz := sz + 1.
    ].

    "/ allocate the narrowest string which can hold all characters
    nBitsRequired == 8 ifTrue:[
        anyAbove7BitAscii ifFalse:[
            "/ can return the original string
            aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
        ].
        newString := String uninitializedNew:sz
    ] ifFalse:[
        nBitsRequired <= 16 ifTrue:[
            newString := Unicode16String new:sz
        ] ifFalse:[
            newString := Unicode32String new:sz
        ]
    ].

    "/ pass 2 helper: the input has been checked by pass 1 already,
    "/ so the end-of-input test is left out here.
    next6Bits := [
        |continuation|

        continuation := s nextByte.
        ascii := (ascii bitShift:6) bitOr:(continuation bitAnd:2r00111111).
    ].

    "/ pass 2: decode again, this time storing the characters
    s := aStringOrByteCollection readStream.
    idx := 1.
    [s atEnd] whileFalse:[
        byte := ascii := s nextByte.
        (byte bitAnd:2r10000000) ~~ 0 ifTrue:[
            (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
                ascii := (byte bitAnd:2r00011111).
                next6Bits value.
            ] ifFalse:[
                (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
                    ascii := (byte bitAnd:2r00001111).
                    next6Bits value.
                    next6Bits value.
                ] ifFalse:[
                    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
                        ascii := (byte bitAnd:2r00000111).
                        next6Bits value.
                        next6Bits value.
                        next6Bits value.
                    ] ifFalse:[
                        (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
                            ascii := (byte bitAnd:2r00000011).
                            next6Bits value.
                            next6Bits value.
                            next6Bits value.
                            next6Bits value.
                        ] ifFalse:[
                            (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
                                ascii := (byte bitAnd:2r00000001).
                                next6Bits value.
                                next6Bits value.
                                next6Bits value.
                                next6Bits value.
                                last6Bits value.
                            ]
                        ]
                    ]
                ]
            ].
        ].
        newString at:idx put:(Character value:ascii).
        idx := idx + 1.
    ].
    ^ newString

    "
     CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]
     CharacterArray fromUTF8Bytes:#[ 16rC1 16r02 ]
     CharacterArray fromUTF8Bytes:#[ 16rE0 16r81 16r02 ]
     CharacterArray fromUTF8Bytes:#[ 16rEF 16rBF 16rBF ]

     rfc2279 examples:
     CharacterArray fromUTF8Bytes:#[ 16r41 16rE2 16r89 16rA2 16rCE 16r91 16r2E ]
     CharacterArray fromUTF8Bytes:#[ 16rED 16r95 16r9C 16rEA 16rB5 16rAD 16rEC 16r96 16rB4 ]
     CharacterArray fromUTF8Bytes:#[ 16rE6 16r97 16rA5 16rE6 16r9C 16rAC 16rE8 16rAA 16r9E ]

     invalid:
     CharacterArray fromUTF8Bytes:#[ 16rC0 16r80 ]
     CharacterArray fromUTF8Bytes:#[ 16rE0 16r80 16r80 ]

     invalid (overlong 5 byte encoding of 16r100000):
     self new decodeString:#[ 16rF8 16r84 16r80 16r80 16r80 ]
    "
|
268 ! |
|
269 |
|
270 encode:aCode |
|
"Deliberately unsupported (sends shouldNotImplement): utf8 is a multi-byte encoding, so a single code cannot be encoded on its own. Whole strings are encoded by encodeString:." |
|
271 self shouldNotImplement "/ no single byte conversion possible |
|
272 ! |
|
273 |
|
encodeString:aUnicodeString
    "Answer the utf8 representation of aUnicodeString, as a String which
     holds one character per encoded byte.
     If no character of aUnicodeString is above 16r7F, aUnicodeString
     itself is answered and no new string is created.
     Characters up to 31 bits are encoded into 2 .. 6 bytes; anything
     beyond that is reported via #error:.

     The resulting string is only useful to be stored on some external file,
     not for being used inside ST/X.

     If you work a lot with utf8 encoded textFiles,
     this is a first-class candidate for a primitive."

    |out sawNonAscii upperBounds leadTags|

    "/ element n describes sequences with n continuation bytes:
    "/ the largest encodable value, and the tag bits of the lead byte
    upperBounds := #( 16r7FF 16rFFFF 16r1FFFFF 16r3FFFFFF 16r7FFFFFFF ).
    leadTags := #( 2r11000000 2r11100000 2r11110000 2r11111000 2r11111100 ).

    sawNonAscii := false.
    out := (String uninitializedNew:aUnicodeString size) writeStream.
    aUnicodeString do:[:eachCharacter |
        |code nTail|

        code := eachCharacter asciiValue.
        code <= 16r7F ifTrue:[
            "/ 7bit ascii is passed through unchanged
            out nextPut:eachCharacter.
        ] ifFalse:[
            sawNonAscii := true.
            nTail := upperBounds findFirst:[:limit | code <= limit].
            nTail == 0 ifTrue:[
                "/ cannot happen - we only support up to 30 bit characters
                self error:'ascii value > 31bit in utf8Encode'.
            ] ifFalse:[
                "/ lead byte: tag bits plus the topmost payload bits
                out nextPut:(Character value:((code bitShift:(-6 * nTail)) bitOr:(leadTags at:nTail))).
                "/ continuation bytes: 6 payload bits each, most significant group first
                (nTail - 1) to:0 by:-1 do:[:group |
                    out nextPut:(Character value:(((code bitShift:(-6 * group)) bitAnd:16r3F) bitOr:2r10000000)).
                ].
            ].
        ].
    ].

    sawNonAscii ifFalse:[^ aUnicodeString].   "/ avoid creation of new strings
    ^ out contents

    "
     (self encodeString:'hello') asByteArray                             #[104 101 108 108 111]
     (self encodeString:(Character value:16r40) asString) asByteArray    #[64]
     (self encodeString:(Character value:16r7F) asString) asByteArray    #[127]
     (self encodeString:(Character value:16r80) asString) asByteArray    #[194 128]
     (self encodeString:(Character value:16rFF) asString) asByteArray    #[195 191]
     (self encodeString:(Character value:16r100) asString) asByteArray   #[196 128]
     (self encodeString:(Character value:16r200) asString) asByteArray   #[200 128]
     (self encodeString:(Character value:16r400) asString) asByteArray   #[208 128]
     (self encodeString:(Character value:16r800) asString) asByteArray   #[224 160 128]
     (self encodeString:(Character value:16r1000) asString) asByteArray  #[225 128 128]
     (self encodeString:(Character value:16r2000) asString) asByteArray  #[226 128 128]
     (self encodeString:(Character value:16r4000) asString) asByteArray  #[228 128 128]
     (self encodeString:(Character value:16r8000) asString) asByteArray  #[232 128 128]
     (self encodeString:(Character value:16rFFFF) asString) asByteArray  #[239 191 191]
    "
|
353 ! ! |
|
354 |
|
355 !ISO10646_to_UTF8 class methodsFor:'documentation'! |
|
356 |
|
357 version |
|
"Answer the CVS header string (file path, revision, date and author) recorded for this source file." |
|
358 ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8.st,v 1.1 2004-03-05 17:18:03 cg Exp $' |
|
359 ! ! |