196 ! |
194 ! |
197 |
195 |
198 utf8DecodeFrom:aStream |
196 utf8DecodeFrom:aStream |
199 "read and return a single unicode character from an UTF8 encoded stream" |
197 "read and return a single unicode character from an UTF8 encoded stream" |
"/ NOTE(review): this listing appears to interleave two revisions of the same method,
"/ row by row; every row starts with the line number it has in its own revision.
"/ Confirm which revision is the current one before changing any code here.
"/
"/ Argument: aStream - the stream to read from; only next and peek are sent to it.
"/ Returns: for the 7-bit case the element read (one revision sends asCharacter to it first);
"/ for all longer forms the result of Character codePoint: on the assembled value.
"/ Errors: malformed input is reported with InvalidEncodingError raiseRequestWith:.
"/ The statements after each raise still run if control comes back, so the request is
"/ presumably proceedable - TODO confirm against InvalidEncodingError.
200 |
198 |
201 |fetchNext c1 c2 c3 c4 c5 codePoint| |
199 |fetchNext c1 c2 codePoint| |
202 |
200 |
"/ the bit pattern of the first element decides how many follow-up elements are read below
203 c1 := aStream next. |
201 c1 := aStream next. |
204 codePoint := c1 codePoint. |
202 codePoint := c1 codePoint. |
205 codePoint <= 16r7F ifTrue:[ |
203 codePoint <= 16r7F ifTrue:[ |
206 "/ 0xxxxxxx - 7 bits |
204 "/ 0xxxxxxx - 7 bits |
207 ^ c1. |
205 ^ c1 asCharacter. |
208 ]. |
206 ]. |
209 |
207 |
210 (codePoint bitAnd:2r11000000) == 2r10000000 ifTrue:[ |
208 (codePoint bitAnd:2r11000000) == 2r10000000 ifTrue:[ |
211 "/ out of sync (got an intermediate character) |
209 "/ out of sync (got an intermediate character) |
212 InvalidEncodingError raiseRequestWith:codePoint errorString:' - out of sync'. |
210 InvalidEncodingError raiseRequestWith:codePoint errorString:' - out of sync'. |
213 ^ c1. |
211 ^ c1 asCharacter. |
214 ]. |
212 ]. |
215 |
213 |
"/ fetchNext reads one follow-up element and checks that its two top bits are 10.
"/ One revision answers the element itself (callers mask it with 16r3F),
"/ the other answers the already masked low 6 bits.
"/ The ^ inside the block returns from the whole method, not only from the block.
216 fetchNext := [ |ch| |
214 fetchNext := [ |code| |
217 ch := aStream next. |
215 code := aStream next codePoint. |
218 (ch codePoint bitAnd:2r11000000) == 2r10000000 ifFalse:[ |
216 (code bitAnd:2r11000000) == 2r10000000 ifFalse:[ |
219 "/ followup chars must have 2r10 in high bits |
217 "/ followup chars must have 2r10 in high bits |
220 InvalidEncodingError raiseRequestWith:ch codePoint. |
218 InvalidEncodingError raiseRequestWith:code. |
221 ^ c1. |
219 ^ c1 asCharacter. |
222 ]. |
220 ]. |
223 ch |
221 code bitAnd:16r3F |
224 ]. |
222 ]. |
225 |
223 |
"/ in each fixed-length case below, the payload bits of the first element are shifted left
"/ by 6 per follow-up element and or-ed with that element's low 6 bits; a result that is
"/ not above the limit tested afterwards (16r7F, 16r7FF, ...) is reported as invalid
226 (codePoint bitAnd:2r11100000) == 2r11000000 ifTrue:[ |
224 (codePoint bitAnd:2r11100000) == 2r11000000 ifTrue:[ |
227 "/ 110xxxxx 10xxxxxx - 11 bits |
225 "/ 110xxxxx 10xxxxxx - 11 bits |
228 c2 := fetchNext value. |
226 codePoint := codePoint bitAnd:16r1F. |
229 codePoint := c1 codePoint bitAnd:16r1F. |
227 codePoint := (codePoint bitShift:6) bitOr:(fetchNext value). |
230 codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F). |
228 codePoint <= 16r7F ifTrue:[ |
231 codePoint <= 16r7F ifTrue:[ |
229 InvalidEncodingError raiseRequestWith:codePoint. |
232 InvalidEncodingError raiseRequestWith:codePoint. |
230 ]. |
233 ]. |
231 ^ Character codePoint:codePoint |
234 ^ Character codePoint:codePoint |

235 ]. |
232 ]. |
236 (codePoint bitAnd:2r11110000) == 2r11100000 ifTrue:[ |
233 (codePoint bitAnd:2r11110000) == 2r11100000 ifTrue:[ |
237 "/ 1110xxxx 10xxxxxx 10xxxxxx - 16 bits |
234 "/ 1110xxxx 10xxxxxx 10xxxxxx - 16 bits |
238 c2 := fetchNext value. |
235 codePoint := codePoint bitAnd:16r0F. |
239 c3 := fetchNext value. |
236 codePoint := (codePoint bitShift:6) bitOr:(fetchNext value). |
240 codePoint := c1 codePoint bitAnd:16r0F. |
237 codePoint := (codePoint bitShift:6) bitOr:(fetchNext value). |
241 codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F). |
238 codePoint <= 16r7FF ifTrue:[ |
242 codePoint := (codePoint bitShift:6) bitOr:(c3 codePoint bitAnd:16r3F). |
239 InvalidEncodingError raiseRequestWith:codePoint. |
243 codePoint <= 16r7FF ifTrue:[ |
240 ]. |
244 InvalidEncodingError raiseRequestWith:codePoint. |
241 ^ Character codePoint:codePoint |
245 ]. |
242 ]. |
246 ^ Character codePoint:codePoint |

247 ]. |

248 |

249 "/ notice: currently, characters can only have 16bit encoding; |

250 "/ therefore the following will raise a runtime exception, |

251 |
243 |
252 (codePoint bitAnd:2r11111000) == 2r11110000 ifTrue:[ |
244 (codePoint bitAnd:2r11111000) == 2r11110000 ifTrue:[ |
253 "/ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21 bits |
245 "/ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21 bits |
254 c2 := fetchNext value. |
246 codePoint := codePoint bitAnd:16r07. |
255 c3 := fetchNext value. |
247 codePoint := (codePoint bitShift:6) bitOr:(fetchNext value). |
256 c4 := fetchNext value. |
248 codePoint := (codePoint bitShift:6) bitOr:(fetchNext value). |
257 codePoint := c1 codePoint bitAnd:16r07. |
249 codePoint := (codePoint bitShift:6) bitOr:(fetchNext value). |
258 codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F). |
250 codePoint <= 16rFFFF ifTrue:[ |
259 codePoint := (codePoint bitShift:6) bitOr:(c3 codePoint bitAnd:16r3F). |
251 InvalidEncodingError raiseRequestWith:codePoint. |
260 codePoint := (codePoint bitShift:6) bitOr:(c4 codePoint bitAnd:16r3F). |
252 ]. |
261 codePoint <= 16rFFFF ifTrue:[ |
253 ^ Character codePoint:codePoint |
262 InvalidEncodingError raiseRequestWith:codePoint. |

263 ]. |

264 ^ Character codePoint:codePoint |

265 ]. |
254 ]. |
266 |
255 |
267 (codePoint bitAnd:2r11111100) == 2r11111000 ifTrue:[ |
256 (codePoint bitAnd:2r11111100) == 2r11111000 ifTrue:[ |
268 "/ 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26 bits |
257 "/ 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26 bits |
269 c2 := fetchNext value. |
258 codePoint := codePoint bitAnd:16r03. |
270 c3 := fetchNext value. |
259 codePoint := (codePoint bitShift:6) bitOr:(fetchNext value). |
271 c4 := fetchNext value. |
260 codePoint := (codePoint bitShift:6) bitOr:(fetchNext value). |
272 c5 := fetchNext value. |
261 codePoint := (codePoint bitShift:6) bitOr:(fetchNext value). |
273 codePoint := c1 codePoint bitAnd:16r03. |
262 codePoint := (codePoint bitShift:6) bitOr:(fetchNext value). |
274 codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F). |
263 codePoint <= 16r1FFFFF ifTrue:[ |
275 codePoint := (codePoint bitShift:6) bitOr:(c3 codePoint bitAnd:16r3F). |
264 InvalidEncodingError raiseRequestWith:codePoint. |
276 codePoint := (codePoint bitShift:6) bitOr:(c4 codePoint bitAnd:16r3F). |
265 ]. |
277 codePoint := (codePoint bitShift:6) bitOr:(c5 codePoint bitAnd:16r3F). |
266 ^ Character codePoint:codePoint |
278 codePoint <= 16r1FFFFF ifTrue:[ |

279 InvalidEncodingError raiseRequestWith:codePoint. |

280 ]. |

281 ^ Character codePoint:codePoint |

282 ]. |
267 ]. |
283 |
268 |
"/ unlike the fixed-length cases above, this case does not use fetchNext: it keeps
"/ consuming elements via peek/next for as long as the next one carries the 10xxxxxx
"/ marker, and stops at the first element that does not (or when peek answers nil)
284 (codePoint bitAnd:2r11111110) == 2r11111100 ifTrue:[ |
269 (codePoint bitAnd:2r11111110) == 2r11111100 ifTrue:[ |
285 "/ 1111110x ... 10xxxxxx - any number of bits |
270 "/ 1111110x ... 10xxxxxx - any number of bits |
286 codePoint := c1 codePoint bitAnd:16r01. |
271 codePoint := codePoint bitAnd:16r01. |
287 |
272 |
288 c2 := aStream peek. |
273 c2 := aStream peek. |
289 [c2 notNil and:[(c2 codePoint bitAnd:2r11000000) == 2r10000000]] whileTrue:[ |
274 [c2 notNil and:[(c2 codePoint bitAnd:2r11000000) == 2r10000000]] whileTrue:[ |
290 codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F). |
275 codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F). |
291 aStream next. |
276 aStream next. |
292 c2 := aStream peek. |
277 c2 := aStream peek. |
293 ]. |
278 ]. |
294 codePoint <= 16r3FFFFFF ifTrue:[ |
279 codePoint <= 16r3FFFFFF ifTrue:[ |
295 InvalidEncodingError raiseRequestWith:codePoint. |
280 InvalidEncodingError raiseRequestWith:codePoint. |
296 ]. |
281 ]. |
297 ^ Character codePoint:codePoint |
282 ^ Character codePoint:codePoint |
298 ]. |
283 ]. |
299 |
284 |
"/ reached only when the first element matched none of the patterns tested above
300 InvalidEncodingError raiseRequestWith:codePoint. |
285 InvalidEncodingError raiseRequestWith:codePoint. |
301 ^ c1 |
286 ^ c1 asCharacter. |
302 |
287 |
303 " |
288 " |
304 Character utf8DecodeFrom:'a' readStream |
289 Character utf8DecodeFrom:'a' readStream |
305 Character utf8DecodeFrom:#[195 188] asString readStream |
290 Character utf8DecodeFrom:#[195 188] asString readStream |
306 " |
291 " |