Character.st
branchjv
changeset 18610 b9799e74a9c5
parent 18608 7d521f25267c
parent 18605 4f0a114fed00
child 18678 a9b30d72dff9
equal deleted inserted replaced
18609:af1c36c18e24 18610:b9799e74a9c5
     1 "{ Encoding: utf8 }"
       
     2 
       
     3 "
     1 "
     4  COPYRIGHT (c) 1988 by Claus Gittinger
     2  COPYRIGHT (c) 1988 by Claus Gittinger
     5 	      All Rights Reserved
     3 	      All Rights Reserved
     6 
     4 
     7  This software is furnished under a license and may be used
     5  This software is furnished under a license and may be used
   196 !
   194 !
   197 
   195 
   198 utf8DecodeFrom:aStream
   196 utf8DecodeFrom:aStream
   199     "read and return a single unicode character from an UTF8 encoded stream"
   197     "read and return a single unicode character from an UTF8 encoded stream"
   200 
   198 
   201     |fetchNext c1 c2 c3 c4 c5 codePoint|
   199     |fetchNext c1 c2 codePoint|
   202 
   200 
   203     c1 := aStream next.
   201     c1 := aStream next.
   204     codePoint := c1 codePoint.
   202     codePoint := c1 codePoint.
   205     codePoint <= 16r7F ifTrue:[
   203     codePoint <= 16r7F ifTrue:[
   206 	"/ 0xxxxxxx - 7 bits
   204         "/ 0xxxxxxx - 7 bits
   207 	^ c1.
   205         ^ c1 asCharacter.
   208     ].
   206     ].
   209 
   207 
   210     (codePoint bitAnd:2r11000000) == 2r10000000 ifTrue:[
   208     (codePoint bitAnd:2r11000000) == 2r10000000 ifTrue:[
   211 	"/ out of sync (got an intermediate character)
   209         "/ out of sync (got an intermediate character)
   212 	InvalidEncodingError raiseRequestWith:codePoint errorString:' - out of sync'.
   210         InvalidEncodingError raiseRequestWith:codePoint errorString:' - out of sync'.
   213 	^ c1.
   211         ^ c1 asCharacter.
   214     ].
   212     ].
   215 
   213 
   216     fetchNext := [  |ch|
   214     fetchNext := [  |code|
   217 		    ch := aStream next.
   215                     code := aStream next codePoint.
   218 		    (ch codePoint bitAnd:2r11000000) == 2r10000000 ifFalse:[
   216                     (code bitAnd:2r11000000) == 2r10000000 ifFalse:[
   219 			"/ followup chars must have 2r10 in high bits
   217                         "/ followup chars must have 2r10 in high bits
   220 			InvalidEncodingError raiseRequestWith:ch codePoint.
   218                         InvalidEncodingError raiseRequestWith:code.
   221 			^ c1.
   219                         ^ c1 asCharacter.
   222 		    ].
   220                     ].
   223 		    ch
   221                     code bitAnd:16r3F
   224 		 ].
   222                  ].
   225 
   223 
   226     (codePoint bitAnd:2r11100000) == 2r11000000 ifTrue:[
   224     (codePoint bitAnd:2r11100000) == 2r11000000 ifTrue:[
   227 	"/ 110xxxxx 10xxxxxx - 11 bits
   225         "/ 110xxxxx 10xxxxxx - 11 bits
   228 	c2 := fetchNext value.
   226         codePoint := codePoint bitAnd:16r1F.
   229 	codePoint := c1 codePoint bitAnd:16r1F.
   227         codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
   230 	codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
   228         codePoint <= 16r7F ifTrue:[
   231 	codePoint <= 16r7F ifTrue:[
   229             InvalidEncodingError raiseRequestWith:codePoint.
   232 	    InvalidEncodingError raiseRequestWith:codePoint.
   230         ].
   233 	].
   231         ^ Character codePoint:codePoint
   234 	^ Character codePoint:codePoint
       
   235     ].
   232     ].
   236     (codePoint bitAnd:2r11110000) == 2r11100000 ifTrue:[
   233     (codePoint bitAnd:2r11110000) == 2r11100000 ifTrue:[
   237 	"/ 1110xxxx 10xxxxxx 10xxxxxx - 16 bits
   234         "/ 1110xxxx 10xxxxxx 10xxxxxx - 16 bits
   238 	c2 := fetchNext value.
   235         codePoint := codePoint bitAnd:16r0F.
   239 	c3 := fetchNext value.
   236         codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
   240 	codePoint := c1 codePoint bitAnd:16r0F.
   237         codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
   241 	codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
   238         codePoint <= 16r7FF ifTrue:[
   242 	codePoint := (codePoint bitShift:6) bitOr:(c3 codePoint bitAnd:16r3F).
   239             InvalidEncodingError raiseRequestWith:codePoint.
   243 	codePoint <= 16r7FF ifTrue:[
   240         ].
   244 	    InvalidEncodingError raiseRequestWith:codePoint.
   241         ^ Character codePoint:codePoint
   245 	].
   242     ].
   246 	^ Character codePoint:codePoint
       
   247     ].
       
   248 
       
   249     "/ notice: currently, characters can only have 16bit encoding;
       
   250     "/ therefore the following will raise a runtime exception,
       
   251 
   243 
   252     (codePoint bitAnd:2r11111000) == 2r11110000 ifTrue:[
   244     (codePoint bitAnd:2r11111000) == 2r11110000 ifTrue:[
   253 	"/ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21 bits
   245         "/ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21 bits
   254 	c2 := fetchNext value.
   246         codePoint := codePoint bitAnd:16r07.
   255 	c3 := fetchNext value.
   247         codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
   256 	c4 := fetchNext value.
   248         codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
   257 	codePoint := c1 codePoint bitAnd:16r07.
   249         codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
   258 	codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
   250         codePoint <= 16rFFFF ifTrue:[
   259 	codePoint := (codePoint bitShift:6) bitOr:(c3 codePoint bitAnd:16r3F).
   251             InvalidEncodingError raiseRequestWith:codePoint.
   260 	codePoint := (codePoint bitShift:6) bitOr:(c4 codePoint bitAnd:16r3F).
   252         ].
   261 	codePoint <= 16rFFFF ifTrue:[
   253         ^ Character codePoint:codePoint
   262 	    InvalidEncodingError raiseRequestWith:codePoint.
       
   263 	].
       
   264 	^ Character codePoint:codePoint
       
   265     ].
   254     ].
   266 
   255 
   267     (codePoint bitAnd:2r11111100) == 2r11111000 ifTrue:[
   256     (codePoint bitAnd:2r11111100) == 2r11111000 ifTrue:[
   268 	"/ 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26 bits
   257         "/ 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26 bits
   269 	c2 := fetchNext value.
   258         codePoint := codePoint bitAnd:16r03.
   270 	c3 := fetchNext value.
   259         codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
   271 	c4 := fetchNext value.
   260         codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
   272 	c5 := fetchNext value.
   261         codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
   273 	codePoint := c1 codePoint bitAnd:16r03.
   262         codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
   274 	codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
   263         codePoint <= 16r1FFFFF ifTrue:[
   275 	codePoint := (codePoint bitShift:6) bitOr:(c3 codePoint bitAnd:16r3F).
   264             InvalidEncodingError raiseRequestWith:codePoint.
   276 	codePoint := (codePoint bitShift:6) bitOr:(c4 codePoint bitAnd:16r3F).
   265         ].
   277 	codePoint := (codePoint bitShift:6) bitOr:(c5 codePoint bitAnd:16r3F).
   266         ^ Character codePoint:codePoint
   278 	codePoint <= 16r1FFFFF ifTrue:[
       
   279 	    InvalidEncodingError raiseRequestWith:codePoint.
       
   280 	].
       
   281 	^ Character codePoint:codePoint
       
   282     ].
   267     ].
   283 
   268 
   284     (codePoint bitAnd:2r11111110) == 2r11111100 ifTrue:[
   269     (codePoint bitAnd:2r11111110) == 2r11111100 ifTrue:[
   285 	"/ 1111110x ... 10xxxxxx - any number of bits
   270         "/ 1111110x ... 10xxxxxx - any number of bits
   286 	codePoint := c1 codePoint bitAnd:16r01.
   271         codePoint := codePoint bitAnd:16r01.
   287 
   272 
   288 	c2 := aStream peek.
   273         c2 := aStream peek.
   289 	[c2 notNil and:[(c2 codePoint bitAnd:2r11000000) == 2r10000000]] whileTrue:[
   274         [c2 notNil and:[(c2 codePoint bitAnd:2r11000000) == 2r10000000]] whileTrue:[
   290 	    codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
   275             codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
   291 	    aStream next.
   276             aStream next.
   292 	    c2 := aStream peek.
   277             c2 := aStream peek.
   293 	].
   278         ].
   294 	codePoint <= 16r3FFFFFF ifTrue:[
   279         codePoint <= 16r3FFFFFF ifTrue:[
   295 	    InvalidEncodingError raiseRequestWith:codePoint.
   280             InvalidEncodingError raiseRequestWith:codePoint.
   296 	].
   281         ].
   297 	^ Character codePoint:codePoint
   282         ^ Character codePoint:codePoint
   298     ].
   283     ].
   299 
   284 
   300     InvalidEncodingError raiseRequestWith:codePoint.
   285     InvalidEncodingError raiseRequestWith:codePoint.
   301     ^ c1
   286     ^ c1 asCharacter.
   302 
   287 
   303     "
   288     "
   304       Character utf8DecodeFrom:'a' readStream
   289       Character utf8DecodeFrom:'a' readStream
   305       Character utf8DecodeFrom:#[195 188] asString readStream
   290       Character utf8DecodeFrom:#[195 188] asString readStream
   306     "
   291     "
   308     "test:
   293     "test:
   309 
   294 
   310       |utf8Encoding original readBack|
   295       |utf8Encoding original readBack|
   311 
   296 
   312       1 to:16rFFFF do:[:codePoint |
   297       1 to:16rFFFF do:[:codePoint |
   313 	original := Character value:codePoint.
   298         original := Character value:codePoint.
   314 	utf8Encoding := original asString utf8Encoded.
   299         utf8Encoding := original utf8Encoded.
   315 	readBack := Character utf8DecodeFrom:(utf8Encoding readStream).
   300         readBack := Character utf8DecodeFrom:(utf8Encoding readStream).
   316 	readBack codePoint = codePoint ifFalse:[
   301         readBack codePoint = codePoint ifFalse:[
   317 	    self halt
   302             self halt
   318 	]
   303         ]
   319       ]
   304       ]
   320     "
   305     "
   321 !
   306 !
   322 
   307 
   323 value:anInteger
   308 value:anInteger
  1470     s := WriteStream on:(String new:6).
  1455     s := WriteStream on:(String new:6).
  1471     s nextPutUtf8:self.
  1456     s nextPutUtf8:self.
  1472     ^ s contents
  1457     ^ s contents
  1473 
  1458 
  1474     "
  1459     "
  1475 	'ä' utf8Encoded
  1460 	'ä' utf8Encoded
  1476     "
  1461     "
  1477 ! !
  1462 ! !
  1478 
  1463 
  1479 !Character methodsFor:'copying'!
  1464 !Character methodsFor:'copying'!
  1480 
  1465 
  2532     RETURN (__MKUCHARACTER(val)) ;
  2517     RETURN (__MKUCHARACTER(val)) ;
  2533 %}
  2518 %}
  2534 
  2519 
  2535     "
  2520     "
  2536      $e asNonDiacritical
  2521      $e asNonDiacritical
  2537      $é asNonDiacritical
  2522      $é asNonDiacritical
  2538      $ä asNonDiacritical
  2523      $ä asNonDiacritical
  2539      $Ã¥ asNonDiacritical
  2524      $å asNonDiacritical
  2540     "
  2525     "
  2541 !
  2526 !
  2542 
  2527 
  2543 isNationalAlphaNumeric
  2528 isNationalAlphaNumeric
  2544     "return true, if the receiver is a letter or digit.
  2529     "return true, if the receiver is a letter or digit.