RegressionTests__CharacterEncoderTests.st
changeset 1699 c2daf79e7ebc
parent 1549 12b505414801
child 2151 0c7cd20dd232
equal deleted inserted replaced
1698:4dfaff79690e 1699:c2daf79e7ebc
       
     1 "{ Encoding: utf8 }"
       
     2 
     1 "{ Package: 'stx:goodies/regression' }"
     3 "{ Package: 'stx:goodies/regression' }"
     2 
     4 
     3 "{ NameSpace: RegressionTests }"
     5 "{ NameSpace: RegressionTests }"
     4 
     6 
     5 TestCase subclass:#CharacterEncoderTests
     7 TestCase subclass:#CharacterEncoderTests
   189 
   191 
   190     encoder := CharacterEncoder encoderToEncodeFrom:#unicode into:#utf8.
   192     encoder := CharacterEncoder encoderToEncodeFrom:#unicode into:#utf8.
   191 
   193 
   192     "/ reversibility
   194     "/ reversibility
   193     (0 to:16r1FFFF) do:[:eachCodePoint |
   195     (0 to:16r1FFFF) do:[:eachCodePoint |
   194 	|s1 s2 s3|
   196         |s1 s2 s3|
   195 
   197 
   196 	s1 := (Character value:eachCodePoint) asString.
   198         s1 := (Character value:eachCodePoint) asString.
   197 	s2 := encoder encodeString:s1.
   199         s2 := encoder encodeString:s1.
   198 	s3 := encoder decodeString:s2.
   200         s3 := encoder decodeString:s2.
   199 	self assert:(s1 = s3).
   201         self assert:(s1 = s3).
   200     ].
   202     ].
   201 
   203 
   202     "/ 00 .. 7F -> 0xxxxxxx
   204     "/ 00 .. 7F -> 0xxxxxxx
   203     #[16r00 16r01 16r02 16r04 16r08 16r10 16r20 16r40
   205     #[16r00 16r01 16r02 16r04 16r08 16r10 16r20 16r40
   204 			16r03 16r07 16r0F 16r1F 16r3F 16r7F]
   206                         16r03 16r07 16r0F 16r1F 16r3F 16r7F]
   205     do:[:eachCodePoint |
   207     do:[:eachCodePoint |
   206 	|s1 s2 s3|
   208         |s1 s2 s3|
   207 
   209 
   208 	s1 := (Character value:eachCodePoint) asString.
   210         s1 := (Character value:eachCodePoint) asString.
   209 	s2 := encoder encodeString:s1.
   211         s2 := encoder encodeString:s1.
   210 	s3 := encoder decodeString:s2.
   212         s3 := encoder decodeString:s2.
   211 	self assert:(s1 = s2).
   213         self assert:(s1 = s2).
   212 	self assert:(s2 = s3).
   214         self assert:(s2 = s3).
   213 	self assert:(s2 size == 1).
   215         self assert:(s2 size == 1).
   214     ].
   216     ].
   215 
   217 
   216     "/ 80 .. 7FF -> 110xxxxx 10xxxxxx
   218     "/ 80 .. 7FF -> 110xxxxx 10xxxxxx
   217     #(16r80 16r100 16r200 16r400
   219     #(16r80 16r100 16r200 16r400
   218 	    16r0FF 16r1FF 16r3FF 16r7FF)
   220             16r0FF 16r1FF 16r3FF 16r7FF)
   219     do:[:eachCodePoint |
   221     do:[:eachCodePoint |
   220 	|s1 s2 s3|
   222         |s1 s2 s3|
   221 
   223 
   222 	s1 := (Character value:eachCodePoint) asString.
   224         s1 := (Character value:eachCodePoint) asString.
   223 	s2 := encoder encodeString:s1.
   225         s2 := encoder encodeString:s1.
   224 	self assert:(s2 size == 2).
   226         self assert:(s2 size == 2).
   225 	self assert:((s2 first codePoint bitAnd:2r11100000) == 2r11000000).
   227         self assert:((s2 first codePoint bitAnd:2r11100000) == 2r11000000).
   226 	self assert:((s2 second codePoint bitAnd:2r11000000) == 2r10000000).
   228         self assert:((s2 second codePoint bitAnd:2r11000000) == 2r10000000).
   227 	s3 := encoder decodeString:s2.
   229         s3 := encoder decodeString:s2.
   228 	self assert:(s1 = s3).
   230         self assert:(s1 = s3).
   229     ].
   231     ].
   230 
   232 
   231     "/ 800 .. FFFF -> 1110xxxx 10xxxxxx 10xxxxxx
   233     "/ 800 .. FFFF -> 1110xxxx 10xxxxxx 10xxxxxx
   232     #(16r800 16r1000 16r2000 16r4000 16r8000
   234     #(16r800 16r1000 16r2000 16r4000 16r8000
   233 	     16r0FFF 16r1FFF 16r3FFF 16r7FFF 16rFFFF)
   235              16r0FFF 16r1FFF 16r3FFF 16r7FFF 16rFFFF)
   234     do:[:eachCodePoint |
   236     do:[:eachCodePoint |
   235 	|s1 s2 s3|
   237         |s1 s2 s3|
   236 
   238 
   237 	s1 := (Character value:eachCodePoint) asString.
   239         s1 := (Character value:eachCodePoint) asString.
   238 	s2 := encoder encodeString:s1.
   240         s2 := encoder encodeString:s1.
   239 	self assert:(s2 size == 3).
   241         self assert:(s2 size == 3).
   240 	self assert:((s2 first codePoint bitAnd:2r11110000) == 2r11100000).
   242         self assert:((s2 first codePoint bitAnd:2r11110000) == 2r11100000).
   241 	self assert:((s2 second codePoint bitAnd:2r11000000) == 2r10000000).
   243         self assert:((s2 second codePoint bitAnd:2r11000000) == 2r10000000).
   242 	self assert:((s2 third codePoint bitAnd:2r11000000) == 2r10000000).
   244         self assert:((s2 third codePoint bitAnd:2r11000000) == 2r10000000).
   243 	s3 := encoder decodeString:s2.
   245         s3 := encoder decodeString:s2.
   244 	self assert:(s1 = s3).
   246         self assert:(s1 = s3).
   245     ].
   247     ].
   246 
   248 
   247     "/ 10000 .. 1FFFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   249     "/ 10000 .. 1FFFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   248     #(16r10000 16r20000 16r40000 16r80000 16r10000
   250     #(16r10000 16r20000 16r40000 16r80000 16r10000
   249 	       16r1FFFF 16r3FFFF 16r7FFFF 16rFFFFF 16r1FFFFF)
   251                16r1FFFF 16r3FFFF 16r7FFFF 16rFFFFF 16r1FFFFF)
   250     do:[:eachCodePoint |
   252     do:[:eachCodePoint |
   251 	|s1 s2 s3|
   253         |s1 s2 s3|
   252 
   254 
   253 	s1 := (Character value:eachCodePoint) asString.
   255         s1 := (Character value:eachCodePoint) asString.
   254 	s2 := encoder encodeString:s1.
   256         s2 := encoder encodeString:s1.
   255 	self assert:(s2 size == 4).
   257         self assert:(s2 size == 4).
   256 	self assert:((s2 first codePoint bitAnd:2r11111000) == 2r11110000).
   258         self assert:((s2 first codePoint bitAnd:2r11111000) == 2r11110000).
   257 	self assert:((s2 second codePoint bitAnd:2r11000000) == 2r10000000).
   259         self assert:((s2 second codePoint bitAnd:2r11000000) == 2r10000000).
   258 	self assert:((s2 third codePoint bitAnd:2r11000000) == 2r10000000).
   260         self assert:((s2 third codePoint bitAnd:2r11000000) == 2r10000000).
   259 	self assert:((s2 fourth codePoint bitAnd:2r11000000) == 2r10000000).
   261         self assert:((s2 fourth codePoint bitAnd:2r11000000) == 2r10000000).
   260 	s3 := encoder decodeString:s2.
   262         s3 := encoder decodeString:s2.
   261 	self assert:(s1 = s3).
   263         self assert:(s1 = s3).
   262     ].
   264     ].
   263 
   265 
   264     "/ 200000 .. 3FFFFFF -> 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   266     "/ 200000 .. 3FFFFFF -> 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   265     #(16r200000 16r400000 16r800000 16r1000000 16r2000000
   267     #(16r200000 16r400000 16r800000 16r1000000 16r2000000
   266 		16r3FFFFF 16r7FFFFF 16r0FFFFFF 16r1FFFFFF 16r3FFFFFF)
   268                 16r3FFFFF 16r7FFFFF 16r0FFFFFF 16r1FFFFFF 16r3FFFFFF)
   267     do:[:eachCodePoint |
   269     do:[:eachCodePoint |
   268 	|s1 s2 s3|
   270         |s1 s2 s3|
   269 
   271 
   270 	s1 := (Character value:eachCodePoint) asString.
   272         s1 := (Character value:eachCodePoint) asString.
   271 	s2 := encoder encodeString:s1.
   273         s2 := encoder encodeString:s1.
   272 	self assert:(s2 size == 5).
   274         self assert:(s2 size == 5).
   273 	self assert:((s2 first codePoint bitAnd:2r11111100) == 2r11111000).
   275         self assert:((s2 first codePoint bitAnd:2r11111100) == 2r11111000).
   274 	self assert:((s2 second codePoint bitAnd:2r11000000) == 2r10000000).
   276         self assert:((s2 second codePoint bitAnd:2r11000000) == 2r10000000).
   275 	self assert:((s2 third codePoint bitAnd:2r11000000) == 2r10000000).
   277         self assert:((s2 third codePoint bitAnd:2r11000000) == 2r10000000).
   276 	self assert:((s2 fourth codePoint bitAnd:2r11000000) == 2r10000000).
   278         self assert:((s2 fourth codePoint bitAnd:2r11000000) == 2r10000000).
   277 	self assert:((s2 fifth codePoint bitAnd:2r11000000) == 2r10000000).
   279         self assert:((s2 fifth codePoint bitAnd:2r11000000) == 2r10000000).
   278 	s3 := encoder decodeString:s2.
   280         s3 := encoder decodeString:s2.
   279 	self assert:(s1 = s3).
   281         self assert:(s1 = s3).
   280     ].
   282     ].
   281 
   283 
   282     "/ ST/X limitation: only 30 bit integers (to avoid largeInteger codePoint)
   284     "/ ST/X limitation: only 30 bit integers (to avoid largeInteger codePoint)
   283 
   285 
   284     "/ 4000000 .. 7FFFFFFF -> 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   286     "/ 4000000 .. 7FFFFFFF -> 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   285     #(16r4000000 16r8000000 16r10000000 16r20000000 "16r40000000"
   287     #(16r4000000 16r8000000 16r10000000 16r20000000 "16r40000000"
   286 		 16r7FFFFFF 16r0FFFFFFF 16r1FFFFFFF "16r3FFFFFFF 16r7FFFFFFF")
   288                  16r7FFFFFF 16r0FFFFFFF 16r1FFFFFFF "16r3FFFFFFF 16r7FFFFFFF")
   287     do:[:eachCodePoint |
   289     do:[:eachCodePoint |
   288 	|s1 s2 s3|
   290         |s1 s2 s3|
   289 
   291 
   290 	s1 := (Character value:eachCodePoint) asString.
   292         s1 := (Character value:eachCodePoint) asString.
   291 	s2 := encoder encodeString:s1.
   293         s2 := encoder encodeString:s1.
   292 	self assert:(s2 size == 6).
   294         self assert:(s2 size == 6).
   293 	self assert:((s2 first codePoint bitAnd:2r11111110) == 2r11111100).
   295         self assert:((s2 first codePoint bitAnd:2r11111110) == 2r11111100).
   294 	self assert:((s2 second codePoint bitAnd:2r11000000) == 2r10000000).
   296         self assert:((s2 second codePoint bitAnd:2r11000000) == 2r10000000).
   295 	self assert:((s2 third codePoint bitAnd:2r11000000) == 2r10000000).
   297         self assert:((s2 third codePoint bitAnd:2r11000000) == 2r10000000).
   296 	self assert:((s2 fourth codePoint bitAnd:2r11000000) == 2r10000000).
   298         self assert:((s2 fourth codePoint bitAnd:2r11000000) == 2r10000000).
   297 	self assert:((s2 fifth codePoint bitAnd:2r11000000) == 2r10000000).
   299         self assert:((s2 fifth codePoint bitAnd:2r11000000) == 2r10000000).
   298 	self assert:((s2 sixth codePoint bitAnd:2r11000000) == 2r10000000).
   300         self assert:((s2 sixth codePoint bitAnd:2r11000000) == 2r10000000).
   299 	s3 := encoder decodeString:s2.
   301         s3 := encoder decodeString:s2.
   300 	self assert:(s1 = s3).
   302         self assert:(s1 = s3).
   301     ].
   303     ].
   302 
   304 
   303     "
   305     "
   304      self new testUTF8
   306      self new testUTF8
       
   307     "
       
   308 !
       
   309 
       
    310 testUTF8_2
       
    311     |encoder s|
       
    312     "/ Pins the exact UTF-8 bytes produced for code points at and around the 4-, 5- and 6-byte length boundaries; selected results are also decoded back.
       
    313     encoder := CharacterEncoder encoderToEncodeFrom:#unicode into:#utf8.
       
    314     "/ 16r1FFFFF is the largest value encoded in 4 bytes; s keeps the encoded string for the decode check on the next line
       
    315     self assert:(s := encoder encodeString:(Character value:16r1FFFFF)) asByteArray = #[16rF7 16rBF 16rBF 16rBF ].
       
    316     self assert:( encoder decodeString:s ) first = (Character value:16r1FFFFF).
       
    317     "/ 16r200000 is the first value that needs 5 bytes (lead byte 16rF8); it is round-tripped as well
       
    318     self assert:(s := encoder encodeString:(Character value:16r200000)) asByteArray = #[16rF8 16r88 16r80 16r80 16r80 ].
       
    319     self assert:( encoder decodeString:s ) first = (Character value:16r200000).
       
    320     "/ encode-only checks stepping through the 5-byte range (no decode round trip for these)
       
    321     self assert:( encoder encodeString:(Character value:16r2FFFFF)) asByteArray = #[16rF8 16r8B 16rBF 16rBF 16rBF ].
       
    322     self assert:( encoder encodeString:(Character value:16r3FFFFF)) asByteArray = #[16rF8 16r8F 16rBF 16rBF 16rBF ].
       
    323     self assert:( encoder encodeString:(Character value:16r4FFFFF)) asByteArray = #[16rF8 16r93 16rBF 16rBF 16rBF ].
       
    324     self assert:( encoder encodeString:(Character value:16r5FFFFF)) asByteArray = #[16rF8 16r97 16rBF 16rBF 16rBF ].
       
    325     self assert:( encoder encodeString:(Character value:16r6FFFFF)) asByteArray = #[16rF8 16r9B 16rBF 16rBF 16rBF ].
       
    326     self assert:( encoder encodeString:(Character value:16r7FFFFF)) asByteArray = #[16rF8 16r9F 16rBF 16rBF 16rBF ].
       
    327     self assert:( encoder encodeString:(Character value:16r800000)) asByteArray = #[16rF8 16rA0 16r80 16r80 16r80 ].
       
    328     "/ 16r3FFFFFF is the last 5-byte value, 16r4000000 the first 6-byte one (lead byte 16rFC); then two larger 6-byte values
       
    329     self assert:( encoder encodeString:(Character value:16r3FFFFFF)) asByteArray = #[16rFB 16rBF 16rBF 16rBF 16rBF ].
       
    330     self assert:( encoder encodeString:(Character value:16r4000000)) asByteArray = #[16rFC 16r84 16r80 16r80 16r80 16r80 ].
       
    331     self assert:( encoder encodeString:(Character value:16r7FFFFFF)) asByteArray = #[16rFC 16r87 16rBF 16rBF 16rBF 16rBF ].
       
    332     self assert:( encoder encodeString:(Character value:16rFFFFFFF)) asByteArray = #[16rFC 16r8F 16rBF 16rBF 16rBF 16rBF ].
       
    333     "/ 16r3FFFFFFF (30 bits) is the largest code point asserted in this test; it is also decoded back
       
    334     self assert:(s := encoder encodeString:(Character value:16r3FFFFFFF)) asByteArray = #[16rFC 16rBF 16rBF 16rBF 16rBF 16rBF ].
       
    335     self assert:( encoder decodeString:s ) first = (Character value:16r3FFFFFFF).
       
    336 
       
    337     "/ STX <<only>> supports characters up to 31 bit, so the check for 16r7FFFFFFF below is left disabled. NOTE(review): 16r3FFFFFFF (30 bits) is the largest value asserted above - confirm the exact limit.
       
    338     "/ self assert:( encoder encodeString:(Character value:16r7FFFFFFF)) asByteArray = #[16rFD 16rBF 16rBF 16rBF 16rBF 16rBF ].
       
    339 
       
    340 
       
    341     "
       
    342      self new testUTF8_2
    305     "
    343     "
   306 ! !
   344 ! !
   307 
   345 
   308 !CharacterEncoderTests class methodsFor:'documentation'!
   346 !CharacterEncoderTests class methodsFor:'documentation'!
   309 
   347