CharacterArray.st
changeset 6810 a055eb19e9d3
parent 6809 72fee17c14b1
child 6834 eba4b58b8692
--- a/CharacterArray.st	Tue Oct 22 20:00:37 2002 +0200
+++ b/CharacterArray.st	Tue Oct 22 20:14:36 2002 +0200
@@ -157,7 +157,8 @@
     "return a new string which represents the characters as decoded
      from the utf8 encoded bytes, aByteCollection.
      Returns either a normal String, or a TwoByteString instance.
-     Only useful, when reading twoByteStrings from external sources."
+     Only useful when reading twoByteStrings from external sources.
+     Only characters of up to 16 bits are handled."
 
     |sz nBitsRequired ascii s byte newString idx|
 
@@ -173,7 +174,10 @@
                 ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
                 ascii > 16rFF ifTrue:[
                     nBitsRequired := nBitsRequired max:16
-                ]
+                ].
+                ascii <= 16r7F ifTrue:[
+                    self error:'invalid utf encoding'.
+                ].
             ] ifFalse:[
                 (byte bitAnd:16rF0) == 16rE0 ifTrue:[
                     ascii := (byte bitAnd:16r0F).
@@ -183,9 +187,12 @@
                     ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
                     ascii > 16rFF ifTrue:[
                         nBitsRequired := nBitsRequired max:16
-                    ]
+                    ].
+                    ascii <= 16r7FF ifTrue:[
+                        self error:'invalid utf encoding'.
+                    ].
                 ] ifFalse:[
-                    self error:'bad utf encoding'.
+                    self error:'bad/unsupported utf encoding'.
                     ^ nil.
                 ]
             ].
@@ -224,7 +231,16 @@
      CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]      
      CharacterArray fromUTF8Bytes:#[ 16rC1 16r02 ]      
      CharacterArray fromUTF8Bytes:#[ 16rE0 16r81 16r02 ]      
-     CharacterArray fromUTF8Bytes:#[ 16rEF 16rBF 16rBF ]      
+     CharacterArray fromUTF8Bytes:#[ 16rEF 16rBF 16rBF ]
+
+   rfc2279 examples:
+     CharacterArray fromUTF8Bytes:#[ 16r41 16rE2 16r89 16rA2 16rCE 16r91 16r2E ]      
+     CharacterArray fromUTF8Bytes:#[ 16rED 16r95 16r9C 16rEA 16rB5 16rAD 16rEC 16r96 16rB4 ]      
+     CharacterArray fromUTF8Bytes:#[ 16rE6 16r97 16rA5 16rE6 16r9C 16rAC 16rE8 16rAA 16r9E ]      
+
+   invalid:
+     CharacterArray fromUTF8Bytes:#[ 16rC0 16r80 ]      
+     CharacterArray fromUTF8Bytes:#[ 16rE0 16r80 16r80 ]      
     "
 !
 
@@ -4878,7 +4894,7 @@
         c := Character utf8DecodeFrom:in.
         is16Bit ifFalse:[
             c asciiValue > 16rFF ifTrue:[
-                out := WriteStream with:(out contents asTwoByteString).
+                out := WriteStream with:(UnicodeString fromString:out contents).
                 is16Bit := true.
             ].
         ].
@@ -6211,7 +6227,7 @@
 !CharacterArray class methodsFor:'documentation'!
 
 version
-    ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.228 2002-10-22 18:00:37 cg Exp $'
+    ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.229 2002-10-22 18:14:36 cg Exp $'
 ! !
 
 CharacterArray initialize!