CharacterArray.st
changeset 7876 56ab739f1843
parent 7797 49e53638191f
child 7877 bef56f06d2f8
--- a/CharacterArray.st	Thu Feb 05 16:04:27 2004 +0100
+++ b/CharacterArray.st	Thu Feb 05 23:20:31 2004 +0100
@@ -160,72 +160,7 @@
      Only useful, when reading twoByteStrings from external sources.
      This only handles up-to 16bit characters"
 
-    |sz nBitsRequired ascii s byte newString idx|
-
-    nBitsRequired := 8.
-    sz := 0.
-    s := aByteCollection readStream.
-    [s atEnd] whileFalse:[
-        byte := ascii := s nextByte.
-        (byte bitAnd:16r80) ~~ 0 ifTrue:[
-            (byte bitAnd:16rE0) == 16rC0 ifTrue:[
-                ascii := (byte bitAnd:16r1F).
-                byte := s nextByte.
-                ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
-                ascii > 16rFF ifTrue:[
-                    nBitsRequired := nBitsRequired max:16
-                ].
-                ascii <= 16r7F ifTrue:[
-"/                    self error:'invalid utf encoding' mayProceed:true.
-                ].
-            ] ifFalse:[
-                (byte bitAnd:16rF0) == 16rE0 ifTrue:[
-                    ascii := (byte bitAnd:16r0F).
-                    byte := s nextByte.
-                    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
-                    byte := s nextByte.
-                    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
-                    ascii > 16rFF ifTrue:[
-                        nBitsRequired := nBitsRequired max:16
-                    ].
-                    ascii <= 16r7FF ifTrue:[
-                        self error:'invalid utf encoding'.
-                    ].
-                ] ifFalse:[
-                    self error:'bad/unsupported utf encoding'.
-                    ^ nil.
-                ]
-            ].
-        ].
-        sz := sz + 1.
-    ].
-    nBitsRequired == 8 ifTrue:[
-        newString := String uninitializedNew:sz
-    ] ifFalse:[
-        newString := UnicodeString new:sz
-    ].
-
-    s := aByteCollection readStream.
-    idx := 1.
-    [s atEnd] whileFalse:[
-        byte := ascii := s nextByte.
-        (byte bitAnd:16r80) ~~ 0 ifTrue:[
-            (byte bitAnd:16rE0) == 16rC0 ifTrue:[
-                ascii := (byte bitAnd:16r1F).
-                byte := s nextByte.
-                ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
-            ] ifFalse:[
-                ascii := (byte bitAnd:16r0F).
-                byte := s nextByte.
-                ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
-                byte := s nextByte.
-                ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
-            ].
-        ].
-        newString at:idx put:(Character value:ascii).
-        idx := idx + 1.
-    ].
-    ^ newString
+    ^ self decodeFromUTF8:aByteCollection.
 
     "
      CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]      
@@ -1410,6 +1345,98 @@
     "Modified: 4.7.1997 / 11:01:22 / cg"
 !
 
+decodeFromUTF8:aStringOrByteCollection
+    "return a string holding the characters decoded from the utf8 encoded
+     bytes in aStringOrByteCollection (a String or a collection of bytes).
+     Answers a String if all decoded codes fit into 8 bits, a UnicodeString otherwise;
+     a String argument containing no multibyte sequence is returned unchanged.
+     Only 1-, 2- and 3-byte sequences (up-to 16bit characters) are handled;
+     any other lead byte raises an error, and nil is returned if that is proceeded."
+
+    |sz nBitsRequired ascii s byte newString idx|
+
+    nBitsRequired := 8.
+    sz := 0.
+    s := aStringOrByteCollection readStream.
+    [s atEnd] whileFalse:[
+        byte := ascii := s nextByte.
+        (byte bitAnd:16r80) ~~ 0 ifTrue:[
+            (byte bitAnd:16rE0) == 16rC0 ifTrue:[
+                ascii := (byte bitAnd:16r1F).
+                byte := s nextByte.
+                ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
+                ascii > 16rFF ifTrue:[
+                    nBitsRequired := nBitsRequired max:16
+                ].
+                "overlong 2-byte forms (code <= 16r7F) are accepted; that check is disabled"
+            ] ifFalse:[
+                (byte bitAnd:16rF0) == 16rE0 ifTrue:[
+                    ascii := (byte bitAnd:16r0F).
+                    byte := s nextByte.
+                    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
+                    byte := s nextByte.
+                    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
+                    ascii > 16rFF ifTrue:[
+                        nBitsRequired := nBitsRequired max:16
+                    ].
+                    ascii <= 16r7FF ifTrue:[
+                        self error:'invalid utf encoding'.
+                    ].
+                ] ifFalse:[
+                    self error:'bad/unsupported utf encoding'.
+                    ^ nil.
+                ]
+            ].
+        ].
+        sz := sz + 1.
+    ].
+    nBitsRequired == 8 ifTrue:[
+        "a string may be answered as-is only if it held no multibyte sequence (sz = its size)"
+        (aStringOrByteCollection isString and:[sz == aStringOrByteCollection size]) ifTrue:[^ aStringOrByteCollection].
+        newString := String uninitializedNew:sz
+    ] ifFalse:[
+        newString := UnicodeString new:sz
+    ].
+
+    s := aStringOrByteCollection readStream.
+    idx := 1.
+    [s atEnd] whileFalse:[
+        byte := ascii := s nextByte.
+        (byte bitAnd:16r80) ~~ 0 ifTrue:[
+            (byte bitAnd:16rE0) == 16rC0 ifTrue:[
+                ascii := (byte bitAnd:16r1F).
+                byte := s nextByte.
+                ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
+            ] ifFalse:[
+                ascii := (byte bitAnd:16r0F).
+                byte := s nextByte.
+                ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
+                byte := s nextByte.
+                ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
+            ].
+        ].
+        newString at:idx put:(Character value:ascii).
+        idx := idx + 1.
+    ].
+    ^ newString
+
+    "
+     CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]      
+     CharacterArray fromUTF8Bytes:#[ 16rC1 16r02 ]      
+     CharacterArray fromUTF8Bytes:#[ 16rE0 16r81 16r02 ]      
+     CharacterArray fromUTF8Bytes:#[ 16rEF 16rBF 16rBF ]
+
+   rfc2279 examples:
+     CharacterArray fromUTF8Bytes:#[ 16r41 16rE2 16r89 16rA2 16rCE 16r91 16r2E ]      
+     CharacterArray fromUTF8Bytes:#[ 16rED 16r95 16r9C 16rEA 16rB5 16rAD 16rEC 16r96 16rB4 ]      
+     CharacterArray fromUTF8Bytes:#[ 16rE6 16r97 16rA5 16rE6 16r9C 16rAC 16rE8 16rAA 16r9E ]      
+
+   invalid:
+     CharacterArray fromUTF8Bytes:#[ 16rC0 16r80 ]      
+     CharacterArray fromUTF8Bytes:#[ 16rE0 16r80 16r80 ]      
+    "
+!
+
 encodeIntoBIG5:aBIG5String
     "return a new string with aBIG5Strings characters as BIG5 encoded 16bit string,
      The argument must be a BIG5String.
@@ -6586,7 +6613,7 @@
 !CharacterArray class methodsFor:'documentation'!
 
 version
-    ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.264 2003-12-05 13:04:48 cg Exp $'
+    ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.265 2004-02-05 22:20:31 cg Exp $'
 ! !
 
 CharacterArray initialize!