--- a/CharacterArray.st Thu Feb 05 16:04:27 2004 +0100
+++ b/CharacterArray.st Thu Feb 05 23:20:31 2004 +0100
@@ -160,72 +160,7 @@
Only useful, when reading twoByteStrings from external sources.
This only handles up-to 16bit characters"
- |sz nBitsRequired ascii s byte newString idx|
-
- nBitsRequired := 8.
- sz := 0.
- s := aByteCollection readStream.
- [s atEnd] whileFalse:[
- byte := ascii := s nextByte.
- (byte bitAnd:16r80) ~~ 0 ifTrue:[
- (byte bitAnd:16rE0) == 16rC0 ifTrue:[
- ascii := (byte bitAnd:16r1F).
- byte := s nextByte.
- ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
- ascii > 16rFF ifTrue:[
- nBitsRequired := nBitsRequired max:16
- ].
- ascii <= 16r7F ifTrue:[
-"/ self error:'invalid utf encoding' mayProceed:true.
- ].
- ] ifFalse:[
- (byte bitAnd:16rF0) == 16rE0 ifTrue:[
- ascii := (byte bitAnd:16r0F).
- byte := s nextByte.
- ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
- byte := s nextByte.
- ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
- ascii > 16rFF ifTrue:[
- nBitsRequired := nBitsRequired max:16
- ].
- ascii <= 16r7FF ifTrue:[
- self error:'invalid utf encoding'.
- ].
- ] ifFalse:[
- self error:'bad/unsupported utf encoding'.
- ^ nil.
- ]
- ].
- ].
- sz := sz + 1.
- ].
- nBitsRequired == 8 ifTrue:[
- newString := String uninitializedNew:sz
- ] ifFalse:[
- newString := UnicodeString new:sz
- ].
-
- s := aByteCollection readStream.
- idx := 1.
- [s atEnd] whileFalse:[
- byte := ascii := s nextByte.
- (byte bitAnd:16r80) ~~ 0 ifTrue:[
- (byte bitAnd:16rE0) == 16rC0 ifTrue:[
- ascii := (byte bitAnd:16r1F).
- byte := s nextByte.
- ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
- ] ifFalse:[
- ascii := (byte bitAnd:16r0F).
- byte := s nextByte.
- ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
- byte := s nextByte.
- ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
- ].
- ].
- newString at:idx put:(Character value:ascii).
- idx := idx + 1.
- ].
- ^ newString
+ ^ self decodeFromUTF8:aByteCollection.
"
CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]
@@ -1410,6 +1345,98 @@
"Modified: 4.7.1997 / 11:01:22 / cg"
!
+decodeFromUTF8:aStringOrByteCollection
+    "return a string holding the characters decoded from the utf8 encoded
+     bytes in aStringOrByteCollection (a String or a byte collection).
+     Returns a String if all decoded characters fit into 8 bits,
+     a UnicodeString otherwise. A String argument is returned as-is only
+     if it contains no byte with the high bit set, i.e. if there is nothing to decode.
+     Only useful, when reading twoByteStrings from external sources.
+     This only handles up-to 16bit characters (1-, 2- and 3-byte sequences);
+     any other lead byte raises an error, and nil is returned if execution
+     continues after that error."
+
+    |sz nBitsRequired anyMultiByte ascii s byte newString idx|
+
+    "/ pass 1: count the decoded characters and find the required character width
+    nBitsRequired := 8.
+    anyMultiByte := false.
+    sz := 0.
+    s := aStringOrByteCollection readStream.
+    [s atEnd] whileFalse:[
+        byte := ascii := s nextByte.
+        (byte bitAnd:16r80) ~~ 0 ifTrue:[
+            anyMultiByte := true.
+            (byte bitAnd:16rE0) == 16rC0 ifTrue:[
+                "/ 2-byte sequence: 110xxxxx 10xxxxxx
+                ascii := (byte bitAnd:16r1F).
+                byte := s nextByte.
+                ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
+                ascii > 16rFF ifTrue:[
+                    nBitsRequired := nBitsRequired max:16
+                ].
+                ascii <= 16r7F ifTrue:[
+                    "/ overlong 2-byte encoding; the check below is currently disabled
+"/                  self error:'invalid utf encoding' mayProceed:true.
+                ].
+            ] ifFalse:[
+                (byte bitAnd:16rF0) == 16rE0 ifTrue:[
+                    "/ 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+                    ascii := (byte bitAnd:16r0F).
+                    byte := s nextByte.
+                    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
+                    byte := s nextByte.
+                    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
+                    ascii > 16rFF ifTrue:[
+                        nBitsRequired := nBitsRequired max:16
+                    ].
+                    ascii <= 16r7FF ifTrue:[
+                        "/ overlong encoding (the value would have fit into fewer bytes)
+                        self error:'invalid utf encoding'.
+                    ].
+                ] ifFalse:[
+                    "/ neither a 2-byte nor a 3-byte lead byte
+                    self error:'bad/unsupported utf encoding'.
+                    ^ nil.
+                ]
+            ].
+        ].
+        sz := sz + 1.
+    ].
+
+    nBitsRequired == 8 ifTrue:[
+        "/ the argument itself may only be returned if nothing had to be decoded:
+        "/ a sequence such as C3 A9 decodes to the single 8bit character 16rE9,
+        "/ so fitting into 8 bits does not imply that the input is already decoded.
+        (anyMultiByte not and:[aStringOrByteCollection isString]) ifTrue:[
+            ^ aStringOrByteCollection
+        ].
+        newString := String uninitializedNew:sz
+    ] ifFalse:[
+        newString := UnicodeString new:sz
+    ].
+
+    "/ pass 2: decode again, this time storing the characters.
+    "/ Unsupported lead bytes made pass 1 return, so a byte with the high bit set
+    "/ which is not a 2-byte lead is treated as a 3-byte lead here.
+    s := aStringOrByteCollection readStream.
+    idx := 1.
+    [s atEnd] whileFalse:[
+        byte := ascii := s nextByte.
+        (byte bitAnd:16r80) ~~ 0 ifTrue:[
+            (byte bitAnd:16rE0) == 16rC0 ifTrue:[
+                ascii := (byte bitAnd:16r1F).
+                byte := s nextByte.
+                ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
+            ] ifFalse:[
+                ascii := (byte bitAnd:16r0F).
+                byte := s nextByte.
+                ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
+                byte := s nextByte.
+                ascii := (ascii bitShift:6) bitOr:(byte bitAnd:16r3F).
+            ].
+        ].
+        newString at:idx put:(Character value:ascii).
+        idx := idx + 1.
+    ].
+    ^ newString
+
+    "
+     CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]
+     CharacterArray fromUTF8Bytes:#[ 16rC1 16r02 ]
+     CharacterArray fromUTF8Bytes:#[ 16rE0 16r81 16r02 ]
+     CharacterArray fromUTF8Bytes:#[ 16rEF 16rBF 16rBF ]
+
+     rfc2279 examples:
+     CharacterArray fromUTF8Bytes:#[ 16r41 16rE2 16r89 16rA2 16rCE 16r91 16r2E ]
+     CharacterArray fromUTF8Bytes:#[ 16rED 16r95 16r9C 16rEA 16rB5 16rAD 16rEC 16r96 16rB4 ]
+     CharacterArray fromUTF8Bytes:#[ 16rE6 16r97 16rA5 16rE6 16r9C 16rAC 16rE8 16rAA 16r9E ]
+
+     a String argument holding a multi-byte sequence is decoded (1 character, 16rE9):
+     CharacterArray decodeFromUTF8:(String with:(Character value:16rC3) with:(Character value:16rA9))
+
+     invalid:
+     CharacterArray fromUTF8Bytes:#[ 16rC0 16r80 ]
+     CharacterArray fromUTF8Bytes:#[ 16rE0 16r80 16r80 ]
+    "
+!
+
encodeIntoBIG5:aBIG5String
"return a new string with aBIG5Strings characters as BIG5 encoded 16bit string,
The argument must be a BIG5String.
@@ -6586,7 +6613,7 @@
!CharacterArray class methodsFor:'documentation'!
version
"answer the CVS header string identifying the repository path and revision of this source file"
- ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.264 2003-12-05 13:04:48 cg Exp $'
+ ^ '$Header: /cvs/stx/stx/libbasic/CharacterArray.st,v 1.265 2004-02-05 22:20:31 cg Exp $'
! !
CharacterArray initialize!