--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/CharacterEncoderImplementations__ISO10646_to_UTF8.st Fri Mar 05 18:28:27 2004 +0100
@@ -0,0 +1,359 @@
+"{ Encoding: utf8 }"
+
+"{ Package: 'stx:libbasic' }"
+
+"{ NameSpace: CharacterEncoderImplementations }"
+
+TwoByteEncoder subclass:#ISO10646_to_UTF8
+ instanceVariableNames:''
+ classVariableNames:''
+ poolDictionaries:''
+ category:'Collections-Text-Encodings'
+!
+
+!ISO10646_to_UTF8 class methodsFor:'documentation'!
+
+examples
+"
+ Encoding (unicode to utf8)
+ ISO10646_to_UTF8 encodeString:'hello'.
+ ISO10646_to_UTF8 encodeString:'ÃÂÃÂÃÂ'.
+
+ Decoding (utf8 to unicode):
+
+ |t|
+
+ t := ISO10646_to_UTF8 encodeString:'ÃÂÃÂÃÂ'.
+ ISO10646_to_UTF8 decodeString:t.
+
+"
+! !
+
+!ISO10646_to_UTF8 class methodsFor:'queries'!
+
+nameOfEncoding
+ "I encode utf8 into unicode and vice versa"
+
+ ^ #'utf8'
+!
+
+namesOfEncoding
+ "I encode utf8 into unicode and vice versa"
+
+ ^ #( 'utf8' 'utf-8' )
+! !
+
+!ISO10646_to_UTF8 methodsFor:'encoding & decoding'!
+
+decode:aCode
+ self shouldNotImplement "/ no single byte conversion possible
+!
+
+decodeString:aStringOrByteCollection
+ "given a string in UTF8 encoding,
+ return a new string containing the same characters, in 16bit (or more) encoding.
+ Returns either a normal String, a TwoByteString or a FourByteString instance.
+ Only useful, when reading from external sources.
+ This only handles up-to 30bit characters.
+
+ If you work a lot with utf8 encoded textFiles,
+ this is a first-class candidate for a primitive."
+
+ |sz anyAbove7BitAscii nBitsRequired
+ ascii "{ Class: SmallInteger }"
+ byte "{ Class: SmallInteger }"
+ s newString idx next6Bits last6Bits
+ errorReporter|
+
+ errorReporter := [:msg | DecodingError raiseWith:aStringOrByteCollection errorString:msg].
+
+ next6Bits := [
+ |byte|
+
+ byte := s nextByte.
+ byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
+ ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+ ].
+
+ last6Bits := [
+ |byte a|
+
+ byte := s nextByte.
+ byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
+ a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+ (a > 16r3FFFFFFF) ifTrue:[
+ "/ ST/X can only represent 30 bit unicode characters.
+ errorReporter value:'unicode character out of range'.
+ a := 16r3FFFFFFF.
+ ].
+ ascii := a.
+ ].
+
+ nBitsRequired := 8.
+ anyAbove7BitAscii := false.
+ sz := 0.
+ s := aStringOrByteCollection readStream.
+ [s atEnd] whileFalse:[
+ byte := ascii := s nextByte.
+ (byte bitAnd:16r80) ~~ 0 ifTrue:[
+ anyAbove7BitAscii := true.
+ (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
+ "/ 80 .. 7FF
+ ascii := (byte bitAnd:2r00011111).
+ next6Bits value.
+ ascii > 16rFF ifTrue:[
+ nBitsRequired := nBitsRequired max:16
+ ].
+ "/ a strict utf8 decoder does not allow overlong sequences
+ ascii < 16r80 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
+ "/ 800 .. FFFF
+ ascii := (byte bitAnd:2r00001111).
+ next6Bits value.
+ next6Bits value.
+ ascii > 16rFF ifTrue:[
+ nBitsRequired := nBitsRequired max:16
+ ].
+ ascii < 16r800 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
+ "/ 10000 .. 1FFFFF
+ ascii := (byte bitAnd:2r00000111).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ ascii > 16rFF ifTrue:[
+ ascii > 16rFFFF ifTrue:[
+ nBitsRequired := nBitsRequired max:32
+ ] ifFalse:[
+ nBitsRequired := nBitsRequired max:16
+ ]
+ ].
+ ascii < 16r10000 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
+ "/ 200000 .. 3FFFFFF
+ ascii := (byte bitAnd:2r00000011).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ ascii > 16rFF ifTrue:[
+ ascii > 16rFFFF ifTrue:[
+ nBitsRequired := nBitsRequired max:32
+ ] ifFalse:[
+ nBitsRequired := nBitsRequired max:16
+ ]
+ ].
+ ascii < 200000 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
+ "/ 4000000 .. 7FFFFFFF
+ ascii := (byte bitAnd:2r00000001).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ last6Bits value.
+ ascii > 16rFF ifTrue:[
+ ascii > 16rFFFF ifTrue:[
+ nBitsRequired := nBitsRequired max:32
+ ] ifFalse:[
+ nBitsRequired := nBitsRequired max:16
+ ]
+ ].
+ ascii < 16r4000000 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ errorReporter value:'invalid utf8 encoding'
+ ]
+ ]
+ ]
+ ]
+ ].
+ ].
+ sz := sz + 1.
+ ].
+ nBitsRequired == 8 ifTrue:[
+ anyAbove7BitAscii ifFalse:[
+ "/ can return the original string
+ aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
+ ].
+ newString := String uninitializedNew:sz
+ ] ifFalse:[
+ nBitsRequired <= 16 ifTrue:[
+ newString := Unicode16String new:sz
+ ] ifFalse:[
+ newString := Unicode32String new:sz
+ ]
+ ].
+
+ next6Bits := [
+ |byte|
+
+ byte := s nextByte.
+ ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+ ].
+
+ s := aStringOrByteCollection readStream.
+ idx := 1.
+ [s atEnd] whileFalse:[
+ byte := ascii := s nextByte.
+ (byte bitAnd:2r10000000) ~~ 0 ifTrue:[
+ (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
+ ascii := (byte bitAnd:2r00011111).
+ next6Bits value.
+ ] ifFalse:[
+ (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
+ ascii := (byte bitAnd:2r00001111).
+ next6Bits value.
+ next6Bits value.
+ ] ifFalse:[
+ (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
+ ascii := (byte bitAnd:2r00000111).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ ] ifFalse:[
+ (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
+ ascii := (byte bitAnd:2r00000011).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ ] ifFalse:[
+ (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
+ ascii := (byte bitAnd:2r00000001).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ last6Bits value.
+ ]
+ ]
+ ]
+ ]
+ ].
+ ].
+ newString at:idx put:(Character value:ascii).
+ idx := idx + 1.
+ ].
+ ^ newString
+
+ "
+ CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]
+ CharacterArray fromUTF8Bytes:#[ 16rC1 16r02 ]
+ CharacterArray fromUTF8Bytes:#[ 16rE0 16r81 16r02 ]
+ CharacterArray fromUTF8Bytes:#[ 16rEF 16rBF 16rBF ]
+
+ rfc2279 examples:
+ CharacterArray fromUTF8Bytes:#[ 16r41 16rE2 16r89 16rA2 16rCE 16r91 16r2E ]
+ CharacterArray fromUTF8Bytes:#[ 16rED 16r95 16r9C 16rEA 16rB5 16rAD 16rEC 16r96 16rB4 ]
+ CharacterArray fromUTF8Bytes:#[ 16rE6 16r97 16rA5 16rE6 16r9C 16rAC 16rE8 16rAA 16r9E ]
+
+ invalid:
+ CharacterArray fromUTF8Bytes:#[ 16rC0 16r80 ]
+ CharacterArray fromUTF8Bytes:#[ 16rE0 16r80 16r80 ]
+ "
+!
+
+encode:aCode
+ self shouldNotImplement "/ no single byte conversion possible
+!
+
+encodeString:aUnicodeString
+ "return the UTF-8 representation of a aUnicodeString.
+ The resulting string is only useful to be stored on some external file,
+ not for being used inside ST/X.
+
+ If you work a lot with utf8 encoded textFiles,
+ this is a first-class candidate for a primitive."
+
+ |s anyAbove7BitAscii|
+
+ anyAbove7BitAscii := false.
+ s := (String uninitializedNew:aUnicodeString size) writeStream.
+ aUnicodeString do:[:eachCharacter |
+ |codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
+
+ codePoint := eachCharacter asciiValue.
+ codePoint <= 16r7F ifTrue:[
+ s nextPut:eachCharacter.
+ ] ifFalse:[
+ anyAbove7BitAscii := true.
+ b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
+ v := codePoint bitShift:-6.
+ v <= 16r1F ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11000000)).
+ s nextPut:b1.
+ ] ifFalse:[
+ b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+ v := v bitShift:-6.
+ v <= 16r0F ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11100000)).
+ s nextPut:b2; nextPut:b1.
+ ] ifFalse:[
+ b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+ v := v bitShift:-6.
+ v <= 16r07 ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11110000)).
+ s nextPut:b3; nextPut:b2; nextPut:b1.
+ ] ifFalse:[
+ b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+ v := v bitShift:-6.
+ v <= 16r03 ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11111000)).
+ s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
+ ] ifFalse:[
+ b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+ v := v bitShift:-6.
+ v <= 16r01 ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11111100)).
+ s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
+ ] ifFalse:[
+ "/ cannot happen - we only support up to 30 bit characters
+ self error:'ascii value > 31bit in utf8Encode'.
+ ]
+ ].
+ ].
+ ].
+ ].
+ ].
+ ].
+
+ anyAbove7BitAscii ifFalse:[^ aUnicodeString]. "/ avoid creation of new strings
+ ^ s contents
+
+ "
+ (self encodeString:'hello') asByteArray #[104 101 108 108 111]
+ (self encodeString:(Character value:16r40) asString) asByteArray #[64]
+ (self encodeString:(Character value:16r7F) asString) asByteArray #[127]
+ (self encodeString:(Character value:16r80) asString) asByteArray #[194 128]
+ (self encodeString:(Character value:16rFF) asString) asByteArray #[195 191]
+ (self encodeString:(Character value:16r100) asString) asByteArray #[196 128]
+ (self encodeString:(Character value:16r200) asString) asByteArray #[200 128]
+ (self encodeString:(Character value:16r400) asString) asByteArray #[208 128]
+ (self encodeString:(Character value:16r800) asString) asByteArray #[224 160 128]
+ (self encodeString:(Character value:16r1000) asString) asByteArray #[225 128 128]
+ (self encodeString:(Character value:16r2000) asString) asByteArray #[226 128 128]
+ (self encodeString:(Character value:16r4000) asString) asByteArray #[228 128 128]
+ (self encodeString:(Character value:16r8000) asString) asByteArray #[232 128 128]
+ (self encodeString:(Character value:16rFFFF) asString) asByteArray #[239 191 191]
+ "
+! !
+
+!ISO10646_to_UTF8 class methodsFor:'documentation'!
+
+version
+ ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8.st,v 1.1 2004-03-05 17:18:03 cg Exp $'
+! !