--- a/CharacterEncoderImplementations__ISO10646_to_UTF8.st Tue Aug 03 22:07:01 2004 +0200
+++ b/CharacterEncoderImplementations__ISO10646_to_UTF8.st Tue Aug 03 22:08:14 2004 +0200
@@ -73,192 +73,197 @@
s newString idx next6Bits last6Bits
errorReporter|
+ "/ avoid creation of new strings
+ aStringOrByteCollection isString ifTrue:[
+ aStringOrByteCollection contains8BitCharacters ifFalse:[^ aStringOrByteCollection].
+ ].
+
errorReporter := [:msg | DecodingError raiseWith:aStringOrByteCollection errorString:msg].
next6Bits := [
- | byte |
+ | byte |
- byte := s nextByte.
- byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
- ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
- (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
- ^ errorReporter value:'illegal followbyte'.].
- ].
+ byte := s nextByte.
+ byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
+ ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+ (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
+ ^ errorReporter value:'illegal followbyte'.].
+ ].
last6Bits := [
- | a byte |
+ | a byte |
- byte := s nextByte.
- byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
- a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
- (a > 16r3FFFFFFF) ifTrue:[
- "/ ST/X can only represent 30 bit unicode characters.
- errorReporter value:'unicode character out of range'.
- a := 16r3FFFFFFF.
- ].
- ascii := a.
- (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
- ^ errorReporter value:'illegal followbyte'.].
- ].
+ byte := s nextByte.
+ byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
+ a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+ (a > 16r3FFFFFFF) ifTrue:[
+ "/ ST/X can only represent 30 bit unicode characters.
+ errorReporter value:'unicode character out of range'.
+ a := 16r3FFFFFFF.
+ ].
+ ascii := a.
+ (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
+ ^ errorReporter value:'illegal followbyte'.].
+ ].
nBitsRequired := 8.
anyAbove7BitAscii := false.
sz := 0.
s := aStringOrByteCollection readStream.
[s atEnd] whileFalse:[
- byte := ascii := s nextByte.
- (byte bitAnd:16r80) ~~ 0 ifTrue:[
- anyAbove7BitAscii := true.
- (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
- "/ 80 .. 7FF
- ascii := (byte bitAnd:2r00011111).
- next6Bits value.
- ascii > 16rFF ifTrue:[
- nBitsRequired := nBitsRequired max:16
- ].
- "/ a strict utf8 decoder does not allow overlong sequences
- ascii < 16r80 ifTrue:[
- errorReporter value:'overlong utf8 sequence'
- ].
- ] ifFalse:[
- (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
- "/ 800 .. FFFF
- ascii := (byte bitAnd:2r00001111).
- next6Bits value.
- next6Bits value.
- ascii > 16rFF ifTrue:[
- nBitsRequired := nBitsRequired max:16
- ].
- ascii < 16r800 ifTrue:[
- errorReporter value:'overlong utf8 sequence'
- ].
- ] ifFalse:[
- (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
- "/ 10000 .. 1FFFFF
- ascii := (byte bitAnd:2r00000111).
- next6Bits value.
- next6Bits value.
- next6Bits value.
- ascii > 16rFF ifTrue:[
- ascii > 16rFFFF ifTrue:[
- nBitsRequired := nBitsRequired max:32
- ] ifFalse:[
- nBitsRequired := nBitsRequired max:16
- ]
- ].
- ascii < 16r10000 ifTrue:[
- errorReporter value:'overlong utf8 sequence'
- ].
- ] ifFalse:[
- (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
- "/ 200000 .. 3FFFFFF
- ascii := (byte bitAnd:2r00000011).
- next6Bits value.
- next6Bits value.
- next6Bits value.
- next6Bits value.
- ascii > 16rFF ifTrue:[
- ascii > 16rFFFF ifTrue:[
- nBitsRequired := nBitsRequired max:32
- ] ifFalse:[
- nBitsRequired := nBitsRequired max:16
- ]
- ].
- ascii < 200000 ifTrue:[
- errorReporter value:'overlong utf8 sequence'
- ].
- ] ifFalse:[
- (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
- "/ 4000000 .. 7FFFFFFF
- ascii := (byte bitAnd:2r00000001).
- next6Bits value.
- next6Bits value.
- next6Bits value.
- next6Bits value.
- last6Bits value.
- ascii > 16rFF ifTrue:[
- ascii > 16rFFFF ifTrue:[
- nBitsRequired := nBitsRequired max:32
- ] ifFalse:[
- nBitsRequired := nBitsRequired max:16
- ]
- ].
- ascii < 16r4000000 ifTrue:[
- errorReporter value:'overlong utf8 sequence'
- ].
- ] ifFalse:[
- errorReporter value:'invalid utf8 encoding'
- ]
- ]
- ]
- ]
- ].
- ].
- sz := sz + 1.
+ byte := ascii := s nextByte.
+ (byte bitAnd:16r80) ~~ 0 ifTrue:[
+ anyAbove7BitAscii := true.
+ (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
+ "/ 80 .. 7FF
+ ascii := (byte bitAnd:2r00011111).
+ next6Bits value.
+ ascii > 16rFF ifTrue:[
+ nBitsRequired := nBitsRequired max:16
+ ].
+ "/ a strict utf8 decoder does not allow overlong sequences
+ ascii < 16r80 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
+ "/ 800 .. FFFF
+ ascii := (byte bitAnd:2r00001111).
+ next6Bits value.
+ next6Bits value.
+ ascii > 16rFF ifTrue:[
+ nBitsRequired := nBitsRequired max:16
+ ].
+ ascii < 16r800 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
+ "/ 10000 .. 1FFFFF
+ ascii := (byte bitAnd:2r00000111).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ ascii > 16rFF ifTrue:[
+ ascii > 16rFFFF ifTrue:[
+ nBitsRequired := nBitsRequired max:32
+ ] ifFalse:[
+ nBitsRequired := nBitsRequired max:16
+ ]
+ ].
+ ascii < 16r10000 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
+ "/ 200000 .. 3FFFFFF
+ ascii := (byte bitAnd:2r00000011).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ ascii > 16rFF ifTrue:[
+ ascii > 16rFFFF ifTrue:[
+ nBitsRequired := nBitsRequired max:32
+ ] ifFalse:[
+ nBitsRequired := nBitsRequired max:16
+ ]
+ ].
+ ascii < 200000 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
+ "/ 4000000 .. 7FFFFFFF
+ ascii := (byte bitAnd:2r00000001).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ last6Bits value.
+ ascii > 16rFF ifTrue:[
+ ascii > 16rFFFF ifTrue:[
+ nBitsRequired := nBitsRequired max:32
+ ] ifFalse:[
+ nBitsRequired := nBitsRequired max:16
+ ]
+ ].
+ ascii < 16r4000000 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ errorReporter value:'invalid utf8 encoding'
+ ]
+ ]
+ ]
+ ]
+ ].
+ ].
+ sz := sz + 1.
].
nBitsRequired == 8 ifTrue:[
- anyAbove7BitAscii ifFalse:[
- "/ can return the original string
- aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
- ].
- newString := String uninitializedNew:sz
+ anyAbove7BitAscii ifFalse:[
+ "/ can return the original string
+ aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
+ ].
+ newString := String uninitializedNew:sz
] ifFalse:[
- nBitsRequired <= 16 ifTrue:[
- newString := Unicode16String new:sz
- ] ifFalse:[
- newString := Unicode32String new:sz
- ]
+ nBitsRequired <= 16 ifTrue:[
+ newString := Unicode16String new:sz
+ ] ifFalse:[
+ newString := Unicode32String new:sz
+ ]
].
next6Bits := [
- |byte|
+ |byte|
- byte := s nextByte.
- ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
- ].
+ byte := s nextByte.
+ ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+ ].
s := aStringOrByteCollection readStream.
idx := 1.
[s atEnd] whileFalse:[
- byte := ascii := s nextByte.
- (byte bitAnd:2r10000000) ~~ 0 ifTrue:[
- (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
- ascii := (byte bitAnd:2r00011111).
- next6Bits value.
- ] ifFalse:[
- (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
- ascii := (byte bitAnd:2r00001111).
- next6Bits value.
- next6Bits value.
- ] ifFalse:[
- (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
- ascii := (byte bitAnd:2r00000111).
- next6Bits value.
- next6Bits value.
- next6Bits value.
- ] ifFalse:[
- (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
- ascii := (byte bitAnd:2r00000011).
- next6Bits value.
- next6Bits value.
- next6Bits value.
- next6Bits value.
- ] ifFalse:[
- (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
- ascii := (byte bitAnd:2r00000001).
- next6Bits value.
- next6Bits value.
- next6Bits value.
- next6Bits value.
- last6Bits value.
- ]
- ]
- ]
- ]
- ].
- ].
- newString at:idx put:(Character value:ascii).
- idx := idx + 1.
+ byte := ascii := s nextByte.
+ (byte bitAnd:2r10000000) ~~ 0 ifTrue:[
+ (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
+ ascii := (byte bitAnd:2r00011111).
+ next6Bits value.
+ ] ifFalse:[
+ (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
+ ascii := (byte bitAnd:2r00001111).
+ next6Bits value.
+ next6Bits value.
+ ] ifFalse:[
+ (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
+ ascii := (byte bitAnd:2r00000111).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ ] ifFalse:[
+ (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
+ ascii := (byte bitAnd:2r00000011).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ ] ifFalse:[
+ (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
+ ascii := (byte bitAnd:2r00000001).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ last6Bits value.
+ ]
+ ]
+ ]
+ ]
+ ].
+ ].
+ newString at:idx put:(Character value:ascii).
+ idx := idx + 1.
].
^ newString
@@ -291,59 +296,59 @@
If you work a lot with utf8 encoded textFiles,
this is a first-class candidate for a primitive."
- |s anyAbove7BitAscii|
+ |s|
- anyAbove7BitAscii := false.
+ "/ avoid creation of new strings
+ aUnicodeString contains8BitCharacters ifFalse:[^ aUnicodeString].
+
s := WriteStream on:(String uninitializedNew:aUnicodeString size).
aUnicodeString do:[:eachCharacter |
- |codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
+ |codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
- codePoint := eachCharacter codePoint.
- codePoint <= 16r7F ifTrue:[
- s nextPut:eachCharacter.
- ] ifFalse:[
- anyAbove7BitAscii := true.
- b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
- v := codePoint bitShift:-6.
- v <= 16r1F ifTrue:[
- s nextPut:(Character value:(v bitOr:2r11000000)).
- s nextPut:b1.
- ] ifFalse:[
- b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
- v := v bitShift:-6.
- v <= 16r0F ifTrue:[
- s nextPut:(Character value:(v bitOr:2r11100000)).
- s nextPut:b2; nextPut:b1.
- ] ifFalse:[
- b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
- v := v bitShift:-6.
- v <= 16r07 ifTrue:[
- s nextPut:(Character value:(v bitOr:2r11110000)).
- s nextPut:b3; nextPut:b2; nextPut:b1.
- ] ifFalse:[
- b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
- v := v bitShift:-6.
- v <= 16r03 ifTrue:[
- s nextPut:(Character value:(v bitOr:2r11111000)).
- s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
- ] ifFalse:[
- b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
- v := v bitShift:-6.
- v <= 16r01 ifTrue:[
- s nextPut:(Character value:(v bitOr:2r11111100)).
- s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
- ] ifFalse:[
- "/ cannot happen - we only support up to 30 bit characters
- self error:'ascii value > 31bit in utf8Encode'.
- ]
- ].
- ].
- ].
- ].
- ].
+ codePoint := eachCharacter codePoint.
+ codePoint <= 16r7F ifTrue:[
+ s nextPut:eachCharacter.
+ ] ifFalse:[
+ b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
+ v := codePoint bitShift:-6.
+ v <= 16r1F ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11000000)).
+ s nextPut:b1.
+ ] ifFalse:[
+ b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+ v := v bitShift:-6.
+ v <= 16r0F ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11100000)).
+ s nextPut:b2; nextPut:b1.
+ ] ifFalse:[
+ b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+ v := v bitShift:-6.
+ v <= 16r07 ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11110000)).
+ s nextPut:b3; nextPut:b2; nextPut:b1.
+ ] ifFalse:[
+ b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+ v := v bitShift:-6.
+ v <= 16r03 ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11111000)).
+ s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
+ ] ifFalse:[
+ b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+ v := v bitShift:-6.
+ v <= 16r01 ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11111100)).
+ s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
+ ] ifFalse:[
+ "/ cannot happen - we only support up to 30 bit characters
+ self error:'ascii value > 31bit in utf8Encode'.
+ ]
+ ].
+ ].
+ ].
+ ].
+ ].
].
- anyAbove7BitAscii ifFalse:[^ aUnicodeString]. "/ avoid creation of new strings
^ s contents
"
@@ -373,5 +378,5 @@
!ISO10646_to_UTF8 class methodsFor:'documentation'!
version
- ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8.st,v 1.9 2004-06-16 18:52:26 ca Exp $'
+ ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8.st,v 1.10 2004-08-03 20:08:14 penk Exp $'
! !
--- a/Encoder_ISO10646_to_UTF8.st Tue Aug 03 22:07:01 2004 +0200
+++ b/Encoder_ISO10646_to_UTF8.st Tue Aug 03 22:08:14 2004 +0200
@@ -73,192 +73,197 @@
s newString idx next6Bits last6Bits
errorReporter|
+ "/ avoid creation of new strings
+ aStringOrByteCollection isString ifTrue:[
+ aStringOrByteCollection contains8BitCharacters ifFalse:[^ aStringOrByteCollection].
+ ].
+
errorReporter := [:msg | DecodingError raiseWith:aStringOrByteCollection errorString:msg].
next6Bits := [
- | byte |
+ | byte |
- byte := s nextByte.
- byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
- ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
- (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
- ^ errorReporter value:'illegal followbyte'.].
- ].
+ byte := s nextByte.
+ byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
+ ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+ (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
+ ^ errorReporter value:'illegal followbyte'.].
+ ].
last6Bits := [
- | a byte |
+ | a byte |
- byte := s nextByte.
- byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
- a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
- (a > 16r3FFFFFFF) ifTrue:[
- "/ ST/X can only represent 30 bit unicode characters.
- errorReporter value:'unicode character out of range'.
- a := 16r3FFFFFFF.
- ].
- ascii := a.
- (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
- ^ errorReporter value:'illegal followbyte'.].
- ].
+ byte := s nextByte.
+ byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
+ a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+ (a > 16r3FFFFFFF) ifTrue:[
+ "/ ST/X can only represent 30 bit unicode characters.
+ errorReporter value:'unicode character out of range'.
+ a := 16r3FFFFFFF.
+ ].
+ ascii := a.
+ (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
+ ^ errorReporter value:'illegal followbyte'.].
+ ].
nBitsRequired := 8.
anyAbove7BitAscii := false.
sz := 0.
s := aStringOrByteCollection readStream.
[s atEnd] whileFalse:[
- byte := ascii := s nextByte.
- (byte bitAnd:16r80) ~~ 0 ifTrue:[
- anyAbove7BitAscii := true.
- (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
- "/ 80 .. 7FF
- ascii := (byte bitAnd:2r00011111).
- next6Bits value.
- ascii > 16rFF ifTrue:[
- nBitsRequired := nBitsRequired max:16
- ].
- "/ a strict utf8 decoder does not allow overlong sequences
- ascii < 16r80 ifTrue:[
- errorReporter value:'overlong utf8 sequence'
- ].
- ] ifFalse:[
- (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
- "/ 800 .. FFFF
- ascii := (byte bitAnd:2r00001111).
- next6Bits value.
- next6Bits value.
- ascii > 16rFF ifTrue:[
- nBitsRequired := nBitsRequired max:16
- ].
- ascii < 16r800 ifTrue:[
- errorReporter value:'overlong utf8 sequence'
- ].
- ] ifFalse:[
- (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
- "/ 10000 .. 1FFFFF
- ascii := (byte bitAnd:2r00000111).
- next6Bits value.
- next6Bits value.
- next6Bits value.
- ascii > 16rFF ifTrue:[
- ascii > 16rFFFF ifTrue:[
- nBitsRequired := nBitsRequired max:32
- ] ifFalse:[
- nBitsRequired := nBitsRequired max:16
- ]
- ].
- ascii < 16r10000 ifTrue:[
- errorReporter value:'overlong utf8 sequence'
- ].
- ] ifFalse:[
- (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
- "/ 200000 .. 3FFFFFF
- ascii := (byte bitAnd:2r00000011).
- next6Bits value.
- next6Bits value.
- next6Bits value.
- next6Bits value.
- ascii > 16rFF ifTrue:[
- ascii > 16rFFFF ifTrue:[
- nBitsRequired := nBitsRequired max:32
- ] ifFalse:[
- nBitsRequired := nBitsRequired max:16
- ]
- ].
- ascii < 200000 ifTrue:[
- errorReporter value:'overlong utf8 sequence'
- ].
- ] ifFalse:[
- (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
- "/ 4000000 .. 7FFFFFFF
- ascii := (byte bitAnd:2r00000001).
- next6Bits value.
- next6Bits value.
- next6Bits value.
- next6Bits value.
- last6Bits value.
- ascii > 16rFF ifTrue:[
- ascii > 16rFFFF ifTrue:[
- nBitsRequired := nBitsRequired max:32
- ] ifFalse:[
- nBitsRequired := nBitsRequired max:16
- ]
- ].
- ascii < 16r4000000 ifTrue:[
- errorReporter value:'overlong utf8 sequence'
- ].
- ] ifFalse:[
- errorReporter value:'invalid utf8 encoding'
- ]
- ]
- ]
- ]
- ].
- ].
- sz := sz + 1.
+ byte := ascii := s nextByte.
+ (byte bitAnd:16r80) ~~ 0 ifTrue:[
+ anyAbove7BitAscii := true.
+ (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
+ "/ 80 .. 7FF
+ ascii := (byte bitAnd:2r00011111).
+ next6Bits value.
+ ascii > 16rFF ifTrue:[
+ nBitsRequired := nBitsRequired max:16
+ ].
+ "/ a strict utf8 decoder does not allow overlong sequences
+ ascii < 16r80 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
+ "/ 800 .. FFFF
+ ascii := (byte bitAnd:2r00001111).
+ next6Bits value.
+ next6Bits value.
+ ascii > 16rFF ifTrue:[
+ nBitsRequired := nBitsRequired max:16
+ ].
+ ascii < 16r800 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
+ "/ 10000 .. 1FFFFF
+ ascii := (byte bitAnd:2r00000111).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ ascii > 16rFF ifTrue:[
+ ascii > 16rFFFF ifTrue:[
+ nBitsRequired := nBitsRequired max:32
+ ] ifFalse:[
+ nBitsRequired := nBitsRequired max:16
+ ]
+ ].
+ ascii < 16r10000 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
+ "/ 200000 .. 3FFFFFF
+ ascii := (byte bitAnd:2r00000011).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ ascii > 16rFF ifTrue:[
+ ascii > 16rFFFF ifTrue:[
+ nBitsRequired := nBitsRequired max:32
+ ] ifFalse:[
+ nBitsRequired := nBitsRequired max:16
+ ]
+ ].
+ ascii < 200000 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
+ "/ 4000000 .. 7FFFFFFF
+ ascii := (byte bitAnd:2r00000001).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ last6Bits value.
+ ascii > 16rFF ifTrue:[
+ ascii > 16rFFFF ifTrue:[
+ nBitsRequired := nBitsRequired max:32
+ ] ifFalse:[
+ nBitsRequired := nBitsRequired max:16
+ ]
+ ].
+ ascii < 16r4000000 ifTrue:[
+ errorReporter value:'overlong utf8 sequence'
+ ].
+ ] ifFalse:[
+ errorReporter value:'invalid utf8 encoding'
+ ]
+ ]
+ ]
+ ]
+ ].
+ ].
+ sz := sz + 1.
].
nBitsRequired == 8 ifTrue:[
- anyAbove7BitAscii ifFalse:[
- "/ can return the original string
- aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
- ].
- newString := String uninitializedNew:sz
+ anyAbove7BitAscii ifFalse:[
+ "/ can return the original string
+ aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
+ ].
+ newString := String uninitializedNew:sz
] ifFalse:[
- nBitsRequired <= 16 ifTrue:[
- newString := Unicode16String new:sz
- ] ifFalse:[
- newString := Unicode32String new:sz
- ]
+ nBitsRequired <= 16 ifTrue:[
+ newString := Unicode16String new:sz
+ ] ifFalse:[
+ newString := Unicode32String new:sz
+ ]
].
next6Bits := [
- |byte|
+ |byte|
- byte := s nextByte.
- ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
- ].
+ byte := s nextByte.
+ ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+ ].
s := aStringOrByteCollection readStream.
idx := 1.
[s atEnd] whileFalse:[
- byte := ascii := s nextByte.
- (byte bitAnd:2r10000000) ~~ 0 ifTrue:[
- (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
- ascii := (byte bitAnd:2r00011111).
- next6Bits value.
- ] ifFalse:[
- (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
- ascii := (byte bitAnd:2r00001111).
- next6Bits value.
- next6Bits value.
- ] ifFalse:[
- (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
- ascii := (byte bitAnd:2r00000111).
- next6Bits value.
- next6Bits value.
- next6Bits value.
- ] ifFalse:[
- (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
- ascii := (byte bitAnd:2r00000011).
- next6Bits value.
- next6Bits value.
- next6Bits value.
- next6Bits value.
- ] ifFalse:[
- (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
- ascii := (byte bitAnd:2r00000001).
- next6Bits value.
- next6Bits value.
- next6Bits value.
- next6Bits value.
- last6Bits value.
- ]
- ]
- ]
- ]
- ].
- ].
- newString at:idx put:(Character value:ascii).
- idx := idx + 1.
+ byte := ascii := s nextByte.
+ (byte bitAnd:2r10000000) ~~ 0 ifTrue:[
+ (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
+ ascii := (byte bitAnd:2r00011111).
+ next6Bits value.
+ ] ifFalse:[
+ (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
+ ascii := (byte bitAnd:2r00001111).
+ next6Bits value.
+ next6Bits value.
+ ] ifFalse:[
+ (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
+ ascii := (byte bitAnd:2r00000111).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ ] ifFalse:[
+ (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
+ ascii := (byte bitAnd:2r00000011).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ ] ifFalse:[
+ (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
+ ascii := (byte bitAnd:2r00000001).
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ next6Bits value.
+ last6Bits value.
+ ]
+ ]
+ ]
+ ]
+ ].
+ ].
+ newString at:idx put:(Character value:ascii).
+ idx := idx + 1.
].
^ newString
@@ -291,59 +296,59 @@
If you work a lot with utf8 encoded textFiles,
this is a first-class candidate for a primitive."
- |s anyAbove7BitAscii|
+ |s|
- anyAbove7BitAscii := false.
+ "/ avoid creation of new strings
+ aUnicodeString contains8BitCharacters ifFalse:[^ aUnicodeString].
+
s := WriteStream on:(String uninitializedNew:aUnicodeString size).
aUnicodeString do:[:eachCharacter |
- |codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
+ |codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
- codePoint := eachCharacter codePoint.
- codePoint <= 16r7F ifTrue:[
- s nextPut:eachCharacter.
- ] ifFalse:[
- anyAbove7BitAscii := true.
- b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
- v := codePoint bitShift:-6.
- v <= 16r1F ifTrue:[
- s nextPut:(Character value:(v bitOr:2r11000000)).
- s nextPut:b1.
- ] ifFalse:[
- b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
- v := v bitShift:-6.
- v <= 16r0F ifTrue:[
- s nextPut:(Character value:(v bitOr:2r11100000)).
- s nextPut:b2; nextPut:b1.
- ] ifFalse:[
- b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
- v := v bitShift:-6.
- v <= 16r07 ifTrue:[
- s nextPut:(Character value:(v bitOr:2r11110000)).
- s nextPut:b3; nextPut:b2; nextPut:b1.
- ] ifFalse:[
- b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
- v := v bitShift:-6.
- v <= 16r03 ifTrue:[
- s nextPut:(Character value:(v bitOr:2r11111000)).
- s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
- ] ifFalse:[
- b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
- v := v bitShift:-6.
- v <= 16r01 ifTrue:[
- s nextPut:(Character value:(v bitOr:2r11111100)).
- s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
- ] ifFalse:[
- "/ cannot happen - we only support up to 30 bit characters
- self error:'ascii value > 31bit in utf8Encode'.
- ]
- ].
- ].
- ].
- ].
- ].
+ codePoint := eachCharacter codePoint.
+ codePoint <= 16r7F ifTrue:[
+ s nextPut:eachCharacter.
+ ] ifFalse:[
+ b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
+ v := codePoint bitShift:-6.
+ v <= 16r1F ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11000000)).
+ s nextPut:b1.
+ ] ifFalse:[
+ b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+ v := v bitShift:-6.
+ v <= 16r0F ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11100000)).
+ s nextPut:b2; nextPut:b1.
+ ] ifFalse:[
+ b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+ v := v bitShift:-6.
+ v <= 16r07 ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11110000)).
+ s nextPut:b3; nextPut:b2; nextPut:b1.
+ ] ifFalse:[
+ b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+ v := v bitShift:-6.
+ v <= 16r03 ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11111000)).
+ s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
+ ] ifFalse:[
+ b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+ v := v bitShift:-6.
+ v <= 16r01 ifTrue:[
+ s nextPut:(Character value:(v bitOr:2r11111100)).
+ s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
+ ] ifFalse:[
+ "/ cannot happen - we only support up to 30 bit characters
+ self error:'ascii value > 31bit in utf8Encode'.
+ ]
+ ].
+ ].
+ ].
+ ].
+ ].
].
- anyAbove7BitAscii ifFalse:[^ aUnicodeString]. "/ avoid creation of new strings
^ s contents
"
@@ -373,5 +378,5 @@
!ISO10646_to_UTF8 class methodsFor:'documentation'!
version
- ^ '$Header: /cvs/stx/stx/libbasic/Attic/Encoder_ISO10646_to_UTF8.st,v 1.9 2004-06-16 18:52:26 ca Exp $'
+ ^ '$Header: /cvs/stx/stx/libbasic/Attic/Encoder_ISO10646_to_UTF8.st,v 1.10 2004-08-03 20:08:14 penk Exp $'
! !