tuned encoding/decoding (quick check for 8-bit chars)
authorpenk
Tue, 03 Aug 2004 22:08:14 +0200
changeset 8460 f4d333135e1d
parent 8459 1f59b17291a3
child 8461 b7e8f7be658e
tuned encoding/decoding (quick check for 8-bit chars)
CharacterEncoderImplementations__ISO10646_to_UTF8.st
Encoder_ISO10646_to_UTF8.st
--- a/CharacterEncoderImplementations__ISO10646_to_UTF8.st	Tue Aug 03 22:07:01 2004 +0200
+++ b/CharacterEncoderImplementations__ISO10646_to_UTF8.st	Tue Aug 03 22:08:14 2004 +0200
@@ -73,192 +73,197 @@
      s newString idx next6Bits last6Bits
      errorReporter|
 
+    "/ avoid creation of new strings
+    aStringOrByteCollection isString ifTrue:[
+        aStringOrByteCollection contains8BitCharacters ifFalse:[^ aStringOrByteCollection].
+    ].
+
     errorReporter := [:msg | DecodingError raiseWith:aStringOrByteCollection errorString:msg].
 
     next6Bits := [
-		    | byte |
+                    | byte |
 
-		    byte := s nextByte.
-		    byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
-		    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
-		    (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
-			^ errorReporter value:'illegal followbyte'.].
-		 ].
+                    byte := s nextByte.
+                    byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
+                    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+                    (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
+                        ^ errorReporter value:'illegal followbyte'.].
+                 ].
 
     last6Bits := [
-		    | a byte |
+                    | a byte |
 
-		    byte := s nextByte.
-		    byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
-		    a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
-		    (a > 16r3FFFFFFF) ifTrue:[
-			"/ ST/X can only represent 30 bit unicode characters.
-			errorReporter value:'unicode character out of range'.
-			a := 16r3FFFFFFF.
-		    ].
-		    ascii := a.
-		    (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
-			^ errorReporter value:'illegal followbyte'.].
-		 ].
+                    byte := s nextByte.
+                    byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
+                    a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+                    (a > 16r3FFFFFFF) ifTrue:[
+                        "/ ST/X can only represent 30 bit unicode characters.
+                        errorReporter value:'unicode character out of range'.
+                        a := 16r3FFFFFFF.
+                    ].
+                    ascii := a.
+                    (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
+                        ^ errorReporter value:'illegal followbyte'.].
+                 ].
 
     nBitsRequired := 8.
     anyAbove7BitAscii := false.
     sz := 0.
     s := aStringOrByteCollection readStream.
     [s atEnd] whileFalse:[
-	byte := ascii := s nextByte.
-	(byte bitAnd:16r80) ~~ 0 ifTrue:[
-	    anyAbove7BitAscii := true.
-	    (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
-		"/ 80 .. 7FF
-		ascii := (byte bitAnd:2r00011111).
-		next6Bits value.
-		ascii > 16rFF ifTrue:[
-		    nBitsRequired := nBitsRequired max:16
-		].
-		"/ a strict utf8 decoder does not allow overlong sequences
-		ascii < 16r80 ifTrue:[
-		    errorReporter value:'overlong utf8 sequence'
-		].
-	    ] ifFalse:[
-		(byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
-		    "/ 800 .. FFFF
-		    ascii := (byte bitAnd:2r00001111).
-		    next6Bits value.
-		    next6Bits value.
-		    ascii > 16rFF ifTrue:[
-			nBitsRequired := nBitsRequired max:16
-		    ].
-		    ascii < 16r800 ifTrue:[
-			errorReporter value:'overlong utf8 sequence'
-		    ].
-		] ifFalse:[
-		    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
-			"/ 10000 .. 1FFFFF
-			ascii := (byte bitAnd:2r00000111).
-			next6Bits value.
-			next6Bits value.
-			next6Bits value.
-			ascii > 16rFF ifTrue:[
-			    ascii > 16rFFFF ifTrue:[
-				nBitsRequired := nBitsRequired max:32
-			    ] ifFalse:[
-				nBitsRequired := nBitsRequired max:16
-			    ]
-			].
-			ascii < 16r10000 ifTrue:[
-			    errorReporter value:'overlong utf8 sequence'
-			].
-		    ] ifFalse:[
-			(byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
-			    "/ 200000 .. 3FFFFFF
-			    ascii := (byte bitAnd:2r00000011).
-			    next6Bits value.
-			    next6Bits value.
-			    next6Bits value.
-			    next6Bits value.
-			    ascii > 16rFF ifTrue:[
-				ascii > 16rFFFF ifTrue:[
-				    nBitsRequired := nBitsRequired max:32
-				] ifFalse:[
-				    nBitsRequired := nBitsRequired max:16
-				]
-			    ].
-			    ascii < 200000 ifTrue:[
-				errorReporter value:'overlong utf8 sequence'
-			    ].
-			] ifFalse:[
-			    (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
-				"/ 4000000 .. 7FFFFFFF
-				ascii := (byte bitAnd:2r00000001).
-				next6Bits value.
-				next6Bits value.
-				next6Bits value.
-				next6Bits value.
-				last6Bits value.
-				ascii > 16rFF ifTrue:[
-				    ascii > 16rFFFF ifTrue:[
-					nBitsRequired := nBitsRequired max:32
-				    ] ifFalse:[
-					nBitsRequired := nBitsRequired max:16
-				    ]
-				].
-				ascii < 16r4000000 ifTrue:[
-				    errorReporter value:'overlong utf8 sequence'
-				].
-			    ] ifFalse:[
-				errorReporter value:'invalid utf8 encoding'
-			    ]
-			]
-		    ]
-		]
-	    ].
-	].
-	sz := sz + 1.
+        byte := ascii := s nextByte.
+        (byte bitAnd:16r80) ~~ 0 ifTrue:[
+            anyAbove7BitAscii := true.
+            (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
+                "/ 80 .. 7FF
+                ascii := (byte bitAnd:2r00011111).
+                next6Bits value.
+                ascii > 16rFF ifTrue:[
+                    nBitsRequired := nBitsRequired max:16
+                ].
+                "/ a strict utf8 decoder does not allow overlong sequences
+                ascii < 16r80 ifTrue:[
+                    errorReporter value:'overlong utf8 sequence'
+                ].
+            ] ifFalse:[
+                (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
+                    "/ 800 .. FFFF
+                    ascii := (byte bitAnd:2r00001111).
+                    next6Bits value.
+                    next6Bits value.
+                    ascii > 16rFF ifTrue:[
+                        nBitsRequired := nBitsRequired max:16
+                    ].
+                    ascii < 16r800 ifTrue:[
+                        errorReporter value:'overlong utf8 sequence'
+                    ].
+                ] ifFalse:[
+                    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
+                        "/ 10000 .. 1FFFFF
+                        ascii := (byte bitAnd:2r00000111).
+                        next6Bits value.
+                        next6Bits value.
+                        next6Bits value.
+                        ascii > 16rFF ifTrue:[
+                            ascii > 16rFFFF ifTrue:[
+                                nBitsRequired := nBitsRequired max:32
+                            ] ifFalse:[
+                                nBitsRequired := nBitsRequired max:16
+                            ]
+                        ].
+                        ascii < 16r10000 ifTrue:[
+                            errorReporter value:'overlong utf8 sequence'
+                        ].
+                    ] ifFalse:[
+                        (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
+                            "/ 200000 .. 3FFFFFF
+                            ascii := (byte bitAnd:2r00000011).
+                            next6Bits value.
+                            next6Bits value.
+                            next6Bits value.
+                            next6Bits value.
+                            ascii > 16rFF ifTrue:[
+                                ascii > 16rFFFF ifTrue:[
+                                    nBitsRequired := nBitsRequired max:32
+                                ] ifFalse:[
+                                    nBitsRequired := nBitsRequired max:16
+                                ]
+                            ].
+                            ascii < 200000 ifTrue:[
+                                errorReporter value:'overlong utf8 sequence'
+                            ].
+                        ] ifFalse:[
+                            (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
+                                "/ 4000000 .. 7FFFFFFF
+                                ascii := (byte bitAnd:2r00000001).
+                                next6Bits value.
+                                next6Bits value.
+                                next6Bits value.
+                                next6Bits value.
+                                last6Bits value.
+                                ascii > 16rFF ifTrue:[
+                                    ascii > 16rFFFF ifTrue:[
+                                        nBitsRequired := nBitsRequired max:32
+                                    ] ifFalse:[
+                                        nBitsRequired := nBitsRequired max:16
+                                    ]
+                                ].
+                                ascii < 16r4000000 ifTrue:[
+                                    errorReporter value:'overlong utf8 sequence'
+                                ].
+                            ] ifFalse:[
+                                errorReporter value:'invalid utf8 encoding'
+                            ]
+                        ]
+                    ]
+                ]
+            ].
+        ].
+        sz := sz + 1.
     ].
     nBitsRequired == 8 ifTrue:[
-	anyAbove7BitAscii ifFalse:[
-	    "/ can return the original string
-	    aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
-	].
-	newString := String uninitializedNew:sz
+        anyAbove7BitAscii ifFalse:[
+            "/ can return the original string
+            aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
+        ].
+        newString := String uninitializedNew:sz
     ] ifFalse:[
-	nBitsRequired <= 16 ifTrue:[
-	    newString := Unicode16String new:sz
-	] ifFalse:[
-	    newString := Unicode32String new:sz
-	]
+        nBitsRequired <= 16 ifTrue:[
+            newString := Unicode16String new:sz
+        ] ifFalse:[
+            newString := Unicode32String new:sz
+        ]
     ].
 
     next6Bits := [
-		    |byte|
+                    |byte|
 
-		    byte := s nextByte.
-		    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
-		 ].
+                    byte := s nextByte.
+                    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+                 ].
 
     s := aStringOrByteCollection readStream.
     idx := 1.
     [s atEnd] whileFalse:[
-	byte := ascii := s nextByte.
-	(byte bitAnd:2r10000000) ~~ 0 ifTrue:[
-	    (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
-		ascii := (byte bitAnd:2r00011111).
-		next6Bits value.
-	    ] ifFalse:[
-		(byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
-		    ascii := (byte bitAnd:2r00001111).
-		    next6Bits value.
-		    next6Bits value.
-		] ifFalse:[
-		    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
-			ascii := (byte bitAnd:2r00000111).
-			next6Bits value.
-			next6Bits value.
-			next6Bits value.
-		    ] ifFalse:[
-			(byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
-			    ascii := (byte bitAnd:2r00000011).
-			    next6Bits value.
-			    next6Bits value.
-			    next6Bits value.
-			    next6Bits value.
-			] ifFalse:[
-			    (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
-				ascii := (byte bitAnd:2r00000001).
-				next6Bits value.
-				next6Bits value.
-				next6Bits value.
-				next6Bits value.
-				last6Bits value.
-			    ]
-			]
-		    ]
-		]
-	    ].
-	].
-	newString at:idx put:(Character value:ascii).
-	idx := idx + 1.
+        byte := ascii := s nextByte.
+        (byte bitAnd:2r10000000) ~~ 0 ifTrue:[
+            (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
+                ascii := (byte bitAnd:2r00011111).
+                next6Bits value.
+            ] ifFalse:[
+                (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
+                    ascii := (byte bitAnd:2r00001111).
+                    next6Bits value.
+                    next6Bits value.
+                ] ifFalse:[
+                    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
+                        ascii := (byte bitAnd:2r00000111).
+                        next6Bits value.
+                        next6Bits value.
+                        next6Bits value.
+                    ] ifFalse:[
+                        (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
+                            ascii := (byte bitAnd:2r00000011).
+                            next6Bits value.
+                            next6Bits value.
+                            next6Bits value.
+                            next6Bits value.
+                        ] ifFalse:[
+                            (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
+                                ascii := (byte bitAnd:2r00000001).
+                                next6Bits value.
+                                next6Bits value.
+                                next6Bits value.
+                                next6Bits value.
+                                last6Bits value.
+                            ]
+                        ]
+                    ]
+                ]
+            ].
+        ].
+        newString at:idx put:(Character value:ascii).
+        idx := idx + 1.
     ].
     ^ newString
 
@@ -291,59 +296,59 @@
      If you work a lot with utf8 encoded textFiles,
      this is a first-class candidate for a primitive."
 
-    |s anyAbove7BitAscii|
+    |s|
 
-    anyAbove7BitAscii := false.
+    "/ avoid creation of new strings
+    aUnicodeString contains8BitCharacters ifFalse:[^ aUnicodeString].
+
     s := WriteStream on:(String uninitializedNew:aUnicodeString size).
     aUnicodeString do:[:eachCharacter |
-	|codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
+        |codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
 
-	codePoint := eachCharacter codePoint.
-	codePoint <= 16r7F ifTrue:[
-	    s nextPut:eachCharacter.
-	] ifFalse:[
-	    anyAbove7BitAscii := true.
-	    b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
-	    v := codePoint bitShift:-6.
-	    v <= 16r1F ifTrue:[
-		s nextPut:(Character value:(v bitOr:2r11000000)).
-		s nextPut:b1.
-	    ] ifFalse:[
-		b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
-		v := v bitShift:-6.
-		v <= 16r0F ifTrue:[
-		    s nextPut:(Character value:(v bitOr:2r11100000)).
-		    s nextPut:b2; nextPut:b1.
-		] ifFalse:[
-		    b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
-		    v := v bitShift:-6.
-		    v <= 16r07 ifTrue:[
-			s nextPut:(Character value:(v bitOr:2r11110000)).
-			s nextPut:b3; nextPut:b2; nextPut:b1.
-		    ] ifFalse:[
-			b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
-			v := v bitShift:-6.
-			v <= 16r03 ifTrue:[
-			    s nextPut:(Character value:(v bitOr:2r11111000)).
-			    s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
-			] ifFalse:[
-			    b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
-			    v := v bitShift:-6.
-			    v <= 16r01 ifTrue:[
-				s nextPut:(Character value:(v bitOr:2r11111100)).
-				s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
-			    ] ifFalse:[
-				"/ cannot happen - we only support up to 30 bit characters
-				self error:'ascii value > 31bit in utf8Encode'.
-			    ]
-			].
-		    ].
-		].
-	    ].
-	].
+        codePoint := eachCharacter codePoint.
+        codePoint <= 16r7F ifTrue:[
+            s nextPut:eachCharacter.
+        ] ifFalse:[
+            b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
+            v := codePoint bitShift:-6.
+            v <= 16r1F ifTrue:[
+                s nextPut:(Character value:(v bitOr:2r11000000)).
+                s nextPut:b1.
+            ] ifFalse:[
+                b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+                v := v bitShift:-6.
+                v <= 16r0F ifTrue:[
+                    s nextPut:(Character value:(v bitOr:2r11100000)).
+                    s nextPut:b2; nextPut:b1.
+                ] ifFalse:[
+                    b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+                    v := v bitShift:-6.
+                    v <= 16r07 ifTrue:[
+                        s nextPut:(Character value:(v bitOr:2r11110000)).
+                        s nextPut:b3; nextPut:b2; nextPut:b1.
+                    ] ifFalse:[
+                        b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+                        v := v bitShift:-6.
+                        v <= 16r03 ifTrue:[
+                            s nextPut:(Character value:(v bitOr:2r11111000)).
+                            s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
+                        ] ifFalse:[
+                            b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+                            v := v bitShift:-6.
+                            v <= 16r01 ifTrue:[
+                                s nextPut:(Character value:(v bitOr:2r11111100)).
+                                s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
+                            ] ifFalse:[
+                                "/ cannot happen - we only support up to 30 bit characters
+                                self error:'ascii value > 31bit in utf8Encode'.
+                            ]
+                        ].
+                    ].
+                ].
+            ].
+        ].
     ].
 
-    anyAbove7BitAscii ifFalse:[^ aUnicodeString].   "/ avoid creation of new strings
     ^ s contents
 
     "
@@ -373,5 +378,5 @@
 !ISO10646_to_UTF8 class methodsFor:'documentation'!
 
 version
-    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8.st,v 1.9 2004-06-16 18:52:26 ca Exp $'
+    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8.st,v 1.10 2004-08-03 20:08:14 penk Exp $'
 ! !
--- a/Encoder_ISO10646_to_UTF8.st	Tue Aug 03 22:07:01 2004 +0200
+++ b/Encoder_ISO10646_to_UTF8.st	Tue Aug 03 22:08:14 2004 +0200
@@ -73,192 +73,197 @@
      s newString idx next6Bits last6Bits
      errorReporter|
 
+    "/ avoid creation of new strings
+    aStringOrByteCollection isString ifTrue:[
+        aStringOrByteCollection contains8BitCharacters ifFalse:[^ aStringOrByteCollection].
+    ].
+
     errorReporter := [:msg | DecodingError raiseWith:aStringOrByteCollection errorString:msg].
 
     next6Bits := [
-		    | byte |
+                    | byte |
 
-		    byte := s nextByte.
-		    byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
-		    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
-		    (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
-			^ errorReporter value:'illegal followbyte'.].
-		 ].
+                    byte := s nextByte.
+                    byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
+                    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+                    (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
+                        ^ errorReporter value:'illegal followbyte'.].
+                 ].
 
     last6Bits := [
-		    | a byte |
+                    | a byte |
 
-		    byte := s nextByte.
-		    byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
-		    a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
-		    (a > 16r3FFFFFFF) ifTrue:[
-			"/ ST/X can only represent 30 bit unicode characters.
-			errorReporter value:'unicode character out of range'.
-			a := 16r3FFFFFFF.
-		    ].
-		    ascii := a.
-		    (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
-			^ errorReporter value:'illegal followbyte'.].
-		 ].
+                    byte := s nextByte.
+                    byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
+                    a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+                    (a > 16r3FFFFFFF) ifTrue:[
+                        "/ ST/X can only represent 30 bit unicode characters.
+                        errorReporter value:'unicode character out of range'.
+                        a := 16r3FFFFFFF.
+                    ].
+                    ascii := a.
+                    (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
+                        ^ errorReporter value:'illegal followbyte'.].
+                 ].
 
     nBitsRequired := 8.
     anyAbove7BitAscii := false.
     sz := 0.
     s := aStringOrByteCollection readStream.
     [s atEnd] whileFalse:[
-	byte := ascii := s nextByte.
-	(byte bitAnd:16r80) ~~ 0 ifTrue:[
-	    anyAbove7BitAscii := true.
-	    (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
-		"/ 80 .. 7FF
-		ascii := (byte bitAnd:2r00011111).
-		next6Bits value.
-		ascii > 16rFF ifTrue:[
-		    nBitsRequired := nBitsRequired max:16
-		].
-		"/ a strict utf8 decoder does not allow overlong sequences
-		ascii < 16r80 ifTrue:[
-		    errorReporter value:'overlong utf8 sequence'
-		].
-	    ] ifFalse:[
-		(byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
-		    "/ 800 .. FFFF
-		    ascii := (byte bitAnd:2r00001111).
-		    next6Bits value.
-		    next6Bits value.
-		    ascii > 16rFF ifTrue:[
-			nBitsRequired := nBitsRequired max:16
-		    ].
-		    ascii < 16r800 ifTrue:[
-			errorReporter value:'overlong utf8 sequence'
-		    ].
-		] ifFalse:[
-		    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
-			"/ 10000 .. 1FFFFF
-			ascii := (byte bitAnd:2r00000111).
-			next6Bits value.
-			next6Bits value.
-			next6Bits value.
-			ascii > 16rFF ifTrue:[
-			    ascii > 16rFFFF ifTrue:[
-				nBitsRequired := nBitsRequired max:32
-			    ] ifFalse:[
-				nBitsRequired := nBitsRequired max:16
-			    ]
-			].
-			ascii < 16r10000 ifTrue:[
-			    errorReporter value:'overlong utf8 sequence'
-			].
-		    ] ifFalse:[
-			(byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
-			    "/ 200000 .. 3FFFFFF
-			    ascii := (byte bitAnd:2r00000011).
-			    next6Bits value.
-			    next6Bits value.
-			    next6Bits value.
-			    next6Bits value.
-			    ascii > 16rFF ifTrue:[
-				ascii > 16rFFFF ifTrue:[
-				    nBitsRequired := nBitsRequired max:32
-				] ifFalse:[
-				    nBitsRequired := nBitsRequired max:16
-				]
-			    ].
-			    ascii < 200000 ifTrue:[
-				errorReporter value:'overlong utf8 sequence'
-			    ].
-			] ifFalse:[
-			    (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
-				"/ 4000000 .. 7FFFFFFF
-				ascii := (byte bitAnd:2r00000001).
-				next6Bits value.
-				next6Bits value.
-				next6Bits value.
-				next6Bits value.
-				last6Bits value.
-				ascii > 16rFF ifTrue:[
-				    ascii > 16rFFFF ifTrue:[
-					nBitsRequired := nBitsRequired max:32
-				    ] ifFalse:[
-					nBitsRequired := nBitsRequired max:16
-				    ]
-				].
-				ascii < 16r4000000 ifTrue:[
-				    errorReporter value:'overlong utf8 sequence'
-				].
-			    ] ifFalse:[
-				errorReporter value:'invalid utf8 encoding'
-			    ]
-			]
-		    ]
-		]
-	    ].
-	].
-	sz := sz + 1.
+        byte := ascii := s nextByte.
+        (byte bitAnd:16r80) ~~ 0 ifTrue:[
+            anyAbove7BitAscii := true.
+            (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
+                "/ 80 .. 7FF
+                ascii := (byte bitAnd:2r00011111).
+                next6Bits value.
+                ascii > 16rFF ifTrue:[
+                    nBitsRequired := nBitsRequired max:16
+                ].
+                "/ a strict utf8 decoder does not allow overlong sequences
+                ascii < 16r80 ifTrue:[
+                    errorReporter value:'overlong utf8 sequence'
+                ].
+            ] ifFalse:[
+                (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
+                    "/ 800 .. FFFF
+                    ascii := (byte bitAnd:2r00001111).
+                    next6Bits value.
+                    next6Bits value.
+                    ascii > 16rFF ifTrue:[
+                        nBitsRequired := nBitsRequired max:16
+                    ].
+                    ascii < 16r800 ifTrue:[
+                        errorReporter value:'overlong utf8 sequence'
+                    ].
+                ] ifFalse:[
+                    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
+                        "/ 10000 .. 1FFFFF
+                        ascii := (byte bitAnd:2r00000111).
+                        next6Bits value.
+                        next6Bits value.
+                        next6Bits value.
+                        ascii > 16rFF ifTrue:[
+                            ascii > 16rFFFF ifTrue:[
+                                nBitsRequired := nBitsRequired max:32
+                            ] ifFalse:[
+                                nBitsRequired := nBitsRequired max:16
+                            ]
+                        ].
+                        ascii < 16r10000 ifTrue:[
+                            errorReporter value:'overlong utf8 sequence'
+                        ].
+                    ] ifFalse:[
+                        (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
+                            "/ 200000 .. 3FFFFFF
+                            ascii := (byte bitAnd:2r00000011).
+                            next6Bits value.
+                            next6Bits value.
+                            next6Bits value.
+                            next6Bits value.
+                            ascii > 16rFF ifTrue:[
+                                ascii > 16rFFFF ifTrue:[
+                                    nBitsRequired := nBitsRequired max:32
+                                ] ifFalse:[
+                                    nBitsRequired := nBitsRequired max:16
+                                ]
+                            ].
+                            ascii < 200000 ifTrue:[
+                                errorReporter value:'overlong utf8 sequence'
+                            ].
+                        ] ifFalse:[
+                            (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
+                                "/ 4000000 .. 7FFFFFFF
+                                ascii := (byte bitAnd:2r00000001).
+                                next6Bits value.
+                                next6Bits value.
+                                next6Bits value.
+                                next6Bits value.
+                                last6Bits value.
+                                ascii > 16rFF ifTrue:[
+                                    ascii > 16rFFFF ifTrue:[
+                                        nBitsRequired := nBitsRequired max:32
+                                    ] ifFalse:[
+                                        nBitsRequired := nBitsRequired max:16
+                                    ]
+                                ].
+                                ascii < 16r4000000 ifTrue:[
+                                    errorReporter value:'overlong utf8 sequence'
+                                ].
+                            ] ifFalse:[
+                                errorReporter value:'invalid utf8 encoding'
+                            ]
+                        ]
+                    ]
+                ]
+            ].
+        ].
+        sz := sz + 1.
     ].
     nBitsRequired == 8 ifTrue:[
-	anyAbove7BitAscii ifFalse:[
-	    "/ can return the original string
-	    aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
-	].
-	newString := String uninitializedNew:sz
+        anyAbove7BitAscii ifFalse:[
+            "/ can return the original string
+            aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
+        ].
+        newString := String uninitializedNew:sz
     ] ifFalse:[
-	nBitsRequired <= 16 ifTrue:[
-	    newString := Unicode16String new:sz
-	] ifFalse:[
-	    newString := Unicode32String new:sz
-	]
+        nBitsRequired <= 16 ifTrue:[
+            newString := Unicode16String new:sz
+        ] ifFalse:[
+            newString := Unicode32String new:sz
+        ]
     ].
 
     next6Bits := [
-		    |byte|
+                    |byte|
 
-		    byte := s nextByte.
-		    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
-		 ].
+                    byte := s nextByte.
+                    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
+                 ].
 
     s := aStringOrByteCollection readStream.
     idx := 1.
     [s atEnd] whileFalse:[
-	byte := ascii := s nextByte.
-	(byte bitAnd:2r10000000) ~~ 0 ifTrue:[
-	    (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
-		ascii := (byte bitAnd:2r00011111).
-		next6Bits value.
-	    ] ifFalse:[
-		(byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
-		    ascii := (byte bitAnd:2r00001111).
-		    next6Bits value.
-		    next6Bits value.
-		] ifFalse:[
-		    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
-			ascii := (byte bitAnd:2r00000111).
-			next6Bits value.
-			next6Bits value.
-			next6Bits value.
-		    ] ifFalse:[
-			(byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
-			    ascii := (byte bitAnd:2r00000011).
-			    next6Bits value.
-			    next6Bits value.
-			    next6Bits value.
-			    next6Bits value.
-			] ifFalse:[
-			    (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
-				ascii := (byte bitAnd:2r00000001).
-				next6Bits value.
-				next6Bits value.
-				next6Bits value.
-				next6Bits value.
-				last6Bits value.
-			    ]
-			]
-		    ]
-		]
-	    ].
-	].
-	newString at:idx put:(Character value:ascii).
-	idx := idx + 1.
+        byte := ascii := s nextByte.
+        (byte bitAnd:2r10000000) ~~ 0 ifTrue:[
+            (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
+                ascii := (byte bitAnd:2r00011111).
+                next6Bits value.
+            ] ifFalse:[
+                (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
+                    ascii := (byte bitAnd:2r00001111).
+                    next6Bits value.
+                    next6Bits value.
+                ] ifFalse:[
+                    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
+                        ascii := (byte bitAnd:2r00000111).
+                        next6Bits value.
+                        next6Bits value.
+                        next6Bits value.
+                    ] ifFalse:[
+                        (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
+                            ascii := (byte bitAnd:2r00000011).
+                            next6Bits value.
+                            next6Bits value.
+                            next6Bits value.
+                            next6Bits value.
+                        ] ifFalse:[
+                            (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
+                                ascii := (byte bitAnd:2r00000001).
+                                next6Bits value.
+                                next6Bits value.
+                                next6Bits value.
+                                next6Bits value.
+                                last6Bits value.
+                            ]
+                        ]
+                    ]
+                ]
+            ].
+        ].
+        newString at:idx put:(Character value:ascii).
+        idx := idx + 1.
     ].
     ^ newString
 
@@ -291,59 +296,59 @@
      If you work a lot with utf8 encoded textFiles,
      this is a first-class candidate for a primitive."
 
-    |s anyAbove7BitAscii|
+    |s|
 
-    anyAbove7BitAscii := false.
+    "/ avoid creation of new strings
+    aUnicodeString contains8BitCharacters ifFalse:[^ aUnicodeString].
+
     s := WriteStream on:(String uninitializedNew:aUnicodeString size).
     aUnicodeString do:[:eachCharacter |
-	|codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
+        |codePoint b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
 
-	codePoint := eachCharacter codePoint.
-	codePoint <= 16r7F ifTrue:[
-	    s nextPut:eachCharacter.
-	] ifFalse:[
-	    anyAbove7BitAscii := true.
-	    b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
-	    v := codePoint bitShift:-6.
-	    v <= 16r1F ifTrue:[
-		s nextPut:(Character value:(v bitOr:2r11000000)).
-		s nextPut:b1.
-	    ] ifFalse:[
-		b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
-		v := v bitShift:-6.
-		v <= 16r0F ifTrue:[
-		    s nextPut:(Character value:(v bitOr:2r11100000)).
-		    s nextPut:b2; nextPut:b1.
-		] ifFalse:[
-		    b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
-		    v := v bitShift:-6.
-		    v <= 16r07 ifTrue:[
-			s nextPut:(Character value:(v bitOr:2r11110000)).
-			s nextPut:b3; nextPut:b2; nextPut:b1.
-		    ] ifFalse:[
-			b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
-			v := v bitShift:-6.
-			v <= 16r03 ifTrue:[
-			    s nextPut:(Character value:(v bitOr:2r11111000)).
-			    s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
-			] ifFalse:[
-			    b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
-			    v := v bitShift:-6.
-			    v <= 16r01 ifTrue:[
-				s nextPut:(Character value:(v bitOr:2r11111100)).
-				s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
-			    ] ifFalse:[
-				"/ cannot happen - we only support up to 30 bit characters
-				self error:'ascii value > 31bit in utf8Encode'.
-			    ]
-			].
-		    ].
-		].
-	    ].
-	].
+        codePoint := eachCharacter codePoint.
+        codePoint <= 16r7F ifTrue:[
+            s nextPut:eachCharacter.
+        ] ifFalse:[
+            b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
+            v := codePoint bitShift:-6.
+            v <= 16r1F ifTrue:[
+                s nextPut:(Character value:(v bitOr:2r11000000)).
+                s nextPut:b1.
+            ] ifFalse:[
+                b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+                v := v bitShift:-6.
+                v <= 16r0F ifTrue:[
+                    s nextPut:(Character value:(v bitOr:2r11100000)).
+                    s nextPut:b2; nextPut:b1.
+                ] ifFalse:[
+                    b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+                    v := v bitShift:-6.
+                    v <= 16r07 ifTrue:[
+                        s nextPut:(Character value:(v bitOr:2r11110000)).
+                        s nextPut:b3; nextPut:b2; nextPut:b1.
+                    ] ifFalse:[
+                        b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+                        v := v bitShift:-6.
+                        v <= 16r03 ifTrue:[
+                            s nextPut:(Character value:(v bitOr:2r11111000)).
+                            s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
+                        ] ifFalse:[
+                            b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
+                            v := v bitShift:-6.
+                            v <= 16r01 ifTrue:[
+                                s nextPut:(Character value:(v bitOr:2r11111100)).
+                                s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
+                            ] ifFalse:[
+                                "/ cannot happen - we only support up to 30 bit characters
+                                self error:'ascii value > 31bit in utf8Encode'.
+                            ]
+                        ].
+                    ].
+                ].
+            ].
+        ].
     ].
 
-    anyAbove7BitAscii ifFalse:[^ aUnicodeString].   "/ avoid creation of new strings
     ^ s contents
 
     "
@@ -373,5 +378,5 @@
 !ISO10646_to_UTF8 class methodsFor:'documentation'!
 
 version
-    ^ '$Header: /cvs/stx/stx/libbasic/Attic/Encoder_ISO10646_to_UTF8.st,v 1.9 2004-06-16 18:52:26 ca Exp $'
+    ^ '$Header: /cvs/stx/stx/libbasic/Attic/Encoder_ISO10646_to_UTF8.st,v 1.10 2004-08-03 20:08:14 penk Exp $'
 ! !