--- a/Character.st Tue Jul 21 06:19:13 2015 +0100
+++ b/Character.st Tue Jul 21 06:19:27 2015 +0100
@@ -1,5 +1,3 @@
-"{ Encoding: utf8 }"
-
"
COPYRIGHT (c) 1988 by Claus Gittinger
All Rights Reserved
@@ -198,107 +196,94 @@
utf8DecodeFrom:aStream
"read and return a single unicode character from an UTF8 encoded stream"
- |fetchNext c1 c2 c3 c4 c5 codePoint|
+ |fetchNext c1 c2 codePoint|
c1 := aStream next.
codePoint := c1 codePoint.
codePoint <= 16r7F ifTrue:[
- "/ 0xxxxxxx - 7 bits
- ^ c1.
+ "/ 0xxxxxxx - 7 bits
+ ^ c1 asCharacter.
].
(codePoint bitAnd:2r11000000) == 2r10000000 ifTrue:[
- "/ out of sync (got an intermediate character)
- InvalidEncodingError raiseRequestWith:codePoint errorString:' - out of sync'.
- ^ c1.
+ "/ out of sync (got an intermediate character)
+ InvalidEncodingError raiseRequestWith:codePoint errorString:' - out of sync'.
+ ^ c1 asCharacter.
].
- fetchNext := [ |ch|
- ch := aStream next.
- (ch codePoint bitAnd:2r11000000) == 2r10000000 ifFalse:[
- "/ followup chars must have 2r10 in high bits
- InvalidEncodingError raiseRequestWith:ch codePoint.
- ^ c1.
- ].
- ch
- ].
+ fetchNext := [ |code|
+ code := aStream next codePoint.
+ (code bitAnd:2r11000000) == 2r10000000 ifFalse:[
+ "/ followup chars must have 2r10 in high bits
+ InvalidEncodingError raiseRequestWith:code.
+ ^ c1 asCharacter.
+ ].
+ code bitAnd:16r3F
+ ].
(codePoint bitAnd:2r11100000) == 2r11000000 ifTrue:[
- "/ 110xxxxx 10xxxxxx - 11 bits
- c2 := fetchNext value.
- codePoint := c1 codePoint bitAnd:16r1F.
- codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
- codePoint <= 16r7F ifTrue:[
- InvalidEncodingError raiseRequestWith:codePoint.
- ].
- ^ Character codePoint:codePoint
+ "/ 110xxxxx 10xxxxxx - 11 bits
+ codePoint := codePoint bitAnd:16r1F.
+ codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+ codePoint <= 16r7F ifTrue:[
+ InvalidEncodingError raiseRequestWith:codePoint.
+ ].
+ ^ Character codePoint:codePoint
].
(codePoint bitAnd:2r11110000) == 2r11100000 ifTrue:[
- "/ 1110xxxx 10xxxxxx 10xxxxxx - 16 bits
- c2 := fetchNext value.
- c3 := fetchNext value.
- codePoint := c1 codePoint bitAnd:16r0F.
- codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
- codePoint := (codePoint bitShift:6) bitOr:(c3 codePoint bitAnd:16r3F).
- codePoint <= 16r7FF ifTrue:[
- InvalidEncodingError raiseRequestWith:codePoint.
- ].
- ^ Character codePoint:codePoint
+ "/ 1110xxxx 10xxxxxx 10xxxxxx - 16 bits
+ codePoint := codePoint bitAnd:16r0F.
+ codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+ codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+ codePoint <= 16r7FF ifTrue:[
+ InvalidEncodingError raiseRequestWith:codePoint.
+ ].
+ ^ Character codePoint:codePoint
].
- "/ notice: currently, characters can only have 16bit encoding;
- "/ therefore the following will raise a runtime exception,
-
(codePoint bitAnd:2r11111000) == 2r11110000 ifTrue:[
- "/ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21 bits
- c2 := fetchNext value.
- c3 := fetchNext value.
- c4 := fetchNext value.
- codePoint := c1 codePoint bitAnd:16r07.
- codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
- codePoint := (codePoint bitShift:6) bitOr:(c3 codePoint bitAnd:16r3F).
- codePoint := (codePoint bitShift:6) bitOr:(c4 codePoint bitAnd:16r3F).
- codePoint <= 16rFFFF ifTrue:[
- InvalidEncodingError raiseRequestWith:codePoint.
- ].
- ^ Character codePoint:codePoint
+ "/ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21 bits
+ codePoint := codePoint bitAnd:16r07.
+ codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+ codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+ codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+ codePoint <= 16rFFFF ifTrue:[
+ InvalidEncodingError raiseRequestWith:codePoint.
+ ].
+ ^ Character codePoint:codePoint
].
(codePoint bitAnd:2r11111100) == 2r11111000 ifTrue:[
- "/ 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26 bits
- c2 := fetchNext value.
- c3 := fetchNext value.
- c4 := fetchNext value.
- c5 := fetchNext value.
- codePoint := c1 codePoint bitAnd:16r03.
- codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
- codePoint := (codePoint bitShift:6) bitOr:(c3 codePoint bitAnd:16r3F).
- codePoint := (codePoint bitShift:6) bitOr:(c4 codePoint bitAnd:16r3F).
- codePoint := (codePoint bitShift:6) bitOr:(c5 codePoint bitAnd:16r3F).
- codePoint <= 16r1FFFFF ifTrue:[
- InvalidEncodingError raiseRequestWith:codePoint.
- ].
- ^ Character codePoint:codePoint
+ "/ 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26 bits
+ codePoint := codePoint bitAnd:16r03.
+ codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+ codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+ codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+ codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+ codePoint <= 16r1FFFFF ifTrue:[
+ InvalidEncodingError raiseRequestWith:codePoint.
+ ].
+ ^ Character codePoint:codePoint
].
(codePoint bitAnd:2r11111110) == 2r11111100 ifTrue:[
- "/ 1111110x ... 10xxxxxx - any number of bits
- codePoint := c1 codePoint bitAnd:16r01.
-
- c2 := aStream peek.
- [c2 notNil and:[(c2 codePoint bitAnd:2r11000000) == 2r10000000]] whileTrue:[
- codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
- aStream next.
- c2 := aStream peek.
- ].
- codePoint <= 16r3FFFFFF ifTrue:[
- InvalidEncodingError raiseRequestWith:codePoint.
- ].
- ^ Character codePoint:codePoint
+ "/ 1111110x ... 10xxxxxx - any number of bits
+ codePoint := codePoint bitAnd:16r01.
+
+ c2 := aStream peek.
+ [c2 notNil and:[(c2 codePoint bitAnd:2r11000000) == 2r10000000]] whileTrue:[
+ codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
+ aStream next.
+ c2 := aStream peek.
+ ].
+ codePoint <= 16r3FFFFFF ifTrue:[
+ InvalidEncodingError raiseRequestWith:codePoint.
+ ].
+ ^ Character codePoint:codePoint
].
InvalidEncodingError raiseRequestWith:codePoint.
- ^ c1
+ ^ c1 asCharacter.
"
Character utf8DecodeFrom:'a' readStream
@@ -310,12 +295,12 @@
|utf8Encoding original readBack|
1 to:16rFFFF do:[:codePoint |
- original := Character value:codePoint.
- utf8Encoding := original asString utf8Encoded.
- readBack := Character utf8DecodeFrom:(utf8Encoding readStream).
- readBack codePoint = codePoint ifFalse:[
- self halt
- ]
+ original := Character value:codePoint.
+ utf8Encoding := original utf8Encoded.
+ readBack := Character utf8DecodeFrom:(utf8Encoding readStream).
+ readBack codePoint = codePoint ifFalse:[
+ self halt
+ ]
]
"
!
@@ -1472,7 +1457,7 @@
^ s contents
"
- 'ä' utf8Encoded
+ 'ä' utf8Encoded
"
! !
@@ -2534,9 +2519,9 @@
"
$e asNonDiacritical
- $é asNonDiacritical
- $ä asNonDiacritical
- $Ã¥ asNonDiacritical
+ $é asNonDiacritical
+ $ä asNonDiacritical
+ $å asNonDiacritical
"
!
--- a/CharacterEncoderImplementations__ISO10646_to_UTF8.st Tue Jul 21 06:19:13 2015 +0100
+++ b/CharacterEncoderImplementations__ISO10646_to_UTF8.st Tue Jul 21 06:19:27 2015 +0100
@@ -1,3 +1,5 @@
+"{ Encoding: utf8 }"
+
"
COPYRIGHT (c) 2004 by eXept Software AG
All Rights Reserved
@@ -20,6 +22,13 @@
category:'Collections-Text-Encodings'
!
+ISO10646_to_UTF8 class instanceVariableNames:'theOneAndOnlyInstance'
+
+"
+ No other class instance variables are inherited by this class.
+"
+!
+
!ISO10646_to_UTF8 class methodsFor:'documentation'!
copyright
@@ -45,11 +54,41 @@
Decoding (utf8 to unicode):
|t|
- t := ISO10646_to_UTF8 encodeString:'Helloœ'.
+ t := ISO10646_to_UTF8 encodeString:'Helloœ'.
ISO10646_to_UTF8 decodeString:t.
"
! !
+!ISO10646_to_UTF8 class methodsFor:'instance creation'!
+
+flushSingleton
+ "reset the cached singleton to nil, so that the next #new or #theOneAndOnlyInstance creates a fresh instance"
+
+ theOneAndOnlyInstance := nil
+
+ "
+ self flushSingleton
+ "
+!
+
+new
+ "return the singleton; it is created lazily (basicNew initialize) on first use and cached in theOneAndOnlyInstance"
+
+ theOneAndOnlyInstance isNil ifTrue:[
+ theOneAndOnlyInstance := self basicNew initialize.
+ ].
+ ^ theOneAndOnlyInstance.
+!
+
+theOneAndOnlyInstance
+ "return the cached singleton, creating it (basicNew initialize) if the cache is currently nil"
+
+ theOneAndOnlyInstance isNil ifTrue:[
+ theOneAndOnlyInstance := self basicNew initialize.
+ ].
+ ^ theOneAndOnlyInstance.
+! !
+
!ISO10646_to_UTF8 methodsFor:'encoding & decoding'!
decode:aCode
@@ -311,12 +350,12 @@
|s|
- "/ avoid creation of new strings
+ "/ avoid creation of new strings if possible
aUnicodeString containsNon7BitAscii ifFalse:[
^ aUnicodeString asSingleByteString
].
- s := WriteStream on:(String uninitializedNew:aUnicodeString size).
+ s := WriteStream on:(String uninitializedNew:(aUnicodeString size * 3 // 2)).
aUnicodeString do:[:eachCharacter |
|codePoint "{Class: SmallInteger }" b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
--- a/ExternalBytes.st Tue Jul 21 06:19:13 2015 +0100
+++ b/ExternalBytes.st Tue Jul 21 06:19:27 2015 +0100
@@ -1,5 +1,3 @@
-"{ Encoding: utf8 }"
-
"
COPYRIGHT (c) 1993 by Claus Gittinger
All Rights Reserved
@@ -580,6 +578,15 @@
"
!
+isBuiltInClass
+ "return true if this class is known by the run-time-system.
+ Here, true is returned only if the receiver is ExternalBytes itself (identity test)."
+
+ ^ self == ExternalBytes
+
+ "Modified: / 11.6.1998 / 17:12:51 / cg"
+!
+
longAlignment
"return the alignement of longs in structs and unions"
@@ -595,15 +602,6 @@
"
!
-isBuiltInClass
- "return true if this class is known by the run-time-system.
- Here, true is returned."
-
- ^ self == ExternalBytes
-
- "Modified: / 11.6.1998 / 17:12:51 / cg"
-!
-
sizeofDouble
"return the number of bytes used by the machines native doubles"
@@ -1274,6 +1272,48 @@
^ size
!
+containsNon7BitAscii
+ "return true, if any byte in the receiver has its high bit (16r80) set.
+ This may look like too specific an operation to be put here,
+ but it is very helpful for UTF8 string reading (Java class reader),
+ to quickly determine if UTF8 decoding is needed or not.
+ As most strings in a class file in fact contain only 7bit ascii,
+ this should speed up class file reading considerably"
+
+%{ /* NOCONTEXT */
+ unsigned char *cp = __INST(address_);
+ unsigned int size = __intVal(__INST(size));
+ unsigned char *endP;
+
+ if (cp == nil || size == 0) {
+ RETURN(false);
+ }
+
+ endP = cp + size;
+#if __POINTER_SIZE__ == 8
+ while (cp+8 < endP) {
+ if ( ((unsigned INT *)cp)[0] & 0x8080808080808080) RETURN( true );
+ cp += 8;
+ }
+#endif
+ while (cp+4 < endP) {
+ if ( ((unsigned int *)cp)[0] & 0x80808080) RETURN( true );
+ cp += 4;
+ }
+ while (cp < endP) {
+ if (*cp++ & 0x80) RETURN( true );
+ }
+ RETURN ( false );
+%}
+.
+ ^ self contains:[:b | b bitTest:16r80].
+
+ "
+ #[1 2 3 1 2 3 1 2 127 ] asExternalBytes containsNon7BitAscii
+ #[1 2 3 1 2 3 1 2 250 251 250 251 255] asExternalBytes containsNon7BitAscii
+ "
+!
+
isValid
"true if I gave an address"
--- a/TwoByteString.st Tue Jul 21 06:19:13 2015 +0100
+++ b/TwoByteString.st Tue Jul 21 06:19:27 2015 +0100
@@ -122,11 +122,7 @@
utf8Encoded
"Return my UTF-8 representation as a new String"
- self containsNon7BitAscii ifTrue:[
- ^ self basicUtf8Encoded.
- ].
-
- ^ self asSingleByteString.
+ ^ CharacterEncoderImplementations::ISO10646_to_UTF8 new encodeString:self
"