Merge jv
author Jan Vrany <jan.vrany@fit.cvut.cz>
Tue, 21 Jul 2015 06:19:27 +0100
branch jv
changeset 18610 b9799e74a9c5
parent 18609 af1c36c18e24 (current diff)
parent 18607 90941e1c74c8 (diff)
child 18617 fbfd2d411738
Merge
Character.st
CharacterEncoderImplementations__ISO10646_to_UTF8.st
ExternalBytes.st
TwoByteString.st
--- a/Character.st	Tue Jul 21 06:19:13 2015 +0100
+++ b/Character.st	Tue Jul 21 06:19:27 2015 +0100
@@ -1,5 +1,3 @@
-"{ Encoding: utf8 }"
-
 "
  COPYRIGHT (c) 1988 by Claus Gittinger
 	      All Rights Reserved
@@ -198,107 +196,94 @@
 utf8DecodeFrom:aStream
     "read and return a single unicode character from an UTF8 encoded stream"
 
-    |fetchNext c1 c2 c3 c4 c5 codePoint|
+    |fetchNext c1 c2 codePoint|
 
     c1 := aStream next.
     codePoint := c1 codePoint.
     codePoint <= 16r7F ifTrue:[
-	"/ 0xxxxxxx - 7 bits
-	^ c1.
+        "/ 0xxxxxxx - 7 bits
+        ^ c1 asCharacter.
     ].
 
     (codePoint bitAnd:2r11000000) == 2r10000000 ifTrue:[
-	"/ out of sync (got an intermediate character)
-	InvalidEncodingError raiseRequestWith:codePoint errorString:' - out of sync'.
-	^ c1.
+        "/ out of sync (got an intermediate character)
+        InvalidEncodingError raiseRequestWith:codePoint errorString:' - out of sync'.
+        ^ c1 asCharacter.
     ].
 
-    fetchNext := [  |ch|
-		    ch := aStream next.
-		    (ch codePoint bitAnd:2r11000000) == 2r10000000 ifFalse:[
-			"/ followup chars must have 2r10 in high bits
-			InvalidEncodingError raiseRequestWith:ch codePoint.
-			^ c1.
-		    ].
-		    ch
-		 ].
+    fetchNext := [  |code|
+                    code := aStream next codePoint.
+                    (code bitAnd:2r11000000) == 2r10000000 ifFalse:[
+                        "/ followup chars must have 2r10 in high bits
+                        InvalidEncodingError raiseRequestWith:code.
+                        ^ c1 asCharacter.
+                    ].
+                    code bitAnd:16r3F
+                 ].
 
     (codePoint bitAnd:2r11100000) == 2r11000000 ifTrue:[
-	"/ 110xxxxx 10xxxxxx - 11 bits
-	c2 := fetchNext value.
-	codePoint := c1 codePoint bitAnd:16r1F.
-	codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
-	codePoint <= 16r7F ifTrue:[
-	    InvalidEncodingError raiseRequestWith:codePoint.
-	].
-	^ Character codePoint:codePoint
+        "/ 110xxxxx 10xxxxxx - 11 bits
+        codePoint := codePoint bitAnd:16r1F.
+        codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+        codePoint <= 16r7F ifTrue:[
+            InvalidEncodingError raiseRequestWith:codePoint.
+        ].
+        ^ Character codePoint:codePoint
     ].
     (codePoint bitAnd:2r11110000) == 2r11100000 ifTrue:[
-	"/ 1110xxxx 10xxxxxx 10xxxxxx - 16 bits
-	c2 := fetchNext value.
-	c3 := fetchNext value.
-	codePoint := c1 codePoint bitAnd:16r0F.
-	codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
-	codePoint := (codePoint bitShift:6) bitOr:(c3 codePoint bitAnd:16r3F).
-	codePoint <= 16r7FF ifTrue:[
-	    InvalidEncodingError raiseRequestWith:codePoint.
-	].
-	^ Character codePoint:codePoint
+        "/ 1110xxxx 10xxxxxx 10xxxxxx - 16 bits
+        codePoint := codePoint bitAnd:16r0F.
+        codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+        codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+        codePoint <= 16r7FF ifTrue:[
+            InvalidEncodingError raiseRequestWith:codePoint.
+        ].
+        ^ Character codePoint:codePoint
     ].
 
-    "/ notice: currently, characters can only have 16bit encoding;
-    "/ therefore the following will raise a runtime exception,
-
     (codePoint bitAnd:2r11111000) == 2r11110000 ifTrue:[
-	"/ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21 bits
-	c2 := fetchNext value.
-	c3 := fetchNext value.
-	c4 := fetchNext value.
-	codePoint := c1 codePoint bitAnd:16r07.
-	codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
-	codePoint := (codePoint bitShift:6) bitOr:(c3 codePoint bitAnd:16r3F).
-	codePoint := (codePoint bitShift:6) bitOr:(c4 codePoint bitAnd:16r3F).
-	codePoint <= 16rFFFF ifTrue:[
-	    InvalidEncodingError raiseRequestWith:codePoint.
-	].
-	^ Character codePoint:codePoint
+        "/ 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21 bits
+        codePoint := codePoint bitAnd:16r07.
+        codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+        codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+        codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+        codePoint <= 16rFFFF ifTrue:[
+            InvalidEncodingError raiseRequestWith:codePoint.
+        ].
+        ^ Character codePoint:codePoint
     ].
 
     (codePoint bitAnd:2r11111100) == 2r11111000 ifTrue:[
-	"/ 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26 bits
-	c2 := fetchNext value.
-	c3 := fetchNext value.
-	c4 := fetchNext value.
-	c5 := fetchNext value.
-	codePoint := c1 codePoint bitAnd:16r03.
-	codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
-	codePoint := (codePoint bitShift:6) bitOr:(c3 codePoint bitAnd:16r3F).
-	codePoint := (codePoint bitShift:6) bitOr:(c4 codePoint bitAnd:16r3F).
-	codePoint := (codePoint bitShift:6) bitOr:(c5 codePoint bitAnd:16r3F).
-	codePoint <= 16r1FFFFF ifTrue:[
-	    InvalidEncodingError raiseRequestWith:codePoint.
-	].
-	^ Character codePoint:codePoint
+        "/ 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26 bits
+        codePoint := codePoint bitAnd:16r03.
+        codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+        codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+        codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+        codePoint := (codePoint bitShift:6) bitOr:(fetchNext value).
+        codePoint <= 16r1FFFFF ifTrue:[
+            InvalidEncodingError raiseRequestWith:codePoint.
+        ].
+        ^ Character codePoint:codePoint
     ].
 
     (codePoint bitAnd:2r11111110) == 2r11111100 ifTrue:[
-	"/ 1111110x ... 10xxxxxx - any number of bits
-	codePoint := c1 codePoint bitAnd:16r01.
-
-	c2 := aStream peek.
-	[c2 notNil and:[(c2 codePoint bitAnd:2r11000000) == 2r10000000]] whileTrue:[
-	    codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
-	    aStream next.
-	    c2 := aStream peek.
-	].
-	codePoint <= 16r3FFFFFF ifTrue:[
-	    InvalidEncodingError raiseRequestWith:codePoint.
-	].
-	^ Character codePoint:codePoint
+        "/ 1111110x ... 10xxxxxx - any number of bits
+        codePoint := codePoint bitAnd:16r01.
+
+        c2 := aStream peek.
+        [c2 notNil and:[(c2 codePoint bitAnd:2r11000000) == 2r10000000]] whileTrue:[
+            codePoint := (codePoint bitShift:6) bitOr:(c2 codePoint bitAnd:16r3F).
+            aStream next.
+            c2 := aStream peek.
+        ].
+        codePoint <= 16r3FFFFFF ifTrue:[
+            InvalidEncodingError raiseRequestWith:codePoint.
+        ].
+        ^ Character codePoint:codePoint
     ].
 
     InvalidEncodingError raiseRequestWith:codePoint.
-    ^ c1
+    ^ c1 asCharacter.
 
     "
       Character utf8DecodeFrom:'a' readStream
@@ -310,12 +295,12 @@
       |utf8Encoding original readBack|
 
       1 to:16rFFFF do:[:codePoint |
-	original := Character value:codePoint.
-	utf8Encoding := original asString utf8Encoded.
-	readBack := Character utf8DecodeFrom:(utf8Encoding readStream).
-	readBack codePoint = codePoint ifFalse:[
-	    self halt
-	]
+        original := Character value:codePoint.
+        utf8Encoding := original utf8Encoded.
+        readBack := Character utf8DecodeFrom:(utf8Encoding readStream).
+        readBack codePoint = codePoint ifFalse:[
+            self halt
+        ]
       ]
     "
 !
@@ -1472,7 +1457,7 @@
     ^ s contents
 
     "
-	'ä' utf8Encoded
+	'ä' utf8Encoded
     "
 ! !
 
@@ -2534,9 +2519,9 @@
 
     "
      $e asNonDiacritical
-     $é asNonDiacritical
-     $ä asNonDiacritical
-     $Ã¥ asNonDiacritical
+     $é asNonDiacritical
+     $ä asNonDiacritical
+     $å asNonDiacritical
     "
 !
 
--- a/CharacterEncoderImplementations__ISO10646_to_UTF8.st	Tue Jul 21 06:19:13 2015 +0100
+++ b/CharacterEncoderImplementations__ISO10646_to_UTF8.st	Tue Jul 21 06:19:27 2015 +0100
@@ -1,3 +1,5 @@
+"{ Encoding: utf8 }"
+
 "
  COPYRIGHT (c) 2004 by eXept Software AG
 	      All Rights Reserved
@@ -20,6 +22,13 @@
 	category:'Collections-Text-Encodings'
 !
 
+ISO10646_to_UTF8 class instanceVariableNames:'theOneAndOnlyInstance'
+
+"
+ No other class instance variables are inherited by this class.
+"
+!
+
 !ISO10646_to_UTF8 class methodsFor:'documentation'!
 
 copyright
@@ -45,11 +54,41 @@
   Decoding (utf8 to unicode):
      |t|
 
-     t := ISO10646_to_UTF8 encodeString:'Helloœ'.
+     t := ISO10646_to_UTF8 encodeString:'Helloœ'.
      ISO10646_to_UTF8 decodeString:t.
 "
 ! !
 
+!ISO10646_to_UTF8 class methodsFor:'instance creation'!
+
+flushSingleton
+    "drop the cached singleton (reset it to nil); the next #new or #theOneAndOnlyInstance call then creates a fresh instance"
+
+    theOneAndOnlyInstance := nil
+
+    "
+     self flushSingleton
+    "
+!
+
+new
+    "return the shared singleton instead of a new object: it is created lazily (basicNew + initialize) on first use and cached in the class instance variable theOneAndOnlyInstance (same logic as #theOneAndOnlyInstance)"
+
+    theOneAndOnlyInstance isNil ifTrue:[
+        theOneAndOnlyInstance := self basicNew initialize.
+    ].
+    ^ theOneAndOnlyInstance.
+!
+
+theOneAndOnlyInstance
+    "return the cached singleton, creating it lazily (basicNew + initialize) if it does not exist yet or was dropped by #flushSingleton"
+
+    theOneAndOnlyInstance isNil ifTrue:[
+        theOneAndOnlyInstance := self basicNew initialize.
+    ].
+    ^ theOneAndOnlyInstance.
+! !
+
 !ISO10646_to_UTF8 methodsFor:'encoding & decoding'!
 
 decode:aCode
@@ -311,12 +350,12 @@
 
     |s|
 
-    "/ avoid creation of new strings
+    "/ avoid creation of new strings if possible
     aUnicodeString containsNon7BitAscii ifFalse:[
         ^ aUnicodeString asSingleByteString
     ].
 
-    s := WriteStream on:(String uninitializedNew:aUnicodeString size).
+    s := WriteStream on:(String uninitializedNew:(aUnicodeString size * 3 // 2)).
     aUnicodeString do:[:eachCharacter |
         |codePoint "{Class: SmallInteger }" b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|
 
--- a/ExternalBytes.st	Tue Jul 21 06:19:13 2015 +0100
+++ b/ExternalBytes.st	Tue Jul 21 06:19:27 2015 +0100
@@ -1,5 +1,3 @@
-"{ Encoding: utf8 }"
-
 "
  COPYRIGHT (c) 1993 by Claus Gittinger
 	      All Rights Reserved
@@ -580,6 +578,15 @@
     "
 !
 
+isBuiltInClass
+    "return true if this class is known by the run-time-system.
+     Here, true is returned for ExternalBytes itself only (identity test), false for any other receiver."
+
+    ^ self == ExternalBytes
+
+    "Modified: / 11.6.1998 / 17:12:51 / cg"
+!
+
 longAlignment
     "return the alignement of longs in structs and unions"
 
@@ -595,15 +602,6 @@
     "
 !
 
-isBuiltInClass
-    "return true if this class is known by the run-time-system.
-     Here, true is returned."
-
-    ^ self == ExternalBytes
-
-    "Modified: / 11.6.1998 / 17:12:51 / cg"
-!
-
 sizeofDouble
     "return the number of bytes used by the machines native doubles"
 
@@ -1274,6 +1272,48 @@
     ^ size
 !
 
+containsNon7BitAscii
+    "return true, if any byte in the receiver has the high bit (16r80) set.
+     This may look like too specific an operation to be put here,
+     but it is very helpful for UTF8 string reading (Java class reader),
+     to quickly determine whether UTF8 decoding is needed or not.
+     As most strings in a class file in fact contain only 7bit ascii,
+     this should speed up class file reading considerably"
+
+%{  /* NOCONTEXT */
+    unsigned char *cp = __INST(address_);
+    unsigned int size = __intVal(__INST(size));
+    unsigned char *endP;
+
+    if (cp == nil || size == 0) {   /* no address or nothing to scan: no high bit can be set */
+        RETURN(false);
+    }
+
+    endP = cp + size;
+#if __POINTER_SIZE__ == 8
+    while (cp+8 < endP) {   /* test 8 bytes per step; whatever remains is handled by the loops below */
+        if ( ((unsigned INT *)cp)[0] & 0x8080808080808080) RETURN( true );   /* NOTE(review): word-wide load from an external address - presumably relies on unaligned reads being tolerated; confirm */
+        cp += 8;
+    }
+#endif
+    while (cp+4 < endP) {   /* then 4 bytes per step */
+        if ( ((unsigned int *)cp)[0] & 0x80808080) RETURN( true );
+        cp += 4;
+    }
+    while (cp < endP) {   /* finally the remaining bytes one by one */
+        if (*cp++ & 0x80) RETURN( true );
+    }
+    RETURN ( false );
+%}
+.
+    ^ self contains:[:b | b bitTest:16r80].
+
+    "
+     #[1 2 3 1 2 3 1 2 127 ] asExternalBytes containsNon7BitAscii
+     #[1 2 3 1 2 3 1 2 250 251 250 251 255] asExternalBytes containsNon7BitAscii
+    "
+!
+
 isValid
     "true if I gave an address"
 
--- a/TwoByteString.st	Tue Jul 21 06:19:13 2015 +0100
+++ b/TwoByteString.st	Tue Jul 21 06:19:27 2015 +0100
@@ -122,11 +122,7 @@
 utf8Encoded
     "Return my UTF-8 representation as a new String"
 
-    self containsNon7BitAscii ifTrue:[
-        ^ self basicUtf8Encoded.
-    ].
-
-    ^ self asSingleByteString.
+    ^ CharacterEncoderImplementations::ISO10646_to_UTF8 new encodeString:self
 
 
     "