extensions.st
changeset 3195 0b6a9ff08acd
parent 3152 2f2937ea3f03
child 3296 761c7bb5853c
--- a/extensions.st	Sun Aug 03 23:43:40 2014 +0100
+++ b/extensions.st	Mon Aug 04 15:43:51 2014 +0100
@@ -417,96 +417,64 @@
 
 !CharacterArray class methodsFor:'encoding & decoding'!
 
-decodeFromJavaUTF8: bytes
+decodeFromJavaUTF8: string8
     "Decodes a string from modified UTF8 encoding
      as used in Java .class files. see
      'The class file format specification', section 4.5.7"
 
-    | string  i  s  b  codePoint  realLength |
-
-    string := String new: bytes size.
-    realLength := bytes size.
-    s := bytes readStream.
-    i := 1.
-    [ s atEnd ] whileFalse:
-            [ b := s next.
-            (b & 2r10000000) == 0
-                ifTrue: [ codePoint := b ]
-                ifFalse:
-                    [ self assert: (b & 2r01000000) = 2r01000000.
-                    (b & 2r00100000) = 0
-                        ifTrue:
-                            [ "two byte utf char"
-                            realLength := realLength - 1.
-                            self assert: s size > 0.
-                            self assert: (b & 2r01000000) = 2r01000000.
-                            string bitsPerCharacter < 16
-                                ifTrue: [ string := Unicode16String fromString: string ].
-                            codePoint := (b & 2r00011111) << 6.
-                            b := s next.
-                            self assert: (b & 2r11000000) = 2r10000000.
-                            codePoint := codePoint + (b & 2r00111111). ]
-                        ifFalse:
-                            [ "at lease 3 byte utf char"
-                            realLength := realLength - 2.
-                            string bitsPerCharacter < 16"was: 32"
-                                ifTrue: [ string := Unicode16String"was: Unicode32String" fromString: string ].
-                            self assert: s size > 1.
-                            (b & 2r00010000) = 0
-                                ifTrue:
-                                    [ | utf32Possible  utf32Value |
-
-                                    "3 or 6 byte utf char"
-                                    self assert: s size > 1.
-                                    s size < 5
-                                        ifTrue: [ utf32Possible := false ]
-                                        ifFalse: [ utf32Possible := true ].
-                                    b ~= 2r11101101 ifTrue: [ utf32Possible := false ].
-                                    codePoint := (b & 2r00001111) << 12.
-                                    b := s next.
-                                    self assert: (b & 2r11000000) = 2r10000000.
-                                    ((b & 2r11110000) = 2r10100000 and: [ utf32Possible ])
-                                        ifTrue: [ utf32Value := 2r00010000 + ((b & 2r00001111) << 16) ]
-                                        ifFalse: [ utf32Possible := false ].
-                                    codePoint := codePoint + ((b & 2r00111111) << 6).
-                                    b := s next.
-                                    self assert: (b & 2r11000000) = 2r10000000.
-                                    utf32Possible
-                                        ifTrue: [ utf32Value := utf32Value + ((b & 2r00111111) << 10) ].
-                                    codePoint := codePoint + (b & 2r00111111).
-                                    utf32Possible
-                                        ifTrue:
-                                            [ | tmpB |
-
-                                            string bitsPerCharacter < 32
-                                                ifTrue: [ string := Unicode32String fromString: string ].
-                                            tmpB := s copy.
-                                            b := tmpB next.
-                                            b = 2r11101101
-                                                ifTrue:
-                                                    [ b := tmpB next.
-                                                    (b & 2r11110000) = 2r10110000
-                                                        ifTrue:
-                                                            [ utf32Value := utf32Value + ((b & 2r00001111) << 6).
-                                                            b := tmpB next.
-                                                            self assert: (b & 2r11000000) = 2r10000000.
-                                                            utf32Value := utf32Value + (b & 2r00111111).
-                                                            codePoint := utf32Value.
-                                                            realLength := realLength - 3. s position: tmpB position.] ] ] ]
-                                ifFalse:
-                                    [ "should not happen, ask mh"
-                                    self halt. ] ] ].
-            string at: i put: (Character codePoint: codePoint).
-            i := i + 1. ].
-    ^ string subString: 1 to: realLength.
-
-    "
-        String decodeFromJavaUTF8: 'Hello world' asByteArray"
+    | string16 |    "assigned by the C code below: the 16-bit result string for input containing non 7-bit bytes"
+%{
+    if (__isString(string8) ) {    /* any other argument falls through to primitiveFailed */
+        unsigned char *ptr;        /* read cursor over the bytes of string8 */
+        unsigned short c;          /* NOTE(review): never used in this block */
+        unsigned short* dst;       /* write cursor into string16 */
+        int len;                   /* number of decoded characters */
+
+        ptr = __stringVal(string8);    /* presumably the raw byte storage of string8 - confirm */
+        len = 0;
+        while (*ptr) {    /* pass 1: scan up to the first 0 byte for as long as every byte is 7-bit */
+            len++;
+            if ( ( *ptr & 0x80) == 0 ) {    /* high bit clear: single-byte character */
+                ptr++;
+            } else {
+                len--;    /* undo the count; this byte is counted again by the loop at non7bitString */
+                goto non7bitString;
+            }
+        }
+        RETURN ( string8 );    /* only 7-bit bytes seen: the argument itself is returned, no copy is made */
+    non7bitString:    /* pass 2: finish counting characters; ptr still points at the first non 7-bit byte */
+        while (*ptr) {
+            len++;    /* one character per byte sequence */
+            ptr += (*ptr & 0x80) ? ((*ptr & 0x20) ? 3 : 2) : 1;    /* skip 3 bytes if bits 0x80 and 0x20 are set, 2 if only 0x80, else 1. NOTE(review): the skip is not bounds-checked, a truncated sequence could step over the terminating 0 byte - confirm input is always well-formed */
+        }
+
+        string16 = __MKEMPTYUSTRING(len);    /* presumably allocates a 16-bit string of len characters - confirm */
+        ptr = __stringVal(string8);    /* restart at the first byte. NOTE(review): presumably also required because the allocation may move string8 - confirm */
+        dst = __unicode16StringVal(string16);
+        while (*ptr) {    /* pass 3: decode one character per iteration, again stopping at the first 0 byte */
+            int x = *ptr++;    /* lead byte */
+            if ( x & 0x80 ) {    /* multi-byte sequence */
+                int y = *ptr++;    /* second byte. NOTE(review): continuation bytes are not validated here; the replaced Smalltalk code asserted them */
+                if ( x & 0x20 ) {    /* three-byte form */
+                    int z = *ptr++;
+                    *dst++ = ( (x & 0xf ) << 12 ) + ( ( y & 0x3f ) << 6 ) + ( z & 0x3f);    /* low 4 bits of x, 6 of y, 6 of z */
+                } else {
+                    *dst++ = ( ( x & 0x1f ) << 6 ) + ( y & 0x3f );    /* two-byte form: low 5 bits of x, 6 of y */
+                }
+            } else {
+                *dst++ = x;    /* 7-bit byte is stored unchanged */
+            }
+        }
+        RETURN ( string16 );
+    }
+%}.
+    self primitiveFailed.    "reached when the C code above did not RETURN, i.e. the __isString test failed"
 
     "Created: / 22-12-2010 / 23:45:04 / Jan Vrany <jan.vrany@fit.cvut.cz>"
     "Modified: / 09-02-2011 / 01:12:25 / Marcel Hlopko <hlopik@gmail.com>"
     "Modified: / 13-03-2011 / 15:52:36 / Marcel Hlopko <hlopkmar@fel.cvut.cz>"
-    "Modified: / 09-12-2011 / 19:49:04 / Jan Vrany <jan.vrany@fit.cvut.cz>"
+    "Modified: / 04-08-2014 / 14:45:26 / Jan Vrany <jan.vrany@fit.cvut.cz>"
+
 ! !
 
 !CharacterArray class methodsFor:'instance creation'!
@@ -516,7 +484,7 @@
      from the modified utf8 encoded bytes as specified in
      The class file format specification, section 4.5.7"
 
-    ^ self decodeFromJavaUTF8:aByteCollection.
+    ^ self decodeFromJavaUTF8:aByteCollection asString.
 
     "
      CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]
@@ -535,6 +503,7 @@
     "
 
     "Created: / 23-12-2010 / 09:01:32 / Jan Vrany <jan.vrany@fit.cvut.cz>"
+    "Modified: / 04-08-2014 / 14:48:28 / Jan Vrany <jan.vrany@fit.cvut.cz>"
 ! !
 
 !CharacterArray class methodsFor:'queries'!