--- a/extensions.st Sun Aug 03 23:43:40 2014 +0100
+++ b/extensions.st Mon Aug 04 15:43:51 2014 +0100
@@ -417,96 +417,64 @@
!CharacterArray class methodsFor:'encoding & decoding'!
-decodeFromJavaUTF8: bytes
+decodeFromJavaUTF8: string8
"Decodes a string from modified UTF8 encoding
as used in Java .class files. see
'The class file format specification', section 4.5.7"
- | string i s b codePoint realLength |
-
- string := String new: bytes size.
- realLength := bytes size.
- s := bytes readStream.
- i := 1.
- [ s atEnd ] whileFalse:
- [ b := s next.
- (b & 2r10000000) == 0
- ifTrue: [ codePoint := b ]
- ifFalse:
- [ self assert: (b & 2r01000000) = 2r01000000.
- (b & 2r00100000) = 0
- ifTrue:
- [ "two byte utf char"
- realLength := realLength - 1.
- self assert: s size > 0.
- self assert: (b & 2r01000000) = 2r01000000.
- string bitsPerCharacter < 16
- ifTrue: [ string := Unicode16String fromString: string ].
- codePoint := (b & 2r00011111) << 6.
- b := s next.
- self assert: (b & 2r11000000) = 2r10000000.
- codePoint := codePoint + (b & 2r00111111). ]
- ifFalse:
- [ "at lease 3 byte utf char"
- realLength := realLength - 2.
- string bitsPerCharacter < 16"was: 32"
- ifTrue: [ string := Unicode16String"was: Unicode32String" fromString: string ].
- self assert: s size > 1.
- (b & 2r00010000) = 0
- ifTrue:
- [ | utf32Possible utf32Value |
-
- "3 or 6 byte utf char"
- self assert: s size > 1.
- s size < 5
- ifTrue: [ utf32Possible := false ]
- ifFalse: [ utf32Possible := true ].
- b ~= 2r11101101 ifTrue: [ utf32Possible := false ].
- codePoint := (b & 2r00001111) << 12.
- b := s next.
- self assert: (b & 2r11000000) = 2r10000000.
- ((b & 2r11110000) = 2r10100000 and: [ utf32Possible ])
- ifTrue: [ utf32Value := 2r00010000 + ((b & 2r00001111) << 16) ]
- ifFalse: [ utf32Possible := false ].
- codePoint := codePoint + ((b & 2r00111111) << 6).
- b := s next.
- self assert: (b & 2r11000000) = 2r10000000.
- utf32Possible
- ifTrue: [ utf32Value := utf32Value + ((b & 2r00111111) << 10) ].
- codePoint := codePoint + (b & 2r00111111).
- utf32Possible
- ifTrue:
- [ | tmpB |
-
- string bitsPerCharacter < 32
- ifTrue: [ string := Unicode32String fromString: string ].
- tmpB := s copy.
- b := tmpB next.
- b = 2r11101101
- ifTrue:
- [ b := tmpB next.
- (b & 2r11110000) = 2r10110000
- ifTrue:
- [ utf32Value := utf32Value + ((b & 2r00001111) << 6).
- b := tmpB next.
- self assert: (b & 2r11000000) = 2r10000000.
- utf32Value := utf32Value + (b & 2r00111111).
- codePoint := utf32Value.
- realLength := realLength - 3. s position: tmpB position.] ] ] ]
- ifFalse:
- [ "should not happen, ask mh"
- self halt. ] ] ].
- string at: i put: (Character codePoint: codePoint).
- i := i + 1. ].
- ^ string subString: 1 to: realLength.
-
- "
- String decodeFromJavaUTF8: 'Hello world' asByteArray"
+    | string16 |
+%{
+    /* Pure 7-bit input is answered unchanged; anything else is decoded into a
+     * Unicode16String, one 16-bit unit per 1-, 2- or 3-byte sequence (surrogate
+     * halves are not combined). Malformed input falls through to primitiveFailed. */
+    if (__isString(string8)) {
+        unsigned char *ptr = __stringVal(string8);
+        unsigned short *dst;
+        int len = 0;
+        while (*ptr && ((*ptr & 0x80) == 0)) { ptr++; len++; }
+        if (*ptr == 0) {
+            RETURN ( string8 );
+        }
+        /* Pass 1: count characters. Every continuation byte must be 10xxxxxx;
+         * the 0 byte that ends the scan fails that test too, so a truncated
+         * sequence is rejected here and pass 2 cannot run past the end. */
+        while (*ptr) {
+            int n = (*ptr & 0x80) ? ((*ptr & 0x20) ? 3 : 2) : 1;
+            if ((n > 1) && ((ptr[1] & 0xC0) != 0x80)) goto fail;
+            if ((n > 2) && ((ptr[2] & 0xC0) != 0x80)) goto fail;
+            ptr += n;
+            len++;
+        }
+        string16 = __MKEMPTYUSTRING(len);
+        if (string16 == nil) goto fail;     /* allocation failed */
+        ptr = __stringVal(string8);         /* re-fetch after the allocation */
+        dst = __unicode16StringVal(string16);
+        /* Pass 2: decode; sequence lengths were validated by pass 1. */
+        while (*ptr) {
+            int x = *ptr++;
+            if ( x & 0x80 ) {
+                int y = *ptr++;
+                if ( x & 0x20 ) {
+                    int z = *ptr++;
+                    *dst++ = ( (x & 0xf ) << 12 ) + ( ( y & 0x3f ) << 6 ) + ( z & 0x3f);
+                } else {
+                    *dst++ = ( ( x & 0x1f ) << 6 ) + ( y & 0x3f );
+                }
+            } else {
+                *dst++ = x;
+            }
+        }
+        RETURN ( string16 );
+    }
+fail: ;
+%}.
+    self primitiveFailed.
"Created: / 22-12-2010 / 23:45:04 / Jan Vrany <jan.vrany@fit.cvut.cz>"
"Modified: / 09-02-2011 / 01:12:25 / Marcel Hlopko <hlopik@gmail.com>"
"Modified: / 13-03-2011 / 15:52:36 / Marcel Hlopko <hlopkmar@fel.cvut.cz>"
- "Modified: / 09-12-2011 / 19:49:04 / Jan Vrany <jan.vrany@fit.cvut.cz>"
+ "Modified: / 04-08-2014 / 14:45:26 / Jan Vrany <jan.vrany@fit.cvut.cz>"
+
! !
!CharacterArray class methodsFor:'instance creation'!
@@ -516,7 +484,7 @@
from the modified utf8 encoded bytes as specified in
The class file format specification, section 4.5.7"
- ^ self decodeFromJavaUTF8:aByteCollection.
+ ^ self decodeFromJavaUTF8:aByteCollection asString.
"
CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]
@@ -535,6 +503,7 @@
"
"Created: / 23-12-2010 / 09:01:32 / Jan Vrany <jan.vrany@fit.cvut.cz>"
+ "Modified: / 04-08-2014 / 14:48:28 / Jan Vrany <jan.vrany@fit.cvut.cz>"
! !
!CharacterArray class methodsFor:'queries'!