#TUNING by stefan
class: UninterpretedBytes
added: #utf8DecodedSize
comment/format in: #utf8Decoded
changed:
#bcdByteAt:put:
#byteAt:
#byteAt:put:
#signedByteAt:
#signedByteAt:put:
declare primitives NOCONTEXT
--- a/UninterpretedBytes.st Tue Feb 07 22:15:28 2017 +0100
+++ b/UninterpretedBytes.st Tue Feb 07 22:18:51 2017 +0100
@@ -1,3 +1,5 @@
+"{ Encoding: utf8 }"
+
"
COPYRIGHT (c) 1993 by Claus Gittinger
All Rights Reserved
@@ -930,7 +932,7 @@
longLongAt:index
"return the 8-bytes starting at index as a signed Integer.
The index is a smalltalk index (i.e. 1-based).
- The value is retrieved in the machineÄs natural byte order.
+ The value is retrieved in the machine´s natural byte order.
This may be worth a primitive."
^ self signedInt64At:index MSB:IsBigEndian
@@ -1925,7 +1927,7 @@
(i.e. the value n is encoded as: ((n // 10) * 16) + (n \\ 10)"
(aNumber between:0 and:99) ifFalse:[
- self error:'invalid value for BCD encoding'
+ self elementBoundsError:aNumber.
].
^ self byteAt:index put:aNumber encodeAsBCD
@@ -1937,13 +1939,14 @@
"
"Modified (comment): / 26-09-2011 / 11:57:36 / cg"
+ "Modified: / 07-02-2017 / 20:12:04 / stefan"
!
byteAt:byteIndex
"return the byte at byteIndex as an unsigned 8 bit value in the range 0..255.
The index is a smalltalk index (i.e. 1-based)."
-%{
+%{ /* NOCONTEXT */
if (__isSmallInteger(byteIndex)) {
unsigned char *cp;
INT sz;
@@ -1964,10 +1967,19 @@
"
|b|
- b := ByteArray new:3.
- b at:1 put:16rFF.
- b at:2 put:16r7F.
- b at:3 put:16r80.
+ b := String new:3.
+ b byteAt:1 put:16rFF.
+ b byteAt:2 put:16r7F.
+ b byteAt:3 put:16r80.
+ b byteAt:1.
+ b byteAt:2.
+ b byteAt:3.
+
+ |b|
+ b := ExternalBytes new:3.
+ b byteAt:1 put:16rFF.
+ b byteAt:2 put:16r7F.
+ b byteAt:3 put:16r80.
b byteAt:1.
b byteAt:2.
b byteAt:3.
@@ -1975,13 +1987,14 @@
"Modified: / 01-07-1996 / 21:13:53 / cg"
"Modified (comment): / 26-09-2011 / 11:57:14 / cg"
+ "Modified (comment): / 07-02-2017 / 19:49:13 / stefan"
!
byteAt:byteIndex put:anInteger
"set the byte at byteIndex as an unsigned 8 bit value in the range 0..255.
The index is a smalltalk index (i.e. 1-based)."
-%{
+%{ /* NOCONTEXT */
if (__isSmallInteger(byteIndex) && __isSmallInteger(anInteger)) {
unsigned char *cp;
INT sz;
@@ -2005,14 +2018,16 @@
"
|b|
- b := ByteArray new:3.
+ b := String new:3.
b byteAt:1 put:16rFF.
b byteAt:2 put:16r7F.
b byteAt:3 put:16r80.
- b signedByteAt:1.
- b signedByteAt:2.
- b signedByteAt:3.
- "
+ b byteAt:1.
+ b byteAt:2.
+ b byteAt:3.
+ "
+
+ "Modified (comment): / 07-02-2017 / 19:32:26 / stefan"
!
signedByteAt:byteIndex
@@ -2020,7 +2035,7 @@
The index is a smalltalk index (i.e. 1-based).
This may be worth a primitive."
-%{
+%{ /* NOCONTEXT */
/*
* handle the most common cases fast ...
*/
@@ -2055,40 +2070,45 @@
b at:1 put:16rFF.
b at:2 put:16r7F.
b at:3 put:16r80.
- b byteAt:1.
- b byteAt:2.
- b byteAt:3.
+ b signedByteAt:1.
+ b signedByteAt:2.
+ b signedByteAt:3.
"
"Modified: / 01-07-1996 / 21:13:53 / cg"
"Modified (comment): / 26-09-2011 / 11:57:14 / cg"
+ "Modified: / 07-02-2017 / 19:25:03 / stefan"
!
-signedByteAt:index put:aSignedByteValue
- "return the byte at index as a signed 8 bit value in the range -128..+127.
+signedByteAt:byteIndex put:aSignedByteValue
+ "set the byte at byteIndex to aSignedByteValue in the range -128 .. 255
The index is a smalltalk index (i.e. 1-based).
- Return the signedByteValue argument.
- This may be worth a primitive."
-
- |b "{ Class: SmallInteger }"|
-
- aSignedByteValue >= 0 ifTrue:[
- b := aSignedByteValue
- ] ifFalse:[
- b := 16r100 + aSignedByteValue
+ Return the signedByteValue argument."
+
+ |b|
+
+ b := aSignedByteValue.
+ b < 0 ifTrue:[
+ b := 16r100 + b
].
- self byteAt:index put:b.
+ self byteAt:byteIndex put:b.
^ aSignedByteValue
"
|b|
b := ByteArray new:2.
b signedByteAt:1 put:-1.
- b at:1
+ b at:1.
+ b signedByteAt:1.
+
+ |b|
+ b := ByteArray new:2.
+ b signedByteAt:1 put:-1.0.
"
"Modified: / 01-07-1996 / 21:12:37 / cg"
"Modified (comment): / 26-09-2011 / 11:57:18 / cg"
+ "Modified (comment): / 07-02-2017 / 20:03:46 / stefan"
! !
!UninterpretedBytes methodsFor:'accessing-floats & doubles'!
@@ -4266,17 +4286,19 @@
"test:
- |utf8Encoding original readBack|
-
- 1 to:16rFFFF do:[:ascii |
- original := (Character value:ascii) asString.
+ 1 to:16r10FFFF do:[:codepoint |
+ |utf8Encoding original readBack|
+
+ original := (Character value:codepoint) asString.
utf8Encoding := original utf8Encoded.
readBack := utf8Encoding utf8Decoded.
- readBack = original ifFalse:[
+ readBack ~= original ifTrue:[
self halt
]
]
"
+
+ "Modified (comment): / 07-02-2017 / 17:36:08 / stefan"
!
utf8DecodedWithTwoByteCharactersReplacedBy:replacementCharacter
@@ -4962,8 +4984,8 @@
"
'abc' isValidUTF8
- 'abcöäü' isValidUTF8
- 'abcöäü' utf8Encoded isValidUTF8
+ 'abcöäü' isValidUTF8
+ 'abcöäü' utf8Encoded isValidUTF8
(Character value:16r800) utf8Encoded isValidUTF8
(Character value:16r1000) utf8Encoded isValidUTF8
@@ -5008,6 +5030,34 @@
^ super size
"Created: / 5.3.1998 / 10:41:13 / stefan"
+!
+
+utf8DecodedSize
+ "return the number of characters needed when the receiver's bytes are
+ decoded from UTF-8 (i.e. the count of bytes which are not continuation bytes)."
+
+ |sz "{ Class:SmallInteger }"
+ cnt "{ Class:SmallInteger }"|
+
+ sz := self size.
+ cnt := 0.
+
+ 1 to:sz do:[:idx|
+ "/ count the UTF-8 start bytes: every byte whose top two bits are not 2r10 (a continuation byte)
+ ((self byteAt:idx) bitAnd:16rC0) ~~ 16r80 ifTrue:[
+ cnt := cnt+1.
+ ].
+ ].
+ ^ cnt.
+
+ "
+ 'hello world' asByteArray utf8DecodedSize
+ 'ä' utf8Encoded asByteArray utf8DecodedSize
+ 'äΣΔΨӕἤῴ' utf8Encoded asByteArray utf8DecodedSize
+ "
+
+ "Created: / 07-02-2017 / 15:03:07 / stefan"
+ "Modified: / 07-02-2017 / 19:14:06 / stefan"
! !
!UninterpretedBytes methodsFor:'testing'!