CharacterEncoderImplementations__ISO10646_to_UTF16BE.st
changeset 8903 4e15c297fadc
child 9325 a4c635a6f8eb
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CharacterEncoderImplementations__ISO10646_to_UTF16BE.st	Thu Jul 07 19:36:41 2005 +0200
@@ -0,0 +1,244 @@
+"
+ COPYRIGHT (c) 2005 by eXept Software AG
+              All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice.   This software may not
+ be provided or otherwise made available to, or used by, any
+ other person.  No title to or ownership of the software is
+ hereby transferred.
+"
+
+"{ Package: 'stx:libbasic' }"
+
+"{ NameSpace: CharacterEncoderImplementations }"
+
+TwoByteEncoder subclass:#ISO10646_to_UTF16BE
+	instanceVariableNames:''
+	classVariableNames:''
+	poolDictionaries:''
+	category:'Collections-Text-Encodings'
+!
+
+!ISO10646_to_UTF16BE class methodsFor:'documentation'!
+
+copyright
+"
+ COPYRIGHT (c) 2005 by eXept Software AG
+              All Rights Reserved
+
+ This software is furnished under a license and may be used
+ only in accordance with the terms of that license and with the
+ inclusion of the above copyright notice.   This software may not
+ be provided or otherwise made available to, or used by, any
+ other person.  No title to or ownership of the software is
+ hereby transferred.
+"
+!
+
+documentation
+"
+    encodes/decodes UTF16 BigEndian (big-end-first)
+"
+!
+
+examples
+"
+  Encoding (unicode to utf16BE)
+     ISO10646_to_UTF16BE encodeString:'hello'.
+
+
+  Decoding (utf16BE to unicode):
+     |t|
+
+     t := ISO10646_to_UTF16BE encodeString:''.
+     ISO10646_to_UTF16BE decodeString:t.
+"
+! !
+
+!ISO10646_to_UTF16BE methodsFor:'encoding & decoding'!
+
+decode:aCode
+    self shouldNotImplement "/ no single byte conversion possible
+!
+
+decodeString:aStringOrByteCollection
+    "given a byteArray (2-bytes per character) or unsignedShortArra in UTF16 encoding,
+     return a new string containing the same characters, in 8, 16bit (or more) encoding.
+     Returns either a normal String, a TwoByte- or a FourByte-String instance.
+     Only useful, when reading from external sources.
+     This only handles up-to 30bit characters."
+
+    |sz nBitsRequired s newString idx bitsPerElementIn nextIn
+     codeIn codeIn1 codeIn2|
+
+    aStringOrByteCollection isByteArray ifTrue:[
+        bitsPerElementIn := 8.
+    ] ifFalse:[
+        aStringOrByteCollection isString ifTrue:[
+            bitsPerElementIn := aStringOrByteCollection bitsPerCharacter.
+        ] ifFalse:[
+            bitsPerElementIn := 16.
+        ].
+    ].
+
+    bitsPerElementIn == 8 ifTrue:[
+        nextIn := [self nextTwoByteValueFrom:s].
+    ] ifFalse:[
+        nextIn := [s next].
+    ].
+
+    nBitsRequired := 8.
+    sz := 0.
+    s := aStringOrByteCollection readStream.
+    [s atEnd] whileFalse:[
+        codeIn := nextIn value.
+        sz := sz + 1.
+
+        codeIn <= 16rFF ifTrue:[
+        ] ifFalse:[
+            nBitsRequired := nBitsRequired max:16.
+            (codeIn between:16rD800 and:16rDBFF) ifTrue:[
+                nBitsRequired := 32.
+                codeIn2 := nextIn value.
+            ].
+        ]
+    ].
+
+    nBitsRequired == 8 ifTrue:[
+        newString := String uninitializedNew:sz
+    ] ifFalse:[
+        nBitsRequired <= 16 ifTrue:[
+            newString := Unicode16String new:sz
+        ] ifFalse:[
+            newString := Unicode32String new:sz
+        ]
+    ].
+
+    s := aStringOrByteCollection readStream.
+    idx := 1.
+    [s atEnd] whileFalse:[
+        codeIn := nextIn value.
+        codeIn <= 16rFF ifTrue:[
+        ] ifFalse:[
+            nBitsRequired := nBitsRequired max:16.
+            (codeIn between:16rD800 and:16rDBFF) ifTrue:[
+                nBitsRequired := 32.
+                codeIn1 := codeIn.
+                codeIn2 := nextIn value.
+                codeIn := ((codeIn1 - 16rD800) bitShift:10)
+                          +
+                          (codeIn2 - 16rDC00)
+                          + 16r00010000.
+            ].
+        ].
+        newString at:idx put:(Character value:codeIn).
+        idx := idx + 1.
+    ].
+    ^ newString
+
+    "
+     self new decodeString:#[ 16r00 16r42 ]            
+     self new decodeString:#[ 16r01 16r42 ]            
+     self new decodeString:#[ 16r00 16r48
+                              16r00 16r69  
+                              16rD8 16r00  
+                              16rDC 16r00  
+                              16r00 16r21  
+                              16r00 16r21  
+                            ]            
+
+     self new decodeString:#( 16r0048
+                              16r0069  
+                              16rD800  
+                              16rDC00  
+                              16r0021  
+                              16r0021  
+                            )
+    "
+!
+
+encode:aCode
+    self shouldNotImplement "/ no single byte conversion possible
+!
+
+encodeString:aUnicodeString
+    "return the UTF-16 representation of a aUnicodeString.
+     The resulting string is only useful to be stored on some external file,
+     not for being used inside ST/X."
+
+    |s|
+
+    s := WriteStream on:(ByteArray uninitializedNew:aUnicodeString size).
+    aUnicodeString do:[:eachCharacter |
+        |codePoint t hi low|
+
+        codePoint := eachCharacter codePoint.
+        (codePoint <= 16rFFFF) ifTrue:[
+            ((codePoint <= 16rD7FF) or:[ codePoint between:16rE000 and:16rFFFF]) ifTrue:[
+                self nextPutTwoByteValue:codePoint to:s.
+            ] ifFalse:[
+                "/ unrepresentable: D800..DFFFF
+                self error:'unrepresentable value (D800..DFFFF) in utf16Encode'.
+            ].
+        ] ifFalse:[
+            t := codePoint - 16r00010000.
+            hi := t bitShift:-10.
+            low := t bitAnd:16r3FF.
+            hi > 16r3FF ifTrue:[
+                "/ unrepresentable: above 110000
+                self error:'unrepresentable value (> 10FFFF) in utf16Encode'.
+            ].
+            self nextPutTwoByteValue:(hi + 16rD800) to:s.
+            self nextPutTwoByteValue:(low + 16rDC00) to:s.
+        ].
+    ].
+
+    ^ s contents
+
+    "
+     (self encodeString:'hello')                                         #[0 104 0 101 0 108 0 108 0 111]
+     (self encodeString:(Character value:16r40) asString)                #[0 64]
+     (self encodeString:(Character value:16rFF) asString)                #[0 255]
+     (self encodeString:(Character value:16r100) asString)               #[1 0]
+     (self encodeString:(Character value:16r1000) asString)              #[16 0]
+     (self encodeString:(Character value:16r2000) asString)              #[32 0]
+     (self encodeString:(Character value:16r4000) asString)              #[64 0]
+     (self encodeString:(Character value:16r8000) asString)              #[128 0]
+     (self encodeString:(Character value:16rD7FF) asString)              #[215 255]
+     (self encodeString:(Character value:16rE000) asString)              #[224 0]
+     (self encodeString:(Character value:16rFFFF) asString)              #[255 255]
+     (self encodeString:(Character value:16r10000) asString)             #[216 64 220 0]
+     (self encodeString:(Character value:16r10FFF) asString)             #[216 67 223 255]
+     (self encodeString:(Character value:16r1FFFF) asString)             #[216 127 223 255]
+     (self encodeString:(Character value:16r10FFFF) asString)            #[219 255 223 255]             
+    error cases:
+     (self encodeString:(Character value:16rD800) asString) 
+     (self encodeString:(Character value:16rD801) asString) 
+     (self encodeString:(Character value:16rDFFF) asString) 
+     (self encodeString:(Character value:16r110000) asString)   
+    "
+! !
+
+!ISO10646_to_UTF16BE methodsFor:'private'!
+
+nextPutTwoByteValue:anInteger to:aStream
+    aStream nextPutShort:anInteger MSB:true
+!
+
+nextTwoByteValueFrom:aStream
+    ^ aStream nextUnsignedShortMSB:true
+! !
+
+!ISO10646_to_UTF16BE methodsFor:'queries'!
+
+nameOfEncoding
+    ^ #'utf8be'
+! !
+
+!ISO10646_to_UTF16BE class methodsFor:'documentation'!
+
+version
+    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF16BE.st,v 1.1 2005-07-07 17:36:41 cg Exp $'
+! !