CharacterEncoderImplementations__ISO10646_to_UTF8.st
author Claus Gittinger <cg@exept.de>
Tue, 27 Sep 2016 15:04:34 +0200
changeset 20458 55727803f73d
parent 19838 a6ca726d596c
child 19863 513bd7237fe7
child 21298 cb1ce1924d13
permissions -rw-r--r--
#DOCUMENTATION by cg class: ZeroDivide comment/format in: #defaultResumeValue

"{ Encoding: utf8 }"

"
 COPYRIGHT (c) 2004 by eXept Software AG
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic' }"

"{ NameSpace: CharacterEncoderImplementations }"

"Encoder which converts between Unicode strings and their UTF-8 encoded form; used as a singleton (see the class-side #new)"
TwoByteEncoder subclass:#ISO10646_to_UTF8
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Encodings'
!

ISO10646_to_UTF8 class instanceVariableNames:'theOneAndOnlyInstance'

"
 theOneAndOnlyInstance holds the cached singleton which is answered by #new and
 #theOneAndOnlyInstance; it is nil until the first request and again after #flushSingleton.

 No other class instance variables are inherited by this class.
"
!

!ISO10646_to_UTF8 class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 2004 by eXept Software AG
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

examples
"
  Encoding (unicode to utf8):
     ISO10646_to_UTF8 encodeString:'hello'.


  Decoding (utf8 to unicode), shown as a round trip through the encoded form:
     |t|

     t := ISO10646_to_UTF8 encodeString:'Helloœ'.
     ISO10646_to_UTF8 decodeString:t.
"
! !

!ISO10646_to_UTF8 class methodsFor:'instance creation'!

flushSingleton
    "forget the cached singleton.
     The next send of #new or #theOneAndOnlyInstance finds the cache empty
     and therefore creates (and caches) a fresh instance."

    theOneAndOnlyInstance := nil

    "
     self flushSingleton
    "
!

new
    "answer the shared encoder instance.
     It is created and initialized on the first request and then kept in
     the class-instance variable, so later requests answer the very same object
     (until the cache is cleared with #flushSingleton)."

    ^ theOneAndOnlyInstance ifNil:[
        theOneAndOnlyInstance := self basicNew initialize
    ]
!

theOneAndOnlyInstance
    "answer the cached singleton; if there is none yet, create, initialize
     and remember one first"

    theOneAndOnlyInstance notNil ifTrue:[
        ^ theOneAndOnlyInstance
    ].
    theOneAndOnlyInstance := self basicNew initialize.
    ^ theOneAndOnlyInstance
! !

!ISO10646_to_UTF8 methodsFor:'encoding & decoding'!

decode:aCode
    "not supported by this encoder: a single code cannot be decoded on its own.
     Whole strings are decoded with #decodeString: instead."

    self shouldNotImplement "/ no single byte conversion possible
!

decodeString:aStringOrByteCollection
    "given a string (or byte collection) in UTF8 encoding,
     return a new string containing the same characters, in Unicode encoding.
     The actual work is delegated to CharacterArray class >> decodeFromUTF8:.
     Returns either a normal String, a Unicode16String or a Unicode32String instance.
     This is only useful when reading from external sources or communicating with
     other systems
     (ST/X never uses utf8 internally, but always uses strings of fully decoded unicode characters).
     This only handles up to 30bit characters."

    ^ CharacterArray decodeFromUTF8:aStringOrByteCollection.
!

encode:aCode
    "not supported by this encoder: a single code cannot be encoded on its own.
     Whole strings are encoded with #encodeString: instead."

    self shouldNotImplement "/ no single byte conversion possible
!

encodeString:aUnicodeString
    "return the UTF-8 representation of a Unicode string
     (simply asks the argument for its #utf8Encoded form).
     The resulting string is only useful to be stored on some external file,
     not for being used inside ST/X."

    ^ aUnicodeString utf8Encoded.
! !

!ISO10646_to_UTF8 privateMethodsFor:'queries'!

bytesToReadFor:firstByte 
    "answer the number of bytes (1..6) which make up the UTF-8 sequence
     introduced by firstByte.
     firstByte must understand #isBitSet: (bit numbers are counted from 1,
     so bit 8 is the top bit of a byte).
     If bit 8 is clear, the answer is 1. Otherwise the answer is 1 plus the
     number of consecutive set bits, counted downward from bit 7 to bit 3;
     thus a byte of the form 10xxxxxx also yields 1."

    |count|

    (firstByte isBitSet:8) ifFalse:[^ 1].

    count := 1.
    #(7 6 5 4 3) do:[:bitNr |
        (firstByte isBitSet:bitNr) ifFalse:[^ count].
        count := count + 1
    ].
    ^ count

    "Created: / 14-06-2005 / 17:17:24 / janfrog"
! !

!ISO10646_to_UTF8 methodsFor:'queries'!

characterSize:charOrcodePoint
    "answer the number of bytes (1..4) needed to encode the given character
     or integer code point.
     The ranges are taken from RFC 3629; a value outside of 0 .. 16r10FFFF
     is reported via #error:."

    |codePoint|

    codePoint := charOrcodePoint asInteger.

    codePoint < 0 ifTrue:[^ self error:'Invalid codePoint'].
    codePoint <= 16r7F ifTrue:[^ 1].
    codePoint <= 16r7FF ifTrue:[^ 2].
    codePoint <= 16rFFFF ifTrue:[^ 3].
    codePoint <= 16r10FFFF ifTrue:[^ 4].

    ^ self error:'Invalid codePoint'

    "Created: / 15-06-2005 / 15:16:22 / janfrog"
!

nameOfEncoding
    "answer the symbolic name of the encoding handled by this encoder"

    ^ #utf8
! !

!ISO10646_to_UTF8 methodsFor:'stream support'!

readNext:charactersToRead charactersFrom:stream
    "read up to charactersToRead UTF-8 encoded characters from stream
     and answer them as one decoded string (see #decodeString:).
     For every character, the first element is peeked to find out how many
     elements belong to its sequence; the raw sequences are collected and
     decoded in one go at the end.
     If peeking yields nil (no more input) before charactersToRead characters
     were seen, the characters collected so far are answered.
     As in #readNextCharacterFrom:, the peeked element is converted with
     #asInteger before its bits are inspected, so it may be either a byte value
     or a Character."

    | encoded |

    encoded := (String new:charactersToRead) writeStream.
    charactersToRead timesRepeat:[
        | lead |

        lead := stream peek.
        lead isNil ifTrue:[
            "/ no more input: decode and answer what was collected so far
            ^ self decodeString:encoded contents
        ].
        "/ bytesToReadFor: sends isBitSet:, which needs an integer receiver
        encoded nextPutAll:(stream next:(self bytesToReadFor:lead asInteger))
    ].
    ^ self decodeString:encoded contents

    "Created: / 16-06-2005 / 11:45:14 / masca"
!

readNextCharacterFrom:aStream 
    "read one UTF-8 encoded character from aStream and answer it decoded.
     Answers nil if peeking at the stream yields nil.
     The first element is only peeked to determine the length of the sequence;
     the complete sequence is then fetched with a single #next: and decoded.
     An error is raised if the decoded result holds more than one character."

    |lead sequenceLength decoded|

    lead := aStream peek.
    lead isNil ifTrue:[^ nil].

    sequenceLength := self bytesToReadFor:lead asInteger.
    decoded := self decodeString:(aStream next:sequenceLength).
    decoded size > 1 ifTrue:[
        self error:'Badly coded method'
    ].
    ^ decoded first

    "Created: / 14-06-2005 / 17:03:59 / janfrog"
! !

!ISO10646_to_UTF8 class methodsFor:'documentation'!

version
    "answer the version string of this source file
     (NOTE(review): the literal looks like a keyword placeholder which is
      filled in by the source code management system - confirm)"

    ^ '$Header$'
! !