CharacterEncoderImplementations__ISO10646_to_UTF8.st
author Claus Gittinger <cg@exept.de>
Tue, 09 Jul 2019 20:55:17 +0200
changeset 24417 03b083548da2
parent 22474 f42c97c037ed
child 25271 3b763ce09c7e
permissions -rw-r--r--
#REFACTORING by exept class: Smalltalk class changed: #recursiveInstallAutoloadedClassesFrom:rememberIn:maxLevels:noAutoload:packageTop:showSplashInLevels: Transcript showCR:(... bindWith:...) -> Transcript showCR:... with:...

"{ Encoding: utf8 }"

"
 COPYRIGHT (c) 2004 by eXept Software AG
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic' }"

"{ NameSpace: CharacterEncoderImplementations }"

VariableBytesEncoder subclass:#ISO10646_to_UTF8
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Encodings'
!

ISO10646_to_UTF8 class instanceVariableNames:'theOneAndOnlyInstance'

"
 No other class instance variables are inherited by this class.
"
!

!ISO10646_to_UTF8 class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 2004 by eXept Software AG
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    I can encode unicode characters into utf-8 and
    decode utf-8 characters into unicode.
    
    Notice the naming (many are confused):
        Unicode is the set of number-to-glyph assignments
    whereas:
        UTF8 is a concrete way of xmitting Unicode codePoints (numbers).
    UTF16 is another concrete encoding, for example.    
        
    ST/X NEVER uses UTF8 internally - all characters are full 24bit characters.
    Only when exchanging data, are these converted into UTF8 (or other) byte sequences.
"
!

examples
"
  Encoding (unicode to utf8)
     ISO10646_to_UTF8 encodeString:'hello'.


  Decoding (utf8 to unicode):
     |t|

     t := ISO10646_to_UTF8 encodeString:'Helloœ'.
     ISO10646_to_UTF8 decodeString:t.
"
! !

!ISO10646_to_UTF8 class methodsFor:'instance creation'!

flushSingleton
    "flushes the cached singleton"

    theOneAndOnlyInstance := nil

    "
     self flushSingleton
    "
!

new
    "returns a singleton"

    theOneAndOnlyInstance isNil ifTrue:[
        theOneAndOnlyInstance := self basicNew initialize.
    ].
    ^ theOneAndOnlyInstance.
!

theOneAndOnlyInstance
    "returns a singleton"

    theOneAndOnlyInstance isNil ifTrue:[
        theOneAndOnlyInstance := self basicNew initialize.
    ].
    ^ theOneAndOnlyInstance.
! !

!ISO10646_to_UTF8 class methodsFor:'queries'!

bytesToReadFor:firstByte 
    (firstByte bitAnd:2r10000000) == 0 ifTrue:[^ 1].
    (firstByte bitAnd:2r11000000) == 2r10000000 ifTrue:[^ 2].
    (firstByte bitAnd:2r11100000) == 2r11000000 ifTrue:[^ 3].
    (firstByte bitAnd:2r11110000) == 2r11100000 ifTrue:[^ 4].
    (firstByte bitAnd:2r11111000) == 2r11110000 ifTrue:[^ 5].
    (firstByte bitAnd:2r11111100) == 2r11111000 ifTrue:[^ 6].

    InvalidEncodingError raiseWith:firstByte errorString:' - unsupported utf8 encoding (too large, only 31bit supported)'

    "Created: / 14-06-2005 / 17:17:24 / janfrog"
    "Modified: / 10-01-2018 / 22:59:20 / stefan"
! !

!ISO10646_to_UTF8 methodsFor:'encoding & decoding'!

decodeString:aStringOrByteCollection
    "given a string in UTF8 encoding,
     return a new string containing the same characters, in Unicode encoding.
     Returns either a normal String, a Unicode16String or a Unicode32String instance.
     This is only useful, when reading from external sources or communicating with
     other systems 
     (ST/X never uses utf8 internally, but always uses strings of fully decoded unicode characters).
     This only handles up-to 30bit characters."

    ^ CharacterArray decodeFromUTF8:aStringOrByteCollection.
!

encodeString:aUnicodeString
    "return the UTF-8 representation of a Unicode string.
     The resulting string is only useful to be stored on some external file,
     not for being used inside ST/X."

    ^ aUnicodeString utf8Encoded.
! !

!ISO10646_to_UTF8 methodsFor:'queries'!

characterSize:charOrCodePoint
    "return the number of bytes required to encode codePoint"

    ^ charOrCodePoint asCharacter utf8BytesPerCharacter.

    "Created: / 15-06-2005 / 15:16:22 / janfrog"
    "Modified: / 03-01-2018 / 23:05:59 / stefan"
!

nameOfEncoding
    ^ #utf8
! !

!ISO10646_to_UTF8 methodsFor:'stream support'!

encodeCharacter:aUnicodeCharacter on:aStream
    "given a character in unicode, encode it onto aStream."

    aStream nextPutUtf8:aUnicodeCharacter.

    "Created: / 16-02-2017 / 16:20:57 / stefan"
!

encodeString:aUnicodeString on:aStream
    "given a string in unicode, encode it onto aStream."

     aStream nextPutAllUtf8:aUnicodeString.

    "Created: / 16-02-2017 / 16:27:31 / stefan"
!

readNext:charactersToReadArg charactersFrom:aStream
    "decode the next charactersToRead on aStream from utf-8 to unicode"

    |s c cp hasUtf8 charactersToRead "{ Class:SmallInteger }"|

    charactersToRead := charactersToReadArg.
    hasUtf8 := false.
    "stream may be both text or bytes"
    s := (aStream contentsSpecies new:charactersToRead) writeStream.
    charactersToRead timesRepeat:[
        c := aStream next.
        s nextPut:c.
        cp := c codePoint.
        (cp bitTest:16r80) ifTrue:[
            hasUtf8 := true.
            s nextPutAll:(aStream next:(self class bytesToReadFor:cp)-1).
        ].
    ].
    hasUtf8 ifTrue:[
        ^ self decodeString:s contents.
    ].
    ^ s contents asString

    "Created: / 16-06-2005 / 11:45:14 / masca"
    "Modified (comment): / 17-01-2018 / 13:24:42 / stefan"
!

readNextCharacterFrom:aStream
    "decode the next character or byte on aStream from utf-8 to unicode"

    ^ Character utf8DecodeFrom:aStream.

    "Created: / 14-06-2005 / 17:03:59 / janfrog"
    "Modified: / 10-01-2018 / 17:35:40 / stefan"
    "Modified (comment): / 17-01-2018 / 13:24:08 / stefan"
! !

!ISO10646_to_UTF8 class methodsFor:'documentation'!

version
    ^ '$Header$'
!

version_CVS
    ^ '$Header$'
! !