CharacterEncoderImplementations__ISO10646_to_SGML.st
author Claus Gittinger <cg@exept.de>
Fri, 12 Mar 2004 13:50:27 +0100
changeset 8170 ffa1ed9338ad
child 8171 ac837a7ca3a3
permissions -rw-r--r--
initial checkin

"{ Package: 'stx:libbasic' }"

"{ NameSpace: CharacterEncoderImplementations }"

"Encoder which converts between Unicode strings and SGML text, in which
 characters are escaped as decimal numeric character references (&#NNN;).
 Only whole strings can be converted (see decodeString: / encodeString:);
 the class declares no instance or class variables of its own."
TwoByteEncoder subclass:#ISO10646_to_SGML
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Encodings'
!


!ISO10646_to_SGML methodsFor:'encoding & decoding'!

decode:aCode
    "decoding of a single code is deliberately not supported by this encoder:
     an SGML character reference is made up of several characters,
     so only whole strings can be decoded (see decodeString:)."

    self shouldNotImplement "/ no single byte conversion possible
!

decodeString:aStringOrByteCollection
    "given a string in SGML encoding (i.e. with characters escaped as decimal
     numeric character references of the form &#NNN;),
     return a new string containing the same characters, in 16bit (or more) encoding.
     Returns either a normal String, a Unicode16String or a Unicode32String instance,
     depending on the largest codePoint which was decoded.
     The terminating semicolon of a reference is optional.
     An ampersand-hash sequence which is not followed by at least one decimal digit
     (for example a hexadecimal reference such as &#x41;) is not understood here
     and is copied to the output unchanged, instead of being replaced by a
     character with codePoint 0.
     All other characters (including a lone ampersand) are copied unchanged.
     Only useful, when reading from external sources.
     This only handles up-to 30bit characters."

    |nBits ch 
     in out codePoint t sawDigit neededBits wideClass|

    nBits := 8.
    in := aStringOrByteCollection readStream.
    out := WriteStream on:(String new:10).
    [in atEnd] whileFalse:[
        ch := in next.
        (ch == $& and:[in peekOrNil == $#]) ifTrue:[
            in next.
            "collect the decimal digits of the reference"
            codePoint := 0.
            sawDigit := false.
            [ch := in peekOrNil.
             ch notNil and:[ch isDigit]
            ] whileTrue:[
                codePoint := (codePoint * 10) + ch digitValue.
                sawDigit := true.
                in next.
            ].
            sawDigit ifTrue:[
                codePoint > 16rFF ifTrue:[
                    codePoint > 16rFFFF ifTrue:[
                        neededBits := 32.
                        wideClass := Unicode32String.
                    ] ifFalse:[
                        neededBits := 16.
                        wideClass := Unicode16String.
                    ].
                    nBits < neededBits ifTrue:[
                        "the output collected so far is too narrow for this character;
                         continue on a wider copy, repositioned by the size of
                         what has been written so far"
                        t := out contents.
                        out := WriteStream on:(wideClass fromString:t).
                        out position:t size.
                        nBits := neededBits.
                    ]
                ].
                out nextPut:(Character value:codePoint).
                "the terminating semicolon is optional"
                in peekOrNil == $; ifTrue:[
                    in next.
                ]
            ] ifFalse:[
                "not a decimal reference - keep the two consumed characters as they are"
                out nextPutAll:'&#'
            ]
        ] ifFalse:[
            out nextPut:ch
        ].
    ].
    ^ out contents

    "
     CharacterEncoderImplementations::ISO10646_to_SGML
        decodeString:'&#1060;&#1072;&#1081;&#1083;' 

     CharacterEncoderImplementations::ISO10646_to_SGML
        decodeString:'#197;&bn...'

     CharacterEncoderImplementations::ISO10646_to_SGML
        decodeString:'a&#b &#x41; &#65'
    "
!

encode:aCode
    "encoding of a single code is deliberately not supported by this encoder:
     one character may be written as a character reference consisting of
     several characters, so only whole strings can be encoded (see encodeString:)."

    self shouldNotImplement "/ no single byte conversion possible
!

encodeString:aUnicodeString
    "return the SGML representation of aUnicodeString, as a plain String.
     Characters with a codePoint in 16r20 .. 16r7F are copied unchanged,
     with the exception of the ampersand; every other character - and the
     ampersand - is written as a decimal numeric character reference of
     the form &#NNN;
     The ampersand has to be escaped, because decodeString: treats an
     ampersand-hash sequence as the start of a reference; if it were copied
     literally, a text such as '&#65;' would not decode back to itself.
     The resulting string is only useful to be stored on some external file,
     not for being used inside ST/X."

    |in out codePoint|

    in := aUnicodeString readStream.
    out := WriteStream on:(String new:10).
    [in atEnd] whileFalse:[
        codePoint := in next codePoint.
        ((codePoint between:16r20 and:16r7F) and:[codePoint ~= 16r26]) ifTrue:[
            out nextPut:(Character value:codePoint).
        ] ifFalse:[
            out nextPutAll:'&#'.
            out nextPutAll:(codePoint printString).
            out nextPutAll:';'.
        ].
    ].
    ^ out contents

    "
     CharacterEncoderImplementations::ISO10646_to_SGML
        encodeString:'hello äöü' 

     CharacterEncoderImplementations::ISO10646_to_SGML
        encodeString:'a & b &#65;' 
    "
! !

!ISO10646_to_SGML class methodsFor:'documentation'!

version
    "return the CVS header string (file path, revision, date and author) of this source file"

    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_SGML.st,v 1.1 2004-03-12 12:50:27 cg Exp $'
! !