CharacterEncoderImplementations__ISO10646_to_UTF8.st
author Jan Vrany <jan.vrany@fit.cvut.cz>
Sat, 05 May 2012 22:58:24 +0100
branch jv
changeset 17940 985e22966acb
parent 17911 a99f15c5efa5
child 17951 fa0e1d7467ea
permissions -rw-r--r--
Merged with /trunk

"
 COPYRIGHT (c) 2004 by eXept Software AG
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic' }"

"{ NameSpace: CharacterEncoderImplementations }"

"Encoder/decoder between Unicode (ISO 10646) strings and their UTF-8 representation;
 encodeString: and decodeString: implement the two conversion directions."
TwoByteEncoder subclass:#ISO10646_to_UTF8
	instanceVariableNames:''
	classVariableNames:''
	poolDictionaries:''
	category:'Collections-Text-Encodings'
!

!ISO10646_to_UTF8 class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 2004 by eXept Software AG
	      All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

examples
"
  Usage samples for both conversion directions.

  Encoding (unicode to utf8):
     ISO10646_to_UTF8 encodeString:'hello'.


  Decoding (utf8 to unicode), here as a round trip of an encoded string:
     |t|

     t := ISO10646_to_UTF8 encodeString:'Hello'.
     ISO10646_to_UTF8 decodeString:t.
"
! !

!ISO10646_to_UTF8 methodsFor:'encoding & decoding'!

decode:aCode
    "Decoding a single code is not supported by this encoder;
     this always invokes shouldNotImplement.
     Whole strings are decoded with decodeString: instead."

    self shouldNotImplement "/ no single byte conversion possible
!

decodeString:aStringOrByteCollection
    "given a string in UTF8 encoding,
     return a new string containing the same characters, in 16bit (or more) encoding.
     Returns either a normal String, a TwoByteString or a FourByteString instance.
     Only useful, when reading from external sources.
     This only handles up-to 30bit characters.

     The argument may be a String or a byte collection. A String which
     contains no 8bit characters is returned unchanged (no copy is made).

     The work is done in two passes over the input:
       pass 1 validates the sequences, counts the resulting characters and
              determines the widest character, so that the result can be
              allocated with the smallest sufficient string class;
       pass 2 decodes again and stores the characters into the result.

     Malformed input (truncated sequence, illegal follow byte, overlong
     sequence, invalid lead byte, value above 30 bits) is reported by raising
     a DecodingError request with the original argument as default value.

     If you work a lot with utf8 encoded textFiles,
     this is a first-class candidate for a primitive."

    |sz anyAbove7BitAscii nBitsRequired
     ascii "{ Class: SmallInteger }"
     byte  "{ Class: SmallInteger }"
     s newString idx next6Bits last6Bits
     errorReporter|

    "/ avoid creation of new strings
    aStringOrByteCollection isString ifTrue:[
        aStringOrByteCollection contains8BitCharacters ifFalse:[^ aStringOrByteCollection].
    ].

    "/ common error exit; the undecoded argument is the default value of the request
    errorReporter := [:msg | 
                             DecodingError newException
                                defaultValue:aStringOrByteCollection;
                                raiseRequestWith:aStringOrByteCollection errorString:msg.
                     ].

    "/ validating reader for one follow byte (10xxxxxx): shifts its 6 payload bits into ascii
    next6Bits := [
                    | byte |

                    byte := s nextByte.
                    byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
                    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
                    (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
                        ^ errorReporter value:'illegal followbyte (next)'.].
                 ].

    "/ like next6Bits, but for the final byte of a 6-byte sequence,
    "/ where the accumulated value may exceed the representable 30 bits
    last6Bits := [
                    | a byte |

                    byte := s nextByte.
                    byte isNil ifTrue:[^ errorReporter value:'short utf8 string'].
                    a := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
                    (a > 16r3FFFFFFF) ifTrue:[
                        "/ ST/X can only represent 30 bit unicode characters.
                        errorReporter value:'unicode character out of range'.
                        a := 16r3FFFFFFF.
                    ].
                    ascii := a.
                    (byte bitAnd:2r11000000) ~~ 2r10000000 ifTrue:[
                        ^ errorReporter value:'illegal followbyte (last)'.].
                 ].

    "/ pass 1: validate, count characters (sz) and find the required character width
    nBitsRequired := 8.
    anyAbove7BitAscii := false.
    sz := 0.
    s := aStringOrByteCollection readStream.
    [s atEnd] whileFalse:[
        byte := ascii := s nextByte.
        (byte bitAnd:16r80) ~~ 0 ifTrue:[
            anyAbove7BitAscii := true.
            (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
                "/ 80 .. 7FF
                ascii := (byte bitAnd:2r00011111).
                next6Bits value.
                ascii > 16rFF ifTrue:[
                    nBitsRequired := nBitsRequired max:16
                ].
                "/ a strict utf8 decoder does not allow overlong sequences
                ascii < 16r80 ifTrue:[
                    errorReporter value:'overlong utf8 sequence'
                ].
            ] ifFalse:[
                (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
                    "/ 800 .. FFFF
                    ascii := (byte bitAnd:2r00001111).
                    next6Bits value.
                    next6Bits value.
                    ascii > 16rFF ifTrue:[
                        nBitsRequired := nBitsRequired max:16
                    ].
                    ascii < 16r800 ifTrue:[
                        errorReporter value:'overlong utf8 sequence'
                    ].
                ] ifFalse:[
                    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
                        "/ 10000 .. 1FFFFF
                        ascii := (byte bitAnd:2r00000111).
                        next6Bits value.
                        next6Bits value.
                        next6Bits value.
                        ascii > 16rFF ifTrue:[
                            ascii > 16rFFFF ifTrue:[
                                nBitsRequired := nBitsRequired max:32
                            ] ifFalse:[
                                nBitsRequired := nBitsRequired max:16
                            ]
                        ].
                        ascii < 16r10000 ifTrue:[
                            errorReporter value:'overlong utf8 sequence'
                        ].
                    ] ifFalse:[
                        (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
                            "/ 200000 .. 3FFFFFF
                            ascii := (byte bitAnd:2r00000011).
                            next6Bits value.
                            next6Bits value.
                            next6Bits value.
                            next6Bits value.
                            ascii > 16rFF ifTrue:[
                                ascii > 16rFFFF ifTrue:[
                                    nBitsRequired := nBitsRequired max:32
                                ] ifFalse:[
                                    nBitsRequired := nBitsRequired max:16
                                ]
                            ].
                            "/ the lower bound is hexadecimal 200000 (a decimal 200000
                            "/ would let overlong encodings of 30D40 .. 1FFFFF pass)
                            ascii < 16r200000 ifTrue:[
                                errorReporter value:'overlong utf8 sequence'
                            ].
                        ] ifFalse:[
                            (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
                                "/ 4000000 .. 7FFFFFFF
                                ascii := (byte bitAnd:2r00000001).
                                next6Bits value.
                                next6Bits value.
                                next6Bits value.
                                next6Bits value.
                                last6Bits value.
                                ascii > 16rFF ifTrue:[
                                    ascii > 16rFFFF ifTrue:[
                                        nBitsRequired := nBitsRequired max:32
                                    ] ifFalse:[
                                        nBitsRequired := nBitsRequired max:16
                                    ]
                                ].
                                ascii < 16r4000000 ifTrue:[
                                    errorReporter value:'overlong utf8 sequence'
                                ].
                            ] ifFalse:[
                                errorReporter value:'invalid utf8 encoding'
                            ]
                        ]
                    ]
                ]
            ].
        ].
        sz := sz + 1.
    ].

    "/ allocate the result with the narrowest class that can hold every character
    nBitsRequired == 8 ifTrue:[
        anyAbove7BitAscii ifFalse:[
            "/ can return the original string
            aStringOrByteCollection isString ifTrue:[^ aStringOrByteCollection].
        ].
        newString := String uninitializedNew:sz
    ] ifFalse:[
        nBitsRequired <= 16 ifTrue:[
            newString := Unicode16String new:sz
        ] ifFalse:[
            newString := Unicode32String new:sz
        ]
    ].

    "/ pass 2 uses a non-validating follow byte reader; pass 1 did the checks
    next6Bits := [
                    |byte|

                    byte := s nextByte.
                    ascii := (ascii bitShift:6) bitOr:(byte bitAnd:2r00111111).
                 ].

    "/ pass 2: decode again, this time storing the characters
    s := aStringOrByteCollection readStream.
    idx := 1.
    [s atEnd] whileFalse:[
        byte := ascii := s nextByte.
        (byte bitAnd:2r10000000) ~~ 0 ifTrue:[
            (byte bitAnd:2r11100000) == 2r11000000 ifTrue:[
                ascii := (byte bitAnd:2r00011111).
                next6Bits value.
            ] ifFalse:[
                (byte bitAnd:2r11110000) == 2r11100000 ifTrue:[
                    ascii := (byte bitAnd:2r00001111).
                    next6Bits value.
                    next6Bits value.
                ] ifFalse:[
                    (byte bitAnd:2r11111000) == 2r11110000 ifTrue:[
                        ascii := (byte bitAnd:2r00000111).
                        next6Bits value.
                        next6Bits value.
                        next6Bits value.
                    ] ifFalse:[
                        (byte bitAnd:2r11111100) == 2r11111000 ifTrue:[
                            ascii := (byte bitAnd:2r00000011).
                            next6Bits value.
                            next6Bits value.
                            next6Bits value.
                            next6Bits value.
                        ] ifFalse:[
                            (byte bitAnd:2r11111110) == 2r11111100 ifTrue:[
                                ascii := (byte bitAnd:2r00000001).
                                next6Bits value.
                                next6Bits value.
                                next6Bits value.
                                next6Bits value.
                                last6Bits value.
                            ]
                        ]
                    ]
                ]
            ].
        ].
        newString at:idx put:(Character value:ascii).
        idx := idx + 1.
    ].
    ^ newString

    "
     CharacterArray fromUTF8Bytes:#[ 16r41 16r42 ]
     CharacterArray fromUTF8Bytes:#[ 16rC1 16r02 ]
     CharacterArray fromUTF8Bytes:#[ 16rE0 16r81 16r02 ]
     CharacterArray fromUTF8Bytes:#[ 16rEF 16rBF 16rBF ]

   rfc2279 examples:
     CharacterArray fromUTF8Bytes:#[ 16r41 16rE2 16r89 16rA2 16rCE 16r91 16r2E ]
     CharacterArray fromUTF8Bytes:#[ 16rED 16r95 16r9C 16rEA 16rB5 16rAD 16rEC 16r96 16rB4 ]
     CharacterArray fromUTF8Bytes:#[ 16rE6 16r97 16rA5 16rE6 16r9C 16rAC 16rE8 16rAA 16r9E ]

   invalid:
     CharacterArray fromUTF8Bytes:#[ 16rC0 16r80 ]
     CharacterArray fromUTF8Bytes:#[ 16rE0 16r80 16r80 ]

   invalid (overlong 5-byte sequence for 16r100000):
     self new decodeString:#[ 16rF8 16r84 16r80 16r80 16r80 ]
    "

    "Modified: / 18-09-2006 / 19:55:52 / cg"
!

encode:aCode
    "Encoding a single code is not supported by this encoder;
     this always invokes shouldNotImplement.
     Whole strings are encoded with encodeString: instead."

    self shouldNotImplement "/ no single byte conversion possible
!

encodeString:aUnicodeString
    "Answer the UTF-8 encoded form of aUnicodeString.
     The answer is meant to be written to an external file or channel;
     it is not intended for further string processing inside ST/X.

     Code points up to 16r7F are copied as they are; larger ones are written
     as a lead byte followed by 1 to 5 follow bytes of 6 payload bits each.

     If you work a lot with utf8 encoded textFiles,
     this is a first-class candidate for a primitive."

    |out upperLimits leadMarkers|

    "/ avoid creation of new strings
    aUnicodeString contains8BitCharacters ifFalse:[^ aUnicodeString].

    "/ entry n: largest code point which fits a sequence with n follow bytes,
    "/ and the marker bits of the lead byte of such a sequence
    upperLimits := #(16r7FF 16rFFFF 16r1FFFFF 16r3FFFFFF 16r7FFFFFFF).
    leadMarkers := #(2r11000000 2r11100000 2r11110000 2r11111000 2r11111100).

    out := WriteStream on:(String uninitializedNew:aUnicodeString size).
    aUnicodeString do:[:eachCharacter |
        |cp nFollow|

        cp := eachCharacter codePoint.
        cp <= 16r7F ifTrue:[
            out nextPut:eachCharacter.
        ] ifFalse:[
            "/ find the shortest sequence which can hold cp
            nFollow := 1.
            [nFollow <= 5 and:[cp > (upperLimits at:nFollow)]] whileTrue:[
                nFollow := nFollow + 1.
            ].
            nFollow > 5 ifTrue:[
                "/ cannot happen - we only support up to 30 bit characters
                self error:'ascii value > 31bit in utf8Encode'.
            ] ifFalse:[
                "/ lead byte: marker bits plus the topmost payload bits
                out nextPut:(Character value:((cp bitShift:(-6 * nFollow)) bitOr:(leadMarkers at:nFollow))).
                "/ follow bytes, most significant 6-bit group first
                nFollow - 1 to:0 by:-1 do:[:k |
                    out nextPut:(Character value:(((cp bitShift:(-6 * k)) bitAnd:16r3F) bitOr:2r10000000)).
                ].
            ].
        ].
    ].

    ^ out contents

    "
     (self encodeString:'hello') asByteArray                             #[104 101 108 108 111]
     (self encodeString:(Character value:16r40) asString) asByteArray    #[64]
     (self encodeString:(Character value:16r7F) asString) asByteArray    #[127]
     (self encodeString:(Character value:16r80) asString) asByteArray    #[194 128]
     (self encodeString:(Character value:16rFF) asString) asByteArray    #[195 191]
     (self encodeString:(Character value:16r100) asString) asByteArray   #[196 128]
     (self encodeString:(Character value:16r200) asString) asByteArray   #[200 128]
     (self encodeString:(Character value:16r400) asString) asByteArray   #[208 128]
     (self encodeString:(Character value:16r800) asString) asByteArray   #[224 160 128]
     (self encodeString:(Character value:16r1000) asString) asByteArray  #[225 128 128]
     (self encodeString:(Character value:16r2000) asString) asByteArray  #[226 128 128]
     (self encodeString:(Character value:16r4000) asString) asByteArray  #[228 128 128]
     (self encodeString:(Character value:16r8000) asString) asByteArray  #[232 128 128]
     (self encodeString:(Character value:16rFFFF) asString) asByteArray  #[239 191 191]
    "
! !

!ISO10646_to_UTF8 privateMethodsFor:'queries'!

bytesToReadFor:firstByte 
    "Answer the length in bytes of the sequence introduced by firstByte.
     A byte with bit 8 clear stands alone (answer 1); otherwise the answer
     is 1 plus the number of consecutive set bits found when scanning from
     bit 7 down to bit 3 (so the maximum is 6)."

    |count bitIndex|

    (firstByte isBitSet:8) ifFalse:[^ 1].

    count := 1.
    bitIndex := 7.
    "/ every further leading 1-bit announces one more byte
    [bitIndex >= 3 and:[firstByte isBitSet:bitIndex]] whileTrue:[
        count := count + 1.
        bitIndex := bitIndex - 1.
    ].
    ^ count

    "Created: / 14-06-2005 / 17:17:24 / janfrog"
! !

!ISO10646_to_UTF8 methodsFor:'queries'!

characterSize:charOrcodePoint
    "Answer how many bytes the UTF-8 encoding of charOrcodePoint occupies.
     The argument is converted with asInteger, so a character or a code point
     may be given. Values outside 0 .. 16r10FFFF raise an error."

    "Ranges as given in RFC 3629"

    |code|

    code := charOrcodePoint asInteger.
    code >= 0 ifTrue:[
        code <= 16r7F ifTrue:[^ 1].
        code <= 16r7FF ifTrue:[^ 2].
        code <= 16rFFFF ifTrue:[^ 3].
        code <= 16r10FFFF ifTrue:[^ 4].
    ].

    ^ self error:'Invalid codePoint'

    "Created: / 15-06-2005 / 15:16:22 / janfrog"
!

nameOfEncoding
    "Answer the symbolic name of the encoding implemented by this encoder."

    ^ #'utf8'
! !

!ISO10646_to_UTF8 methodsFor:'stream support'!

readNext:charactersToRead charactersFrom:stream
    "Read up to charactersToRead UTF-8 encoded characters from stream and
     answer them as a decoded string (see decodeString:).
     For each character, the lead element is peeked to find out how many
     elements make up its sequence; that many elements are then consumed.

     The peeked element is converted with asInteger before being passed to
     bytesToReadFor: (as readNextCharacterFrom: does), so that streams
     delivering characters are handled as well as streams delivering bytes.
     If the stream is exhausted early (peek answers nil), reading stops and
     the characters collected so far are decoded and answered."

    |collected lead|

    collected := (String new:charactersToRead) writeStream.
    charactersToRead timesRepeat:[
        lead := stream peek.
        "/ end of input: answer what has been collected so far
        lead isNil ifTrue:[^ self decodeString:collected contents].
        collected nextPutAll:(stream next:(self bytesToReadFor:lead asInteger))
    ].
    ^ self decodeString:collected contents

    "Created: / 16-06-2005 / 11:45:14 / masca"
!

readNextCharacterFrom:aStream 
    "Read one UTF-8 encoded character from aStream and answer it decoded.
     Answers nil if peeking at the stream yields nil.
     The lead element is only peeked; its sequence length is determined by
     bytesToReadFor: and that many elements are then consumed and decoded."

    |lead decoded|

    lead := aStream peek.
    lead isNil ifTrue:[^ nil].

    decoded := self decodeString:(aStream next:(self bytesToReadFor:lead asInteger)).
    "/ exactly one sequence was consumed, so more than one character is an internal error
    decoded size > 1 ifTrue:[
        self error:'Badly coded method'
    ].
    ^ decoded first

    "Created: / 14-06-2005 / 17:03:59 / janfrog"
! !

!ISO10646_to_UTF8 class methodsFor:'documentation'!

version
    "Answer the CVS Header keyword string recorded for this source file."

    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8.st,v 1.16 2009/09/22 09:08:09 fm Exp $'
!

version_SVN
    "Answer the Id keyword string (file name, revision, date, committer) recorded for this source file."

    ^ '$Id: CharacterEncoderImplementations__ISO10646_to_UTF8.st 10807 2012-05-05 21:58:24Z vranyj1 $'
! !