CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st
author Claus Gittinger <cg@exept.de>
Fri, 20 Feb 2015 19:50:00 +0100
changeset 17522 eea77b0b2c82
parent 17497 36ab19b73c1f
child 17564 67ae75f28757
permissions -rw-r--r--
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC comment/format in: #encodeString: changed: #decodeString: ouch - a bug (last character lost)

"{ Encoding: utf8 }"

"
 COPYRIGHT (c) 2015 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic' }"

"{ NameSpace: CharacterEncoderImplementations }"

ISO10646_to_UTF8 subclass:#ISO10646_to_UTF8_MAC
	instanceVariableNames:''
	classVariableNames:'AccentMap DecomposeMap'
	poolDictionaries:''
	category:'Collections-Text-Encodings'
!

!ISO10646_to_UTF8_MAC class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 2015 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    UTF-8 can encode some diacritical characters (umlauts) in multiple ways:
        - either with a single uniode (e.g. ae -> ä -> &#228 -> C3 A4)
        - or as so called 'Normalization Form canonical Decomposition', i.e. as a regular 'a' followed by a
          combining diacritical mark (for example: acute).

    MAC OSX needs the second form for its filenames.
    However, OSX does not decompose the ranges U+2000-U+2FFF, U+F900-U+FAFF and U+2F800-U+2FAFF.

    This is a q&d hack, to at least support the first page (latin1) characters.
    Will be enhanced for the 2nd and 3rd unicode page, when I find time.

    [author:]
        Claus Gittinger

    [instance variables:]

    [class variables:]

    [see also:]
        http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html

"
! !

!ISO10646_to_UTF8_MAC class methodsFor:'initialization'!

initializeDecomposeMap
    DecomposeMap := Dictionary new.

    DecomposeMap at:"À"  16rC0 put:#( 16r41 16r0300).
    DecomposeMap at:"à" 16rE0 put:#( 16r61 16r0300).
    DecomposeMap at:"Á"  16rC1 put:#( 16r41 16r0301).
    DecomposeMap at:"á" 16rE1 put:#( 16r61 16r0301).
    DecomposeMap at:"Â"  16rC2 put:#( 16r41 16r0302).
    DecomposeMap at:"â" 16rE2 put:#( 16r61 16r0302).
    DecomposeMap at:"Ã"  16rC3 put:#( 16r41 16r0303).
    DecomposeMap at:"ã" 16rE3 put:#( 16r61 16r0303).
    DecomposeMap at:"Ä"  16rC4 put:#( 16r41 16r0308).
    DecomposeMap at:"ä" 16rE4 put:#( 16r61 16r0308).
    DecomposeMap at:"Å"  16rC5 put:#( 16r41 16r030A).
    DecomposeMap at:"å" 16rE5 put:#( 16r61 16r030A).

    DecomposeMap at:"È"  16rC8 put:#( 16r45 16r0300).
    DecomposeMap at:"è" 16rE8 put:#( 16r65 16r0300).
    DecomposeMap at:"É"  16rC9 put:#( 16r45 16r0301).
    DecomposeMap at:"é" 16rE9 put:#( 16r65 16r0301).
    DecomposeMap at:"Ê"  16rCA put:#( 16r45 16r0302).
    DecomposeMap at:"ê" 16rEA put:#( 16r65 16r0302).
    DecomposeMap at:"Ë"  16rCB put:#( 16r45 16r0308).
    DecomposeMap at:"ë" 16rEB put:#( 16r65 16r0308).

    DecomposeMap at:"Ì"  16rCC put:#( 16r49 16r0300).
    DecomposeMap at:"ì" 16rEC put:#( 16r69 16r0300).
    DecomposeMap at:"í"  16rCD put:#( 16r49 16r0301).
    DecomposeMap at:"í" 16rED put:#( 16r69 16r0301).
    DecomposeMap at:"Î"  16rCE put:#( 16r49 16r0302).
    DecomposeMap at:"î" 16rEE put:#( 16r69 16r0302).
    DecomposeMap at:"Ï"  16rCF put:#( 16r49 16r0308).
    DecomposeMap at:"ï" 16rEF put:#( 16r69 16r0308).

    DecomposeMap at:"Ñ"  16rD1 put:#( 16r4E 16r0303).
    DecomposeMap at:"ñ" 16rF1 put:#( 16r6E 16r0303).

    DecomposeMap at:"Ò"  16rD2 put:#( 16r4F 16r0300).
    DecomposeMap at:"ò" 16rF2 put:#( 16r6F 16r0300).
    DecomposeMap at:"Ó"  16rD3 put:#( 16r4F 16r0301).
    DecomposeMap at:"ó" 16rF3 put:#( 16r6F 16r0301).
    DecomposeMap at:"Ô"  16rD4 put:#( 16r4F 16r0302).
    DecomposeMap at:"ô" 16rF4 put:#( 16r6F 16r0302).
    DecomposeMap at:"Õ"  16rD5 put:#( 16r4F 16r0303).
    DecomposeMap at:"õ" 16rF5 put:#( 16r6F 16r0303).
    DecomposeMap at:"Ö"  16rD6 put:#( 16r4F 16r0308).
    DecomposeMap at:"ö" 16rF6 put:#( 16r6F 16r0308).

    DecomposeMap at:"Ù"  16rD9 put:#( 16r55 16r0300).
    DecomposeMap at:"ù" 16rF9 put:#( 16r75 16r0300).
    DecomposeMap at:"Ú"  16rDA put:#( 16r55 16r0301).
    DecomposeMap at:"ú" 16rFA put:#( 16r75 16r0301).
    DecomposeMap at:"Û"  16rDB put:#( 16r55 16r0302).
    DecomposeMap at:"û" 16rDB put:#( 16r75 16r0302).
    DecomposeMap at:"Ü"  16rDC put:#( 16r55 16r0308).
    DecomposeMap at:"ü" 16rFC put:#( 16r75 16r0308).

    DecomposeMap at:"Ý"  16rDD put:#( 16r59 16r0301).
    DecomposeMap at:"ý" 16rFD put:#( 16r79 16r0301).

    DecomposeMap at:"ÿ"  16rFF put:#( 16r79 16r0308).
! !

!ISO10646_to_UTF8_MAC methodsFor:'encoding & decoding'!

compositionOf: baseChar with: diacriticalChar  to: outStream
    "compose two characters into one
     a + umlaut-diacritic-mark -> ä."

    |cp map i|

    cp := diacriticalChar codePoint.
    cp == 16r0300  ifTrue:[
        "/ accent grave
        map := 'AÀaàEÈeèIÌiìoòOÒUÙuù'.
    ] ifFalse:[ cp == 16r0301  ifTrue:[
        "/ accent
        map := 'AÁaáEÉeéIÍiíOÓoóUÚuúyýYÝ'.
    ] ifFalse:[ cp == 16r0302  ifTrue:[
        "/ circonflex
        map := 'AÂaâEÊeêIÎiîOÔoôUÛuû'.
    ] ifFalse:[ cp == 16r0303  ifTrue:[
        "/ tilde
        map := 'AÃaãNÑnñOÕoõ'.
    ] ifFalse:[ cp == 16r0308  ifTrue:[
        "/ umlaut
        map := 'AÄaäOÖoöUÜuüIÏiïyÿ'.
    ] ifFalse:[ cp == 16r030A  ifTrue:[
        "/ ring
        map := 'AÅaå'.
    ]]]]]].
    map notNil ifTrue:[
        i := map indexOf: baseChar.
        i ~~ 0 ifTrue:[
            outStream nextPut: (map at:i+1).
            ^ self.
        ].
    ].

    outStream nextPut: baseChar.
    outStream nextPut: diacriticalChar.
!

decodeString:aStringOrByteCollection
    "return a Unicode string from the passed in UTF-8-MAC encoded string.
     This is UTF-8 with compose-characters decomposed 
     (i.e. as separate codes, not as single combined characters).

     For now, here is a hacked (hardwired knowledge) version, 
     which will work for some european countries only...
    "

    |s buff previous|

    s := super  decodeString:aStringOrByteCollection.
    (s contains:[:char | char codePoint between:16r0300 and:16r030F]) ifFalse:[^ s].

    buff := CharacterWriteStream on:''.
    previous := nil.
    s do:[:each |
        (each codePoint between:16r0300 and:16r030F) ifTrue:[
            self compositionOf:previous with:each to:buff.
            previous := nil.
        ] ifFalse:[
            previous notNil ifTrue:[
                buff nextPut:previous.
            ].
            previous := each.
        ].
    ].
    previous notNil ifTrue:[
        buff nextPut:previous.
    ].
    ^ buff contents.

    "
     (ISO10646_to_UTF8 new encodeString:'aäoöuü') asByteArray   
        -> #[97 195 164 111 195 182 117 195 188]

     (ISO10646_to_UTF8 new decodeString:
            (ISO10646_to_UTF8 new encodeString:'aäoöuü') asByteArray)    

    (ISO10646_to_UTF8_MAC new encodeString:'aäoöuü') asByteArray 
        -> #[97 97 204 136 111 111 204 136 117 117 204 136]  

     (ISO10646_to_UTF8_MAC new decodeString:
            (ISO10646_to_UTF8_MAC new encodeString:'aäoöuü') asByteArray)    
    "
!

decompositionOf: codePointIn into:outBlockWithTwoArgs
    "if required, decompose a diacritical character into a base character and a punctuation;
     eg. ä -> a + umlaut-diacritic-mark.
     Pass both as args to the given block.
     For non diactit. chars, pass a nil diacrit-mark value"

    |entry|

    codePointIn < 16rC0 ifTrue:[ ^ false ].

    entry := DecomposeMap at:codePointIn ifAbsent:nil.
    entry isNil ifTrue:[ ^ false ].

    outBlockWithTwoArgs value:(entry at:1) value:(entry at:2).
    ^ true
!

encodeString:aUnicodeString
    "return the UTF-8-MAC representation of a aUnicodeString.
     This is UTF-8 with compose-characters decompose (i.e. as separate codes, not as
     single combined characters).

     For now, here is a hacked (hardwired knowledge) version, which should work
     at least for some european countries...
    "

    |gen s decomp codePoint composeCodePoint|

    DecomposeMap isNil ifTrue:[
        self class initializeDecomposeMap
    ].

    gen := 
        [:codePointArg |
            |codePoint "{Class: SmallInteger }" b1 b2 b3 b4 b5 v "{Class: SmallInteger }"|

            codePoint := codePointArg.
            codePoint <= 16r7F ifTrue:[
                s nextPut:(Character value:codePoint).
            ] ifFalse:[
                b1 := Character value:((codePoint bitAnd:16r3F) bitOr:2r10000000).
                v := codePoint bitShift:-6.
                v <= 16r1F ifTrue:[
                    s nextPut:(Character value:(v bitOr:2r11000000)).
                    s nextPut:b1.
                ] ifFalse:[
                    b2 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
                    v := v bitShift:-6.
                    v <= 16r0F ifTrue:[
                        s nextPut:(Character value:(v bitOr:2r11100000)).
                        s nextPut:b2; nextPut:b1.
                    ] ifFalse:[
                        b3 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
                        v := v bitShift:-6.
                        v <= 16r07 ifTrue:[
                            s nextPut:(Character value:(v bitOr:2r11110000)).
                            s nextPut:b3; nextPut:b2; nextPut:b1.
                        ] ifFalse:[
                            b4 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
                            v := v bitShift:-6.
                            v <= 16r03 ifTrue:[
                                s nextPut:(Character value:(v bitOr:2r11111000)).
                                s nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
                            ] ifFalse:[
                                b5 := Character value:((v bitAnd:16r3F) bitOr:2r10000000).
                                v := v bitShift:-6.
                                v <= 16r01 ifTrue:[
                                    s nextPut:(Character value:(v bitOr:2r11111100)).
                                    s nextPut:b5; nextPut:b4; nextPut:b3; nextPut:b2; nextPut:b1.
                                ] ifFalse:[
                                    "/ cannot happen - we only support up to 30 bit characters
                                    self error:'ascii value > 31bit in utf8Encode'.
                                ]
                            ].
                        ].
                    ].
                ].
            ].
        ].

    decomp := [:baseCodePointArg :composeCodePointArg | 
                codePoint := baseCodePointArg. composeCodePoint := composeCodePointArg
              ].

    s := WriteStream on:(String uninitializedNew:aUnicodeString size).
    aUnicodeString do:[:eachCharacter |
        |needExtra|

        codePoint := eachCharacter codePoint.
        needExtra := self decompositionOf: codePoint into:decomp.
        gen value:codePoint.
        needExtra ifTrue:[
            gen value:composeCodePoint
        ].
    ].

    ^ s contents

    "
     (self encodeString:'hello') asByteArray                             #[104 101 108 108 111]
     (self encodeString:(Character value:16r40) asString) asByteArray    #[64]
     (self encodeString:(Character value:16r7F) asString) asByteArray    #[127]
     (self encodeString:(Character value:16r80) asString) asByteArray    #[194 128]
     (self encodeString:(Character value:16rFF) asString) asByteArray    #[195 191]

     (ISO10646_to_UTF8     new encodeString:'aäoöuü') asByteArray   
        -> #[97 195 164 111 195 182 117 195 188]
     (ISO10646_to_UTF8_MAC new encodeString:'aäoöuü') asByteArray 
        -> #[97 97 204 136 111 111 204 136 117 117 204 136]  

     ISO10646_to_UTF8_MAC new decodeString:
         (ISO10646_to_UTF8_MAC new encodeString:'Packages aus VSE für Smalltalk_X') asByteArray 
    "
! !

!ISO10646_to_UTF8_MAC methodsFor:'queries'!

nameOfEncoding
    ^ #'utf8-mac'
! !

!ISO10646_to_UTF8_MAC class methodsFor:'documentation'!

version
    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.3 2015-02-20 18:50:00 cg Exp $'
!

version_CVS
    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.3 2015-02-20 18:50:00 cg Exp $'
! !