CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st
changeset 17564 67ae75f28757
parent 17522 eea77b0b2c82
child 17565 29224f55218c
equal deleted inserted replaced
17563:4affe2d5112a 17564:67ae75f28757
    15 
    15 
    16 "{ NameSpace: CharacterEncoderImplementations }"
    16 "{ NameSpace: CharacterEncoderImplementations }"
    17 
    17 
    18 ISO10646_to_UTF8 subclass:#ISO10646_to_UTF8_MAC
    18 ISO10646_to_UTF8 subclass:#ISO10646_to_UTF8_MAC
    19 	instanceVariableNames:''
    19 	instanceVariableNames:''
    20 	classVariableNames:'AccentMap DecomposeMap'
    20 	classVariableNames:'AccentMap DecomposeMap ComposeMap'
    21 	poolDictionaries:''
    21 	poolDictionaries:''
    22 	category:'Collections-Text-Encodings'
    22 	category:'Collections-Text-Encodings'
    23 !
    23 !
    24 
    24 
    25 !ISO10646_to_UTF8_MAC class methodsFor:'documentation'!
    25 !ISO10646_to_UTF8_MAC class methodsFor:'documentation'!
    43     UTF-8 can encode some diacritical characters (umlauts) in multiple ways:
    43     UTF-8 can encode some diacritical characters (umlauts) in multiple ways:
    44         - either with a single Unicode code point (e.g. ae -> ä -> &#228 -> C3 A4)
    44         - either with a single Unicode code point (e.g. ae -> ä -> &#228 -> C3 A4)
    45         - or as so called 'Normalization Form canonical Decomposition', i.e. as a regular 'a' followed by a
    45         - or as so called 'Normalization Form canonical Decomposition', i.e. as a regular 'a' followed by a
    46           combining diacritical mark (for example: acute).
    46           combining diacritical mark (for example: acute).
    47 
    47 
    48     MAC OSX needs the second form for its filenames.
    48     MAC OSX needs the second form for its file names.
    49     However, OSX does not decompose the ranges U+2000-U+2FFF, U+F900-U+FAFF and U+2F800-U+2FAFF.
    49     However, OSX does not decompose the ranges U+2000-U+2FFF, U+F900-U+FAFF and U+2F800-U+2FAFF.
    50 
    50 
    51     This is a q&d hack, to at least support the first page (latin1) characters.
    51     This is a q&d hack, to at least support the first page (latin1) characters.
    52     Will be enhanced for the 2nd and 3rd unicode page, when I find time.
    52     Will be enhanced for the 2nd and 3rd unicode page, when I find time.
    53 
    53 
    55         Claus Gittinger
    55         Claus Gittinger
    56 
    56 
    57     [instance variables:]
    57     [instance variables:]
    58 
    58 
    59     [class variables:]
    59     [class variables:]
       
    60         ComposeMap DecomposeMap
    60 
    61 
    61     [see also:]
    62     [see also:]
    62         http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html
    63         http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html
    63 
    64 
    64 "
    65 "
    65 ! !
    66 ! !
    66 
    67 
    67 !ISO10646_to_UTF8_MAC class methodsFor:'initialization'!
    68 !ISO10646_to_UTF8_MAC class methodsFor:'initialization'!
    68 
    69 
    69 initializeDecomposeMap
    70 initializeDecomposeMap
       
    71     "the map which decomposes a diacritical character into its two components"
       
    72 
    70     DecomposeMap := Dictionary new.
    73     DecomposeMap := Dictionary new.
    71 
    74     ComposeMap := Dictionary new.
    72     DecomposeMap at:"À"  16rC0 put:#( 16r41 16r0300).
    75 
    73     DecomposeMap at:"à" 16rE0 put:#( 16r61 16r0300).
    76     #(
    74     DecomposeMap at:"Á"  16rC1 put:#( 16r41 16r0301).
    77         (16r0300 "gravis" 'AÀaàEÈeèIÌiìoòOÒUÙuù')
    75     DecomposeMap at:"á" 16rE1 put:#( 16r61 16r0301).
    78         (16r0301 "akut"   'AÁaáEÉeéIÍiíOÓoóUÚuúyýYÝCĆcćNŃnńRŔrŕSŚsśZŹzź')
    76     DecomposeMap at:"Â"  16rC2 put:#( 16r41 16r0302).
    79         (16r0302 "circonflex" 'AÂaâEÊeêIÎiîOÔoôUÛuûCĈcĉGĜgĝHĤhĥJĴjĵSŜsŝWŴwŵYŶyŷ')
    77     DecomposeMap at:"â" 16rE2 put:#( 16r61 16r0302).
    80         (16r0303  "tilde" 'AÃaãNÑnñOÕoõUŨuũ')
    78     DecomposeMap at:"Ã"  16rC3 put:#( 16r41 16r0303).
    81         (16r0308  "umlaut" 'AÄaäOÖoöUÜuüIÏiïyÿYŸ')
    79     DecomposeMap at:"ã" 16rE3 put:#( 16r61 16r0303).
    82         (16r030A  "ring" 'AÅaåUŮuů')
    80     DecomposeMap at:"Ä"  16rC4 put:#( 16r41 16r0308).
    83         (16r030C  "caron" 'CČcčDĎEĚeěNŇnňRŘrřSŠsšZŽzž')
    81     DecomposeMap at:"ä" 16rE4 put:#( 16r61 16r0308).
    84         (16r0327  "cedille" 'CÇcçSŞsşTŢtţ')       
    82     DecomposeMap at:"Å"  16rC5 put:#( 16r41 16r030A).
    85     ) do:[:eachPair |
    83     DecomposeMap at:"å" 16rE5 put:#( 16r61 16r030A).
    86         |composeCode mapping|
    84 
    87 
    85     DecomposeMap at:"È"  16rC8 put:#( 16r45 16r0300).
    88         composeCode := eachPair first.
    86     DecomposeMap at:"è" 16rE8 put:#( 16r65 16r0300).
    89         mapping := eachPair second.
    87     DecomposeMap at:"É"  16rC9 put:#( 16r45 16r0301).
    90         mapping pairWiseDo:[:baseChar :composedChar |
    88     DecomposeMap at:"é" 16rE9 put:#( 16r65 16r0301).
    91             "/ setup, so that we find
    89     DecomposeMap at:"Ê"  16rCA put:#( 16r45 16r0302).
    92             "/    DecomposeMap at:"$à codePoint" 16rE0 put:#( "$a codePoint" 16r61 "grave codePoint" 16r0300).
    90     DecomposeMap at:"ê" 16rEA put:#( 16r65 16r0302).
    93             DecomposeMap 
    91     DecomposeMap at:"Ë"  16rCB put:#( 16r45 16r0308).
    94                 at:composedChar codePoint 
    92     DecomposeMap at:"ë" 16rEB put:#( 16r65 16r0308).
    95                 put:(Array with:baseChar codePoint with:composeCode)
    93 
    96         ].
    94     DecomposeMap at:"Ì"  16rCC put:#( 16r49 16r0300).
    97 
    95     DecomposeMap at:"ì" 16rEC put:#( 16r69 16r0300).
    98         ComposeMap at:composeCode put:mapping.
    96     DecomposeMap at:"í"  16rCD put:#( 16r49 16r0301).
    99     ].
    97     DecomposeMap at:"í" 16rED put:#( 16r69 16r0301).
       
    98     DecomposeMap at:"Î"  16rCE put:#( 16r49 16r0302).
       
    99     DecomposeMap at:"î" 16rEE put:#( 16r69 16r0302).
       
   100     DecomposeMap at:"Ï"  16rCF put:#( 16r49 16r0308).
       
   101     DecomposeMap at:"ï" 16rEF put:#( 16r69 16r0308).
       
   102 
       
   103     DecomposeMap at:"Ñ"  16rD1 put:#( 16r4E 16r0303).
       
   104     DecomposeMap at:"ñ" 16rF1 put:#( 16r6E 16r0303).
       
   105 
       
   106     DecomposeMap at:"Ò"  16rD2 put:#( 16r4F 16r0300).
       
   107     DecomposeMap at:"ò" 16rF2 put:#( 16r6F 16r0300).
       
   108     DecomposeMap at:"Ó"  16rD3 put:#( 16r4F 16r0301).
       
   109     DecomposeMap at:"ó" 16rF3 put:#( 16r6F 16r0301).
       
   110     DecomposeMap at:"Ô"  16rD4 put:#( 16r4F 16r0302).
       
   111     DecomposeMap at:"ô" 16rF4 put:#( 16r6F 16r0302).
       
   112     DecomposeMap at:"Õ"  16rD5 put:#( 16r4F 16r0303).
       
   113     DecomposeMap at:"õ" 16rF5 put:#( 16r6F 16r0303).
       
   114     DecomposeMap at:"Ö"  16rD6 put:#( 16r4F 16r0308).
       
   115     DecomposeMap at:"ö" 16rF6 put:#( 16r6F 16r0308).
       
   116 
       
   117     DecomposeMap at:"Ù"  16rD9 put:#( 16r55 16r0300).
       
   118     DecomposeMap at:"ù" 16rF9 put:#( 16r75 16r0300).
       
   119     DecomposeMap at:"Ú"  16rDA put:#( 16r55 16r0301).
       
   120     DecomposeMap at:"ú" 16rFA put:#( 16r75 16r0301).
       
   121     DecomposeMap at:"Û"  16rDB put:#( 16r55 16r0302).
       
   122     DecomposeMap at:"û" 16rDB put:#( 16r75 16r0302).
       
   123     DecomposeMap at:"Ü"  16rDC put:#( 16r55 16r0308).
       
   124     DecomposeMap at:"ü" 16rFC put:#( 16r75 16r0308).
       
   125 
       
   126     DecomposeMap at:"Ý"  16rDD put:#( 16r59 16r0301).
       
   127     DecomposeMap at:"ý" 16rFD put:#( 16r79 16r0301).
       
   128 
       
   129     DecomposeMap at:"ÿ"  16rFF put:#( 16r79 16r0308).
       
   130 ! !
   100 ! !
   131 
   101 
   132 !ISO10646_to_UTF8_MAC methodsFor:'encoding & decoding'!
   102 !ISO10646_to_UTF8_MAC methodsFor:'encoding & decoding'!
   133 
   103 
   134 compositionOf: baseChar with: diacriticalChar  to: outStream
   104 compositionOf: baseChar with: diacriticalChar  to: outStream
   136      a + umlaut-diacritic-mark -> ä."
   106      a + umlaut-diacritic-mark -> ä."
   137 
   107 
   138     |cp map i|
   108     |cp map i|
   139 
   109 
   140     cp := diacriticalChar codePoint.
   110     cp := diacriticalChar codePoint.
   141     cp == 16r0300  ifTrue:[
   111     map := ComposeMap at:cp ifAbsent:nil.
   142         "/ accent grave
   112 
   143         map := 'AÀaàEÈeèIÌiìoòOÒUÙuù'.
       
   144     ] ifFalse:[ cp == 16r0301  ifTrue:[
       
   145         "/ accent
       
   146         map := 'AÁaáEÉeéIÍiíOÓoóUÚuúyýYÝ'.
       
   147     ] ifFalse:[ cp == 16r0302  ifTrue:[
       
   148         "/ circonflex
       
   149         map := 'AÂaâEÊeêIÎiîOÔoôUÛuû'.
       
   150     ] ifFalse:[ cp == 16r0303  ifTrue:[
       
   151         "/ tilde
       
   152         map := 'AÃaãNÑnñOÕoõ'.
       
   153     ] ifFalse:[ cp == 16r0308  ifTrue:[
       
   154         "/ umlaut
       
   155         map := 'AÄaäOÖoöUÜuüIÏiïyÿ'.
       
   156     ] ifFalse:[ cp == 16r030A  ifTrue:[
       
   157         "/ ring
       
   158         map := 'AÅaå'.
       
   159     ]]]]]].
       
   160     map notNil ifTrue:[
   113     map notNil ifTrue:[
       
   114         "/ compose
   161         i := map indexOf: baseChar.
   115         i := map indexOf: baseChar.
   162         i ~~ 0 ifTrue:[
   116         i ~~ 0 ifTrue:[
   163             outStream nextPut: (map at:i+1).
   117             outStream nextPut: (map at:i+1).
   164             ^ self.
   118             ^ self.
   165         ].
   119         ].
   166     ].
   120     ].
   167 
   121 
       
   122     "/ leave as is
   168     outStream nextPut: baseChar.
   123     outStream nextPut: baseChar.
   169     outStream nextPut: diacriticalChar.
   124     outStream nextPut: diacriticalChar.
   170 !
   125 !
   171 
   126 
   172 decodeString:aStringOrByteCollection
   127 decodeString:aStringOrByteCollection
   173     "return a Unicode string from the passed in UTF-8-MAC encoded string.
   128     "return a Unicode string from the passed in UTF-8-MAC encoded string.
   174      This is UTF-8 with compose-characters decomposed 
   129      This is UTF-8 with compose-characters decomposed 
   175      (i.e. as separate codes, not as single combined characters).
   130      (i.e. as separate codes, not as single combined characters).
   176 
   131 
   177      For now, here is a hacked (hardwired knowledge) version, 
   132      For now, here is a limited version, which should work
   178      which will work for some european countries only...
   133      at least for most european countries...
   179     "
   134     "
   180 
   135 
   181     |s buff previous|
   136     |s buff previous|
   182 
   137 
   183     s := super  decodeString:aStringOrByteCollection.
   138     s := super decodeString:aStringOrByteCollection.
   184     (s contains:[:char | char codePoint between:16r0300 and:16r030F]) ifFalse:[^ s].
   139     (s contains:[:char | char codePoint between:16r0300 and:16r0327]) ifFalse:[^ s].
       
   140 
       
   141     ComposeMap isNil ifTrue:[
       
   142         self class initializeDecomposeMap
       
   143     ].
   185 
   144 
   186     buff := CharacterWriteStream on:''.
   145     buff := CharacterWriteStream on:''.
   187     previous := nil.
   146     previous := nil.
   188     s do:[:each |
   147     s do:[:each |
   189         (each codePoint between:16r0300 and:16r030F) ifTrue:[
   148         (each codePoint between:16r0300 and:16r0327) ifTrue:[
   190             self compositionOf:previous with:each to:buff.
   149             self compositionOf:previous with:each to:buff.
   191             previous := nil.
   150             previous := nil.
   192         ] ifFalse:[
   151         ] ifFalse:[
   193             previous notNil ifTrue:[
   152             previous notNil ifTrue:[
   194                 buff nextPut:previous.
   153                 buff nextPut:previous.
   218 
   177 
   219 decompositionOf: codePointIn into:outBlockWithTwoArgs
   178 decompositionOf: codePointIn into:outBlockWithTwoArgs
   220     "if required, decompose a diacritical character into a base character and a punctuation;
   179     "if required, decompose a diacritical character into a base character and a punctuation;
   221      eg. ä -> a + umlaut-diacritic-mark.
   180      eg. ä -> a + umlaut-diacritic-mark.
   222      Pass both as args to the given block.
   181      Pass both as args to the given block.
   223      For non-diacritical chars, pass a nil diacritical-mark value"
   182      For non-diacritical chars, pass a nil diacritical-mark value.
       
   183      Return true, if a decomposition was done."
   224 
   184 
   225     |entry|
   185     |entry|
   226 
   186 
   227     codePointIn < 16rC0 ifTrue:[ ^ false ].
   187     codePointIn < 16rC0 ifTrue:[ ^ false ].
   228 
   188 
   236 encodeString:aUnicodeString
   196 encodeString:aUnicodeString
   237     "return the UTF-8-MAC representation of aUnicodeString.
   197     "return the UTF-8-MAC representation of aUnicodeString.
   238      This is UTF-8 with compose-characters decomposed (i.e. as separate codes, not as
   198      This is UTF-8 with compose-characters decomposed (i.e. as separate codes, not as
   239      single combined characters).
   199      single combined characters).
   240 
   200 
   241      For now, here is a hacked (hardwired knowledge) version, which should work
   201      For now, here is a limited version, which should work
   242      at least for some european countries...
   202      at least for most european countries...
   243     "
   203     "
   244 
   204 
   245     |gen s decomp codePoint composeCodePoint|
   205     |gen s decomp codePoint composeCodePoint|
   246 
   206 
   247     DecomposeMap isNil ifTrue:[
   207     DecomposeMap isNil ifTrue:[
   294                     ].
   254                     ].
   295                 ].
   255                 ].
   296             ].
   256             ].
   297         ].
   257         ].
   298 
   258 
   299     decomp := [:baseCodePointArg :composeCodePointArg | 
   259     decomp := 
   300                 codePoint := baseCodePointArg. composeCodePoint := composeCodePointArg
   260         [:baseCodePointArg :composeCodePointArg | 
   301               ].
   261             codePoint := baseCodePointArg. composeCodePoint := composeCodePointArg
       
   262         ].
   302 
   263 
   303     s := WriteStream on:(String uninitializedNew:aUnicodeString size).
   264     s := WriteStream on:(String uninitializedNew:aUnicodeString size).
   304     aUnicodeString do:[:eachCharacter |
   265     aUnicodeString do:[:eachCharacter |
   305         |needExtra|
   266         |needExtra|
   306 
   267 
   338 ! !
   299 ! !
   339 
   300 
   340 !ISO10646_to_UTF8_MAC class methodsFor:'documentation'!
   301 !ISO10646_to_UTF8_MAC class methodsFor:'documentation'!
   341 
   302 
   342 version
   303 version
   343     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.3 2015-02-20 18:50:00 cg Exp $'
   304     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.4 2015-02-27 17:07:14 cg Exp $'
   344 !
   305 !
   345 
   306 
   346 version_CVS
   307 version_CVS
   347     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.3 2015-02-20 18:50:00 cg Exp $'
   308     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.4 2015-02-27 17:07:14 cg Exp $'
   348 ! !
   309 ! !
   349 
   310