43 UTF-8 can encode some diacritical characters (umlauts) in multiple ways: |
43 UTF-8 can encode some diacritical characters (umlauts) in multiple ways: |
44 - either with a single Unicode code point (e.g. ae -> ä -> ä -> C3 A4) |
44 - either with a single Unicode code point (e.g. ae -> ä -> ä -> C3 A4) |
45 - or as so-called 'Normalization Form Canonical Decomposition' (NFD), i.e. as a regular 'a' followed by a |
45 - or as so-called 'Normalization Form Canonical Decomposition' (NFD), i.e. as a regular 'a' followed by a |
46 combining diacritical mark (for example: acute). |
46 combining diacritical mark (for example: acute). |
47 |
47 |
48 MAC OSX needs the second form for its filenames. |
48 MAC OSX needs the second form for its file names. |
49 However, OSX does not decompose the ranges U+2000-U+2FFF, U+F900-U+FAFF and U+2F800-U+2FAFF. |
49 However, OSX does not decompose the ranges U+2000-U+2FFF, U+F900-U+FAFF and U+2F800-U+2FAFF. |
50 |
50 |
51 This is a quick-and-dirty hack, to at least support the first page (latin1) characters. |
51 This is a quick-and-dirty hack, to at least support the first page (latin1) characters. |
52 Will be enhanced for the 2nd and 3rd unicode page, when I find time. |
52 Will be enhanced for the 2nd and 3rd unicode page, when I find time. |
53 |
53 |
55 Claus Gittinger |
55 Claus Gittinger |
56 |
56 |
57 [instance variables:] |
57 [instance variables:] |
58 |
58 |
59 [class variables:] |
59 [class variables:] |
|
60 ComposeMap DecomposeMap |
60 |
61 |
61 [see also:] |
62 [see also:] |
62 http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html |
63 http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html |
63 |
64 |
64 " |
65 " |
65 ! ! |
66 ! ! |
66 |
67 |
67 !ISO10646_to_UTF8_MAC class methodsFor:'initialization'! |
68 !ISO10646_to_UTF8_MAC class methodsFor:'initialization'! |
68 |
69 |
69 initializeDecomposeMap |
70 initializeDecomposeMap |
|
71 "the map which decomposes a diacritical character into its two components" |
|
72 |
70 DecomposeMap := Dictionary new. |
73 DecomposeMap := Dictionary new. |
71 |
74 ComposeMap := Dictionary new. |
72 DecomposeMap at:"À" 16rC0 put:#( 16r41 16r0300). |
75 |
73 DecomposeMap at:"à" 16rE0 put:#( 16r61 16r0300). |
76 #( |
74 DecomposeMap at:"Á" 16rC1 put:#( 16r41 16r0301). |
77 (16r0300 "gravis" 'AÀaàEÈeèIÌiìoòOÒUÙuù') |
75 DecomposeMap at:"á" 16rE1 put:#( 16r61 16r0301). |
78 (16r0301 "akut" 'AÁaáEÉeéIÍiíOÓoóUÚuúyýYÝCĆcćNŃnńRŔrŕSŚsśZŹzź') |
76 DecomposeMap at:"Â" 16rC2 put:#( 16r41 16r0302). |
79 (16r0302 "circonflex" 'AÂaâEÊeêIÎiîOÔoôUÛuûCĈcĉGĜgĝHĤhĥJĴjĵSŜsŝWŴwŵYŶyŷ') |
77 DecomposeMap at:"â" 16rE2 put:#( 16r61 16r0302). |
80 (16r0303 "tilde" 'AÃaãNÑnñOÕoõUŨuũ') |
78 DecomposeMap at:"Ã" 16rC3 put:#( 16r41 16r0303). |
81 (16r0308 "umlaut" 'AÄaäOÖoöUÜuüIÏiïyÿYŸ') |
79 DecomposeMap at:"ã" 16rE3 put:#( 16r61 16r0303). |
82 (16r030A "ring" 'AÅaåUŮuů') |
80 DecomposeMap at:"Ä" 16rC4 put:#( 16r41 16r0308). |
83 (16r030C "caron" 'CČcčDĎEĚeěNŇnňRŘrřSŠsšZŽzž') |
81 DecomposeMap at:"ä" 16rE4 put:#( 16r61 16r0308). |
84 (16r0327 "cedille" 'CÇc窺TŢtţ') |
82 DecomposeMap at:"Å" 16rC5 put:#( 16r41 16r030A). |
85 ) do:[:eachPair | |
83 DecomposeMap at:"å" 16rE5 put:#( 16r61 16r030A). |
86 |composeCode mapping| |
84 |
87 |
85 DecomposeMap at:"È" 16rC8 put:#( 16r45 16r0300). |
88 composeCode := eachPair first. |
86 DecomposeMap at:"è" 16rE8 put:#( 16r65 16r0300). |
89 mapping := eachPair second. |
87 DecomposeMap at:"É" 16rC9 put:#( 16r45 16r0301). |
90 mapping pairWiseDo:[:baseChar :composedChar | |
88 DecomposeMap at:"é" 16rE9 put:#( 16r65 16r0301). |
91 "/ setup, so that we find |
89 DecomposeMap at:"Ê" 16rCA put:#( 16r45 16r0302). |
92 "/ DecomposeMap at:"$à codePoint" 16rE0 put:#( "$a codePoint" 16r61 "grave codePoint" 16r0300). |
90 DecomposeMap at:"ê" 16rEA put:#( 16r65 16r0302). |
93 DecomposeMap |
91 DecomposeMap at:"Ë" 16rCB put:#( 16r45 16r0308). |
94 at:composedChar codePoint |
92 DecomposeMap at:"ë" 16rEB put:#( 16r65 16r0308). |
95 put:(Array with:baseChar codePoint with:composeCode) |
93 |
96 ]. |
94 DecomposeMap at:"Ì" 16rCC put:#( 16r49 16r0300). |
97 |
95 DecomposeMap at:"ì" 16rEC put:#( 16r69 16r0300). |
98 ComposeMap at:composeCode put:mapping. |
96 DecomposeMap at:"í" 16rCD put:#( 16r49 16r0301). |
99 ]. |
97 DecomposeMap at:"í" 16rED put:#( 16r69 16r0301). |
|
98 DecomposeMap at:"Î" 16rCE put:#( 16r49 16r0302). |
|
99 DecomposeMap at:"î" 16rEE put:#( 16r69 16r0302). |
|
100 DecomposeMap at:"Ï" 16rCF put:#( 16r49 16r0308). |
|
101 DecomposeMap at:"ï" 16rEF put:#( 16r69 16r0308). |
|
102 |
|
103 DecomposeMap at:"Ñ" 16rD1 put:#( 16r4E 16r0303). |
|
104 DecomposeMap at:"ñ" 16rF1 put:#( 16r6E 16r0303). |
|
105 |
|
106 DecomposeMap at:"Ò" 16rD2 put:#( 16r4F 16r0300). |
|
107 DecomposeMap at:"ò" 16rF2 put:#( 16r6F 16r0300). |
|
108 DecomposeMap at:"Ó" 16rD3 put:#( 16r4F 16r0301). |
|
109 DecomposeMap at:"ó" 16rF3 put:#( 16r6F 16r0301). |
|
110 DecomposeMap at:"Ô" 16rD4 put:#( 16r4F 16r0302). |
|
111 DecomposeMap at:"ô" 16rF4 put:#( 16r6F 16r0302). |
|
112 DecomposeMap at:"Õ" 16rD5 put:#( 16r4F 16r0303). |
|
113 DecomposeMap at:"õ" 16rF5 put:#( 16r6F 16r0303). |
|
114 DecomposeMap at:"Ö" 16rD6 put:#( 16r4F 16r0308). |
|
115 DecomposeMap at:"ö" 16rF6 put:#( 16r6F 16r0308). |
|
116 |
|
117 DecomposeMap at:"Ù" 16rD9 put:#( 16r55 16r0300). |
|
118 DecomposeMap at:"ù" 16rF9 put:#( 16r75 16r0300). |
|
119 DecomposeMap at:"Ú" 16rDA put:#( 16r55 16r0301). |
|
120 DecomposeMap at:"ú" 16rFA put:#( 16r75 16r0301). |
|
121 DecomposeMap at:"Û" 16rDB put:#( 16r55 16r0302). |
|
122 DecomposeMap at:"û" 16rDB put:#( 16r75 16r0302). |
|
123 DecomposeMap at:"Ü" 16rDC put:#( 16r55 16r0308). |
|
124 DecomposeMap at:"ü" 16rFC put:#( 16r75 16r0308). |
|
125 |
|
126 DecomposeMap at:"Ý" 16rDD put:#( 16r59 16r0301). |
|
127 DecomposeMap at:"ý" 16rFD put:#( 16r79 16r0301). |
|
128 |
|
129 DecomposeMap at:"ÿ" 16rFF put:#( 16r79 16r0308). |
|
130 ! ! |
100 ! ! |
131 |
101 |
132 !ISO10646_to_UTF8_MAC methodsFor:'encoding & decoding'! |
102 !ISO10646_to_UTF8_MAC methodsFor:'encoding & decoding'! |
133 |
103 |
134 compositionOf: baseChar with: diacriticalChar to: outStream |
104 compositionOf: baseChar with: diacriticalChar to: outStream |
136 a + umlaut-diacritic-mark -> ä." |
106 a + umlaut-diacritic-mark -> ä." |
137 |
107 |
138 |cp map i| |
108 |cp map i| |
139 |
109 |
140 cp := diacriticalChar codePoint. |
110 cp := diacriticalChar codePoint. |
141 cp == 16r0300 ifTrue:[ |
111 map := ComposeMap at:cp ifAbsent:nil. |
142 "/ accent grave |
112 |
143 map := 'AÀaàEÈeèIÌiìoòOÒUÙuù'. |
|
144 ] ifFalse:[ cp == 16r0301 ifTrue:[ |
|
145 "/ accent |
|
146 map := 'AÁaáEÉeéIÍiíOÓoóUÚuúyýYÝ'. |
|
147 ] ifFalse:[ cp == 16r0302 ifTrue:[ |
|
148 "/ circonflex |
|
149 map := 'AÂaâEÊeêIÎiîOÔoôUÛuû'. |
|
150 ] ifFalse:[ cp == 16r0303 ifTrue:[ |
|
151 "/ tilde |
|
152 map := 'AÃaãNÑnñOÕoõ'. |
|
153 ] ifFalse:[ cp == 16r0308 ifTrue:[ |
|
154 "/ umlaut |
|
155 map := 'AÄaäOÖoöUÜuüIÏiïyÿ'. |
|
156 ] ifFalse:[ cp == 16r030A ifTrue:[ |
|
157 "/ ring |
|
158 map := 'AÅaå'. |
|
159 ]]]]]]. |
|
160 map notNil ifTrue:[ |
113 map notNil ifTrue:[ |
|
114 "/ compose |
161 i := map indexOf: baseChar. |
115 i := map indexOf: baseChar. |
162 i ~~ 0 ifTrue:[ |
116 i ~~ 0 ifTrue:[ |
163 outStream nextPut: (map at:i+1). |
117 outStream nextPut: (map at:i+1). |
164 ^ self. |
118 ^ self. |
165 ]. |
119 ]. |
166 ]. |
120 ]. |
167 |
121 |
|
122 "/ leave as is |
168 outStream nextPut: baseChar. |
123 outStream nextPut: baseChar. |
169 outStream nextPut: diacriticalChar. |
124 outStream nextPut: diacriticalChar. |
170 ! |
125 ! |
171 |
126 |
172 decodeString:aStringOrByteCollection |
127 decodeString:aStringOrByteCollection |
173 "return a Unicode string from the passed in UTF-8-MAC encoded string. |
128 "return a Unicode string from the passed in UTF-8-MAC encoded string. |
174 This is UTF-8 with compose-characters decomposed |
129 This is UTF-8 with compose-characters decomposed |
175 (i.e. as separate codes, not as single combined characters). |
130 (i.e. as separate codes, not as single combined characters). |
176 |
131 |
177 For now, here is a hacked (hardwired knowledge) version, |
132 For now, here is a limited version, which should work |
178 which will work for some european countries only... |
133 at least for most european countries... |
179 " |
134 " |
180 |
135 |
181 |s buff previous| |
136 |s buff previous| |
182 |
137 |
183 s := super decodeString:aStringOrByteCollection. |
138 s := super decodeString:aStringOrByteCollection. |
184 (s contains:[:char | char codePoint between:16r0300 and:16r030F]) ifFalse:[^ s]. |
139 (s contains:[:char | char codePoint between:16r0300 and:16r0327]) ifFalse:[^ s]. |
|
140 |
|
141 ComposeMap isNil ifTrue:[ |
|
142 self class initializeDecomposeMap |
|
143 ]. |
185 |
144 |
186 buff := CharacterWriteStream on:''. |
145 buff := CharacterWriteStream on:''. |
187 previous := nil. |
146 previous := nil. |
188 s do:[:each | |
147 s do:[:each | |
189 (each codePoint between:16r0300 and:16r030F) ifTrue:[ |
148 (each codePoint between:16r0300 and:16r0327) ifTrue:[ |
190 self compositionOf:previous with:each to:buff. |
149 self compositionOf:previous with:each to:buff. |
191 previous := nil. |
150 previous := nil. |
192 ] ifFalse:[ |
151 ] ifFalse:[ |
193 previous notNil ifTrue:[ |
152 previous notNil ifTrue:[ |
194 buff nextPut:previous. |
153 buff nextPut:previous. |
236 encodeString:aUnicodeString |
196 encodeString:aUnicodeString |
237 "return the UTF-8-MAC representation of aUnicodeString. |
197 "return the UTF-8-MAC representation of aUnicodeString. |
238 This is UTF-8 with composed characters decomposed (i.e. as separate codes, not as |
198 This is UTF-8 with composed characters decomposed (i.e. as separate codes, not as |
239 single combined characters). |
199 single combined characters). |
240 |
200 |
241 For now, here is a hacked (hardwired knowledge) version, which should work |
201 For now, here is a limited version, which should work |
242 at least for some european countries... |
202 at least for most european countries... |
243 " |
203 " |
244 |
204 |
245 |gen s decomp codePoint composeCodePoint| |
205 |gen s decomp codePoint composeCodePoint| |
246 |
206 |
247 DecomposeMap isNil ifTrue:[ |
207 DecomposeMap isNil ifTrue:[ |