author | Claus Gittinger <cg@exept.de> |
Wed, 18 Feb 2015 18:56:03 +0100 | |
changeset 17497 | 36ab19b73c1f |
parent 17490 | dd28d3bda290 |
child 17522 | eea77b0b2c82 |
permissions | -rw-r--r-- |
17490 | 1 |
"{ Encoding: utf8 }" |
2 |
||
3 |
" |
|
4 |
COPYRIGHT (c) 2015 by eXept Software AG |
|
5 |
All Rights Reserved |
|
6 |
||
7 |
This software is furnished under a license and may be used |
|
8 |
only in accordance with the terms of that license and with the |
|
9 |
inclusion of the above copyright notice. This software may not |
|
10 |
be provided or otherwise made available to, or used by, any |
|
11 |
other person. No title to or ownership of the software is |
|
12 |
hereby transferred. |
|
13 |
" |
|
14 |
"{ Package: 'stx:libbasic' }" |
|
15 |
||
16 |
"{ NameSpace: CharacterEncoderImplementations }" |
|
17 |
||
18 |
ISO10646_to_UTF8 subclass:#ISO10646_to_UTF8_MAC |
|
19 |
instanceVariableNames:'' |
|
20 |
classVariableNames:'AccentMap DecomposeMap' |
|
21 |
poolDictionaries:'' |
|
22 |
category:'Collections-Text-Encodings' |
|
23 |
! |
|
24 |
||
25 |
!ISO10646_to_UTF8_MAC class methodsFor:'documentation'! |
|
26 |
||
27 |
copyright |
|
28 |
" |
|
29 |
COPYRIGHT (c) 2015 by eXept Software AG |
|
30 |
All Rights Reserved |
|
31 |
||
32 |
This software is furnished under a license and may be used |
|
33 |
only in accordance with the terms of that license and with the |
|
34 |
inclusion of the above copyright notice. This software may not |
|
35 |
be provided or otherwise made available to, or used by, any |
|
36 |
other person. No title to or ownership of the software is |
|
37 |
hereby transferred. |
|
38 |
" |
|
39 |
! |
|
40 |
||
41 |
documentation |
|
42 |
" |
|
43 |
UTF-8 can encode some diacritical characters (umlauts) in multiple ways: |
|
44 |
- either with a single unicode code point (e.g. ä -> U+00E4 -> C3 A4) |
|
45 |
- or as so called 'Normalization Form canonical Decomposition', i.e. as a regular 'a' followed by a |
|
46 |
combining diacritical mark (for example: acute). |
|
47 |
||
48 |
MAC OSX needs the second form for its filenames. |
|
49 |
However, OSX does not decompose the ranges U+2000-U+2FFF, U+F900-U+FAFF and U+2F800-U+2FAFF. |
|
50 |
||
51 |
This is a q&d hack, to at least support the first page (latin1) characters. |
|
52 |
Will be enhanced for the 2nd and 3rd unicode page, when I find time. |
|
53 |
||
54 |
[author:] |
|
55 |
Claus Gittinger |
|
56 |
||
57 |
[instance variables:] |
|
58 |
||
59 |
[class variables:] |
|
60 |
||
61 |
[see also:] |
|
62 |
http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html |
|
63 |
||
64 |
" |
|
65 |
! ! |
|
66 |
||
67 |
!ISO10646_to_UTF8_MAC class methodsFor:'initialization'! |
|
68 |
||
69 |
initializeDecomposeMap
    "Fill the class variable DecomposeMap.
     Each key is the code point of a precomposed Latin-1 letter; each value is a
     two-element array holding the code point of the base letter followed by the
     code point of the combining diacritical mark.
     Marks used here: 16r0300 grave, 16r0301 acute, 16r0302 circumflex,
     16r0303 tilde, 16r0308 diaeresis, 16r030A ring above."

    DecomposeMap := Dictionary new.

    DecomposeMap at:"À" 16rC0 put:#( 16r41 16r0300).
    DecomposeMap at:"à" 16rE0 put:#( 16r61 16r0300).
    DecomposeMap at:"Á" 16rC1 put:#( 16r41 16r0301).
    DecomposeMap at:"á" 16rE1 put:#( 16r61 16r0301).
    DecomposeMap at:"Â" 16rC2 put:#( 16r41 16r0302).
    DecomposeMap at:"â" 16rE2 put:#( 16r61 16r0302).
    DecomposeMap at:"Ã" 16rC3 put:#( 16r41 16r0303).
    DecomposeMap at:"ã" 16rE3 put:#( 16r61 16r0303).
    DecomposeMap at:"Ä" 16rC4 put:#( 16r41 16r0308).
    DecomposeMap at:"ä" 16rE4 put:#( 16r61 16r0308).
    DecomposeMap at:"Å" 16rC5 put:#( 16r41 16r030A).
    DecomposeMap at:"å" 16rE5 put:#( 16r61 16r030A).

    DecomposeMap at:"È" 16rC8 put:#( 16r45 16r0300).
    DecomposeMap at:"è" 16rE8 put:#( 16r65 16r0300).
    DecomposeMap at:"É" 16rC9 put:#( 16r45 16r0301).
    DecomposeMap at:"é" 16rE9 put:#( 16r65 16r0301).
    DecomposeMap at:"Ê" 16rCA put:#( 16r45 16r0302).
    DecomposeMap at:"ê" 16rEA put:#( 16r65 16r0302).
    DecomposeMap at:"Ë" 16rCB put:#( 16r45 16r0308).
    DecomposeMap at:"ë" 16rEB put:#( 16r65 16r0308).

    DecomposeMap at:"Ì" 16rCC put:#( 16r49 16r0300).
    DecomposeMap at:"ì" 16rEC put:#( 16r69 16r0300).
    DecomposeMap at:"Í" 16rCD put:#( 16r49 16r0301).
    DecomposeMap at:"í" 16rED put:#( 16r69 16r0301).
    DecomposeMap at:"Î" 16rCE put:#( 16r49 16r0302).
    DecomposeMap at:"î" 16rEE put:#( 16r69 16r0302).
    DecomposeMap at:"Ï" 16rCF put:#( 16r49 16r0308).
    DecomposeMap at:"ï" 16rEF put:#( 16r69 16r0308).

    DecomposeMap at:"Ñ" 16rD1 put:#( 16r4E 16r0303).
    DecomposeMap at:"ñ" 16rF1 put:#( 16r6E 16r0303).

    DecomposeMap at:"Ò" 16rD2 put:#( 16r4F 16r0300).
    DecomposeMap at:"ò" 16rF2 put:#( 16r6F 16r0300).
    DecomposeMap at:"Ó" 16rD3 put:#( 16r4F 16r0301).
    DecomposeMap at:"ó" 16rF3 put:#( 16r6F 16r0301).
    DecomposeMap at:"Ô" 16rD4 put:#( 16r4F 16r0302).
    DecomposeMap at:"ô" 16rF4 put:#( 16r6F 16r0302).
    DecomposeMap at:"Õ" 16rD5 put:#( 16r4F 16r0303).
    DecomposeMap at:"õ" 16rF5 put:#( 16r6F 16r0303).
    DecomposeMap at:"Ö" 16rD6 put:#( 16r4F 16r0308).
    DecomposeMap at:"ö" 16rF6 put:#( 16r6F 16r0308).

    DecomposeMap at:"Ù" 16rD9 put:#( 16r55 16r0300).
    DecomposeMap at:"ù" 16rF9 put:#( 16r75 16r0300).
    DecomposeMap at:"Ú" 16rDA put:#( 16r55 16r0301).
    DecomposeMap at:"ú" 16rFA put:#( 16r75 16r0301).
    DecomposeMap at:"Û" 16rDB put:#( 16r55 16r0302).
    "/ the key must be 16rFB (û); using 16rDB here would overwrite the Û entry above
    DecomposeMap at:"û" 16rFB put:#( 16r75 16r0302).
    DecomposeMap at:"Ü" 16rDC put:#( 16r55 16r0308).
    DecomposeMap at:"ü" 16rFC put:#( 16r75 16r0308).

    DecomposeMap at:"Ý" 16rDD put:#( 16r59 16r0301).
    DecomposeMap at:"ý" 16rFD put:#( 16r79 16r0301).

    DecomposeMap at:"ÿ" 16rFF put:#( 16r79 16r0308).
|
130 |
! ! |
|
131 |
||
132 |
!ISO10646_to_UTF8_MAC methodsFor:'encoding & decoding'! |
|
133 |
||
134 |
compositionOf: baseChar with: diacriticalChar to: outStream
    "Compose a base character and the combining diacritical mark that follows it
     into one precomposed character and write that to outStream;
     e.g. a + umlaut-diacritic-mark -> ä.
     If no precomposed character is known for the pair, write baseChar followed
     by diacriticalChar unchanged.
     If baseChar is nil (a mark with no character in front of it), write only
     diacriticalChar."

    |cp map i|

    baseChar isNil ifTrue:[
        outStream nextPut: diacriticalChar.
        ^ self.
    ].

    "/ every map is a sequence of pairs: base character, precomposed character
    cp := diacriticalChar codePoint.
    cp == 16r0300 ifTrue:[
        "/ accent grave
        map := 'AÀaàEÈeèIÌiìoòOÒUÙuù'.
    ] ifFalse:[ cp == 16r0301 ifTrue:[
        "/ accent acute
        map := 'AÁaáEÉeéIÍiíOÓoóUÚuúyýYÝ'.
    ] ifFalse:[ cp == 16r0302 ifTrue:[
        "/ circumflex
        map := 'AÂaâEÊeêIÎiîOÔoôUÛuû'.
    ] ifFalse:[ cp == 16r0303 ifTrue:[
        "/ tilde
        map := 'AÃaãNÑnñOÕoõ'.
    ] ifFalse:[ cp == 16r0308 ifTrue:[
        "/ umlaut / diaeresis (E and e included, as the encoder decomposes Ë and ë)
        map := 'AÄaäEËeëOÖoöUÜuüIÏiïyÿ'.
    ] ifFalse:[ cp == 16r030A ifTrue:[
        "/ ring
        map := 'AÅaå'.
    ]]]]]].

    map notNil ifTrue:[
        i := map indexOf: baseChar.
        "/ base characters sit at odd positions only; a hit at an even position
        "/ means baseChar is itself a precomposed character, which must not be
        "/ mapped to its right-hand neighbour (or past the end of the map)
        (i ~~ 0 and:[i odd]) ifTrue:[
            outStream nextPut: (map at:i+1).
            ^ self.
        ].
    ].

    outStream nextPut: baseChar.
    outStream nextPut: diacriticalChar.
|
170 |
! |
|
171 |
||
172 |
decodeString:aStringOrByteCollection
    "Return a Unicode string from the passed in UTF-8-MAC encoded string.
     This is UTF-8 with compose-characters decomposed
     (i.e. as separate codes, not as single combined characters).

     The input is first decoded by the superclass; then every character in the
     range 16r0300..16r030F is combined with the character in front of it via
     compositionOf:with:to:. If the decoded string contains no such character,
     it is returned as is.

     For now, here is a hacked (hardwired knowledge) version,
     which will work for some european countries only...
    "

    |s buff previous|

    s := super decodeString:aStringOrByteCollection.
    (s contains:[:char | char codePoint between:16r0300 and:16r030F]) ifFalse:[
        ^ s
    ].

    buff := CharacterWriteStream on:''.
    "/ previous holds the last character read but not yet written; it is held
    "/ back because the next character may be a mark which combines with it
    previous := nil.
    s do:[:each |
        (each codePoint between:16r0300 and:16r030F) ifTrue:[
            previous isNil ifTrue:[
                "/ a mark at the very beginning or directly after another mark:
                "/ there is nothing to combine it with, so pass it through
                buff nextPut:each.
            ] ifFalse:[
                self compositionOf:previous with:each to:buff.
                previous := nil.
            ].
        ] ifFalse:[
            previous notNil ifTrue:[
                buff nextPut:previous.
            ].
            previous := each.
        ].
    ].
    "/ write the character still held back when the input ends
    previous notNil ifTrue:[
        buff nextPut:previous.
    ].
    ^ buff contents

    "
     (ISO10646_to_UTF8 new encodeString:'aäoöuü') asByteArray
        -> #[97 195 164 111 195 182 117 195 188]

     (ISO10646_to_UTF8 new decodeString:
        (ISO10646_to_UTF8 new encodeString:'aäoöuü') asByteArray)

     (ISO10646_to_UTF8_MAC new encodeString:'aäoöuü') asByteArray
        -> #[97 97 204 136 111 111 204 136 117 117 204 136]

     (ISO10646_to_UTF8_MAC new decodeString:
        (ISO10646_to_UTF8_MAC new encodeString:'aäoöuü') asByteArray)

     'a' + diaeresis mark + 'b'; the trailing b must be kept:
     (ISO10646_to_UTF8_MAC new decodeString:#[97 204 136 98])
    "
|
215 |
! |
|
216 |
||
217 |
decompositionOf: codePointIn into:outBlockWithTwoArgs
    "If codePointIn has an entry in DecomposeMap, evaluate outBlockWithTwoArgs
     with the code point of the base character and the code point of the
     combining diacritical mark (eg. ä -> a + umlaut-diacritic-mark)
     and answer true.
     Otherwise answer false; the block is not evaluated in that case."

    |entry|

    "/ the smallest key in DecomposeMap is 16rC0, so anything below needs no lookup
    codePointIn < 16rC0 ifTrue:[ ^ false ].

    "/ the map is built lazily; make sure it exists even if this is
    "/ called before encodeString: had a chance to build it
    DecomposeMap isNil ifTrue:[
        self class initializeDecomposeMap
    ].

    entry := DecomposeMap at:codePointIn ifAbsent:[nil].
    entry isNil ifTrue:[ ^ false ].

    outBlockWithTwoArgs value:(entry at:1) value:(entry at:2).
    ^ true
|
232 |
! |
|
233 |
||
234 |
encodeString:aUnicodeString
    "Answer the UTF-8-MAC representation of aUnicodeString.
     This is UTF-8 in which the characters found in DecomposeMap are written in
     decomposed form, i.e. as the base character followed by a separate
     combining diacritical mark instead of one combined character.
     All other characters are written as ordinary UTF-8.

     For now, this is a hacked (hardwired knowledge) version, which should work
     at least for some european countries...
    "

    |out emitUtf8 baseCode markCode|

    DecomposeMap isNil ifTrue:[
        self class initializeDecomposeMap
    ].

    out := WriteStream on:(String uninitializedNew:aUnicodeString size).

    "/ appends the UTF-8 byte sequence of a single code point to out;
    "/ the continuation bytes are computed lowest first (t1 is the last byte written)
    emitUtf8 :=
        [:codeArg |
            |code "{Class: SmallInteger }" t1 t2 t3 t4 t5 rest "{Class: SmallInteger }"|

            code := codeArg.
            code <= 16r7F ifTrue:[
                out nextPut:(Character value:code).
            ] ifFalse:[
                t1 := Character value:((code bitAnd:16r3F) bitOr:2r10000000).
                rest := code bitShift:-6.
                rest <= 16r1F ifTrue:[
                    out nextPut:(Character value:(rest bitOr:2r11000000));
                        nextPut:t1.
                ] ifFalse:[
                    t2 := Character value:((rest bitAnd:16r3F) bitOr:2r10000000).
                    rest := rest bitShift:-6.
                    rest <= 16r0F ifTrue:[
                        out nextPut:(Character value:(rest bitOr:2r11100000));
                            nextPut:t2;
                            nextPut:t1.
                    ] ifFalse:[
                        t3 := Character value:((rest bitAnd:16r3F) bitOr:2r10000000).
                        rest := rest bitShift:-6.
                        rest <= 16r07 ifTrue:[
                            out nextPut:(Character value:(rest bitOr:2r11110000));
                                nextPut:t3;
                                nextPut:t2;
                                nextPut:t1.
                        ] ifFalse:[
                            t4 := Character value:((rest bitAnd:16r3F) bitOr:2r10000000).
                            rest := rest bitShift:-6.
                            rest <= 16r03 ifTrue:[
                                out nextPut:(Character value:(rest bitOr:2r11111000));
                                    nextPut:t4;
                                    nextPut:t3;
                                    nextPut:t2;
                                    nextPut:t1.
                            ] ifFalse:[
                                t5 := Character value:((rest bitAnd:16r3F) bitOr:2r10000000).
                                rest := rest bitShift:-6.
                                rest <= 16r01 ifTrue:[
                                    out nextPut:(Character value:(rest bitOr:2r11111100));
                                        nextPut:t5;
                                        nextPut:t4;
                                        nextPut:t3;
                                        nextPut:t2;
                                        nextPut:t1.
                                ] ifFalse:[
                                    "/ more bits left than the longest sequence can hold
                                    self error:'ascii value > 31bit in utf8Encode'.
                                ]
                            ].
                        ].
                    ].
                ].
            ].
        ].

    aUnicodeString do:[:eachCharacter |
        |wasDecomposed|

        baseCode := eachCharacter codePoint.
        "/ for a decomposable character, the block replaces baseCode by the
        "/ base letter's code point and remembers the mark in markCode
        wasDecomposed := self
                            decompositionOf:baseCode
                            into:[:baseArg :markArg |
                                baseCode := baseArg.
                                markCode := markArg
                            ].
        emitUtf8 value:baseCode.
        wasDecomposed ifTrue:[
            emitUtf8 value:markCode
        ].
    ].

    ^ out contents

    "
     (ISO10646_to_UTF8_MAC new encodeString:'hello') asByteArray                            #[104 101 108 108 111]
     (ISO10646_to_UTF8_MAC new encodeString:(Character value:16r40) asString) asByteArray   #[64]
     (ISO10646_to_UTF8_MAC new encodeString:(Character value:16r7F) asString) asByteArray   #[127]
     (ISO10646_to_UTF8_MAC new encodeString:(Character value:16r80) asString) asByteArray   #[194 128]
     (ISO10646_to_UTF8_MAC new encodeString:(Character value:16rFF) asString) asByteArray   #[121 204 136]

     (ISO10646_to_UTF8 new encodeString:'aäoöuü') asByteArray
        -> #[97 195 164 111 195 182 117 195 188]
     (ISO10646_to_UTF8_MAC new encodeString:'aäoöuü') asByteArray
        -> #[97 97 204 136 111 111 204 136 117 117 204 136]
    "
|
327 |
! ! |
|
328 |
||
17497
36ab19b73c1f
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17490
diff
changeset
|
329 |
!ISO10646_to_UTF8_MAC methodsFor:'queries'! |
36ab19b73c1f
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17490
diff
changeset
|
330 |
|
36ab19b73c1f
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17490
diff
changeset
|
331 |
nameOfEncoding
    "Answer the name of this encoding as a symbol."

    ^ #'utf8-mac'
36ab19b73c1f
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17490
diff
changeset
|
333 |
! ! |
36ab19b73c1f
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17490
diff
changeset
|
334 |
|
17490 | 335 |
!ISO10646_to_UTF8_MAC class methodsFor:'documentation'! |
336 |
||
337 |
version
    "Answer the CVS header string recorded for this class's source file."

    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.2 2015-02-18 17:56:03 cg Exp $'
17490 | 339 |
! |
340 |
||
341 |
version_CVS
    "Answer the CVS header string recorded for this class's source file."

    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoderImplementations__ISO10646_to_UTF8_MAC.st,v 1.2 2015-02-18 17:56:03 cg Exp $'
17490 | 343 |
! ! |
344 |