author | Claus Gittinger <cg@exept.de> |
Mon, 27 May 2019 16:53:37 +0200 | |
changeset 24192 | 18512bf68422 |
parent 22475 | 71b77246e002 |
permissions | -rw-r--r-- |
17490 | 1 |
"{ Encoding: utf8 }" |
2 |
||
3 |
" |
|
4 |
COPYRIGHT (c) 2015 by eXept Software AG |
|
5 |
All Rights Reserved |
|
6 |
||
7 |
This software is furnished under a license and may be used |
|
8 |
only in accordance with the terms of that license and with the |
|
9 |
inclusion of the above copyright notice. This software may not |
|
10 |
be provided or otherwise made available to, or used by, any |
|
11 |
other person. No title to or ownership of the software is |
|
12 |
hereby transferred. |
|
13 |
" |
|
14 |
"{ Package: 'stx:libbasic' }" |
|
15 |
||
16 |
"{ NameSpace: CharacterEncoderImplementations }" |
|
17 |
||
18 |
ISO10646_to_UTF8 subclass:#ISO10646_to_UTF8_MAC |
|
19 |
instanceVariableNames:'' |
|
22475 | 20 |
classVariableNames:'DecomposeMap ComposeMap' |
17490 | 21 |
poolDictionaries:'' |
22 |
category:'Collections-Text-Encodings' |
|
23 |
! |
|
24 |
||
25 |
!ISO10646_to_UTF8_MAC class methodsFor:'documentation'! |
|
26 |
||
27 |
copyright |
|
28 |
" |
|
29 |
COPYRIGHT (c) 2015 by eXept Software AG |
|
30 |
All Rights Reserved |
|
31 |
||
32 |
This software is furnished under a license and may be used |
|
33 |
only in accordance with the terms of that license and with the |
|
34 |
inclusion of the above copyright notice. This software may not |
|
35 |
be provided or otherwise made available to, or used by, any |
|
36 |
other person. No title to or ownership of the software is |
|
37 |
hereby transferred. |
|
38 |
" |
|
39 |
! |
|
40 |
||
41 |
documentation |
|
42 |
" |
|
43 |
UTF-8 can encode some diacritical characters (umlauts) in multiple ways: |
|
44 |
- either with a single unicode codepoint (e.g. ae -> ä -> U+00E4 -> UTF-8 bytes C3 A4) |
|
45 |
- or as so called 'Normalization Form canonical Decomposition', i.e. as a regular 'a' followed by a |
|
46 |
combining diacritical mark (for example: acute). |
|
47 |
||
17564
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
48 |
MAC OSX needs the second form for its file names. |
17490 | 49 |
However, OSX does not decompose the ranges U+2000-U+2FFF, U+F900-U+FAFF and U+2F800-U+2FAFF. |
50 |
||
51 |
This is a q&d hack, to at least support the first page (latin1) characters. |
|
52 |
Will be enhanced for the 2nd and 3rd unicode page, when I find time. |
|
53 |
||
17568
e90410336cc2
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17567
diff
changeset
|
54 |
[caveat:] |
e90410336cc2
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17567
diff
changeset
|
55 |
only a small subset of multi-composes are supported yet (for example: trema plus acute) |
e90410336cc2
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17567
diff
changeset
|
56 |
|
17490 | 57 |
[author:] |
58 |
Claus Gittinger |
|
59 |
||
60 |
[instance variables:] |
|
61 |
||
62 |
[class variables:] |
|
17564
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
63 |
ComposeMap DecomposeMap |
17490 | 64 |
|
65 |
[see also:] |
|
66 |
http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html |
|
67 |
||
68 |
" |
|
69 |
! ! |
|
70 |
||
71 |
!ISO10646_to_UTF8_MAC class methodsFor:'initialization'! |
|
72 |
||
73 |
initializeDecomposeMap |
|
17564
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
74 |
"the map which decomposes a diacritical character into its two components" |
17490 | 75 |
|
17564
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
76 |
DecomposeMap := Dictionary new. |
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
77 |
ComposeMap := Dictionary new. |
17490 | 78 |
|
17564
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
79 |
#( |
17566
a990c12c71c0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17565
diff
changeset
|
80 |
"/ attention: the following strings contain non-latin characters |
a990c12c71c0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17565
diff
changeset
|
81 |
"/ if you don't see them, change your font setting for a better font |
a990c12c71c0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17565
diff
changeset
|
82 |
|
17568
e90410336cc2
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17567
diff
changeset
|
83 |
(16r0300 "gravis" 'Aรaร EรeรจIรiรฌoรฒOรUรuรนNวธnวนWแบwแบYแปฒyแปณรวรผว') |
e90410336cc2
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17567
diff
changeset
|
84 |
(16r0301 "akut" 'AรaรกEรeรฉIรiรญOรoรณUรuรบyรฝYรCฤcฤNลnลRลrลSลsลZลนzลบGวดgวตรวผรฆวฝรวพรธวฟMแธพmแธฟKแธฐkแธฑPแนpแนWแบwแบรวรผว') |
17567
2d57395ef7e0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17566
diff
changeset
|
85 |
(16r0302 "circonflex" 'AรaรขEรeรชIรiรฎOรoรดUรuรปCฤcฤGฤgฤHฤคhฤฅJฤดjฤตSลsลWลดwลตYลถyลทZแบzแบ') |
2d57395ef7e0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17566
diff
changeset
|
86 |
(16r0303 "tilde" 'AรaรฃNรnรฑOรoรตUลจuลฉYแปธyแปนEแบผeแบฝVแนผvแนฝ') |
17568
e90410336cc2
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17567
diff
changeset
|
87 |
(16r0304 "macron" 'AฤaฤEฤeฤIฤชiฤซOลoลUลชuลซGแธ gแธกรวรผว' ) |
17567
2d57395ef7e0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17566
diff
changeset
|
88 |
(16r0306 "breve" 'AฤaฤEฤeฤGฤgฤIฤฌiฤญOลoลUลฌuลญ') |
2d57395ef7e0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17566
diff
changeset
|
89 |
(16r0307 "dot above" 'AศฆaศงOศฎoศฏCฤcฤEฤeฤGฤ gฤกZลปzลผBแธbแธDแธdแธFแธfแธHแธขhแธฃMแนmแนNแนnแน PแนpแนRแนrแนSแน sแนกTแนชtแนซWแบwแบXแบxแบYแบyแบ' ) |
17568
e90410336cc2
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17567
diff
changeset
|
90 |
(16r0308 "umlaut/trema" 'AรaรคEรeรซOรoรถUรuรผIรiรฏyรฟYลธHแธฆhแธงXแบxแบtแบรวรนวลชวลซวรวรบววววว') |
17567
2d57395ef7e0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17566
diff
changeset
|
91 |
(16r030A "ring" 'Aร aรฅUลฎuลฏwแบyแบ') |
2d57395ef7e0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17566
diff
changeset
|
92 |
(16r030B "dbl akut" 'OลoลUลฐuลฑ') |
2d57395ef7e0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17566
diff
changeset
|
93 |
(16r030C "hatcheck" 'CฤcฤDฤEฤeฤNลnลRลrลSล sลกZลฝzลพAวaวIวiวOวoวUวuวGวฆgวงKวจkวฉรวรผว') |
2d57395ef7e0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17566
diff
changeset
|
94 |
(16r030F "dbl grave" 'AศaศEศeศ IศiศOศoศRศrศUศuศ') |
2d57395ef7e0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17566
diff
changeset
|
95 |
(16r0311 "inv. breve" 'AศaศEศeศIศiศOศoศRศrศUศuศ') |
2d57395ef7e0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17566
diff
changeset
|
96 |
(16r0317 "acute. below" 'KฤถkฤทLฤปlฤผNล nลRลrลSศsศTศtศ') |
2d57395ef7e0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17566
diff
changeset
|
97 |
(16r0327 "cedille" 'CรcรงลลTลขtลฃEศจeศฉDแธdแธHแธจhแธฉ') |
2d57395ef7e0
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17566
diff
changeset
|
98 |
(16r0328 "ogonek" 'Aฤaฤ EฤeฤIฤฎiฤฏOวชoวซUลฒuลณ') |
17564
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
99 |
) do:[:eachPair | |
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
100 |
|composeCode mapping| |
17490 | 101 |
|
17564
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
102 |
composeCode := eachPair first. |
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
103 |
mapping := eachPair second. |
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
104 |
mapping pairWiseDo:[:baseChar :composedChar | |
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
105 |
"/ setup, so that we find |
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
106 |
"/ DecomposeMap at:"$à codePoint" 16rE0 put:#( "$a codePoint" 16r61 "grave codePoint" 16r0300). |
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
107 |
DecomposeMap |
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
108 |
at:composedChar codePoint |
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
109 |
put:(Array with:baseChar codePoint with:composeCode) |
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
110 |
]. |
17490 | 111 |
|
17564
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
112 |
ComposeMap at:composeCode put:mapping. |
67ae75f28757
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17522
diff
changeset
|
113 |
]. |
17490 | 114 |
! ! |
115 |
||
116 |
!ISO10646_to_UTF8_MAC methodsFor:'encoding & decoding'! |
|
117 |
||
118 |
compositionOf: baseChar with: diacriticalChar to: outStream
    "Compose a base character and a combining diacritical mark into a single
     character and write it to outStream (e.g. a + combining trema -> a-umlaut).

     baseChar         - the character preceding the combining mark
     diacriticalChar  - the combining mark (only code points 16r300..16r328 are looked up)
     outStream        - receives either the single composed character, or
                        baseChar followed by diacriticalChar if no composition is known.

     The per-mark mapping strings in ComposeMap are laid out as pairs
     (base, composed, base, composed, ...); see initializeDecomposeMap.
     Therefore only the odd positions may be matched against baseChar.
     Searching the whole string (as a plain indexOf: does) would also hit
     composed characters at even positions and then answer the base character
     of the following pair, or run past the end of the string."

    |cp map|

    cp := diacriticalChar codePoint.
    (cp between:16r300 and:16r328) ifTrue:[
        "/ same lazy setup as in decompositionOf:into:, so that a direct call works
        ComposeMap isNil ifTrue:[
            self class initializeDecomposeMap
        ].
        map := ComposeMap at:cp ifAbsent:nil.
        map notNil ifTrue:[
            "/ look at base positions (1, 3, 5, ...) only
            1 to:(map size - 1) by:2 do:[:idx |
                (map at:idx) = baseChar ifTrue:[
                    outStream nextPut:(map at:idx + 1).
                    ^ self.
                ].
            ].
        ].
    ].

    "/ no known composition - leave as is
    outStream nextPut: baseChar.
    outStream nextPut: diacriticalChar.
!
|
141 |
||
142 |
decodeString:aStringOrByteCollection
    "Answer a Unicode string for the passed in UTF-8-MAC encoded string.
     UTF-8-MAC is UTF-8 in which composed characters arrive decomposed,
     i.e. as a base character followed by a separate combining mark.

     The input is first decoded by the superclass; afterwards every combining
     mark in the range 16r0300..16r0328 is merged with the character in front
     of it via compositionOf:with:to:.
     If the decoded string contains no such mark, it is answered unchanged.

     This is a limited version, which should work at least for most
     european countries."

    |decoded out pendingBase isCombiner|

    isCombiner := [:ch | ch codePoint between:16r0300 and:16r0328].

    decoded := super decodeString:aStringOrByteCollection.
    (decoded contains:isCombiner) ifFalse:[^ decoded].

    ComposeMap isNil ifTrue:[
        self class initializeDecomposeMap
    ].

    out := CharacterWriteStream on:''.
    "/ pendingBase is the most recent ordinary character; it is held back
    "/ until we know whether a combining mark follows it
    pendingBase := nil.
    decoded do:[:ch |
        (isCombiner value:ch) ifTrue:[
            pendingBase notNil ifTrue:[
                self compositionOf:pendingBase with:ch to:out.
            ] ifFalse:[
                out isEmpty ifTrue:[
                    "/ a combining mark with nothing in front of it - pass it through
                    out nextPut:ch.
                ] ifFalse:[
                    "/ a second mark on the same character (multi-compose):
                    "/ take back the last character written and compose again
                    pendingBase := out last.
                    out skip:-1.
                    self compositionOf:pendingBase with:ch to:out.
                ].
            ].
            pendingBase := nil.
        ] ifFalse:[
            pendingBase notNil ifTrue:[
                out nextPut:pendingBase.
            ].
            pendingBase := ch.
        ].
    ].
    pendingBase notNil ifTrue:[
        out nextPut:pendingBase.
    ].
    ^ out contents.

    "
     (ISO10646_to_UTF8 new encodeString:'aäoöuü') asByteArray
       -> #[97 195 164 111 195 182 117 195 188]

     (ISO10646_to_UTF8 new decodeString:
         (ISO10646_to_UTF8 new encodeString:'aäoöuü') asByteArray)

     (ISO10646_to_UTF8_MAC new encodeString:'aäoöuü') asByteArray
       -> #[97 97 204 136 111 111 204 136 117 117 204 136]

     (ISO10646_to_UTF8_MAC new decodeString:
         (ISO10646_to_UTF8_MAC new encodeString:'aäoöuü') asByteArray)
    "
!
|
204 |
||
205 |
decompositionOf: codePointIn into:outBlockWithTwoArgs
    "If codePointIn has an entry in DecomposeMap, evaluate outBlockWithTwoArgs
     with the code point of the base character and the code point of the
     combining diacritical mark (e.g. a-umlaut -> a + combining trema),
     and answer true.

     Otherwise answer false; the block is NOT evaluated in that case.
     Code points below 16rC0 are never decomposed and are rejected
     before the map is consulted (or even created)."

    |parts|

    codePointIn >= 16rC0 ifTrue:[
        DecomposeMap isNil ifTrue:[
            self class initializeDecomposeMap
        ].
        parts := DecomposeMap at:codePointIn ifAbsent:nil.
        parts notNil ifTrue:[
            outBlockWithTwoArgs value:(parts at:1) value:(parts at:2).
            ^ true
        ].
    ].
    ^ false

    "Modified: / 28-02-2017 / 12:43:03 / cg"
!
227 |
||
21478 | 228 |
encodeCharacter:aUnicodeCharacter on:aStream
    "Write the UTF-8-MAC representation of aUnicodeCharacter onto aStream.
     This is UTF-8 with composed characters decomposed: if
     decompositionOf:into: knows a decomposition, the base code point is
     written first, followed by the code point of the combining mark;
     otherwise the character's own code point is written.

     This is a limited version, which should work at least for most
     european countries."

    |base mark wasDecomposed|

    DecomposeMap isNil ifTrue:[
        self class initializeDecomposeMap
    ].

    base := aUnicodeCharacter codePoint.
    wasDecomposed := self
        decompositionOf:base
        into:[:baseCodePoint :markCodePoint |
            "/ replace the composed code point by its base; remember the mark
            base := baseCodePoint.
            mark := markCodePoint
        ].

    aStream nextPutUtf8:base.
    wasDecomposed ifTrue:[
        aStream nextPutUtf8:mark
    ].

    "Created: / 16-02-2017 / 17:45:18 / stefan"
!
|
254 |
||
17490 | 255 |
encodeString:aUnicodeString
    "Answer the UTF-8-MAC representation of aUnicodeString.
     This is UTF-8 with composed characters decomposed (i.e. written as
     separate codes, not as single combined characters).
     The work is done by encodeString:on: writing to an internal stream,
     whose contents are answered.

     This is a limited version, which should work at least for most
     european countries."

    |encoded|

    "/ the initial buffer is sized to the number of input characters
    encoded := WriteStream on:(String uninitializedNew:aUnicodeString size).
    self encodeString:aUnicodeString on:encoded.
    ^ encoded contents

    "
     (self encodeString:'hello') asByteArray                             #[104 101 108 108 111]
     (self encodeString:(Character value:16r40) asString) asByteArray    #[64]
     (self encodeString:(Character value:16r7F) asString) asByteArray    #[127]
     (self encodeString:(Character value:16r80) asString) asByteArray    #[194 128]
     (self encodeString:(Character value:16rFF) asString) asByteArray    #[195 191]

     (ISO10646_to_UTF8 new encodeString:'aäoöuü') asByteArray
       -> #[97 195 164 111 195 182 117 195 188]
     (ISO10646_to_UTF8_MAC new encodeString:'aäoöuü') asByteArray
       -> #[97 97 204 136 111 111 204 136 117 117 204 136]

     ISO10646_to_UTF8_MAC new decodeString:
         (ISO10646_to_UTF8_MAC new encodeString:'Packages aus VSE für Smalltalk_X') asByteArray
    "

    "Modified (format): / 16-02-2017 / 17:36:14 / stefan"
!
|
288 |
||
289 |
encodeString:aUnicodeString on:aStream
    "Write the UTF-8-MAC representation of aUnicodeString onto aStream.
     This is UTF-8 with composed characters decomposed: for every character
     for which decompositionOf:into: knows a decomposition, the base code
     point is written followed by the code point of the combining mark;
     all other characters are written with their own code point.

     This is a limited version, which should work at least for most
     european countries."

    |count "{Class: SmallInteger}" base mark takeParts wasDecomposed|

    "/ one shared block for all characters; it overwrites base with the
    "/ base code point and remembers the combining mark
    takeParts := [:baseCodePoint :markCodePoint |
        base := baseCodePoint.
        mark := markCodePoint
    ].

    count := aUnicodeString size.
    1 to:count do:[:charIndex |
        base := (aUnicodeString at:charIndex) codePoint.
        wasDecomposed := self decompositionOf:base into:takeParts.
        aStream nextPutUtf8:base.
        wasDecomposed ifTrue:[
            aStream nextPutUtf8:mark
        ].
    ].

    "Created: / 16-02-2017 / 17:33:04 / stefan"
!
316 |
||
317 |
readNextCharacterFrom:aStream
    "Read and answer the next character from aStream; answer nil if
     peeking at the stream yields nil.
     The number of elements to read is asked from the class
     (bytesToReadFor:) based on the first element; these are then passed
     through decodeString:.
     A DecodingError is raised if the decoded result is not exactly one
     character.

     NOTE(review): only the sequence starting at the first byte is read here,
     so a base character and a following combining mark are presumably
     delivered as two separate characters - to be confirmed."

    |lead count decoded|

    lead := aStream peek.
    lead isNil ifTrue:[
        ^ nil
    ].

    count := self class bytesToReadFor:lead codePoint.
    decoded := self decodeString:(aStream next:count).
    decoded size ~~ 1 ifTrue:[
        DecodingError raiseRequestErrorString:' - bad UTF8_MAC encoding'.
    ].
    ^ decoded first

    "Created: / 10-01-2018 / 22:35:23 / stefan"
    "Modified: / 16-01-2018 / 16:53:59 / stefan"
! !
335 |
||
17497
36ab19b73c1f
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17490
diff
changeset
|
336 |
!ISO10646_to_UTF8_MAC methodsFor:'queries'! |
36ab19b73c1f
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17490
diff
changeset
|
337 |
|
36ab19b73c1f
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17490
diff
changeset
|
338 |
nameOfEncoding
    "answer the symbol naming this encoding"

    ^ #'utf8-mac'
! !
36ab19b73c1f
class: CharacterEncoderImplementations::ISO10646_to_UTF8_MAC
Claus Gittinger <cg@exept.de>
parents:
17490
diff
changeset
|
341 |
|
17490 | 342 |
!ISO10646_to_UTF8_MAC class methodsFor:'documentation'! |
343 |
||
344 |
version
    "answer the version string of this class
     (a keyword placeholder, presumably expanded by the source code management system)"

    ^ '$Header$'
!
347 |
||
348 |
version_CVS
    "answer the CVS version string of this class
     (a keyword placeholder, presumably expanded by the source code management system)"

    ^ '$Header$'
! !
351 |