author | Michael Beyl <mb@exept.de> |
Wed, 11 Jul 2012 13:38:13 +0200 | |
changeset 14174 | 3a4f041c94a2 |
parent 14169 | eab487f07a2b |
child 14188 | 9ff8607b11a4 |
permissions | -rw-r--r-- |
8048 | 1 |
" |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
2 |
COPYRIGHT (c) 2004 by eXept Software AG |
7932 | 3 |
All Rights Reserved |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
4 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
5 |
This software is furnished under a license and may be used |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
6 |
only in accordance with the terms of that license and with the |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
7 |
inclusion of the above copyright notice. This software may not |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
8 |
be provided or otherwise made available to, or used by, any |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
9 |
other person. No title to or ownership of the software is |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
10 |
hereby transferred. |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
11 |
" |
8114
05274a80fcc4
separated implementation into dynamically (lazy) loaded classes
Claus Gittinger <cg@exept.de>
parents:
8105
diff
changeset
|
12 |
"{ Package: 'stx:libbasic' }" |
05274a80fcc4
separated implementation into dynamically (lazy) loaded classes
Claus Gittinger <cg@exept.de>
parents:
8105
diff
changeset
|
13 |
|
8118 | 14 |
Object subclass:#CharacterEncoder |
7914 | 15 |
instanceVariableNames:'' |
8118 | 16 |
classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders LastEncoder |
8122 | 17 |
AccessLock NullEncoderInstance Jis7KanjiEscapeSequence |
18 |
Jis7RomanEscapeSequence JisISO2022EscapeSequence |
|
19 |
Jis7KanjiOldEscapeSequence' |
|
7915 | 20 |
poolDictionaries:'' |
8114
05274a80fcc4
separated implementation into dynamically (lazy) loaded classes
Claus Gittinger <cg@exept.de>
parents:
8105
diff
changeset
|
21 |
category:'Collections-Text-Encodings' |
7969 | 22 |
! |
23 |
||
7914 | 24 |
CharacterEncoder subclass:#CompoundEncoder |
25 |
instanceVariableNames:'decoder encoder' |
|
26 |
classVariableNames:'' |
|
27 |
poolDictionaries:'' |
|
7915 | 28 |
privateIn:CharacterEncoder |
29 |
! |
|
30 |
||
7932 | 31 |
CharacterEncoder subclass:#DefaultEncoder |
32 |
instanceVariableNames:'' |
|
33 |
classVariableNames:'' |
|
34 |
poolDictionaries:'' |
|
35 |
privateIn:CharacterEncoder |
|
36 |
! |
|
37 |
||
7914 | 38 |
CharacterEncoder subclass:#InverseEncoder |
39 |
instanceVariableNames:'decoder' |
|
40 |
classVariableNames:'' |
|
41 |
poolDictionaries:'' |
|
7915 | 42 |
privateIn:CharacterEncoder |
43 |
! |
|
44 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
45 |
CharacterEncoder subclass:#NullEncoder |
7893 | 46 |
instanceVariableNames:'' |
47 |
classVariableNames:'' |
|
48 |
poolDictionaries:'' |
|
7915 | 49 |
privateIn:CharacterEncoder |
50 |
! |
|
51 |
||
7892 | 52 |
CharacterEncoder subclass:#OtherEncoding |
53 |
instanceVariableNames:'' |
|
54 |
classVariableNames:'' |
|
55 |
poolDictionaries:'' |
|
7915 | 56 |
privateIn:CharacterEncoder |
57 |
! |
|
58 |
||
7919 | 59 |
CharacterEncoder subclass:#TwoStepEncoder |
60 |
instanceVariableNames:'encoder1 encoder2' |
|
61 |
classVariableNames:'' |
|
62 |
poolDictionaries:'' |
|
63 |
privateIn:CharacterEncoder |
|
64 |
! |
|
65 |
||
7893 | 66 |
!CharacterEncoder class methodsFor:'documentation'! |
67 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
68 |
copyright |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
69 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
70 |
COPYRIGHT (c) 2004 by eXept Software AG |
7932 | 71 |
All Rights Reserved |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
72 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
73 |
This software is furnished under a license and may be used |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
74 |
only in accordance with the terms of that license and with the |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
75 |
inclusion of the above copyright notice. This software may not |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
76 |
be provided or otherwise made available to, or used by, any |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
77 |
other person. No title to or ownership of the software is |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
78 |
hereby transferred. |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
79 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
80 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
81 |
|
7893 | 82 |
documentation |
83 |
" |
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
84 |
unfinished code - please read howToAddMoreCoders. |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
85 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
86 |
Character mappings are based on information in character maps found at either: |
8226 | 87 |
http://std.dkuug.dk/i18n/charmaps |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
88 |
or: |
8226 | 89 |
http://www.unicode.org/Public/MAPPINGS |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
90 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
91 |
No Warranty. |
8226 | 92 |
|
93 |
All the ISO 8859 codesets include ASCII as a proper codeset within them: |
|
94 |
||
95 |
ISO 8859-1: Latin 1 - Western European Languages. |
|
96 |
ISO 8859-2: Latin 2 - Eastern European Languages. |
|
97 |
ISO 8859-3: Latin 3 - Afrikaans, Catalan, Dutch, English, Esperanto, German, |
|
98 |
Italian, Maltese, Spanish and Turkish. |
|
99 |
ISO 8859-4: Latin 4 - Danish, English, Estonian, Finnish, German, Greenlandic, Lappish and Latvian. |
|
100 |
ISO 8859-5: Latin/Cyrillic - Bulgarian, Byelorussian, English, Macedonian, Russian, Serbo-Croat and Ukrainian. |
|
101 |
ISO 8859-6: Latin/Arabic - Arabic. |
|
102 |
ISO 8859-7: Latin/Greek - Greek. |
|
103 |
ISO 8859-8: Latin/Hebrew - Hebrew. |
|
104 |
ISO 8859-9: Latin 5 - Danish, Dutch, English, Finnish, French, German, Irish, Italian, |
|
105 |
Norwegian, Portuguese, Spanish, Swedish and Turkish. |
|
106 |
ISO 8859-10: Latin 6 - Danish, English, Estonian, Finnish, German, Greenlandic, Icelandic, |
|
107 |
Sami (Lappish), Latvian, Lithuanian, Norwegian, Faroese and Swedish. |
|
8810 | 108 |
[author:] |
109 |
Claus Gittinger |
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
110 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
111 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
112 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
113 |
examples |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
114 |
" |
9143 | 115 |
[exBegin] |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
116 |
|s1 s2| |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
117 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
118 |
s1 := 'hello'. |
9143 | 119 |
s2 := CharacterEncoder encodeString:s1 from:#'iso8859-1' into:#'unicode'. |
120 |
s2 |
|
121 |
[exEnd] |
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
122 |
|
9143 | 123 |
[exBegin] |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
124 |
|s1 s2| |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
125 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
126 |
s1 := 'hello'. |
9143 | 127 |
s2 := CharacterEncoder encodeString:s1 from:#'iso8859-1' into:#'iso8859-7'. |
128 |
s2 |
|
129 |
[exEnd] |
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
130 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
131 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
132 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
133 |
howToAddMoreCoders |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
134 |
" |
9143 | 135 |
Coders can be hand-written or automagically generated via a mapping table. |
7932 | 136 |
Examples for hand-written coders are UTF8_to_ISO10646 or JIS0208_to_JIS7. |
137 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
138 |
The table driven encode/decode methods can be generated from a character mapping document |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
139 |
as found on the unicode consortium host |
9143 | 140 |
(for example: 'http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT') |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
141 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
142 |
or from the i18n character maps: |
9143 | 143 |
(for example: 'http://std.dkuug.dk/i18n/charmaps/ISO-8859-1') |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
144 |
|
9143 | 145 |
In order to add another coder (for example: for EBCDIC or ms-codePage 278), |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
146 |
perform the following steps: |
9143 | 147 |
- create a private subclass of CharacterEncoder named (for example) CP267. |
8114
05274a80fcc4
separated implementation into dynamically (lazy) loaded classes
Claus Gittinger <cg@exept.de>
parents:
8105
diff
changeset
|
148 |
|
9143 | 149 |
- create a public subclass of CharacterEncoderImplementations::CharacterEncoderImplementation named (for example) CharacterEncoderImplementations::CP267. |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
150 |
|
9143 | 151 |
- define the mappingURL1_relativeName (if the table is found on 'www.unicode.org') |
152 |
or the mappingURL2_relativeName (if it is found on 'std.dkuug.dk') method, which |
|
153 |
should return the name of the tables file, relative to the top directory there |
|
154 |
(which is '.../Public/MAPPINGS' on 'www.unicode.org' and '.../i18n/charmaps' on 'std.dkuug.dk'). |
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
155 |
|
9143 | 156 |
In this example, the table from 'std.dkuug.dk' is used, and named 'EBCDIC-CP-FI' there. |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
157 |
|
9143 | 158 |
- generate code by evaluating: |
159 |
CharacterEncoder::CP267 generateCode |
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
160 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
161 |
That is all! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
162 |
|
7909 | 163 |
|
164 |
The existing code was generated by: |
|
165 |
||
9143 | 166 |
CharacterEncoder::SingleByteEncoder subclassesDo:[:cls | Transcript showCR:cls name. cls flushCode; generateCode ] |
167 |
CharacterEncoder::SingleByteEncoder subclassesDo:[:cls | cls allSubclassesDo:[:sub | Transcript showCR:sub name. sub flushCode; generateSubclassCode]] |
|
7909 | 168 |
|
169 |
or individually: |
|
9143 | 170 |
CharacterEncoder::ASCII flushCode; generateCode. |
171 |
CharacterEncoder::ISO8859_1 flushCode; generateCode. |
|
172 |
CharacterEncoder::ISO8859_2 flushCode; generateCode. |
|
173 |
CharacterEncoder::ISO8859_3 flushCode; generateCode. |
|
174 |
CharacterEncoder::ISO8859_4 flushCode; generateCode. |
|
175 |
CharacterEncoder::ISO8859_5 flushCode; generateCode. |
|
176 |
CharacterEncoder::ISO8859_6 flushCode; generateCode. |
|
177 |
CharacterEncoder::ISO8859_7 flushCode; generateCode. |
|
178 |
CharacterEncoder::ISO8859_8 flushCode; generateCode. |
|
179 |
CharacterEncoder::ISO8859_9 flushCode; generateCode. |
|
180 |
CharacterEncoder::ISO8859_10 flushCode; generateCode. |
|
181 |
CharacterEncoder::ISO8859_11 flushCode; generateCode. |
|
182 |
CharacterEncoder::ISO8859_13 flushCode; generateCode. |
|
183 |
CharacterEncoder::ISO8859_14 flushCode; generateCode. |
|
184 |
CharacterEncoder::ISO8859_15 flushCode; generateCode. |
|
185 |
CharacterEncoder::ISO8859_16 flushCode; generateCode. |
|
186 |
CharacterEncoder::KOI8_R flushCode; generateCode. |
|
187 |
CharacterEncoder::GSM0338 flushCode; generateCode. |
|
7909 | 188 |
|
9143 | 189 |
CharacterEncoder::KOI8_U flushCode; generateSubclassCode. |
7912 | 190 |
|
9143 | 191 |
CharacterEncoder::JIS0208 flushCode; generateCode. |
13072
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
192 |
|
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
193 |
Please check if your encoder tables are complete; for example, with: |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
194 |
0 to:255 do:[:ebc | |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
195 |
|asc ebc2| |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
196 |
|
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
197 |
asc := CharacterEncoderImplementations::EBCDIC new decode:ebc. |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
198 |
asc notNil ifTrue:[ |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
199 |
ebc2 := CharacterEncoderImplementations::EBCDIC new encode:asc. |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
200 |
self assert:(ebc2 = ebc) |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
201 |
]. |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
202 |
]. |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
203 |
|
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
204 |
0 to:255 do:[:asc | |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
205 |
|ebc asc2| |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
206 |
|
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
207 |
ebc := CharacterEncoderImplementations::EBCDIC new encode:asc. |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
208 |
ebc notNil ifTrue:[ |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
209 |
asc2 := CharacterEncoderImplementations::EBCDIC new decode:ebc. |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
210 |
self assert:(asc2 = asc) |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
211 |
]. |
e189e07c16aa
changed: #howToAddMoreCoders
Claus Gittinger <cg@exept.de>
parents:
13063
diff
changeset
|
212 |
]. |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
213 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
214 |
! ! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
215 |
|
7971 | 216 |
!CharacterEncoder class methodsFor:'instance creation'! |
217 |
||
218 |
encoderFor:encodingNameSymbol
    "Given the name of an encoding, return an encoder-instance which can map
     these from/into unicode.

     Delegates to #encoderFor:ifAbsent:. If that finds no encoder, the
     NullEncoder is registered under the given name (so the lookup succeeds
     silently next time), a warning is printed via #infoPrintCR, and the
     shared NullEncoderInstance is returned."

    ^ self
        encoderFor:encodingNameSymbol
        ifAbsent:[
            |encoders encoderClasses|

            "/ proceed to ignore this error in the future.
            "/ The registries are shared; all other writers in this class
            "/ update them inside the AccessLock, so do the same here.
            "/ The #unicode sub-dictionaries are created on demand, because the
            "/ wildcard branch of #encoderFor:ifAbsent: can get here without
            "/ ever having created them.
            AccessLock critical:[
                encoders := EncodersByName at:#unicode ifAbsent:nil.
                encoders isNil ifTrue:[
                    EncodersByName at:#unicode put:(encoders := Dictionary new).
                ].
                encoders at:encodingNameSymbol put:NullEncoderInstance.

                encoderClasses := EncoderClassesByName at:#unicode ifAbsent:nil.
                encoderClasses isNil ifTrue:[
                    EncoderClassesByName at:#unicode put:(encoderClasses := Dictionary new).
                ].
                encoderClasses at:encodingNameSymbol put:NullEncoder.
            ].

            "/ self error:'no encoder for ' , encodingNameSymbol mayProceed:true.
            ('CharacterEncoder [warning]: no encoder for ' , encodingNameSymbol) infoPrintCR.

            NullEncoderInstance
        ]

    "
     CharacterEncoder encoderFor:#'blabla2'
     CharacterEncoder encoderFor:#'latin1'
     self encoderFor:#'arabic'
     self encoderFor:#'ms-arabic'
     self encoderFor:#'cp1250'
     self encoderFor:#'cp1251'
     self encoderFor:#'cp1252'
     self encoderFor:#'cp1253'
     self encoderFor:#'iso8859-5'
     self encoderFor:#'koi8-r'
     self encoderFor:#'koi8-u'
     self encoderFor:#'jis0208'
     self encoderFor:#'jis7'
     self encoderFor:#'utf8'
     (self encoderFor:#'utf16le') encodeString:'hello'
     (self encoderFor:#'utf16le') encode:5
     (self encoderFor:#'utf16be') encodeString:'hello'
     (self encoderFor:#'utf16be') encode:5
     (self encoderFor:#'utf32le') encodeString:'hello'
     (self encoderFor:#'utf32be') encodeString:'hello'
     self encoderFor:#'sgml'
     self encoderFor:#'java'
    "

    "Modified: / 01-04-2011 / 14:27:22 / cg"
!
262 |
||
8168 | 263 |
encoderFor:encodingNameSymbolArg ifAbsent:exceptionValue
    "Given the name of an encoding, return an encoder-instance which can map
     these from/into unicode. Return the value of exceptionValue if no
     encoder can be found.

     Lookup order:
        - nil name                  -> the shared NullEncoderInstance
        - #'iso10646-1'             -> treated as #unicode
        - name with match characters -> first cached encoder instance, then first
                                       registered encoder class whose alias matches;
                                       otherwise exceptionValue
        - plain name                -> cached instance, else a new instance of the
                                       registered class (which is then cached)
        - otherwise                 -> a TwoStepEncoder going via an intermediate
                                       encoding (unicode->any, any->name)

     The AccessLock is only held around accesses to the shared registries,
     never while an implementation class is looked up or instantiated."

    |encodingNameSymbol enc clsName cls lcName name unicodeEncoders unicodeEncoderClasses|

    encodingNameSymbol := encodingNameSymbolArg.
    encodingNameSymbol isNil ifTrue:[ ^ NullEncoderInstance].

    encodingNameSymbol == #'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode].

    "/ registry keys are looked up by the lowercase name;
    "/ prefer an already interned symbol, if there is one
    lcName := encodingNameSymbol asLowercase.
    name := lcName asSymbolIfInterned.
    name isNil ifTrue:[name := lcName].

    name includesMatchCharacters ifTrue:[
        "/ wildcard lookup: nothing is cached or registered on this path
        AccessLock critical:[
            unicodeEncoders := EncodersByName at:#unicode ifAbsent:nil.
        ].
        unicodeEncoders notNil ifTrue:[
            unicodeEncoders keysAndValuesDo:[:eachEncodingAlias :eachEncoderInstance |
                (name matches:eachEncodingAlias) ifTrue:[
                    ^ eachEncoderInstance.
                ].
            ].
        ].

        AccessLock critical:[
            "/ ifAbsent:nil - otherwise a missing registry raises an error here
            "/ and the notNil check below could never do its job
            unicodeEncoderClasses := EncoderClassesByName at:#unicode ifAbsent:nil.
        ].
        unicodeEncoderClasses notNil ifTrue:[
            unicodeEncoderClasses keysAndValuesDo:[:eachEncodingAlias :eachEncoderClassOrName |
                (name matches:eachEncodingAlias) ifTrue:[
                    "/ the registry holds either the class itself or its name
                    eachEncoderClassOrName isBehavior ifTrue:[
                        cls := eachEncoderClassOrName
                    ] ifFalse:[
                        cls := CharacterEncoderImplementations at:eachEncoderClassOrName.
                    ].
                    cls notNil ifTrue:[
                        ^ cls new.
                    ]
                ].
            ].
        ].
        ^ exceptionValue value
    ].

    "/ direct lookup: cached instance first ...
    AccessLock critical:[
        unicodeEncoders := EncodersByName at:#unicode ifAbsent:nil.
        unicodeEncoders isNil ifTrue:[
            EncodersByName at:#unicode put:(unicodeEncoders := Dictionary new).
        ].
        enc := unicodeEncoders at:name ifAbsent:nil.
    ].
    enc isNil ifTrue:[
        "/ ... then the registered class
        AccessLock critical:[
            unicodeEncoderClasses := EncoderClassesByName at:#unicode ifAbsent:nil.
            unicodeEncoderClasses isNil ifTrue:[
                EncoderClassesByName at:#unicode put:(unicodeEncoderClasses := Dictionary new).
            ].
            clsName := unicodeEncoderClasses at:name ifAbsent:nil.
        ].
        "/ resolve and instantiate outside of the lock
        clsName notNil ifTrue:[
            clsName isBehavior ifTrue:[
                cls := clsName
            ] ifFalse:[
                cls := CharacterEncoderImplementations at:clsName.
            ].
            cls notNil ifTrue:[
                enc := cls new.
                AccessLock critical:[
                    unicodeEncoders at:name put:enc.
                ]
            ].
        ].
    ].

    enc notNil ifTrue:[
        ^ enc
    ].

    "/ no direct encoder from unicode->name
    "/ search for unicode->any and: any->name
    AccessLock critical:[
        unicodeEncoderClasses := EncoderClassesByName at:#unicode ifAbsent:nil.
    ].
    unicodeEncoderClasses keysAndValuesDo:[:eachEncodingAlias :eachEncoderClass |
        |dict2 enc1 enc2|

        AccessLock critical:[
            dict2 := EncoderClassesByName at:eachEncodingAlias ifAbsent:nil.
        ].
        dict2 notNil ifTrue:[
            clsName := dict2 at:name ifAbsent:nil.
            clsName notNil ifTrue:[
                clsName isBehavior ifTrue:[
                    cls := clsName
                ] ifFalse:[
                    cls := CharacterEncoderImplementations at:clsName.
                ].
                cls notNil ifTrue:[
                    "/ enc1: unicode<->alias, enc2: alias<->name
                    enc2 := cls new.
                    enc1 := self encoderFor:eachEncodingAlias.
                    (enc1 notNil and:[enc2 notNil]) ifTrue:[
                        enc := TwoStepEncoder new encoder1:enc1 encoder2:enc2.
                        AccessLock critical:[
                            unicodeEncoders at:name put:enc.
                        ].
                        ^ enc.
                    ]
                ]
            ]
        ].
    ].

    "/ last resort: same search over the whole registry, but comparing
    "/ against the name as given (not the lowercased one)
    EncoderClassesByName keysAndValuesDo:[:encoding1 :dict1 |
        dict1 keysAndValuesDo:[:encoding2 :clsName1|
            |clsName2 cls1 cls2 dict2 enc1 enc2|

            encoding2 = encodingNameSymbol ifTrue:[
                AccessLock critical:[
                    dict2 := EncoderClassesByName at:#unicode.
                ].
                clsName2 := dict2 at:encoding1 ifAbsent:nil.
                clsName2 notNil ifTrue:[
                    clsName1 isBehavior ifTrue:[
                        cls1 := clsName1
                    ] ifFalse:[
                        cls1 := CharacterEncoderImplementations at:clsName1.
                    ].
                    clsName2 isBehavior ifTrue:[
                        cls2 := clsName2
                    ] ifFalse:[
                        cls2 := CharacterEncoderImplementations at:clsName2.
                    ].
                    (cls1 notNil and:[cls2 notNil]) ifTrue:[
                        "/ The two steps must actually be instantiated; previously
                        "/ enc1/enc2 were never assigned and the TwoStepEncoder
                        "/ was built from two nils.
                        "/ Order mirrors the search above: cls2 is the class
                        "/ registered for unicode<->encoding1 (first step),
                        "/ cls1 the one for encoding1<->requested name (second step).
                        enc1 := cls2 new.
                        enc2 := cls1 new.
                        enc := TwoStepEncoder new encoder1:enc1 encoder2:enc2.
                        ^ enc.
                    ].
                ]
            ]
        ]
    ].

    ^ exceptionValue value

    "
     CharacterEncoder encoderFor:#'latin1'
     self encoderFor:#'arabic'
     self encoderFor:#'ms-arabic'
     self encoderFor:#'iso8859-5'
     self encoderFor:#'koi8-r'
     self encoderFor:#'koi8-u'
     self encoderFor:#'jis0208'
     self encoderFor:#'jis7'
     self encoderFor:#'unicode'
    "
!
|
420 |
||
8210 | 421 |
encoderForUTF8
    "Convenience accessor: answer the encoder-instance which maps
     unicode into/from utf8.
     Same as asking #encoderFor: with the #utf8 encoding name."

    ^ self encoderFor:#'utf8'

    "
     self encoderForUTF8
     self encoderFor:#'utf8'

     other encoders, for comparison:
         CharacterEncoder encoderFor:#'latin1'
         self encoderFor:#'arabic'
         self encoderFor:#'ms-arabic'
         self encoderFor:#'iso8859-5'
         self encoderFor:#'koi8-r'
         self encoderFor:#'koi8-u'
         self encoderFor:#'jis0208'
         self encoderFor:#'jis7'
    "
!
|
439 |
||
7971 | 440 |
encoderToEncodeFrom:oldEncodingArg into:newEncodingArg
    "return an encoder which translates from oldEncodingArg into newEncodingArg.
     A nil argument stands for #unicode; #'iso10646-1' is treated as #unicode.
     If both names are equal (or the old name matches the new one as a pattern),
     the shared NullEncoderInstance is returned.
     If the source is unicode, the answer is simply (self encoderFor:newEncoding).
     Otherwise, the encoder is taken from / remembered in EncodersByName,
     keyed by the (symbol) names of the old and the new encoding."

    |oldEncoding newEncoding encoders encoderClasses encoder decoder clsName cls|

    oldEncoding := oldEncodingArg ? #unicode.
    oldEncoding == #'iso10646-1' ifTrue:[ oldEncoding := #unicode].
    newEncoding := newEncodingArg ? #unicode.
    newEncoding == #'iso10646-1' ifTrue:[ newEncoding := #unicode].

    oldEncoding = newEncoding ifTrue:[^ NullEncoderInstance].
    (oldEncoding match:newEncoding) ifTrue:[^ NullEncoderInstance].

    (oldEncoding = #unicode) ifTrue:[
        "/ unicode -> something
        ^ self encoderFor:newEncoding.
    ].

    oldEncoding isSymbol ifFalse:[oldEncoding := oldEncoding asSymbol].
    newEncoding isSymbol ifFalse:[newEncoding := newEncoding asSymbol].

    AccessLock critical:[
        encoders := EncodersByName at:oldEncoding ifAbsent:nil.
        encoders isNil ifTrue:[
            EncodersByName at:oldEncoding put:(encoders := Dictionary new).
        ].
        "/ look up under the normalized name - this is the key used when storing below.
        "/ (using the raw argument would miss the cache for strings, nil or #'iso10646-1')
        encoder := encoders at:newEncoding ifAbsent:nil.
        encoder isNil ifTrue:[
            encoderClasses := EncoderClassesByName at:oldEncoding ifAbsent:nil.
            encoderClasses isNil ifTrue:[
                EncoderClassesByName at:oldEncoding put:(encoderClasses := Dictionary new).
            ].
            clsName := encoderClasses at:newEncoding ifAbsent:nil.
            clsName notNil ifTrue:[
                "/ the registry holds either the class itself or its name
                clsName isBehavior ifTrue:[
                    cls := clsName
                ] ifFalse:[
                    cls := CharacterEncoderImplementations at:clsName.
                ]
            ].
        ].
    ].

    "/ the instance is created outside of the critical region
    cls notNil ifTrue:[
        encoder := cls new.
    ].

    encoder isNil ifTrue:[
        (newEncoding == #unicode) ifTrue:[
            "/ something -> unicode
            decoder := self encoderFor:oldEncoding.
            encoder := InverseEncoder new decoder:decoder.
        ] ifFalse:[
            "/ do it as: oldEncoding -> unicode -> newEncoding

            "/ something -> unicode
            decoder := self encoderFor:oldEncoding.

            "/ unicode -> something
            encoder := self encoderFor:newEncoding.
            encoder := CompoundEncoder new encoder:encoder decoder:decoder.
        ].
    ].

    AccessLock critical:[
        (EncodersByName at:oldEncoding) at:newEncoding put:encoder
    ].
    ^ encoder

    " CharacterEncoder initialize
     CharacterEncoder encoderToEncodeFrom:#'latin1' into:#'jis7'
     CharacterEncoder encoderToEncodeFrom:#'koi8-r' into:#'mac-cyrillic'
     CharacterEncoder encoderToEncodeFrom:#'ms-arabic' into:#'mac-arabic'
     CharacterEncoder encoderToEncodeFrom:#'iso8859-5' into:#'koi8-r'
     CharacterEncoder encoderToEncodeFrom:#'koi8-r' into:#'koi8-u'
    "
513 |
! ! |
|
514 |
||
7932 | 515 |
!CharacterEncoder class methodsFor:'Compatibility-ST80'! |
516 |
||
517 |
encoderNamed: encoderName
    "ST80 compatibility (quick & dirty hack):
     answer a new DefaultEncoder for #default.
     Any other name is not expected: halt, then answer a new instance of the receiver."

    encoderName ~~ #default ifTrue:[
        self halt:'should not be reached'.
        ^ self new
    ].
    ^ DefaultEncoder new
525 |
! |
|
526 |
||
527 |
platformName
    "ST80 compatibility: answer whatever OperatingSystem answers as its platformName"

    |name|

    name := OperatingSystem platformName.
    ^ name

    "Created: 20.6.1997 / 17:34:03 / cg"
    "Modified: 20.6.1997 / 17:38:40 / cg"
|
532 |
! ! |
|
533 |
||
11316
0b2757774461
access method #nullEncoderInstance
Stefan Vogel <sv@exept.de>
parents:
11300
diff
changeset
|
534 |
!CharacterEncoder class methodsFor:'accessing'! |
0b2757774461
access method #nullEncoderInstance
Stefan Vogel <sv@exept.de>
parents:
11300
diff
changeset
|
535 |
|
0b2757774461
access method #nullEncoderInstance
Stefan Vogel <sv@exept.de>
parents:
11300
diff
changeset
|
536 |
nullEncoderInstance
    "answer the shared encoder held in NullEncoderInstance
     (created as 'NullEncoder new' by the class-side #initialize)"

    ^ NullEncoderInstance
0b2757774461
access method #nullEncoderInstance
Stefan Vogel <sv@exept.de>
parents:
11300
diff
changeset
|
538 |
! ! |
0b2757774461
access method #nullEncoderInstance
Stefan Vogel <sv@exept.de>
parents:
11300
diff
changeset
|
539 |
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
540 |
!CharacterEncoder class methodsFor:'class initialization'! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
541 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
542 |
initialize
    "set up the class-side state:
       - AccessLock, NullEncoderInstance and the (empty) encoder caches,
       - EncoderClassesByName, a two-level dictionary:
             decoded-name -> (encoding alias -> encoder class or class name).
     The table below lists, per row:
         className   decoded-name   array-of-encodingNames(aliases).
     An alias registered twice for the same decoded-name leads to a halt.

     Notice: 'utf-16l' has been added to the UTF16LE aliases (in analogy to 'utf-16b');
     the previous alias 'utf-16e' looks like a typo, but is kept for backward compatibility."

    |ud|

    AccessLock := RecursionLock new.
    NullEncoderInstance := NullEncoder new.

    EncodersByName := Dictionary new.
    EncoderClassesByName := Dictionary new.
    CachedEncoders := Dictionary new.

    "/ encodings for which unicode is passed through unchanged (NullEncoder)
    EncoderClassesByName at:#'unicode' put:(ud := Dictionary new).
    ud at:#'fontspecific' put:NullEncoder.
    ud at:#'adobe-fontspecific' put:NullEncoder.
    ud at:#'ms-oem' put:NullEncoder.
    ud at:#'ms-default' put:NullEncoder.

    "/ className decoded-name array-of-encodingNames
    #(
        (ASCII                  unicode ( ascii 'us-ascii' 'iso-ir-6' 'ibm-367' 'ms-cp367' 'cp367' 'iso646-us' 'ibm-cp367' ))

        (BIG5                   unicode ( big5 ))

        (CNS11643               unicode ( 'cns11643' ))

        (CP437                  unicode ( 'cp437' 'cp-437' 'ibm-437' 'ms-cp437' 'microsoft-cp437' 'ibm-cp437' ))

        (EBCDIC                 unicode ( 'ebcdic' ))

        (GB2313_1980            unicode ( 'gb2313' 'gb2313-1980' ))

        (HANGUL                 unicode ( 'hangul' ))

        (ISO10646_1             unicode ( unicode 'iso10646_1' 'iso10646-1' 'iso-10646-1' ))

        (ISO10646_to_UTF8       unicode ( utf8 'utf-8' ))
        (ISO10646_to_UTF16BE    unicode ( utf16b utf16be 'utf-16b' 'utf-16be' ))
        (ISO10646_to_UTF16LE    unicode ( utf16l utf16le 'utf-16l' 'utf-16e' 'utf-16le' ))

        (ISO8859_1              unicode ( 'iso8859_1' 'iso8859-1' 'iso-8859-1' 'latin-1' 'latin1' 'iso-ir-100' 'ibm-819' 'ms-cp819' 'ibm-cp819' 'iso8859'))

        (ISO8859_2              unicode ( 'iso8859_2' 'iso8859-2' 'iso-8859-2' 'latin2' 'latin-2' 'iso-ir-101'))

        (ISO8859_3              unicode ( 'iso8859_3' 'iso8859-3' 'iso-8859-3' 'latin3' 'latin-3' 'iso-ir-109'))

        (ISO8859_4              unicode ( 'iso8859_4' 'iso8859-4' 'iso-8859-4' 'latin4' 'latin-4' 'iso-ir-110'))

        (ISO8859_5              unicode ( 'iso8859_5' 'iso8859-5' 'iso-8859-5' 'cyrillic' 'iso-ir-144' ))

        (ISO8859_6              unicode ( 'iso8859_6' 'iso8859-6' 'iso-8859-6' 'arabic' 'asmo-708' 'ecma-114' 'iso-ir-127' ))

        (ISO8859_7              unicode ( 'iso8859_7' 'iso8859-7' 'iso-8859-7' 'greek' 'iso-ir-126' 'ecma-118'))

        (ISO8859_8              unicode ( 'iso8859_8' 'iso8859-8' 'iso-8859-8' 'hebrew' 'iso-ir-138' ))

        (ISO8859_9              unicode ( 'iso8859_9' 'iso8859-9' 'iso-8859-9' 'latin5' 'latin-5' 'iso-ir-148'))

        (ISO8859_10             unicode ( 'iso8859_10' 'iso8859-10' 'iso-8859-10' 'latin6' 'latin-6' 'iso-ir-157'))

        (ISO8859_11             unicode ( 'iso8859_11' 'iso8859-11' 'iso-8859-11' 'thai' ))

        (ISO8859_13             unicode ( 'iso8859_13' 'iso8859-13' 'iso-8859-13' 'latin7' 'latin-7' ))

        (ISO8859_14             unicode ( 'iso8859_14' 'iso8859-14' 'iso-8859-14' 'latin8' 'latin-8' 'latin-celtic' ))

        (ISO8859_15             unicode ( 'iso8859_15' 'iso8859-15' 'iso-8859-15' 'latin9' 'latin-9' 'iso-ir-203'))

        (ISO8859_16             unicode ( 'iso8859_16' 'iso8859-16' 'iso-8859-16' 'latin10' 'latin-10' ))

        (JIS0201                unicode ( 'jis0201' #'jisx0201.1976-0'))

        (JIS0208                unicode ( jis0208 'jisx0208' 'jisx0208.1983-0' 'jisx0208.1990-0'))

        (JIS0208_to_JIS7        jis0208 ( jis7 'jis-7' 'x-jis7' 'x-iso2022-jp' 'iso2022-jp'))

        (JIS0208_to_EUC         jis0208 ( euc #'x-euc-jp' ))

        (JIS0208_to_SJIS        jis0208 ( 'sjis' 'shiftjis' 'x-sjis' #'x-shift-jis' #'shift-jis'))

        (JIS0212                unicode ( 'jis0212' ))

        (JOHAB                  unicode ( 'johab' ))

        (KOI7                   unicode ( 'koi7' ))

        (KOI8_R                 unicode ( #'koi8-r' 'cp878' ))

        (KOI8_U                 unicode ( #'koi8-u' ))

        (KSC5601                unicode ( #'ksc5601' ))

        (MAC_Arabic             unicode ( #'mac-arabic' 'macarabic' ))

        (MAC_CentralEuropean    unicode ( #'mac-centraleuropean' #'mac-centraleurope' 'maccentraleurope' 'maccentraleuropean' ))

        (MAC_Croatian           unicode ( #'mac-croatian' 'maccroatian'))

        (MAC_Cyrillic           unicode ( #'mac-cyrillic' 'maccyrillic' ))

        (MAC_Dingbats           unicode ( #'mac-dingbats' 'macdingbats' 'macdingbat'))

        (MAC_Farsi              unicode ( #'mac-farsi' 'macfarsi' ))

        (MAC_Greek              unicode ( #'mac-greek' #'macgreek' ))

        (MAC_Hebrew             unicode ( #'mac-hebrew' #'machebrew' ))

        (MAC_Iceland            unicode ( #'mac-iceland' #'maciceland' ))

        (MAC_Japanese           unicode ( #'mac-japanese' #'macjapanese' ))

        (MAC_Korean             unicode ( #'mac-korean' #'mackorean' ))

        (MAC_Roman              unicode ( #'mac-roman' #'macroman' ))

        (MAC_Romanian           unicode ( #'mac-romanian' #'macromanian' ))

        (MAC_Symbol             unicode ( #'mac-symbol' #'macsymbol' ))

        (MAC_Thai               unicode ( #'mac-thai' #'macthai' ))

        (MAC_Turkish            unicode ( #'mac-turkish' #'macturkish' ))

        (MS_Ansi                unicode ( #'ms-ansi' 'ms-cp1252' 'microsoft-cp1252' 'cp1252' 'microsoft-ansi' 'windows-1252' 'windows-latin1'))

        (MS_Arabic              unicode ( 'ms-arabic' 'ms-cp1256' 'microsoft-cp1256' 'cp1256' 'microsoft-arabic' 'windows-1256' ))

        (MS_Baltic              unicode ( 'ms-baltic' 'ms-cp1257' 'microsoft-cp1257' 'cp1257' 'microsoft-baltic' 'windows-1257' ))

        (MS_Cyrillic            unicode ( 'ms-cyrillic' 'ms-cp1251' 'microsoft-cp1251' 'cp1251' 'microsoft-cyrillic' 'windows-1251' ))

        (MS_EastEuropean        unicode ( 'ms-easteuropean' 'ms-ee' 'cp1250' 'ms-cp1250' 'microsoft-cp1250' 'microsoft-easteuropean' 'windows-1250' ))

        (MS_Greek               unicode ( 'ms-greek' 'ms-cp1253' 'microsoft-cp1253' 'cp1253' 'microsoft-greek' 'windows-1253' ))

        (MS_Hebrew              unicode ( 'ms-hebrew' 'ms-cp1255' 'microsoft-cp1255' 'cp1255' 'microsoft-hebrew' 'windows-1255' ))

"/        (MS_Symbol              unicode ( 'ms-symbol' 'microsoft-symbol' ))

        (MS_Turkish             unicode ( 'ms-turkish' 'ms-cp1254' 'microsoft-cp1254' 'cp1254' 'microsoft-turkish' 'windows-1254' ))

        (NEXT                   unicode ( 'next' 'nextstep' ))

        (ISO10646_to_SGML       unicode ( 'sgml' ))
        (ISO10646_to_JavaText   unicode ( 'java' 'javaText' ))
    ) triplesDo:[:className :decodesTo :encodesTo |
        |dict|

        "/ notice that the encoders are not yet installed as autoloaded.
        "/ Therefore, we remember their names here.
        dict := EncoderClassesByName at:decodesTo ifAbsent:nil.
        dict isNil ifTrue:[
            EncoderClassesByName at:decodesTo put:(dict := Dictionary new).
        ].
        encodesTo do:[:eachEncodingAlias |
            (dict includesKey:eachEncodingAlias) ifTrue:[
                self halt:'conflicting alias'
            ].
            dict at:eachEncodingAlias put:className.
        ].
    ].

    "
     self initialize
    "

    "Modified: / 01-04-2011 / 14:30:06 / cg"
7892 | 708 |
! ! |
709 |
||
8122 | 710 |
!CharacterEncoder class methodsFor:'constants'! |
711 |
||
712 |
jis7KanjiEscapeSequence
    "return the escape sequence used to switch to kanji in jis7 encoded strings
     (ESC followed by '$B'); created on first use and kept in Jis7KanjiEscapeSequence.
     This happens to be the same as ISO2022-JP's escape sequence."

    Jis7KanjiEscapeSequence notNil ifTrue:[^ Jis7KanjiEscapeSequence].

    Jis7KanjiEscapeSequence := Character esc asString , '$B'.
    ^ Jis7KanjiEscapeSequence.

    "Created: 26.2.1996 / 17:38:08 / cg"
    "Modified: 30.6.1997 / 16:03:16 / cg"
|
723 |
! |
|
724 |
||
725 |
jis7KanjiOldEscapeSequence
    "return the escape sequence used to switch to kanji in some old jis7 encoded strings
     (ESC followed by '$@'); created on first use and kept in Jis7KanjiOldEscapeSequence."

    Jis7KanjiOldEscapeSequence notNil ifTrue:[^ Jis7KanjiOldEscapeSequence].

    Jis7KanjiOldEscapeSequence := Character esc asString , '$@'.
    ^ Jis7KanjiOldEscapeSequence.
|
732 |
! |
|
733 |
||
734 |
jis7RomanEscapeSequence
    "return the escape sequence used to switch to roman in jis7 encoded strings
     (ESC followed by '(J'); created on first use and kept in Jis7RomanEscapeSequence."

    Jis7RomanEscapeSequence notNil ifTrue:[^ Jis7RomanEscapeSequence].

    Jis7RomanEscapeSequence := Character esc asString , '(J'.
    ^ Jis7RomanEscapeSequence.

    "Created: 26.2.1996 / 17:38:08 / cg"
    "Modified: 30.6.1997 / 16:03:16 / cg"
|
744 |
! |
|
745 |
||
746 |
jisISO2022EscapeSequence
    "return the escape sequence used to switch to kanji in iso2022 encoded strings
     (ESC '&@' ESC '$B'); created on first use and kept in JisISO2022EscapeSequence."

    |esc|

    JisISO2022EscapeSequence notNil ifTrue:[^ JisISO2022EscapeSequence].

    esc := Character esc asString.
    JisISO2022EscapeSequence := esc , '&@' , esc , '$B'.
    ^ JisISO2022EscapeSequence.
|
753 |
! ! |
|
754 |
||
7892 | 755 |
!CharacterEncoder class methodsFor:'encoding & decoding'! |
756 |
||
757 |
decode:aCodePoint
    "convenience: let a fresh instance of the receiver decode a single code point"

    |coder|

    coder := self new.
    ^ coder decode:aCodePoint
|
759 |
! |
|
760 |
||
761 |
decodeString:aString
    "convenience: let a fresh instance of the receiver decode aString"

    |coder|

    coder := self new.
    ^ coder decodeString:aString
|
763 |
! |
|
764 |
||
7972 | 765 |
decodeString:aString from:oldEncoding
    "decode aString, which is given in oldEncoding, into unicode.
     Same as: self encodeString:aString from:oldEncoding into:#unicode"

    |decoded|

    decoded := self encodeString:aString from:oldEncoding into:#'unicode'.
    ^ decoded
7967 | 767 |
! |
768 |
||
7892 | 769 |
encode:aCodePoint
    "convenience: let a fresh instance of the receiver encode a single code point"

    |coder|

    coder := self new.
    ^ coder encode:aCodePoint

    "
     ISO8859_1 encode:16r00FF
     ISO8859_1 decodeString:'hello'
     ISO8859_1 encodeString:(ISO8859_1 decodeString:'hello')

     ISO8859_5 decodeString:(String
                                with:(Character value:16rE4)
                                with:(Character value:16rE0))
    "
781 |
! |
|
782 |
||
7994 | 783 |
encode:codePoint from:oldEncodingArg into:newEncodingArg
    "translate a single code point from oldEncodingArg into newEncodingArg.
     A nil argument stands for #unicode; #'iso10646-1' is treated as #unicode.
     Identical encodings, and code points up to 16rFF going between
     #unicode and #'iso8859-1' (either direction), are answered unchanged;
     everything else is delegated to the encoder for that pair."

    |sourceName targetName isLatin1Pair|

    sourceName := oldEncodingArg ? #'unicode'.
    sourceName == #'iso10646-1' ifTrue:[ sourceName := #'unicode' ].
    targetName := newEncodingArg ? #'unicode'.
    targetName == #'iso10646-1' ifTrue:[ targetName := #'unicode' ].

    sourceName == targetName ifTrue:[^ codePoint].

    isLatin1Pair :=
        (sourceName == #'unicode' and:[targetName == #'iso8859-1'])
        or:[targetName == #'unicode' and:[sourceName == #'iso8859-1']].
    (isLatin1Pair and:[codePoint <= 16rFF]) ifTrue:[^ codePoint].

    ^ (self encoderToEncodeFrom:sourceName into:targetName) encode:codePoint.
7994 | 809 |
! |
810 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
811 |
encodeString:aUnicodeString
    "given a string in unicode, return a string in my encoding for it
     (done by a fresh instance of the receiver)"

    |coder|

    coder := self new.
    ^ coder encodeString:aUnicodeString

    "
     ISO8859_1 decodeString:'hello'
    "
7914 | 819 |
! |
820 |
||
7967 | 821 |
encodeString:aString from:oldEncodingArg into:newEncodingArg
    "translate aString from oldEncodingArg into newEncodingArg.
     A nil argument stands for #unicode; #'iso10646-1' is treated as #unicode.
     Identical encodings, and strings with 8 bits per character going between
     #unicode and #'iso8859-1' (either direction), are answered unchanged (the same object);
     everything else is delegated to the encoder for that pair."

    |sourceName targetName isLatin1Pair|

    sourceName := oldEncodingArg ? #'unicode'.
    sourceName == #'iso10646-1' ifTrue:[ sourceName := #'unicode' ].
    targetName := newEncodingArg ? #'unicode'.
    targetName == #'iso10646-1' ifTrue:[ targetName := #'unicode' ].

    sourceName == targetName ifTrue:[^ aString].

    isLatin1Pair :=
        (sourceName == #'unicode' and:[targetName == #'iso8859-1'])
        or:[targetName == #'unicode' and:[sourceName == #'iso8859-1']].
    (isLatin1Pair and:[aString bitsPerCharacter == 8]) ifTrue:[^ aString].

    ^ (self encoderToEncodeFrom:sourceName into:targetName) encodeString:aString.
7972 | 848 |
! |
849 |
||
850 |
encodeString:aString into:newEncoding
    "translate aString, which is taken to be unicode, into newEncoding.
     Same as: self encodeString:aString from:#unicode into:newEncoding"

    |encoded|

    encoded := self encodeString:aString from:#'unicode' into:newEncoding.
    ^ encoded

    "
     self encodeString:'hello' into:#ebcdic

     self encodeString:(self encodeString:'hello' into:#ebcdic) from:#ebcdic into:#ascii
     self encodeString:(self encodeString:'hello' into:#ebcdic) from:#ebcdic into:#unicode
    "
7892 | 859 |
! ! |
860 |
||
861 |
!CharacterEncoder class methodsFor:'private'! |
|
862 |
||
863 |
flushCode
    "re-run #initialize; then, if the receiver is not abstract and names at least
     one map file (URL1 or URL2), remove the #mapping method from its metaclass.
     NOTE(review): presumably #mapping is generated code which gets regenerated later - confirm."

    |hasMapFile|

    self initialize.

    self isAbstract ifTrue:[^ self].

    hasMapFile := self mapFileURL1_relativePathName notNil
                    or:[ self mapFileURL2_relativePathName notNil ].
    hasMapFile ifTrue:[
        self class removeSelector:#mapping.
    ].

    "
     self flushCode
    "
|
876 |
! ! |
|
877 |
||
878 |
!CharacterEncoder class methodsFor:'private-mapping setup'! |
|
879 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
880 |
generateCode
    "ask a CharacterEncoderCodeGenerator, targeted at the receiver, to generate code"

    |generator|

    generator := CharacterEncoderCodeGenerator new targetClass:self.
    generator generateCode.
882 |
! |
|
883 |
||
884 |
generateSubclassCode
    "ask a CharacterEncoderCodeGenerator, targeted at the receiver, to generate subclass code"

    |generator|

    generator := CharacterEncoderCodeGenerator new targetClass:self.
    generator generateSubclassCode.
|
7892 | 886 |
! |
887 |
||
7914 | 888 |
mapFileURL1_codeColumn
    "answer the default code column (1) for the URL1 map file.
     NOTE(review): presumably the column of the map file which holds this encoding's code - confirm."

    ^ 1
|
890 |
! |
|
891 |
||
7912 | 892 |
mapFileURL1_relativePathName
    "answer the path of my map file, relative to the base URL used by #mappingURL1.
     Here, nil is answered (no map file known);
     to be redefined in concrete subclass(es) which have one."

    ^ nil
|
896 |
! |
|
897 |
||
898 |
mapFileURL2_relativePathName
    "answer the path of my map file, relative to the base URL used by #mappingURL2.
     Here, nil is answered (no map file known);
     to be redefined in concrete subclass(es) which have one."

    ^ nil
|
902 |
! |
|
903 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
904 |
mappingURL1
    "answer the full URL of my map file below unicode.org's MAPPINGS directory,
     or nil if #mapFileURL1_relativePathName answers nil"

    |relativePath|

    relativePath := self mapFileURL1_relativePathName.
    ^ relativePath notNil
        ifTrue:[ 'http://www.unicode.org/Public/MAPPINGS/' , relativePath ]
        ifFalse:[ nil ]
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
914 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
915 |
|
7892 | 916 |
mappingURL2
    "answer the full URL of my map file below std.dkuug.dk's charmaps directory,
     or nil if #mapFileURL2_relativePathName answers nil"

    |relativePath|

    relativePath := self mapFileURL2_relativePathName.
    ^ relativePath notNil
        ifTrue:[ 'http://std.dkuug.dk/i18n/charmaps/' , relativePath ]
        ifFalse:[ nil ]
7892 | 926 |
! ! |
927 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
928 |
!CharacterEncoder class methodsFor:'queries'! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
929 |
|
7938 | 930 |
isEncoding:subSetEncodingArg subSetOf:superSetEncodingArg
    "return true, if superSetEncoding encoding includes all characters of subSetEncoding.
     (this means: characters are included - not that they have the same encoding).
     Names are compared in lowercase; only a few well known relations are checked
     (equal/matching names, unicode-like supersets, ascii vs. iso8859 / ms-ansi);
     false is returned for everything else."

    |subName superName supersetHoldsEverything|

    subSetEncodingArg = superSetEncodingArg ifTrue:[^ true].

    subName := subSetEncodingArg asLowercase.
    superName := superSetEncodingArg asLowercase.

    (subName match:superName) ifTrue:[^ true].

    "/ assume that any character is in unicode (ms-ansi is treated alike)
    supersetHoldsEverything :=
        (('iso10646*' match:superName) or:[superName = 'unicode'])
            or:[superName = 'ms-ansi'].
    supersetHoldsEverything ifTrue:[^ true].

    "/ if the subSet is iso8859-*, that means ascii (i.e. the lower 7 bits of iso8859 only).
    ((subName = 'iso8859*') or:[subName = 'iso8859-*']) ifTrue:[
        (('ascii*' match:superName) or:['ms-ansi*' match:superName]) ifTrue:[^ true].
    ].
    (subName = 'ascii') ifTrue:[
        (('iso8859*' match:superName) or:['ms-ansi*' match:superName]) ifTrue:[^ true].
    ].

    "/ TODO: check the charSets mappingTables...
    ^ false.
|
962 |
! |
|
963 |
||
7919 | 964 |
nameOfDecodedCode
    "answer the name of the code I decode into (and encode from).
     Most coders decode from their code into unicode / encode from unicode into their code,
     so #unicode is answered here.
     There are a few exceptions to this, though - these must redefine this."

    ^ #unicode
|
969 |
! |
|
970 |
||
971 |
nameOfEncoding
    "answer my encoding's name as a symbol, derived from my class name:
     the name without prefix, in lowercase, with underscores replaced by dashes"

    |baseName|

    baseName := self nameWithoutPrefix asLowercase.
    ^ (baseName copyReplaceAll:$_ with:$-) asSymbol
7919 | 973 |
! |
974 |
||
7959 | 975 |
supportedExternalEncodings
    "return an array of arrays containing the names of supported
     encodings which are supported for external resources (i.e. files).
     The first element contains the internally used symbolic name,
     the second contains a user-readable string (description).
     More than one external name may be mapped onto the same symbolic.

     Notice: the description of 'cp1251' used to be a copy of the cp1250 entry ('Latin2');
     cp1251 is the cyrillic codepage (registered for MS_Cyrillic in #initialize).
     NOTE(review): 'cp850', 'gb' and 'iso-2022-jp' do not appear among the aliases
     registered in #initialize - verify that they are resolved elsewhere."

    ^ #(
        ('utf8'         'Unicode as 8Bit characters' )
        ('utf16BE'      'Unicode as 16Bit big-endian' )
        ('utf16LE'      'Unicode as 16Bit little-endian' )
"/        ('utf7'         'Unicode as 7Bit characters' )
"/        nil
        ('ascii'        'Common 7bit subset of iso8859' )
        ('iso8859-1'    'Latin1' )
        ('iso8859-2'    'Latin2' )
        ('iso8859-3'    'Latin3' )
        ('iso8859-4'    'Latin4' )
        ('iso8859-5'    'Cyrillic' )
        ('iso8859-6'    'Arabic' )
        ('iso8859-7'    'Greek' )
        ('iso8859-8'    'Hebrew' )
"/        nil
        ('koi7'         'Cyrillic (Old)' )
        ('koi8-r'       'Cyrillic' )
        ('koi8-u'       'Cyrillic (Ukraine)' )
"/        nil
        ('cp437'        'msdos US / codepage 437' )
        ('cp850'        'msdos Latin1 / codepage 850' )
        ('cp1250'       'msdos Latin2 / codepage 1250' )
        ('cp1251'       'mswindows Cyrillic / codepage 1251' )
"/        ('mac'          'macintosh 8 bit' )
        ('next'         'NeXT 8 bit' )
"/        ('hp'           'hpux 8 bit' )
"/        nil
        ('euc'          'EUC - extended unix code (japanese)' )
        ('jis7'         'JIS7 - jis 7bit escape codes (japanese)' )
        ('iso-2022-jp'  'Same as jis 7bit' )
        ('sjis'         'SJIS - shift jis 8bit codes (japanese)' )
"/        nil
        ('gb'           'GB - mainland china' )
        ('big5'         'BIG5 - taiwan' )
"/        ('ksc'          'korean' )
        ('sgml'         'SGML (XML/HTML) character escapes' )
        ('java'         'JavaText (\uXXXX) character escapes' )
    )

    "Modified: / 23-10-2006 / 13:27:48 / cg"
|
7959 | 1023 |
! |
1024 |
||
7947 | 1025 |
userFriendlyNameOfEncoding
    "answer a name for presentation to the user:
     my #nameOfEncoding with its first character in uppercase"

    |encodingName|

    encodingName := self nameOfEncoding.
    ^ encodingName asUppercaseFirst
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1027 |
! ! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1028 |
|
7912 | 1029 |
!CharacterEncoder class methodsFor:'testing'! |
1030 |
||
1031 |
isAbstract
    "answer true, if the receiver is an abstract class.
     This is the case for CharacterEncoder itself only; subclasses inherit an answer of false.
     Abstract subclasses must redefine again."

    ^ CharacterEncoder == self
1037 |
! ! |
|
1038 |
||
8711
c5f28b4c719d
guessEncoding now implemented in CharacterEncoder
Claus Gittinger <cg@exept.de>
parents:
8388
diff
changeset
|
1039 |
!CharacterEncoder class methodsFor:'utilities'! |
c5f28b4c719d
guessEncoding now implemented in CharacterEncoder
Claus Gittinger <cg@exept.de>
parents:
8388
diff
changeset
|
1040 |
|
c5f28b4c719d
guessEncoding now implemented in CharacterEncoder
Claus Gittinger <cg@exept.de>
parents:
8388
diff
changeset
|
1041 |
guessEncodingOfBuffer:buffer
    "Guess the encoding of the text in buffer (which is usually the first
     few bytes of a text file) and answer its name as a symbol,
     or nil if nothing could be determined.

     The checks are made in this order:
       1. fewer than 4 elements -> nil
       2. a leading Byte Order Mark:
            EF BB BF     -> #utf8
            00 00 FE FF  -> #utf32be
            FF FE 00 00  -> #utf32le
            FF FE        -> #utf16le
            FE FF        -> #utf16be
       3. an inline markup consisting of the word charset or encoding
          (found case-insensitively), followed by optional separators and
          any of the characters : # = and then a name, which may be
          enclosed in single or double quotes, as in
                encoding #name
          or:
                encoding: name
          A leading 'x-' is stripped from the name. If #encoderFor:ifAbsent:
          finds an encoder for it, that encoder's nameOfEncoding is answered.
       4. one of the JIS escape sequences
            jisISO2022EscapeSequence    -> #'iso-2022-jp'
            jis7KanjiEscapeSequence     -> #jis7
            jis7KanjiOldEscapeSequence  -> #jis7"

    |lcBuffer b1 b2 b3 b4|

    buffer size < 4 ifTrue:[
        "not enough bytes to determine the contents"
        ^ nil.
    ].

    "check the Byte Order Mark (BOM);
     the size check above guarantees that four elements are present"
    b1 := (buffer at:1) codePoint.
    b2 := (buffer at:2) codePoint.
    b3 := (buffer at:3) codePoint.
    b4 := (buffer at:4) codePoint.

    (b1 = 16rEF and:[b2 = 16rBB and:[b3 = 16rBF]]) ifTrue:[
        ^ #utf8
    ].
    (b1 = 0 and:[b2 = 0 and:[b3 = 16rFE and:[b4 = 16rFF]]]) ifTrue:[
        ^ #utf32be
    ].
    (b1 = 16rFF and:[b2 = 16rFE]) ifTrue:[
        "little endian; FF FE is also the start of the UTF-32 mark,
         so the longer mark has to be tested first"
        (b3 = 0 and:[b4 = 0]) ifTrue:[
            ^ #utf32le.
        ].
        ^ #utf16le
    ].
    "big endian; the first code is compared against 16rFE explicitly, so that
     a wide character with a code above 16rFF can never be taken for a BOM"
    (b1 = 16rFE and:[b2 = 16rFF]) ifTrue:[
        ^ #utf16be
    ].

    lcBuffer := buffer asLowercase.

    "now look for an inline encoding markup.
     The keyword is searched in the lowercase copy, but the name is read
     from the original buffer, i.e. with its case preserved."
    #(charset encoding) do:[:keyWord |
        |idx s quote name enc encoderOrNil|

        (idx := lcBuffer findString:keyWord) ~~ 0 ifTrue:[
            s := ReadStream on:buffer.
            s position1Based:idx.
            s skip:keyWord size.
            s skipSeparators.

            "NOTE(review): an older comment at this place said that $= must
             not be skipped, because files which merely contain xml code
             (<?xml charset='utf8'>) would then be treated as UTF-8.
             The code does skip $= however - confirm which one is intended."
            [':#=' includes:s peek] whileTrue:[
                s next.
                s skipSeparators.
            ].
            s skipSeparators.

            ('"''' includes:s peek) ifTrue:[
                "a quoted name extends up to the matching quote"
                quote := s next.
                name := s upTo:quote.
            ] ifFalse:[
                "an unquoted name ends at a separator, a quote or a closing bracket"
                name := s upToMatching:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
            ].
            name notNil ifTrue:[
                enc := name withoutQuotes.
                (enc startsWith:'x-') ifTrue:[
                    enc := enc copyFrom:3.
                ].
                encoderOrNil := self encoderFor:enc ifAbsent:nil.
                encoderOrNil notNil ifTrue:[
                    "answer the encoder's own name, not the spelling found in the buffer"
                    ^ encoderOrNil nameOfEncoding
                ].
                "an unknown name is silently ignored; the next keyword is tried"
            ].
        ].
    ].

    "/ look for JIS7 / EUC encoding
    (buffer findString:self jisISO2022EscapeSequence) ~~ 0 ifTrue:[
        "this used to answer the misspelled #'iso2020-jp';
         'iso-2022-jp' is the name listed by supportedExternalEncodings"
        ^ #'iso-2022-jp'
    ].
    (buffer findString:self jis7KanjiEscapeSequence) ~~ 0 ifTrue:[
        ^ #jis7
    ].
    (buffer findString:self jis7KanjiOldEscapeSequence) ~~ 0 ifTrue:[
        ^ #jis7
    ].

    "/ TODO:

"/    "/ look for EUC
"/    idx := aString findFirst:[:char | |ascii|
"/                                        ((ascii := char asciiValue) >= 16rA1)
"/                                        and:[ascii <= 16rFE]].
"/    idx ~~ 0 ifTrue:[
"/        ascii := (aString at:(idx + 1)) asciiValue.
"/        (ascii >= 16rA1 and:[ascii <= 16rFE]) ifTrue:[
"/            ^ #euc
"/        ]
"/    ].
    "/ look for SJIS ...

    ^ nil
14169
eab487f07a2b
comment/format in: #encoderFor:
Stefan Vogel <sv@exept.de>
parents:
14094
diff
changeset
|
1155 |
! |
eab487f07a2b
comment/format in: #encoderFor:
Stefan Vogel <sv@exept.de>
parents:
14094
diff
changeset
|
1156 |
|
eab487f07a2b
comment/format in: #encoderFor:
Stefan Vogel <sv@exept.de>
parents:
14094
diff
changeset
|
1157 |
guessEncodingOfFile:aFilename
    "Read up to 64 bytes from the beginning of the file named aFilename
     and answer whatever #guessEncodingOfBuffer: finds in them, i.e.
     a Byte Order Mark, or a string of the form
            encoding #name
     or:
            encoding: name
     or a JIS escape sequence.
     Answer nil if the file cannot be opened for reading or if no encoding
     could be determined. No other heuristics are applied here."

    |s buffer n "{Class: SmallInteger }"|

    s := aFilename asFilename readStreamOrNil.
    s isNil ifTrue:[^ nil].

    buffer := String new:64.
    [
        n := s nextBytes:buffer size into:buffer.
    ] ensure:[
        "release the file, even if reading failed"
        s close.
    ].

    "only pass the bytes which were actually read: otherwise the unused rest
     of the buffer would defeat the minimum size check in
     #guessEncodingOfBuffer: and take part in its BOM test for short files"
    ^ self guessEncodingOfBuffer:(buffer copyFrom:1 to:n).

    "
     self guessEncodingOfFile:'../../libview/resources/de.rs' asFilename
     self guessEncodingOfFile:'../../libview/resources/ru.rs' asFilename
     self guessEncodingOfFile:'../../libview/resources/th.rs' asFilename
    "

    "Modified: / 31-05-2011 / 15:45:19 / cg"
|
8711
c5f28b4c719d
guessEncoding now implemented in CharacterEncoder
Claus Gittinger <cg@exept.de>
parents:
8388
diff
changeset
|
1184 |
! |
c5f28b4c719d
guessEncoding now implemented in CharacterEncoder
Claus Gittinger <cg@exept.de>
parents:
8388
diff
changeset
|
1185 |
|
c5f28b4c719d
guessEncoding now implemented in CharacterEncoder
Claus Gittinger <cg@exept.de>
parents:
8388
diff
changeset
|
1186 |
guessEncodingOfStream:aStream
    "Read up to 64 bytes from aStream, starting at its current position,
     and answer whatever #guessEncodingOfBuffer: finds in them, i.e.
     a Byte Order Mark, or a string of the form
            encoding #name
     or:
            encoding: name
     or a JIS escape sequence; nil if no encoding could be determined.
     The stream is positioned back to where it was before the read,
     so aStream has to be positionable."

    |oldPosition buffer n|

    buffer := String new:64.

    oldPosition := aStream position.
    [
        n := aStream nextBytes:buffer size into:buffer.
    ] ensure:[
        "leave the stream as we found it, even if reading failed"
        aStream position:oldPosition.
    ].

    "only pass the bytes which were actually read: otherwise the unused rest
     of the buffer would defeat the minimum size check in
     #guessEncodingOfBuffer: and take part in its BOM test for short streams"
    ^ self guessEncodingOfBuffer:(buffer copyFrom:1 to:n)

    "Modified: / 31-05-2011 / 15:45:23 / cg"
|
8810 | 1204 |
! |
1205 |
||
1206 |
showCharacterSet
    "Open a CharacterSetView on the default view font, labelled with
     the receiver's name (without prefix) and with the receiver as encoder.
     No click label and no input target are passed."

    |displayFont windowLabel|

    "/ displayFont := (Font family:'courier' face:'medium' style:'roman' size:12 encoding:'iso10646-1').
    displayFont := View defaultFont.
    windowLabel := 'Characters of ' , self nameWithoutPrefix.

    CharacterSetView
        openOn:displayFont
        label:windowLabel
        clickLabel:nil
        asInputFor:nil
        encoder:self

    "
     CharacterEncoderImplementations::CP1250 showCharacterSet
    "
|
8711
c5f28b4c719d
guessEncoding now implemented in CharacterEncoder
Claus Gittinger <cg@exept.de>
parents:
8388
diff
changeset
|
1222 |
! ! |
c5f28b4c719d
guessEncoding now implemented in CharacterEncoder
Claus Gittinger <cg@exept.de>
parents:
8388
diff
changeset
|
1223 |
|
7892 | 1224 |
!CharacterEncoder methodsFor:'encoding & decoding'! |
1225 |
||
1226 |
decode:anEncoding
    "Answer the unicode codePoint which corresponds to anEncoding,
     an integer code in the receiver's encoding.
     Nothing is decoded here - concrete encoders have to redefine this."

    self subclassResponsibility
7892 | 1230 |
! |
1231 |
||
1232 |
decodeString:anEncodedString
    "Answer a new unicode string for anEncodedString, a string in the
     receiver's encoding. Every character's codePoint is passed through
     #decode: and the character with the resulting value is stored at the
     same index of the result.
     The result starts out as a String; it is converted to a
     Unicode16String or Unicode32String as soon as a decoded value above
     16rFF or 16rFFFF has to be stored."

    |decoded currentBits unicode neededBits|

    decoded := String new:(anEncodedString size).
    currentBits := decoded bitsPerCharacter.

    1 to:anEncodedString size do:[:index |
        unicode := self decode:(anEncodedString at:index) codePoint.

        "how wide do the characters have to be, in order to hold this value ?"
        neededBits := unicode > 16rFFFF
                        ifTrue:[32]
                        ifFalse:[
                            unicode > 16rFF
                                ifTrue:[16]
                                ifFalse:[currentBits]
                        ].
        neededBits > currentBits ifTrue:[
            "widen the result; the characters stored so far are copied over"
            decoded := (neededBits = 32
                            ifTrue:[Unicode32String]
                            ifFalse:[Unicode16String]) fromString:decoded.
            currentBits := neededBits.
        ].
        decoded at:index put:(Character value:unicode).
    ].
    ^ decoded

    "
     ISO8859_1 decodeString:'hello'
    "
1263 |
! |
|
1264 |
||
1265 |
encode:aCodePoint
    "Answer the code in the receiver's encoding which corresponds to
     aCodePoint, a unicode codePoint.
     Nothing is encoded here - concrete encoders have to redefine this."

    self subclassResponsibility
7892 | 1269 |
! |
1270 |
||
1271 |
encodeString:aUnicodeString
    "Answer a new string in the receiver's encoding for aUnicodeString.
     Every character's codePoint is passed through #encode: and the
     character with the resulting value is stored at the same index
     of the result.
     The result starts out as a String; it is converted to a
     Unicode16String or Unicode32String as soon as an encoded value above
     16rFF or 16rFFFF has to be stored."

    |encoded currentBits code neededBits|

    encoded := String new:(aUnicodeString size).
    currentBits := encoded bitsPerCharacter.

    1 to:aUnicodeString size do:[:index |
        code := self encode:(aUnicodeString at:index) codePoint.

        "how wide do the characters have to be, in order to hold this value ?"
        neededBits := code > 16rFFFF
                        ifTrue:[32]
                        ifFalse:[
                            code > 16rFF
                                ifTrue:[16]
                                ifFalse:[currentBits]
                        ].
        neededBits > currentBits ifTrue:[
            "widen the result; the characters stored so far are copied over"
            encoded := (neededBits = 32
                            ifTrue:[Unicode32String]
                            ifFalse:[Unicode16String]) fromString:encoded.
            currentBits := neededBits.
        ].
        encoded at:index put:(Character value:code).
    ].
    ^ encoded
|
7892 | 1298 |
! ! |
1299 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1300 |
!CharacterEncoder methodsFor:'error handling'! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1301 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1302 |
decodingError
    "Raise a DecodingError as a request, on behalf of the calling context,
     and answer what that request returns.
     This reports that there is no unicode codePoint for a given code in
     this encoding (which is unlikely), or that the encoding is undefined
     for that value (for example, holes in the ISO8859-3 encoding).
     The error's default value is the receiver's defaultDecoderValue."

    |caller callingSelector offendingCode|

    caller := thisContext sender.
    callingSelector := caller selector.

    "NOTE(review): the first argument of the caller's method is passed as
     parameter only if the caller is NOT #encode: or #decode: ; for those
     two, the parameter remains nil. This looks inverted - confirm."
    (callingSelector ~~ #encode: and:[callingSelector ~~ #decode:]) ifTrue:[
        offendingCode := caller methodHome argAt:1
    ].

    ^ DecodingError new
        defaultValue:self defaultDecoderValue;
        parameter:offendingCode;
        messageText:'invalid code';
        suspendedContext:caller;
        raiseRequest.
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1319 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1320 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1321 |
defaultDecoderValue
    "Answer the replacement value which goes into a decoded string
     whenever an encoded code has no unicode codePoint.
     This is 16rFFFF here."

    ^ 16rFFFF
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1327 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1328 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1329 |
defaultEncoderValue
    "Answer the value which is placed into an encoded string, in case there
     is no codePoint for a given unicode codePoint.
     Here: the codePoint of the question mark character."

    ^ $? codePoint
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1335 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1336 |
|
7919 | 1337 |
encodingError
    "Report that some unicode-codePoint cannot be represented by this encoder.

     Raises an EncodingError as a request, with the sending context as
     suspendedContext and defaultEncoderValue as default value; answers
     whatever raiseRequest returns.
     If the sender is #encode: or #decode:, the first argument of its home
     method is passed as parameter of the error; for any other sender the
     parameter is left nil."

    |badCodePoint sender|

    sender := thisContext sender.
    "/ only encode: / decode: carry the offending code as their first argument;
    "/ for other senders, argAt:1 would fetch an unrelated object (or fail).
    ((sender selector == #encode:) or:[sender selector == #decode:]) ifTrue:[
        badCodePoint := sender methodHome argAt:1
    ].
    ^ (EncodingError new)
        defaultValue:(self defaultEncoderValue);
        parameter:badCodePoint;
        messageText:'unrepresentable unicode';
        suspendedContext:sender;
        raiseRequest
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1352 |
! ! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1353 |
|
7972 | 1354 |
!CharacterEncoder methodsFor:'printing'! |
1355 |
||
1356 |
printOn:aStream
    "Append a short description of the receiver to aStream:
     the name of the decoded code, an arrow, and the name of the encoding."

    aStream nextPutAll:(self nameOfDecodedCode).
    aStream nextPutAll:'->'.
    aStream nextPutAll:(self nameOfEncoding)
|
7972 | 1361 |
! ! |
1362 |
||
7892 | 1363 |
!CharacterEncoder methodsFor:'private'! |
1364 |
||
1365 |
newString:size
    "Not implemented here - signals subclassResponsibility.
     NOTE(review): presumably subclasses answer a new string of the given
     size - confirm against the concrete encoders."

    self subclassResponsibility
|
1367 |
! ! |
|
1368 |
||
7917 | 1369 |
!CharacterEncoder methodsFor:'queries'!

characterSize:codePoint
    "return the number of bytes required to encode codePoint.
     Not implemented here - signals subclassResponsibility."

    ^ self subclassResponsibility

    "Created: / 15-06-2005 / 15:11:04 / janfrog"
!

isNullEncoder
    "answer false here"

    ^ false
!

nameOfDecodedCode
    "Most coders decode from their code into unicode / encode from unicode into their code.
     There are a few exceptions to this, though - these must redefine this.
     The answer is fetched from the receiver's class."

    ^ self class nameOfDecodedCode
!

nameOfEncoding
    "answer the name of the encoding, as provided by the receiver's class"

    ^ self class nameOfEncoding
!

userFriendlyNameOfEncoding
    "answer the user friendly name of the encoding, as provided by the receiver's class"

    ^ self class userFriendlyNameOfEncoding
! !
1397 |
||
11975 | 1398 |
!CharacterEncoder methodsFor:'stream support'! |
1399 |
||
12608 | 1400 |
readNext:charactersToRead charactersFrom:stream
    "Fetch charactersToRead elements from stream (via next:) and answer
     them passed through decodeString:."

    |encoded|

    encoded := stream next:charactersToRead.
    ^ self decodeString:encoded
|
11975 | 1402 |
! |
1403 |
||
1404 |
readNextCharacterFrom:aStream
    "Fetch the next element from aStream.
     Answer nil if the stream answered nil; otherwise answer the element's
     integer value, decoded and converted to a character."

    |element|

    element := aStream next.
    element isNil ifTrue:[^ nil].
    ^ (self decode:element asInteger) asCharacter

    "Created: / 14-06-2005 / 17:03:21 / janfrog"
    "Modified: / 15-06-2005 / 15:27:49 / janfrog"
    "Modified: / 20-06-2005 / 13:13:52 / masca"
|
12435
539c24148e90
added: #readNextInputCharacterFrom:
Claus Gittinger <cg@exept.de>
parents:
11975
diff
changeset
|
1417 |
! |
11975 | 1418 |
|
12435
539c24148e90
added: #readNextInputCharacterFrom:
Claus Gittinger <cg@exept.de>
parents:
11975
diff
changeset
|
1419 |
readNextInputCharacterFrom:aStream
    "answer the next element of aStream as is (no decoding is done here)"

    ^ aStream next
11975 | 1421 |
! ! |
1422 |
||
7915 | 1423 |
!CharacterEncoder::CompoundEncoder class methodsFor:'documentation'!

documentation
"
    A compoundEncoder uses two real encoders;
    to encode:
        code -> decoder(decode) -> encoder(encode) -> result
    to decode:
        code -> encoder(decode) -> decoder(encode) -> result

        |e|

        e := CompoundEncoder new.
        e encoder:ISO8859_5 decoder:KOI8_R.
        e decode:16rB0.  'CYRILLIC CAPITAL LETTER A; 16rB0 in 8859-5; 16rE1 in KOI8-R'.
        e encode:16rE1.
"
! !
|
1441 |
||
7915 | 1442 |
!CharacterEncoder::CompoundEncoder methodsFor:'accessing'!

encoder:encoderArg decoder:decoderArg
    "remember the two encoders to be combined"

    encoder := encoderArg.
    decoder := decoderArg.
! !
|
1450 |
||
7915 | 1451 |
!CharacterEncoder::CompoundEncoder methodsFor:'encoding & decoding'!

decode:aCode
    "let encoder decode aCode, then let decoder encode the result"

    |intermediate|

    intermediate := encoder decode:aCode.
    ^ decoder encode:intermediate
!

decodeString:aString
    "let encoder decode aString, then let decoder encode the result"

    |intermediate|

    intermediate := encoder decodeString:aString.
    ^ decoder encodeString:intermediate
!

encode:aCode
    "let decoder decode aCode, then let encoder encode the result"

    |intermediate|

    intermediate := decoder decode:aCode.
    ^ encoder encode:intermediate
!

encodeString:aString
    "let decoder decode aString, then let encoder encode the result"

    |intermediate|

    intermediate := decoder decodeString:aString.
    ^ encoder encodeString:intermediate
! !
|
1468 |
||
7972 | 1469 |
!CharacterEncoder::CompoundEncoder methodsFor:'printing'!

printOn:aStream
    "append the decoder's encoding name and an arrow to aStream,
     followed by the printed representation of the encoder"

    aStream nextPutAll:(decoder nameOfEncoding).
    aStream nextPutAll:'->'.
"/        nextPutAll:(decoder nameOfDecodedCode);
"/        nextPutAll:'->';
"/        nextPutAll:(encoder nameOfEncoding)
    encoder printOn:aStream
! !
|
1480 |
||
7932 | 1481 |
!CharacterEncoder::DefaultEncoder class methodsFor:'documentation'!

documentation
"
    This is only a dummy, provided for ST80 compatibility.
"
! !
|
1488 |
||
7915 | 1489 |
!CharacterEncoder::InverseEncoder class methodsFor:'documentation'!

documentation
"
    An inverseEncoder does the inverse of the encoder it wraps:
    its encode is really a decode, and its decode is really an encode.
"
! !
|
1497 |
||
7915 | 1498 |
!CharacterEncoder::InverseEncoder methodsFor:'accessing'!

decoder:something
    "set the wrapped coder, to which all operations are forwarded"

    decoder := something.
! !
|
1503 |
||
7915 | 1504 |
!CharacterEncoder::InverseEncoder methodsFor:'encoding & decoding'!

decode:aCode
    "decoding is done by letting the wrapped decoder encode aCode"

    ^ decoder encode:aCode
!

decodeString:aString
    "decoding is done by letting the wrapped decoder encode aString"

    ^ decoder encodeString:aString
!

encode:aCode
    "encoding is done by letting the wrapped decoder decode aCode"

    ^ decoder decode:aCode
!

encodeString:aString
    "encoding is done by letting the wrapped decoder decode aString"

    ^ decoder decodeString:aString
! !
|
1521 |
||
7972 | 1522 |
!CharacterEncoder::InverseEncoder methodsFor:'printing'!

printOn:aStream
    "append the wrapped decoder's names to aStream, in reverse direction:
     encoding name, an arrow, then the name of the decoded code"

    aStream nextPutAll:(decoder nameOfEncoding).
    aStream nextPutAll:'->'.
    aStream nextPutAll:(decoder nameOfDecodedCode)
! !
1530 |
||
12435
539c24148e90
added: #readNextInputCharacterFrom:
Claus Gittinger <cg@exept.de>
parents:
11975
diff
changeset
|
1531 |
!CharacterEncoder::InverseEncoder methodsFor:'queries'!

characterSize:charOrcodePoint
    "forwarded to the wrapped decoder"

    ^ decoder characterSize:charOrcodePoint
! !
539c24148e90
added: #readNextInputCharacterFrom:
Claus Gittinger <cg@exept.de>
parents:
11975
diff
changeset
|
1536 |
|
539c24148e90
added: #readNextInputCharacterFrom:
Claus Gittinger <cg@exept.de>
parents:
11975
diff
changeset
|
1537 |
!CharacterEncoder::InverseEncoder methodsFor:'stream support'!

readNextInputCharacterFrom:aStream
    "forwarded to the wrapped decoder"

    ^ decoder readNextInputCharacterFrom:aStream
! !
539c24148e90
added: #readNextInputCharacterFrom:
Claus Gittinger <cg@exept.de>
parents:
11975
diff
changeset
|
1542 |
|
7915 | 1543 |
!CharacterEncoder::NullEncoder class methodsFor:'documentation'!

documentation
"
    A NullEncoder does nothing: codes and strings are answered unchanged,
    both when encoding and when decoding.
"
! !
|
1550 |
||
7915 | 1551 |
!CharacterEncoder::NullEncoder methodsFor:'encoding & decoding'!

decode:aCode
    "answer aCode unchanged"

    ^ aCode
!

decodeString:aString
    "answer aString itself, unchanged"

    ^ aString
!

encode:aCode
    "answer aCode unchanged"

    ^ aCode
!

encodeString:aString
    "answer aString itself, unchanged"

    ^ aString
! !
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1568 |
|
7917 | 1569 |
!CharacterEncoder::NullEncoder methodsFor:'queries'!

isNullEncoder
    "answer true here"

    ^ true
! !
|
1574 |
||
7915 | 1575 |
!CharacterEncoder::OtherEncoding class methodsFor:'private'!

flushCode
    "empty implementation - does nothing here"
!

generateEncoderCode
    "empty implementation - does nothing here"
! !
|
1582 |
||
7919 | 1583 |
!CharacterEncoder::TwoStepEncoder class methodsFor:'documentation'!

documentation
"
    A twoStepEncoder chains two real encoders;
    to encode:
        string -> encoder1(encode) -> encoder2(encode) -> result
    to decode (the same chain, in reverse order):
        string -> encoder2(decode) -> encoder1(decode) -> result
"
! !
|
1594 |
||
1595 |
!CharacterEncoder::TwoStepEncoder methodsFor:'accessing'!

encoder1:encoder1Arg encoder2:encoder2Arg
    "remember the two encoders to be chained"

    encoder2 := encoder2Arg.
    encoder1 := encoder1Arg.
! !
|
1603 |
||
1604 |
!CharacterEncoder::TwoStepEncoder methodsFor:'encoding & decoding'!

decode:aCode
    "decode with encoder2 first, then decode that result with encoder1"

    |intermediate|

    intermediate := encoder2 decode:aCode.
    ^ encoder1 decode:intermediate
!

decodeString:aString
    "decode with encoder2 first, then decode that result with encoder1"

    |intermediate|

    intermediate := encoder2 decodeString:aString.
    ^ encoder1 decodeString:intermediate
!

encode:aCode
    "encode with encoder1 first, then encode that result with encoder2"

    |intermediate|

    intermediate := encoder1 encode:aCode.
    ^ encoder2 encode:intermediate
!

encodeString:aString
    "encode with encoder1 first, then encode that result with encoder2"

    |intermediate|

    intermediate := encoder1 encodeString:aString.
    ^ encoder2 encodeString:intermediate
! !
|
1621 |
||
7972 | 1622 |
!CharacterEncoder::TwoStepEncoder methodsFor:'printing'!

printOn:aStream
    "append the whole chain to aStream: encoder1's decoded code name,
     encoder1's encoding name and encoder2's encoding name,
     separated by arrows"

    aStream nextPutAll:(encoder1 nameOfDecodedCode).
    aStream nextPutAll:'->'.
    aStream nextPutAll:(encoder1 nameOfEncoding).
    aStream nextPutAll:'->'.
    aStream nextPutAll:(encoder2 nameOfEncoding)
! !
1632 |
||
11300 | 1633 |
!CharacterEncoder::TwoStepEncoder methodsFor:'queries'!

nameOfEncoding
    "answer the encoding name of encoder2 only
     (prefixing it with encoder1's name is disabled below)"

    ^ "encoder1 nameOfEncoding , '-' ," encoder2 nameOfEncoding
! !
|
1654 |
||
7892 | 1655 |
!CharacterEncoder class methodsFor:'documentation'!

version
    "answer the CVS header string of this file"

    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.114 2012-07-11 11:38:13 mb Exp $'
!

version_CVS
    "answer the CVS header string of this file"

    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.114 2012-07-11 11:38:13 mb Exp $'
! !
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1664 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1665 |
"top-level expression chunk: sends #initialize to CharacterEncoder when this file is filed in"
CharacterEncoder initialize! |