author | ca |
Mon, 15 Mar 2004 13:58:54 +0100 | |
changeset 8194 | 7027457dbe4f |
parent 8190 | 2c1bbf4a6a13 |
child 8210 | cac1802b8603 |
permissions | -rw-r--r-- |
8048 | 1 |
" |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
2 |
COPYRIGHT (c) 2004 by eXept Software AG |
7932 | 3 |
All Rights Reserved |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
4 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
5 |
This software is furnished under a license and may be used |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
6 |
only in accordance with the terms of that license and with the |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
7 |
inclusion of the above copyright notice. This software may not |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
8 |
be provided or otherwise made available to, or used by, any |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
9 |
other person. No title to or ownership of the software is |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
10 |
hereby transferred. |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
11 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
12 |
|
8114
05274a80fcc4
separated implementation into dynamically (lazy) loaded classes
Claus Gittinger <cg@exept.de>
parents:
8105
diff
changeset
|
13 |
"{ Package: 'stx:libbasic' }" |
05274a80fcc4
separated implementation into dynamically (lazy) loaded classes
Claus Gittinger <cg@exept.de>
parents:
8105
diff
changeset
|
14 |
|
8118 | 15 |
"Root class of the character encoders. Its class variables hold the lookup
 tables (EncoderClassesByName, EncodersByName), the lock used while accessing
 them (AccessLock) and the shared NullEncoderInstance used by the class-side
 lookup methods."
Object subclass:#CharacterEncoder
    instanceVariableNames:''
    classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders LastEncoder
                AccessLock NullEncoderInstance Jis7KanjiEscapeSequence
                Jis7RomanEscapeSequence JisISO2022EscapeSequence
                Jis7KanjiOldEscapeSequence'
    poolDictionaries:''
    category:'Collections-Text-Encodings'
!
24 |
||
7914 | 25 |
"Private encoder holding a decoder and an encoder; encoderToEncodeFrom:into:
 creates one to convert oldEncoding -> unicode -> newEncoding."
CharacterEncoder subclass:#CompoundEncoder
    instanceVariableNames:'decoder encoder'
    classVariableNames:''
    poolDictionaries:''
    privateIn:CharacterEncoder
!
|
31 |
||
7932 | 32 |
"Private encoder class; an instance is answered by encoderNamed: when asked for #default."
CharacterEncoder subclass:#DefaultEncoder
    instanceVariableNames:''
    classVariableNames:''
    poolDictionaries:''
    privateIn:CharacterEncoder
!
|
38 |
||
7914 | 39 |
"Private encoder wrapping a decoder; encoderToEncodeFrom:into: creates one
 when the target encoding is unicode."
CharacterEncoder subclass:#InverseEncoder
    instanceVariableNames:'decoder'
    classVariableNames:''
    poolDictionaries:''
    privateIn:CharacterEncoder
!
|
45 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
46 |
"Private encoder class; a shared instance is kept in NullEncoderInstance and is
 answered by the lookup methods where no conversion is required (nil encoding
 name, or identical source and target encoding)."
CharacterEncoder subclass:#NullEncoder
    instanceVariableNames:''
    classVariableNames:''
    poolDictionaries:''
    privateIn:CharacterEncoder
!
|
52 |
||
7892 | 53 |
"Private subclass without state of its own.
 NOTE(review): no use of this class is visible in this part of the file - confirm its purpose."
CharacterEncoder subclass:#OtherEncoding
    instanceVariableNames:''
    classVariableNames:''
    poolDictionaries:''
    privateIn:CharacterEncoder
!
|
59 |
||
7919 | 60 |
"Private encoder chaining two encoders (encoder1, encoder2); encoderFor:ifAbsent:
 creates one when the requested encoding is only reachable via an intermediate encoding."
CharacterEncoder subclass:#TwoStepEncoder
    instanceVariableNames:'encoder1 encoder2'
    classVariableNames:''
    poolDictionaries:''
    privateIn:CharacterEncoder
!
|
66 |
||
7893 | 67 |
!CharacterEncoder class methodsFor:'documentation'! |
68 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
69 |
copyright |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
70 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
71 |
COPYRIGHT (c) 2004 by eXept Software AG |
7932 | 72 |
All Rights Reserved |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
73 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
74 |
This software is furnished under a license and may be used |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
75 |
only in accordance with the terms of that license and with the |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
76 |
inclusion of the above copyright notice. This software may not |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
77 |
be provided or otherwise made available to, or used by, any |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
78 |
other person. No title to or ownership of the software is |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
79 |
hereby transferred. |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
80 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
81 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
82 |
|
7893 | 83 |
documentation |
84 |
" |
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
85 |
unfinished code - please read howToAddMoreCoders. |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
86 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
87 |
Character mappings are based on information in character maps found at either: |
7932 | 88 |
http://std.dkuug.dk/i18n/charmaps |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
89 |
or: |
7932 | 90 |
http://www.unicode.org/Public/MAPPINGS |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
91 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
92 |
No Warranty. |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
93 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
94 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
95 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
96 |
examples |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
97 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
98 |
|s1 s2| |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
99 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
100 |
s1 := 'hello'. |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
101 |
s2 := CharacterEncoder encode:s1 from:#'iso8859-1' into:#'unicode'. |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
102 |
s2 |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
103 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
104 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
105 |
|s1 s2| |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
106 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
107 |
s1 := 'hello'. |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
108 |
s2 := CharacterEncoder encode:s1 from:#'iso8859-1' into:#'iso8859-7'. |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
109 |
s2 |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
110 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
111 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
112 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
113 |
howToAddMoreCoders |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
114 |
" |
7971 | 115 |
Coders can be hand-written or automagically generated via a mapping table. |
7932 | 116 |
Examples for hand-written coders are ISO10646_to_UTF8 or JIS0208_to_JIS7. |
117 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
118 |
The table driven encode/decode methods can be generated from a character mapping document |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
119 |
as found on the unicode consortium host |
8136 | 120 |
(for example: 'http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT') |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
121 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
122 |
or from the i18n character maps: |
8136 | 123 |
(for example: 'http://std.dkuug.dk/i18n/charmaps/ISO-8859-1') |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
124 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
125 |
In order to add another coder (for example: for Finnish EBCDIC or ms-codePage 278), |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
126 |
perform the following steps: |
8136 | 127 |
- create a private subclass of CharacterEncoder named (for example) CP267. |
8114
05274a80fcc4
separated implementation into dynamically (lazy) loaded classes
Claus Gittinger <cg@exept.de>
parents:
8105
diff
changeset
|
128 |
|
8136 | 129 |
- create a public subclass of CharacterEncoderImplementations::CharacterEncoderImplementation named (for example) CharacterEncoderImplementations::CP267. |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
130 |
|
8136 | 131 |
- define the mappingURL1_relativeName (if the table is found on 'www.unicode.org') |
132 |
or the mappingURL2_relativeName (if it is found on 'std.dkuug.dk') method, which |
|
133 |
should return the name of the tables file, relative to the top directory there |
|
134 |
(which is '.../Public/MAPPINGS' on www.unicode.org and '.../i18n/charmaps' on 'std.dkuug.dk'). |
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
135 |
|
8136 | 136 |
In this example, the table from 'std.dkuug.dk' is used, and named 'EBCDIC-CP-FI' there. |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
137 |
|
8136 | 138 |
- generate code by evaluating: |
139 |
CharacterEncoder::CP267 generateCode |
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
140 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
141 |
That's all !! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
142 |
|
7909 | 143 |
|
144 |
The existing code was generated by: |
|
145 |
||
8136 | 146 |
CharacterEncoder::SingleByteEncoder subclassesDo:[:cls | Transcript showCR:cls name. cls flushCode; generateCode ] |
147 |
CharacterEncoder::SingleByteEncoder subclassesDo:[:cls | cls allSubclassesDo:[:sub | Transcript showCR:sub name. sub flushCode; generateSubclassCode]] |
|
7909 | 148 |
|
149 |
or individually: |
|
8136 | 150 |
CharacterEncoder::ASCII flushCode; generateCode. |
151 |
CharacterEncoder::ISO8859_1 flushCode; generateCode. |
|
152 |
CharacterEncoder::ISO8859_2 flushCode; generateCode. |
|
153 |
CharacterEncoder::ISO8859_3 flushCode; generateCode. |
|
154 |
CharacterEncoder::ISO8859_4 flushCode; generateCode. |
|
155 |
CharacterEncoder::ISO8859_5 flushCode; generateCode. |
|
156 |
CharacterEncoder::ISO8859_6 flushCode; generateCode. |
|
157 |
CharacterEncoder::ISO8859_7 flushCode; generateCode. |
|
158 |
CharacterEncoder::ISO8859_8 flushCode; generateCode. |
|
159 |
CharacterEncoder::ISO8859_9 flushCode; generateCode. |
|
160 |
CharacterEncoder::ISO8859_10 flushCode; generateCode. |
|
161 |
CharacterEncoder::ISO8859_11 flushCode; generateCode. |
|
162 |
CharacterEncoder::ISO8859_13 flushCode; generateCode. |
|
163 |
CharacterEncoder::ISO8859_14 flushCode; generateCode. |
|
164 |
CharacterEncoder::ISO8859_15 flushCode; generateCode. |
|
165 |
CharacterEncoder::ISO8859_16 flushCode; generateCode. |
|
166 |
CharacterEncoder::KOI8_R flushCode; generateCode. |
|
167 |
CharacterEncoder::GSM0338 flushCode; generateCode. |
|
7909 | 168 |
|
8136 | 169 |
CharacterEncoder::KOI8_U flushCode; generateSubclassCode. |
7912 | 170 |
|
8136 | 171 |
CharacterEncoder::JIS0208 flushCode; generateCode. |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
172 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
173 |
! ! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
174 |
|
7971 | 175 |
!CharacterEncoder class methodsFor:'instance creation'! |
176 |
||
177 |
encoderFor:encodingNameSymbol
    "Answer an encoder instance for the encoding named encodingNameSymbol, which
     maps from/into unicode.
     If no encoder is found, a proceedable error is raised; when proceeded,
     NullEncoder is registered under that name and the shared NullEncoderInstance
     is answered."

    |whenMissing|

    whenMissing := [
        "/ proceed to ignore this error in the future.
        self error:('no encoder for ' , encodingNameSymbol) mayProceed:true.
        (EncoderClassesByName at:#'unicode') at:encodingNameSymbol put:NullEncoder.
        NullEncoderInstance
    ].
    ^ self encoderFor:encodingNameSymbol ifAbsent:whenMissing

    "
     CharacterEncoder encoderFor:#'latin1'
     self encoderFor:#'arabic'
     self encoderFor:#'ms-arabic'
     self encoderFor:#'iso8859-5'
     self encoderFor:#'koi8-r'
     self encoderFor:#'koi8-u'
     self encoderFor:#'jis0208'
     self encoderFor:#'jis7'
     self encoderFor:#'utf8'
    "
!
|
201 |
||
8168 | 202 |
encoderFor:encodingNameSymbolArg ifAbsent:exceptionValue
    "Given the name of an encoding, return an encoder instance which can map
     these from/into unicode; answer the value of exceptionValue if none is found.

     Lookup order:
       - nil answers the shared NullEncoderInstance;
       - #'iso10646-1' is treated as #'unicode'; the name is lowercased;
       - a name containing match characters is matched against the aliases of the
         already created encoders and then of the registered encoder classes;
       - otherwise the cached encoder (EncodersByName) or a new instance of the
         registered class (EncoderClassesByName) is answered and cached;
       - finally, an indirect path unicode -> intermediate -> name is searched and
         answered as a TwoStepEncoder."

    |encodingNameSymbol enc clsName cls lcName name unicodeEncoders unicodeEncoderClasses classFor|

    "/ registry entries are either classes or names of classes
    "/ in the CharacterEncoderImplementations namespace
    classFor := [:classOrName |
        classOrName isBehavior
            ifTrue:[ classOrName ]
            ifFalse:[ CharacterEncoderImplementations at:classOrName ]
    ].

    encodingNameSymbol := encodingNameSymbolArg.
    encodingNameSymbol isNil ifTrue:[ ^ NullEncoderInstance].

    encodingNameSymbol == #'iso10646-1' ifTrue:[ encodingNameSymbol := #'unicode'].
    encodingNameSymbol isSymbol ifFalse:[self halt:'symbol argument expected'. encodingNameSymbol := encodingNameSymbol asSymbol].

    lcName := encodingNameSymbol asLowercase.
    name := lcName asSymbolIfInterned.
    name isNil ifTrue:[name := lcName].

    name includesMatchCharacters ifTrue:[
        unicodeEncoders := EncodersByName at:#unicode ifAbsent:nil.
        unicodeEncoders notNil ifTrue:[
            unicodeEncoders keysAndValuesDo:[:eachEncodingAlias :eachEncoderInstance |
                (name matches:eachEncodingAlias) ifTrue:[
                    ^ eachEncoderInstance.
                ].
            ].
        ].

        "/ ifAbsent: added, so that a missing table is handled by the nil-check below
        unicodeEncoderClasses := EncoderClassesByName at:#unicode ifAbsent:nil.
        unicodeEncoderClasses notNil ifTrue:[
            unicodeEncoderClasses keysAndValuesDo:[:eachEncodingAlias :eachEncoderClassOrName |
                (name matches:eachEncodingAlias) ifTrue:[
                    cls := classFor value:eachEncoderClassOrName.
                    cls notNil ifTrue:[
                        ^ cls new.
                    ]
                ].
            ].
        ].
        ^ exceptionValue value
    ].

    AccessLock critical:[
        unicodeEncoders := EncodersByName at:#unicode ifAbsent:nil.
        unicodeEncoders isNil ifTrue:[
            EncodersByName at:#unicode put:(unicodeEncoders := Dictionary new).
        ].
        enc := unicodeEncoders at:name ifAbsent:nil.

        enc isNil ifTrue:[
            unicodeEncoderClasses := EncoderClassesByName at:#unicode ifAbsent:nil.
            unicodeEncoderClasses isNil ifTrue:[
                EncoderClassesByName at:#unicode put:(unicodeEncoderClasses := Dictionary new).
            ].
            clsName := unicodeEncoderClasses at:name ifAbsent:nil.
            clsName notNil ifTrue:[
                cls := classFor value:clsName.
                cls notNil ifTrue:[
                    enc := cls new.
                    unicodeEncoders at:name put:enc.
                ]
            ].
        ].
    ].
    enc notNil ifTrue:[
        ^ enc
    ].

    "/ no direct encoder from unicode->name
    "/ search for unicode->any and: any->name
    unicodeEncoderClasses := EncoderClassesByName at:#unicode ifAbsent:nil.
    unicodeEncoderClasses keysAndValuesDo:[:eachEncodingAlias :eachEncoderClass |
        |dict2 enc1 enc2|

        dict2 := EncoderClassesByName at:eachEncodingAlias ifAbsent:nil.
        dict2 notNil ifTrue:[
            clsName := dict2 at:name ifAbsent:nil.
            clsName notNil ifTrue:[
                cls := classFor value:clsName.
                cls notNil ifTrue:[
                    enc2 := cls new.
                    enc1 := self encoderFor:eachEncodingAlias.
                    (enc1 notNil and:[enc2 notNil]) ifTrue:[
                        enc := TwoStepEncoder new encoder1:enc1 encoder2:enc2.
                        AccessLock critical:[
                            unicodeEncoders at:name put:enc.
                        ].
                        ^ enc.
                    ]
                ]
            ]
        ].
    ].

    "/ same search, driven from the any->name side of the registry.
    "/ (previously, enc1/enc2 were never assigned here, so the TwoStepEncoder
    "/  was built from two nils; leftover 'sjis' debugging halts were removed)
    EncoderClassesByName keysAndValuesDo:[:encoding1 :dict1 |
        dict1 keysAndValuesDo:[:encoding2 :clsName1|
            |clsName2 cls1 cls2 enc1 enc2|

            encoding2 = encodingNameSymbol ifTrue:[
                clsName2 := unicodeEncoderClasses at:encoding1 ifAbsent:nil.
                clsName2 notNil ifTrue:[
                    cls1 := classFor value:clsName1.
                    cls2 := classFor value:clsName2.
                    (cls1 notNil and:[cls2 notNil]) ifTrue:[
                        "/ same order as in the loop above:
                        "/ encoder1 is unicode->encoding1, encoder2 is encoding1->name
                        enc1 := cls2 new.
                        enc2 := cls1 new.
                        enc := TwoStepEncoder new encoder1:enc1 encoder2:enc2.
                        ^ enc.
                    ].
                ]
            ]
        ]
    ].

    ^ exceptionValue value

    "
     CharacterEncoder encoderFor:#'latin1'
     self encoderFor:#'arabic'
     self encoderFor:#'ms-arabic'
     self encoderFor:#'iso8859-5'
     self encoderFor:#'koi8-r'
     self encoderFor:#'koi8-u'
     self encoderFor:#'jis0208'
     self encoderFor:#'jis7'
     self encoderFor:#'unicode'
    "
!
|
349 |
||
350 |
encoderToEncodeFrom:oldEncodingArg into:newEncodingArg
    "Answer an encoder which converts text from oldEncodingArg into newEncodingArg.
     A nil argument and #'iso10646-1' both stand for #'unicode'.
     Answers the shared NullEncoderInstance if both encodings are the same (or match).
     Encoders are cached per (oldEncoding, newEncoding) pair in EncodersByName.
     If no direct encoder class is registered, the conversion is composed via
     unicode (InverseEncoder / CompoundEncoder)."

    |oldEncoding newEncoding encoders encoderClasses encoder decoder clsName cls|

    oldEncoding := oldEncodingArg ? #'unicode'.
    oldEncoding == #'iso10646-1' ifTrue:[ oldEncoding := #'unicode'].
    newEncoding := newEncodingArg ? #'unicode'.
    newEncoding == #'iso10646-1' ifTrue:[ newEncoding := #'unicode'].

    oldEncoding isSymbol ifFalse:[self halt:'symbol argument expected'. oldEncoding := oldEncoding asSymbol].
    newEncoding isSymbol ifFalse:[self halt:'symbol argument expected'. newEncoding := newEncoding asSymbol].

    oldEncoding == newEncoding ifTrue:[^ NullEncoderInstance].
    (oldEncoding match:newEncoding) ifTrue:[^ NullEncoderInstance].

    (oldEncoding == #unicode) ifTrue:[
        "/ unicode -> something
        ^ self encoderFor:newEncoding.
    ].

    AccessLock critical:[
        encoders := EncodersByName at:oldEncoding ifAbsent:nil.
        encoders isNil ifTrue:[
            EncodersByName at:oldEncoding put:(encoders := Dictionary new).
        ].
        "/ look up under the normalized name - that is the key used when the
        "/ encoder is stored below (was: the raw newEncodingArg, which missed the
        "/ cache for nil, #'iso10646-1' and non-symbol arguments)
        encoder := encoders at:newEncoding ifAbsent:nil.
        encoder isNil ifTrue:[
            encoderClasses := EncoderClassesByName at:oldEncoding ifAbsent:nil.
            encoderClasses isNil ifTrue:[
                EncoderClassesByName at:oldEncoding put:(encoderClasses := Dictionary new).
            ].
            clsName := encoderClasses at:newEncoding ifAbsent:nil.
            clsName notNil ifTrue:[
                "/ the registry holds either the class itself or the name of a
                "/ class in the CharacterEncoderImplementations namespace
                clsName isBehavior ifTrue:[
                    encoder := clsName new
                ] ifFalse:[
                    cls := CharacterEncoderImplementations at:clsName.
                    cls notNil ifTrue:[
                        encoder := cls new.
                    ]
                ]
            ].
        ].
    ].

    encoder isNil ifTrue:[
        (newEncoding == #unicode) ifTrue:[
            "/ something -> unicode
            decoder := self encoderFor:oldEncoding.
            encoder := InverseEncoder new decoder:decoder.
        ] ifFalse:[
            "/ do it as: oldEncoding -> unicode -> newEncoding

            "/ something -> unicode
            decoder := self encoderFor:oldEncoding.

            "/ unicode -> something
            encoder := self encoderFor:newEncoding.
            encoder := CompoundEncoder new encoder:encoder decoder:decoder.
        ].
    ].

    AccessLock critical:[
        (EncodersByName at:oldEncoding) at:newEncoding put:encoder
    ].
    ^ encoder

    " CharacterEncoder initialize
     CharacterEncoder encoderToEncodeFrom:#'latin1' into:#'jis7'
     CharacterEncoder encoderToEncodeFrom:#'koi8-r' into:#'mac-cyrillic'
     CharacterEncoder encoderToEncodeFrom:#'ms-arabic' into:#'mac-arabic'
     CharacterEncoder encoderToEncodeFrom:#'iso8859-5' into:#'koi8-r'
     CharacterEncoder encoderToEncodeFrom:#'koi8-r' into:#'koi8-u'
    "
! !
|
424 |
||
7932 | 425 |
!CharacterEncoder class methodsFor:'Compatibility-ST80'! |
426 |
||
427 |
encoderNamed: encoderName
    "/ q & d hack
    "Answer a new DefaultEncoder when asked for #default.
     Any other name halts; if execution continues, a plain new instance
     of the receiver is answered."

    encoderName == #default ifFalse:[
        self halt.
        ^ self new
    ].
    ^ DefaultEncoder new
!
|
436 |
||
437 |
platformName
    "Answer the platform name as reported by OperatingSystem."

    ^ OperatingSystem platformName

    "Created: 20.6.1997 / 17:34:03 / cg"
    "Modified: 20.6.1997 / 17:38:40 / cg"
! !
|
443 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
444 |
!CharacterEncoder class methodsFor:'class initialization'! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
445 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
446 |
initialize |
8154 | 447 |
|ud| |
448 |
||
8151 | 449 |
AccessLock := RecursionLock new. |
8118 | 450 |
NullEncoderInstance := NullEncoder new. |
7973 | 451 |
|
8126 | 452 |
EncodersByName := Dictionary new. |
453 |
EncoderClassesByName := Dictionary new. |
|
454 |
CachedEncoders := Dictionary new. |
|
7972 | 455 |
|
8154 | 456 |
EncoderClassesByName at:#'unicode' put:(ud := Dictionary new). |
457 |
ud at:#'fontspecific' put:NullEncoder. |
|
458 |
ud at:#'adobe-fontspecific' put:NullEncoder. |
|
8190 | 459 |
ud at:#'ms-oem' put:NullEncoder. |
8152 | 460 |
|
8135 | 461 |
"/ className decoded-name array-of-encodingNames |
8118 | 462 |
#( |
8151 | 463 |
(ASCII unicode ( ascii 'us-ascii' 'iso-ir-6' 'ibm-367' 'ms-cp367' 'cp367' 'iso646-us' 'ibm-cp367' )) |
8118 | 464 |
|
8151 | 465 |
(BIG5 unicode ( big5 )) |
8118 | 466 |
|
8151 | 467 |
(CNS11643 unicode ( 'cns11643' )) |
8118 | 468 |
|
8151 | 469 |
(CP437 unicode ( 'cp437' 'cp-437' 'ibm-437' 'ms-cp437' 'microsoft-cp437' 'ibm-cp437' )) |
8118 | 470 |
|
8151 | 471 |
(GB2313_1980 unicode ( 'gb2313' 'gb2313-1980' )) |
8118 | 472 |
|
8151 | 473 |
(HANGUL unicode ( 'hangul' )) |
8118 | 474 |
|
8151 | 475 |
(ISO10646_1 unicode ( unicode 'iso10646_1' 'iso10646-1' 'iso-10646-1' )) |
8118 | 476 |
|
8151 | 477 |
(ISO10646_to_UTF8 unicode ( utf8 'utf-8' )) |
8118 | 478 |
|
8151 | 479 |
(ISO8859_1 unicode ( 'iso8859_1' 'iso8859-1' 'iso-8859-1' 'latin-1' 'latin1' 'iso-ir-100' 'ibm-819' 'ms-cp819' 'ibm-cp819' )) |
8118 | 480 |
|
8151 | 481 |
(ISO8859_2 unicode ( 'iso8859_2' 'iso8859-2' 'iso-8859-2' 'latin2' 'latin-2' 'iso-ir-101')) |
8118 | 482 |
|
8151 | 483 |
(ISO8859_3 unicode ( 'iso8859_3' 'iso8859-3' 'iso-8859-3' 'latin3' 'latin-3' 'iso-ir-109')) |
8118 | 484 |
|
8151 | 485 |
(ISO8859_4 unicode ( 'iso8859_4' 'iso8859-4' 'iso-8859-4' 'latin4' 'latin-4' 'iso-ir-110')) |
8118 | 486 |
|
8151 | 487 |
(ISO8859_5 unicode ( 'iso8859_5' 'iso8859-5' 'iso-8859-5' 'cyrillic' 'iso-ir-144' )) |
8118 | 488 |
|
8151 | 489 |
(ISO8859_6 unicode ( 'iso8859_6' 'iso8859-6' 'iso-8859-6' 'arabic' 'asmo-708' 'ecma-114' 'iso-ir-127' )) |
8118 | 490 |
|
8151 | 491 |
(ISO8859_7 unicode ( 'iso8859_7' 'iso8859-7' 'iso-8859-7' 'greek' 'iso-ir-126' 'ecma-118')) |
8118 | 492 |
|
8151 | 493 |
(ISO8859_8 unicode ( 'iso8859_8' 'iso8859-8' 'iso-8859-8' 'hebrew' 'iso-ir-138' )) |
8118 | 494 |
|
8151 | 495 |
(ISO8859_9 unicode ( 'iso8859_9' 'iso8859-9' 'iso-8859-9' 'latin5' 'latin-5' 'iso-ir-148')) |
8118 | 496 |
|
8151 | 497 |
(ISO8859_10 unicode ( 'iso8859_10' 'iso8859-10' 'iso-8859-10' 'latin6' 'latin-6' 'iso-ir-157')) |
8118 | 498 |
|
8151 | 499 |
(ISO8859_11 unicode ( 'iso8859_11' 'iso8859-11' 'iso-8859-11' 'thai' )) |
8118 | 500 |
|
8151 | 501 |
(ISO8859_13 unicode ( 'iso8859_13' 'iso8859-13' 'iso-8859-13' 'latin7' 'latin-7' )) |
8118 | 502 |
|
8151 | 503 |
(ISO8859_14 unicode ( 'iso8859_14' 'iso8859-14' 'iso-8859-14' 'latin8' 'latin-8' 'latin-celtic' )) |
8118 | 504 |
|
8151 | 505 |
(ISO8859_15 unicode ( 'iso8859_15' 'iso8859-15' 'iso-8859-15' 'latin9' 'latin-9' 'iso-ir-203')) |
8118 | 506 |
|
8151 | 507 |
(ISO8859_16 unicode ( 'iso8859_16' 'iso8859-16' 'iso-8859-16' 'latin10' 'latin-10' )) |
8118 | 508 |
|
8151 | 509 |
(JIS0201 unicode ( 'jis0201' #'jisx0201.1976-0')) |
8118 | 510 |
|
8151 | 511 |
(JIS0208 unicode ( jis0208 'jisx0208' 'jisx0208.1983-0' 'jisx0208.1990-0')) |
8118 | 512 |
|
8151 | 513 |
(JIS0208_to_JIS7 jis0208 ( jis7 'jis-7' 'x-jis7' 'x-iso2022-jp' 'iso2022-jp')) |
8118 | 514 |
|
8151 | 515 |
(JIS0208_to_EUC jis0208 ( euc #'x-euc-jp' )) |
8122 | 516 |
|
8176 | 517 |
(JIS0208_to_SJIS jis0208 ( 'sjis' 'shiftjis' 'x-sjis' #'x-shift-jis' #'shift-jis')) |
518 |
||
8151 | 519 |
(JIS0212 unicode ( 'jis0212' )) |
8118 | 520 |
|
8151 | 521 |
(JOHAB unicode ( 'johab' )) |
8118 | 522 |
|
8151 | 523 |
(KOI7 unicode ( 'koi7' )) |
8118 | 524 |
|
8151 | 525 |
(KOI8_R unicode ( #'koi8-r' 'cp878' )) |
8118 | 526 |
|
8151 | 527 |
(KOI8_U unicode ( #'koi8-u' )) |
8118 | 528 |
|
8151 | 529 |
(KSC5601 unicode ( #'ksc5601' )) |
8118 | 530 |
|
8151 | 531 |
(MAC_Arabic unicode ( #'mac-arabic' 'macarabic' )) |
8118 | 532 |
|
8151 | 533 |
(MAC_CentralEuropean unicode ( #'mac-centraleuropean' #'mac-centraleurope' 'maccentraleurope' 'maccentraleuropean' )) |
8118 | 534 |
|
8151 | 535 |
(MAC_Croatian unicode ( #'mac-croatian' 'maccroatian')) |
8118 | 536 |
|
8151 | 537 |
(MAC_Cyrillic unicode ( #'mac-cyrillic' 'maccyrillic' )) |
8118 | 538 |
|
8151 | 539 |
(MAC_Dingbats unicode ( #'mac-dingbats' 'macdingbats' 'macdingbat')) |
8118 | 540 |
|
8151 | 541 |
(MAC_Farsi unicode ( #'mac-farsi' 'macfarsi' )) |
8118 | 542 |
|
8151 | 543 |
(MAC_Greek unicode ( #'mac-greek' #'macgreek' )) |
8118 | 544 |
|
8151 | 545 |
(MAC_Hebrew unicode ( #'mac-hebrew' #'machebrew' )) |
8118 | 546 |
|
8151 | 547 |
(MAC_Iceland unicode ( #'mac-iceland' #'maciceland' )) |
8118 | 548 |
|
8151 | 549 |
(MAC_Japanese unicode ( #'mac-japanese' #'macjapanese' )) |
8118 | 550 |
|
8151 | 551 |
(MAC_Korean unicode ( #'mac-korean' #'mackorean' )) |
8118 | 552 |
|
8151 | 553 |
(MAC_Roman unicode ( #'mac-roman' #'macroman' )) |
8118 | 554 |
|
8151 | 555 |
(MAC_Romanian unicode ( #'mac-romanian' #'macromanian' )) |
8118 | 556 |
|
8151 | 557 |
(MAC_Symbol unicode ( #'mac-symbol' #'macsymbol' )) |
8118 | 558 |
|
8151 | 559 |
(MAC_Thai unicode ( #'mac-thai' #'macthai' )) |
8118 | 560 |
|
8151 | 561 |
(MAC_Turkish unicode ( #'mac-turkish' #'macturkish' )) |
8118 | 562 |
|
8151 | 563 |
(MS_Ansi unicode ( #'ms-ansi' 'ms-cp1252' 'microsoft-cp1252' 'cp1252' 'microsoft-ansi' 'windows-1252' 'windows-latin1')) |
8118 | 564 |
|
8151 | 565 |
(MS_Arabic unicode ( 'ms-arabic' 'ms-cp1256' 'microsoft-cp1256' 'cp1256' 'microsoft-arabic' 'windows-1256' )) |
8118 | 566 |
|
8151 | 567 |
(MS_Baltic unicode ( 'ms-baltic' 'ms-cp1257' 'microsoft-cp1257' 'cp1257' 'microsoft-baltic' 'windows-1257' )) |
8118 | 568 |
|
8151 | 569 |
(MS_Cyrillic unicode ( 'ms-cyrillic' 'ms-cp1251' 'microsoft-cp1251' 'cp1251' 'microsoft-cyrillic' 'windows-1251' )) |
8118 | 570 |
|
8151 | 571 |
(MS_EastEuropean unicode ( 'ms-easteuropean' 'ms-ee' 'cp1250' 'ms-cp1250' 'microsoft-cp1250' 'microsoft-easteuropean' 'windows-1250' )) |
8118 | 572 |
|
8151 | 573 |
(MS_Greek unicode ( 'ms-greek' 'ms-cp1253' 'microsoft-cp1253' 'cp1253' 'microsoft-greek' 'windows-1253' )) |
8118 | 574 |
|
8151 | 575 |
(MS_Hebrew unicode ( 'ms-hebrew' 'ms-cp1255' 'microsoft-cp1255' 'cp1255' 'microsoft-hebrew' 'windows-1255' )) |
8118 | 576 |
|
577 |
"/ (MS_Symbol unicode ( 'ms-symbol' 'microsoft-symbol' )) |
|
578 |
||
8151 | 579 |
(MS_Turkish unicode ( 'ms-turkish' 'ms-cp1254' 'microsoft-cp1254' 'cp1254' 'microsoft-turkish' 'windows-1254' )) |
8118 | 580 |
|
8151 | 581 |
(NEXT unicode ( 'next' 'nextstep' )) |
8186 | 582 |
|
8187 | 583 |
(ISO10646_to_SGML unicode ( 'sgml' )) |
8118 | 584 |
) triplesDo:[:className :decodesTo :encodesTo | |
8151 | 585 |
|dict| |
8134 | 586 |
|
8151 | 587 |
"/ notice that the encoders are not yet installed as autoloaded. |
588 |
"/ Therefore, we remember their names here. |
|
589 |
dict := EncoderClassesByName at:decodesTo ifAbsent:nil. |
|
590 |
dict isNil ifTrue:[ |
|
591 |
EncoderClassesByName at:decodesTo put:(dict := Dictionary new). |
|
592 |
]. |
|
593 |
encodesTo do:[:eachEncodingAlias | |
|
594 |
(dict includesKey:eachEncodingAlias) ifTrue:[ |
|
595 |
self halt:'conflicting alias' |
|
596 |
]. |
|
597 |
dict at:eachEncodingAlias put:className. |
|
598 |
]. |
|
8118 | 599 |
]. |
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
600 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
601 |
" |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
602 |
self initialize |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
603 |
" |
7892 | 604 |
! ! |
605 |
||
8122 | 606 |
!CharacterEncoder class methodsFor:'constants'! |
607 |
||
608 |
jis7KanjiEscapeSequence
    "Answer the escape sequence (ESC followed by '$B') used to switch to kanji
     in jis7 encoded strings. ISO2022-JP happens to use the same sequence.
     The string is built on first use and remembered in Jis7KanjiEscapeSequence."

    Jis7KanjiEscapeSequence notNil ifTrue:[^ Jis7KanjiEscapeSequence].
    ^ Jis7KanjiEscapeSequence := Character esc asString , '$B'

    "Created: 26.2.1996 / 17:38:08 / cg"
    "Modified: 30.6.1997 / 16:03:16 / cg"
|
619 |
! |
|
620 |
||
621 |
jis7KanjiOldEscapeSequence
    "Answer the escape sequence (ESC followed by '$@') used to switch to kanji
     in some old jis7 encoded strings.
     The string is built on first use and remembered in Jis7KanjiOldEscapeSequence."

    Jis7KanjiOldEscapeSequence isNil ifTrue:[
        "/ exactly one statement terminator here - the former double period
        "/ left an empty statement behind
        Jis7KanjiOldEscapeSequence := Character esc asString , '$@'.
    ].
    ^ Jis7KanjiOldEscapeSequence
|
628 |
! |
|
629 |
||
630 |
jis7RomanEscapeSequence
    "Answer the escape sequence (ESC followed by '(J') used to switch to roman
     in jis7 encoded strings.
     The string is built on first use and remembered in Jis7RomanEscapeSequence."

    Jis7RomanEscapeSequence notNil ifTrue:[^ Jis7RomanEscapeSequence].
    ^ Jis7RomanEscapeSequence := Character esc asString , '(J'

    "Created: 26.2.1996 / 17:38:08 / cg"
    "Modified: 30.6.1997 / 16:03:16 / cg"
|
640 |
! |
|
641 |
||
642 |
jisISO2022EscapeSequence
    "Answer the escape sequence used to switch to kanji in iso2022 encoded strings:
     ESC '&@' immediately followed by ESC '$B'.
     The string is built on first use and remembered in JisISO2022EscapeSequence."

    |esc|

    JisISO2022EscapeSequence notNil ifTrue:[^ JisISO2022EscapeSequence].
    esc := Character esc asString.
    ^ JisISO2022EscapeSequence := esc , '&@' , esc , '$B'
|
649 |
! ! |
|
650 |
||
7892 | 651 |
!CharacterEncoder class methodsFor:'encoding & decoding'! |
652 |
||
653 |
decode:aCodePoint
    "Convenience: decode a single code point using a freshly created
     instance of the receiver."

    |coder|

    coder := self new.
    ^ coder decode:aCodePoint
|
655 |
! |
|
656 |
||
657 |
decodeString:aString
    "Convenience: decode aString using a freshly created instance of the receiver."

    |coder|

    coder := self new.
    ^ coder decodeString:aString
|
659 |
! |
|
660 |
||
7972 | 661 |
decodeString:aString from:oldEncoding
    "Answer aString, which is given in oldEncoding, converted to unicode.
     Simply forwards to #encodeString:from:into: with #unicode as the target."

    ^ self
        encodeString:aString
        from:oldEncoding
        into:#unicode
7967 | 663 |
! |
664 |
||
7892 | 665 |
encode:aCodePoint
    "Convenience: encode a single code point using a freshly created
     instance of the receiver."

    |coder|

    coder := self new.
    ^ coder encode:aCodePoint

    "
     ISO8859_1 encode:16r00FF
     ISO8859_1 decodeString:'hello'
     ISO8859_1 encodeString:(ISO8859_1 decodeString:'hello')

     ISO8859_5 decodeString:(String
                                with:(Character value:16rE4)
                                with:(Character value:16rE0))
    "
677 |
! |
|
678 |
||
7994 | 679 |
encode:codePoint from:oldEncodingArg into:newEncodingArg
    "Answer codePoint translated from oldEncodingArg into newEncodingArg.
     A nil encoding argument and #'iso10646-1' are both treated as #unicode.
     The code point is answered unchanged if both encodings are identical, or if
     the translation is between unicode and iso8859-1 (in either direction) and
     the code point is not above 16rFF.
     Everything else is handed to the encoder answered by #encoderToEncodeFrom:into:."

    |source target|

    source := oldEncodingArg ? #unicode.
    source == #'iso10646-1' ifTrue:[source := #unicode].
    target := newEncodingArg ? #unicode.
    target == #'iso10646-1' ifTrue:[target := #unicode].

    source == target ifTrue:[^ codePoint].

    "/ shortcuts which avoid fetching an encoder
    ((source == #unicode)
     and:[(target == #'iso8859-1') and:[codePoint <= 16rFF]]) ifTrue:[
        ^ codePoint
    ].
    ((target == #unicode)
     and:[(source == #'iso8859-1') and:[codePoint <= 16rFF]]) ifTrue:[
        ^ codePoint
    ].

    ^ (self encoderToEncodeFrom:source into:target) encode:codePoint
7994 | 705 |
! |
706 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
707 |
encodeString:aUnicodeString
    "given a string in unicode, return a string in my encoding for it.
     Convenience: the work is done by a freshly created instance of the receiver."

    |coder|

    coder := self new.
    ^ coder encodeString:aUnicodeString

    "
     ISO8859_1 encodeString:'hello'
    "
7914 | 715 |
! |
716 |
||
7967 | 717 |
encodeString:aString from:oldEncodingArg into:newEncodingArg
    "Answer aString translated from oldEncodingArg into newEncodingArg.
     A nil encoding argument and #'iso10646-1' are both treated as #unicode.
     aString itself is answered if both encodings are identical, or if the
     translation is between unicode and iso8859-1 (in either direction) and
     aString uses 8 bits per character.
     Everything else is handed to the encoder answered by #encoderToEncodeFrom:into:."

    |source target|

    source := oldEncodingArg ? #unicode.
    source == #'iso10646-1' ifTrue:[source := #unicode].
    target := newEncodingArg ? #unicode.
    target == #'iso10646-1' ifTrue:[target := #unicode].

    source == target ifTrue:[^ aString].

    "/ shortcuts which avoid fetching an encoder
    ((source == #unicode)
     and:[(target == #'iso8859-1') and:[aString bitsPerCharacter == 8]]) ifTrue:[
        ^ aString
    ].
    ((target == #unicode)
     and:[(source == #'iso8859-1') and:[aString bitsPerCharacter == 8]]) ifTrue:[
        ^ aString
    ].

    ^ (self encoderToEncodeFrom:source into:target) encodeString:aString
7972 | 744 |
! |
745 |
||
746 |
encodeString:aString into:newEncoding
    "Answer the unicode string aString translated into newEncoding.
     Simply forwards to #encodeString:from:into: with #unicode as the source."

    ^ self
        encodeString:aString
        from:#unicode
        into:newEncoding
7892 | 748 |
! ! |
749 |
||
750 |
!CharacterEncoder class methodsFor:'private'! |
|
751 |
||
752 |
flushCode
    "Re-run the class initialization and, for a non-abstract receiver which
     names at least one map file, remove the #mapping method from the
     receiver's metaclass."

    self initialize.

    self isAbstract ifTrue:[^ self].
    (self mapFileURL1_relativePathName isNil
     and:[self mapFileURL2_relativePathName isNil]) ifTrue:[^ self].

    self class removeSelector:#mapping.

    "
     self flushCode
    "
|
765 |
! ! |
|
766 |
||
767 |
!CharacterEncoder class methodsFor:'private-mapping setup'! |
|
768 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
769 |
generateCode
    "Let a CharacterEncoderCodeGenerator, targeted at the receiver, generate code."

    |generator|

    generator := CharacterEncoderCodeGenerator new targetClass:self.
    generator generateCode.
771 |
! |
|
772 |
||
773 |
generateSubclassCode
    "Let a CharacterEncoderCodeGenerator, targeted at the receiver, generate subclass code."

    |generator|

    generator := CharacterEncoderCodeGenerator new targetClass:self.
    generator generateSubclassCode.
|
7892 | 775 |
! |
776 |
||
7914 | 777 |
mapFileURL1_codeColumn
    "Answer the code column for the first map file; 1 here.
     NOTE(review): presumably the column of the map file which holds this
     encoding's code - confirm against CharacterEncoderCodeGenerator."

    ^ 1
|
779 |
! |
|
780 |
||
7912 | 781 |
mapFileURL1_relativePathName
    "Answer the path of the receiver's first map file, relative to the base URL
     used by #mappingURL1. Answers nil here (no map file); no error is raised.
     Concrete subclasses with a map file redefine this."

    ^ nil
|
785 |
! |
|
786 |
||
787 |
mapFileURL2_relativePathName
    "Answer the path of the receiver's second map file, relative to the base URL
     used by #mappingURL2. Answers nil here (no map file); no error is raised.
     Concrete subclasses with a map file redefine this."

    ^ nil
|
791 |
! |
|
792 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
793 |
mappingURL1
    "Answer the URL of the receiver's first map file below
     http://www.unicode.org/Public/MAPPINGS/, or nil if
     #mapFileURL1_relativePathName answers nil."

    |relativePath|

    relativePath := self mapFileURL1_relativePathName.
    ^ relativePath isNil
        ifTrue:[nil]
        ifFalse:['http://www.unicode.org/Public/MAPPINGS/' , relativePath]
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
803 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
804 |
|
7892 | 805 |
mappingURL2
    "Answer the URL of the receiver's second map file below
     http://std.dkuug.dk/i18n/charmaps/, or nil if
     #mapFileURL2_relativePathName answers nil."

    |relativePath|

    relativePath := self mapFileURL2_relativePathName.
    ^ relativePath isNil
        ifTrue:[nil]
        ifFalse:['http://std.dkuug.dk/i18n/charmaps/' , relativePath]
7892 | 815 |
! ! |
816 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
817 |
!CharacterEncoder class methodsFor:'queries'! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
818 |
|
7938 | 819 |
isEncoding:subSetEncodingArg subSetOf:superSetEncodingArg
    "return true, if superSetEncoding encoding includes all characters of subSetEncoding.
     (this means: characters are included - not that they have the same encoding).
     The decision is made from the lowercased encoding names only, using a fixed
     set of name patterns; the mapping tables are not consulted."

    |subSetEncoding superSetEncoding|

    subSetEncodingArg = superSetEncodingArg ifTrue:[^ true].

    subSetEncoding := subSetEncodingArg asLowercase.
    superSetEncoding := superSetEncodingArg asLowercase.

    (subSetEncoding match:superSetEncoding) ifTrue:[^ true].

    "/ unicode is taken to be a superset of all of the following
    (('iso10646*' match:superSetEncoding) or:[superSetEncoding = 'unicode']) ifTrue:[
        #('ms-*' 'ascii*' 'iso8859*' 'jis*' 'koi8*' 'ksc*' 'big*' 'cns*' 'gb2312*')
            do:[:eachPattern |
                (eachPattern match:subSetEncoding) ifTrue:[^ true]
            ]
    ].

    "/ if the subSet is iso8859-*, that means ascii (i.e. the lower 7 bits of iso8859 only).
    ((subSetEncoding = 'iso8859*') or:[subSetEncoding = 'iso8859-*']) ifTrue:[
        (('ascii*' match:superSetEncoding)
         or:['ms-ansi*' match:superSetEncoding]) ifTrue:[^ true]
    ].
    (subSetEncoding = 'ascii') ifTrue:[
        (('iso8859*' match:superSetEncoding)
         or:['ms-ansi*' match:superSetEncoding]) ifTrue:[^ true]
    ].

    "/ TODO: check the charSets mappingTables...
    ^ false
|
856 |
! |
|
857 |
||
7919 | 858 |
nameOfDecodedCode
    "Answer the name of the code which results from decoding; #unicode here.
     Most coders decode from their code into unicode / encode from unicode into their code.
     There are a few exceptions to this, though - these must redefine this."

    ^ #'unicode'
|
863 |
! |
|
864 |
||
865 |
nameOfEncoding
    "Answer the name of the receiver's encoding as a symbol: the class name
     without prefix, in lowercase, with every underscore replaced by a dash."

    |baseName|

    baseName := self nameWithoutPrefix asLowercase.
    ^ (baseName copyReplaceAll:$_ with:$-) asSymbol
7919 | 867 |
! |
868 |
||
7959 | 869 |
supportedExternalEncodings
    "Answer an array of two-element arrays naming the encodings which are
     supported for external resources (i.e. files).
     In each entry, the first element is the internally used symbolic name
     and the second element is a user-readable description.
     More than one external name may be mapped onto the same symbolic name.
     Entries which are commented out are not offered."

    ^ #(
        ('utf8' 'Unicode as 8Bit characters' )
"/        ('utf7' 'Unicode as 7Bit characters' )
"/        nil
        ('ascii' 'Common 7bit subset of iso8859' )
        ('iso8859-1' 'Latin1' )
        ('iso8859-2' 'Latin2' )
        ('iso8859-3' 'Latin3' )
        ('iso8859-4' 'Latin4' )
        ('iso8859-5' 'Cyrillic' )
        ('iso8859-6' 'Arabic' )
        ('iso8859-7' 'Greek' )
        ('iso8859-8' 'Hebrew' )
"/        nil
        ('koi7' 'Cyrillic (Old)' )
        ('koi8-r' 'Cyrillic' )
        ('koi8-u' 'Cyrillic (Ukraine)' )
"/        nil
        ('cp437' 'msdos US / codepage 437' )
        ('cp850' 'msdos Latin1 codepage 850' )
"/        ('mac' 'macintosh 8 bit' )
        ('next' 'NeXT 8 bit' )
"/        ('hp' 'hpux 8 bit' )
"/        nil
        ('euc' 'EUC - extended unix code (japanese)' )
        ('jis7' 'JIS7 - jis 7bit escape codes (japanese)' )
        ('iso-2022-jp' 'Same as jis 7bit' )
        ('sjis' 'SJIS - shift jis 8bit codes (japanese)' )
"/        nil
        ('gb' 'GB - mainland china' )
        ('big5' 'BIG5 - taiwan' )
"/        ('ksc' 'korean' )
        ('sgml' 'SGML (XML/HTML) character escapes' )
    )
910 |
! |
|
911 |
||
7947 | 912 |
userFriendlyNameOfEncoding
    "Answer the receiver's encoding name with its first character in uppercase."

    |encodingName|

    encodingName := self nameOfEncoding.
    ^ encodingName asUppercaseFirst
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
914 |
! ! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
915 |
|
7912 | 916 |
!CharacterEncoder class methodsFor:'testing'! |
917 |
||
918 |
isAbstract
    "Answer true only if the receiver is CharacterEncoder itself;
     every other class answers false."

    ^ self == CharacterEncoder
|
920 |
! ! |
|
921 |
||
7892 | 922 |
!CharacterEncoder methodsFor:'encoding & decoding'! |
923 |
||
924 |
decode:anEncoding
    "given an integer in my encoding, return a unicode codePoint for it.
     Not implemented here - this is the responsibility of subclasses."

    self subclassResponsibility
7892 | 928 |
! |
929 |
||
930 |
decodeString:anEncodedString
    "given a string in my encoding, return a unicode-string for it.
     Each character is decoded individually via #decode:. The result starts as
     a String and is replaced by a Unicode16String or Unicode32String as soon as
     a decoded code above 16rFF or 16rFFFF has to be stored."

    |decoded width code|

    decoded := String new:(anEncodedString size).
    width := decoded bitsPerCharacter.

    1 to:anEncodedString size do:[:index |
        code := self decode:(anEncodedString at:index) codePoint.

        "/ widen the result before storing a code which would not fit
        ((code > 16rFFFF) and:[width < 32]) ifTrue:[
            decoded := Unicode32String fromString:decoded.
            width := 32.
        ] ifFalse:[
            ((code > 16rFF) and:[width < 16]) ifTrue:[
                decoded := Unicode16String fromString:decoded.
                width := 16.
            ]
        ].
        decoded at:index put:(Character value:code).
    ].
    ^ decoded

    "
     ISO8859_1 decodeString:'hello'
    "
|
961 |
! |
|
962 |
||
963 |
encode:aCodePoint
    "given a codePoint in unicode, return a byte in my encoding for it.
     Not implemented here - this is the responsibility of subclasses."

    self subclassResponsibility
7892 | 967 |
! |
968 |
||
969 |
encodeString:aUnicodeString
    "given a string in unicode, return a string in my encoding for it.
     Each character is encoded individually via #encode:. The result starts as
     a String and is replaced by a Unicode16String or Unicode32String as soon as
     an encoded value above 16rFF or 16rFFFF has to be stored."

    |encoded width code|

    encoded := String new:(aUnicodeString size).
    width := encoded bitsPerCharacter.

    1 to:aUnicodeString size do:[:index |
        code := self encode:(aUnicodeString at:index) codePoint.

        "/ widen the result before storing a code which would not fit
        ((code > 16rFFFF) and:[width < 32]) ifTrue:[
            encoded := Unicode32String fromString:encoded.
            width := 32.
        ] ifFalse:[
            ((code > 16rFF) and:[width < 16]) ifTrue:[
                encoded := Unicode16String fromString:encoded.
                width := 16.
            ]
        ].
        encoded at:index put:(Character value:code).
    ].
    ^ encoded
|
7892 | 996 |
! ! |
997 |
||
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
998 |
!CharacterEncoder methodsFor:'error handling'! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
999 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1000 |
decodingError
    "report an error that there is no unicode-codePoint for a given codePoint in this encoding.
     (which is unlikely) or that the encoding is undefined for that value
     (for example, holes in the ISO8859-3 encoding).

     Raises an EncodingError as a request, with #defaultDecoderValue as its
     default value, and answers whatever the request answers.
     The error's parameter is the first argument of the sending method (the
     home method, if the sender is a block), or nil if that method has no
     argument. Formerly, the argument was only picked up if the sender was
     NOT #encode: / #decode:, so the parameter was nil exactly in the case
     where the offending code point is known."

    |badCodePoint sender home|

    sender := thisContext sender.
    home := sender methodHome.
    (home notNil and:[home numArgs > 0]) ifTrue:[
        badCodePoint := home argAt:1
    ].
    ^ (EncodingError new)
        defaultValue:(self defaultDecoderValue);
        parameter:badCodePoint;
        messageText:'invalid code';
        suspendedContext:sender;
        raiseRequest
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1017 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1018 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1019 |
defaultDecoderValue
    "Answer the value which is placed into a decoded string, in case there is
     no unicode codePoint for a given encoded codePoint; 16rFFFF here.
     Used by #decodingError as the default value of the raised error."

    ^ 16rFFFF
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1025 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1026 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1027 |
defaultEncoderValue
    "Answer the value which is placed into an encoded string, in case there is
     no codePoint for a given unicode codePoint; the codePoint of $? here.
     Used by #encodingError as the default value of the raised error."

    ^ $? codePoint
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1033 |
! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1034 |
|
7919 | 1035 |
encodingError
    "report an error that some unicode-codePoint cannot be represented by this encoder.

     Raises an EncodingError as a request, with #defaultEncoderValue as its
     default value, and answers whatever the request answers.
     The error's parameter is the first argument of the sending method (the
     home method, if the sender is a block), or nil if that method has no
     argument. Formerly, the argument was only picked up if the sender was
     NOT #encode: / #decode:, so the parameter was nil exactly in the case
     where the offending code point is known."

    |badCodePoint sender home|

    sender := thisContext sender.
    home := sender methodHome.
    (home notNil and:[home numArgs > 0]) ifTrue:[
        badCodePoint := home argAt:1
    ].
    ^ (EncodingError new)
        defaultValue:(self defaultEncoderValue);
        parameter:badCodePoint;
        messageText:'unrepresentable unicode';
        suspendedContext:sender;
        raiseRequest
|
7899
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1050 |
! ! |
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1051 |
|
7972 | 1052 |
!CharacterEncoder methodsFor:'printing'! |
1053 |
||
1054 |
printOn:aStream
    "Append the name of the decoded code, the string '->' and the name of
     the receiver's encoding to aStream."

    aStream nextPutAll:(self nameOfDecodedCode).
    aStream nextPutAll:'->'.
    aStream nextPutAll:(self nameOfEncoding)
|
7972 | 1059 |
! ! |
1060 |
||
7892 | 1061 |
!CharacterEncoder methodsFor:'private'! |
1062 |
||
1063 |
newString:size
    "Not implemented here - this is the responsibility of subclasses.
     NOTE(review): presumably answers a new string of the given size which is
     able to hold the receiver's codes - confirm against the subclasses."

    self subclassResponsibility
|
1065 |
! ! |
|
1066 |
||
7917 | 1067 |
!CharacterEncoder methodsFor:'queries'! |
1068 |
||
1069 |
isNullEncoder
    "Answer false here.
     NOTE(review): presumably redefined to answer true by NullEncoder - confirm."

    ^ false
|
7972 | 1071 |
! |
1072 |
||
1073 |
nameOfDecodedCode
    "Answer the name of the code which results from decoding, as defined by my class.
     Most coders decode from their code into unicode / encode from unicode into their code.
     There are a few exceptions to this, though - these must redefine this."

    ^ self class nameOfDecodedCode
|
1078 |
! |
|
1079 |
||
1080 |
nameOfEncoding
    "Answer the name of my encoding, as defined by my class."

    ^ self class nameOfEncoding
|
1082 |
! |
|
1083 |
||
1084 |
userFriendlyNameOfEncoding
    "Answer the user friendly name of my encoding, as defined by my class."

    ^ self class userFriendlyNameOfEncoding
|
7917 | 1086 |
! ! |
1087 |
||
7915 | 1088 |
!CharacterEncoder::CompoundEncoder class methodsFor:'documentation'! |
7914 | 1089 |
|
1090 |
documentation
    "
    A compoundEncoder uses two real encoders, named encoder and decoder;
    to encode:
        code/string -> decoder (decode) -> encoder (encode) -> result
    to decode:
        code/string -> encoder (decode) -> decoder (encode) -> result

    Example:

        |e|

        e := CompoundEncoder new.
        e encoder:ISO8859_5 decoder:KOI8_R.
        e decode:16rB0.  'CYRILLIC CAPITAL LETTER A; 16rB0 in 8859-5; 16rE1 in KOI8-R'.
        e encode:16rE1.
    "
1105 |
! ! |
|
1106 |
||
7915 | 1107 |
!CharacterEncoder::CompoundEncoder methodsFor:'accessing'! |
7914 | 1108 |
|
1109 |
encoder:encoderArg decoder:decoderArg
    "Set the two coders which are combined by the receiver.
     Both arguments are stored as given."

    decoder := decoderArg.
    encoder := encoderArg.
|
1114 |
! ! |
|
1115 |
||
7915 | 1116 |
!CharacterEncoder::CompoundEncoder methodsFor:'encoding & decoding'! |
7914 | 1117 |
|
7956 | 1118 |
decode:aCode
    "Decode aCode with my encoder, then encode the outcome with my decoder."

    |intermediate|

    intermediate := encoder decode:aCode.
    ^ decoder encode:intermediate
|
1120 |
! |
|
1121 |
||
1122 |
decodeString:aString
    "Decode aString with my encoder, then encode the outcome with my decoder."

    |intermediate|

    intermediate := encoder decodeString:aString.
    ^ decoder encodeString:intermediate
|
1124 |
! |
|
1125 |
||
7914 | 1126 |
encode:aCode
    "Decode aCode with my decoder, then encode the outcome with my encoder."

    |intermediate|

    intermediate := decoder decode:aCode.
    ^ encoder encode:intermediate
|
1128 |
! |
|
1129 |
||
1130 |
encodeString:aString
    "Decode aString with my decoder, then encode the outcome with my encoder."

    |intermediate|

    intermediate := decoder decodeString:aString.
    ^ encoder encodeString:intermediate
|
1132 |
! ! |
|
1133 |
||
7972 | 1134 |
!CharacterEncoder::CompoundEncoder methodsFor:'printing'! |
1135 |
||
1136 |
printOn:aStream
    "Append the name of my decoder's encoding and the string '->' to aStream,
     then let my encoder print itself there."

    aStream nextPutAll:(decoder nameOfEncoding).
    aStream nextPutAll:'->'.
    encoder printOn:aStream
|
1144 |
! ! |
|
1145 |
||
7932 | 1146 |
!CharacterEncoder::DefaultEncoder class methodsFor:'documentation'! |
1147 |
||
1148 |
documentation |
|
1149 |
" |
|
7972 | 1150 |
That is only a dummy for ST80 compatibility |
7932 | 1151 |
" |
1152 |
! ! |
|
1153 |
||
7915 | 1154 |
!CharacterEncoder::InverseEncoder class methodsFor:'documentation'! |
7914 | 1155 |
|
1156 |
documentation
    "
    An inverseEncoder does the inverse of the decoder it wraps - i.e. its encode
    is really that decoder's decode, and its decode is really that decoder's encode.
    "
|
1161 |
! ! |
|
1162 |
||
7915 | 1163 |
!CharacterEncoder::InverseEncoder methodsFor:'accessing'! |
7914 | 1164 |
|
1165 |
decoder:something
    "Set the coder whose encode/decode operations are swapped by the receiver."

    decoder := something.
|
1167 |
! ! |
|
1168 |
||
7915 | 1169 |
!CharacterEncoder::InverseEncoder methodsFor:'encoding & decoding'! |
7914 | 1170 |
|
1171 |
decode:aCode
    "Inverse operation: answer what my decoder answers when asked to encode aCode."

    ^ decoder encode:aCode
|
1173 |
! |
|
1174 |
||
1175 |
decodeString:aString
    "Inverse operation: answer what my decoder answers when asked to encode aString."

    ^ decoder encodeString:aString
|
1177 |
! |
|
1178 |
||
1179 |
encode:aCode
    "Inverse operation: answer what my decoder answers when asked to decode aCode."

    ^ decoder decode:aCode
|
1181 |
! |
|
1182 |
||
1183 |
encodeString:aString
    "Inverse operation: answer what my decoder answers when asked to decode aString."

    ^ decoder decodeString:aString
|
1185 |
! ! |
|
1186 |
||
7972 | 1187 |
!CharacterEncoder::InverseEncoder methodsFor:'printing'! |
1188 |
||
1189 |
printOn:aStream
    "Append the name of my decoder's encoding, the string '->' and the name of
     my decoder's decoded code to aStream (i.e. the reverse of what the
     wrapped decoder itself prints)."

    aStream nextPutAll:(decoder nameOfEncoding).
    aStream nextPutAll:'->'.
    aStream nextPutAll:(decoder nameOfDecodedCode)
|
7972 | 1194 |
! ! |
1195 |
||
7915 | 1196 |
!CharacterEncoder::NullEncoder class methodsFor:'documentation'!

documentation
"
    A NullEncoder performs no conversion at all.
"
! !
|
1203 |
||
7915 | 1204 |
!CharacterEncoder::NullEncoder methodsFor:'encoding & decoding'!

decode:aCode
    "identity: answer aCode itself, unchanged"

    ^ aCode
!

decodeString:aString
    "identity: answer aString itself, unchanged (no copy is made)"

    ^ aString
!

encode:aCode
    "identity: answer aCode itself, unchanged"

    ^ aCode
!

encodeString:aString
    "identity: answer aString itself, unchanged (no copy is made)"

    ^ aString
! !
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1221 |
|
7917 | 1222 |
!CharacterEncoder::NullEncoder methodsFor:'queries'!

isNullEncoder
    "always answer true for a NullEncoder"

    ^ true
! !
|
1227 |
||
7915 | 1228 |
!CharacterEncoder::OtherEncoding class methodsFor:'private'!

flushCode
    "no-op: the method body is empty, so the receiver is returned"
!

generateEncoderCode
    "no-op: the method body is empty, so the receiver is returned"
! !
|
1235 |
||
7919 | 1236 |
!CharacterEncoder::TwoStepEncoder class methodsFor:'documentation'!

documentation
"
    A TwoStepEncoder chains two real encoders.

    Encoding runs them in order:
        string -> encoder1(encode) -> encoder2(encode) -> result

    Decoding runs them in reverse order:
        string -> encoder2(decode) -> encoder1(decode) -> result
"
! !
|
1247 |
||
1248 |
!CharacterEncoder::TwoStepEncoder methodsFor:'accessing'!

encoder1:encoder1Arg encoder2:encoder2Arg
    "store both arguments in the instance variables
     encoder1 and encoder2 respectively"

    encoder2 := encoder2Arg.
    encoder1 := encoder1Arg.
! !
|
1256 |
||
1257 |
!CharacterEncoder::TwoStepEncoder methodsFor:'encoding & decoding'!

decode:aCode
    "undo both steps in reverse order:
     encoder2 decodes first, encoder1 decodes its result"

    |intermediate|

    intermediate := encoder2 decode:aCode.
    ^ encoder1 decode:intermediate
!

decodeString:aString
    "undo both steps in reverse order on a string:
     encoder2 decodes first, encoder1 decodes its result"

    |intermediate|

    intermediate := encoder2 decodeString:aString.
    ^ encoder1 decodeString:intermediate
!

encode:aCode
    "apply both steps in order:
     encoder1 encodes first, encoder2 encodes its result"

    |intermediate|

    intermediate := encoder1 encode:aCode.
    ^ encoder2 encode:intermediate
!

encodeString:aString
    "apply both steps in order on a string:
     encoder1 encodes first, encoder2 encodes its result"

    |intermediate|

    intermediate := encoder1 encodeString:aString.
    ^ encoder2 encodeString:intermediate
! !
|
1274 |
||
7972 | 1275 |
!CharacterEncoder::TwoStepEncoder methodsFor:'printing'!

printOn:aStream
    "append a printed representation of the receiver to aStream, as a chain:
     decoded-code name of encoder1, arrow, encoding name of encoder1,
     arrow, encoding name of encoder2."

    aStream nextPutAll:(encoder1 nameOfDecodedCode).
    aStream nextPutAll:'->'.
    aStream nextPutAll:(encoder1 nameOfEncoding).
    aStream nextPutAll:'->'.
    aStream nextPutAll:(encoder2 nameOfEncoding).
! !
1285 |
||
7892 | 1286 |
!CharacterEncoder class methodsFor:'documentation'!

version
    "answer the CVS header string (file path, revision, date, author)
     recorded for this source file"

    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.78 2004-03-15 12:58:54 ca Exp $'
! !
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1291 |
|
7577df77ba95
character encodings - first attempt
Claus Gittinger <cg@exept.de>
parents:
7893
diff
changeset
|
1292 |
"file-in time action: send #initialize to the CharacterEncoder class"
CharacterEncoder initialize!