"
COPYRIGHT (c) 2004 by eXept Software AG
All Rights Reserved
This software is furnished under a license and may be used
only in accordance with the terms of that license and with the
inclusion of the above copyright notice. This software may not
be provided or otherwise made available to, or used by, any
other person. No title to or ownership of the software is
hereby transferred.
"
"{ Package: 'stx:libbasic' }"
"Abstract superclass of the character encoders (see isAbstract); its class side
 holds the lookup tables by encoding name, which are filled in by #initialize."
Object subclass:#CharacterEncoder
instanceVariableNames:''
classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders LastEncoder
AccessLock NullEncoderInstance Jis7KanjiEscapeSequence
Jis7RomanEscapeSequence JisISO2022EscapeSequence
Jis7KanjiOldEscapeSequence'
poolDictionaries:''
category:'Collections-Text-Encodings'
!
"Private helper: converts between two encodings by chaining a decoder and an
 encoder (created in encoderToEncodeFrom:into:)."
CharacterEncoder subclass:#CompoundEncoder
instanceVariableNames:'decoder encoder'
classVariableNames:''
poolDictionaries:''
privateIn:CharacterEncoder
!
"Private dummy class for ST80 compatibility; instances are answered by
 encoderNamed:#default."
CharacterEncoder subclass:#DefaultEncoder
instanceVariableNames:''
classVariableNames:''
poolDictionaries:''
privateIn:CharacterEncoder
!
"Private helper: wraps another encoder and swaps the roles of its encode and
 decode operations."
CharacterEncoder subclass:#InverseEncoder
instanceVariableNames:'decoder'
classVariableNames:''
poolDictionaries:''
privateIn:CharacterEncoder
!
"Private helper: an encoder whose encode and decode operations answer their
 argument unchanged."
CharacterEncoder subclass:#NullEncoder
instanceVariableNames:''
classVariableNames:''
poolDictionaries:''
privateIn:CharacterEncoder
!
"Private subclass; in this file it only overrides the class-side flushCode and
 generateEncoderCode with empty methods."
CharacterEncoder subclass:#OtherEncoding
instanceVariableNames:''
classVariableNames:''
poolDictionaries:''
privateIn:CharacterEncoder
!
"Private helper: encodes with encoder1 followed by encoder2 and decodes in the
 reverse order (created in encoderFor:ifAbsent:)."
CharacterEncoder subclass:#TwoStepEncoder
instanceVariableNames:'encoder1 encoder2'
classVariableNames:''
poolDictionaries:''
privateIn:CharacterEncoder
!
!CharacterEncoder class methodsFor:'documentation'!
copyright
"
COPYRIGHT (c) 2004 by eXept Software AG
All Rights Reserved
This software is furnished under a license and may be used
only in accordance with the terms of that license and with the
inclusion of the above copyright notice. This software may not
be provided or otherwise made available to, or used by, any
other person. No title to or ownership of the software is
hereby transferred.
"
!
documentation
"
Abstract superclass of the character encoders; the class side also acts as
the registry through which encoders are looked up by encoding name
(see encoderFor: and encoderToEncodeFrom:into:).
Unfinished code - please read howToAddMoreCoders.
Character mappings are based on information in character maps found at either:
http://std.dkuug.dk/i18n/charmaps
or:
http://www.unicode.org/Public/MAPPINGS
No Warranty.
"
!
examples
"
convert an iso8859-1 string into unicode
(strings are converted with encodeString:from:into:, whereas
encode:from:into: expects a single codePoint):
|s1 s2|
s1 := 'hello'.
s2 := CharacterEncoder encodeString:s1 from:#'iso8859-1' into:#'unicode'.
s2
convert an iso8859-1 string into iso8859-7:
|s1 s2|
s1 := 'hello'.
s2 := CharacterEncoder encodeString:s1 from:#'iso8859-1' into:#'iso8859-7'.
s2
"
!
howToAddMoreCoders
"
Coders can be hand-written or automagically generated via a mapping table.
Examples for hand-written coders are UTF8_to_ISO10464 or JIS0208_to_JIS7.
The table driven encode/decode methods can be generated from a character mapping document
as found on the unicode consortium host
(for example: 'http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT')
or from the i18n character maps
(for example: 'http://std.dkuug.dk/i18n/charmaps/ISO-8859-1').
In order to add another coder (for example: for Finnish EBCDIC or ms-codePage 278),
perform the following steps:
- create a private subclass of CharacterEncoder named (for example) CP267.
- create a public subclass of CharacterEncoderImplementations::CharacterEncoderImplementation named (for example) CharacterEncoderImplementations::CP267.
- define the mapFileURL1_relativePathName (if the table is found on 'www.unicode.org')
or the mapFileURL2_relativePathName (if it is found on 'std.dkuug.dk') method, which
should return the name of the table's file, relative to the top directory there
(which is '.../Public/MAPPINGS' on 'www.unicode.org' and '.../i18n/charmaps' on 'std.dkuug.dk').
In this example, the table from 'std.dkuug.dk' is used, and named 'EBCDIC-CP-FI' there.
- generate code by evaluating:
CharacterEncoder::CP267 generateCode
That's all !!
The existing code was generated by:
CharacterEncoder::SingleByteEncoder subclassesDo:[:cls | Transcript showCR:cls name. cls flushCode; generateCode ]
CharacterEncoder::SingleByteEncoder subclassesDo:[:cls | cls allSubclassesDo:[:sub | Transcript showCR:sub name. sub flushCode; generateSubclassCode]]
or individually:
CharacterEncoder::ASCII flushCode; generateCode.
CharacterEncoder::ISO8859_1 flushCode; generateCode.
CharacterEncoder::ISO8859_2 flushCode; generateCode.
CharacterEncoder::ISO8859_3 flushCode; generateCode.
CharacterEncoder::ISO8859_4 flushCode; generateCode.
CharacterEncoder::ISO8859_5 flushCode; generateCode.
CharacterEncoder::ISO8859_6 flushCode; generateCode.
CharacterEncoder::ISO8859_7 flushCode; generateCode.
CharacterEncoder::ISO8859_8 flushCode; generateCode.
CharacterEncoder::ISO8859_9 flushCode; generateCode.
CharacterEncoder::ISO8859_10 flushCode; generateCode.
CharacterEncoder::ISO8859_11 flushCode; generateCode.
CharacterEncoder::ISO8859_13 flushCode; generateCode.
CharacterEncoder::ISO8859_14 flushCode; generateCode.
CharacterEncoder::ISO8859_15 flushCode; generateCode.
CharacterEncoder::ISO8859_16 flushCode; generateCode.
CharacterEncoder::KOI8_R flushCode; generateCode.
CharacterEncoder::GSM0338 flushCode; generateCode.
CharacterEncoder::KOI8_U flushCode; generateSubclassCode.
CharacterEncoder::JIS0208 flushCode; generateCode.
"
! !
!CharacterEncoder class methodsFor:'instance creation'!
encoderFor:encodingNameSymbol
    "Answer an encoder instance which maps between unicode and the encoding
     named by encodingNameSymbol.
     If no encoder is found, a proceedable error is raised; if that error is
     proceeded, the shared NullEncoderInstance is answered."

    |complainAndAnswerNullEncoder|

    complainAndAnswerNullEncoder := [
        self error:'no encoder for ' , encodingNameSymbol mayProceed:true.
        NullEncoderInstance
    ].
    ^ self encoderFor:encodingNameSymbol ifAbsent:complainAndAnswerNullEncoder

    "
     CharacterEncoder encoderFor:#'latin1'
     self encoderFor:#'arabic'
     self encoderFor:#'ms-arabic'
     self encoderFor:#'iso8859-5'
     self encoderFor:#'koi8-r'
     self encoderFor:#'koi8-u'
     self encoderFor:#'jis0208'
     self encoderFor:#'jis7'
     self encoderFor:#'utf8'
    "
!
encoderFor:encodingNameSymbol ifAbsent:exceptionValue
    "Answer an encoder instance which maps between unicode and the encoding
     named by encodingNameSymbol; answer the value of exceptionValue if no
     such encoder is found.
     - a nil name answers the shared NullEncoderInstance.
     - the name is looked up in lowercase.
     - a name which includes match characters is used as a pattern against
       the aliases of the already created encoders and then against the
       aliases of the registered encoder classes; the first hit is answered.
     - otherwise the encoder is taken from the cache or created from the
       registered class (and cached); if there is no direct class, a
       TwoStepEncoder via an intermediate encoding is tried."

    |enc cls lcName name unicodeEncoders unicodeEncoderClasses|

    encodingNameSymbol isNil ifTrue:[ ^ NullEncoderInstance].

    lcName := encodingNameSymbol asLowercase.
    "/ stay with the lowercase string if asSymbolIfInterned answers nil
    name := lcName asSymbolIfInterned.
    name isNil ifTrue:[name := lcName].

    name includesMatchCharacters ifTrue:[
        "/ pattern lookup - first among the encoder instances created so far ...
        unicodeEncoders := EncodersByName at:#unicode ifAbsent:nil.
        unicodeEncoders notNil ifTrue:[
            unicodeEncoders keysAndValuesDo:[:eachEncodingAlias :eachEncoderInstance |
                (name matches:eachEncodingAlias) ifTrue:[
                    ^ eachEncoderInstance.
                ].
            ].
        ].
        "/ ... then among the registered encoder classes.
        "/ ifAbsent:nil is required here, because the result is nil-checked below.
        unicodeEncoderClasses := EncoderClassesByName at:#unicode ifAbsent:nil.
        unicodeEncoderClasses notNil ifTrue:[
            unicodeEncoderClasses keysAndValuesDo:[:eachEncodingAlias :eachEncoderClass |
                (name matches:eachEncodingAlias) ifTrue:[
                    ^ eachEncoderClass new.
                ].
            ].
        ].
        ^ exceptionValue value
    ].

    AccessLock critical:[
        unicodeEncoders := EncodersByName at:#unicode ifAbsent:nil.
        unicodeEncoders isNil ifTrue:[
            EncodersByName at:#unicode put:(unicodeEncoders := Dictionary new).
        ].
        enc := unicodeEncoders at:name ifAbsent:nil.
        enc isNil ifTrue:[
            unicodeEncoderClasses := EncoderClassesByName at:#unicode ifAbsent:nil.
            unicodeEncoderClasses isNil ifTrue:[
                EncoderClassesByName at:#unicode put:(unicodeEncoderClasses := Dictionary new).
            ].
            cls := unicodeEncoderClasses at:name ifAbsent:nil.
            cls notNil ifTrue:[
                enc := cls new.
                unicodeEncoders at:name put:enc.
            ].
        ].
    ].
    enc notNil ifTrue:[
        ^ enc
    ].

    "/ no direct encoder from unicode->name
    "/ search for unicode->any and: any->name
    unicodeEncoderClasses := EncoderClassesByName at:#unicode ifAbsent:nil.
    unicodeEncoderClasses keysAndValuesDo:[:eachEncodingAlias :eachEncoderClass |
        |dict2|

        dict2 := EncoderClassesByName at:eachEncodingAlias ifAbsent:nil.
        dict2 notNil ifTrue:[
            cls := dict2 at:name ifAbsent:nil.
            cls notNil ifTrue:[
                enc := TwoStepEncoder new
                            encoder1:(self encoderFor:eachEncodingAlias)
                            encoder2:(cls new).
                AccessLock critical:[
                    unicodeEncoders at:name put:enc.
                ].
                ^ enc.
            ]
        ].
    ].
    ^ exceptionValue value

    "
     CharacterEncoder encoderFor:#'latin1'
     self encoderFor:#'arabic'
     self encoderFor:#'ms-arabic'
     self encoderFor:#'iso8859-5'
     self encoderFor:#'koi8-r'
     self encoderFor:#'koi8-u'
     self encoderFor:#'jis0208'
     self encoderFor:#'jis7'
     self encoderFor:#'unicode'
    "
!
encoderToEncodeFrom:oldEncodingArg into:newEncodingArg
    "Answer an encoder which converts from the encoding named oldEncodingArg
     into the encoding named newEncodingArg.
     A nil name and #'iso10646-1' are both treated as #unicode.
     Answers the shared NullEncoderInstance if both names are identical or match.
     Encoders are cached in EncodersByName under the normalized old and new names."

    |oldEncoding newEncoding encoders encoderClasses encoder decoder cls|

    oldEncoding := oldEncodingArg ? #'unicode'.
    oldEncoding == #'iso10646-1' ifTrue:[ oldEncoding := #'unicode'].
    newEncoding := newEncodingArg ? #'unicode'.
    newEncoding == #'iso10646-1' ifTrue:[ newEncoding := #'unicode'].

    oldEncoding isSymbol ifFalse:[self halt:'symbol argument expected'. oldEncoding := oldEncoding asSymbol].
    newEncoding isSymbol ifFalse:[self halt:'symbol argument expected'. newEncoding := newEncoding asSymbol].

    oldEncoding == newEncoding ifTrue:[^ NullEncoderInstance].
    (oldEncoding match:newEncoding) ifTrue:[^ NullEncoderInstance].

    (oldEncoding == #unicode) ifTrue:[
        "/ unicode -> something
        ^ self encoderFor:newEncoding.
    ].

    AccessLock critical:[
        encoders := EncodersByName at:oldEncoding ifAbsent:nil.
        encoders isNil ifTrue:[
            EncodersByName at:oldEncoding put:(encoders := Dictionary new).
        ].
        "/ look up with the normalized name - this is the key under which
        "/ the encoder is stored at the end of this method.
        encoder := encoders at:newEncoding ifAbsent:nil.
        encoder isNil ifTrue:[
            encoderClasses := EncoderClassesByName at:oldEncoding ifAbsent:nil.
            encoderClasses isNil ifTrue:[
                EncoderClassesByName at:oldEncoding put:(encoderClasses := Dictionary new).
            ].
            cls := encoderClasses at:newEncoding ifAbsent:nil.
            cls notNil ifTrue:[
                encoder := cls new.
            ].
        ].
    ].

    encoder isNil ifTrue:[
        (newEncoding == #unicode) ifTrue:[
            "/ something -> unicode
            decoder := self encoderFor:oldEncoding.
            encoder := InverseEncoder new decoder:decoder.
        ] ifFalse:[
            "/ do it as: oldEncoding -> unicode -> newEncoding

            "/ something -> unicode
            decoder := self encoderFor:oldEncoding.
            "/ unicode -> something
            encoder := self encoderFor:newEncoding.
            encoder := CompoundEncoder new encoder:encoder decoder:decoder.
        ].
    ].

    AccessLock critical:[
        (EncodersByName at:oldEncoding) at:newEncoding put:encoder
    ].
    ^ encoder

    " CharacterEncoder initialize
     CharacterEncoder encoderToEncodeFrom:#'latin1' into:#'jis7'
     CharacterEncoder encoderToEncodeFrom:#'koi8-r' into:#'mac-cyrillic'
     CharacterEncoder encoderToEncodeFrom:#'ms-arabic' into:#'mac-arabic'
     CharacterEncoder encoderToEncodeFrom:#'iso8859-5' into:#'koi8-r'
     CharacterEncoder encoderToEncodeFrom:#'koi8-r' into:#'koi8-u'
    "
! !
!CharacterEncoder class methodsFor:'Compatibility-ST80'!
encoderNamed: encoderName
    "ST80 compatibility: answer a DefaultEncoder for #default.
     Any other name halts and then answers a new instance of the receiver."

    "/ q & d hack
    encoderName == #default ifFalse:[
        self halt.
        ^ self new
    ].
    ^ DefaultEncoder new
!
platformName
"answer whatever OperatingSystem platformName answers"
^ OperatingSystem platformName
"Created: 20.6.1997 / 17:34:03 / cg"
"Modified: 20.6.1997 / 17:38:40 / cg"
! !
!CharacterEncoder class methodsFor:'class initialization'!
initialize
"Set up the class-side state: the access lock, the shared null encoder and
the (initially empty) dictionaries; then register every encoder class from
the table below in EncoderClassesByName.
Each table row consists of: the name of the implementation class (looked up in
CharacterEncoderImplementations), the name of the code it decodes to, and the
list of alias names of the encoding it encodes to.
Halts if an implementation class is missing or if an alias is registered twice
for the same decoded code."
AccessLock := Semaphore forMutualExclusion.
NullEncoderInstance := NullEncoder new.
EncodersByName := Dictionary new.
EncoderClassesByName := Dictionary new.
CachedEncoders := Dictionary new.
"/ class decoded-name array-of-encodingNames
#(
(ASCII unicode ( ascii 'us-ascii' 'iso-ir-6' 'ibm-367' 'ms-cp367' 'cp367' 'iso646-us' 'ibm-cp367' ))
(BIG5 unicode ( big5 ))
(CNS11643 unicode ( 'cns11643' ))
(CP437 unicode ( 'cp437' 'cp-437' 'ibm-437' 'ms-cp437' 'microsoft-cp437' 'ibm-cp437' ))
(GB2313_1980 unicode ( 'gb2313' 'gb2313-1980' ))
(HANGUL unicode ( 'hangul' ))
(ISO10646_1 unicode ( unicode 'iso10646_1' 'iso10646-1' 'iso-10646-1' ))
(ISO10646_to_UTF8 unicode ( utf8 'utf-8' ))
(ISO8859_1 unicode ( 'iso8859_1' 'iso8859-1' 'iso-8859-1' 'latin-1' 'latin1' 'iso-ir-100' 'ibm-819' 'ms-cp819' 'ibm-cp819' ))
(ISO8859_2 unicode ( 'iso8859_2' 'iso8859-2' 'iso-8859-2' 'latin2' 'latin-2' 'iso-ir-101'))
(ISO8859_3 unicode ( 'iso8859_3' 'iso8859-3' 'iso-8859-3' 'latin3' 'latin-3' 'iso-ir-109'))
(ISO8859_4 unicode ( 'iso8859_4' 'iso8859-4' 'iso-8859-4' 'latin4' 'latin-4' 'iso-ir-110'))
(ISO8859_5 unicode ( 'iso8859_5' 'iso8859-5' 'iso-8859-5' 'cyrillic' 'iso-ir-144' ))
(ISO8859_6 unicode ( 'iso8859_6' 'iso8859-6' 'iso-8859-6' 'arabic' 'asmo-708' 'ecma-114' 'iso-ir-127' ))
(ISO8859_7 unicode ( 'iso8859_7' 'iso8859-7' 'iso-8859-7' 'greek' 'iso-ir-126' 'ecma-118'))
(ISO8859_8 unicode ( 'iso8859_8' 'iso8859-8' 'iso-8859-8' 'hebrew' 'iso-ir-138' ))
(ISO8859_9 unicode ( 'iso8859_9' 'iso8859-9' 'iso-8859-9' 'latin5' 'latin-5' 'iso-ir-148'))
(ISO8859_10 unicode ( 'iso8859_10' 'iso8859-10' 'iso-8859-10' 'latin6' 'latin-6' 'iso-ir-157'))
(ISO8859_11 unicode ( 'iso8859_11' 'iso8859-11' 'iso-8859-11' 'thai' ))
(ISO8859_13 unicode ( 'iso8859_13' 'iso8859-13' 'iso-8859-13' 'latin7' 'latin-7' ))
(ISO8859_14 unicode ( 'iso8859_14' 'iso8859-14' 'iso-8859-14' 'latin8' 'latin-8' 'latin-celtic' ))
(ISO8859_15 unicode ( 'iso8859_15' 'iso8859-15' 'iso-8859-15' 'latin9' 'latin-9' 'iso-ir-203'))
(ISO8859_16 unicode ( 'iso8859_16' 'iso8859-16' 'iso-8859-16' 'latin10' 'latin-10' ))
(JIS0201 unicode ( 'jis0201' #'jisx0201.1976-0'))
(JIS0208 unicode ( jis0208 'jisx0208' 'jisx0208.1983-0' 'jisx0208.1990-0'))
(JIS0208_to_JIS7 jis0208 ( jis7 'jis-7' 'x-jis7' 'x-iso2022-jp' 'iso2022-jp'))
(JIS0208_to_EUC jis0208 ( euc #'x-euc-jp' ))
(JIS0212 unicode ( 'jis0212' ))
(JOHAB unicode ( 'johab' ))
(KOI7 unicode ( 'koi7' ))
(KOI8_R unicode ( #'koi8-r' 'cp878' ))
(KOI8_U unicode ( #'koi8-u' ))
(KSC5601 unicode ( #'ksc5601' ))
(MAC_Arabic unicode ( #'mac-arabic' 'macarabic' ))
(MAC_CentralEuropean unicode ( #'mac-centraleuropean' #'mac-centraleurope' 'maccentraleurope' 'maccentraleuropean' ))
(MAC_Croatian unicode ( #'mac-croatian' 'maccroatian'))
(MAC_Cyrillic unicode ( #'mac-cyrillic' 'maccyrillic' ))
(MAC_Dingbats unicode ( #'mac-dingbats' 'macdingbats' 'macdingbat'))
(MAC_Farsi unicode ( #'mac-farsi' 'macfarsi' ))
(MAC_Greek unicode ( #'mac-greek' #'macgreek' ))
(MAC_Hebrew unicode ( #'mac-hebrew' #'machebrew' ))
(MAC_Iceland unicode ( #'mac-iceland' #'maciceland' ))
(MAC_Japanese unicode ( #'mac-japanese' #'macjapanese' ))
(MAC_Korean unicode ( #'mac-korean' #'mackorean' ))
(MAC_Roman unicode ( #'mac-roman' #'macroman' ))
(MAC_Romanian unicode ( #'mac-romanian' #'macromanian' ))
(MAC_Symbol unicode ( #'mac-symbol' #'macsymbol' ))
(MAC_Thai unicode ( #'mac-thai' #'macthai' ))
(MAC_Turkish unicode ( #'mac-turkish' #'macturkish' ))
(MS_Ansi unicode ( #'ms-ansi' 'ms-cp1252' 'microsoft-cp1252' 'cp1252' 'microsoft-ansi' 'windows-1252' 'windows-latin1'))
(MS_Arabic unicode ( 'ms-arabic' 'ms-cp1256' 'microsoft-cp1256' 'cp1256' 'microsoft-arabic' 'windows-1256' ))
(MS_Baltic unicode ( 'ms-baltic' 'ms-cp1257' 'microsoft-cp1257' 'cp1257' 'microsoft-baltic' 'windows-1257' ))
(MS_Cyrillic unicode ( 'ms-cyrillic' 'ms-cp1251' 'microsoft-cp1251' 'cp1251' 'microsoft-cyrillic' 'windows-1251' ))
(MS_EastEuropean unicode ( 'ms-easteuropean' 'ms-ee' 'cp1250' 'ms-cp1250' 'microsoft-cp1250' 'microsoft-easteuropean' 'windows-1250' ))
(MS_Greek unicode ( 'ms-greek' 'ms-cp1253' 'microsoft-cp1253' 'cp1253' 'microsoft-greek' 'windows-1253' ))
(MS_Hebrew unicode ( 'ms-hebrew' 'ms-cp1255' 'microsoft-cp1255' 'cp1255' 'microsoft-hebrew' 'windows-1255' ))
"/ (MS_Symbol unicode ( 'ms-symbol' 'microsoft-symbol' ))
(MS_Turkish unicode ( 'ms-turkish' 'ms-cp1254' 'microsoft-cp1254' 'cp1254' 'microsoft-turkish' 'windows-1254' ))
(NEXT unicode ( 'next' 'nextstep' ))
(SJIS unicode ( 'sjis' 'shiftjis' 'x-sjis' #'x-shift-jis' #'shift-jis'))
) triplesDo:[:className :decodesTo :encodesTo |
|implClass dict|
implClass := (Smalltalk at:#CharacterEncoderImplementations) at:className.
implClass isNil ifTrue:[
self halt:'missing encoder-class'
] ifFalse:[
"/ one dictionary (alias -> implementation class) per decoded-code name
dict := EncoderClassesByName at:decodesTo ifAbsent:nil.
dict isNil ifTrue:[
EncoderClassesByName at:decodesTo put:(dict := Dictionary new).
].
encodesTo do:[:eachEncodingAlias |
(dict includesKey:eachEncodingAlias) ifTrue:[
self halt:'conflicting alias'
].
dict at:eachEncodingAlias put:implClass.
]
].
].
"
self initialize
"
! !
!CharacterEncoder class methodsFor:'constants'!
jis7KanjiEscapeSequence
    "Answer the escape sequence (ESC followed by '$B') which is used to switch
     to kanji in jis7 encoded strings. The string is created on first use and
     remembered in a class variable.
     This happens to be the same as ISO2022-JP's escape sequence."

    Jis7KanjiEscapeSequence notNil ifTrue:[
        ^ Jis7KanjiEscapeSequence
    ].
    Jis7KanjiEscapeSequence := Character esc asString , '$B'.
    ^ Jis7KanjiEscapeSequence

    "Created: 26.2.1996 / 17:38:08 / cg"
    "Modified: 30.6.1997 / 16:03:16 / cg"
!
jis7KanjiOldEscapeSequence
    "Answer the escape sequence (ESC followed by '$@') which is used to switch
     to kanji in some old jis7 encoded strings. The string is created on first
     use and remembered in a class variable."

    Jis7KanjiOldEscapeSequence isNil ifTrue:[
        "/ exactly one statement separator here (a stray second period was removed)
        Jis7KanjiOldEscapeSequence := Character esc asString , '$@'.
    ].
    ^ Jis7KanjiOldEscapeSequence.
!
jis7RomanEscapeSequence
    "Answer the escape sequence (ESC followed by '(J') which is used to switch
     to roman in jis7 encoded strings. The string is created on first use and
     remembered in a class variable."

    Jis7RomanEscapeSequence notNil ifTrue:[
        ^ Jis7RomanEscapeSequence
    ].
    Jis7RomanEscapeSequence := Character esc asString , '(J'.
    ^ Jis7RomanEscapeSequence

    "Created: 26.2.1996 / 17:38:08 / cg"
    "Modified: 30.6.1997 / 16:03:16 / cg"
!
jisISO2022EscapeSequence
    "Answer the escape sequence (ESC '&@' ESC '$B') which is used to switch to
     kanji in iso2022 encoded strings. The string is created on first use and
     remembered in a class variable."

    |esc|

    JisISO2022EscapeSequence notNil ifTrue:[
        ^ JisISO2022EscapeSequence
    ].
    esc := Character esc asString.
    JisISO2022EscapeSequence := esc , '&@' , esc , '$B'.
    ^ JisISO2022EscapeSequence
! !
!CharacterEncoder class methodsFor:'encoding & decoding'!
decode:aCodePoint
"convenience: decode aCodePoint using a new instance of the receiver"
^ self new decode:aCodePoint
!
decodeString:aString
"convenience: decode aString using a new instance of the receiver"
^ self new decodeString:aString
!
decodeString:aString from:oldEncoding
"convert aString from the encoding named oldEncoding into unicode"
^ self encodeString:aString from:oldEncoding into:#'unicode'
!
encode:aCodePoint
"convenience: encode aCodePoint using a new instance of the receiver"
^ self new encode:aCodePoint
"
ISO8859_1 encode:16r00FF
ISO8859_1 decodeString:'hello'
ISO8859_1 encodeString:(ISO8859_1 decodeString:'hello')
ISO8859_5 decodeString:(String
with:(Character value:16rE4)
with:(Character value:16rE0))
"
!
encode:codePoint from:oldEncodingArg into:newEncodingArg
    "Convert a single codePoint from the encoding named oldEncodingArg into the
     encoding named newEncodingArg and answer the converted code.
     A nil name and #'iso10646-1' are both treated as #unicode.
     The codePoint is answered unchanged if both names are identical, or if
     the conversion is between unicode and iso8859-1 and the code is <= 16rFF."

    |sourceName targetName|

    sourceName := oldEncodingArg ? #'unicode'.
    sourceName == #'iso10646-1' ifTrue:[ sourceName := #'unicode' ].
    targetName := newEncodingArg ? #'unicode'.
    targetName == #'iso10646-1' ifTrue:[ targetName := #'unicode' ].

    sourceName == targetName ifTrue:[^ codePoint].

    "/ shortcut for unicode <-> iso8859-1 (in either direction)
    ((sourceName == #'unicode' and:[targetName == #'iso8859-1'])
      or:[targetName == #'unicode' and:[sourceName == #'iso8859-1']]) ifTrue:[
        codePoint <= 16rFF ifTrue:[
            ^ codePoint
        ]
    ].

    ^ (self encoderToEncodeFrom:sourceName into:targetName) encode:codePoint
!
encodeString:aUnicodeString
"given a string in unicode, return a string in my encoding for it
(convenience: uses a new instance of the receiver)"
^ self new encodeString:aUnicodeString
"
ISO8859_1 decodeString:'hello'
"
!
encodeString:aString from:oldEncodingArg into:newEncodingArg
    "Convert aString from the encoding named oldEncodingArg into the encoding
     named newEncodingArg and answer the converted string.
     A nil name and #'iso10646-1' are both treated as #unicode.
     The string itself is answered if both names are identical, or if the
     conversion is between unicode and iso8859-1 and the string has 8 bits
     per character."

    |sourceName targetName|

    sourceName := oldEncodingArg ? #'unicode'.
    sourceName == #'iso10646-1' ifTrue:[ sourceName := #'unicode' ].
    targetName := newEncodingArg ? #'unicode'.
    targetName == #'iso10646-1' ifTrue:[ targetName := #'unicode' ].

    sourceName == targetName ifTrue:[^ aString].

    "/ shortcut for unicode <-> iso8859-1 (in either direction)
    ((sourceName == #'unicode' and:[targetName == #'iso8859-1'])
      or:[targetName == #'unicode' and:[sourceName == #'iso8859-1']]) ifTrue:[
        aString bitsPerCharacter == 8 ifTrue:[
            ^ aString
        ]
    ].

    ^ (self encoderToEncodeFrom:sourceName into:targetName) encodeString:aString
!
encodeString:aString into:newEncoding
"convert aString from unicode into the encoding named newEncoding"
^ self encodeString:aString from:#'unicode' into:newEncoding
! !
!CharacterEncoder class methodsFor:'private'!
flushCode
    "Re-initialize the class-side tables and, for a non-abstract class which
     names a map file (relative path 1 or 2), remove the #mapping method from
     the receiver's class side."

    self initialize.
    self isAbstract ifTrue:[
        ^ self
    ].
    (self mapFileURL1_relativePathName isNil
      and:[ self mapFileURL2_relativePathName isNil ]) ifTrue:[
        ^ self
    ].
    self class removeSelector:#mapping.

    "
     self flushCode
    "
! !
!CharacterEncoder class methodsFor:'private-mapping setup'!
generateCode
"let a CharacterEncoderCodeGenerator generate the code for the receiver"
(CharacterEncoderCodeGenerator new targetClass:self) generateCode.
!
generateSubclassCode
"let a CharacterEncoderCodeGenerator generate the subclass code for the receiver"
(CharacterEncoderCodeGenerator new targetClass:self) generateSubclassCode.
!
mapFileURL1_codeColumn
"answer 1 by default.
presumably the column of the URL1 map file which holds the code - confirm against the code generator"
^ 1
!
mapFileURL1_relativePathName
"answer the name of the map file relative to the base URL used in mappingURL1.
Here, nil is answered (no map file); to be redefined in concrete subclass(es)"
^ nil
!
mapFileURL2_relativePathName
"answer the name of the map file relative to the base URL used in mappingURL2.
Here, nil is answered (no map file); to be redefined in concrete subclass(es)"
^ nil
!
mappingURL1
    "Answer the URL of the receiver's map file below
     'http://www.unicode.org/Public/MAPPINGS/', or nil if
     mapFileURL1_relativePathName answers nil."

    |relativePath|

    relativePath := self mapFileURL1_relativePathName.
    relativePath notNil ifTrue:[
        ^ 'http://www.unicode.org/Public/MAPPINGS/' , relativePath
    ].
    ^ nil
!
mappingURL2
    "Answer the URL of the receiver's map file below
     'http://std.dkuug.dk/i18n/charmaps/', or nil if
     mapFileURL2_relativePathName answers nil."

    |relativePath|

    relativePath := self mapFileURL2_relativePathName.
    relativePath notNil ifTrue:[
        ^ 'http://std.dkuug.dk/i18n/charmaps/' , relativePath
    ].
    ^ nil
! !
!CharacterEncoder class methodsFor:'queries'!
isEncoding:subSetEncodingArg subSetOf:superSetEncodingArg
    "Answer true, if the encoding named superSetEncodingArg includes all
     characters of the encoding named subSetEncodingArg
     (this means: the characters are included - not that they have the same code).
     The names are compared in lowercase; the decision is made by name patterns only."

    |subName superName|

    subSetEncodingArg = superSetEncodingArg ifTrue:[^ true].

    subName := subSetEncodingArg asLowercase.
    superName := superSetEncodingArg asLowercase.
    (subName match:superName) ifTrue:[^ true].

    "/ the encodings listed here are taken to be subsets of iso10646 / unicode
    (('iso10646*' match:superName) or:[superName = 'unicode']) ifTrue:[
        #('ascii*' 'iso8859*' 'jis*' 'koi8*' 'ksc*' 'big*' 'cns*' 'gb2312*') do:[:eachPattern |
            (eachPattern match:subName) ifTrue:[^ true].
        ].
    ].

    "/ if the subSet is iso8859-*, that means ascii (i.e. the lower 7 bits of iso8859 only).
    ((subName = 'iso8859*') or:[subName = 'iso8859-*']) ifTrue:[
        ('ascii*' match:superName) ifTrue:[^ true].
    ].
    (subName = 'ascii') ifTrue:[
        ('iso8859*' match:superName) ifTrue:[^ true].
    ].

    "/ TODO: check the charSets mappingTables...
    "/ self halt.
    ^ false.
!
nameOfDecodedCode
"Answer the name of the code into which the receiver decodes; #unicode by default.
Most coders decode from their code into unicode / encode from unicode into their code.
There are a few exceptions to this, though - these must redefine this."
^ #'unicode'
!
nameOfEncoding
"answer the name of the encoding as a symbol, derived from the class name
(without prefix, in lowercase, with each underscore replaced by a dash)"
^ (self nameWithoutPrefix asLowercase copyReplaceAll:$_ with:$-) asSymbol
!
supportedExternalEncodings
"return an array of arrays containing the names of supported
encodings which are supported for external resources (i.e. files).
The first element contains the internally used symbolic name,
the second contains a user-readable string (description).
More than one external name may be mapped onto the same symbolic.
The nil entries separate groups of related encodings."
"/ NOTE(review): some names listed here (for example 'utf7', 'cp850', 'mac', 'hp', 'gb'
"/ and 'iso-2022-jp') are not among the aliases registered by #initialize in this file;
"/ confirm that they can be resolved by encoderFor:.
^ #(
('utf8' 'Unicode as 8Bit characters' )
('utf7' 'Unicode as 7Bit characters' )
nil
('ascii' 'Common 7bit subset of iso8859' )
('iso8859-1' 'Latin1' )
('iso8859-2' 'Latin2' )
('iso8859-3' 'Latin3' )
('iso8859-4' 'Latin4' )
('iso8859-5' 'Cyrillic' )
('iso8859-6' 'Arabic' )
('iso8859-7' 'Greek' )
('iso8859-8' 'Hebrew' )
nil
('koi7' 'Cyrillic (Old)' )
('koi8-r' 'Cyrillic' )
('koi8-u' 'Cyrillic (Ukraine)' )
nil
('cp437' 'msdos US / codepage 437' )
('cp850' 'msdos Latin1 codepage 850' )
('mac' 'macintosh 8 bit' )
('next' 'NeXT 8 bit' )
('hp' 'hpux 8 bit' )
nil
('euc' 'EUC - extended unix code japanese' )
('jis7' 'JIS7 - jis 7bit escape codes japanese' )
('iso-2022-jp' 'Same as jis 7bit' )
('sjis' 'SJIS - shift jis 8bit codes japanese' )
nil
('gb' 'GB - mainland chin' )
('big5' 'BIG5 - taiwan' )
"/ ('ksc' 'korean' )
)
!
userFriendlyNameOfEncoding
"answer the name of the encoding as answered by asUppercaseFirst"
^ self nameOfEncoding asUppercaseFirst
! !
!CharacterEncoder class methodsFor:'testing'!
isAbstract
"answer true for CharacterEncoder itself; false for every subclass"
^ self == CharacterEncoder
! !
!CharacterEncoder methodsFor:'encoding & decoding'!
decode:anEncoding
"given an integer in my encoding, return a unicode codePoint for it.
Must be implemented by subclasses."
self subclassResponsibility
!
decodeString:anEncodedString
    "Given a string in my encoding, answer a unicode string for it.
     Every character is decoded individually via decode:.
     The result starts out as a String and is replaced by a Unicode16String or
     Unicode32String as soon as a decoded codePoint above 16rFF or 16rFFFF
     has to be stored."

    |decoded count|

    count := anEncodedString size.
    decoded := String new:count.
    1 to:count do:[:position |
        |unicodePoint bitsNeeded wideClass|

        unicodePoint := self decode:(anEncodedString at:position) codePoint.
        unicodePoint > 16rFF ifTrue:[
            bitsNeeded := unicodePoint > 16rFFFF ifTrue:[32] ifFalse:[16].
            decoded bitsPerCharacter < bitsNeeded ifTrue:[
                "/ widen the characters collected so far
                wideClass := bitsNeeded == 32 ifTrue:[Unicode32String] ifFalse:[Unicode16String].
                decoded := wideClass fromString:decoded.
            ].
        ].
        decoded at:position put:(Character value:unicodePoint).
    ].
    ^ decoded

    "
     ISO8859_1 decodeString:'hello'
    "
!
encode:aCodePoint
"given a codePoint in unicode, return a byte in my encoding for it.
Must be implemented by subclasses."
self subclassResponsibility
!
encodeString:aUnicodeString
    "Given a string in unicode, answer a string in my encoding for it.
     The result is created via newString: and filled by encoding every
     character individually via encode:."

    |encoded count|

    count := aUnicodeString size.
    encoded := self newString:count.
    1 to:count do:[:position |
        |code|

        code := self encode:(aUnicodeString at:position) codePoint.
        encoded at:position put:(Character value:code).
    ].
    ^ encoded
! !
!CharacterEncoder methodsFor:'error handling'!
decodingError
"report an error that there is no unicode-codePoint for a given codePoint in this encoding.
(which is unlikely) or that the encoding is undefined for that value
(for example, holes in the ISO8859-3 encoding).
Raises an EncodingError as a request, with defaultDecoderValue as its default value
and the sending context as its suspended context; answers what raiseRequest answers."
|badCodePoint sender|
sender := thisContext sender.
"/ NOTE(review): the first argument of the sender's home method is only fetched if the
"/ sender is NOT #encode: / #decode: - so the parameter of the error is nil for those
"/ senders. Possibly ifTrue: was intended here - confirm before changing.
((sender selector == #encode:) or:[sender selector == #decode:]) ifFalse:[
badCodePoint := sender methodHome argAt:1
].
^ (EncodingError new)
defaultValue:(self defaultDecoderValue);
parameter:badCodePoint;
messageText:'invalid code';
suspendedContext:sender;
raiseRequest
!
defaultDecoderValue
"placed into a decoded string, in case there is no unicode codePoint
for a given encoded codePoint.
(typically 16rFFFF).
Used as the default value of the error raised in decodingError."
^ 16rFFFF
!
defaultEncoderValue
"placed into an encoded string, in case there is no codePoint
for a given unicode codePoint.
(typically $?).
Used as the default value of the error raised in encodingError."
^ $? codePoint
!
encodingError
"report an error that some unicode-codePoint cannot be represented by this encoder.
Raises an EncodingError as a request, with defaultEncoderValue as its default value
and the sending context as its suspended context; answers what raiseRequest answers."
|badCodePoint sender|
sender := thisContext sender.
"/ NOTE(review): the first argument of the sender's home method is only fetched if the
"/ sender is NOT #encode: / #decode: - so the parameter of the error is nil for those
"/ senders. Possibly ifTrue: was intended here - confirm before changing.
((sender selector == #encode:) or:[sender selector == #decode:]) ifFalse:[
badCodePoint := sender methodHome argAt:1
].
^ (EncodingError new)
defaultValue:(self defaultEncoderValue);
parameter:badCodePoint;
messageText:'unrepresentable unicode';
suspendedContext:sender;
raiseRequest
! !
!CharacterEncoder methodsFor:'printing'!
printOn:aStream
    "Append a description of the form <name of decoded code>-><name of encoding>
     to aStream."

    aStream nextPutAll:(self nameOfDecodedCode).
    aStream nextPutAll:'->'.
    aStream nextPutAll:(self nameOfEncoding)
! !
!CharacterEncoder methodsFor:'private'!
newString:size
"answer a new string of the given size to hold an encoded result (used by encodeString:).
Must be implemented by subclasses."
self subclassResponsibility
! !
!CharacterEncoder methodsFor:'queries'!
isNullEncoder
"answer false here; redefined in NullEncoder to answer true"
^ false
!
nameOfDecodedCode
"Answer the name of the code into which the receiver decodes (as answered by my class).
Most coders decode from their code into unicode / encode from unicode into their code.
There are a few exceptions to this, though - these must redefine this."
^ self class nameOfDecodedCode
!
nameOfEncoding
"answer the name of my encoding, as answered by my class"
^ self class nameOfEncoding
!
userFriendlyNameOfEncoding
"answer the user friendly name of my encoding, as answered by my class"
^ self class userFriendlyNameOfEncoding
! !
!CharacterEncoder::CompoundEncoder class methodsFor:'documentation'!
documentation
"
A compoundEncoder uses two real encoders;
to encode:
string -> decoder(decode) -> encoder(encode) -> result
to decode:
string -> encoder(decode) -> decoder(encode) -> result
|e|
e := CompoundEncoder new.
e encoder:ISO8859_5 decoder:KOI8_R.
e decode:16rB0. 'CYRILLIC CAPITAL LETTER A; 16rB0 in 8859-5; 16rE1 in KOI8-R'.
e encode:16rE1.
"
! !
!CharacterEncoder::CompoundEncoder methodsFor:'accessing'!
encoder:encoderArg decoder:decoderArg
"set instance variables (automatically generated).
encoderArg handles the target encoding, decoderArg the source encoding
(see encode: and decode:)"
decoder := decoderArg.
encoder := encoderArg.
! !
!CharacterEncoder::CompoundEncoder methodsFor:'encoding & decoding'!
decode:aCode
"convert back: decode aCode with the encoder, then encode the result with the decoder"
^ decoder encode:(encoder decode:aCode)
!
decodeString:aString
"convert back: decode aString with the encoder, then encode the result with the decoder"
^ decoder encodeString:(encoder decodeString:aString)
!
encode:aCode
"decode aCode with the decoder, then encode the result with the encoder"
^ encoder encode:(decoder decode:aCode)
!
encodeString:aString
"decode aString with the decoder, then encode the result with the encoder"
^ encoder encodeString:(decoder decodeString:aString)
! !
!CharacterEncoder::CompoundEncoder methodsFor:'printing'!
printOn:aStream
    "Append the name of the decoder's encoding, an arrow and the printed
     representation of the encoder to aStream."

    aStream nextPutAll:(decoder nameOfEncoding).
    aStream nextPutAll:'->'.
    encoder printOn:aStream
! !
!CharacterEncoder::DefaultEncoder class methodsFor:'documentation'!
documentation
"
This is only a dummy for ST80 compatibility
(instances are answered by CharacterEncoder encoderNamed:#default).
"
! !
!CharacterEncoder::InverseEncoder class methodsFor:'documentation'!
documentation
"
An inverseEncoder does the inverse of the encoder it wraps - i.e. its encode
is really a decode and its decode is really an encode of the wrapped encoder.
"
! !
!CharacterEncoder::InverseEncoder methodsFor:'accessing'!
decoder:something
"set the wrapped encoder, whose operations are used in the inverse direction"
decoder := something.
! !
!CharacterEncoder::InverseEncoder methodsFor:'encoding & decoding'!
decode:aCode
"inverse operation: answer what the wrapped encoder answers for encode:"
^ decoder encode:aCode
!
decodeString:aString
"inverse operation: answer what the wrapped encoder answers for encodeString:"
^ decoder encodeString:aString
!
encode:aCode
"inverse operation: answer what the wrapped encoder answers for decode:"
^ decoder decode:aCode
!
encodeString:aString
"inverse operation: answer what the wrapped encoder answers for decodeString:"
^ decoder decodeString:aString
! !
!CharacterEncoder::InverseEncoder methodsFor:'printing'!
printOn:aStream
    "Append a description of the form <name of encoding>-><name of decoded code>
     of the wrapped encoder to aStream (i.e. in the inverse direction)."

    aStream nextPutAll:(decoder nameOfEncoding).
    aStream nextPutAll:'->'.
    aStream nextPutAll:(decoder nameOfDecodedCode)
! !
!CharacterEncoder::NullEncoder class methodsFor:'documentation'!
documentation
"
A NullEncoder does nothing: encode and decode answer their argument unchanged.
"
! !
!CharacterEncoder::NullEncoder methodsFor:'encoding & decoding'!
decode:aCode
"answer aCode unchanged"
^ aCode
!
decodeString:aString
"answer aString itself (not a copy)"
^ aString
!
encode:aCode
"answer aCode unchanged"
^ aCode
!
encodeString:aString
"answer aString itself (not a copy)"
^ aString
! !
!CharacterEncoder::NullEncoder methodsFor:'queries'!
isNullEncoder
"answer true - I am a null encoder"
^ true
! !
!CharacterEncoder::OtherEncoding class methodsFor:'private'!
flushCode
"intentionally empty: overrides the inherited flushCode, so nothing is re-initialized or removed here"
!
generateEncoderCode
"intentionally empty: no code is generated here"
! !
!CharacterEncoder::TwoStepEncoder class methodsFor:'documentation'!
documentation
"
A twoStepEncoder uses two real encoders, which are applied one after the other;
to encode:
string -> encoder1(encode) -> encoder2(encode) -> result
to decode:
string -> encoder2(decode) -> encoder1(decode) -> result
"
! !
!CharacterEncoder::TwoStepEncoder methodsFor:'accessing'!
encoder1:encoder1Arg encoder2:encoder2Arg
"set instance variables (automatically generated).
encoder1Arg is applied first and encoder2Arg second when encoding (see encode:)"
encoder1 := encoder1Arg.
encoder2 := encoder2Arg.
! !
!CharacterEncoder::TwoStepEncoder methodsFor:'encoding & decoding'!
decode:aCode
"decode aCode with encoder2 first, then decode the result with encoder1"
^ encoder1 decode:(encoder2 decode:aCode)
!
decodeString:aString
"decode aString with encoder2 first, then decode the result with encoder1"
^ encoder1 decodeString:(encoder2 decodeString:aString)
!
encode:aCode
"encode aCode with encoder1 first, then encode the result with encoder2"
^ encoder2 encode:(encoder1 encode:aCode)
!
encodeString:aString
"encode aString with encoder1 first, then encode the result with encoder2"
^ encoder2 encodeString:(encoder1 encodeString:aString)
! !
!CharacterEncoder::TwoStepEncoder methodsFor:'printing'!
printOn:aStream
    "Append a description of the form
     <decoded code of encoder1>-><encoding of encoder1>-><encoding of encoder2>
     to aStream."

    aStream nextPutAll:(encoder1 nameOfDecodedCode).
    aStream nextPutAll:'->'.
    aStream nextPutAll:(encoder1 nameOfEncoding).
    aStream nextPutAll:'->'.
    aStream nextPutAll:(encoder2 nameOfEncoding)
! !
!CharacterEncoder class methodsFor:'documentation'!
version
"answer the CVS header string of the source file of this class"
^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.57 2004-03-09 00:08:33 cg Exp $'
! !
"set up the class-side tables of CharacterEncoder when this file is filed in"
CharacterEncoder initialize!