CharacterEncoder.st
changeset 7971 357e53496acc
parent 7969 1c252e9cf79c
child 7972 91aa73f89491
equal deleted inserted replaced
7970:2ae69eb663eb 7971:357e53496acc
   556 "
   556 "
   557 !
   557 !
   558 
   558 
   559 howToAddMoreCoders
   559 howToAddMoreCoders
   560 "
   560 "
   561     Coders can be hand-written or generated via a mapping table.
   561     Coders can be hand-written or automagically generated via a mapping table.
   562     Examples for hand-written coders are UTF8_to_ISO10646 or JIS0208_to_JIS7.
   562     Examples for hand-written coders are UTF8_to_ISO10646 or JIS0208_to_JIS7.
   563 
   563 
   564     The table driven encode/decode methods can be generated from a character mapping document
   564     The table driven encode/decode methods can be generated from a character mapping document
   565     as found on the unicode consortium host
   565     as found on the unicode consortium host
   566         (for example: 'http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT')
   566         (for example: 'http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT')
   614 
   614 
   615         CharacterEncoder::JIS0208 flushCode; generateCode.
   615         CharacterEncoder::JIS0208 flushCode; generateCode.
   616 "
   616 "
   617 ! !
   617 ! !
   618 
   618 
       
   619 !CharacterEncoder class methodsFor:'instance creation'!
       
   620 
       
   621 encoderFor:encodingNameSymbol
       
   622     "given the name of an encoding, return an encoder-instance which can map these from/into unicode."
       
   623 
       
   624     ^ self
       
   625         encoderFor:encodingNameSymbol 
       
   626         ifAbsent:[
       
   627             self error:'no encoder for ' , encodingNameSymbol mayProceed:true.
       
   628             NullEncoder new
       
   629         ]
       
   630 
       
   631     "
       
   632      CharacterEncoder encoderFor:#'latin1'       
       
   633      self encoderFor:#'arabic'       
       
   634      self encoderFor:#'ms-arabic'       
       
   635      self encoderFor:#'iso8859-5'    
       
   636      self encoderFor:#'koi8-r'      
       
   637      self encoderFor:#'koi8-u'      
       
   638      self encoderFor:#'jis0208'      
       
   639      self encoderFor:#'jis7'      
       
   640     "
       
   641 !
       
   642 
       
   643 encoderFor:encodingNameSymbol ifAbsent:exceptionValue
       
   644     "given the name of an encoding, return an encoder-instance which can map these from/into unicode."
       
   645 
       
   646     |cls lcName name|
       
   647 
       
   648     lcName := encodingNameSymbol asLowercase asSymbolIfInterned.
       
   649     name := lcName ? encodingNameSymbol.
       
   650 
       
   651     cls := EncodersByName at:name ifAbsent:nil.
       
   652     cls notNil ifTrue:[^ cls new ].
       
   653 
       
   654     self allSubclassesDo:[:cls |
       
   655         cls nameOfDecodedCode == #unicode ifTrue:[
       
   656             cls nameOfEncoding = name ifTrue:[
       
   657                 EncodersByName at:name put:cls.    
       
   658                 ^ cls new.
       
   659             ]
       
   660         ].
       
   661     ].
       
   662     self allSubclassesDo:[:cls |
       
   663         cls nameOfDecodedCode == #unicode ifTrue:[
       
   664             (cls alternativeNamesOfEncoding includes:name) ifTrue:[
       
   665                 EncodersByName at:name put:cls.    
       
   666                 ^ cls new.
       
   667             ].
       
   668         ].
       
   669     ].
       
   670 
       
   671     self allSubclassesDo:[:cls |
       
   672         ((cls nameOfEncoding = name)
       
   673         or:[(cls alternativeNamesOfEncoding includes:name)]) ifTrue:[
       
   674             "/ ok, found some other encoder - need a compound encoder then.
       
   675             "/ the one found encodes into what we need, but needs something else as input.
       
   676             
       
   677             ^ TwoStepEncoder new
       
   678                 encoder1:(self encoderFor:(cls nameOfDecodedCode))
       
   679                 encoder2:(cls new).
       
   680         ].
       
   681     ].
       
   682     ^ exceptionValue value
       
   683 
       
   684     "
       
   685      CharacterEncoder encoderFor:#'latin1'       
       
   686      self encoderFor:#'arabic'       
       
   687      self encoderFor:#'ms-arabic'       
       
   688      self encoderFor:#'iso8859-5'    
       
   689      self encoderFor:#'koi8-r'      
       
   690      self encoderFor:#'koi8-u'      
       
   691      self encoderFor:#'jis0208'      
       
   692      self encoderFor:#'jis7'      
       
   693     "
       
   694 !
       
   695 
       
   696 encoderToEncodeFrom:oldEncodingArg into:newEncodingArg
       
   697     |oldEncoding newEncoding encoder decoder|
       
   698 
       
   699     oldEncoding := oldEncodingArg ? #'unicode'.
       
   700     newEncoding := newEncodingArg ? #'unicode'.
       
   701     oldEncoding == newEncoding ifTrue:[^ NullEncoder new].
       
   702     (oldEncoding match:newEncoding) ifTrue:[^ NullEncoder new].
       
   703     "/ (newEncoding match:oldEncoding) ifTrue:[^ NullEncoder new].
       
   704 
       
   705     ((oldEncoding == #unicode) or:[(oldEncoding == #'iso10646-1')]) ifTrue:[
       
   706         ((newEncoding == #unicode) or:[(newEncoding == #'iso10646-1')]) ifTrue:[^ NullEncoder new].
       
   707 
       
   708         "/ unicode -> something
       
   709         ^ self encoderFor:newEncoding.
       
   710     ].
       
   711     ((newEncoding == #unicode) or:[(newEncoding == #'iso10646-1')]) ifTrue:[
       
   712         "/ something -> unicode 
       
   713         decoder := self encoderFor:oldEncoding.
       
   714         ^ InverseEncoder new decoder:decoder.
       
   715     ].
       
   716 
       
   717     "/ look for a specialized encoder...
       
   718     self allSubclassesDo:[:cls |
       
   719         (cls nameOfEncoding = oldEncoding
       
   720         or:[ cls alternativeNamesOfEncoding includes:oldEncoding ]) ifTrue:[
       
   721             |nameOfDecodedCode encoderForDecodedCode|
       
   722 
       
   723             nameOfDecodedCode := cls nameOfDecodedCode.
       
   724             encoderForDecodedCode := self encoderFor:nameOfDecodedCode.
       
   725             (nameOfDecodedCode = newEncoding
       
   726             or:[ encoderForDecodedCode class alternativeNamesOfEncoding includes:newEncoding ]) ifTrue:[
       
   727                 ^ InverseEncoder new decoder:cls new. 
       
   728             ]
       
   729         ].
       
   730         (cls nameOfEncoding = newEncoding
       
   731         or:[ cls alternativeNamesOfEncoding includes:newEncoding ]) ifTrue:[
       
   732             |nameOfDecodedCode encoderForDecodedCode|
       
   733 
       
   734             nameOfDecodedCode := cls nameOfDecodedCode.
       
   735             encoderForDecodedCode := self encoderFor:nameOfDecodedCode.
       
   736             (nameOfDecodedCode = oldEncoding
       
   737             or:[ encoderForDecodedCode class alternativeNamesOfEncoding includes:oldEncoding ]) ifTrue:[
       
   738                 ^ cls new. 
       
   739             ]
       
   740         ].
       
   741     ].
       
   742     
       
   743     "/ do it as: oldEncoding -> unicode -> newEncoding
       
   744 
       
   745     "/ something -> unicode 
       
   746     decoder := self encoderFor:oldEncoding.
       
   747 
       
   748     "/ unicode -> something
       
   749     encoder := self encoderFor:newEncoding.
       
   750     ^ CompoundEncoder new encoder:encoder decoder:decoder 
       
   751 !
       
   752 
       
   753 unicodeEncoderFor:encodingNameSymbol
       
   754     "given the name of an encoding, return an encoder-instance which can map these from/into unicode."
       
   755 
       
   756     self obsoleteMethodWarning.
       
   757     ^ self encoderFor:encodingNameSymbol
       
   758 
       
   759     "
       
   760      CharacterEncoder unicodeEncoderFor:#'latin1'       
       
   761      self unicodeEncoderFor:#'arabic'       
       
   762      self unicodeEncoderFor:#'ms-arabic'       
       
   763      self unicodeEncoderFor:#'iso8859-5'    
       
   764      self unicodeEncoderFor:#'koi8-r'      
       
   765      self unicodeEncoderFor:#'koi8-u'      
       
   766      self unicodeEncoderFor:#'jis0208'      
       
   767      self unicodeEncoderFor:#'jis7'      
       
   768     "
       
   769 !
       
   770 
       
   771 unicodeEncoderFor:encodingNameSymbol ifAbsent:exceptionValue
       
   772     "given the name of an encoding, return an encoder-instance which can map these from/into unicode."
       
   773 
       
   774     self obsoleteMethodWarning.
       
   775     ^ self encoderFor:encodingNameSymbol ifAbsent:exceptionValue
       
   776 
       
   777     "
       
   778      CharacterEncoder unicodeEncoderFor:#'latin1'       
       
   779      self unicodeEncoderFor:#'arabic'       
       
   780      self unicodeEncoderFor:#'ms-arabic'       
       
   781      self unicodeEncoderFor:#'iso8859-5'    
       
   782      self unicodeEncoderFor:#'koi8-r'      
       
   783      self unicodeEncoderFor:#'koi8-u'      
       
   784      self unicodeEncoderFor:#'jis0208'      
       
   785      self unicodeEncoderFor:#'jis7'      
       
   786     "
       
   787 ! !
       
   788 
   619 !CharacterEncoder class methodsFor:'Compatibility-ST80'!
   789 !CharacterEncoder class methodsFor:'Compatibility-ST80'!
   620 
   790 
   621 encoderNamed: encoderName
   791 encoderNamed: encoderName
   622     "/ q & d hack
   792     "/ q & d hack
   623 
   793 
   723     ^ (self encoderToEncodeFrom:oldEncoding into:newEncoding) encodeString:aString.
   893     ^ (self encoderToEncodeFrom:oldEncoding into:newEncoding) encodeString:aString.
   724 !
   894 !
   725 
   895 
   726 encodeString:aString into:newEncodingArg
   896 encodeString:aString into:newEncodingArg
   727     ^ self encodeString:aString from:'unicode' into:newEncodingArg
   897     ^ self encodeString:aString from:'unicode' into:newEncodingArg
   728 !
       
   729 
       
   730 encoderToEncodeFrom:oldEncodingArg into:newEncodingArg
       
   731     |oldEncoding newEncoding encoder decoder|
       
   732 
       
   733     oldEncoding := oldEncodingArg ? #'unicode'.
       
   734     newEncoding := newEncodingArg ? #'unicode'.
       
   735     oldEncoding == newEncoding ifTrue:[^ NullEncoder new].
       
   736     (oldEncoding match:newEncoding) ifTrue:[^ NullEncoder new].
       
   737     "/ (newEncoding match:oldEncoding) ifTrue:[^ NullEncoder new].
       
   738 
       
   739     ((oldEncoding == #unicode) or:[(oldEncoding == #'iso10646-1')]) ifTrue:[
       
   740         ((newEncoding == #unicode) or:[(newEncoding == #'iso10646-1')]) ifTrue:[^ NullEncoder new].
       
   741 
       
   742         "/ unicode -> something
       
   743         ^ self encoderFor:newEncoding.
       
   744     ].
       
   745     ((newEncoding == #unicode) or:[(newEncoding == #'iso10646-1')]) ifTrue:[
       
   746         "/ something -> unicode 
       
   747         decoder := self encoderFor:oldEncoding.
       
   748         ^ InverseEncoder new decoder:decoder.
       
   749     ].
       
   750 
       
   751     "/ look for a specialized encoder...
       
   752     self allSubclassesDo:[:cls |
       
   753         (cls nameOfEncoding = oldEncoding
       
   754         or:[ cls alternativeNamesOfEncoding includes:oldEncoding ]) ifTrue:[
       
   755             |nameOfDecodedCode encoderForDecodedCode|
       
   756 
       
   757             nameOfDecodedCode := cls nameOfDecodedCode.
       
   758             encoderForDecodedCode := self encoderFor:nameOfDecodedCode.
       
   759             (nameOfDecodedCode = newEncoding
       
   760             or:[ encoderForDecodedCode class alternativeNamesOfEncoding includes:newEncoding ]) ifTrue:[
       
   761                 ^ InverseEncoder new decoder:cls new. 
       
   762             ]
       
   763         ].
       
   764         (cls nameOfEncoding = newEncoding
       
   765         or:[ cls alternativeNamesOfEncoding includes:newEncoding ]) ifTrue:[
       
   766             |nameOfDecodedCode encoderForDecodedCode|
       
   767 
       
   768             nameOfDecodedCode := cls nameOfDecodedCode.
       
   769             encoderForDecodedCode := self encoderFor:nameOfDecodedCode.
       
   770             (nameOfDecodedCode = oldEncoding
       
   771             or:[ encoderForDecodedCode class alternativeNamesOfEncoding includes:oldEncoding ]) ifTrue:[
       
   772                 ^ cls new. 
       
   773             ]
       
   774         ].
       
   775     ].
       
   776     
       
   777     "/ do it as: oldEncoding -> unicode -> newEncoding
       
   778 
       
   779     "/ something -> unicode 
       
   780     decoder := self encoderFor:oldEncoding.
       
   781 
       
   782     "/ unicode -> something
       
   783     encoder := self encoderFor:newEncoding.
       
   784     ^ CompoundEncoder new encoder:encoder decoder:decoder 
       
   785 ! !
   898 ! !
   786 
   899 
   787 !CharacterEncoder class methodsFor:'private'!
   900 !CharacterEncoder class methodsFor:'private'!
   788 
   901 
   789 flushCode
   902 flushCode
   872     ^ #()
   985     ^ #()
   873 !
   986 !
   874 
   987 
   875 alternativeNamesOfEncoding
   988 alternativeNamesOfEncoding
   876     ^ #()
   989     ^ #()
   877 !
       
   878 
       
   879 encoderFor:encodingNameSymbol
       
   880     "given the name of an encoding, return an encoder-instance which can map these from/into unicode."
       
   881 
       
   882     ^ self
       
   883         encoderFor:encodingNameSymbol 
       
   884         ifAbsent:[
       
   885             self error:'no encoder for ' , encodingNameSymbol mayProceed:true.
       
   886             NullEncoder new
       
   887         ]
       
   888 
       
   889     "
       
   890      CharacterEncoder encoderFor:#'latin1'       
       
   891      self encoderFor:#'arabic'       
       
   892      self encoderFor:#'ms-arabic'       
       
   893      self encoderFor:#'iso8859-5'    
       
   894      self encoderFor:#'koi8-r'      
       
   895      self encoderFor:#'koi8-u'      
       
   896      self encoderFor:#'jis0208'      
       
   897      self encoderFor:#'jis7'      
       
   898     "
       
   899 !
       
   900 
       
   901 encoderFor:encodingNameSymbol ifAbsent:exceptionValue
       
   902     "given the name of an encoding, return an encoder-instance which can map these from/into unicode."
       
   903 
       
   904     |cls lcName name|
       
   905 
       
   906     lcName := encodingNameSymbol asLowercase asSymbolIfInterned.
       
   907     name := lcName ? encodingNameSymbol.
       
   908 
       
   909     cls := EncodersByName at:name ifAbsent:nil.
       
   910     cls notNil ifTrue:[^ cls new ].
       
   911 
       
   912     self allSubclassesDo:[:cls |
       
   913         cls nameOfDecodedCode == #unicode ifTrue:[
       
   914             cls nameOfEncoding = name ifTrue:[
       
   915                 EncodersByName at:name put:cls.    
       
   916                 ^ cls new.
       
   917             ]
       
   918         ].
       
   919     ].
       
   920     self allSubclassesDo:[:cls |
       
   921         cls nameOfDecodedCode == #unicode ifTrue:[
       
   922             (cls alternativeNamesOfEncoding includes:name) ifTrue:[
       
   923                 EncodersByName at:name put:cls.    
       
   924                 ^ cls new.
       
   925             ].
       
   926         ].
       
   927     ].
       
   928 
       
   929     self allSubclassesDo:[:cls |
       
   930         ((cls nameOfEncoding = name)
       
   931         or:[(cls alternativeNamesOfEncoding includes:name)]) ifTrue:[
       
   932             "/ ok, found some other encoder - need a compound encoder then.
       
   933             "/ the one found encodes into what we need, but needs something else as input.
       
   934             
       
   935             ^ TwoStepEncoder new
       
   936                 encoder1:(self encoderFor:(cls nameOfDecodedCode))
       
   937                 encoder2:(cls new).
       
   938         ].
       
   939     ].
       
   940     ^ exceptionValue value
       
   941 
       
   942     "
       
   943      CharacterEncoder encoderFor:#'latin1'       
       
   944      self encoderFor:#'arabic'       
       
   945      self encoderFor:#'ms-arabic'       
       
   946      self encoderFor:#'iso8859-5'    
       
   947      self encoderFor:#'koi8-r'      
       
   948      self encoderFor:#'koi8-u'      
       
   949      self encoderFor:#'jis0208'      
       
   950      self encoderFor:#'jis7'      
       
   951     "
       
   952 !
   990 !
   953 
   991 
   954 isEncoding:subSetEncodingArg subSetOf:superSetEncodingArg
   992 isEncoding:subSetEncodingArg subSetOf:superSetEncodingArg
   955     "return true, if superSetEncoding encoding includes all characters of subSetEncoding"
   993     "return true, if superSetEncoding encoding includes all characters of subSetEncoding"
   956 
   994 
  1033          nil
  1071          nil
  1034          ('gb'          'GB - mainland chin'                    )
  1072          ('gb'          'GB - mainland chin'                    )
  1035          ('big5'        'BIG5 - taiwan'                         )
  1073          ('big5'        'BIG5 - taiwan'                         )
  1036 "/         ('ksc'         'korean'                        )
  1074 "/         ('ksc'         'korean'                        )
  1037        )
  1075        )
  1038 !
       
  1039 
       
  1040 unicodeEncoderFor:encodingNameSymbol
       
  1041     "given the name of an encoding, return an encoder-instance which can map these from/into unicode."
       
  1042 
       
  1043     self obsoleteMethodWarning.
       
  1044     ^ self encoderFor:encodingNameSymbol
       
  1045 
       
  1046     "
       
  1047      CharacterEncoder unicodeEncoderFor:#'latin1'       
       
  1048      self unicodeEncoderFor:#'arabic'       
       
  1049      self unicodeEncoderFor:#'ms-arabic'       
       
  1050      self unicodeEncoderFor:#'iso8859-5'    
       
  1051      self unicodeEncoderFor:#'koi8-r'      
       
  1052      self unicodeEncoderFor:#'koi8-u'      
       
  1053      self unicodeEncoderFor:#'jis0208'      
       
  1054      self unicodeEncoderFor:#'jis7'      
       
  1055     "
       
  1056 !
       
  1057 
       
  1058 unicodeEncoderFor:encodingNameSymbol ifAbsent:exceptionValue
       
  1059     "given the name of an encoding, return an encoder-instance which can map these from/into unicode."
       
  1060 
       
  1061     self obsoleteMethodWarning.
       
  1062     ^ self encoderFor:encodingNameSymbol ifAbsent:exceptionValue
       
  1063 
       
  1064     "
       
  1065      CharacterEncoder unicodeEncoderFor:#'latin1'       
       
  1066      self unicodeEncoderFor:#'arabic'       
       
  1067      self unicodeEncoderFor:#'ms-arabic'       
       
  1068      self unicodeEncoderFor:#'iso8859-5'    
       
  1069      self unicodeEncoderFor:#'koi8-r'      
       
  1070      self unicodeEncoderFor:#'koi8-u'      
       
  1071      self unicodeEncoderFor:#'jis0208'      
       
  1072      self unicodeEncoderFor:#'jis7'      
       
  1073     "
       
  1074 !
  1076 !
  1075 
  1077 
  1076 userFriendlyNameOfEncoding
  1078 userFriendlyNameOfEncoding
  1077     ^ self nameOfEncoding
  1079     ^ self nameOfEncoding
  1078 ! !
  1080 ! !
 52934 ! !
 52936 ! !
 52935 
 52937 
 52936 !CharacterEncoder class methodsFor:'documentation'!
 52938 !CharacterEncoder class methodsFor:'documentation'!
 52937 
 52939 
 52938 version
 52940 version
 52939     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.28 2004-02-18 23:51:18 cg Exp $'
 52941     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.29 2004-02-19 17:17:00 cg Exp $'
 52940 ! !
 52942 ! !
 52941 
 52943 
 52942 CharacterEncoder initialize!
 52944 CharacterEncoder initialize!