CharacterEncoder.st
branchjv
changeset 18120 e3a375d5f6a8
parent 18084 ab5b38bd8f81
parent 17687 d5f0453d0899
child 18192 32a7c53ef4d0
equal deleted inserted replaced
18119:cb7a12afe736 18120:e3a375d5f6a8
       
     1 "{ Encoding: utf8 }"
       
     2 
     1 "
     3 "
     2  COPYRIGHT (c) 2004 by eXept Software AG
     4  COPYRIGHT (c) 2004 by eXept Software AG
     3               All Rights Reserved
     5               All Rights Reserved
     4 
     6 
     5  This software is furnished under a license and may be used
     7  This software is furnished under a license and may be used
     9  other person.  No title to or ownership of the software is
    11  other person.  No title to or ownership of the software is
    10  hereby transferred.
    12  hereby transferred.
    11 "
    13 "
    12 "{ Package: 'stx:libbasic' }"
    14 "{ Package: 'stx:libbasic' }"
    13 
    15 
       
    16 "{ NameSpace: Smalltalk }"
       
    17 
    14 Object subclass:#CharacterEncoder
    18 Object subclass:#CharacterEncoder
    15 	instanceVariableNames:''
    19 	instanceVariableNames:''
    16 	classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders LastEncoder
    20 	classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders AccessLock
    17 		AccessLock NullEncoderInstance Jis7KanjiEscapeSequence
    21 		NullEncoderInstance Jis7KanjiEscapeSequence
    18 		Jis7RomanEscapeSequence JisISO2022EscapeSequence
    22 		Jis7RomanEscapeSequence JisISO2022EscapeSequence
    19 		Jis7KanjiOldEscapeSequence'
    23 		Jis7KanjiOldEscapeSequence'
    20 	poolDictionaries:''
    24 	poolDictionaries:''
    21 	category:'Collections-Text-Encodings'
    25 	category:'Collections-Text-Encodings'
    22 !
    26 !
   142     or from the i18n character maps:
   146     or from the i18n character maps:
   143         (for example: 'http://std.dkuug.dk/i18n/charmaps/ISO-8859-1')
   147         (for example: 'http://std.dkuug.dk/i18n/charmaps/ISO-8859-1')
   144 
   148 
   145     In order to add another coder (for example: for EBCDIC or ms-codePage 278),
   149     In order to add another coder (for example: for EBCDIC or ms-codePage 278),
   146     perform the following steps:
   150     perform the following steps:
   147         - create a private subclass of CharacterEncoder named (for example) CP267.
       
   148 
       
   149         - create a public subclass of CharacterEncoderImplementations::CharacterEncoderImplementation named (for example) CharacterEncoderImplementations::CP267.
   151         - create a public subclass of CharacterEncoderImplementations::CharacterEncoderImplementation named (for example) CharacterEncoderImplementations::CP267.
   150 
   152 
   151         - define the mappingURL1_relativeName (if the table is found on 'www.unicode.org')
   153         - define the mappingURL1_relativeName (if the table is found on 'www.unicode.org')
   152           or the mappingURL2_relativeName (if it is found on 'std.dkuug.dk') method, which
   154           or the mappingURL2_relativeName (if it is found on 'std.dkuug.dk') method, which
   153           should return the name of the tables file, relative to the top directory there
   155           should return the name of the tables file, relative to the top directory there
   154           (which is '.../Public/MAPPINGS' on www.unicode.org and '.../i18n/charmaps' on 'std.dkuug.dk').
   156           (which is '.../Public/MAPPINGS' on www.unicode.org and '.../i18n/charmaps' on 'std.dkuug.dk').
   155 
   157 
   156           In this example, the table from 'std.dkuug.dk' is used, and named 'EBCDIC-CP-FI' there.
   158           In this example, the table from 'std.dkuug.dk' is used, and named 'EBCDIC-CP-FI' there.
   157 
   159 
   158         - generate code by evaluating:
   160         - generate code by evaluating (make sure that CharacterEncoderGenerator is loaded from stx:goodies):
   159             CharacterEncoder::CP267 generateCode
   161             CharacterEncoder::CP267 generateCode
   160 
   162 
   161     That's all !!
   163     That's all !!
   162 
   164 
   163 
   165 
   221     ^ self
   223     ^ self
   222         encoderFor:encodingNameSymbol 
   224         encoderFor:encodingNameSymbol 
   223         ifAbsent:[
   225         ifAbsent:[
   224             "/ proceed to ignore this error in the future.    
   226             "/ proceed to ignore this error in the future.    
   225 
   227 
   226             (EncodersByName at:#unicode) at:encodingNameSymbol put:NullEncoderInstance. 
   228 "/            (EncodersByName at:#unicode) at:encodingNameSymbol put:NullEncoderInstance. 
   227             (EncoderClassesByName at:#unicode) at:encodingNameSymbol put:NullEncoder.    
   229 "/            (EncoderClassesByName at:#unicode) at:encodingNameSymbol put:NullEncoder.    
   228 
   230 
   229             "/ self error:'no encoder for ' , encodingNameSymbol mayProceed:true.
   231             "/ self error:'no encoder for ' , encodingNameSymbol mayProceed:true.
   230             ('CharacterEncoder [warning]: no encoder for ' , encodingNameSymbol) infoPrintCR.
   232             ('CharacterEncoder [warning]: no encoder for "' , encodingNameSymbol,'"') infoPrintCR.
   231             
   233             
   232             NullEncoderInstance
   234             NullEncoderInstance
   233         ]
   235         ]
   234 
   236 
   235     "
   237     "
   266     |encodingNameSymbol enc clsName cls lcName name unicodeEncoders unicodeEncoderClasses|
   268     |encodingNameSymbol enc clsName cls lcName name unicodeEncoders unicodeEncoderClasses|
   267 
   269 
   268     encodingNameSymbol := encodingNameSymbolArg.
   270     encodingNameSymbol := encodingNameSymbolArg.
   269     encodingNameSymbol isNil ifTrue:[ ^ NullEncoderInstance].
   271     encodingNameSymbol isNil ifTrue:[ ^ NullEncoderInstance].
   270 
   272 
   271     encodingNameSymbol == #'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode].
   273     encodingNameSymbol = 'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode].
   272 
   274 
   273     lcName := encodingNameSymbol asLowercase.
   275     lcName := encodingNameSymbol asLowercase.
   274     name := lcName asSymbolIfInterned.
   276     name := lcName asSymbolIfInterned.
   275     name isNil ifTrue:[name := lcName].
   277     name isNil ifTrue:[name := lcName].
   276 
   278 
   415      self encoderFor:#'koi8-r'      
   417      self encoderFor:#'koi8-r'      
   416      self encoderFor:#'koi8-u'      
   418      self encoderFor:#'koi8-u'      
   417      self encoderFor:#'jis0208'      
   419      self encoderFor:#'jis0208'      
   418      self encoderFor:#'jis7'      
   420      self encoderFor:#'jis7'      
   419      self encoderFor:#'unicode'      
   421      self encoderFor:#'unicode'      
       
   422      self encoderFor:#'UTF-8'      
       
   423      self encoderFor:'UTF-8'      
   420     "
   424     "
   421 
   425 
   422     "Modified: / 12-07-2012 / 19:45:58 / cg"
   426     "Modified: / 12-07-2012 / 19:45:58 / cg"
   423 !
   427 !
   424 
   428 
   546 !CharacterEncoder class methodsFor:'class initialization'!
   550 !CharacterEncoder class methodsFor:'class initialization'!
   547 
   551 
   548 initialize
   552 initialize
   549     |ud|
   553     |ud|
   550 
   554 
       
   555     AccessLock notNil ifTrue:[^ self].  "/ already initialized
       
   556 
   551     AccessLock := RecursionLock new name:'CharacterEncoder'.
   557     AccessLock := RecursionLock new name:'CharacterEncoder'.
   552     NullEncoderInstance := NullEncoder new.
   558     NullEncoderInstance := NullEncoder new.
   553 
   559 
   554     EncodersByName := Dictionary new.
   560     EncodersByName := Dictionary new.
   555     EncoderClassesByName := Dictionary new.
   561     EncoderClassesByName := Dictionary new.
   559     ud at:#'fontspecific' put:NullEncoder.    
   565     ud at:#'fontspecific' put:NullEncoder.    
   560     ud at:#'adobe-fontspecific' put:NullEncoder.    
   566     ud at:#'adobe-fontspecific' put:NullEncoder.    
   561     ud at:#'ms-oem' put:NullEncoder.    
   567     ud at:#'ms-oem' put:NullEncoder.    
   562     ud at:#'ms-default' put:NullEncoder.    
   568     ud at:#'ms-default' put:NullEncoder.    
   563 
   569 
   564     "/ className decoded-name array-of-encodingNames
   570     "/ className        decoded-name    array-of-encodingNames
   565     #(
   571     #(
   566         (ASCII              unicode     ( ascii 'us-ascii' 'iso-ir-6' 'ibm-367' 'ms-cp367' 'cp367'  'iso646-us' 'ibm-cp367' 'ansi_x3.4-1968' ))
   572         (ASCII              unicode     ( ascii 'us-ascii' 'iso-ir-6' 'ibm-367' 'ms-cp367' 'cp367'  'iso646-us' 'ibm-cp367' 'ansi_x3.4-1968' ))
   567 
   573 
   568         (BIG5               unicode     ( big5 ))
   574         (BIG5               unicode     ( big5 ))
   569 
   575 
   571 
   577 
   572         (CP437              unicode     ( 'cp437'  'cp-437' 'ibm-437' 'ms-cp437' 'microsoft-cp437' 'ibm-cp437' ))
   578         (CP437              unicode     ( 'cp437'  'cp-437' 'ibm-437' 'ms-cp437' 'microsoft-cp437' 'ibm-cp437' ))
   573 
   579 
   574         (EBCDIC             unicode     ( 'ebcdic' ))
   580         (EBCDIC             unicode     ( 'ebcdic' ))
   575 
   581 
   576         (GB2313_1980        unicode     ( 'gb2313' 'gb2313-1980' ))
   582 "/        (GB2313_1980        unicode     ( 'gb2313' 'gb2313-1980' ))
       
   583 
       
   584         (GB2312_1980_0      unicode     ( 'gb2312' 'gb2312.1980' 'gb2312.1980-0'))
   577 
   585 
   578         (HANGUL             unicode     ( 'hangul' ))
   586         (HANGUL             unicode     ( 'hangul' ))
   579 
   587 
   580         (ISO10646_1         unicode     ( unicode 'iso10646_1' 'iso10646-1' 'iso-10646-1' ))
   588         (ISO10646_1         unicode     ( unicode 'iso10646_1' 'iso10646-1' 'iso-10646-1' ))
   581 
   589 
   582         (ISO10646_to_UTF8   unicode     ( utf8 'utf-8' ))
   590         (ISO10646_to_UTF8     unicode   ( utf8 'utf-8' ))
   583         (ISO10646_to_UTF16BE unicode    ( utf16b utf16be 'utf-16b' 'utf-16be' ))
   591         (ISO10646_to_UTF16BE  unicode   ( utf16b utf16be 'utf-16b' 'utf-16be' ))
   584         (ISO10646_to_UTF16LE unicode    ( utf16l utf16le 'utf-16e' 'utf-16le' ))
   592         (ISO10646_to_UTF16LE  unicode   ( utf16l utf16le 'utf-16e' 'utf-16le' ))
       
   593 
       
   594         (ISO10646_to_UTF8_MAC unicode   ( 'utf8-mac' 'utf-8-mac' ))
       
   595         (ISO10646_to_XMLUTF8  unicode   ( 'utf8-XML' ))
   585 
   596 
   586         (ISO8859_1          unicode     ( 'iso8859_1' 'iso8859-1' 'iso-8859-1' 'latin-1' 'latin1' 'iso-ir-100' 'ibm-819' 'ms-cp819' 'ibm-cp819' 'iso8859'))
   597         (ISO8859_1          unicode     ( 'iso8859_1' 'iso8859-1' 'iso-8859-1' 'latin-1' 'latin1' 'iso-ir-100' 'ibm-819' 'ms-cp819' 'ibm-cp819' 'iso8859'))
   587 
   598 
   588         (ISO8859_2          unicode     ( 'iso8859_2' 'iso8859-2' 'iso-8859-2' 'latin2' 'latin-2' 'iso-ir-101'))
   599         (ISO8859_2          unicode     ( 'iso8859_2' 'iso8859-2' 'iso-8859-2' 'latin2' 'latin-2' 'iso-ir-101'))
   589 
   600 
   655 
   666 
   656         (MAC_Japanese       unicode     ( #'mac-japanese' #'macjapanese'  ))
   667         (MAC_Japanese       unicode     ( #'mac-japanese' #'macjapanese'  ))
   657 
   668 
   658         (MAC_Korean         unicode     ( #'mac-korean' #'mackorean'  ))
   669         (MAC_Korean         unicode     ( #'mac-korean' #'mackorean'  ))
   659 
   670 
   660         (MAC_Roman          unicode     ( #'mac-roman' #'macroman'  ))
   671         (MAC_Roman          unicode     ( #'mac-roman' #'macroman' 'macintosh' 'cp10000' ))
   661 
   672 
   662         (MAC_Romanian       unicode     ( #'mac-romanian' #'macromanian'  ))
   673         (MAC_Romanian       unicode     ( #'mac-romanian' #'macromanian'  ))
   663 
   674 
   664         (MAC_Symbol         unicode     ( #'mac-symbol' #'macsymbol'  ))
   675         (MAC_Symbol         unicode     ( #'mac-symbol' #'macsymbol'  ))
   665 
   676 
  1011          ('iso8859-6'   'Arabic'                        )
  1022          ('iso8859-6'   'Arabic'                        )
  1012          ('iso8859-7'   'Greek'                         )
  1023          ('iso8859-7'   'Greek'                         )
  1013          ('iso8859-8'   'Hebrew'                        )
  1024          ('iso8859-8'   'Hebrew'                        )
  1014          ('iso8859-15'  'Western with Euro'             )
  1025          ('iso8859-15'  'Western with Euro'             )
  1015          ('iso8859-16'  'South European with Euro'      )
  1026          ('iso8859-16'  'South European with Euro'      )
       
  1027 "/       nil
       
  1028          ('macintosh'   'MAC Western'      )
  1016 "/       nil
  1029 "/       nil
  1017          ('koi7'        'Cyrillic (Old)'                )
  1030          ('koi7'        'Cyrillic (Old)'                )
  1018          ('koi8-r'      'Cyrillic'                      )
  1031          ('koi8-r'      'Cyrillic'                      )
  1019          ('koi8-u'      'Cyrillic (Ukraine)'            )
  1032          ('koi8-u'      'Cyrillic (Ukraine)'            )
  1020 "/       nil
  1033 "/       nil
  1125             s skipSeparators.
  1138             s skipSeparators.
  1126             ('"''' includes:s peek) ifTrue:[
  1139             ('"''' includes:s peek) ifTrue:[
  1127                 quote := s next.
  1140                 quote := s next.
  1128                 w := s upTo:quote.
  1141                 w := s upTo:quote.
  1129             ] ifFalse:[
  1142             ] ifFalse:[
  1130                 w := s upToMatching:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
  1143                 w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
  1131             ].
  1144             ].
  1132             w notNil ifTrue:[
  1145             w notNil ifTrue:[
  1133                 enc := w withoutQuotes.
  1146                 enc := w withoutQuotes.
  1134                 (enc startsWith:'x-') ifTrue:[
  1147                 (enc startsWith:'x-') ifTrue:[
  1135                     enc := enc copyFrom:3.
  1148                     enc := enc copyFrom:3.
  1312             ]
  1325             ]
  1313         ].
  1326         ].
  1314         newString at:idx put:(Character value:myCode).
  1327         newString at:idx put:(Character value:myCode).
  1315     ].
  1328     ].
  1316     ^ newString
  1329     ^ newString
       
  1330 !
       
  1331 
       
  1332 encodeString:aUnicodeString on:aStream
       
  1333     "given a string in unicode, encode it onto aStream.
       
  1334      Subclasses can redefine this to avoid allocating many new string instances.
       
  1335      (but must then also redefine encodeString:aUnicodeString to collect the characters)"
       
  1336 
       
  1337     aStream nextPutAll:(self encodeString:aUnicodeString).
  1317 ! !
  1338 ! !
  1318 
  1339 
  1319 !CharacterEncoder methodsFor:'error handling'!
  1340 !CharacterEncoder methodsFor:'error handling'!
  1320 
  1341 
  1321 decodingError 
  1342 decodingError 
  1393     "return the number of bytes required to encode codePoint"
  1414     "return the number of bytes required to encode codePoint"
  1394 
  1415 
  1395     ^ self subclassResponsibility
  1416     ^ self subclassResponsibility
  1396 
  1417 
  1397     "Created: / 15-06-2005 / 15:11:04 / janfrog"
  1418     "Created: / 15-06-2005 / 15:11:04 / janfrog"
       
  1419 !
       
  1420 
       
  1421 isEncoderFor:encoding
       
  1422     "does this encode to encoding?"
       
  1423 
       
  1424     |encodingNameSymbol|
       
  1425 
       
  1426     encodingNameSymbol := encoding asLowercase.
       
  1427     encodingNameSymbol = #'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode].
       
  1428 
       
  1429     ^ encodingNameSymbol = self nameOfEncoding
  1398 !
  1430 !
  1399 
  1431 
  1400 isNullEncoder
  1432 isNullEncoder
  1401     ^ false
  1433     ^ false
  1402 !
  1434 !
  1681 ! !
  1713 ! !
  1682 
  1714 
  1683 !CharacterEncoder class methodsFor:'documentation'!
  1715 !CharacterEncoder class methodsFor:'documentation'!
  1684 
  1716 
  1685 version
  1717 version
  1686     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.123 2013-08-10 11:13:37 stefan Exp $'
  1718     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.138 2015-03-26 16:21:01 cg Exp $'
  1687 !
  1719 !
  1688 
  1720 
  1689 version_CVS
  1721 version_CVS
  1690     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.123 2013-08-10 11:13:37 stefan Exp $'
  1722     ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.138 2015-03-26 16:21:01 cg Exp $'
  1691 !
       
  1692 
       
  1693 version_HG
       
  1694 
       
  1695     ^ '$Changeset: <not expanded> $'
       
  1696 ! !
  1723 ! !
  1697 
  1724 
  1698 
  1725 
  1699 CharacterEncoder initialize!
  1726 CharacterEncoder initialize!