CharacterEncoder.st
changeset 21711 2020534180c5
parent 21602 c63ec4a97409
child 21714 e3e1268c3195
equal deleted inserted replaced
21710:285f6431ad4a 21711:2020534180c5
       
     1 "{ Encoding: utf8 }"
       
     2 
     1 "
     3 "
     2  COPYRIGHT (c) 2004 by eXept Software AG
     4  COPYRIGHT (c) 2004 by eXept Software AG
     3               All Rights Reserved
     5               All Rights Reserved
     4 
     6 
     5  This software is furnished under a license and may be used
     7  This software is furnished under a license and may be used
   563     |ud|
   565     |ud|
   564 
   566 
   565     EncoderClassesByName := Dictionary new.
   567     EncoderClassesByName := Dictionary new.
   566 
   568 
   567     EncoderClassesByName at:#'unicode' put:(ud := Dictionary new:237).
   569     EncoderClassesByName at:#'unicode' put:(ud := Dictionary new:237).
   568     ud at:#'fontspecific' put:NullEncoder.    
   570     ud at:#'fontspecific'       put:NullEncoder.    
   569     ud at:#'adobe-fontspecific' put:NullEncoder.    
   571     ud at:#'adobe-fontspecific' put:NullEncoder.    
   570     ud at:#'ms-oem' put:NullEncoder.    
   572     ud at:#'ms-oem'             put:NullEncoder.    
   571     ud at:#'ms-default' put:NullEncoder.    
   573     ud at:#'ms-default'         put:NullEncoder.    
   572 
   574 
   573     "/ className        decoded-name    array-of-encodingNames
   575     "/ className            decoded-name    array-of-encodingNames
   574     #(
   576     #(
   575         (ASCII              unicode     ( ascii 'us-ascii' 'iso-ir-6' 'ibm-367' 'ms-cp367' 'cp367'  'iso646-us' 'ibm-cp367' 'ansi_x3.4-1968' ))
   577         (ASCII                  unicode     ( ascii 'us-ascii' 'iso-ir-6' 'ibm-367' 'ms-cp367' 'cp367'  'iso646-us' 'ibm-cp367' 'ansi_x3.4-1968' ))
   576 
   578 
   577         (BIG5               unicode     ( big5 ))
   579         (BIG5                   unicode     ( big5 ))
   578 
   580 
   579         (CNS11643           unicode     ( 'cns11643' ))
   581         (CNS11643               unicode     ( 'cns11643' ))
   580 
   582 
   581         (CP437              unicode     ( 'cp437'  'cp-437' 'ibm-437' 'ms-cp437' 'microsoft-cp437' 'ibm-cp437' ))
   583         (CP437                  unicode     ( 'cp437'  'cp-437' 'ibm-437' 'ms-cp437' 'microsoft-cp437' 'ibm-cp437' ))
   582 
   584 
   583         (EBCDIC             unicode     ( 'ebcdic' ))
   585         (EBCDIC                 unicode     ( 'ebcdic' ))
   584 
   586 
   585 "/        (GB2313_1980        unicode     ( 'gb2313' 'gb2313-1980' ))
   587 "/        (GB2313_1980        unicode     ( 'gb2313' 'gb2313-1980' ))
   586 
   588 
   587         (GB2312_1980_0      unicode     ( 'gb2312' 'gb2312.1980' 'gb2312.1980-0'))
   589         (GB2312_1980_0          unicode     ( 'gb2312' 'gb2312.1980' 'gb2312.1980-0'))
   588 
   590 
   589         (HANGUL             unicode     ( 'hangul' ))
   591         (HANGUL                 unicode     ( 'hangul' ))
   590 
   592 
   591         (ISO10646_1         unicode     ( unicode 'iso10646_1' 'iso10646-1' 'iso-10646-1' ))
   593         (ISO10646_1             unicode     ( unicode 'iso10646_1' 'iso10646-1' 'iso-10646-1' ))
   592 
   594 
   593         (ISO10646_to_UTF8     unicode   ( utf8 'utf-8' ))
   595         (ISO10646_to_UTF8       unicode   ( utf8 'utf-8' ))
   594         (ISO10646_to_UTF16BE  unicode   ( utf16b utf16be 'utf-16b' 'utf-16be' ))
   596         (ISO10646_to_UTF16BE    unicode   ( utf16b utf16be 'utf-16b' 'utf-16be' ))
   595         (ISO10646_to_UTF16LE  unicode   ( utf16l utf16le 'utf-16e' 'utf-16le' ))
   597         (ISO10646_to_UTF16LE    unicode   ( utf16l utf16le 'utf-16e' 'utf-16le' ))
   596 
   598 
   597         (ISO10646_to_UTF8_MAC unicode   ( 'utf8-mac' 'utf-8-mac' ))
   599         (ISO10646_to_UTF8_MAC   unicode   ( 'utf8-mac' 'utf-8-mac' ))
   598         (ISO10646_to_XMLUTF8  unicode   ( 'utf8-XML' ))
   600         (ISO10646_to_XMLUTF8    unicode   ( 'utf8-XML' ))
   599 
   601 
   600         (ISO8859_1          unicode     ( 'iso8859_1' 'iso8859-1' 'iso-8859-1' 'latin-1' 'latin1' 'iso-ir-100' 'ibm-819' 'ms-cp819' 'ibm-cp819' 'iso8859'))
   602         (ISO8859_1              unicode     ( 'iso8859_1' 'iso8859-1' 'iso-8859-1' 'latin-1' 'latin1' 'iso-ir-100' 'ibm-819' 'ms-cp819' 'ibm-cp819' 'iso8859'))
   601 
   603 
   602         (ISO8859_2          unicode     ( 'iso8859_2' 'iso8859-2' 'iso-8859-2' 'latin2' 'latin-2' 'iso-ir-101'))
   604         (ISO8859_2              unicode     ( 'iso8859_2' 'iso8859-2' 'iso-8859-2' 'latin2' 'latin-2' 'iso-ir-101'))
   603 
   605 
   604         (ISO8859_3          unicode     ( 'iso8859_3' 'iso8859-3' 'iso-8859-3' 'latin3' 'latin-3' 'iso-ir-109'))
   606         (ISO8859_3              unicode     ( 'iso8859_3' 'iso8859-3' 'iso-8859-3' 'latin3' 'latin-3' 'iso-ir-109'))
   605 
   607 
   606         (ISO8859_4          unicode     ( 'iso8859_4' 'iso8859-4' 'iso-8859-4' 'latin4' 'latin-4' 'iso-ir-110'))
   608         (ISO8859_4              unicode     ( 'iso8859_4' 'iso8859-4' 'iso-8859-4' 'latin4' 'latin-4' 'iso-ir-110'))
   607 
   609 
   608         (ISO8859_5          unicode     ( 'iso8859_5' 'iso8859-5' 'iso-8859-5' 'cyrillic' 'iso-ir-144' ))
   610         (ISO8859_5              unicode     ( 'iso8859_5' 'iso8859-5' 'iso-8859-5' 'cyrillic' 'iso-ir-144' ))
   609 
   611 
   610         (ISO8859_6          unicode     ( 'iso8859_6' 'iso8859-6' 'iso-8859-6' 'arabic' 'asmo-708' 'ecma-114' 'iso-ir-127' ))
   612         (ISO8859_6              unicode     ( 'iso8859_6' 'iso8859-6' 'iso-8859-6' 'arabic' 'asmo-708' 'ecma-114' 'iso-ir-127' ))
   611 
   613 
   612         (ISO8859_7          unicode     ( 'iso8859_7' 'iso8859-7' 'iso-8859-7' 'greek' 'iso-ir-126' 'ecma-118'))
   614         (ISO8859_7              unicode     ( 'iso8859_7' 'iso8859-7' 'iso-8859-7' 'greek' 'iso-ir-126' 'ecma-118'))
   613 
   615 
   614         (ISO8859_8          unicode     ( 'iso8859_8' 'iso8859-8' 'iso-8859-8' 'hebrew' 'iso-ir-138' ))
   616         (ISO8859_8              unicode     ( 'iso8859_8' 'iso8859-8' 'iso-8859-8' 'hebrew' 'iso-ir-138' ))
   615 
   617 
   616         (ISO8859_9          unicode     ( 'iso8859_9' 'iso8859-9' 'iso-8859-9' 'latin5' 'latin-5' 'iso-ir-148'))
   618         (ISO8859_9              unicode     ( 'iso8859_9' 'iso8859-9' 'iso-8859-9' 'latin5' 'latin-5' 'iso-ir-148'))
   617 
   619 
   618         (ISO8859_10         unicode     ( 'iso8859_10' 'iso8859-10' 'iso-8859-10' 'latin6' 'latin-6' 'iso-ir-157'))
   620         (ISO8859_10             unicode     ( 'iso8859_10' 'iso8859-10' 'iso-8859-10' 'latin6' 'latin-6' 'iso-ir-157'))
   619 
   621 
   620         (ISO8859_11         unicode     ( 'iso8859_11' 'iso8859-11' 'iso-8859-11' 'thai' ))
   622         (ISO8859_11             unicode     ( 'iso8859_11' 'iso8859-11' 'iso-8859-11' 'thai' ))
   621 
   623 
   622         (ISO8859_13         unicode     ( 'iso8859_13' 'iso8859-13' 'iso-8859-13' 'latin7' 'latin-7' ))
   624         (ISO8859_13             unicode     ( 'iso8859_13' 'iso8859-13' 'iso-8859-13' 'latin7' 'latin-7' ))
   623 
   625 
   624         (ISO8859_14         unicode     ( 'iso8859_14' 'iso8859-14' 'iso-8859-14' 'latin8' 'latin-8' 'latin-celtic' ))
   626         (ISO8859_14             unicode     ( 'iso8859_14' 'iso8859-14' 'iso-8859-14' 'latin8' 'latin-8' 'latin-celtic' ))
   625 
   627 
   626         (ISO8859_15         unicode     ( 'iso8859_15' 'iso8859-15' 'iso-8859-15' 'latin9' 'latin-9' 'iso-ir-203'))
   628         (ISO8859_15             unicode     ( 'iso8859_15' 'iso8859-15' 'iso-8859-15' 'latin9' 'latin-9' 'iso-ir-203'))
   627 
   629 
   628         (ISO8859_16         unicode     ( 'iso8859_16' 'iso8859-16' 'iso-8859-16' 'latin10' 'latin-10' ))
   630         (ISO8859_16             unicode     ( 'iso8859_16' 'iso8859-16' 'iso-8859-16' 'latin10' 'latin-10' ))
   629 
   631 
   630         (JIS0201            unicode     ( 'jis0201' #'jisx0201.1976-0'))
   632         (JIS0201                unicode     ( 'jis0201' #'jisx0201.1976-0'))
   631 
   633 
   632         (JIS0208            unicode     ( jis0208 'jisx0208' 'jisx0208.1983-0' 'jisx0208.1990-0'))
   634         (JIS0208                unicode     ( jis0208 'jisx0208' 'jisx0208.1983-0' 'jisx0208.1990-0'))
   633 
   635 
   634         (JIS0208_to_JIS7    jis0208     ( jis7 'jis-7' 'x-jis7' 'x-iso2022-jp' 'iso2022-jp'))
   636         (JIS0208_to_JIS7        jis0208     ( jis7 'jis-7' 'x-jis7' 'x-iso2022-jp' 'iso2022-jp'))
   635 
   637 
   636         (JIS0208_to_EUC     jis0208     ( euc #'x-euc-jp' ))
   638         (JIS0208_to_EUC         jis0208     ( euc #'x-euc-jp' ))
   637 
   639 
   638         (JIS0208_to_SJIS    jis0208     ( 'sjis' 'shiftjis' 'x-sjis' #'x-shift-jis' #'shift-jis'))
   640         (JIS0208_to_SJIS        jis0208     ( 'sjis' 'shiftjis' 'x-sjis' #'x-shift-jis' #'shift-jis'))
   639 
   641 
   640         (JIS0212            unicode     ( 'jis0212' ))
   642         (JIS0212                unicode     ( 'jis0212' ))
   641 
   643 
   642         (JOHAB              unicode     ( 'johab' ))
   644         (JOHAB                  unicode     ( 'johab' ))
   643 
   645 
   644         (KOI7               unicode     ( 'koi7' ))
   646         (KOI7                   unicode     ( 'koi7' ))
   645 
   647 
   646         (KOI8_R             unicode     ( #'koi8-r' 'cp878' ))
   648         (KOI8_R                 unicode     ( #'koi8-r' 'cp878' ))
   647 
   649 
   648         (KOI8_U             unicode     ( #'koi8-u' ))
   650         (KOI8_U                 unicode     ( #'koi8-u' ))
   649 
   651 
   650         (KSC5601            unicode     ( #'ksc5601' ))
   652         (KSC5601                unicode     ( #'ksc5601' ))
   651 
   653 
   652         (MAC_Arabic         unicode     ( #'mac-arabic' 'macarabic' ))
   654         (MAC_Arabic             unicode     ( #'mac-arabic' 'macarabic' ))
   653 
   655 
   654         (MAC_CentralEuropean unicode    ( #'mac-centraleuropean' #'mac-centraleurope' 'maccentraleurope' 'maccentraleuropean' ))
   656         (MAC_CentralEuropean    unicode    ( #'mac-centraleuropean' #'mac-centraleurope' 'maccentraleurope' 'maccentraleuropean' ))
   655 
   657 
   656         (MAC_Croatian       unicode     ( #'mac-croatian' 'maccroatian'))
   658         (MAC_Croatian           unicode     ( #'mac-croatian' 'maccroatian'))
   657 
   659 
   658         (MAC_Cyrillic       unicode     ( #'mac-cyrillic' 'maccyrillic' ))
   660         (MAC_Cyrillic           unicode     ( #'mac-cyrillic' 'maccyrillic' ))
   659 
   661 
   660         (MAC_Dingbats       unicode     ( #'mac-dingbats'  'macdingbats'  'macdingbat'))
   662         (MAC_Dingbats           unicode     ( #'mac-dingbats'  'macdingbats'  'macdingbat'))
   661 
   663 
   662         (MAC_Farsi          unicode     ( #'mac-farsi' 'macfarsi' ))
   664         (MAC_Farsi              unicode     ( #'mac-farsi' 'macfarsi' ))
   663 
   665 
   664         (MAC_Greek          unicode     ( #'mac-greek' #'macgreek' ))
   666         (MAC_Greek              unicode     ( #'mac-greek' #'macgreek' ))
   665 
   667 
   666         (MAC_Hebrew         unicode     ( #'mac-hebrew' #'machebrew'  ))
   668         (MAC_Hebrew             unicode     ( #'mac-hebrew' #'machebrew'  ))
   667 
   669 
   668         (MAC_Iceland        unicode     ( #'mac-iceland' #'maciceland'  ))
   670         (MAC_Iceland            unicode     ( #'mac-iceland' #'maciceland'  ))
   669 
   671 
   670         (MAC_Japanese       unicode     ( #'mac-japanese' #'macjapanese'  ))
   672         (MAC_Japanese           unicode     ( #'mac-japanese' #'macjapanese'  ))
   671 
   673 
   672         (MAC_Korean         unicode     ( #'mac-korean' #'mackorean'  ))
   674         (MAC_Korean             unicode     ( #'mac-korean' #'mackorean'  ))
   673 
   675 
   674         (MAC_Roman          unicode     ( #'mac-roman' #'macroman' 'macintosh' 'cp10000' ))
   676         (MAC_Roman              unicode     ( #'mac-roman' #'macroman' 'macintosh' 'cp10000' ))
   675 
   677 
   676         (MAC_Romanian       unicode     ( #'mac-romanian' #'macromanian'  ))
   678         (MAC_Romanian           unicode     ( #'mac-romanian' #'macromanian'  ))
   677 
   679 
   678         (MAC_Symbol         unicode     ( #'mac-symbol' #'macsymbol'  ))
   680         (MAC_Symbol             unicode     ( #'mac-symbol' #'macsymbol'  ))
   679 
   681 
   680         (MAC_Thai           unicode     ( #'mac-thai' #'macthai'  ))
   682         (MAC_Thai               unicode     ( #'mac-thai' #'macthai'  ))
   681 
   683 
   682         (MAC_Turkish        unicode     ( #'mac-turkish' #'macturkish'  ))
   684         (MAC_Turkish            unicode     ( #'mac-turkish' #'macturkish'  ))
   683 
   685 
   684         (MS_Ansi            unicode     ( #'ms-ansi' 'ms-cp1252' 'microsoft-cp1252' 'cp1252' 'microsoft-ansi' 'windows-1252' 'windows-latin1'))
   686         (MS_Ansi                unicode     ( #'ms-ansi' 'ms-cp1252' 'microsoft-cp1252' 'cp1252' 'microsoft-ansi' 'windows-1252' 'windows-latin1'))
   685 
   687 
   686         (MS_Arabic          unicode     ( 'ms-arabic' 'ms-cp1256' 'microsoft-cp1256' 'cp1256'  'microsoft-arabic' 'windows-1256'  ))
   688         (MS_Arabic              unicode     ( 'ms-arabic' 'ms-cp1256' 'microsoft-cp1256' 'cp1256'  'microsoft-arabic' 'windows-1256'  ))
   687 
   689 
   688         (MS_Baltic          unicode     ( 'ms-baltic' 'ms-cp1257' 'microsoft-cp1257' 'cp1257' 'microsoft-baltic' 'windows-1257'  ))
   690         (MS_Baltic              unicode     ( 'ms-baltic' 'ms-cp1257' 'microsoft-cp1257' 'cp1257' 'microsoft-baltic' 'windows-1257'  ))
   689 
   691 
   690         (MS_Cyrillic        unicode     ( 'ms-cyrillic' 'ms-cp1251' 'microsoft-cp1251' 'cp1251' 'microsoft-cyrillic' 'windows-1251'  ))
   692         (MS_Cyrillic            unicode     ( 'ms-cyrillic' 'ms-cp1251' 'microsoft-cp1251' 'cp1251' 'microsoft-cyrillic' 'windows-1251'  ))
   691 
   693 
   692         (MS_EastEuropean    unicode     ( 'ms-easteuropean' 'ms-ee' 'cp1250' 'ms-cp1250' 'microsoft-cp1250' 'microsoft-easteuropean' 'windows-1250'  ))
   694         (MS_EastEuropean        unicode     ( 'ms-easteuropean' 'ms-ee' 'cp1250' 'ms-cp1250' 'microsoft-cp1250' 'microsoft-easteuropean' 'windows-1250'  ))
   693 
   695 
   694         (MS_Greek           unicode     ( 'ms-greek' 'ms-cp1253' 'microsoft-cp1253' 'cp1253' 'microsoft-greek' 'windows-1253' ))
   696         (MS_Greek               unicode     ( 'ms-greek' 'ms-cp1253' 'microsoft-cp1253' 'cp1253' 'microsoft-greek' 'windows-1253' ))
   695 
   697 
   696         (MS_Hebrew          unicode     ( 'ms-hebrew' 'ms-cp1255' 'microsoft-cp1255' 'cp1255' 'microsoft-hebrew' 'windows-1255' ))
   698         (MS_Hebrew              unicode     ( 'ms-hebrew' 'ms-cp1255' 'microsoft-cp1255' 'cp1255' 'microsoft-hebrew' 'windows-1255' ))
   697 
   699 
   698 "/        (MS_Symbol           unicode     ( 'ms-symbol' 'microsoft-symbol'  ))
   700 "/        (MS_Symbol           unicode     ( 'ms-symbol' 'microsoft-symbol'  ))
   699 
   701 
   700         (MS_Turkish         unicode     ( 'ms-turkish' 'ms-cp1254' 'microsoft-cp1254' 'cp1254' 'microsoft-turkish' 'windows-1254'  ))
   702         (MS_Turkish             unicode     ( 'ms-turkish' 'ms-cp1254' 'microsoft-cp1254' 'cp1254' 'microsoft-turkish' 'windows-1254'  ))
   701 
   703 
   702         (NEXT               unicode     ( 'next' 'nextstep'  ))
   704         (NEXT                   unicode     ( 'next' 'nextstep'  ))
   703 
   705 
   704         (ISO10646_to_SGML       unicode     ( 'sgml' ))
   706         (ISO10646_to_SGML       unicode     ( 'sgml' ))
   705         (ISO10646_to_JavaText   unicode     ( 'java' 'javaText' ))
   707         (ISO10646_to_JavaText   unicode     ( 'java' 'javaText' ))
       
   708 
       
   709         (AdobeStandard          unicode     ( 'Adobe Standard' 'AdobeStandard' 'Adobe' 'adobe-standard' ))
   706     ) triplesDo:[:className :decodesTo :encodesTo |
   710     ) triplesDo:[:className :decodesTo :encodesTo |
   707         |decodesToDict|
   711         |decodesToDict|
   708 
   712 
   709         "/ notice that the encoders are not yet installed as autoloaded.
   713         "/ notice that the encoders are not yet installed as autoloaded.
   710         "/ Therefore, we remember their names here.
   714         "/ Therefore, we remember their names here.
   909 mapFileURL1_codeColumn
   913 mapFileURL1_codeColumn
   910     ^ 1
   914     ^ 1
   911 !
   915 !
   912 
   916 
   913 mapFileURL1_relativePathName
   917 mapFileURL1_relativePathName
   914     "raise an error: must be redefined in concrete subclass(es)"
   918     "must be redefined in concrete subclass(es)"
   915     
   919     
   916     ^ nil
   920     ^ nil
   917 !
   921 !
   918 
   922 
   919 mapFileURL2_relativePathName
   923 mapFileURL2_relativePathName
   920     "raise an error: must be redefined in concrete subclass(es)"
   924     "must be redefined in concrete subclass(es)"
   921     
   925     
   922     ^ nil
   926     ^ nil
   923 !
   927 !
   924 
   928 
   925 mappingURL1
   929 mappingURL1
   926     "raise an error: must be redefined in concrete subclass(es)"
       
   927     
       
   928     |rel|
   930     |rel|
   929 
   931 
   930     rel := self mapFileURL1_relativePathName.
   932     rel := self mapFileURL1_relativePathName.
   931     rel isNil ifTrue:[
   933     rel isNil ifTrue:[
   932         ^ nil
   934         ^ nil
   933     ].
   935     ].
   934     ^ 'http://www.unicode.org/Public/MAPPINGS/' , rel
   936     ^ 'http://www.unicode.org/Public/MAPPINGS/' , rel
   935 !
   937 !
   936 
   938 
   937 mappingURL2
   939 mappingURL2
   938     "raise an error: must be redefined in concrete subclass(es)"
       
   939     
       
   940     |rel|
   940     |rel|
   941 
   941 
   942     rel := self mapFileURL2_relativePathName.
   942     rel := self mapFileURL2_relativePathName.
   943     rel isNil ifTrue:[
   943     rel isNil ifTrue:[
   944         ^ nil
   944         ^ nil