9 other person. No title to or ownership of the software is |
11 other person. No title to or ownership of the software is |
10 hereby transferred. |
12 hereby transferred. |
11 " |
13 " |
12 "{ Package: 'stx:libbasic' }" |
14 "{ Package: 'stx:libbasic' }" |
13 |
15 |
|
16 "{ NameSpace: Smalltalk }" |
|
17 |
14 Object subclass:#CharacterEncoder |
18 Object subclass:#CharacterEncoder |
15 instanceVariableNames:'' |
19 instanceVariableNames:'' |
16 classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders LastEncoder |
20 classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders AccessLock |
17 AccessLock NullEncoderInstance Jis7KanjiEscapeSequence |
21 NullEncoderInstance Jis7KanjiEscapeSequence |
18 Jis7RomanEscapeSequence JisISO2022EscapeSequence |
22 Jis7RomanEscapeSequence JisISO2022EscapeSequence |
19 Jis7KanjiOldEscapeSequence' |
23 Jis7KanjiOldEscapeSequence' |
20 poolDictionaries:'' |
24 poolDictionaries:'' |
21 category:'Collections-Text-Encodings' |
25 category:'Collections-Text-Encodings' |
22 ! |
26 ! |
142 or from the i18n character maps: |
146 or from the i18n character maps: |
143 (for example: 'http://std.dkuug.dk/i18n/charmaps/ISO-8859-1 |
147 (for example: 'http://std.dkuug.dk/i18n/charmaps/ISO-8859-1 |
144 |
148 |
145 In order to add another coder (for example: for EBCDIC or ms-codePage 278), |
149 In order to add another coder (for example: for EBCDIC or ms-codePage 278), |
146 perform the following steps: |
150 perform the following steps: |
147 - create a private subclass of CharacterEncoder named (for example) CP267. |
|
148 |
|
149 - create a public subclass of CharacterEncoderImplementations::CharacterEncoderImplementation named (for example) CharacterEncoderImplementations::CP267. |
151 - create a public subclass of CharacterEncoderImplementations::CharacterEncoderImplementation named (for example) CharacterEncoderImplementations::CP267. |
150 |
152 |
151 - define the mappingURL1_relativeName (if the table is found on 'www.unicode.org') |
153 - define the mappingURL1_relativeName (if the table is found on 'www.unicode.org') |
152 or the mappingURL2_relativeName (if it is found on 'std.dkuug.dk') method, which |
154 or the mappingURL2_relativeName (if it is found on 'std.dkuug.dk') method, which |
153 should return the name of the tables file, relative to the top directory there |
155 should return the name of the tables file, relative to the top directory there |
154 (which is '.../Public/MAPPINGS' on www.unicode.org and '.../i18n/charmaps' on 'std.dkuug.dk'). |
156 (which is '.../Public/MAPPINGS' on www.unicode.org and '.../i18n/charmaps' on 'std.dkuug.dk'). |
155 |
157 |
156 In this example, the table from 'std.dkuug.dk' is used, and named 'EBCDIC-CP-FI' there. |
158 In this example, the table from 'std.dkuug.dk' is used, and named 'EBCDIC-CP-FI' there. |
157 |
159 |
158 - generate code by evaluating: |
160 - generate code by evaluating (make sure that CharacterEncoderGenerator is loaded from stx:goodies): |
159 CharacterEncoder::CP267 generateCode |
161 CharacterEncoderImplementations::CP267 generateCode |
160 |
162 |
161 That is all! |
163 That is all! |
162 |
164 |
163 |
165 |
221 ^ self |
223 ^ self |
222 encoderFor:encodingNameSymbol |
224 encoderFor:encodingNameSymbol |
223 ifAbsent:[ |
225 ifAbsent:[ |
224 "/ proceed to ignore this error in the future. |
226 "/ proceed to ignore this error in the future. |
225 |
227 |
226 (EncodersByName at:#unicode) at:encodingNameSymbol put:NullEncoderInstance. |
228 "/ (EncodersByName at:#unicode) at:encodingNameSymbol put:NullEncoderInstance. |
227 (EncoderClassesByName at:#unicode) at:encodingNameSymbol put:NullEncoder. |
229 "/ (EncoderClassesByName at:#unicode) at:encodingNameSymbol put:NullEncoder. |
228 |
230 |
229 "/ self error:'no encoder for ' , encodingNameSymbol mayProceed:true. |
231 "/ self error:'no encoder for ' , encodingNameSymbol mayProceed:true. |
230 ('CharacterEncoder [warning]: no encoder for ' , encodingNameSymbol) infoPrintCR. |
232 ('CharacterEncoder [warning]: no encoder for "' , encodingNameSymbol,'"') infoPrintCR. |
231 |
233 |
232 NullEncoderInstance |
234 NullEncoderInstance |
233 ] |
235 ] |
234 |
236 |
235 " |
237 " |
266 |encodingNameSymbol enc clsName cls lcName name unicodeEncoders unicodeEncoderClasses| |
268 |encodingNameSymbol enc clsName cls lcName name unicodeEncoders unicodeEncoderClasses| |
267 |
269 |
268 encodingNameSymbol := encodingNameSymbolArg. |
270 encodingNameSymbol := encodingNameSymbolArg. |
269 encodingNameSymbol isNil ifTrue:[ ^ NullEncoderInstance]. |
271 encodingNameSymbol isNil ifTrue:[ ^ NullEncoderInstance]. |
270 |
272 |
271 encodingNameSymbol == #'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode]. |
273 encodingNameSymbol = 'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode]. |
272 |
274 |
273 lcName := encodingNameSymbol asLowercase. |
275 lcName := encodingNameSymbol asLowercase. |
274 name := lcName asSymbolIfInterned. |
276 name := lcName asSymbolIfInterned. |
275 name isNil ifTrue:[name := lcName]. |
277 name isNil ifTrue:[name := lcName]. |
276 |
278 |
415 self encoderFor:#'koi8-r' |
417 self encoderFor:#'koi8-r' |
416 self encoderFor:#'koi8-u' |
418 self encoderFor:#'koi8-u' |
417 self encoderFor:#'jis0208' |
419 self encoderFor:#'jis0208' |
418 self encoderFor:#'jis7' |
420 self encoderFor:#'jis7' |
419 self encoderFor:#'unicode' |
421 self encoderFor:#'unicode' |
|
422 self encoderFor:#'UTF-8' |
|
423 self encoderFor:'UTF-8' |
420 " |
424 " |
421 |
425 |
422 "Modified: / 12-07-2012 / 19:45:58 / cg" |
426 "Modified: / 12-07-2012 / 19:45:58 / cg" |
423 ! |
427 ! |
424 |
428 |
546 !CharacterEncoder class methodsFor:'class initialization'! |
550 !CharacterEncoder class methodsFor:'class initialization'! |
547 |
551 |
548 initialize |
552 initialize |
549 |ud| |
553 |ud| |
550 |
554 |
|
555 AccessLock notNil ifTrue:[^ self]. "/ already initialized |
|
556 |
551 AccessLock := RecursionLock new name:'CharacterEncoder'. |
557 AccessLock := RecursionLock new name:'CharacterEncoder'. |
552 NullEncoderInstance := NullEncoder new. |
558 NullEncoderInstance := NullEncoder new. |
553 |
559 |
554 EncodersByName := Dictionary new. |
560 EncodersByName := Dictionary new. |
555 EncoderClassesByName := Dictionary new. |
561 EncoderClassesByName := Dictionary new. |
559 ud at:#'fontspecific' put:NullEncoder. |
565 ud at:#'fontspecific' put:NullEncoder. |
560 ud at:#'adobe-fontspecific' put:NullEncoder. |
566 ud at:#'adobe-fontspecific' put:NullEncoder. |
561 ud at:#'ms-oem' put:NullEncoder. |
567 ud at:#'ms-oem' put:NullEncoder. |
562 ud at:#'ms-default' put:NullEncoder. |
568 ud at:#'ms-default' put:NullEncoder. |
563 |
569 |
564 "/ className decoded-name array-of-encodingNames |
570 "/ className decoded-name array-of-encodingNames |
565 #( |
571 #( |
566 (ASCII unicode ( ascii 'us-ascii' 'iso-ir-6' 'ibm-367' 'ms-cp367' 'cp367' 'iso646-us' 'ibm-cp367' 'ansi_x3.4-1968' )) |
572 (ASCII unicode ( ascii 'us-ascii' 'iso-ir-6' 'ibm-367' 'ms-cp367' 'cp367' 'iso646-us' 'ibm-cp367' 'ansi_x3.4-1968' )) |
567 |
573 |
568 (BIG5 unicode ( big5 )) |
574 (BIG5 unicode ( big5 )) |
569 |
575 |
571 |
577 |
572 (CP437 unicode ( 'cp437' 'cp-437' 'ibm-437' 'ms-cp437' 'microsoft-cp437' 'ibm-cp437' )) |
578 (CP437 unicode ( 'cp437' 'cp-437' 'ibm-437' 'ms-cp437' 'microsoft-cp437' 'ibm-cp437' )) |
573 |
579 |
574 (EBCDIC unicode ( 'ebcdic' )) |
580 (EBCDIC unicode ( 'ebcdic' )) |
575 |
581 |
576 (GB2313_1980 unicode ( 'gb2313' 'gb2313-1980' )) |
582 "/ (GB2313_1980 unicode ( 'gb2313' 'gb2313-1980' )) |
|
583 |
|
584 (GB2312_1980_0 unicode ( 'gb2312' 'gb2312.1980' 'gb2312.1980-0')) |
577 |
585 |
578 (HANGUL unicode ( 'hangul' )) |
586 (HANGUL unicode ( 'hangul' )) |
579 |
587 |
580 (ISO10646_1 unicode ( unicode 'iso10646_1' 'iso10646-1' 'iso-10646-1' )) |
588 (ISO10646_1 unicode ( unicode 'iso10646_1' 'iso10646-1' 'iso-10646-1' )) |
581 |
589 |
582 (ISO10646_to_UTF8 unicode ( utf8 'utf-8' )) |
590 (ISO10646_to_UTF8 unicode ( utf8 'utf-8' )) |
583 (ISO10646_to_UTF16BE unicode ( utf16b utf16be 'utf-16b' 'utf-16be' )) |
591 (ISO10646_to_UTF16BE unicode ( utf16b utf16be 'utf-16b' 'utf-16be' )) |
584 (ISO10646_to_UTF16LE unicode ( utf16l utf16le 'utf-16e' 'utf-16le' )) |
592 (ISO10646_to_UTF16LE unicode ( utf16l utf16le 'utf-16e' 'utf-16le' )) |
|
593 |
|
594 (ISO10646_to_UTF8_MAC unicode ( 'utf8-mac' 'utf-8-mac' )) |
|
595 (ISO10646_to_XMLUTF8 unicode ( 'utf8-XML' )) |
585 |
596 |
586 (ISO8859_1 unicode ( 'iso8859_1' 'iso8859-1' 'iso-8859-1' 'latin-1' 'latin1' 'iso-ir-100' 'ibm-819' 'ms-cp819' 'ibm-cp819' 'iso8859')) |
597 (ISO8859_1 unicode ( 'iso8859_1' 'iso8859-1' 'iso-8859-1' 'latin-1' 'latin1' 'iso-ir-100' 'ibm-819' 'ms-cp819' 'ibm-cp819' 'iso8859')) |
587 |
598 |
588 (ISO8859_2 unicode ( 'iso8859_2' 'iso8859-2' 'iso-8859-2' 'latin2' 'latin-2' 'iso-ir-101')) |
599 (ISO8859_2 unicode ( 'iso8859_2' 'iso8859-2' 'iso-8859-2' 'latin2' 'latin-2' 'iso-ir-101')) |
589 |
600 |
655 |
666 |
656 (MAC_Japanese unicode ( #'mac-japanese' #'macjapanese' )) |
667 (MAC_Japanese unicode ( #'mac-japanese' #'macjapanese' )) |
657 |
668 |
658 (MAC_Korean unicode ( #'mac-korean' #'mackorean' )) |
669 (MAC_Korean unicode ( #'mac-korean' #'mackorean' )) |
659 |
670 |
660 (MAC_Roman unicode ( #'mac-roman' #'macroman' )) |
671 (MAC_Roman unicode ( #'mac-roman' #'macroman' 'macintosh' 'cp10000' )) |
661 |
672 |
662 (MAC_Romanian unicode ( #'mac-romanian' #'macromanian' )) |
673 (MAC_Romanian unicode ( #'mac-romanian' #'macromanian' )) |
663 |
674 |
664 (MAC_Symbol unicode ( #'mac-symbol' #'macsymbol' )) |
675 (MAC_Symbol unicode ( #'mac-symbol' #'macsymbol' )) |
665 |
676 |
1011 ('iso8859-6' 'Arabic' ) |
1022 ('iso8859-6' 'Arabic' ) |
1012 ('iso8859-7' 'Greek' ) |
1023 ('iso8859-7' 'Greek' ) |
1013 ('iso8859-8' 'Hebrew' ) |
1024 ('iso8859-8' 'Hebrew' ) |
1014 ('iso8859-15' 'Western with Euro' ) |
1025 ('iso8859-15' 'Western with Euro' ) |
1015 ('iso8859-16' 'South European with Euro' ) |
1026 ('iso8859-16' 'South European with Euro' ) |
|
1027 "/ nil |
|
1028 ('macintosh' 'MAC Western' ) |
1016 "/ nil |
1029 "/ nil |
1017 ('koi7' 'Cyrillic (Old)' ) |
1030 ('koi7' 'Cyrillic (Old)' ) |
1018 ('koi8-r' 'Cyrillic' ) |
1031 ('koi8-r' 'Cyrillic' ) |
1019 ('koi8-u' 'Cyrillic (Ukraine)' ) |
1032 ('koi8-u' 'Cyrillic (Ukraine)' ) |
1020 "/ nil |
1033 "/ nil |
1125 s skipSeparators. |
1138 s skipSeparators. |
1126 ('"''' includes:s peek) ifTrue:[ |
1139 ('"''' includes:s peek) ifTrue:[ |
1127 quote := s next. |
1140 quote := s next. |
1128 w := s upTo:quote. |
1141 w := s upTo:quote. |
1129 ] ifFalse:[ |
1142 ] ifFalse:[ |
1130 w := s upToMatching:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]]. |
1143 w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]]. |
1131 ]. |
1144 ]. |
1132 w notNil ifTrue:[ |
1145 w notNil ifTrue:[ |
1133 enc := w withoutQuotes. |
1146 enc := w withoutQuotes. |
1134 (enc startsWith:'x-') ifTrue:[ |
1147 (enc startsWith:'x-') ifTrue:[ |
1135 enc := enc copyFrom:3. |
1148 enc := enc copyFrom:3. |
1312 ] |
1325 ] |
1313 ]. |
1326 ]. |
1314 newString at:idx put:(Character value:myCode). |
1327 newString at:idx put:(Character value:myCode). |
1315 ]. |
1328 ]. |
1316 ^ newString |
1329 ^ newString |
|
1330 ! |
|
1331 |
|
1332 encodeString:aUnicodeString on:aStream |
|
1333 "given a string in unicode, encode it onto aStream. |
|
1334 Subclasses can redefine this to avoid allocating many new string instances. |
|
1335 (but must then also redefine encodeString:aUnicodeString to collect the characters)" |
|
1336 |
|
1337 aStream nextPutAll:(self encodeString:aUnicodeString). |
1317 ! ! |
1338 ! ! |
1318 |
1339 |
1319 !CharacterEncoder methodsFor:'error handling'! |
1340 !CharacterEncoder methodsFor:'error handling'! |
1320 |
1341 |
1321 decodingError |
1342 decodingError |
1393 "return the number of bytes required to encode codePoint" |
1414 "return the number of bytes required to encode codePoint" |
1394 |
1415 |
1395 ^ self subclassResponsibility |
1416 ^ self subclassResponsibility |
1396 |
1417 |
1397 "Created: / 15-06-2005 / 15:11:04 / janfrog" |
1418 "Created: / 15-06-2005 / 15:11:04 / janfrog" |
|
1419 ! |
|
1420 |
|
1421 isEncoderFor:encoding |
|
1422 "does this encode to encoding?" |
|
1423 |
|
1424 |encodingNameSymbol| |
|
1425 |
|
1426 encodingNameSymbol := encoding asLowercase. |
|
1427 encodingNameSymbol = #'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode]. |
|
1428 |
|
1429 ^ encodingNameSymbol = self nameOfEncoding |
1398 ! |
1430 ! |
1399 |
1431 |
1400 isNullEncoder |
1432 isNullEncoder |
1401 ^ false |
1433 ^ false |
1402 ! |
1434 ! |
1681 ! ! |
1713 ! ! |
1682 |
1714 |
1683 !CharacterEncoder class methodsFor:'documentation'! |
1715 !CharacterEncoder class methodsFor:'documentation'! |
1684 |
1716 |
1685 version |
1717 version |
1686 ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.123 2013-08-10 11:13:37 stefan Exp $' |
1718 ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.138 2015-03-26 16:21:01 cg Exp $' |
1687 ! |
1719 ! |
1688 |
1720 |
1689 version_CVS |
1721 version_CVS |
1690 ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.123 2013-08-10 11:13:37 stefan Exp $' |
1722 ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.138 2015-03-26 16:21:01 cg Exp $' |
1691 ! |
|
1692 |
|
1693 version_HG |
|
1694 |
|
1695 ^ '$Changeset: <not expanded> $' |
|
1696 ! ! |
1723 ! ! |
1697 |
1724 |
1698 |
1725 |
1699 CharacterEncoder initialize! |
1726 CharacterEncoder initialize! |