diff -r cb7a12afe736 -r e3a375d5f6a8 CharacterEncoder.st --- a/CharacterEncoder.st Tue Feb 04 21:09:59 2014 +0100 +++ b/CharacterEncoder.st Wed Apr 01 10:20:10 2015 +0100 @@ -1,3 +1,5 @@ +"{ Encoding: utf8 }" + " COPYRIGHT (c) 2004 by eXept Software AG All Rights Reserved @@ -11,10 +13,12 @@ " "{ Package: 'stx:libbasic' }" +"{ NameSpace: Smalltalk }" + Object subclass:#CharacterEncoder instanceVariableNames:'' - classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders LastEncoder - AccessLock NullEncoderInstance Jis7KanjiEscapeSequence + classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders AccessLock + NullEncoderInstance Jis7KanjiEscapeSequence Jis7RomanEscapeSequence JisISO2022EscapeSequence Jis7KanjiOldEscapeSequence' poolDictionaries:'' @@ -144,8 +148,6 @@ In order to add another coder (for example: for EBCDIC or ms-codePage 278), perform the following steps: - - create a private subclass of CharacterEncoder named (for example) CP267. - - create a public subclass of CharacterEncoderImplementations::CharacterEncoderImplementation named (for example) CharacterEncoderImplementations::CP267. - define the mappingURL1_relativeName (if the table is found on 'www.unicode.org') @@ -155,7 +157,7 @@ In this example, the table from 'std.dkuug.dk' is used, and named 'EBCDIC-CP-FI' there. - - generate code by evaluating: + - generate code by evaluating (make sure that CharacterEncoderGenerator is loaded from stx:goodies): CharacterEncoder::CP267 generateCode Thats all !! @@ -223,11 +225,11 @@ ifAbsent:[ "/ proceed to ignore this error in the future. - (EncodersByName at:#unicode) at:encodingNameSymbol put:NullEncoderInstance. - (EncoderClassesByName at:#unicode) at:encodingNameSymbol put:NullEncoder. +"/ (EncodersByName at:#unicode) at:encodingNameSymbol put:NullEncoderInstance. +"/ (EncoderClassesByName at:#unicode) at:encodingNameSymbol put:NullEncoder. "/ self error:'no encoder for ' , encodingNameSymbol mayProceed:true. 
- ('CharacterEncoder [warning]: no encoder for ' , encodingNameSymbol) infoPrintCR. + ('CharacterEncoder [warning]: no encoder for "' , encodingNameSymbol,'"') infoPrintCR. NullEncoderInstance ] @@ -268,7 +270,7 @@ encodingNameSymbol := encodingNameSymbolArg. encodingNameSymbol isNil ifTrue:[ ^ NullEncoderInstance]. - encodingNameSymbol == #'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode]. + encodingNameSymbol = 'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode]. lcName := encodingNameSymbol asLowercase. name := lcName asSymbolIfInterned. @@ -417,6 +419,8 @@ self encoderFor:#'jis0208' self encoderFor:#'jis7' self encoderFor:#'unicode' + self encoderFor:#'UTF-8' + self encoderFor:'UTF-8' " "Modified: / 12-07-2012 / 19:45:58 / cg" @@ -548,6 +552,8 @@ initialize |ud| + AccessLock notNil ifTrue:[^ self]. "/ already initialized + AccessLock := RecursionLock new name:'CharacterEncoder'. NullEncoderInstance := NullEncoder new. @@ -561,7 +567,7 @@ ud at:#'ms-oem' put:NullEncoder. ud at:#'ms-default' put:NullEncoder. 
- "/ className decoded-name array-of-encodingNames + "/ className decoded-name array-of-encodingNames #( (ASCII unicode ( ascii 'us-ascii' 'iso-ir-6' 'ibm-367' 'ms-cp367' 'cp367' 'iso646-us' 'ibm-cp367' 'ansi_x3.4-1968' )) @@ -573,15 +579,20 @@ (EBCDIC unicode ( 'ebcdic' )) - (GB2313_1980 unicode ( 'gb2313' 'gb2313-1980' )) +"/ (GB2313_1980 unicode ( 'gb2313' 'gb2313-1980' )) + + (GB2312_1980_0 unicode ( 'gb2312' 'gb2312.1980' 'gb2312.1980-0')) (HANGUL unicode ( 'hangul' )) (ISO10646_1 unicode ( unicode 'iso10646_1' 'iso10646-1' 'iso-10646-1' )) - (ISO10646_to_UTF8 unicode ( utf8 'utf-8' )) - (ISO10646_to_UTF16BE unicode ( utf16b utf16be 'utf-16b' 'utf-16be' )) - (ISO10646_to_UTF16LE unicode ( utf16l utf16le 'utf-16e' 'utf-16le' )) + (ISO10646_to_UTF8 unicode ( utf8 'utf-8' )) + (ISO10646_to_UTF16BE unicode ( utf16b utf16be 'utf-16b' 'utf-16be' )) + (ISO10646_to_UTF16LE unicode ( utf16l utf16le 'utf-16e' 'utf-16le' )) + + (ISO10646_to_UTF8_MAC unicode ( 'utf8-mac' 'utf-8-mac' )) + (ISO10646_to_XMLUTF8 unicode ( 'utf8-XML' )) (ISO8859_1 unicode ( 'iso8859_1' 'iso8859-1' 'iso-8859-1' 'latin-1' 'latin1' 'iso-ir-100' 'ibm-819' 'ms-cp819' 'ibm-cp819' 'iso8859')) @@ -657,7 +668,7 @@ (MAC_Korean unicode ( #'mac-korean' #'mackorean' )) - (MAC_Roman unicode ( #'mac-roman' #'macroman' )) + (MAC_Roman unicode ( #'mac-roman' #'macroman' 'macintosh' 'cp10000' )) (MAC_Romanian unicode ( #'mac-romanian' #'macromanian' )) @@ -1014,6 +1025,8 @@ ('iso8859-15' 'Western with Euro' ) ('iso8859-16' 'South European with Euro' ) "/ nil + ('macintosh' 'MAC Western' ) +"/ nil ('koi7' 'Cyrillic (Old)' ) ('koi8-r' 'Cyrillic' ) ('koi8-u' 'Cyrillic (Ukraine)' ) @@ -1127,7 +1140,7 @@ quote := s next. w := s upTo:quote. ] ifFalse:[ - w := s upToMatching:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]]. + w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]]. ]. w notNil ifTrue:[ enc := w withoutQuotes. 
@@ -1314,6 +1327,14 @@ newString at:idx put:(Character value:myCode). ]. ^ newString +! + +encodeString:aUnicodeString on:aStream + "given a string in unicode, encode it and write the result onto aStream (this default writes the result of encodeString: with nextPutAll:). + Subclasses can redefine this to avoid allocating many new string instances. + (but must then also redefine encodeString:aUnicodeString to collect the characters)" + + aStream nextPutAll:(self encodeString:aUnicodeString). ! ! !CharacterEncoder methodsFor:'error handling'! @@ -1397,6 +1418,17 @@ "Created: / 15-06-2005 / 15:11:04 / janfrog" ! +isEncoderFor:encoding + "answer true if the receiver encodes to the given encoding; the given name is lowercased and iso10646-1 is mapped to unicode before it is compared with nameOfEncoding" + + |encodingNameSymbol| + + encodingNameSymbol := encoding asLowercase. + encodingNameSymbol = #'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode]. + + ^ encodingNameSymbol = self nameOfEncoding +! + isNullEncoder ^ false ! @@ -1683,16 +1715,11 @@ !CharacterEncoder class methodsFor:'documentation'! version - ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.123 2013-08-10 11:13:37 stefan Exp $' + ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.138 2015-03-26 16:21:01 cg Exp $' ! version_CVS - ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.123 2013-08-10 11:13:37 stefan Exp $' -! - -version_HG - - ^ '$Changeset: $' + ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.138 2015-03-26 16:21:01 cg Exp $' ! !