--- a/CharacterEncoder.st Tue Feb 04 21:09:59 2014 +0100
+++ b/CharacterEncoder.st Wed Apr 01 10:20:10 2015 +0100
@@ -1,3 +1,5 @@
+"{ Encoding: utf8 }"
+
"
COPYRIGHT (c) 2004 by eXept Software AG
All Rights Reserved
@@ -11,10 +13,12 @@
"
"{ Package: 'stx:libbasic' }"
+"{ NameSpace: Smalltalk }"
+
Object subclass:#CharacterEncoder
instanceVariableNames:''
- classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders LastEncoder
- AccessLock NullEncoderInstance Jis7KanjiEscapeSequence
+ classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders AccessLock
+ NullEncoderInstance Jis7KanjiEscapeSequence
Jis7RomanEscapeSequence JisISO2022EscapeSequence
Jis7KanjiOldEscapeSequence'
poolDictionaries:''
@@ -144,8 +148,6 @@
In order to add another coder (for example: for EBCDIC or ms-codePage 278),
perform the following steps:
- - create a private subclass of CharacterEncoder named (for example) CP267.
-
- create a public subclass of CharacterEncoderImplementations::CharacterEncoderImplementation named (for example) CharacterEncoderImplementations::CP267.
- define the mappingURL1_relativeName (if the table is found on 'www.unicode.org')
@@ -155,7 +157,7 @@
In this example, the table from 'std.dkuug.dk' is used, and named 'EBCDIC-CP-FI' there.
- - generate code by evaluating:
+ - generate code by evaluating (make sure that CharacterEncoderGenerator is loaded from stx:goodies):
CharacterEncoder::CP267 generateCode
Thats all !!
@@ -223,11 +225,11 @@
ifAbsent:[
"/ proceed to ignore this error in the future.
- (EncodersByName at:#unicode) at:encodingNameSymbol put:NullEncoderInstance.
- (EncoderClassesByName at:#unicode) at:encodingNameSymbol put:NullEncoder.
+"/ (EncodersByName at:#unicode) at:encodingNameSymbol put:NullEncoderInstance.
+"/ (EncoderClassesByName at:#unicode) at:encodingNameSymbol put:NullEncoder.
"/ self error:'no encoder for ' , encodingNameSymbol mayProceed:true.
- ('CharacterEncoder [warning]: no encoder for ' , encodingNameSymbol) infoPrintCR.
+ ('CharacterEncoder [warning]: no encoder for "' , encodingNameSymbol,'"') infoPrintCR.
NullEncoderInstance
]
@@ -268,7 +270,7 @@
encodingNameSymbol := encodingNameSymbolArg.
encodingNameSymbol isNil ifTrue:[ ^ NullEncoderInstance].
- encodingNameSymbol == #'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode].
+ encodingNameSymbol = 'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode].
lcName := encodingNameSymbol asLowercase.
name := lcName asSymbolIfInterned.
@@ -417,6 +419,8 @@
self encoderFor:#'jis0208'
self encoderFor:#'jis7'
self encoderFor:#'unicode'
+ self encoderFor:#'UTF-8'
+ self encoderFor:'UTF-8'
"
"Modified: / 12-07-2012 / 19:45:58 / cg"
@@ -548,6 +552,8 @@
initialize
|ud|
+ AccessLock notNil ifTrue:[^ self]. "/ already initialized
+
AccessLock := RecursionLock new name:'CharacterEncoder'.
NullEncoderInstance := NullEncoder new.
@@ -561,7 +567,7 @@
ud at:#'ms-oem' put:NullEncoder.
ud at:#'ms-default' put:NullEncoder.
- "/ className decoded-name array-of-encodingNames
+ "/ className decoded-name array-of-encodingNames
#(
(ASCII unicode ( ascii 'us-ascii' 'iso-ir-6' 'ibm-367' 'ms-cp367' 'cp367' 'iso646-us' 'ibm-cp367' 'ansi_x3.4-1968' ))
@@ -573,15 +579,20 @@
(EBCDIC unicode ( 'ebcdic' ))
- (GB2313_1980 unicode ( 'gb2313' 'gb2313-1980' ))
+"/ (GB2313_1980 unicode ( 'gb2313' 'gb2313-1980' ))
+
+ (GB2312_1980_0 unicode ( 'gb2312' 'gb2312.1980' 'gb2312.1980-0'))
(HANGUL unicode ( 'hangul' ))
(ISO10646_1 unicode ( unicode 'iso10646_1' 'iso10646-1' 'iso-10646-1' ))
- (ISO10646_to_UTF8 unicode ( utf8 'utf-8' ))
- (ISO10646_to_UTF16BE unicode ( utf16b utf16be 'utf-16b' 'utf-16be' ))
- (ISO10646_to_UTF16LE unicode ( utf16l utf16le 'utf-16e' 'utf-16le' ))
+ (ISO10646_to_UTF8 unicode ( utf8 'utf-8' ))
+ (ISO10646_to_UTF16BE unicode ( utf16b utf16be 'utf-16b' 'utf-16be' ))
+ (ISO10646_to_UTF16LE unicode ( utf16l utf16le 'utf-16e' 'utf-16le' ))
+
+ (ISO10646_to_UTF8_MAC unicode ( 'utf8-mac' 'utf-8-mac' ))
+ (ISO10646_to_XMLUTF8 unicode ( 'utf8-XML' ))
(ISO8859_1 unicode ( 'iso8859_1' 'iso8859-1' 'iso-8859-1' 'latin-1' 'latin1' 'iso-ir-100' 'ibm-819' 'ms-cp819' 'ibm-cp819' 'iso8859'))
@@ -657,7 +668,7 @@
(MAC_Korean unicode ( #'mac-korean' #'mackorean' ))
- (MAC_Roman unicode ( #'mac-roman' #'macroman' ))
+ (MAC_Roman unicode ( #'mac-roman' #'macroman' 'macintosh' 'cp10000' ))
(MAC_Romanian unicode ( #'mac-romanian' #'macromanian' ))
@@ -1014,6 +1025,8 @@
('iso8859-15' 'Western with Euro' )
('iso8859-16' 'South European with Euro' )
"/ nil
+ ('macintosh' 'MAC Western' )
+"/ nil
('koi7' 'Cyrillic (Old)' )
('koi8-r' 'Cyrillic' )
('koi8-u' 'Cyrillic (Ukraine)' )
@@ -1127,7 +1140,7 @@
quote := s next.
w := s upTo:quote.
] ifFalse:[
- w := s upToMatching:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
+ w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
].
w notNil ifTrue:[
enc := w withoutQuotes.
@@ -1314,6 +1327,14 @@
newString at:idx put:(Character value:myCode).
].
^ newString
+!
+
+encodeString:aUnicodeString on:aStream
+    "encode aUnicodeString (a string in unicode) via #encodeString: and write the result onto aStream.
+     Subclasses may redefine this to avoid allocating new strings, but must then also redefine #encodeString:."
+    |encodedString|
+    encodedString := self encodeString:aUnicodeString.
+    aStream nextPutAll:encodedString.
! !
!CharacterEncoder methodsFor:'error handling'!
@@ -1397,6 +1418,17 @@
"Created: / 15-06-2005 / 15:11:04 / janfrog"
!
+isEncoderFor:encoding
+    "answer true, if the lowercased name of encoding equals the receiver's nameOfEncoding.
+     The name 'iso10646-1' is treated as an alias for #unicode."
+
+    |requestedName|
+
+    requestedName := encoding asLowercase.
+    requestedName = #'iso10646-1' ifTrue:[ ^ #unicode = self nameOfEncoding ].
+    ^ requestedName = self nameOfEncoding
+!
+
isNullEncoder
^ false
!
@@ -1683,16 +1715,11 @@
!CharacterEncoder class methodsFor:'documentation'!
version
- ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.123 2013-08-10 11:13:37 stefan Exp $'
+ ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.138 2015-03-26 16:21:01 cg Exp $'
!
version_CVS
- ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.123 2013-08-10 11:13:37 stefan Exp $'
-!
-
-version_HG
-
- ^ '$Changeset: <not expanded> $'
+ ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.138 2015-03-26 16:21:01 cg Exp $'
! !