CharacterEncoder.st
branchjv
changeset 18120 e3a375d5f6a8
parent 18084 ab5b38bd8f81
parent 17687 d5f0453d0899
child 18192 32a7c53ef4d0
--- a/CharacterEncoder.st	Tue Feb 04 21:09:59 2014 +0100
+++ b/CharacterEncoder.st	Wed Apr 01 10:20:10 2015 +0100
@@ -1,3 +1,5 @@
+"{ Encoding: utf8 }"
+
 "
  COPYRIGHT (c) 2004 by eXept Software AG
               All Rights Reserved
@@ -11,10 +13,12 @@
 "
 "{ Package: 'stx:libbasic' }"
 
+"{ NameSpace: Smalltalk }"
+
 Object subclass:#CharacterEncoder
 	instanceVariableNames:''
-	classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders LastEncoder
-		AccessLock NullEncoderInstance Jis7KanjiEscapeSequence
+	classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders AccessLock
+		NullEncoderInstance Jis7KanjiEscapeSequence
 		Jis7RomanEscapeSequence JisISO2022EscapeSequence
 		Jis7KanjiOldEscapeSequence'
 	poolDictionaries:''
@@ -144,8 +148,6 @@
 
     In order to add another coder (for example: for EBCDIC or ms-codePage 278),
     perform the following steps:
-        - create a private subclass of CharacterEncoder named (for example) CP267.
-
         - create a public subclass of CharacterEncoderImplementations::CharacterEncoderImplementation named (for example) CharacterEncoderImplementations::CP267.
 
         - define the mappingURL1_relativeName (if the table is found on 'www.unicode.org')
@@ -155,7 +157,7 @@
 
           In this example, the table from 'std.dkuug.dk' is used, and named 'EBCDIC-CP-FI' there.
 
-        - generate code by evaluating:
+        - generate code by evaluating (make sure that CharacterEncoderGenerator is loaded from stx:goodies):
             CharacterEncoder::CP267 generateCode
 
     Thats all !!
@@ -223,11 +225,11 @@
         ifAbsent:[
             "/ proceed to ignore this error in the future.    
 
-            (EncodersByName at:#unicode) at:encodingNameSymbol put:NullEncoderInstance. 
-            (EncoderClassesByName at:#unicode) at:encodingNameSymbol put:NullEncoder.    
+"/            (EncodersByName at:#unicode) at:encodingNameSymbol put:NullEncoderInstance. 
+"/            (EncoderClassesByName at:#unicode) at:encodingNameSymbol put:NullEncoder.    
 
             "/ self error:'no encoder for ' , encodingNameSymbol mayProceed:true.
-            ('CharacterEncoder [warning]: no encoder for ' , encodingNameSymbol) infoPrintCR.
+            ('CharacterEncoder [warning]: no encoder for "' , encodingNameSymbol,'"') infoPrintCR.
             
             NullEncoderInstance
         ]
@@ -268,7 +270,7 @@
     encodingNameSymbol := encodingNameSymbolArg.
     encodingNameSymbol isNil ifTrue:[ ^ NullEncoderInstance].
 
-    encodingNameSymbol == #'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode].
+    encodingNameSymbol = 'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode].
 
     lcName := encodingNameSymbol asLowercase.
     name := lcName asSymbolIfInterned.
@@ -417,6 +419,8 @@
      self encoderFor:#'jis0208'      
      self encoderFor:#'jis7'      
      self encoderFor:#'unicode'      
+     self encoderFor:#'UTF-8'      
+     self encoderFor:'UTF-8'      
     "
 
     "Modified: / 12-07-2012 / 19:45:58 / cg"
@@ -548,6 +552,8 @@
 initialize
     |ud|
 
+    AccessLock notNil ifTrue:[^ self].  "/ already initialized
+
     AccessLock := RecursionLock new name:'CharacterEncoder'.
     NullEncoderInstance := NullEncoder new.
 
@@ -561,7 +567,7 @@
     ud at:#'ms-oem' put:NullEncoder.    
     ud at:#'ms-default' put:NullEncoder.    
 
-    "/ className decoded-name array-of-encodingNames
+    "/ className        decoded-name    array-of-encodingNames
     #(
         (ASCII              unicode     ( ascii 'us-ascii' 'iso-ir-6' 'ibm-367' 'ms-cp367' 'cp367'  'iso646-us' 'ibm-cp367' 'ansi_x3.4-1968' ))
 
@@ -573,15 +579,20 @@
 
         (EBCDIC             unicode     ( 'ebcdic' ))
 
-        (GB2313_1980        unicode     ( 'gb2313' 'gb2313-1980' ))
+"/        (GB2313_1980        unicode     ( 'gb2313' 'gb2313-1980' ))
+
+        (GB2312_1980_0      unicode     ( 'gb2312' 'gb2312.1980' 'gb2312.1980-0'))
 
         (HANGUL             unicode     ( 'hangul' ))
 
         (ISO10646_1         unicode     ( unicode 'iso10646_1' 'iso10646-1' 'iso-10646-1' ))
 
-        (ISO10646_to_UTF8   unicode     ( utf8 'utf-8' ))
-        (ISO10646_to_UTF16BE unicode    ( utf16b utf16be 'utf-16b' 'utf-16be' ))
-        (ISO10646_to_UTF16LE unicode    ( utf16l utf16le 'utf-16e' 'utf-16le' ))
+        (ISO10646_to_UTF8     unicode   ( utf8 'utf-8' ))
+        (ISO10646_to_UTF16BE  unicode   ( utf16b utf16be 'utf-16b' 'utf-16be' ))
+        (ISO10646_to_UTF16LE  unicode   ( utf16l utf16le 'utf-16e' 'utf-16le' ))
+
+        (ISO10646_to_UTF8_MAC unicode   ( 'utf8-mac' 'utf-8-mac' ))
+        (ISO10646_to_XMLUTF8  unicode   ( 'utf8-XML' ))
 
         (ISO8859_1          unicode     ( 'iso8859_1' 'iso8859-1' 'iso-8859-1' 'latin-1' 'latin1' 'iso-ir-100' 'ibm-819' 'ms-cp819' 'ibm-cp819' 'iso8859'))
 
@@ -657,7 +668,7 @@
 
         (MAC_Korean         unicode     ( #'mac-korean' #'mackorean'  ))
 
-        (MAC_Roman          unicode     ( #'mac-roman' #'macroman'  ))
+        (MAC_Roman          unicode     ( #'mac-roman' #'macroman' 'macintosh' 'cp10000' ))
 
         (MAC_Romanian       unicode     ( #'mac-romanian' #'macromanian'  ))
 
@@ -1014,6 +1025,8 @@
          ('iso8859-15'  'Western with Euro'             )
          ('iso8859-16'  'South European with Euro'      )
 "/       nil
+         ('macintosh'   'MAC Western'      )
+"/       nil
          ('koi7'        'Cyrillic (Old)'                )
          ('koi8-r'      'Cyrillic'                      )
          ('koi8-u'      'Cyrillic (Ukraine)'            )
@@ -1127,7 +1140,7 @@
                 quote := s next.
                 w := s upTo:quote.
             ] ifFalse:[
-                w := s upToMatching:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
+                w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
             ].
             w notNil ifTrue:[
                 enc := w withoutQuotes.
@@ -1314,6 +1327,14 @@
         newString at:idx put:(Character value:myCode).
     ].
     ^ newString
+!
+
+encodeString:aUnicodeString on:aStream
+    "given a string in unicode, encode it and write the encoded characters onto aStream (via nextPutAll:).
+     This default first builds the complete encoded string with encodeString: and then writes it in one go.
+     Subclasses can redefine this to avoid allocating many new string instances (but must then also redefine encodeString:aUnicodeString to collect the characters)"
+
+    aStream nextPutAll:(self encodeString:aUnicodeString).
 ! !
 
 !CharacterEncoder methodsFor:'error handling'!
@@ -1397,6 +1418,17 @@
     "Created: / 15-06-2005 / 15:11:04 / janfrog"
 !
 
+isEncoderFor:encoding
+    "does this encode to encoding? Answers true if the lowercased encoding name equals my nameOfEncoding; the name iso10646-1 is treated as #unicode"
+
+    |encodingNameSymbol|
+
+    encodingNameSymbol := encoding asLowercase.    "case-insensitive compare. NOTE(review): nil is not guarded here, although the lookup code elsewhere in this file checks isNil first - confirm callers never pass nil"
+    encodingNameSymbol = #'iso10646-1' ifTrue:[ encodingNameSymbol := #unicode].    "same iso10646-1 to unicode mapping as used by the encoder lookup elsewhere in this file"
+
+    ^ encodingNameSymbol = self nameOfEncoding    "NOTE(review): only the single alias above is normalized; other registered alias names are presumably not recognized unless nameOfEncoding answers them - confirm"
+!
+
 isNullEncoder
     ^ false
 !
@@ -1683,16 +1715,11 @@
 !CharacterEncoder class methodsFor:'documentation'!
 
 version
-    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.123 2013-08-10 11:13:37 stefan Exp $'
+    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.138 2015-03-26 16:21:01 cg Exp $'
 !
 
 version_CVS
-    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.123 2013-08-10 11:13:37 stefan Exp $'
-!
-
-version_HG
-
-    ^ '$Changeset: <not expanded> $'
+    ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.138 2015-03-26 16:21:01 cg Exp $'
 ! !