#FEATURE by cg
author Claus Gittinger <cg@exept.de>
Wed, 07 Mar 2018 17:04:55 +0100
changeset 22587 3d2c9f1a70bd
parent 22586 41099578373e
child 22588 f52044b2de85
#FEATURE by cg class: CharacterEncoder class changed: #initializeEncoderClassesByName #initializeEncodingDetectors
CharacterEncoder.st
--- a/CharacterEncoder.st	Wed Mar 07 16:59:05 2018 +0100
+++ b/CharacterEncoder.st	Wed Mar 07 17:04:55 2018 +0100
@@ -14,55 +14,55 @@
 "{ NameSpace: Smalltalk }"
 
 Object subclass:#CharacterEncoder
-	instanceVariableNames:''
-	classVariableNames:'AccessLock CachedEncoders EncoderClassesByName EncodersByName
-		EncodingDetectors Jis7KanjiEscapeSequence
-		Jis7KanjiOldEscapeSequence Jis7RomanEscapeSequence
-		JisISO2022EscapeSequence NullEncoderInstance'
-	poolDictionaries:''
-	category:'Collections-Text-Encodings'
+        instanceVariableNames:''
+        classVariableNames:'AccessLock CachedEncoders EncoderClassesByName EncodersByName
+                EncodingDetectors Jis7KanjiEscapeSequence
+                Jis7KanjiOldEscapeSequence Jis7RomanEscapeSequence
+                JisISO2022EscapeSequence NullEncoderInstance'
+        poolDictionaries:''
+        category:'Collections-Text-Encodings'
 !
 
 CharacterEncoder subclass:#CompoundEncoder
-	instanceVariableNames:'decoder encoder'
-	classVariableNames:''
-	poolDictionaries:''
-	privateIn:CharacterEncoder
+        instanceVariableNames:'decoder encoder'
+        classVariableNames:''
+        poolDictionaries:''
+        privateIn:CharacterEncoder
 !
 
 CharacterEncoder subclass:#NullEncoder
-	instanceVariableNames:''
-	classVariableNames:''
-	poolDictionaries:''
-	privateIn:CharacterEncoder
+        instanceVariableNames:''
+        classVariableNames:''
+        poolDictionaries:''
+        privateIn:CharacterEncoder
 !
 
 CharacterEncoder subclass:#InverseEncoder
-	instanceVariableNames:'decoder readAhead'
-	classVariableNames:''
-	poolDictionaries:''
-	privateIn:CharacterEncoder
+        instanceVariableNames:'decoder readAhead'
+        classVariableNames:''
+        poolDictionaries:''
+        privateIn:CharacterEncoder
 !
 
 CharacterEncoder::NullEncoder subclass:#DefaultEncoder
-	instanceVariableNames:''
-	classVariableNames:''
-	poolDictionaries:''
-	privateIn:CharacterEncoder
+        instanceVariableNames:''
+        classVariableNames:''
+        poolDictionaries:''
+        privateIn:CharacterEncoder
 !
 
 CharacterEncoder subclass:#OtherEncoding
-	instanceVariableNames:''
-	classVariableNames:''
-	poolDictionaries:''
-	privateIn:CharacterEncoder
+        instanceVariableNames:''
+        classVariableNames:''
+        poolDictionaries:''
+        privateIn:CharacterEncoder
 !
 
 CharacterEncoder subclass:#TwoStepEncoder
-	instanceVariableNames:'encoder1 encoder2'
-	classVariableNames:''
-	poolDictionaries:''
-	privateIn:CharacterEncoder
+        instanceVariableNames:'encoder1 encoder2'
+        classVariableNames:''
+        poolDictionaries:''
+        privateIn:CharacterEncoder
 !
 
 !CharacterEncoder class methodsFor:'documentation'!
@@ -702,23 +702,25 @@
 
         (MAC_Turkish            unicode     ( #'mac-turkish' #'macturkish'  ))
 
-        (MS_Ansi                unicode     ( #'ms-ansi' 'ms-cp1252' 'microsoft-cp1252' 'cp1252' 'microsoft-ansi' 'windows-1252' 'windows-latin1'))
+        (MS_Ansi                unicode     ( #'ms-ansi' 'microsoft-ansi'))
+
+        (MS_CP1252              unicode     ( 'cp1252' 'cp-1252' 'ms-cp1252' 'microsoft-cp1252' 'windows-1252' 'windows-latin1'))
 
-        (MS_Arabic              unicode     ( 'ms-arabic' 'ms-cp1256' 'microsoft-cp1256' 'cp1256'  'microsoft-arabic' 'windows-1256'  ))
+        (MS_Arabic              unicode     ( 'cp1256' 'cp-1256' 'ms-arabic' 'ms-cp1256' 'microsoft-cp1256'  'microsoft-arabic' 'windows-1256'  ))
 
-        (MS_Baltic              unicode     ( 'ms-baltic' 'ms-cp1257' 'microsoft-cp1257' 'cp1257' 'microsoft-baltic' 'windows-1257'  ))
+        (MS_Baltic              unicode     ( 'cp1257' 'cp-1257' 'ms-baltic' 'ms-cp1257' 'microsoft-cp1257' 'microsoft-baltic' 'windows-1257'  ))
 
-        (MS_Cyrillic            unicode     ( 'ms-cyrillic' 'ms-cp1251' 'microsoft-cp1251' 'cp1251' 'microsoft-cyrillic' 'windows-1251'  ))
+        (MS_Cyrillic            unicode     ( 'cp1251' 'cp-1251' 'ms-cyrillic' 'ms-cp1251' 'microsoft-cp1251' 'microsoft-cyrillic' 'windows-1251'  ))
 
-        (MS_EastEuropean        unicode     ( 'ms-easteuropean' 'ms-ee' 'cp1250' 'ms-cp1250' 'microsoft-cp1250' 'microsoft-easteuropean' 'windows-1250'  ))
+        (MS_EastEuropean        unicode     ( 'cp1250' 'cp-1250' 'ms-easteuropean' 'ms-ee' 'ms-cp1250' 'microsoft-cp1250' 'microsoft-easteuropean' 'windows-1250'  ))
 
-        (MS_Greek               unicode     ( 'ms-greek' 'ms-cp1253' 'microsoft-cp1253' 'cp1253' 'microsoft-greek' 'windows-1253' ))
+        (MS_Greek               unicode     ( 'cp1253' 'cp-1253' 'ms-greek' 'ms-cp1253' 'microsoft-cp1253' 'microsoft-greek' 'windows-1253' ))
 
-        (MS_Hebrew              unicode     ( 'ms-hebrew' 'ms-cp1255' 'microsoft-cp1255' 'cp1255' 'microsoft-hebrew' 'windows-1255' ))
+        (MS_Hebrew              unicode     ( 'cp1255' 'cp-1255' 'ms-hebrew' 'ms-cp1255' 'microsoft-cp1255'  'microsoft-hebrew' 'windows-1255' ))
 
 "/        (MS_Symbol           unicode     ( 'ms-symbol' 'microsoft-symbol'  ))
 
-        (MS_Turkish             unicode     ( 'ms-turkish' 'ms-cp1254' 'microsoft-cp1254' 'cp1254' 'microsoft-turkish' 'windows-1254'  ))
+        (MS_Turkish             unicode     ( 'cp1254' 'cp-1254' 'ms-turkish' 'ms-cp1254' 'microsoft-cp1254' 'microsoft-turkish' 'windows-1254'  ))
 
         (NEXT                   unicode     ( 'next' 'nextstep'  ))
 
@@ -1282,7 +1284,30 @@
             ].
             guess
         ].
-        
+
+    "/ check for a string like /*@!!Encoding:1252*/
+    EncodingDetectors
+        add:[:buffer |
+            |guess idx s keyWord codePageNr enc encoderOrNil|
+            "/ answers the encoding name for an embedded code page number, or nil if none is found
+            keyWord := '@!!Encoding:'.
+            (idx := buffer findString:keyWord) ~~ 0 ifTrue:[
+                s := ReadStream on:buffer.
+                s position:idx-1 + keyWord size.     "/ continue reading right behind the keyword
+                s skipSeparators. 
+                "/ the buffer may end right behind the keyword; only look at the next element if there is one
+                (s atEnd not and:[s peek isDigit]) ifTrue:[
+                    codePageNr := Integer readFrom:s.
+                    enc := 'cp%1' bindWith:codePageNr.     "/ e.g. 1252 gives cp1252
+                    encoderOrNil := self encoderFor:enc ifAbsent:nil.
+                    encoderOrNil notNil ifTrue:[
+                        guess := (encoderOrNil nameOfEncoding)
+                    ].
+                ].
+            ].
+            guess
+        ].
+
     "/ check for JIS7 encoding
     EncodingDetectors
         add:[:buffer |