#FEATURE by cg
class: CharacterEncoder class
changed:
#initializeEncoderClassesByName
#initializeEncodingDetectors
--- a/CharacterEncoder.st Wed Mar 07 16:59:05 2018 +0100
+++ b/CharacterEncoder.st Wed Mar 07 17:04:55 2018 +0100
@@ -14,55 +14,55 @@
"{ NameSpace: Smalltalk }"
Object subclass:#CharacterEncoder
- instanceVariableNames:''
- classVariableNames:'AccessLock CachedEncoders EncoderClassesByName EncodersByName
- EncodingDetectors Jis7KanjiEscapeSequence
- Jis7KanjiOldEscapeSequence Jis7RomanEscapeSequence
- JisISO2022EscapeSequence NullEncoderInstance'
- poolDictionaries:''
- category:'Collections-Text-Encodings'
+ instanceVariableNames:''
+ classVariableNames:'AccessLock CachedEncoders EncoderClassesByName EncodersByName
+ EncodingDetectors Jis7KanjiEscapeSequence
+ Jis7KanjiOldEscapeSequence Jis7RomanEscapeSequence
+ JisISO2022EscapeSequence NullEncoderInstance'
+ poolDictionaries:''
+ category:'Collections-Text-Encodings'
!
CharacterEncoder subclass:#CompoundEncoder
- instanceVariableNames:'decoder encoder'
- classVariableNames:''
- poolDictionaries:''
- privateIn:CharacterEncoder
+ instanceVariableNames:'decoder encoder'
+ classVariableNames:''
+ poolDictionaries:''
+ privateIn:CharacterEncoder
!
CharacterEncoder subclass:#NullEncoder
- instanceVariableNames:''
- classVariableNames:''
- poolDictionaries:''
- privateIn:CharacterEncoder
+ instanceVariableNames:''
+ classVariableNames:''
+ poolDictionaries:''
+ privateIn:CharacterEncoder
!
CharacterEncoder subclass:#InverseEncoder
- instanceVariableNames:'decoder readAhead'
- classVariableNames:''
- poolDictionaries:''
- privateIn:CharacterEncoder
+ instanceVariableNames:'decoder readAhead'
+ classVariableNames:''
+ poolDictionaries:''
+ privateIn:CharacterEncoder
!
CharacterEncoder::NullEncoder subclass:#DefaultEncoder
- instanceVariableNames:''
- classVariableNames:''
- poolDictionaries:''
- privateIn:CharacterEncoder
+ instanceVariableNames:''
+ classVariableNames:''
+ poolDictionaries:''
+ privateIn:CharacterEncoder
!
CharacterEncoder subclass:#OtherEncoding
- instanceVariableNames:''
- classVariableNames:''
- poolDictionaries:''
- privateIn:CharacterEncoder
+ instanceVariableNames:''
+ classVariableNames:''
+ poolDictionaries:''
+ privateIn:CharacterEncoder
!
CharacterEncoder subclass:#TwoStepEncoder
- instanceVariableNames:'encoder1 encoder2'
- classVariableNames:''
- poolDictionaries:''
- privateIn:CharacterEncoder
+ instanceVariableNames:'encoder1 encoder2'
+ classVariableNames:''
+ poolDictionaries:''
+ privateIn:CharacterEncoder
!
!CharacterEncoder class methodsFor:'documentation'!
@@ -702,23 +702,25 @@
(MAC_Turkish unicode ( #'mac-turkish' #'macturkish' ))
- (MS_Ansi unicode ( #'ms-ansi' 'ms-cp1252' 'microsoft-cp1252' 'cp1252' 'microsoft-ansi' 'windows-1252' 'windows-latin1'))
+ (MS_Ansi unicode ( #'ms-ansi' 'microsoft-ansi'))
+
+ (MS_CP1252 unicode ( 'cp1252' 'cp-1252' 'ms-cp1252' 'microsoft-cp1252' 'windows-1252' 'windows-latin1'))
- (MS_Arabic unicode ( 'ms-arabic' 'ms-cp1256' 'microsoft-cp1256' 'cp1256' 'microsoft-arabic' 'windows-1256' ))
+ (MS_Arabic unicode ( 'cp1256' 'cp-1256' 'ms-arabic' 'ms-cp1256' 'microsoft-cp1256' 'microsoft-arabic' 'windows-1256' ))
- (MS_Baltic unicode ( 'ms-baltic' 'ms-cp1257' 'microsoft-cp1257' 'cp1257' 'microsoft-baltic' 'windows-1257' ))
+ (MS_Baltic unicode ( 'cp1257' 'cp-1257' 'ms-baltic' 'ms-cp1257' 'microsoft-cp1257' 'microsoft-baltic' 'windows-1257' ))
- (MS_Cyrillic unicode ( 'ms-cyrillic' 'ms-cp1251' 'microsoft-cp1251' 'cp1251' 'microsoft-cyrillic' 'windows-1251' ))
+ (MS_Cyrillic unicode ( 'cp1251' 'cp-1251' 'ms-cyrillic' 'ms-cp1251' 'microsoft-cp1251' 'microsoft-cyrillic' 'windows-1251' ))
- (MS_EastEuropean unicode ( 'ms-easteuropean' 'ms-ee' 'cp1250' 'ms-cp1250' 'microsoft-cp1250' 'microsoft-easteuropean' 'windows-1250' ))
+ (MS_EastEuropean unicode ( 'cp1250' 'cp-1250' 'ms-easteuropean' 'ms-ee' 'ms-cp1250' 'microsoft-cp1250' 'microsoft-easteuropean' 'windows-1250' ))
- (MS_Greek unicode ( 'ms-greek' 'ms-cp1253' 'microsoft-cp1253' 'cp1253' 'microsoft-greek' 'windows-1253' ))
+ (MS_Greek unicode ( 'cp1253' 'cp-1253' 'ms-greek' 'ms-cp1253' 'microsoft-cp1253' 'microsoft-greek' 'windows-1253' ))
- (MS_Hebrew unicode ( 'ms-hebrew' 'ms-cp1255' 'microsoft-cp1255' 'cp1255' 'microsoft-hebrew' 'windows-1255' ))
+ (MS_Hebrew unicode ( 'cp1255' 'cp-1255' 'ms-hebrew' 'ms-cp1255' 'microsoft-cp1255' 'microsoft-hebrew' 'windows-1255' ))
"/ (MS_Symbol unicode ( 'ms-symbol' 'microsoft-symbol' ))
- (MS_Turkish unicode ( 'ms-turkish' 'ms-cp1254' 'microsoft-cp1254' 'cp1254' 'microsoft-turkish' 'windows-1254' ))
+ (MS_Turkish unicode ( 'cp1254' 'cp-1254' 'ms-turkish' 'ms-cp1254' 'microsoft-cp1254' 'microsoft-turkish' 'windows-1254' ))
(NEXT unicode ( 'next' 'nextstep' ))
@@ -1282,7 +1284,30 @@
].
guess
].
-
+
+ "/ check for a string like /*@!!Encoding:1252*/
+ EncodingDetectors
+ add:[:buffer |
+ |guess idx s keyWord codePageNr enc encoderOrNil|
+ "/ answers the name of the encoding, or nil if there is no marker or no encoder is known for that code page
+ keyWord := '@!!Encoding:'.
+ (idx := buffer findString:keyWord) ~~ 0 ifTrue:[
+ s := ReadStream on:buffer.
+ s position:idx-1 + keyWord size. "/ intended to continue reading right behind the keyword
+ s skipSeparators.
+ "/ the buffer may end right behind the keyword; peek would then not answer a character
+ (s atEnd not and:[s peek isDigit]) ifTrue:[
+ codePageNr := Integer readFrom:s.
+ enc := 'cp%1' bindWith:codePageNr. "/ e.g. 1252 is looked up as 'cp1252'
+ encoderOrNil := self encoderFor:enc ifAbsent:nil.
+ encoderOrNil notNil ifTrue:[
+ guess := (encoderOrNil nameOfEncoding)
+ ].
+ ].
+ ].
+ guess
+ ].
+
"/ check for JIS7 encoding
EncodingDetectors
add:[:buffer |