--- a/CharacterEncoder.st Fri Mar 25 06:29:08 2016 +0000
+++ b/CharacterEncoder.st Sat Mar 26 07:56:10 2016 +0000
@@ -18,7 +18,7 @@
classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders AccessLock
NullEncoderInstance Jis7KanjiEscapeSequence
Jis7RomanEscapeSequence JisISO2022EscapeSequence
- Jis7KanjiOldEscapeSequence'
+ Jis7KanjiOldEscapeSequence EncodingDetectors'
poolDictionaries:''
category:'Collections-Text-Encodings'
!
@@ -977,6 +977,14 @@
!CharacterEncoder class methodsFor:'queries'!
+isAbstract
+    "Answer true if the receiver is an abstract class.
+     Only CharacterEncoder itself answers true here; any class inheriting
+     this method answers false.
+     Abstract subclasses must therefore redefine this again."
+
+    ^ CharacterEncoder == self
+!
+
isEncoding:subSetEncodingArg subSetOf:superSetEncodingArg
"return true, if superSetEncoding encoding includes all characters of subSetEncoding.
(this means: characters are included - not that they have the same encoding)"
@@ -1080,131 +1088,31 @@
^ self nameOfEncoding asUppercaseFirst
! !
-!CharacterEncoder class methodsFor:'testing'!
-
-isAbstract
- "Return if this class is an abstract class.
- True is returned for CharacterEncoder here; false for subclasses.
- Abstract subclasses must redefine again."
-
- ^ self == CharacterEncoder
-! !
-
!CharacterEncoder class methodsFor:'utilities'!
guessEncodingOfBuffer:buffer
- "look for a string of the form
+ "try to guess a string-buffer's encoding, by asking each of the EncodingDetectors in turn
+ and answering the first non-nil guess. One of the detectors basically looks for a string of the form
encoding #name
or:
encoding: name
within the given buffer
- (which is usually the first few bytes of a textFile)."
-
- |lcBuffer quote peek|
+ (which is usually within the first few bytes of a textFile).
+ Answers nil if the buffer has less than 4 elements, or if no detector makes a guess."
buffer size < 4 ifTrue:[
"not enough bytes to determine the contents"
^ nil.
].
-
- "check the Byte Order Mark (BOM)"
- peek := (buffer at:1) codePoint.
- peek < 16rFE ifTrue:[
- (peek = 16rEF
- and:[(buffer at:2) codePoint = 16rBB
- and:[(buffer at:3) codePoint = 16rBF]]) ifTrue:[
- ^ #utf8
- ].
- (peek = 0
- and:[(buffer at:2) codePoint = 0
- and:[(buffer at:3) codePoint = 16rFE
- and:[(buffer at:4) codePoint = 16rFF]]]) ifTrue:[
- ^ #utf32be
- ].
- ] ifFalse:[
- peek = 16rFF ifTrue:[
- (buffer at:2) codePoint = 16rFE ifTrue:[
- "little endian"
- ((buffer at:3) codePoint = 0 and:[(buffer at:4) codePoint = 0]) ifTrue:[
- ^ #utf32le.
- ].
- ^ #utf16le
- ].
- ] ifFalse:["peek = 16rFE"
- (buffer at:2) codePoint = 16rFF ifTrue:[
- "big endian"
- ^ #utf16be
- ].
- ]
- ].
-
- lcBuffer := buffer asLowercase.
-
- "now look for an inline encoding markup"
- #(charset encoding) do:[:keyWord |
- |encoderOrNil idx s w enc|
-
- (idx := lcBuffer findString:keyWord) ~~ 0 ifTrue:[
- s := ReadStream on:buffer.
- s position:idx-1.
- s skip:keyWord size.
- s skipSeparators.
-
- "do not include '=' here, otherwise
- files containing xml code (<?xml charset='utf8'> will be parsed as UTF-8"
+ "the detector list is set up lazily, upon first use"
+ EncodingDetectors isNil ifTrue:[
+ self initializeEncodingDetectors.
+ ].
+ "detectors are tried in list order; the first non-nil guess is returned"
+ EncodingDetectors do:[:each |
+ |guess|
- [':#=' includes:s peek] whileTrue:[
- s next.
- s skipSeparators.
- ].
- s skipSeparators.
- ('"''' includes:s peek) ifTrue:[
- quote := s next.
- w := s upTo:quote.
- ] ifFalse:[
- w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
- ].
- w notNil ifTrue:[
- enc := w withoutQuotes.
- (enc startsWith:'x-') ifTrue:[
- enc := enc copyFrom:3.
- ].
- encoderOrNil := self encoderFor:enc ifAbsent:nil.
- encoderOrNil notNil ifTrue:[
- ^ encoderOrNil nameOfEncoding
- ].
-"/ enc size >=3 ifTrue:[
-"/ Transcript showCR:'Unknown encoding: ' , (withoutQuotes value:w).
-"/ ]
- ].
+ (guess := each value:buffer) notNil ifTrue:[
+ ^ guess
].
- ].
-
- "/ look for JIS7 / EUC encoding
- (buffer findString:self jisISO2022EscapeSequence) ~~ 0 ifTrue:[
- ^ #'iso2020-jp'
- ].
- (buffer findString:self jis7KanjiEscapeSequence) ~~ 0 ifTrue:[
- ^ #jis7
- ].
- (buffer findString:self jis7KanjiOldEscapeSequence) ~~ 0 ifTrue:[
- ^ #jis7
- ].
-
- "/ TODO:
-
-"/ "/ look for EUC
-"/ idx := aString findFirst:[:char | |ascii|
-"/ ((ascii := char asciiValue) >= 16rA1)
-"/ and:[ascii <= 16rFE]].
-"/ idx ~~ 0 ifTrue:[
-"/ ascii := (aString at:(idx + 1)) asciiValue.
-"/ (ascii >= 16rA1 and:[ascii <= 16rFE]) ifTrue:[
-"/ ^ #euc
-"/ ]
-"/ ].
- "/ look for SJIS ...
-
+ ].
^ nil
!
@@ -1259,6 +1167,148 @@
"Modified: / 31-05-2011 / 15:45:23 / cg"
!
+initializeEncodingDetectors
+    "setup the list of encoding detectors.
+     This is a list of blocks, each of which gets a buffer as argument
+     and returns an encoding symbol, or nil if it has no guess.
+     The detectors access the buffer up to index 4 without checking its size
+     (guessEncodingOfBuffer: rejects buffers with less than 4 elements before
+     evaluating them).
+     Can be customized for more detectors
+     (used to be hard-coded in guessEncodingOfBuffer:).
+
+     The list is built in a local and only stored into EncodingDetectors once it
+     is complete, so a reader never iterates over a partially filled list."
+
+    |detectors|
+
+    detectors := OrderedCollection new.
+
+    "check for Unicode Byte Order Marks (BOM)"
+    detectors
+        add:[:buffer |
+            |guess byte1 byte2|
+
+            byte1 := (buffer at:1) codePoint.
+            byte2 := (buffer at:2) codePoint.
+            byte1 < 16rFE ifTrue:[
+                (byte1 = 16rEF
+                  and:[byte2 = 16rBB
+                  and:[(buffer at:3) codePoint = 16rBF]]) ifTrue:[
+                    "EF-BB-BF utf8"
+                    guess := #utf8
+                ] ifFalse:[
+                    (byte1 = 0
+                      and:[byte2 = 0
+                      and:[(buffer at:3) codePoint = 16rFE
+                      and:[(buffer at:4) codePoint = 16rFF]]]) ifTrue:[
+                        "00-00-FE-FF big endian utf32"
+                        guess := #utf32be
+                    ].
+                ]
+            ] ifFalse:[
+                byte1 = 16rFF ifTrue:[
+                    byte2 = 16rFE ifTrue:[
+                        "FF-FE little endian utf16 or utf32"
+                        ((buffer at:3) codePoint = 0 and:[(buffer at:4) codePoint = 0]) ifTrue:[
+                            "FF-FE-00-00 little endian utf32"
+                            guess := #utf32le.
+                        ] ifFalse:[
+                            guess := #utf16le
+                        ]
+                    ].
+                ] ifFalse:[
+                    "test for 16rFE explicitly: this branch is also reached for
+                     code points above 16rFF, which are not part of any BOM"
+                    (byte1 = 16rFE and:[byte2 = 16rFF]) ifTrue:[
+                        "FE-FF big endian utf16"
+                        guess := #utf16be
+                    ].
+                ]
+            ].
+            guess
+        ].
+
+    "check for an inline encoding markup (charset / encoding keyword) substring"
+    detectors
+        add:[:buffer |
+            |guess lcBuffer|
+
+            "the keyword is searched case-insensitively in a lowercase copy;
+             the encoding name itself is then read from the original buffer"
+            lcBuffer := buffer asLowercase.
+
+            "NOTE(review): relies on doWithExit: answering the value passed to the
+             exit block, and nil if the exit block is never evaluated - confirm"
+            guess :=
+                #(charset encoding) doWithExit:[:keyWord :exit |
+                    |encoderOrNil idx s w enc quote|
+
+                    (idx := lcBuffer findString:keyWord) ~~ 0 ifTrue:[
+                        s := ReadStream on:buffer.
+                        s position:idx-1.
+                        s skip:keyWord size.
+                        s skipSeparators.
+
+                        "skip any delimiters between the keyword and the name.
+                         NOTE(review): an older comment here said NOT to include '='
+                         (so that xml code such as <?xml charset='utf8'> is not taken
+                         as a markup), but '=' IS skipped below - confirm which is intended"
+                        [':#=' includes:s peek] whileTrue:[
+                            s next.
+                            s skipSeparators.
+                        ].
+                        s skipSeparators.
+                        ('"''' includes:s peek) ifTrue:[
+                            "quoted name: read up to the matching quote"
+                            quote := s next.
+                            w := s upTo:quote.
+                        ] ifFalse:[
+                            w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
+                        ].
+                        w notNil ifTrue:[
+                            enc := w withoutQuotes.
+                            "strip an 'x-' prefix before the encoder lookup"
+                            (enc startsWith:'x-') ifTrue:[
+                                enc := enc copyFrom:3.
+                            ].
+                            "names without a known encoder are ignored; the next keyword is tried"
+                            encoderOrNil := self encoderFor:enc ifAbsent:nil.
+                            encoderOrNil notNil ifTrue:[
+                                exit value:(encoderOrNil nameOfEncoding)
+                            ].
+                        ].
+                    ].
+                ].
+            guess
+        ].
+
+    "/ check for JIS7 encoding
+    "/ NOTE(review): the symbol #'iso2020-jp' looks like a typo for iso2022-jp;
+    "/ kept unchanged, because callers may depend on it - confirm before renaming
+    detectors
+        add:[:buffer |
+            (buffer findString:self jisISO2022EscapeSequence) ~~ 0 ifTrue:[
+                #'iso2020-jp'
+            ] ifFalse:[
+                (buffer findString:self jis7KanjiEscapeSequence) ~~ 0 ifTrue:[
+                    #jis7
+                ] ifFalse:[
+                    (buffer findString:self jis7KanjiOldEscapeSequence) ~~ 0 ifTrue:[
+                        #jis7
+                    ] ifFalse:[
+                        nil
+                    ]
+                ]
+            ]
+        ].
+
+    "/ TODO: look for EUC, SJIS etc.
+    "/ Disabled, due to too many false positives.
+    "/ if required, think about it, fix it and uncomment it
+"/    detectors
+"/        add:[:buffer |
+"/            |guess idx|
+"/
+"/            idx := buffer
+"/                        findFirst:[:char |
+"/                            |code|
+"/                            code := char codePoint.
+"/                            code between:16rA1 and: 16rFE
+"/                        ].
+"/            ((idx ~~ 0)
+"/              and:[ (buffer at:(idx + 1)) codePoint between:16rA1 and: 16rFE ])
+"/            ifTrue:[
+"/                guess := #euc
+"/            ] ifFalse:[
+"/                "/ look for SJIS ...
+"/            ]
+"/        ].
+
+    "publish the complete list in one step"
+    EncodingDetectors := detectors.
+!
+
showCharacterSet
|font|
@@ -1274,6 +1324,15 @@
"
CharacterEncoderImplementations::MS_Ansi showCharacterSet
+ CharacterEncoderImplementations::ISO8859_1 showCharacterSet
+ CharacterEncoderImplementations::ISO8859_2 showCharacterSet
+ CharacterEncoderImplementations::ISO8859_3 showCharacterSet
+ CharacterEncoderImplementations::ISO8859_4 showCharacterSet
+ CharacterEncoderImplementations::ISO8859_5 showCharacterSet
+ CharacterEncoderImplementations::ISO8859_6 showCharacterSet
+ CharacterEncoderImplementations::ISO8859_7 showCharacterSet
+ CharacterEncoderImplementations::ISO8859_8 showCharacterSet
+ CharacterEncoderImplementations::ISO8859_9 showCharacterSet
"
! !