CharacterEncoder.st
branchjv
changeset 19478 1f5aa87f6170
parent 18810 5bd0f1b3f948
parent 19465 83cd3327e4c4
child 19861 95c7068e30ba
--- a/CharacterEncoder.st	Fri Mar 25 06:29:08 2016 +0000
+++ b/CharacterEncoder.st	Sat Mar 26 07:56:10 2016 +0000
@@ -18,7 +18,7 @@
 	classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders AccessLock
 		NullEncoderInstance Jis7KanjiEscapeSequence
 		Jis7RomanEscapeSequence JisISO2022EscapeSequence
-		Jis7KanjiOldEscapeSequence'
+		Jis7KanjiOldEscapeSequence EncodingDetectors'
 	poolDictionaries:''
 	category:'Collections-Text-Encodings'
 !
@@ -977,6 +977,14 @@
 
 !CharacterEncoder class methodsFor:'queries'!
 
+isAbstract
+    "Return true if the receiver is an abstract class.
+     True is returned for CharacterEncoder itself (identity test below); false for subclasses.
+     Abstract subclasses must redefine this again."
+
+    ^ self == CharacterEncoder
+!
+
 isEncoding:subSetEncodingArg subSetOf:superSetEncodingArg
     "return true, if superSetEncoding encoding includes all characters of subSetEncoding.
      (this means: characters are included - not that they have the same encoding)"
@@ -1080,131 +1088,31 @@
     ^ self nameOfEncoding asUppercaseFirst
 ! !
 
-!CharacterEncoder class methodsFor:'testing'!
-
-isAbstract
-    "Return if this class is an abstract class.
-     True is returned for CharacterEncoder here; false for subclasses.
-     Abstract subclasses must redefine again."
-
-    ^ self == CharacterEncoder
-! !
-
 !CharacterEncoder class methodsFor:'utilities'!
 
 guessEncodingOfBuffer:buffer
-    "look for a string of the form
+    "try to guess a string-buffer's encoding by running the blocks in EncodingDetectors.
+     Among other checks, this looks for a string of the form
             encoding #name
      or:
             encoding: name
      within the given buffer 
-     (which is usually the first few bytes of a textFile)."
-
-    |lcBuffer quote peek|
+     (which is usually within the first few bytes of a textFile). Returns the first non-nil guess, or nil."
 
     buffer size < 4 ifTrue:[
         "not enough bytes to determine the contents"
         ^ nil.
     ].
-
-    "check the Byte Order Mark (BOM)"
-    peek := (buffer at:1) codePoint.
-    peek < 16rFE ifTrue:[
-        (peek = 16rEF
-            and:[(buffer at:2) codePoint = 16rBB 
-            and:[(buffer at:3) codePoint = 16rBF]]) ifTrue:[
-            ^ #utf8
-        ].
-        (peek = 0 
-            and:[(buffer at:2) codePoint = 0 
-            and:[(buffer at:3) codePoint = 16rFE 
-            and:[(buffer at:4) codePoint = 16rFF]]]) ifTrue:[
-            ^ #utf32be
-        ].
-    ] ifFalse:[
-        peek = 16rFF ifTrue:[
-            (buffer at:2) codePoint = 16rFE ifTrue:[
-                "little endian"
-                ((buffer at:3) codePoint = 0 and:[(buffer at:4) codePoint = 0]) ifTrue:[
-                    ^ #utf32le.   
-                ].
-                ^ #utf16le
-            ].
-        ] ifFalse:["peek = 16rFE"
-            (buffer at:2) codePoint = 16rFF ifTrue:[
-                "big endian"
-                ^ #utf16be
-            ].
-        ]
-    ].
-
-    lcBuffer := buffer asLowercase.
-
-    "now look for an inline encoding markup"
-    #(charset encoding) do:[:keyWord |
-        |encoderOrNil idx s w enc|
-
-        (idx := lcBuffer findString:keyWord) ~~ 0 ifTrue:[
-            s := ReadStream on:buffer.
-            s position:idx-1.
-            s skip:keyWord size.
-            s skipSeparators. 
-
-            "do not include '=' here, otherwise
-             files containing xml code (<?xml charset='utf8'> will be parsed as UTF-8"
+    EncodingDetectors isNil ifTrue:[
+        self initializeEncodingDetectors.
+    ].    
+    EncodingDetectors do:[:each |
+        |guess|
 
-            [':#=' includes:s peek] whileTrue:[
-                s next.
-                s skipSeparators. 
-            ].
-            s skipSeparators.
-            ('"''' includes:s peek) ifTrue:[
-                quote := s next.
-                w := s upTo:quote.
-            ] ifFalse:[
-                w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
-            ].
-            w notNil ifTrue:[
-                enc := w withoutQuotes.
-                (enc startsWith:'x-') ifTrue:[
-                    enc := enc copyFrom:3.
-                ].
-                encoderOrNil := self encoderFor:enc ifAbsent:nil.
-                encoderOrNil notNil ifTrue:[
-                    ^ encoderOrNil nameOfEncoding
-                ].
-"/                enc size >=3 ifTrue:[
-"/                    Transcript showCR:'Unknown encoding: ' , (withoutQuotes value:w).
-"/                ]
-            ].
+        (guess := each value:buffer) notNil ifTrue:[
+            ^ guess
         ].
-    ].
-
-    "/ look for JIS7 / EUC encoding
-    (buffer findString:self jisISO2022EscapeSequence) ~~ 0 ifTrue:[
-        ^ #'iso2020-jp'
-    ].
-    (buffer findString:self jis7KanjiEscapeSequence) ~~ 0 ifTrue:[
-        ^ #jis7
-    ].
-    (buffer findString:self jis7KanjiOldEscapeSequence) ~~ 0 ifTrue:[
-        ^ #jis7
-    ].
-
-    "/ TODO:
-
-"/    "/ look for EUC
-"/    idx := aString findFirst:[:char | |ascii|
-"/                                        ((ascii := char asciiValue) >= 16rA1)     
-"/                                        and:[ascii <= 16rFE]].
-"/    idx ~~ 0 ifTrue:[
-"/        ascii := (aString at:(idx + 1)) asciiValue.
-"/        (ascii >= 16rA1 and:[ascii <= 16rFE]) ifTrue:[
-"/            ^ #euc
-"/        ]
-"/    ].
-    "/ look for SJIS ...
-
+    ].    
     ^ nil
 !
 
@@ -1259,6 +1167,148 @@
     "Modified: / 31-05-2011 / 15:45:23 / cg"
 !
 
+initializeEncodingDetectors
+    "set up the list of encoding detectors (class variable EncodingDetectors).
+     This is an OrderedCollection of one-argument blocks; each gets a buffer as argument,
+     and returns an encoding symbol, or nil if it cannot tell.
+     More detectors can be added to customize the guessing
+     (the checks used to be hard-coded in guessEncodingOfBuffer:)"
+
+    EncodingDetectors := OrderedCollection new.
+
+    "check for Unicode Byte Order Marks (BOM); reads buffer elements 1..4 without a size check of its own"
+    EncodingDetectors
+        add:[:buffer |
+            |guess byte1 byte2|
+            
+            byte1 := (buffer at:1) codePoint.
+            byte2 := (buffer at:2) codePoint.
+            byte1 < 16rFE ifTrue:[
+                (byte1 = 16rEF
+                    and:[byte2 = 16rBB 
+                    and:[(buffer at:3) codePoint = 16rBF]]) ifTrue:[
+                    guess := #utf8
+                ] ifFalse:[
+                    (byte1 = 0 
+                        and:[byte2 = 0 
+                        and:[(buffer at:3) codePoint = 16rFE 
+                        and:[(buffer at:4) codePoint = 16rFF]]]) ifTrue:[
+                        "00-00-FE-FF big endian utf32"
+                        guess := #utf32be
+                    ].
+                ]    
+            ] ifFalse:[
+                byte1 = 16rFF ifTrue:[
+                    byte2 = 16rFE ifTrue:[
+                        "FF-FE little endian utf16 or utf32"
+                        ((buffer at:3) codePoint = 0 and:[(buffer at:4) codePoint = 0]) ifTrue:[
+                            "FF-FE-00-00 little endian utf32"
+                            guess := #utf32le.   
+                        ] ifFalse:[
+                            guess := #utf16le
+                        ]    
+                    ].
+                ] ifFalse:["byte1 >= 16rFE and ~= 16rFF; i.e. 16rFE if the elements are byte-valued"
+                    "FE-FF big endian utf16"
+                    byte2 = 16rFF ifTrue:[
+                        "big endian"
+                        guess := #utf16be
+                    ].
+                ]
+            ].
+            guess "still nil if no BOM pattern matched"
+        ].
+        
+    "check for an inline encoding markup: the first occurrence of charset or encoding (case-insensitive), followed by an encoding name"
+    EncodingDetectors
+        add:[:buffer |
+            |guess lcBuffer quote peek| "NOTE(review): the local peek is never used in this block"
+
+            lcBuffer := buffer asLowercase.
+
+            guess :=
+                #(charset encoding) doWithExit:[:keyWord :exit |
+                    |encoderOrNil idx s w enc|
+
+                    guess isNil ifTrue:[ "always true here: guess is only assigned after doWithExit: has returned"
+                    (idx := lcBuffer findString:keyWord) ~~ 0 ifTrue:[
+                        s := ReadStream on:buffer.
+                        s position:idx-1.
+                        s skip:keyWord size.
+                        s skipSeparators. 
+
+                        "the original note here reads: do not include '=' here, otherwise files containing xml code
+                         (<?xml charset='utf8'> will be parsed as UTF-8. NOTE(review): the loop below DOES skip '=' - confirm intent"
+
+                        [':#=' includes:s peek] whileTrue:[
+                            s next.
+                            s skipSeparators. 
+                        ].
+                        s skipSeparators.
+                        ('"''' includes:s peek) ifTrue:[
+                            quote := s next.
+                            w := s upTo:quote.
+                        ] ifFalse:[
+                            w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
+                        ].
+                        w notNil ifTrue:[
+                            enc := w withoutQuotes.
+                            (enc startsWith:'x-') ifTrue:[
+                                enc := enc copyFrom:3.
+                            ].
+                            encoderOrNil := self encoderFor:enc ifAbsent:nil.
+                            encoderOrNil notNil ifTrue:[
+                                exit value:(encoderOrNil nameOfEncoding)
+                            ].
+                        ].
+                    ].
+                ].
+                nil
+            ].
+            guess
+        ].
+        
+    "/ check for JIS7 / ISO2022 escape sequences. NOTE(review): the symbol #'iso2020-jp' below looks like a typo for iso2022-jp - confirm against callers before changing
+    EncodingDetectors
+        add:[:buffer |
+            (buffer findString:self jisISO2022EscapeSequence) ~~ 0 ifTrue:[
+                #'iso2020-jp'
+            ] ifFalse:[
+                (buffer findString:self jis7KanjiEscapeSequence) ~~ 0 ifTrue:[
+                    #jis7
+                ] ifFalse:[
+                    (buffer findString:self jis7KanjiOldEscapeSequence) ~~ 0 ifTrue:[
+                        #jis7
+                    ] ifFalse:[
+                        nil
+                    ]
+                ]
+            ]    
+        ].
+
+    "/ TODO: look for EUC, SJIS etc.
+    "/ Disabled, due to too many false positives.
+    "/ if required, think about it, fix it and uncomment it
+"/    EncodingDetectors
+"/        add:[:buffer |
+"/            |guess idx|
+"/
+"/            idx := buffer 
+"/                        findFirst:[:char | 
+"/                            |code|
+"/                            code := char codePoint.
+"/                            code between:16rA1 and: 16rFE
+"/                        ].
+"/            ((idx ~~ 0) 
+"/                and:[ (buffer at:(idx + 1)) codePoint between:16rA1 and: 16rFE ])
+"/            ifTrue:[
+"/                guess := #euc
+"/            ] ifFalse:[
+"/                "/ look for SJIS ...
+"/            ]
+"/        ].
+!
+
 showCharacterSet
     |font|
 
@@ -1274,6 +1324,15 @@
 
     "
      CharacterEncoderImplementations::MS_Ansi showCharacterSet
+     CharacterEncoderImplementations::ISO8859_1 showCharacterSet
+     CharacterEncoderImplementations::ISO8859_2 showCharacterSet
+     CharacterEncoderImplementations::ISO8859_3 showCharacterSet
+     CharacterEncoderImplementations::ISO8859_4 showCharacterSet
+     CharacterEncoderImplementations::ISO8859_5 showCharacterSet
+     CharacterEncoderImplementations::ISO8859_6 showCharacterSet
+     CharacterEncoderImplementations::ISO8859_7 showCharacterSet
+     CharacterEncoderImplementations::ISO8859_8 showCharacterSet
+     CharacterEncoderImplementations::ISO8859_9 showCharacterSet
     "
 ! !