#REFACTORING by cg
author Claus Gittinger <cg@exept.de>
Tue, 26 Mar 2019 14:15:06 +0100
changeset 23981 f7ae2f7c1554
parent 23980 6644ef2d4826
child 23982 18c16665c868
#REFACTORING by cg class: CharacterEncoder class added: #detectBOMInBuffer: comment/format in: #guessEncodingOfBuffer: #guessEncodingOfFile: #guessEncodingOfStream: changed: #initializeEncodingDetectors
CharacterEncoder.st
--- a/CharacterEncoder.st	Tue Mar 26 11:55:51 2019 +0100
+++ b/CharacterEncoder.st	Tue Mar 26 14:15:06 2019 +0100
@@ -1,3 +1,5 @@
+"{ Encoding: utf8 }"
+
 "
  COPYRIGHT (c) 2004 by eXept Software AG
               All Rights Reserved
@@ -904,7 +906,7 @@
     "
      self encodeString:(self encodeString:'hello' into:#ebcdic) from:#ebcdic into:#ascii    
      self encodeString:(self encodeString:'hello' into:#ebcdic) from:#ebcdic into:#unicode    
-     self encodeString:(self encodeString:'Äh ... hello' into:#ebcdic) from:#ebcdic into:#utf8    
+     self encodeString:(self encodeString:'Äh ... hello' into:#ebcdic) from:#ebcdic into:#utf8    
     "
 
     "Modified (comment): / 17-01-2018 / 15:49:40 / stefan"
@@ -1103,14 +1105,73 @@
 
 !CharacterEncoder class methodsFor:'utilities'!
 
+detectBOMInBuffer:buffer
+    "look for a Unicode BOM (byte order mark) in the first 2..4 elements of buffer
+     (elements are compared by their codePoint). Returns one of
+        #utf8 (EF-BB-BF), #utf32be (00-00-FE-FF), #utf32le (FF-FE-00-00),
+        #utf16le (FF-FE), #utf16be (FE-FF),
+     or nil, if the buffer has less than 2 elements or does not start with a BOM.
+     For buffers of 2 or 3 elements, byte3/byte4 remain nil,
+     so the longer patterns cannot match."
+
+    |nb byte1 byte2 byte3 byte4|
+
+    (nb := buffer size) >= 2 ifTrue:[
+        byte1 := (buffer at:1) codePoint.
+        byte2 := (buffer at:2) codePoint.
+        nb > 2 ifTrue:[
+            byte3 := (buffer at:3) codePoint.
+            nb > 3 ifTrue:[
+                byte4 := (buffer at:4) codePoint.
+            ].
+        ].
+        byte1 < 16rFE ifTrue:[
+            "/ EF-BB-BF -> utf8
+            (byte1 == 16rEF
+                and:[byte2 == 16rBB
+                and:[byte3 == 16rBF]]) ifTrue:[
+                ^ #utf8
+            ].
+            "00-00-FE-FF big endian utf32"
+            (byte1 == 0
+                and:[byte2 == 0
+                and:[byte3 == 16rFE
+                and:[byte4 == 16rFF]]]) ifTrue:[
+                ^ #utf32be
+            ]
+        ] ifFalse:[
+            byte1 == 16rFF ifTrue:[
+                byte2 == 16rFE ifTrue:[
+                    "FF-FE little endian utf16 or utf32"
+                    (byte3 == 0 and:[byte4 == 0]) ifTrue:[
+                        "FF-FE-00-00 little endian utf32"
+                        ^ #utf32le.
+                    ] ifFalse:[
+                        ^ #utf16le
+                    ]
+                ].
+            ] ifFalse:[
+                "FE-FF big endian utf16;
+                 byte1 is tested explicitly, because any codePoint above 16rFF also gets here"
+                (byte1 == 16rFE and:[byte2 == 16rFF]) ifTrue:[
+                    ^ #utf16be
+                ].
+            ]
+        ].
+    ].
+    ^ nil
+!
+
 guessEncodingOfBuffer:buffer
     "try to guess a string-buffer's encoding.
-     Basically looks for a string of the form
+     Basically looks for BOM (byte order marks)
+     or a special string of the form
             encoding #name
      or:
             encoding: name
      within the given buffer 
-     (which is usually within the first few bytes of a textFile)."
+     (which is usually found within the first few bytes of a textFile).
+     Many editors and tools write such comments (eg. emacs, st/x, etc.)"
 
     buffer size < 4 ifTrue:[
         "not enough bytes to determine the contents"
@@ -1130,12 +1191,12 @@
 !
 
 guessEncodingOfFile:aFilename
-    "look for a string
+    "look for a BOM (byte order mark) or a special string of the form:
         encoding #name
      or:
         encoding: name
      within the given buffer 
-     (which is usually the first few bytes of a textFile).
+     (which is usually found in the first few bytes of a textFile).
      If that's not found, use heuristics (in CharacterArray) to guess.
      Return a symbol like #utf8."
 
@@ -1161,7 +1222,7 @@
 !
 
 guessEncodingOfStream:aStream
-    "look for a string of the form
+    "look for a BOM (byte order mark) or a special string of the form:
             encoding #name
      or:
             encoding: name
@@ -1198,55 +1259,7 @@
     EncodingDetectors := OrderedCollection new.
 
     "check for Unicode Byte Order Marks (BOM)"
-    EncodingDetectors
-        add:[:buffer |
-            |nb guess byte1 byte2 byte3 byte4|
-
-            (nb := buffer size) >= 2 ifTrue:[
-                byte1 := (buffer at:1) codePoint.
-                byte2 := (buffer at:2) codePoint.
-                nb > 2 ifTrue:[
-                    byte3 := (buffer at:3) codePoint.
-                    nb > 3 ifTrue:[
-                        byte4 := (buffer at:4) codePoint.
-                    ].    
-                ].    
-                byte1 < 16rFE ifTrue:[
-                    (byte1 == 16rEF
-                        and:[byte2 == 16rBB 
-                        and:[byte3 == 16rBF]]) ifTrue:[
-                        guess := #utf8
-                    ] ifFalse:[
-                        (byte1 == 0 
-                            and:[byte2 == 0 
-                            and:[byte3 == 16rFE 
-                            and:[byte4 == 16rFF]]]) ifTrue:[
-                            "00-00-FE-FF big endian utf32"
-                            guess := #utf32be
-                        ].
-                    ]    
-                ] ifFalse:[
-                    byte1 == 16rFF ifTrue:[
-                        byte2 == 16rFE ifTrue:[
-                            "FF-FE little endian utf16 or utf32"
-                            (byte3 == 0 and:[byte4 == 0]) ifTrue:[
-                                "FF-FE-00-00 little endian utf32"
-                                guess := #utf32le.   
-                            ] ifFalse:[
-                                guess := #utf16le
-                            ]    
-                        ].
-                    ] ifFalse:["byte1 = 16rFE"
-                        "FE-FF big endian utf16"
-                        byte2 == 16rFF ifTrue:[
-                            "big endian"
-                            guess := #utf16be
-                        ].
-                    ]
-                ].
-            ].
-            guess
-        ].
+    EncodingDetectors add:[:buffer | self detectBOMInBuffer:buffer].
         
     "check for an inline encoding markup (charset= / encoding=) substring"
     EncodingDetectors