CharacterEncoder.st
changeset 23662 93011efedaa3
parent 23410 850fa3b6150d
child 23981 f7ae2f7c1554
--- a/CharacterEncoder.st	Mon Feb 04 15:24:58 2019 +0100
+++ b/CharacterEncoder.st	Tue Feb 05 11:39:59 2019 +0100
@@ -1200,42 +1200,50 @@
     "check for Unicode Byte Order Marks (BOM)"
     EncodingDetectors
         add:[:buffer |
-            |guess byte1 byte2|
-            
-            byte1 := (buffer at:1) codePoint.
-            byte2 := (buffer at:2) codePoint.
-            byte1 < 16rFE ifTrue:[
-                (byte1 = 16rEF
-                    and:[byte2 = 16rBB 
-                    and:[(buffer at:3) codePoint = 16rBF]]) ifTrue:[
-                    guess := #utf8
+            |nb guess byte1 byte2 byte3 byte4|
+
+            (nb := buffer size) >= 2 ifTrue:[
+                byte1 := (buffer at:1) codePoint.
+                byte2 := (buffer at:2) codePoint.
+                nb > 2 ifTrue:[
+                    byte3 := (buffer at:3) codePoint.
+                    nb > 3 ifTrue:[
+                        byte4 := (buffer at:4) codePoint.
+                    ].    
+                ].    
+                byte1 < 16rFE ifTrue:[
+                    (byte1 == 16rEF
+                        and:[byte2 == 16rBB 
+                        and:[byte3 == 16rBF]]) ifTrue:[
+                        guess := #utf8
+                    ] ifFalse:[
+                        (byte1 == 0 
+                            and:[byte2 == 0 
+                            and:[byte3 == 16rFE 
+                            and:[byte4 == 16rFF]]]) ifTrue:[
+                            "00-00-FE-FF big endian utf32"
+                            guess := #utf32be
+                        ].
+                    ]    
                 ] ifFalse:[
-                    (byte1 = 0 
-                        and:[byte2 = 0 
-                        and:[(buffer at:3) codePoint = 16rFE 
-                        and:[(buffer at:4) codePoint = 16rFF]]]) ifTrue:[
-                        "00-00-FE-FF big endian utf32"
-                        guess := #utf32be
-                    ].
-                ]    
-            ] ifFalse:[
-                byte1 = 16rFF ifTrue:[
-                    byte2 = 16rFE ifTrue:[
-                        "FF-FE little endian utf16 or utf32"
-                        ((buffer at:3) codePoint = 0 and:[(buffer at:4) codePoint = 0]) ifTrue:[
-                            "FF-FE-00-00 little endian utf32"
-                            guess := #utf32le.   
-                        ] ifFalse:[
-                            guess := #utf16le
-                        ]    
-                    ].
-                ] ifFalse:["byte1 = 16rFE"
-                    "FE-FF big endian utf16"
-                    byte2 = 16rFF ifTrue:[
-                        "big endian"
-                        guess := #utf16be
-                    ].
-                ]
+                    byte1 == 16rFF ifTrue:[
+                        byte2 == 16rFE ifTrue:[
+                            "FF-FE little endian utf16 or utf32"
+                            (byte3 == 0 and:[byte4 == 0]) ifTrue:[
+                                "FF-FE-00-00 little endian utf32"
+                                guess := #utf32le.   
+                            ] ifFalse:[
+                                guess := #utf16le
+                            ]    
+                        ].
+                    ] ifFalse:["byte1 = 16rFE"
+                        "FE-FF big endian utf16"
+                        byte2 == 16rFF ifTrue:[
+                            "big endian"
+                            guess := #utf16be
+                        ].
+                    ]
+                ].
             ].
             guess
         ].
@@ -1352,6 +1360,7 @@
 "/        ].
 
     "Modified: / 17-01-2018 / 15:55:36 / stefan"
+    "Modified: / 05-02-2019 / 09:23:37 / Claus Gittinger"
 !
 
 showCharacterSet