CharacterEncoder.st
changeset 23982 18c16665c868
parent 23981 f7ae2f7c1554
child 24002 ac83f90e549c
--- a/CharacterEncoder.st	Tue Mar 26 14:15:06 2019 +0100
+++ b/CharacterEncoder.st	Tue Mar 26 16:23:23 2019 +0100
@@ -1105,6 +1105,110 @@
 
 !CharacterEncoder class methodsFor:'utilities'!
 
+detectAndSkipBOMInStream:stream
+    "detects a unicode byte order mark (BOM) at the current position of stream.
+     If there is one, the stream is left positioned right behind it, and one of
+        #utf8       (EF-BB-BF)
+        #utf32be    (00-00-FE-FF)
+        #utf32le    (FF-FE-00-00)
+        #utf16le    (FF-FE)
+        #utf16be    (FE-FF)
+     is returned.
+     If no BOM is detected, the stream is repositioned to where it was before,
+     and nil is returned.
+     The stream must be positionable. Its elements may be bytes or characters
+     (a character is compared by its codePoint)."
+
+    |pos el peekByte skipIf|
+
+    pos := stream position.
+
+    "peekByte answers the upcoming element as an integer, or nil at the end;
+     it never reads beyond the end of the stream"
+    peekByte := [
+        stream atEnd
+            ifTrue:[nil]
+            ifFalse:[
+                el := stream peek.
+                el isCharacter ifTrue:[el codePoint] ifFalse:[el]
+            ]
+    ].
+    "skipIf consumes the upcoming element and answers true, only if it is the expected byte"
+    skipIf := [:expected |
+        peekByte value == expected
+            ifTrue:[stream next. true]
+            ifFalse:[false]
+    ].
+
+    "EF-BB-BF -> utf8"
+    ((skipIf value:16rEF)
+        and:[(skipIf value:16rBB)
+        and:[skipIf value:16rBF]]) ifTrue:[^ #utf8].
+    stream position:pos.
+
+    "00-00-FE-FF -> big endian utf32"
+    ((skipIf value:16r00)
+        and:[(skipIf value:16r00)
+        and:[(skipIf value:16rFE)
+        and:[skipIf value:16rFF]]]) ifTrue:[^ #utf32be].
+    stream position:pos.
+
+    "FF-FE -> little endian utf16, or utf32 if followed by 00-00"
+    ((skipIf value:16rFF) and:[skipIf value:16rFE]) ifTrue:[
+        ((skipIf value:0) and:[skipIf value:0]) ifTrue:[^ #utf32le].
+        "not utf32: go back to right behind FF-FE. This must be an absolute position,
+         because only one zero has been consumed if the stream ends after FF-FE-00"
+        stream position:pos + 2.
+        ^ #utf16le
+    ].
+    stream position:pos.
+
+    "FE-FF -> big endian utf16"
+    ((skipIf value:16rFE) and:[skipIf value:16rFF]) ifTrue:[^ #utf16be].
+    stream position:pos.
+    ^ nil
+
+    "
+     |s enc|
+
+     s := #[1 2 3 4] readStream.
+     enc := self detectAndSkipBOMInStream:s.
+     self assert:(enc == nil).
+     self assert:(s position == 0).
+
+     s := #[16rFF 2 3 4] readStream.
+     enc := self detectAndSkipBOMInStream:s.
+     self assert:(enc == nil).
+     self assert:(s position == 0).
+
+     s := #[16rFF 16rFE 3 4] readStream.
+     enc := self detectAndSkipBOMInStream:s.
+     self assert:(enc == #utf16le).
+     self assert:(s position == 2).
+
+     s := #[16rFE 16rFF 3 4] readStream.
+     enc := self detectAndSkipBOMInStream:s.
+     self assert:(enc == #utf16be).
+     self assert:(s position == 2).
+
+     s := #[16rFF 16rFE 0 0 3 4] readStream.
+     enc := self detectAndSkipBOMInStream:s.
+     self assert:(enc == #utf32le).
+     self assert:(s position == 4).
+
+     s := #[0 0 16rFE 16rFF 0 0 3 4] readStream.
+     enc := self detectAndSkipBOMInStream:s.
+     self assert:(enc == #utf32be).
+     self assert:(s position == 4).
+
+     s := #[16rFF 16rFE 0] readStream.
+     enc := self detectAndSkipBOMInStream:s.
+     self assert:(enc == #utf16le).
+     self assert:(s position == 2).
+    "
+!
+
 detectBOMInBuffer:buffer
     "returns one of 
         #utf8
@@ -1114,52 +1218,7 @@
         #utf16be
         nil"
 
-    |nb byte1 byte2 byte3 byte4|
-
-    (nb := buffer size) >= 2 ifTrue:[
-        byte1 := (buffer at:1) codePoint.
-        byte2 := (buffer at:2) codePoint.
-        nb > 2 ifTrue:[
-            byte3 := (buffer at:3) codePoint.
-            nb > 3 ifTrue:[
-                byte4 := (buffer at:4) codePoint.
-            ].    
-        ].    
-        byte1 < 16rFE ifTrue:[
-            "/ EF-BB-BF -> utf8
-            (byte1 == 16rEF
-                and:[byte2 == 16rBB 
-                and:[byte3 == 16rBF]]) ifTrue:[
-                ^ #utf8
-            ].
-            "00-00-FE-FF big endian utf32"
-            (byte1 == 0 
-                and:[byte2 == 0 
-                and:[byte3 == 16rFE 
-                and:[byte4 == 16rFF]]]) ifTrue:[
-                ^ #utf32be
-            ]    
-        ] ifFalse:[
-            byte1 == 16rFF ifTrue:[
-                byte2 == 16rFE ifTrue:[
-                    "FF-FE little endian utf16 or utf32"
-                    (byte3 == 0 and:[byte4 == 0]) ifTrue:[
-                        "FF-FE-00-00 little endian utf32"
-                        ^ #utf32le.   
-                    ] ifFalse:[
-                        ^ #utf16le
-                    ]    
-                ].
-            ] ifFalse:["byte1 = 16rFE"
-                "FE-FF big endian utf16"
-                byte2 == 16rFF ifTrue:[
-                    "big endian"
-                    ^ #utf16be
-                ].
-            ]
-        ].
-    ].
-    ^ nil
+    ^ self detectAndSkipBOMInStream:(buffer readStream)
 !
 
 guessEncodingOfBuffer:buffer