--- a/CharacterEncoder.st Tue Mar 26 14:15:06 2019 +0100
+++ b/CharacterEncoder.st Tue Mar 26 16:23:23 2019 +0100
@@ -1105,6 +1105,110 @@
!CharacterEncoder class methodsFor:'utilities'!
+detectAndSkipBOMInStream:stream
+    "Detects a unicode byte order mark (BOM) at the current position of stream.
+     If there is one, the stream is left positioned right behind it and one of
+        #utf8 (EF-BB-BF), #utf16le (FF-FE), #utf16be (FE-FF),
+        #utf32le (FF-FE-00-00), #utf32be (00-00-FE-FF)
+     is returned.
+     If no BOM is detected, nil is returned and the stream is repositioned
+     to where it was before.
+     The stream must be positionable; its elements may be bytes or characters."
+
+    |pos nextByte b|
+
+    pos := stream position.
+    "/ answers the next element as an integer, or nil at the end of the stream;
+    "/ never reads past the end, so the stream position stays predictable
+    nextByte := [
+        |el|
+        el := stream atEnd ifTrue:[nil] ifFalse:[stream next].
+        el isCharacter ifTrue:[el codePoint] ifFalse:[el]
+    ].
+    b := nextByte value.
+
+    "/ EF-BB-BF -> utf8
+    b == 16rEF ifTrue:[
+        (nextByte value == 16rBB and:[nextByte value == 16rBF]) ifTrue:[
+            ^ #utf8
+        ].
+        stream position:pos.
+        ^ nil
+    ].
+
+    "/ 00-00-FE-FF -> big endian utf32
+    b == 0 ifTrue:[
+        (nextByte value == 0
+          and:[nextByte value == 16rFE and:[nextByte value == 16rFF]]) ifTrue:[
+            ^ #utf32be
+        ].
+        stream position:pos.
+        ^ nil
+    ].
+
+    "/ FF-FE -> little endian utf16, or utf32 if 00-00 follows
+    b == 16rFF ifTrue:[
+        nextByte value == 16rFE ifTrue:[
+            (nextByte value == 0 and:[nextByte value == 0]) ifTrue:[
+                ^ #utf32le
+            ].
+            "/ only FF-FE matched; position absolutely behind it, because the
+            "/ lookahead above consumed 0, 1 or 2 elements (short stream)
+            stream position:pos + 2.
+            ^ #utf16le
+        ].
+        stream position:pos.
+        ^ nil
+    ].
+
+    "/ FE-FF -> big endian utf16
+    (b == 16rFE and:[nextByte value == 16rFF]) ifTrue:[
+        ^ #utf16be
+    ].
+    stream position:pos.
+    ^ nil
+
+    "
+     |s|
+
+     s := #[1 2 3 4] readStream.
+     self assert:((self detectAndSkipBOMInStream:s) isNil).
+     self assert:(s position == 0).
+
+     s := #[16rFF 2 3 4] readStream.
+     self assert:((self detectAndSkipBOMInStream:s) isNil).
+     self assert:(s position == 0).
+
+     s := #[16rFF 16rFE 3 4] readStream.
+     self assert:((self detectAndSkipBOMInStream:s) == #utf16le).
+     self assert:(s position == 2).
+
+     s := #[16rFF 16rFE 0] readStream.
+     self assert:((self detectAndSkipBOMInStream:s) == #utf16le).
+     self assert:(s position == 2).
+
+     s := #[16rFE 16rFF 3 4] readStream.
+     self assert:((self detectAndSkipBOMInStream:s) == #utf16be).
+     self assert:(s position == 2).
+
+     s := #[16rFF 16rFE 0 0 3 4] readStream.
+     self assert:((self detectAndSkipBOMInStream:s) == #utf32le).
+     self assert:(s position == 4).
+
+     s := #[0 0 16rFE 16rFF 0 0 3 4] readStream.
+     self assert:((self detectAndSkipBOMInStream:s) == #utf32be).
+     self assert:(s position == 4).
+
+     s := #[16rEF 16rBB 16rBF 65] readStream.
+     self assert:((self detectAndSkipBOMInStream:s) == #utf8).
+     self assert:(s position == 3).
+
+     s := #[16rEF 16rBB 16rBF 65] asString readStream.
+     self assert:((self detectAndSkipBOMInStream:s) == #utf8).
+     self assert:(s position == 3).
+    "
+!
+
detectBOMInBuffer:buffer
"returns one of
#utf8
@@ -1114,52 +1218,7 @@
#utf16be
nil"
- |nb byte1 byte2 byte3 byte4|
-
- (nb := buffer size) >= 2 ifTrue:[
- byte1 := (buffer at:1) codePoint.
- byte2 := (buffer at:2) codePoint.
- nb > 2 ifTrue:[
- byte3 := (buffer at:3) codePoint.
- nb > 3 ifTrue:[
- byte4 := (buffer at:4) codePoint.
- ].
- ].
- byte1 < 16rFE ifTrue:[
- "/ EF-BB-BF -> utf8
- (byte1 == 16rEF
- and:[byte2 == 16rBB
- and:[byte3 == 16rBF]]) ifTrue:[
- ^ #utf8
- ].
- "00-00-FE-FF big endian utf32"
- (byte1 == 0
- and:[byte2 == 0
- and:[byte3 == 16rFE
- and:[byte4 == 16rFF]]]) ifTrue:[
- ^ #utf32be
- ]
- ] ifFalse:[
- byte1 == 16rFF ifTrue:[
- byte2 == 16rFE ifTrue:[
- "FF-FE little endian utf16 or utf32"
- (byte3 == 0 and:[byte4 == 0]) ifTrue:[
- "FF-FE-00-00 little endian utf32"
- ^ #utf32le.
- ] ifFalse:[
- ^ #utf16le
- ]
- ].
- ] ifFalse:["byte1 = 16rFE"
- "FE-FF big endian utf16"
- byte2 == 16rFF ifTrue:[
- "big endian"
- ^ #utf16be
- ].
- ]
- ].
- ].
- ^ nil
+ ^ self detectAndSkipBOMInStream:(buffer readStream)
!
guessEncodingOfBuffer:buffer