#REFACTORING by cg
class: CharacterEncoder class
added: #detectBOMInBuffer:
comment/format in:
#guessEncodingOfBuffer:
#guessEncodingOfFile:
#guessEncodingOfStream:
changed: #initializeEncodingDetectors
--- a/CharacterEncoder.st Tue Mar 26 11:55:51 2019 +0100
+++ b/CharacterEncoder.st Tue Mar 26 14:15:06 2019 +0100
@@ -1,3 +1,5 @@
+"{ Encoding: utf8 }"
+
"
COPYRIGHT (c) 2004 by eXept Software AG
All Rights Reserved
@@ -904,7 +906,7 @@
"
self encodeString:(self encodeString:'hello' into:#ebcdic) from:#ebcdic into:#ascii
self encodeString:(self encodeString:'hello' into:#ebcdic) from:#ebcdic into:#unicode
- self encodeString:(self encodeString:'Äh ... hello' into:#ebcdic) from:#ebcdic into:#utf8
+ self encodeString:(self encodeString:'Äh ... hello' into:#ebcdic) from:#ebcdic into:#utf8
"
"Modified (comment): / 17-01-2018 / 15:49:40 / stefan"
@@ -1103,14 +1105,73 @@
!CharacterEncoder class methodsFor:'utilities'!
+detectBOMInBuffer:buffer
+    "Look for a Unicode byte order mark (BOM) at the start of buffer.
+     Every examined element is asked for its codePoint; at most the
+     first four elements are looked at.
+     Returns one of:
+        #utf8       (EF-BB-BF)
+        #utf32be    (00-00-FE-FF)
+        #utf32le    (FF-FE-00-00)
+        #utf16le    (FF-FE, not followed by 00-00)
+        #utf16be    (FE-FF)
+     or nil, if the buffer holds fewer than 2 elements
+     or does not start with one of the above sequences."
+
+    |nb byte1 byte2 byte3 byte4|
+
+    (nb := buffer size) < 2 ifTrue:[
+        "/ too short for even the smallest (2-byte) BOM
+        ^ nil
+    ].
+
+    byte1 := (buffer at:1) codePoint.
+    byte2 := (buffer at:2) codePoint.
+    "/ byte3 and byte4 stay nil for buffers of size 2 or 3;
+    "/ the identity compares below are then simply false.
+    nb > 2 ifTrue:[
+        byte3 := (buffer at:3) codePoint.
+        nb > 3 ifTrue:[
+            byte4 := (buffer at:4) codePoint.
+        ].
+    ].
+
+    "/ EF-BB-BF -> utf8
+    (byte1 == 16rEF
+     and:[byte2 == 16rBB
+     and:[byte3 == 16rBF]]) ifTrue:[
+        ^ #utf8
+    ].
+
+    "/ 00-00-FE-FF -> big endian utf32
+    (byte1 == 0
+     and:[byte2 == 0
+     and:[byte3 == 16rFE
+     and:[byte4 == 16rFF]]]) ifTrue:[
+        ^ #utf32be
+    ].
+
+    "/ FF-FE -> little endian utf16 or utf32
+    (byte1 == 16rFF and:[byte2 == 16rFE]) ifTrue:[
+        "/ FF-FE-00-00 is taken to be little endian utf32
+        (byte3 == 0 and:[byte4 == 0]) ifTrue:[
+            ^ #utf32le
+        ].
+        ^ #utf16le
+    ].
+
+    "/ FE-FF -> big endian utf16.
+    "/ byte1 is checked explicitly, so that an element with a codePoint
+    "/ above 16rFF followed by 16rFF is not mistaken for a BOM.
+    (byte1 == 16rFE and:[byte2 == 16rFF]) ifTrue:[
+        ^ #utf16be
+    ].
+
+    ^ nil
+!
+
guessEncodingOfBuffer:buffer
"try to guess a string-buffer's encoding.
- Basically looks for a string of the form
+ Basically looks for BOM (byte order marks)
+ or a special string of the form
encoding #name
or:
encoding: name
within the given buffer
- (which is usually within the first few bytes of a textFile)."
+ (which is usually found within the first few bytes of a textFile).
+ Many editors and tools write such comments (eg. emacs, st/x, etc.)"
buffer size < 4 ifTrue:[
"not enough bytes to determine the contents"
@@ -1130,12 +1191,12 @@
!
guessEncodingOfFile:aFilename
- "look for a string
+ "look for a BOM (byte order mark) or a special string of the form:
encoding #name
or:
encoding: name
within the given buffer
- (which is usually the first few bytes of a textFile).
+ (which is usually found in the first few bytes of a textFile).
If that's not found, use heuristics (in CharacterArray) to guess.
Return a symbol like #utf8."
@@ -1161,7 +1222,7 @@
!
guessEncodingOfStream:aStream
- "look for a string of the form
+ "look for a BOM (byte order mark) or a special string of the form:
encoding #name
or:
encoding: name
@@ -1198,55 +1259,7 @@
EncodingDetectors := OrderedCollection new.
"check for Unicode Byte Order Marks (BOM)"
- EncodingDetectors
- add:[:buffer |
- |nb guess byte1 byte2 byte3 byte4|
-
- (nb := buffer size) >= 2 ifTrue:[
- byte1 := (buffer at:1) codePoint.
- byte2 := (buffer at:2) codePoint.
- nb > 2 ifTrue:[
- byte3 := (buffer at:3) codePoint.
- nb > 3 ifTrue:[
- byte4 := (buffer at:4) codePoint.
- ].
- ].
- byte1 < 16rFE ifTrue:[
- (byte1 == 16rEF
- and:[byte2 == 16rBB
- and:[byte3 == 16rBF]]) ifTrue:[
- guess := #utf8
- ] ifFalse:[
- (byte1 == 0
- and:[byte2 == 0
- and:[byte3 == 16rFE
- and:[byte4 == 16rFF]]]) ifTrue:[
- "00-00-FE-FF big endian utf32"
- guess := #utf32be
- ].
- ]
- ] ifFalse:[
- byte1 == 16rFF ifTrue:[
- byte2 == 16rFE ifTrue:[
- "FF-FE little endian utf16 or utf32"
- (byte3 == 0 and:[byte4 == 0]) ifTrue:[
- "FF-FE-00-00 little endian utf32"
- guess := #utf32le.
- ] ifFalse:[
- guess := #utf16le
- ]
- ].
- ] ifFalse:["byte1 = 16rFE"
- "FE-FF big endian utf16"
- byte2 == 16rFF ifTrue:[
- "big endian"
- guess := #utf16be
- ].
- ]
- ].
- ].
- guess
- ].
+ EncodingDetectors add:[:buffer | self detectBOMInBuffer:buffer].
"check for an inline encoding markup (charset= / encoding=) substring"
EncodingDetectors