#guessEncodingOfBuffer - do NOT treat "encoding=utf8" as an encoding declaration
(only the "encoding: #utf8" form is recognized).
--- a/CharacterEncoder.st Sat Jul 28 20:09:44 2007 +0200
+++ b/CharacterEncoder.st Sat Jul 28 20:12:36 2007 +0200
@@ -1001,28 +1001,15 @@
within the given buffer
(which is usually the first few bytes of a textFile)."
- |withoutQuotes lcBuffer quote|
-
- withoutQuotes :=
- [:word |
- |result|
+ |lcBuffer quote|
- result := word.
- ((result startsWith:$") or:[(result startsWith:$')]) ifTrue:[
- result := result copyFrom:2
- ].
- ((result endsWith:$") or:[(result endsWith:$')]) ifTrue:[
- result := result copyWithoutLast:1
- ].
- result
- ].
+ (buffer startsWith:#[16rEF 16rBB 16rBF] asString) ifTrue:[
+ ^ #utf8
+ ].
lcBuffer := buffer asLowercase.
- (buffer startsWith:#[16rEF 16rBB 16rBF] asString) ifTrue:[
- ^ 'utf-8'
- ].
- #( 'charset' 'encoding' ) do:[:keyWord |
+ #(charset encoding) do:[:keyWord |
|encoderOrNil idx s w enc|
(idx := lcBuffer findString:keyWord) ~~ 0 ifTrue:[
@@ -1031,7 +1018,10 @@
s skip:keyWord size.
s skipSeparators.
- ['=:#' includes:s peek] whileTrue:[
+ "do not skip '=' here; otherwise files that merely contain
+ xml code (e.g. <?xml charset='utf8'>) would be detected as UTF-8"
+
+ [':#' includes:s peek] whileTrue:[
s next.
s skipSeparators.
].
@@ -1043,10 +1033,9 @@
w := s upToSeparator.
].
w notNil ifTrue:[
- enc := withoutQuotes value:w.
- encoderOrNil := self encoderFor:enc asSymbol ifAbsent:nil.
+ enc := w withoutQuotes asSymbol.
+ encoderOrNil := self encoderFor:enc ifAbsent:nil.
encoderOrNil notNil ifTrue:[
- "/ ^ enc asSymbol
^ encoderOrNil nameOfEncoding
].
"/ enc size >=3 ifTrue:[
@@ -1523,7 +1512,7 @@
!CharacterEncoder class methodsFor:'documentation'!
version
- ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.96 2006-10-23 11:33:53 cg Exp $'
+ ^ '$Header: /cvs/stx/stx/libbasic/CharacterEncoder.st,v 1.97 2007-07-28 18:12:36 stefan Exp $'
! !
CharacterEncoder initialize!