CharacterEncoder.st
branchjv
changeset 19478 1f5aa87f6170
parent 18810 5bd0f1b3f948
parent 19465 83cd3327e4c4
child 19861 95c7068e30ba
equal deleted inserted replaced
19477:af82888ceb72 19478:1f5aa87f6170
    16 Object subclass:#CharacterEncoder
    16 Object subclass:#CharacterEncoder
    17 	instanceVariableNames:''
    17 	instanceVariableNames:''
    18 	classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders AccessLock
    18 	classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders AccessLock
    19 		NullEncoderInstance Jis7KanjiEscapeSequence
    19 		NullEncoderInstance Jis7KanjiEscapeSequence
    20 		Jis7RomanEscapeSequence JisISO2022EscapeSequence
    20 		Jis7RomanEscapeSequence JisISO2022EscapeSequence
    21 		Jis7KanjiOldEscapeSequence'
    21 		Jis7KanjiOldEscapeSequence EncodingDetectors'
    22 	poolDictionaries:''
    22 	poolDictionaries:''
    23 	category:'Collections-Text-Encodings'
    23 	category:'Collections-Text-Encodings'
    24 !
    24 !
    25 
    25 
    26 CharacterEncoder subclass:#CompoundEncoder
    26 CharacterEncoder subclass:#CompoundEncoder
   974     ].
   974     ].
   975     ^ 'http://std.dkuug.dk/i18n/charmaps/' , rel
   975     ^ 'http://std.dkuug.dk/i18n/charmaps/' , rel
   976 ! !
   976 ! !
   977 
   977 
   978 !CharacterEncoder class methodsFor:'queries'!
   978 !CharacterEncoder class methodsFor:'queries'!
       
   979 
       
   980 isAbstract
       
   981     "Return if this class is an abstract class.
       
   982      True is returned for CharacterEncoder here; false for subclasses.
       
   983      Abstract subclasses must redefine this again."
       
   984 
       
   985     ^ self == CharacterEncoder
       
   986 !
   979 
   987 
   980 isEncoding:subSetEncodingArg subSetOf:superSetEncodingArg
   988 isEncoding:subSetEncodingArg subSetOf:superSetEncodingArg
   981     "return true, if superSetEncoding encoding includes all characters of subSetEncoding.
   989     "return true, if superSetEncoding encoding includes all characters of subSetEncoding.
   982      (this means: characters are included - not that they have the same encoding)"
   990      (this means: characters are included - not that they have the same encoding)"
   983 
   991 
  1078 
  1086 
  1079 userFriendlyNameOfEncoding
  1087 userFriendlyNameOfEncoding
  1080     ^ self nameOfEncoding asUppercaseFirst
  1088     ^ self nameOfEncoding asUppercaseFirst
  1081 ! !
  1089 ! !
  1082 
  1090 
  1083 !CharacterEncoder class methodsFor:'testing'!
       
  1084 
       
  1085 isAbstract
       
  1086     "Return if this class is an abstract class.
       
  1087      True is returned for CharacterEncoder here; false for subclasses.
       
  1088      Abstract subclasses must redefine again."
       
  1089 
       
  1090     ^ self == CharacterEncoder
       
  1091 ! !
       
  1092 
       
  1093 !CharacterEncoder class methodsFor:'utilities'!
  1091 !CharacterEncoder class methodsFor:'utilities'!
  1094 
  1092 
  1095 guessEncodingOfBuffer:buffer
  1093 guessEncodingOfBuffer:buffer
  1096     "look for a string of the form
  1094     "try to guess a string-buffer's encoding.
       
  1095      Basically looks for a string of the form
  1097             encoding #name
  1096             encoding #name
  1098      or:
  1097      or:
  1099             encoding: name
  1098             encoding: name
  1100      within the given buffer 
  1099      within the given buffer 
  1101      (which is usually the first few bytes of a textFile)."
  1100      (which is usually within the first few bytes of a textFile)."
  1102 
       
  1103     |lcBuffer quote peek|
       
  1104 
  1101 
  1105     buffer size < 4 ifTrue:[
  1102     buffer size < 4 ifTrue:[
  1106         "not enough bytes to determine the contents"
  1103         "not enough bytes to determine the contents"
  1107         ^ nil.
  1104         ^ nil.
  1108     ].
  1105     ].
  1109 
  1106     EncodingDetectors isNil ifTrue:[
  1110     "check the Byte Order Mark (BOM)"
  1107         self initializeEncodingDetectors.
  1111     peek := (buffer at:1) codePoint.
  1108     ].    
  1112     peek < 16rFE ifTrue:[
  1109     EncodingDetectors do:[:each |
  1113         (peek = 16rEF
  1110         |guess|
  1114             and:[(buffer at:2) codePoint = 16rBB 
  1111 
  1115             and:[(buffer at:3) codePoint = 16rBF]]) ifTrue:[
  1112         (guess := each value:buffer) notNil ifTrue:[
  1116             ^ #utf8
  1113             ^ guess
  1117         ].
  1114         ].
  1118         (peek = 0 
  1115     ].    
  1119             and:[(buffer at:2) codePoint = 0 
       
  1120             and:[(buffer at:3) codePoint = 16rFE 
       
  1121             and:[(buffer at:4) codePoint = 16rFF]]]) ifTrue:[
       
  1122             ^ #utf32be
       
  1123         ].
       
  1124     ] ifFalse:[
       
  1125         peek = 16rFF ifTrue:[
       
  1126             (buffer at:2) codePoint = 16rFE ifTrue:[
       
  1127                 "little endian"
       
  1128                 ((buffer at:3) codePoint = 0 and:[(buffer at:4) codePoint = 0]) ifTrue:[
       
  1129                     ^ #utf32le.   
       
  1130                 ].
       
  1131                 ^ #utf16le
       
  1132             ].
       
  1133         ] ifFalse:["peek = 16rFE"
       
  1134             (buffer at:2) codePoint = 16rFF ifTrue:[
       
  1135                 "big endian"
       
  1136                 ^ #utf16be
       
  1137             ].
       
  1138         ]
       
  1139     ].
       
  1140 
       
  1141     lcBuffer := buffer asLowercase.
       
  1142 
       
  1143     "now look for an inline encoding markup"
       
  1144     #(charset encoding) do:[:keyWord |
       
  1145         |encoderOrNil idx s w enc|
       
  1146 
       
  1147         (idx := lcBuffer findString:keyWord) ~~ 0 ifTrue:[
       
  1148             s := ReadStream on:buffer.
       
  1149             s position:idx-1.
       
  1150             s skip:keyWord size.
       
  1151             s skipSeparators. 
       
  1152 
       
  1153             "do not include '=' here, otherwise
       
  1154              files containing xml code (<?xml charset='utf8'> will be parsed as UTF-8"
       
  1155 
       
  1156             [':#=' includes:s peek] whileTrue:[
       
  1157                 s next.
       
  1158                 s skipSeparators. 
       
  1159             ].
       
  1160             s skipSeparators.
       
  1161             ('"''' includes:s peek) ifTrue:[
       
  1162                 quote := s next.
       
  1163                 w := s upTo:quote.
       
  1164             ] ifFalse:[
       
  1165                 w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
       
  1166             ].
       
  1167             w notNil ifTrue:[
       
  1168                 enc := w withoutQuotes.
       
  1169                 (enc startsWith:'x-') ifTrue:[
       
  1170                     enc := enc copyFrom:3.
       
  1171                 ].
       
  1172                 encoderOrNil := self encoderFor:enc ifAbsent:nil.
       
  1173                 encoderOrNil notNil ifTrue:[
       
  1174                     ^ encoderOrNil nameOfEncoding
       
  1175                 ].
       
  1176 "/                enc size >=3 ifTrue:[
       
  1177 "/                    Transcript showCR:'Unknown encoding: ' , (withoutQuotes value:w).
       
  1178 "/                ]
       
  1179             ].
       
  1180         ].
       
  1181     ].
       
  1182 
       
  1183     "/ look for JIS7 / EUC encoding
       
  1184     (buffer findString:self jisISO2022EscapeSequence) ~~ 0 ifTrue:[
       
  1185         ^ #'iso2020-jp'
       
  1186     ].
       
  1187     (buffer findString:self jis7KanjiEscapeSequence) ~~ 0 ifTrue:[
       
  1188         ^ #jis7
       
  1189     ].
       
  1190     (buffer findString:self jis7KanjiOldEscapeSequence) ~~ 0 ifTrue:[
       
  1191         ^ #jis7
       
  1192     ].
       
  1193 
       
  1194     "/ TODO:
       
  1195 
       
  1196 "/    "/ look for EUC
       
  1197 "/    idx := aString findFirst:[:char | |ascii|
       
  1198 "/                                        ((ascii := char asciiValue) >= 16rA1)     
       
  1199 "/                                        and:[ascii <= 16rFE]].
       
  1200 "/    idx ~~ 0 ifTrue:[
       
  1201 "/        ascii := (aString at:(idx + 1)) asciiValue.
       
  1202 "/        (ascii >= 16rA1 and:[ascii <= 16rFE]) ifTrue:[
       
  1203 "/            ^ #euc
       
  1204 "/        ]
       
  1205 "/    ].
       
  1206     "/ look for SJIS ...
       
  1207 
       
  1208     ^ nil
  1116     ^ nil
  1209 !
  1117 !
  1210 
  1118 
  1211 guessEncodingOfFile:aFilename
  1119 guessEncodingOfFile:aFilename
  1212     "look for a string
  1120     "look for a string
  1257     ^ self guessEncodingOfBuffer:buffer
  1165     ^ self guessEncodingOfBuffer:buffer
  1258 
  1166 
  1259     "Modified: / 31-05-2011 / 15:45:23 / cg"
  1167     "Modified: / 31-05-2011 / 15:45:23 / cg"
  1260 !
  1168 !
  1261 
  1169 
       
  1170 initializeEncodingDetectors
       
  1171     "setup the list of encoding detectors.
       
  1172      This is a list of blocks, which get a buffer as argument,
       
  1173      and return an encoding symbol or nil.
       
  1174      Can be customized for more detectors 
       
  1175      (used to be hard-coded in guessEncodingOfBuffer:)"
       
  1176 
       
  1177     EncodingDetectors := OrderedCollection new.
       
  1178 
       
  1179     "check for Unicode Byte Order Marks (BOM)"
       
  1180     EncodingDetectors
       
  1181         add:[:buffer |
       
  1182             |guess byte1 byte2|
       
  1183             
       
  1184             byte1 := (buffer at:1) codePoint.
       
  1185             byte2 := (buffer at:2) codePoint.
       
  1186             byte1 < 16rFE ifTrue:[
       
  1187                 (byte1 = 16rEF
       
  1188                     and:[byte2 = 16rBB 
       
  1189                     and:[(buffer at:3) codePoint = 16rBF]]) ifTrue:[
       
  1190                     guess := #utf8
       
  1191                 ] ifFalse:[
       
  1192                     (byte1 = 0 
       
  1193                         and:[byte2 = 0 
       
  1194                         and:[(buffer at:3) codePoint = 16rFE 
       
  1195                         and:[(buffer at:4) codePoint = 16rFF]]]) ifTrue:[
       
  1196                         "00-00-FE-FF big endian utf32"
       
  1197                         guess := #utf32be
       
  1198                     ].
       
  1199                 ]    
       
  1200             ] ifFalse:[
       
  1201                 byte1 = 16rFF ifTrue:[
       
  1202                     byte2 = 16rFE ifTrue:[
       
  1203                         "FF-FE little endian utf16 or utf32"
       
  1204                         ((buffer at:3) codePoint = 0 and:[(buffer at:4) codePoint = 0]) ifTrue:[
       
  1205                             "FF-FE-00-00 little endian utf32"
       
  1206                             guess := #utf32le.   
       
  1207                         ] ifFalse:[
       
  1208                             guess := #utf16le
       
  1209                         ]    
       
  1210                     ].
       
  1211                 ] ifFalse:["byte1 = 16rFE"
       
  1212                     "FE-FF big endian utf16"
       
  1213                     byte2 = 16rFF ifTrue:[
       
  1214                         "big endian"
       
  1215                         guess := #utf16be
       
  1216                     ].
       
  1217                 ]
       
  1218             ].
       
  1219             guess
       
  1220         ].
       
  1221         
       
  1222     "check for an inline encoding markup (charset= / encoding=) substring"
       
  1223     EncodingDetectors
       
  1224         add:[:buffer |
       
  1225             |guess lcBuffer quote peek|
       
  1226 
       
  1227             lcBuffer := buffer asLowercase.
       
  1228 
       
  1229             guess :=
       
  1230                 #(charset encoding) doWithExit:[:keyWord :exit |
       
  1231                     |encoderOrNil idx s w enc|
       
  1232 
       
  1233                     guess isNil ifTrue:[
       
  1234                     (idx := lcBuffer findString:keyWord) ~~ 0 ifTrue:[
       
  1235                         s := ReadStream on:buffer.
       
  1236                         s position:idx-1.
       
  1237                         s skip:keyWord size.
       
  1238                         s skipSeparators. 
       
  1239 
       
  1240                         "do not include '=' here, otherwise
       
  1241                          files containing xml code (<?xml charset='utf8'> will be parsed as UTF-8"
       
  1242 
       
  1243                         [':#=' includes:s peek] whileTrue:[
       
  1244                             s next.
       
  1245                             s skipSeparators. 
       
  1246                         ].
       
  1247                         s skipSeparators.
       
  1248                         ('"''' includes:s peek) ifTrue:[
       
  1249                             quote := s next.
       
  1250                             w := s upTo:quote.
       
  1251                         ] ifFalse:[
       
  1252                             w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]].
       
  1253                         ].
       
  1254                         w notNil ifTrue:[
       
  1255                             enc := w withoutQuotes.
       
  1256                             (enc startsWith:'x-') ifTrue:[
       
  1257                                 enc := enc copyFrom:3.
       
  1258                             ].
       
  1259                             encoderOrNil := self encoderFor:enc ifAbsent:nil.
       
  1260                             encoderOrNil notNil ifTrue:[
       
  1261                                 exit value:(encoderOrNil nameOfEncoding)
       
  1262                             ].
       
  1263                         ].
       
  1264                     ].
       
  1265                 ].
       
  1266                 nil
       
  1267             ].
       
  1268             guess
       
  1269         ].
       
  1270         
       
  1271     "/ check for JIS7 encoding
       
  1272     EncodingDetectors
       
  1273         add:[:buffer |
       
  1274             (buffer findString:self jisISO2022EscapeSequence) ~~ 0 ifTrue:[
       
  1275                 #'iso2020-jp'
       
  1276             ] ifFalse:[
       
  1277                 (buffer findString:self jis7KanjiEscapeSequence) ~~ 0 ifTrue:[
       
  1278                     #jis7
       
  1279                 ] ifFalse:[
       
  1280                     (buffer findString:self jis7KanjiOldEscapeSequence) ~~ 0 ifTrue:[
       
  1281                         #jis7
       
  1282                     ] ifFalse:[
       
  1283                         nil
       
  1284                     ]
       
  1285                 ]
       
  1286             ]    
       
  1287         ].
       
  1288 
       
  1289     "/ TODO: look for EUC, SJIS etc.
       
  1290     "/ Disabled, due to too many false positives.
       
  1291     "/ if required, think about it, fix it and uncomment it
       
  1292 "/    EncodingDetectors
       
  1293 "/        add:[:buffer |
       
  1294 "/            |guess idx|
       
  1295 "/
       
  1296 "/            idx := buffer 
       
  1297 "/                        findFirst:[:char | 
       
  1298 "/                            |code|
       
  1299 "/                            code := char codePoint.
       
  1300 "/                            code between:16rA1 and: 16rFE
       
  1301 "/                        ].
       
  1302 "/            ((idx ~~ 0) 
       
  1303 "/                and:[ (buffer at:(idx + 1)) codePoint between:16rA1 and: 16rFE ])
       
  1304 "/            ifTrue:[
       
  1305 "/                guess := #euc
       
  1306 "/            ] ifFalse:[
       
  1307 "/                "/ look for SJIS ...
       
  1308 "/            ]
       
  1309 "/        ].
       
  1310 !
       
  1311 
  1262 showCharacterSet
  1312 showCharacterSet
  1263     |font|
  1313     |font|
  1264 
  1314 
  1265     font := View defaultFont.
  1315     font := View defaultFont.
  1266 "/    font := (Font family:'courier' face:'medium' style:'roman' size:12 encoding:'iso10646-1').
  1316 "/    font := (Font family:'courier' face:'medium' style:'roman' size:12 encoding:'iso10646-1').
  1272         asInputFor:nil
  1322         asInputFor:nil
  1273         encoder:self
  1323         encoder:self
  1274 
  1324 
  1275     "
  1325     "
  1276      CharacterEncoderImplementations::MS_Ansi showCharacterSet
  1326      CharacterEncoderImplementations::MS_Ansi showCharacterSet
       
  1327      CharacterEncoderImplementations::ISO8859_1 showCharacterSet
       
  1328      CharacterEncoderImplementations::ISO8859_2 showCharacterSet
       
  1329      CharacterEncoderImplementations::ISO8859_3 showCharacterSet
       
  1330      CharacterEncoderImplementations::ISO8859_4 showCharacterSet
       
  1331      CharacterEncoderImplementations::ISO8859_5 showCharacterSet
       
  1332      CharacterEncoderImplementations::ISO8859_6 showCharacterSet
       
  1333      CharacterEncoderImplementations::ISO8859_7 showCharacterSet
       
  1334      CharacterEncoderImplementations::ISO8859_8 showCharacterSet
       
  1335      CharacterEncoderImplementations::ISO8859_9 showCharacterSet
  1277     "
  1336     "
  1278 ! !
  1337 ! !
  1279 
  1338 
  1280 !CharacterEncoder methodsFor:'encoding & decoding'!
  1339 !CharacterEncoder methodsFor:'encoding & decoding'!
  1281 
  1340