16 Object subclass:#CharacterEncoder |
16 Object subclass:#CharacterEncoder |
17 instanceVariableNames:'' |
17 instanceVariableNames:'' |
18 classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders AccessLock |
18 classVariableNames:'EncoderClassesByName EncodersByName CachedEncoders AccessLock |
19 NullEncoderInstance Jis7KanjiEscapeSequence |
19 NullEncoderInstance Jis7KanjiEscapeSequence |
20 Jis7RomanEscapeSequence JisISO2022EscapeSequence |
20 Jis7RomanEscapeSequence JisISO2022EscapeSequence |
21 Jis7KanjiOldEscapeSequence' |
21 Jis7KanjiOldEscapeSequence EncodingDetectors' |
22 poolDictionaries:'' |
22 poolDictionaries:'' |
23 category:'Collections-Text-Encodings' |
23 category:'Collections-Text-Encodings' |
24 ! |
24 ! |
25 |
25 |
26 CharacterEncoder subclass:#CompoundEncoder |
26 CharacterEncoder subclass:#CompoundEncoder |
974 ]. |
974 ]. |
975 ^ 'http://std.dkuug.dk/i18n/charmaps/' , rel |
975 ^ 'http://std.dkuug.dk/i18n/charmaps/' , rel |
976 ! ! |
976 ! ! |
977 |
977 |
978 !CharacterEncoder class methodsFor:'queries'! |
978 !CharacterEncoder class methodsFor:'queries'! |
|
979 |
|
980 isAbstract |
|
981 "Return if this class is an abstract class. |
|
982 True is returned for CharacterEncoder here; false for subclasses. |
|
983 Abstract subclasses must redefine this again." |
|
984 |
|
985 ^ self == CharacterEncoder |
|
986 ! |
979 |
987 |
980 isEncoding:subSetEncodingArg subSetOf:superSetEncodingArg |
988 isEncoding:subSetEncodingArg subSetOf:superSetEncodingArg |
981 "return true, if superSetEncoding encoding includes all characters of subSetEncoding. |
989 "return true, if superSetEncoding encoding includes all characters of subSetEncoding. |
982 (this means: characters are included - not that they have the same encoding)" |
990 (this means: characters are included - not that they have the same encoding)" |
983 |
991 |
1078 |
1086 |
1079 userFriendlyNameOfEncoding |
1087 userFriendlyNameOfEncoding |
1080 ^ self nameOfEncoding asUppercaseFirst |
1088 ^ self nameOfEncoding asUppercaseFirst |
1081 ! ! |
1089 ! ! |
1082 |
1090 |
1083 !CharacterEncoder class methodsFor:'testing'! |
|
1084 |
|
1085 isAbstract |
|
1086 "Return if this class is an abstract class. |
|
1087 True is returned for CharacterEncoder here; false for subclasses. |
|
1088 Abstract subclasses must redefine again." |
|
1089 |
|
1090 ^ self == CharacterEncoder |
|
1091 ! ! |
|
1092 |
|
1093 !CharacterEncoder class methodsFor:'utilities'! |
1091 !CharacterEncoder class methodsFor:'utilities'! |
1094 |
1092 |
1095 guessEncodingOfBuffer:buffer |
1093 guessEncodingOfBuffer:buffer |
1096 "look for a string of the form |
1094 "try to guess a string-buffer's encoding. |
|
1095 Basically looks for a string of the form |
1097 encoding #name |
1096 encoding #name |
1098 or: |
1097 or: |
1099 encoding: name |
1098 encoding: name |
1100 within the given buffer |
1099 within the given buffer |
1101 (which is usually the first few bytes of a textFile)." |
1100 (which is usually within the first few bytes of a textFile)." |
1102 |
|
1103 |lcBuffer quote peek| |
|
1104 |
1101 |
1105 buffer size < 4 ifTrue:[ |
1102 buffer size < 4 ifTrue:[ |
1106 "not enough bytes to determine the contents" |
1103 "not enough bytes to determine the contents" |
1107 ^ nil. |
1104 ^ nil. |
1108 ]. |
1105 ]. |
1109 |
1106 EncodingDetectors isNil ifTrue:[ |
1110 "check the Byte Order Mark (BOM)" |
1107 self initializeEncodingDetectors. |
1111 peek := (buffer at:1) codePoint. |
1108 ]. |
1112 peek < 16rFE ifTrue:[ |
1109 EncodingDetectors do:[:each | |
1113 (peek = 16rEF |
1110 |guess| |
1114 and:[(buffer at:2) codePoint = 16rBB |
1111 |
1115 and:[(buffer at:3) codePoint = 16rBF]]) ifTrue:[ |
1112 (guess := each value:buffer) notNil ifTrue:[ |
1116 ^ #utf8 |
1113 ^ guess |
1117 ]. |
1114 ]. |
1118 (peek = 0 |
1115 ]. |
1119 and:[(buffer at:2) codePoint = 0 |
|
1120 and:[(buffer at:3) codePoint = 16rFE |
|
1121 and:[(buffer at:4) codePoint = 16rFF]]]) ifTrue:[ |
|
1122 ^ #utf32be |
|
1123 ]. |
|
1124 ] ifFalse:[ |
|
1125 peek = 16rFF ifTrue:[ |
|
1126 (buffer at:2) codePoint = 16rFE ifTrue:[ |
|
1127 "little endian" |
|
1128 ((buffer at:3) codePoint = 0 and:[(buffer at:4) codePoint = 0]) ifTrue:[ |
|
1129 ^ #utf32le. |
|
1130 ]. |
|
1131 ^ #utf16le |
|
1132 ]. |
|
1133 ] ifFalse:["peek = 16rFE" |
|
1134 (buffer at:2) codePoint = 16rFF ifTrue:[ |
|
1135 "big endian" |
|
1136 ^ #utf16be |
|
1137 ]. |
|
1138 ] |
|
1139 ]. |
|
1140 |
|
1141 lcBuffer := buffer asLowercase. |
|
1142 |
|
1143 "now look for an inline encoding markup" |
|
1144 #(charset encoding) do:[:keyWord | |
|
1145 |encoderOrNil idx s w enc| |
|
1146 |
|
1147 (idx := lcBuffer findString:keyWord) ~~ 0 ifTrue:[ |
|
1148 s := ReadStream on:buffer. |
|
1149 s position:idx-1. |
|
1150 s skip:keyWord size. |
|
1151 s skipSeparators. |
|
1152 |
|
1153 "do not include '=' here, otherwise |
|
1154 files containing xml code (<?xml charset='utf8'> will be parsed as UTF-8" |
|
1155 |
|
1156 [':#=' includes:s peek] whileTrue:[ |
|
1157 s next. |
|
1158 s skipSeparators. |
|
1159 ]. |
|
1160 s skipSeparators. |
|
1161 ('"''' includes:s peek) ifTrue:[ |
|
1162 quote := s next. |
|
1163 w := s upTo:quote. |
|
1164 ] ifFalse:[ |
|
1165 w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]]. |
|
1166 ]. |
|
1167 w notNil ifTrue:[ |
|
1168 enc := w withoutQuotes. |
|
1169 (enc startsWith:'x-') ifTrue:[ |
|
1170 enc := enc copyFrom:3. |
|
1171 ]. |
|
1172 encoderOrNil := self encoderFor:enc ifAbsent:nil. |
|
1173 encoderOrNil notNil ifTrue:[ |
|
1174 ^ encoderOrNil nameOfEncoding |
|
1175 ]. |
|
1176 "/ enc size >=3 ifTrue:[ |
|
1177 "/ Transcript showCR:'Unknown encoding: ' , (withoutQuotes value:w). |
|
1178 "/ ] |
|
1179 ]. |
|
1180 ]. |
|
1181 ]. |
|
1182 |
|
1183 "/ look for JIS7 / EUC encoding |
|
1184 (buffer findString:self jisISO2022EscapeSequence) ~~ 0 ifTrue:[ |
|
1185 ^ #'iso2020-jp' |
|
1186 ]. |
|
1187 (buffer findString:self jis7KanjiEscapeSequence) ~~ 0 ifTrue:[ |
|
1188 ^ #jis7 |
|
1189 ]. |
|
1190 (buffer findString:self jis7KanjiOldEscapeSequence) ~~ 0 ifTrue:[ |
|
1191 ^ #jis7 |
|
1192 ]. |
|
1193 |
|
1194 "/ TODO: |
|
1195 |
|
1196 "/ "/ look for EUC |
|
1197 "/ idx := aString findFirst:[:char | |ascii| |
|
1198 "/ ((ascii := char asciiValue) >= 16rA1) |
|
1199 "/ and:[ascii <= 16rFE]]. |
|
1200 "/ idx ~~ 0 ifTrue:[ |
|
1201 "/ ascii := (aString at:(idx + 1)) asciiValue. |
|
1202 "/ (ascii >= 16rA1 and:[ascii <= 16rFE]) ifTrue:[ |
|
1203 "/ ^ #euc |
|
1204 "/ ] |
|
1205 "/ ]. |
|
1206 "/ look for SJIS ... |
|
1207 |
|
1208 ^ nil |
1116 ^ nil |
1209 ! |
1117 ! |
1210 |
1118 |
1211 guessEncodingOfFile:aFilename |
1119 guessEncodingOfFile:aFilename |
1212 "look for a string |
1120 "look for a string |
1257 ^ self guessEncodingOfBuffer:buffer |
1165 ^ self guessEncodingOfBuffer:buffer |
1258 |
1166 |
1259 "Modified: / 31-05-2011 / 15:45:23 / cg" |
1167 "Modified: / 31-05-2011 / 15:45:23 / cg" |
1260 ! |
1168 ! |
1261 |
1169 |
|
1170 initializeEncodingDetectors |
|
1171 "setup the list of encoding detectors. |
|
1172 This is a list of blocks, which get a buffer as argument, |
|
1173 and return an encoding symbol or nil. |
|
1174 Can be customized for more detectors |
|
1175 (used to be hard-coded in guessEncodingOfBuffer:)" |
|
1176 |
|
1177 EncodingDetectors := OrderedCollection new. |
|
1178 |
|
1179 "check for Unicode Byte Order Marks (BOM)" |
|
1180 EncodingDetectors |
|
1181 add:[:buffer | |
|
1182 |guess byte1 byte2| |
|
1183 |
|
1184 byte1 := (buffer at:1) codePoint. |
|
1185 byte2 := (buffer at:2) codePoint. |
|
1186 byte1 < 16rFE ifTrue:[ |
|
1187 (byte1 = 16rEF |
|
1188 and:[byte2 = 16rBB |
|
1189 and:[(buffer at:3) codePoint = 16rBF]]) ifTrue:[ |
|
1190 guess := #utf8 |
|
1191 ] ifFalse:[ |
|
1192 (byte1 = 0 |
|
1193 and:[byte2 = 0 |
|
1194 and:[(buffer at:3) codePoint = 16rFE |
|
1195 and:[(buffer at:4) codePoint = 16rFF]]]) ifTrue:[ |
|
1196 "00-00-FE-FF big endian utf32" |
|
1197 guess := #utf32be |
|
1198 ]. |
|
1199 ] |
|
1200 ] ifFalse:[ |
|
1201 byte1 = 16rFF ifTrue:[ |
|
1202 byte2 = 16rFE ifTrue:[ |
|
1203 "FF-FE little endian utf16 or utf32" |
|
1204 ((buffer at:3) codePoint = 0 and:[(buffer at:4) codePoint = 0]) ifTrue:[ |
|
1205 "FF-FE-00-00 little endian utf32" |
|
1206 guess := #utf32le. |
|
1207 ] ifFalse:[ |
|
1208 guess := #utf16le |
|
1209 ] |
|
1210 ]. |
|
1211 ] ifFalse:["byte1 = 16rFE" |
|
1212 "FE-FF big endian utf16" |
|
1213 byte2 = 16rFF ifTrue:[ |
|
1214 "big endian" |
|
1215 guess := #utf16be |
|
1216 ]. |
|
1217 ] |
|
1218 ]. |
|
1219 guess |
|
1220 ]. |
|
1221 |
|
1222 "check for an inline encoding markup (charset= / encoding=) substring" |
|
1223 EncodingDetectors |
|
1224 add:[:buffer | |
|
1225 |guess lcBuffer quote peek| |
|
1226 |
|
1227 lcBuffer := buffer asLowercase. |
|
1228 |
|
1229 guess := |
|
1230 #(charset encoding) doWithExit:[:keyWord :exit | |
|
1231 |encoderOrNil idx s w enc| |
|
1232 |
|
1233 guess isNil ifTrue:[ |
|
1234 (idx := lcBuffer findString:keyWord) ~~ 0 ifTrue:[ |
|
1235 s := ReadStream on:buffer. |
|
1236 s position:idx-1. |
|
1237 s skip:keyWord size. |
|
1238 s skipSeparators. |
|
1239 |
|
1240 "do not include '=' here, otherwise |
|
1241 files containing xml code (<?xml charset='utf8'> will be parsed as UTF-8" |
|
1242 |
|
1243 [':#=' includes:s peek] whileTrue:[ |
|
1244 s next. |
|
1245 s skipSeparators. |
|
1246 ]. |
|
1247 s skipSeparators. |
|
1248 ('"''' includes:s peek) ifTrue:[ |
|
1249 quote := s next. |
|
1250 w := s upTo:quote. |
|
1251 ] ifFalse:[ |
|
1252 w := s upToElementForWhich:[:ch | ch isSeparator or:[ch == $" or:[ch == $' or:[ch == $> ]]]]. |
|
1253 ]. |
|
1254 w notNil ifTrue:[ |
|
1255 enc := w withoutQuotes. |
|
1256 (enc startsWith:'x-') ifTrue:[ |
|
1257 enc := enc copyFrom:3. |
|
1258 ]. |
|
1259 encoderOrNil := self encoderFor:enc ifAbsent:nil. |
|
1260 encoderOrNil notNil ifTrue:[ |
|
1261 exit value:(encoderOrNil nameOfEncoding) |
|
1262 ]. |
|
1263 ]. |
|
1264 ]. |
|
1265 ]. |
|
1266 nil |
|
1267 ]. |
|
1268 guess |
|
1269 ]. |
|
1270 |
|
1271 "/ check for JIS7 encoding |
|
1272 EncodingDetectors |
|
1273 add:[:buffer | |
|
1274 (buffer findString:self jisISO2022EscapeSequence) ~~ 0 ifTrue:[ |
|
1275 #'iso2020-jp' |
|
1276 ] ifFalse:[ |
|
1277 (buffer findString:self jis7KanjiEscapeSequence) ~~ 0 ifTrue:[ |
|
1278 #jis7 |
|
1279 ] ifFalse:[ |
|
1280 (buffer findString:self jis7KanjiOldEscapeSequence) ~~ 0 ifTrue:[ |
|
1281 #jis7 |
|
1282 ] ifFalse:[ |
|
1283 nil |
|
1284 ] |
|
1285 ] |
|
1286 ] |
|
1287 ]. |
|
1288 |
|
1289 "/ TODO: look for EUC, SJIS etc. |
|
1290 "/ Disabled, due to too many false positives. |
|
1291 "/ if required, think about it, fix it and uncomment it |
|
1292 "/ EncodingDetectors |
|
1293 "/ add:[:buffer | |
|
1294 "/ |guess idx| |
|
1295 "/ |
|
1296 "/ idx := buffer |
|
1297 "/ findFirst:[:char | |
|
1298 "/ |code| |
|
1299 "/ code := char codePoint. |
|
1300 "/ code between:16rA1 and: 16rFE |
|
1301 "/ ]. |
|
1302 "/ ((idx ~~ 0) |
|
1303 "/ and:[ (buffer at:(idx + 1)) codePoint between:16rA1 and: 16rFE ]) |
|
1304 "/ ifTrue:[ |
|
1305 "/ guess := #euc |
|
1306 "/ ] ifFalse:[ |
|
1307 "/ "/ look for SJIS ... |
|
1308 "/ ] |
|
1309 "/ ]. |
|
1310 ! |
|
1311 |
1262 showCharacterSet |
1312 showCharacterSet |
1263 |font| |
1313 |font| |
1264 |
1314 |
1265 font := View defaultFont. |
1315 font := View defaultFont. |
1266 "/ font := (Font family:'courier' face:'medium' style:'roman' size:12 encoding:'iso10646-1'). |
1316 "/ font := (Font family:'courier' face:'medium' style:'roman' size:12 encoding:'iso10646-1'). |