#BUGFIX by stefan
class: CharacterEncoderImplementations::MS_Ansi
class definition
added: #decodeString:
changed:
#decode:
#encode:
#encodeString:
class: CharacterEncoderImplementations::MS_Ansi class
added:
#maxCode
#minCode
comment/format in: #documentation
Fix bug introduced in previous change and add documentation
--- a/CharacterEncoderImplementations__MS_Ansi.st Sat Jan 20 10:36:49 2018 +0100
+++ b/CharacterEncoderImplementations__MS_Ansi.st Mon Jan 22 20:47:41 2018 +0100
@@ -15,7 +15,7 @@
"{ NameSpace: CharacterEncoderImplementations }"
-SingleByteEncoder subclass:#MS_Ansi
+VariableBytesEncoder subclass:#MS_Ansi
instanceVariableNames:''
classVariableNames:''
poolDictionaries:''
@@ -40,7 +40,14 @@
documentation
"
- Microsoft ANSI
+ Microsoft ANSI - which is what Microsoft thought ANSI coding was in the 80s (whatever that should be).
+ It is definitely not an ANSI standard!!
+
+ In fact it is the CP1252 encoding, which is based on iso8859-1. Codepoints 0x80–0x9F, which are control characters
+ in iso8859 are mapped to special windows characters.
+
+ Apparently, meanwhile Microsoft supports codepoints above 0xff as unicode.
+ We map unicode codepoints which are defined in CP1252 to CP1252, and leave others unchanged.
[see with:]
CharacterEncoderImplementations::MS_Ansi showCharacterSet
@@ -326,165 +333,119 @@
"
! !
+!MS_Ansi class methodsFor:'queries'!
+
+maxCode
+ ^ 65535
+!
+
+minCode
+ ^ 0
+! !
+
!MS_Ansi methodsFor:'encoding & decoding'!
decode:codeArg
- |code "{ Class: SmallInteger }"|
+ |code "{ Class: SmallInteger }" t|
code := codeArg.
code <= 16r7F ifTrue:[ ^ code ].
- code > 16rFF ifTrue:[
+ code >= 16rA0 ifTrue:[
^ codeArg.
].
- [
- |t|
- t := #(
- "16r0080" 16r20AC " EURO character "
- "16r0081" 16r0000 " invalid "
- "16r0082" 16r201A " SINGLE LOW-9 QUOTATION MARK "
- "16r0083" 16r0192 " LATIN SMALL LETTER F WITH HOOK "
- "16r0084" 16r201E " DOUBLE LOW-9 QUOTATION MARK "
- "16r0085" 16r2026 " HORIZONTAL ELLIPSIS "
- "16r0086" 16r2020 " DAGGER "
- "16r0087" 16r2021 " DOUBLE DAGGER "
- "16r0088" 16r02C6 " MODIFIER LETTER CIRCUMFLEX ACCENT "
- "16r0089" 16r2030 " PER MILLE SIGN "
- "16r008A" 16r0160 " LATIN CAPITAL LETTER S WITH CARON "
- "16r008B" 16r2039 " SINGLE LEFT-POINTING ANGLE QUOTATION MARK "
- "16r008C" 16r0152 " LATIN CAPITAL LIGATURE OE "
- "16r008D" 16r0000 " invalid "
- "16r008E" 16r0000 " invalid "
- "16r008F" 16r0000 " invalid "
- "16r0090" 16r0000 " invalid "
- "16r0091" 16r2018 " LEFT SINGLE QUOTATION MARK "
- "16r0092" 16r2019 " RIGHT SINGLE QUOTATION MARK "
- "16r0093" 16r201C " LEFT DOUBLE QUOTATION MARK "
- "16r0094" 16r201D " RIGHT DOUBLE QUOTATION MARK "
- "16r0095" 16r2022 " BULLET "
- "16r0096" 16r2013 " EN DASH "
- "16r0097" 16r2014 " EM DASH "
- "16r0098" 16r02DC " SMALL TILDE "
- "16r0099" 16r2122 " TRADE MARK SIGN "
- "16r009A" 16r0161 " LATIN SMALL LETTER S WITH CARON "
- "16r009B" 16r203A " SINGLE RIGHT-POINTING ANGLE QUOTATION MARK "
- "16r009C" 16r0153 " LATIN SMALL LIGATURE OE "
- "16r009D" 16r0000 " invalid "
- "16r009E" 16r0000 " invalid "
- "16r009F" 16r0178 " LATIN CAPITAL LETTER Y WITH DIAERESIS "
- "16r00A0" 16r00A0 " NO-BREAK SPACE "
- "16r00A1" 16r00A1 " INVERTED EXCLAMATION MARK "
- "16r00A2" 16r00A2 " CENT SIGN "
- "16r00A3" 16r00A3 " POUND SIGN "
- "16r00A4" 16r00A4 " CURRENCY SIGN "
- "16r00A5" 16r00A5 " YEN SIGN "
- "16r00A6" 16r00A6 " BROKEN BAR "
- "16r00A7" 16r00A7 " SECTION SIGN "
- "16r00A8" 16r00A8 " DIAERESIS "
- "16r00A9" 16r00A9 " COPYRIGHT SIGN "
- "16r00AA" 16r00AA " FEMININE ORDINAL INDICATOR "
- "16r00AB" 16r00AB " LEFT-POINTING DOUBLE ANGLE QUOTATION MARK "
- "16r00AC" 16r00AC " NOT SIGN "
- "16r00AD" 16r00AD " SOFT HYPHEN "
- "16r00AE" 16r00AE " REGISTERED SIGN "
- "16r00AF" 16r00AF " MACRON "
- "16r00B0" 16r00B0 " DEGREE SIGN "
- "16r00B1" 16r00B1 " PLUS-MINUS SIGN "
- "16r00B2" 16r00B2 " SUPERSCRIPT TWO "
- "16r00B3" 16r00B3 " SUPERSCRIPT THREE "
- "16r00B4" 16r00B4 " ACUTE ACCENT "
- "16r00B5" 16r00B5 " MICRO SIGN "
- "16r00B6" 16r00B6 " PILCROW SIGN "
- "16r00B7" 16r00B7 " MIDDLE DOT "
- "16r00B8" 16r00B8 " CEDILLA "
- "16r00B9" 16r00B9 " SUPERSCRIPT ONE "
- "16r00BA" 16r00BA " MASCULINE ORDINAL INDICATOR "
- "16r00BB" 16r00BB " RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK "
- "16r00BC" 16r00BC " VULGAR FRACTION ONE QUARTER "
- "16r00BD" 16r00BD " VULGAR FRACTION ONE HALF "
- "16r00BE" 16r00BE " VULGAR FRACTION THREE QUARTERS "
- "16r00BF" 16r00BF " INVERTED QUESTION MARK "
- "16r00C0" 16r00C0 " LATIN CAPITAL LETTER A WITH GRAVE "
- "16r00C1" 16r00C1 " LATIN CAPITAL LETTER A WITH ACUTE "
- "16r00C2" 16r00C2 " LATIN CAPITAL LETTER A WITH CIRCUMFLEX "
- "16r00C3" 16r00C3 " LATIN CAPITAL LETTER A WITH TILDE "
- "16r00C4" 16r00C4 " LATIN CAPITAL LETTER A WITH DIAERESIS "
- "16r00C5" 16r00C5 " LATIN CAPITAL LETTER A WITH RING ABOVE "
- "16r00C6" 16r00C6 " LATIN CAPITAL LETTER AE "
- "16r00C7" 16r00C7 " LATIN CAPITAL LETTER C WITH CEDILLA "
- "16r00C8" 16r00C8 " LATIN CAPITAL LETTER E WITH GRAVE "
- "16r00C9" 16r00C9 " LATIN CAPITAL LETTER E WITH ACUTE "
- "16r00CA" 16r00CA " LATIN CAPITAL LETTER E WITH CIRCUMFLEX "
- "16r00CB" 16r00CB " LATIN CAPITAL LETTER E WITH DIAERESIS "
- "16r00CC" 16r00CC " LATIN CAPITAL LETTER I WITH GRAVE "
- "16r00CD" 16r00CD " LATIN CAPITAL LETTER I WITH ACUTE "
- "16r00CE" 16r00CE " LATIN CAPITAL LETTER I WITH CIRCUMFLEX "
- "16r00CF" 16r00CF " LATIN CAPITAL LETTER I WITH DIAERESIS "
- "16r00D0" 16r00D0 " LATIN CAPITAL LETTER ETH (Icelandic) "
- "16r00D1" 16r00D1 " LATIN CAPITAL LETTER N WITH TILDE "
- "16r00D2" 16r00D2 " LATIN CAPITAL LETTER O WITH GRAVE "
- "16r00D3" 16r00D3 " LATIN CAPITAL LETTER O WITH ACUTE "
- "16r00D4" 16r00D4 " LATIN CAPITAL LETTER O WITH CIRCUMFLEX "
- "16r00D5" 16r00D5 " LATIN CAPITAL LETTER O WITH TILDE "
- "16r00D6" 16r00D6 " LATIN CAPITAL LETTER O WITH DIAERESIS "
- "16r00D7" 16r00D7 " MULTIPLICATION SIGN "
- "16r00D8" 16r00D8 " LATIN CAPITAL LETTER O WITH STROKE "
- "16r00D9" 16r00D9 " LATIN CAPITAL LETTER U WITH GRAVE "
- "16r00DA" 16r00DA " LATIN CAPITAL LETTER U WITH ACUTE "
- "16r00DB" 16r00DB " LATIN CAPITAL LETTER U WITH CIRCUMFLEX "
- "16r00DC" 16r00DC " LATIN CAPITAL LETTER U WITH DIAERESIS "
- "16r00DD" 16r00DD " LATIN CAPITAL LETTER Y WITH ACUTE "
- "16r00DE" 16r00DE " LATIN CAPITAL LETTER THORN (Icelandic) "
- "16r00DF" 16r00DF " LATIN SMALL LETTER SHARP S (German) "
- "16r00E0" 16r00E0 " LATIN SMALL LETTER A WITH GRAVE "
- "16r00E1" 16r00E1 " LATIN SMALL LETTER A WITH ACUTE "
- "16r00E2" 16r00E2 " LATIN SMALL LETTER A WITH CIRCUMFLEX "
- "16r00E3" 16r00E3 " LATIN SMALL LETTER A WITH TILDE "
- "16r00E4" 16r00E4 " LATIN SMALL LETTER A WITH DIAERESIS "
- "16r00E5" 16r00E5 " LATIN SMALL LETTER A WITH RING ABOVE "
- "16r00E6" 16r00E6 " LATIN SMALL LETTER AE "
- "16r00E7" 16r00E7 " LATIN SMALL LETTER C WITH CEDILLA "
- "16r00E8" 16r00E8 " LATIN SMALL LETTER E WITH GRAVE "
- "16r00E9" 16r00E9 " LATIN SMALL LETTER E WITH ACUTE "
- "16r00EA" 16r00EA " LATIN SMALL LETTER E WITH CIRCUMFLEX "
- "16r00EB" 16r00EB " LATIN SMALL LETTER E WITH DIAERESIS "
- "16r00EC" 16r00EC " LATIN SMALL LETTER I WITH GRAVE "
- "16r00ED" 16r00ED " LATIN SMALL LETTER I WITH ACUTE "
- "16r00EE" 16r00EE " LATIN SMALL LETTER I WITH CIRCUMFLEX "
- "16r00EF" 16r00EF " LATIN SMALL LETTER I WITH DIAERESIS "
- "16r00F0" 16r00F0 " LATIN SMALL LETTER ETH (Icelandic) "
- "16r00F1" 16r00F1 " LATIN SMALL LETTER N WITH TILDE "
- "16r00F2" 16r00F2 " LATIN SMALL LETTER O WITH GRAVE "
- "16r00F3" 16r00F3 " LATIN SMALL LETTER O WITH ACUTE "
- "16r00F4" 16r00F4 " LATIN SMALL LETTER O WITH CIRCUMFLEX "
- "16r00F5" 16r00F5 " LATIN SMALL LETTER O WITH TILDE "
- "16r00F6" 16r00F6 " LATIN SMALL LETTER O WITH DIAERESIS "
- "16r00F7" 16r00F7 " DIVISION SIGN "
- "16r00F8" 16r00F8 " LATIN SMALL LETTER O WITH STROKE "
- "16r00F9" 16r00F9 " LATIN SMALL LETTER U WITH GRAVE "
- "16r00FA" 16r00FA " LATIN SMALL LETTER U WITH ACUTE "
- "16r00FB" 16r00FB " LATIN SMALL LETTER U WITH CIRCUMFLEX "
- "16r00FC" 16r00FC " LATIN SMALL LETTER U WITH DIAERESIS "
- "16r00FD" 16r00FD " LATIN SMALL LETTER Y WITH ACUTE "
- "16r00FE" 16r00FE " LATIN SMALL LETTER THORN (Icelandic) "
- "16r00FF" 16r00FF " LATIN SMALL LETTER Y WITH DIAERESIS "
- ) at:(code - 16r7F).
- t == 0 ifFalse:[^ t].
- ^ self decodingError.
- ] value.
+
+ "we map CP1252 chars to unicode chars"
+
+ t := #(
+ "16r0080" 16r20AC " EURO character "
+ "16r0081" 16r0000 " invalid "
+ "16r0082" 16r201A " SINGLE LOW-9 QUOTATION MARK "
+ "16r0083" 16r0192 " LATIN SMALL LETTER F WITH HOOK "
+ "16r0084" 16r201E " DOUBLE LOW-9 QUOTATION MARK "
+ "16r0085" 16r2026 " HORIZONTAL ELLIPSIS "
+ "16r0086" 16r2020 " DAGGER "
+ "16r0087" 16r2021 " DOUBLE DAGGER "
+ "16r0088" 16r02C6 " MODIFIER LETTER CIRCUMFLEX ACCENT "
+ "16r0089" 16r2030 " PER MILLE SIGN "
+ "16r008A" 16r0160 " LATIN CAPITAL LETTER S WITH CARON "
+ "16r008B" 16r2039 " SINGLE LEFT-POINTING ANGLE QUOTATION MARK "
+ "16r008C" 16r0152 " LATIN CAPITAL LIGATURE OE "
+ "16r008D" 16r0000 " invalid "
+ "16r008E" 16r0000 " invalid "
+ "16r008F" 16r0000 " invalid "
+ "16r0090" 16r0000 " invalid "
+ "16r0091" 16r2018 " LEFT SINGLE QUOTATION MARK "
+ "16r0092" 16r2019 " RIGHT SINGLE QUOTATION MARK "
+ "16r0093" 16r201C " LEFT DOUBLE QUOTATION MARK "
+ "16r0094" 16r201D " RIGHT DOUBLE QUOTATION MARK "
+ "16r0095" 16r2022 " BULLET "
+ "16r0096" 16r2013 " EN DASH "
+ "16r0097" 16r2014 " EM DASH "
+ "16r0098" 16r02DC " SMALL TILDE "
+ "16r0099" 16r2122 " TRADE MARK SIGN "
+ "16r009A" 16r0161 " LATIN SMALL LETTER S WITH CARON "
+ "16r009B" 16r203A " SINGLE RIGHT-POINTING ANGLE QUOTATION MARK "
+ "16r009C" 16r0153 " LATIN SMALL LIGATURE OE "
+ "16r009D" 16r0000 " invalid "
+ "16r009E" 16r0000 " invalid "
+ "16r009F" 16r0178 " LATIN CAPITAL LETTER Y WITH DIAERESIS "
+ ) at:(code - 16r7F).
+ t == 0 ifFalse:[^ t].
+ ^ self decodingError.
"Modified (format): / 12-07-2012 / 14:06:56 / cg"
!
+decodeString:anEncodedStringOrByteCollection
+ "given a string in my encoding, return a unicode-string for it"
+
+ |newString myCode code bits size "{ Class:SmallInteger }"|
+
+ size := anEncodedStringOrByteCollection size.
+ newString := String new:size.
+ bits := newString bitsPerCharacter.
+
+ 1 to:size do:[:idx |
+ code := (anEncodedStringOrByteCollection at:idx) codePoint.
+ myCode := self decode:code.
+ myCode > 16rFF ifTrue:[
+ myCode > 16rFFFF ifTrue:[
+ bits < 32 ifTrue:[
+ newString := Unicode32String fromString:newString.
+ bits := 32.
+ ]
+ ] ifFalse:[
+ bits < 16 ifTrue:[
+ newString := Unicode16String fromString:newString.
+ bits := 16.
+ ]
+ ]
+ ].
+ newString at:idx put:(Character codePoint:myCode).
+ ].
+ ^ newString
+
+ "
+ CharacterEncoderImplementations::MS_Ansi decodeString:'hello'
+ "
+
+ "Created: / 16-01-2018 / 19:54:02 / stefan"
+ "Modified (format): / 17-01-2018 / 16:30:59 / stefan"
+!
+
encode:unicodeArg
- |unicode "{ Class: SmallInteger }"|
+ |unicode "{ Class: SmallInteger }" t|
+false ifTrue:[
"/ mh - it seems that microsoft has fixed ms-ansi to be unicode compatible
"/ with XP, Vista etc.
"/ as W95 is not supported anyhow, simply return identity here...
-false ifTrue:[
^ unicodeArg.
].
+ "we map unicode chars to CP1252 where a mapping exists.
+ If no mapping exists, we keep the unicode char"
+
unicode := unicodeArg.
unicode > 16r2122 ifTrue:[
^ unicode.
@@ -496,29 +457,26 @@
unicode <= 16r192 ifTrue:[
unicode <= 16r178 ifTrue:[
unicode <= 16r161 ifTrue:[
- [
- |t|
- t := #[
- "16r0152" 16r8C " LATIN CAPITAL LIGATURE OE "
- "16r0153" 16r9C " LATIN SMALL LIGATURE OE "
- "16r0154" 16r00 " invalid "
- "16r0155" 16r00 " invalid "
- "16r0156" 16r00 " invalid "
- "16r0157" 16r00 " invalid "
- "16r0158" 16r00 " invalid "
- "16r0159" 16r00 " invalid "
- "16r015A" 16r00 " invalid "
- "16r015B" 16r00 " invalid "
- "16r015C" 16r00 " invalid "
- "16r015D" 16r00 " invalid "
- "16r015E" 16r00 " invalid "
- "16r015F" 16r00 " invalid "
- "16r0160" 16r8A " LATIN CAPITAL LETTER S WITH CARON "
- "16r0161" 16r9A " LATIN SMALL LETTER S WITH CARON "
- ] at:(unicode - 16r151).
- t == 0 ifFalse:[^ t].
- ^ unicode
- ] value.
+ t := #[
+ "16r0152" 16r8C " LATIN CAPITAL LIGATURE OE "
+ "16r0153" 16r9C " LATIN SMALL LIGATURE OE "
+ "16r0154" 16r00 " keep unicode "
+ "16r0155" 16r00 " keep unicode "
+ "16r0156" 16r00 " keep unicode "
+ "16r0157" 16r00 " keep unicode "
+ "16r0158" 16r00 " keep unicode "
+ "16r0159" 16r00 " keep unicode "
+ "16r015A" 16r00 " keep unicode "
+ "16r015B" 16r00 " keep unicode "
+ "16r015C" 16r00 " keep unicode "
+ "16r015D" 16r00 " keep unicode "
+ "16r015E" 16r00 " keep unicode "
+ "16r015F" 16r00 " keep unicode "
+ "16r0160" 16r8A " LATIN CAPITAL LETTER S WITH CARON "
+ "16r0161" 16r9A " LATIN SMALL LETTER S WITH CARON "
+ ] at:(unicode - 16r151).
+ t == 0 ifFalse:[^ t].
+ ^ unicode
].
unicode <= 16r177 ifTrue:[
^ unicode
@@ -545,53 +503,50 @@
^ unicode
].
unicode <= 16r203A ifTrue:[
- [
- |t|
- t := #(
- "16r2013" 16r2013 "16r96" " EN DASH "
- "16r2014" 16r2014 "16r97" " EM DASH "
- "16r2015" 16r00 " invalid "
- "16r2016" 16r00 " invalid "
- "16r2017" 16r00 " invalid "
- "16r2018" 16r91 " LEFT SINGLE QUOTATION MARK "
- "16r2019" 16r92 " RIGHT SINGLE QUOTATION MARK "
- "16r201A" 16r82 " SINGLE LOW-9 QUOTATION MARK "
- "16r201B" 16r00 " invalid "
- "16r201C" 16r93 " LEFT DOUBLE QUOTATION MARK "
- "16r201D" 16r94 " RIGHT DOUBLE QUOTATION MARK "
- "16r201E" 16r84 " DOUBLE LOW-9 QUOTATION MARK "
- "16r201F" 16r00 " invalid "
- "16r2020" 16r86 " DAGGER "
- "16r2021" 16r87 " DOUBLE DAGGER "
- "16r2022" 16r95 " BULLET "
- "16r2023" 16r00 " invalid "
- "16r2024" 16r00 " invalid "
- "16r2025" 16r00 " invalid "
- "16r2026" 16r85 " HORIZONTAL ELLIPSIS "
- "16r2027" 16r00 " invalid "
- "16r2028" 16r00 " invalid "
- "16r2029" 16r00 " invalid "
- "16r202A" 16r00 " invalid "
- "16r202B" 16r00 " invalid "
- "16r202C" 16r00 " invalid "
- "16r202D" 16r00 " invalid "
- "16r202E" 16r00 " invalid "
- "16r202F" 16r00 " invalid "
- "16r2030" 16r89 " PER MILLE SIGN "
- "16r2031" 16r00 " invalid "
- "16r2032" 16r00 " invalid "
- "16r2033" 16r00 " invalid "
- "16r2034" 16r00 " invalid "
- "16r2035" 16r00 " invalid "
- "16r2036" 16r00 " invalid "
- "16r2037" 16r00 " invalid "
- "16r2038" 16r00 " invalid "
- "16r2039" 16r8B " SINGLE LEFT-POINTING ANGLE QUOTATION MARK "
- "16r203A" 16r9B " SINGLE RIGHT-POINTING ANGLE QUOTATION MARK "
- ) at:(unicode - 16r2012).
- t == 0 ifFalse:[^ t].
- ^ unicode
- ] value.
+ t := #(
+ "16r2013" 16r2013 "16r96" " EN DASH "
+ "16r2014" 16r2014 "16r97" " EM DASH "
+ "16r2015" 16r00 " keep unicode "
+ "16r2016" 16r00 " keep unicode "
+ "16r2017" 16r00 " keep unicode "
+ "16r2018" 16r91 " LEFT SINGLE QUOTATION MARK "
+ "16r2019" 16r92 " RIGHT SINGLE QUOTATION MARK "
+ "16r201A" 16r82 " SINGLE LOW-9 QUOTATION MARK "
+ "16r201B" 16r00 " keep unicode "
+ "16r201C" 16r93 " LEFT DOUBLE QUOTATION MARK "
+ "16r201D" 16r94 " RIGHT DOUBLE QUOTATION MARK "
+ "16r201E" 16r84 " DOUBLE LOW-9 QUOTATION MARK "
+ "16r201F" 16r00 " keep unicode "
+ "16r2020" 16r86 " DAGGER "
+ "16r2021" 16r87 " DOUBLE DAGGER "
+ "16r2022" 16r95 " BULLET "
+ "16r2023" 16r00 " keep unicode "
+ "16r2024" 16r00 " keep unicode "
+ "16r2025" 16r00 " keep unicode "
+ "16r2026" 16r85 " HORIZONTAL ELLIPSIS "
+ "16r2027" 16r00 " keep unicode "
+ "16r2028" 16r00 " keep unicode "
+ "16r2029" 16r00 " keep unicode "
+ "16r202A" 16r00 " keep unicode "
+ "16r202B" 16r00 " keep unicode "
+ "16r202C" 16r00 " keep unicode "
+ "16r202D" 16r00 " keep unicode "
+ "16r202E" 16r00 " keep unicode "
+ "16r202F" 16r00 " keep unicode "
+ "16r2030" 16r89 " PER MILLE SIGN "
+ "16r2031" 16r00 " keep unicode "
+ "16r2032" 16r00 " keep unicode "
+ "16r2033" 16r00 " keep unicode "
+ "16r2034" 16r00 " keep unicode "
+ "16r2035" 16r00 " keep unicode "
+ "16r2036" 16r00 " keep unicode "
+ "16r2037" 16r00 " keep unicode "
+ "16r2038" 16r00 " keep unicode "
+ "16r2039" 16r8B " SINGLE LEFT-POINTING ANGLE QUOTATION MARK "
+ "16r203A" 16r9B " SINGLE RIGHT-POINTING ANGLE QUOTATION MARK "
+ ) at:(unicode - 16r2012).
+ t == 0 ifFalse:[^ t].
+ ^ unicode
].
"/ unicode = 16r20AC ifTrue:[
"/ ^ 16r0080 " EURO character "
@@ -607,9 +562,13 @@
encodeString:aStringOrUnicodeString
"redefined to speedup simple 8 bit strings"
+ |newString myCode bits size "{ Class:SmallInteger }"|
+
+
"/ mh - it seems that microsoft has fixed ms-ansi to be unicode compatible
"/ with XP, Vista etc.
"/ as W95 is not supported anyhow, simply return identity here...
+
false ifTrue:[
^ aStringOrUnicodeString.
].
@@ -618,9 +577,32 @@
(aStringOrUnicodeString containsNon7BitAscii) ifFalse:[
^ aStringOrUnicodeString asSingleByteString.
].
- ^ super encodeString:aStringOrUnicodeString
+
+ size := aStringOrUnicodeString size.
+ newString := String new:size.
+ bits := newString bitsPerCharacter.
- "Modified: / 12-07-2012 / 14:47:13 / cg"
+ 1 to:size do:[:idx |
+ myCode := self encode:((aStringOrUnicodeString at:idx) codePoint).
+ myCode > 16rFF ifTrue:[
+ myCode > 16rFFFF ifTrue:[
+ bits < 32 ifTrue:[
+ newString := Unicode32String fromString:newString.
+ bits := 32.
+ ]
+ ] ifFalse:[
+ bits < 16 ifTrue:[
+ newString := Unicode16String fromString:newString.
+ bits := 16.
+ ]
+ ]
+ ].
+ newString at:idx put:(Character codePoint:myCode).
+ ].
+ ^ newString
+
+ "Created: / 16-01-2018 / 19:53:33 / stefan"
+ "Modified: / 17-01-2018 / 14:15:39 / stefan"
! !
!MS_Ansi class methodsFor:'documentation'!