#BUGFIX by stefan
authorStefan Vogel <sv@exept.de>
Mon, 22 Jan 2018 20:47:41 +0100
changeset 22501 93981acbcba3
parent 22500 aed0c5a72ba6
child 22502 64f23cbc14df
#BUGFIX by stefan class: CharacterEncoderImplementations::MS_Ansi class definition added: #decodeString: changed: #decode: #encode: #encodeString: class: CharacterEncoderImplementations::MS_Ansi class added: #maxCode #minCode comment/format in: #documentation Fix bug introduced in previous change and add documentation
CharacterEncoderImplementations__MS_Ansi.st
--- a/CharacterEncoderImplementations__MS_Ansi.st	Sat Jan 20 10:36:49 2018 +0100
+++ b/CharacterEncoderImplementations__MS_Ansi.st	Mon Jan 22 20:47:41 2018 +0100
@@ -15,7 +15,7 @@
 
 "{ NameSpace: CharacterEncoderImplementations }"
 
-SingleByteEncoder subclass:#MS_Ansi
+VariableBytesEncoder subclass:#MS_Ansi
 	instanceVariableNames:''
 	classVariableNames:''
 	poolDictionaries:''
@@ -40,7 +40,14 @@
 
 documentation
 "
-    Microsoft ANSI
+    Microsoft ANSI - which is what Microsoft thought ANSI coding was in the 80s (whatever that should be).
+    It is definitely not an ANSI standard!!
+
+    In fact it is CP1252 encoding which is based on iso8859-1. Codepoints 0x80–0x9F which are control characters
+    in iso8859 are mapped to special windows characters.
+
+    Apparently, meanwhile Microsoft supports codepoints above 0xff as unicode.
+    We map unicode codepoints which are defined in CP1252 to CP1252, and leave others unchanged.
 
     [see with:]
         CharacterEncoderImplementations::MS_Ansi showCharacterSet
@@ -326,165 +333,119 @@
 "
 ! !
 
+!MS_Ansi class methodsFor:'queries'!
+
+maxCode
+    ^ 65535 
+!
+
+minCode
+    ^ 0 
+! !
+
 !MS_Ansi methodsFor:'encoding & decoding'!
 
 decode:codeArg
-    |code "{ Class: SmallInteger }"|
+    |code "{ Class: SmallInteger }" t|
 
     code := codeArg.
     code <= 16r7F ifTrue:[ ^ code ].
-    code > 16rFF ifTrue:[
+    code >= 16rA0 ifTrue:[
         ^ codeArg.
     ].
-    [
-        |t|
-        t := #(
-           "16r0080"    16r20AC " EURO character " 
-           "16r0081"    16r0000 " invalid " 
-           "16r0082"    16r201A " SINGLE LOW-9 QUOTATION MARK " 
-           "16r0083"    16r0192 " LATIN SMALL LETTER F WITH HOOK " 
-           "16r0084"    16r201E " DOUBLE LOW-9 QUOTATION MARK " 
-           "16r0085"    16r2026 " HORIZONTAL ELLIPSIS " 
-           "16r0086"    16r2020 " DAGGER " 
-           "16r0087"    16r2021 " DOUBLE DAGGER " 
-           "16r0088"    16r02C6 " MODIFIER LETTER CIRCUMFLEX ACCENT " 
-           "16r0089"    16r2030 " PER MILLE SIGN " 
-           "16r008A"    16r0160 " LATIN CAPITAL LETTER S WITH CARON " 
-           "16r008B"    16r2039 " SINGLE LEFT-POINTING ANGLE QUOTATION MARK " 
-           "16r008C"    16r0152 " LATIN CAPITAL LIGATURE OE " 
-           "16r008D"    16r0000 " invalid " 
-           "16r008E"    16r0000 " invalid " 
-           "16r008F"    16r0000 " invalid " 
-           "16r0090"    16r0000 " invalid " 
-           "16r0091"    16r2018 " LEFT SINGLE QUOTATION MARK " 
-           "16r0092"    16r2019 " RIGHT SINGLE QUOTATION MARK " 
-           "16r0093"    16r201C " LEFT DOUBLE QUOTATION MARK " 
-           "16r0094"    16r201D " RIGHT DOUBLE QUOTATION MARK " 
-           "16r0095"    16r2022 " BULLET " 
-           "16r0096"    16r2013 " EN DASH " 
-           "16r0097"    16r2014 " EM DASH " 
-           "16r0098"    16r02DC " SMALL TILDE " 
-           "16r0099"    16r2122 " TRADE MARK SIGN " 
-           "16r009A"    16r0161 " LATIN SMALL LETTER S WITH CARON " 
-           "16r009B"    16r203A " SINGLE RIGHT-POINTING ANGLE QUOTATION MARK " 
-           "16r009C"    16r0153 " LATIN SMALL LIGATURE OE " 
-           "16r009D"    16r0000 " invalid " 
-           "16r009E"    16r0000 " invalid " 
-           "16r009F"    16r0178 " LATIN CAPITAL LETTER Y WITH DIAERESIS " 
-           "16r00A0"    16r00A0 " NO-BREAK SPACE " 
-           "16r00A1"    16r00A1 " INVERTED EXCLAMATION MARK " 
-           "16r00A2"    16r00A2 " CENT SIGN " 
-           "16r00A3"    16r00A3 " POUND SIGN " 
-           "16r00A4"    16r00A4 " CURRENCY SIGN " 
-           "16r00A5"    16r00A5 " YEN SIGN " 
-           "16r00A6"    16r00A6 " BROKEN BAR " 
-           "16r00A7"    16r00A7 " SECTION SIGN " 
-           "16r00A8"    16r00A8 " DIAERESIS " 
-           "16r00A9"    16r00A9 " COPYRIGHT SIGN " 
-           "16r00AA"    16r00AA " FEMININE ORDINAL INDICATOR " 
-           "16r00AB"    16r00AB " LEFT-POINTING DOUBLE ANGLE QUOTATION MARK " 
-           "16r00AC"    16r00AC " NOT SIGN " 
-           "16r00AD"    16r00AD " SOFT HYPHEN " 
-           "16r00AE"    16r00AE " REGISTERED SIGN " 
-           "16r00AF"    16r00AF " MACRON " 
-           "16r00B0"    16r00B0 " DEGREE SIGN " 
-           "16r00B1"    16r00B1 " PLUS-MINUS SIGN " 
-           "16r00B2"    16r00B2 " SUPERSCRIPT TWO " 
-           "16r00B3"    16r00B3 " SUPERSCRIPT THREE " 
-           "16r00B4"    16r00B4 " ACUTE ACCENT " 
-           "16r00B5"    16r00B5 " MICRO SIGN " 
-           "16r00B6"    16r00B6 " PILCROW SIGN " 
-           "16r00B7"    16r00B7 " MIDDLE DOT " 
-           "16r00B8"    16r00B8 " CEDILLA " 
-           "16r00B9"    16r00B9 " SUPERSCRIPT ONE " 
-           "16r00BA"    16r00BA " MASCULINE ORDINAL INDICATOR " 
-           "16r00BB"    16r00BB " RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK " 
-           "16r00BC"    16r00BC " VULGAR FRACTION ONE QUARTER " 
-           "16r00BD"    16r00BD " VULGAR FRACTION ONE HALF " 
-           "16r00BE"    16r00BE " VULGAR FRACTION THREE QUARTERS " 
-           "16r00BF"    16r00BF " INVERTED QUESTION MARK " 
-           "16r00C0"    16r00C0 " LATIN CAPITAL LETTER A WITH GRAVE " 
-           "16r00C1"    16r00C1 " LATIN CAPITAL LETTER A WITH ACUTE " 
-           "16r00C2"    16r00C2 " LATIN CAPITAL LETTER A WITH CIRCUMFLEX " 
-           "16r00C3"    16r00C3 " LATIN CAPITAL LETTER A WITH TILDE " 
-           "16r00C4"    16r00C4 " LATIN CAPITAL LETTER A WITH DIAERESIS " 
-           "16r00C5"    16r00C5 " LATIN CAPITAL LETTER A WITH RING ABOVE " 
-           "16r00C6"    16r00C6 " LATIN CAPITAL LETTER AE " 
-           "16r00C7"    16r00C7 " LATIN CAPITAL LETTER C WITH CEDILLA " 
-           "16r00C8"    16r00C8 " LATIN CAPITAL LETTER E WITH GRAVE " 
-           "16r00C9"    16r00C9 " LATIN CAPITAL LETTER E WITH ACUTE " 
-           "16r00CA"    16r00CA " LATIN CAPITAL LETTER E WITH CIRCUMFLEX " 
-           "16r00CB"    16r00CB " LATIN CAPITAL LETTER E WITH DIAERESIS " 
-           "16r00CC"    16r00CC " LATIN CAPITAL LETTER I WITH GRAVE " 
-           "16r00CD"    16r00CD " LATIN CAPITAL LETTER I WITH ACUTE " 
-           "16r00CE"    16r00CE " LATIN CAPITAL LETTER I WITH CIRCUMFLEX " 
-           "16r00CF"    16r00CF " LATIN CAPITAL LETTER I WITH DIAERESIS " 
-           "16r00D0"    16r00D0 " LATIN CAPITAL LETTER ETH (Icelandic) " 
-           "16r00D1"    16r00D1 " LATIN CAPITAL LETTER N WITH TILDE " 
-           "16r00D2"    16r00D2 " LATIN CAPITAL LETTER O WITH GRAVE " 
-           "16r00D3"    16r00D3 " LATIN CAPITAL LETTER O WITH ACUTE " 
-           "16r00D4"    16r00D4 " LATIN CAPITAL LETTER O WITH CIRCUMFLEX " 
-           "16r00D5"    16r00D5 " LATIN CAPITAL LETTER O WITH TILDE " 
-           "16r00D6"    16r00D6 " LATIN CAPITAL LETTER O WITH DIAERESIS " 
-           "16r00D7"    16r00D7 " MULTIPLICATION SIGN " 
-           "16r00D8"    16r00D8 " LATIN CAPITAL LETTER O WITH STROKE " 
-           "16r00D9"    16r00D9 " LATIN CAPITAL LETTER U WITH GRAVE " 
-           "16r00DA"    16r00DA " LATIN CAPITAL LETTER U WITH ACUTE " 
-           "16r00DB"    16r00DB " LATIN CAPITAL LETTER U WITH CIRCUMFLEX " 
-           "16r00DC"    16r00DC " LATIN CAPITAL LETTER U WITH DIAERESIS " 
-           "16r00DD"    16r00DD " LATIN CAPITAL LETTER Y WITH ACUTE " 
-           "16r00DE"    16r00DE " LATIN CAPITAL LETTER THORN (Icelandic) " 
-           "16r00DF"    16r00DF " LATIN SMALL LETTER SHARP S (German) " 
-           "16r00E0"    16r00E0 " LATIN SMALL LETTER A WITH GRAVE " 
-           "16r00E1"    16r00E1 " LATIN SMALL LETTER A WITH ACUTE " 
-           "16r00E2"    16r00E2 " LATIN SMALL LETTER A WITH CIRCUMFLEX " 
-           "16r00E3"    16r00E3 " LATIN SMALL LETTER A WITH TILDE " 
-           "16r00E4"    16r00E4 " LATIN SMALL LETTER A WITH DIAERESIS " 
-           "16r00E5"    16r00E5 " LATIN SMALL LETTER A WITH RING ABOVE " 
-           "16r00E6"    16r00E6 " LATIN SMALL LETTER AE " 
-           "16r00E7"    16r00E7 " LATIN SMALL LETTER C WITH CEDILLA " 
-           "16r00E8"    16r00E8 " LATIN SMALL LETTER E WITH GRAVE " 
-           "16r00E9"    16r00E9 " LATIN SMALL LETTER E WITH ACUTE " 
-           "16r00EA"    16r00EA " LATIN SMALL LETTER E WITH CIRCUMFLEX " 
-           "16r00EB"    16r00EB " LATIN SMALL LETTER E WITH DIAERESIS " 
-           "16r00EC"    16r00EC " LATIN SMALL LETTER I WITH GRAVE " 
-           "16r00ED"    16r00ED " LATIN SMALL LETTER I WITH ACUTE " 
-           "16r00EE"    16r00EE " LATIN SMALL LETTER I WITH CIRCUMFLEX " 
-           "16r00EF"    16r00EF " LATIN SMALL LETTER I WITH DIAERESIS " 
-           "16r00F0"    16r00F0 " LATIN SMALL LETTER ETH (Icelandic) " 
-           "16r00F1"    16r00F1 " LATIN SMALL LETTER N WITH TILDE " 
-           "16r00F2"    16r00F2 " LATIN SMALL LETTER O WITH GRAVE " 
-           "16r00F3"    16r00F3 " LATIN SMALL LETTER O WITH ACUTE " 
-           "16r00F4"    16r00F4 " LATIN SMALL LETTER O WITH CIRCUMFLEX " 
-           "16r00F5"    16r00F5 " LATIN SMALL LETTER O WITH TILDE " 
-           "16r00F6"    16r00F6 " LATIN SMALL LETTER O WITH DIAERESIS " 
-           "16r00F7"    16r00F7 " DIVISION SIGN " 
-           "16r00F8"    16r00F8 " LATIN SMALL LETTER O WITH STROKE " 
-           "16r00F9"    16r00F9 " LATIN SMALL LETTER U WITH GRAVE " 
-           "16r00FA"    16r00FA " LATIN SMALL LETTER U WITH ACUTE " 
-           "16r00FB"    16r00FB " LATIN SMALL LETTER U WITH CIRCUMFLEX " 
-           "16r00FC"    16r00FC " LATIN SMALL LETTER U WITH DIAERESIS " 
-           "16r00FD"    16r00FD " LATIN SMALL LETTER Y WITH ACUTE " 
-           "16r00FE"    16r00FE " LATIN SMALL LETTER THORN (Icelandic) " 
-           "16r00FF"    16r00FF " LATIN SMALL LETTER Y WITH DIAERESIS " 
-           ) at:(code - 16r7F).
-        t == 0 ifFalse:[^ t].
-        ^ self decodingError.
-    ] value.
+
+    "we map CP1252 chars to unicode chars"
+
+    t := #(
+       "16r0080"    16r20AC " EURO character " 
+       "16r0081"    16r0000 " invalid " 
+       "16r0082"    16r201A " SINGLE LOW-9 QUOTATION MARK " 
+       "16r0083"    16r0192 " LATIN SMALL LETTER F WITH HOOK " 
+       "16r0084"    16r201E " DOUBLE LOW-9 QUOTATION MARK " 
+       "16r0085"    16r2026 " HORIZONTAL ELLIPSIS " 
+       "16r0086"    16r2020 " DAGGER " 
+       "16r0087"    16r2021 " DOUBLE DAGGER " 
+       "16r0088"    16r02C6 " MODIFIER LETTER CIRCUMFLEX ACCENT " 
+       "16r0089"    16r2030 " PER MILLE SIGN " 
+       "16r008A"    16r0160 " LATIN CAPITAL LETTER S WITH CARON " 
+       "16r008B"    16r2039 " SINGLE LEFT-POINTING ANGLE QUOTATION MARK " 
+       "16r008C"    16r0152 " LATIN CAPITAL LIGATURE OE " 
+       "16r008D"    16r0000 " invalid " 
+       "16r008E"    16r0000 " invalid " 
+       "16r008F"    16r0000 " invalid " 
+       "16r0090"    16r0000 " invalid " 
+       "16r0091"    16r2018 " LEFT SINGLE QUOTATION MARK " 
+       "16r0092"    16r2019 " RIGHT SINGLE QUOTATION MARK " 
+       "16r0093"    16r201C " LEFT DOUBLE QUOTATION MARK " 
+       "16r0094"    16r201D " RIGHT DOUBLE QUOTATION MARK " 
+       "16r0095"    16r2022 " BULLET " 
+       "16r0096"    16r2013 " EN DASH " 
+       "16r0097"    16r2014 " EM DASH " 
+       "16r0098"    16r02DC " SMALL TILDE " 
+       "16r0099"    16r2122 " TRADE MARK SIGN " 
+       "16r009A"    16r0161 " LATIN SMALL LETTER S WITH CARON " 
+       "16r009B"    16r203A " SINGLE RIGHT-POINTING ANGLE QUOTATION MARK " 
+       "16r009C"    16r0153 " LATIN SMALL LIGATURE OE " 
+       "16r009D"    16r0000 " invalid " 
+       "16r009E"    16r0000 " invalid " 
+       "16r009F"    16r0178 " LATIN CAPITAL LETTER Y WITH DIAERESIS " 
+       ) at:(code - 16r7F).
+    t == 0 ifFalse:[^ t].
+    ^ self decodingError.
 
     "Modified (format): / 12-07-2012 / 14:06:56 / cg"
 !
 
+decodeString:anEncodedStringOrByteCollection
+    "given a string in my encoding, return a unicode-string for it"
+
+    |newString myCode code bits size "{ Class:SmallInteger }"|
+
+    size := anEncodedStringOrByteCollection size.
+    newString := String new:size.
+    bits := newString bitsPerCharacter.
+
+    1 to:size do:[:idx |
+        code := (anEncodedStringOrByteCollection at:idx) codePoint.
+        myCode := self decode:code.
+        myCode > 16rFF ifTrue:[
+            myCode > 16rFFFF ifTrue:[
+                bits < 32 ifTrue:[
+                    newString := Unicode32String fromString:newString.
+                    bits := 32.
+                ]
+            ] ifFalse:[
+                bits < 16 ifTrue:[
+                    newString := Unicode16String fromString:newString.
+                    bits := 16.
+                ]
+            ]
+        ].
+        newString at:idx put:(Character codePoint:myCode).
+    ].
+    ^ newString
+
+    "
+     CharacterEncoderImplementations::MS_Ansi decodeString:'hello'
+    "
+
+    "Created: / 16-01-2018 / 19:54:02 / stefan"
+    "Modified (format): / 17-01-2018 / 16:30:59 / stefan"
+!
+
 encode:unicodeArg
-    |unicode "{ Class: SmallInteger }"|
+    |unicode "{ Class: SmallInteger }" t|
 
+false ifTrue:[
     "/ mh - it seems that microsoft has fixed ms-ansi to be unicode compatible
     "/ with XP, Vista etc.
     "/ as W95 is not supported anyhow, simply return identity here...
-false ifTrue:[
     ^ unicodeArg.
 ].
 
+    "we map unicode chars to CP1252 where a mapping exists.
+     If no mapping exists, we keep the unicode char"
+
     unicode := unicodeArg.
     unicode > 16r2122 ifTrue:[
         ^ unicode.
@@ -496,29 +457,26 @@
         unicode <= 16r192 ifTrue:[
             unicode <= 16r178 ifTrue:[
                 unicode <= 16r161 ifTrue:[
-                    [
-                        |t|
-                        t := #[
-                           "16r0152"    16r8C " LATIN CAPITAL LIGATURE OE " 
-                           "16r0153"    16r9C " LATIN SMALL LIGATURE OE " 
-                           "16r0154"    16r00 " invalid " 
-                           "16r0155"    16r00 " invalid " 
-                           "16r0156"    16r00 " invalid " 
-                           "16r0157"    16r00 " invalid " 
-                           "16r0158"    16r00 " invalid " 
-                           "16r0159"    16r00 " invalid " 
-                           "16r015A"    16r00 " invalid " 
-                           "16r015B"    16r00 " invalid " 
-                           "16r015C"    16r00 " invalid " 
-                           "16r015D"    16r00 " invalid " 
-                           "16r015E"    16r00 " invalid " 
-                           "16r015F"    16r00 " invalid " 
-                           "16r0160"    16r8A " LATIN CAPITAL LETTER S WITH CARON " 
-                           "16r0161"    16r9A " LATIN SMALL LETTER S WITH CARON " 
-                           ] at:(unicode - 16r151).
-                        t == 0 ifFalse:[^ t].
-                        ^ unicode
-                    ] value.
+                    t := #[
+                       "16r0152"    16r8C " LATIN CAPITAL LIGATURE OE " 
+                       "16r0153"    16r9C " LATIN SMALL LIGATURE OE " 
+                       "16r0154"    16r00 " keep unicode " 
+                       "16r0155"    16r00 " keep unicode " 
+                       "16r0156"    16r00 " keep unicode " 
+                       "16r0157"    16r00 " keep unicode " 
+                       "16r0158"    16r00 " keep unicode " 
+                       "16r0159"    16r00 " keep unicode " 
+                       "16r015A"    16r00 " keep unicode " 
+                       "16r015B"    16r00 " keep unicode " 
+                       "16r015C"    16r00 " keep unicode " 
+                       "16r015D"    16r00 " keep unicode " 
+                       "16r015E"    16r00 " keep unicode " 
+                       "16r015F"    16r00 " keep unicode " 
+                       "16r0160"    16r8A " LATIN CAPITAL LETTER S WITH CARON " 
+                       "16r0161"    16r9A " LATIN SMALL LETTER S WITH CARON " 
+                       ] at:(unicode - 16r151).
+                    t == 0 ifFalse:[^ t].
+                    ^ unicode
                 ].
                 unicode <= 16r177 ifTrue:[
                     ^ unicode
@@ -545,53 +503,50 @@
         ^ unicode
     ].
     unicode <= 16r203A ifTrue:[
-        [
-            |t|
-            t := #(
-               "16r2013"    16r2013 "16r96" " EN DASH " 
-               "16r2014"    16r2014 "16r97" " EM DASH " 
-               "16r2015"    16r00 " invalid " 
-               "16r2016"    16r00 " invalid " 
-               "16r2017"    16r00 " invalid " 
-               "16r2018"    16r91 " LEFT SINGLE QUOTATION MARK " 
-               "16r2019"    16r92 " RIGHT SINGLE QUOTATION MARK " 
-               "16r201A"    16r82 " SINGLE LOW-9 QUOTATION MARK " 
-               "16r201B"    16r00 " invalid " 
-               "16r201C"    16r93 " LEFT DOUBLE QUOTATION MARK " 
-               "16r201D"    16r94 " RIGHT DOUBLE QUOTATION MARK " 
-               "16r201E"    16r84 " DOUBLE LOW-9 QUOTATION MARK " 
-               "16r201F"    16r00 " invalid " 
-               "16r2020"    16r86 " DAGGER " 
-               "16r2021"    16r87 " DOUBLE DAGGER " 
-               "16r2022"    16r95 " BULLET " 
-               "16r2023"    16r00 " invalid " 
-               "16r2024"    16r00 " invalid " 
-               "16r2025"    16r00 " invalid " 
-               "16r2026"    16r85 " HORIZONTAL ELLIPSIS " 
-               "16r2027"    16r00 " invalid " 
-               "16r2028"    16r00 " invalid " 
-               "16r2029"    16r00 " invalid " 
-               "16r202A"    16r00 " invalid " 
-               "16r202B"    16r00 " invalid " 
-               "16r202C"    16r00 " invalid " 
-               "16r202D"    16r00 " invalid " 
-               "16r202E"    16r00 " invalid " 
-               "16r202F"    16r00 " invalid " 
-               "16r2030"    16r89 " PER MILLE SIGN " 
-               "16r2031"    16r00 " invalid " 
-               "16r2032"    16r00 " invalid " 
-               "16r2033"    16r00 " invalid " 
-               "16r2034"    16r00 " invalid " 
-               "16r2035"    16r00 " invalid " 
-               "16r2036"    16r00 " invalid " 
-               "16r2037"    16r00 " invalid " 
-               "16r2038"    16r00 " invalid " 
-               "16r2039"    16r8B " SINGLE LEFT-POINTING ANGLE QUOTATION MARK " 
-               "16r203A"    16r9B " SINGLE RIGHT-POINTING ANGLE QUOTATION MARK " 
-               ) at:(unicode - 16r2012).
-            t == 0 ifFalse:[^ t].
-            ^ unicode
-        ] value.
+        t := #(
+           "16r2013"    16r2013 "16r96" " EN DASH " 
+           "16r2014"    16r2014 "16r97" " EM DASH " 
+           "16r2015"    16r00 " keep unicode " 
+           "16r2016"    16r00 " keep unicode " 
+           "16r2017"    16r00 " keep unicode " 
+           "16r2018"    16r91 " LEFT SINGLE QUOTATION MARK " 
+           "16r2019"    16r92 " RIGHT SINGLE QUOTATION MARK " 
+           "16r201A"    16r82 " SINGLE LOW-9 QUOTATION MARK " 
+           "16r201B"    16r00 " keep unicode " 
+           "16r201C"    16r93 " LEFT DOUBLE QUOTATION MARK " 
+           "16r201D"    16r94 " RIGHT DOUBLE QUOTATION MARK " 
+           "16r201E"    16r84 " DOUBLE LOW-9 QUOTATION MARK " 
+           "16r201F"    16r00 " keep unicode " 
+           "16r2020"    16r86 " DAGGER " 
+           "16r2021"    16r87 " DOUBLE DAGGER " 
+           "16r2022"    16r95 " BULLET " 
+           "16r2023"    16r00 " keep unicode " 
+           "16r2024"    16r00 " keep unicode " 
+           "16r2025"    16r00 " keep unicode " 
+           "16r2026"    16r85 " HORIZONTAL ELLIPSIS " 
+           "16r2027"    16r00 " keep unicode " 
+           "16r2028"    16r00 " keep unicode " 
+           "16r2029"    16r00 " keep unicode " 
+           "16r202A"    16r00 " keep unicode " 
+           "16r202B"    16r00 " keep unicode " 
+           "16r202C"    16r00 " keep unicode " 
+           "16r202D"    16r00 " keep unicode " 
+           "16r202E"    16r00 " keep unicode " 
+           "16r202F"    16r00 " keep unicode " 
+           "16r2030"    16r89 " PER MILLE SIGN " 
+           "16r2031"    16r00 " keep unicode " 
+           "16r2032"    16r00 " keep unicode " 
+           "16r2033"    16r00 " keep unicode " 
+           "16r2034"    16r00 " keep unicode " 
+           "16r2035"    16r00 " keep unicode " 
+           "16r2036"    16r00 " keep unicode " 
+           "16r2037"    16r00 " keep unicode " 
+           "16r2038"    16r00 " keep unicode " 
+           "16r2039"    16r8B " SINGLE LEFT-POINTING ANGLE QUOTATION MARK " 
+           "16r203A"    16r9B " SINGLE RIGHT-POINTING ANGLE QUOTATION MARK " 
+           ) at:(unicode - 16r2012).
+        t == 0 ifFalse:[^ t].
+        ^ unicode
     ].
 "/    unicode = 16r20AC ifTrue:[
 "/        ^ 16r0080 " EURO character "
@@ -607,9 +562,13 @@
 encodeString:aStringOrUnicodeString
     "redefined to speedup simple 8 bit strings"
 
+    |newString myCode bits size "{ Class:SmallInteger }"|
+
+
     "/ mh - it seems that microsoft has fixed ms-ansi to be unicode compatible
     "/ with XP, Vista etc.
     "/ as W95 is not supported anyhow, simply return identity here...
+
 false ifTrue:[
     ^ aStringOrUnicodeString.
 ].
@@ -618,9 +577,32 @@
     (aStringOrUnicodeString containsNon7BitAscii) ifFalse:[
          ^ aStringOrUnicodeString asSingleByteString.
     ].
-    ^ super encodeString:aStringOrUnicodeString
+
+    size := aStringOrUnicodeString size.
+    newString := String new:size.
+    bits := newString bitsPerCharacter.
 
-    "Modified: / 12-07-2012 / 14:47:13 / cg"
+    1 to:size do:[:idx |
+        myCode := self encode:((aStringOrUnicodeString at:idx) codePoint).
+        myCode > 16rFF ifTrue:[
+            myCode > 16rFFFF ifTrue:[
+                bits < 32 ifTrue:[
+                    newString := Unicode32String fromString:newString.
+                    bits := 32.
+                ]
+            ] ifFalse:[
+                bits < 16 ifTrue:[
+                    newString := Unicode16String fromString:newString.
+                    bits := 16.
+                ]
+            ]
+        ].
+        newString at:idx put:(Character codePoint:myCode).
+    ].
+    ^ newString
+
+    "Created: / 16-01-2018 / 19:53:33 / stefan"
+    "Modified: / 17-01-2018 / 14:15:39 / stefan"
 ! !
 
 !MS_Ansi class methodsFor:'documentation'!