Cherry-pick `HTMLUtilities` to catch up with changes in `stx:libhtml` jv
authorJan Vrany <jan.vrany@fit.cvut.cz>
Wed, 24 Apr 2019 12:02:49 +0100
branchjv
changeset 4937 b1402d9b1f39
parent 4769 89914ccfcf7d
child 5477 11c625b9ecca
Cherry-pick `HTMLUtilities` to catch up with changes in `stx:libhtml` Merged `HTMLUtilities` as in 1fec124a7fc7.
HTMLUtilities.st
--- a/HTMLUtilities.st	Tue Nov 13 16:00:39 2018 +0100
+++ b/HTMLUtilities.st	Wed Apr 24 12:02:49 2019 +0100
@@ -17,7 +17,8 @@
 
 Object subclass:#HTMLUtilities
 	instanceVariableNames:''
-	classVariableNames:'EscapeControlCharacters'
+	classVariableNames:'AmpersandEscapes EscapeControlCharacters HtmlEntityToCharacter
+		MathAmpersandEscapes'
 	poolDictionaries:''
 	category:'Net-Communication-Support'
 !
@@ -63,21 +64,476 @@
     "Modified: / 01-06-2010 / 11:25:12 / sr"
 ! !
 
+!HTMLUtilities class methodsFor:'constants'!
+
+ampersandEscapes
+    "answer the table mapping entity names (symbols) to characters; built on first use and cached in AmpersandEscapes"
+    AmpersandEscapes isNil ifTrue:[
+        AmpersandEscapes := IdentityDictionary new.
+        #(
+            #nbsp  160          "/ non-breakable space - do something magic...
+
+            #emspace 160        "/ temporary; NOTE(review): three names share code 160, so a reverse lookup by value is ambiguous
+            #enspace 160
+
+            #lt    $<
+            #gt    $>
+            #amp   $&
+            #quot  $"
+            #apos  $'
+
+            #copy  169          "/ copyright
+            #reg   174          "/ registered
+
+            #cent   162
+            #pound  163
+            #yen    165
+            #brvbar 166         "/ broken bar; was $|, which made the reverse lookup turn a plain '|' into &brvbar;
+            #sect   167
+            #laquo  171
+            #raquo  187
+            #plusmn 177
+            #micro  181
+            #middot 183
+            #frac14 188
+            #frac12 189
+            #frac34 190
+            #iquest 191
+            #iexcl  16rA1
+            #div    247
+            #divide 247
+            #not    16rAC
+            #shy    16rAD
+            #para   16rB6
+
+            #deg   176
+            #sup1  185
+            #sup2  178
+            #sup3  179
+
+            #ordm   16rBA
+            #ordf   16rAA
+            #macr   16rAF
+
+            #cedil  16rB8
+            #uml    16rA8
+            #acute  16rB4
+            #curren 16rA4
+
+            #Oslash 216
+            #oslash 248
+            #aring  229
+            #Aring  197
+
+            #ccedil 231
+            #Ccedil 199
+
+            #thorn  16rFE
+            #THORN  16rDE
+            #Thorn  16rDE       "/ was 15rDE (radix 15 = 209, N-tilde); same character as #THORN
+
+            #eth  16rF0
+            #ETH  16rD0
+            #Eth  16rD0
+
+            #atilde 227
+            #Atilde 195
+            #ntilde 241
+            #Ntilde 209
+            #otilde 245
+            #Otilde 213
+
+            #auml  228
+            #Auml  196
+            #uuml  252
+            #Uuml  220
+            #ouml  246
+            #Ouml  214
+            #euml  235
+            #Euml  203
+            #iuml  239
+            #Iuml  207
+            #yuml  255
+
+            #acirc  226
+            #Acirc  194
+            #icirc  238
+            #Icirc  206
+            #ecirc  234
+            #Ecirc  202
+            #ucirc  251
+            #Ucirc  219
+            #ocirc  244
+            #Ocirc  212
+
+            #agrave 224
+            #Agrave 192
+            #egrave 232
+            #Egrave 200
+            #igrave 236
+            #Igrave 204
+            #ograve 242
+            #Ograve 210
+            #ugrave 249
+            #Ugrave 217
+
+            #aacute 225
+            #Aacute 193
+            #eacute 233
+            #Eacute 201
+            #iacute 237
+            #Iacute 205
+            #oacute 243
+            #Oacute 211
+            #uacute 250
+            #Uacute 218
+            #yacute 16rFD
+            #Yacute 16rDD
+
+            #szlig  223
+            #aelig  230
+            #AElig  198
+
+            "/ unicode
+
+            #OElig   16r0152         "/ 8859-2 (latin2)
+            #oelig   16r0153         "/ 8859-2 (latin2)
+
+            #ljlig   16r01C9         "/ 8859-2 (latin2)
+            #LJlig   16r01C7         "/ 8859-2 (latin2)
+            #Ljlig   16r01C8         "/ 8859-2 (latin2)
+
+            #Scaron  16r0160         "/ 8859-2 (latin2)
+            #scaron  16r0161         "/ 8859-2 (latin2)
+            #Yuml    16r0178         "/ 8859-2 (latin2)
+
+            #Alpha    16r0391    "/ greek alpha
+            #Beta     16r0392
+            #Gamma    16r0393
+            #Delta    16r0394
+            #Epsilon  16r0395
+            #Zeta     16r0396
+            #Eta      16r0397
+            #Theta    16r0398
+            #Iota     16r0399
+            #Kappa    16r039A
+            #Lambda   16r039B
+            #Mu       16r039C
+            #Nu       16r039D
+            #Xi       16r039E
+            #Omicron  16r039F
+            #Pi       16r03A0
+            #Rho      16r03A1
+            #Sigma    16r03A3
+            #Tau      16r03A4
+            #Upsilon  16r03A5
+            #Phi      16r03A6
+            #Chi      16r03A7
+            #Psi      16r03A8
+            #Omega    16r03A9
+
+            #alpha    16r03B1    "/ greek alpha
+            #beta     16r03B2
+            #gamma    16r03B3
+            #delta    16r03B4
+            #epsilon  16r03B5
+            #zeta     16r03B6
+            #eta      16r03B7
+            #theta    16r03B8
+            #iota     16r03B9
+            #kappa    16r03BA
+            #lambda   16r03BB
+            #mu       16r03BC
+            #nu       16r03BD
+            #xi       16r03BE
+            #omicron  16r03BF
+            #pi       16r03C0
+            #rho      16r03C1
+            #sigmaf   16r03C2
+            #sigma    16r03C3
+            #tau      16r03C4
+            #upsilon  16r03C5
+            #phi      16r03C6
+            #chi      16r03C7
+            #psi      16r03C8
+            #omega    16r03C9
+
+            #thetasym 16r03D1
+            #upsih    16r03D2
+            #piv      16r03D6
+
+            #ensp     16r2002
+            #emsp     16r2003
+
+            #thinsp   16r2009         "/ thin space
+            #zwnj     16r200C         "/ zero width non-joiner
+            #zwj      16r200D         "/ zero width joiner
+            #lrm      16r200E         "/ left-to-right mark
+            #rlm      16r200F         "/ right-to-left mark
+
+            #ndash    16r2013
+            #mdash    16r2014
+
+            #lsquo    16r2018         "/ left single quot. mark
+            #rsquo    16r2019         "/ right single quot. mark
+            #sbquo    16r201A         "/ single low-9 quot. mark
+            #ldquo    16r201C         "/ left double quot. mark
+            #rdquo    16r201D         "/ right double quot. mark
+            #bdquo    16r201E         "/ double low-9 quot. mark
+            #dagger   16r2020
+            #Dagger   16r2021         "/ double dagger
+
+            #bull     16r2022
+            #hellip   16r2026
+            #prime    16r2032
+            #Prime    16r2033
+            #oline    16r203E
+            #frasl    16r2044
+
+            #euro     16r20AC         "/ 8859-16
+
+            #weierp   16r2118
+            #image    16r2111
+            #real     16r211C
+            #trade    16r2122
+            #angst    16r212B
+            #alefsym  16r2135
+            #larr     16r2190
+            #uarr     16r2191
+            #rarr     16r2192
+            #darr     16r2193
+            #harr     16r2194
+            #crarr    16r21B5
+            #lArr     16r21D0
+            #uArr     16r21D1
+            #rArr     16r21D2
+            #dArr     16r21D3
+            #hArr     16r21D4
+            #forall   16r2200
+            #part     16r2202
+            #exist    16r2203
+            #empty    16r2205
+            #nabla    16r2207
+            #isin     16r2208
+            #notin    16r2209
+            #ni       16r220B
+            #prod     16r220F
+            #sum      16r2211
+            #minus    16r2212
+            #lowast   16r2217
+            #radic    16r221A
+            #prop     16r221D
+            #infin    16r221E
+            #ang90    16r221F
+            #ang      16r2220
+            #angmsd   16r2221
+            #angsph   16r2222
+            #and      16r2227
+            #or       16r2228
+            #cap      16r2229
+            #cup      16r222A
+            #int      16r222B
+            #there4   16r2234
+            #sim      16r223C
+            #cong     16r2245
+            #asymp    16r2248
+            #ne       16r2260
+            #equiv    16r2261
+            #le       16r2264
+            #ge       16r2265
+            #sub      16r2282
+            #sup      16r2283
+            #nsub     16r2284
+            #sube     16r2286
+            #supe     16r2287
+            #oplus    16r2295
+            #otimes   16r2297
+            #perp     16r22A5
+            #sdot     16r22C5
+            #lceil    16r2308
+            #rceil    16r2309
+            #lfloor   16r230A
+            #rfloor   16r230B
+            #lang     16r2329
+            #rang     16r232A
+            #loz      16r25CA
+            #spades   16r2660
+            #clubs    16r2663
+            #hearts   16r2665
+            #diams    16r2666
+
+        ) pairWiseDo:[:key :val |
+            |v|
+
+            v := val.
+            val isInteger ifTrue:[      "/ integer entries are code points; character literals are stored as-is
+                v := Character value:v
+            ].
+            AmpersandEscapes at:key put:v
+        ].
+    ].
+    ^ AmpersandEscapes
+
+    "Created: / 01-04-2019 / 14:34:25 / Claus Gittinger"
+!
+
+htmlEntityToCharacter
+    ^ self ampersandEscapes    "/ alias: answers the very same entity-name -> character table
+
+    "Modified: / 01-04-2019 / 14:36:41 / Claus Gittinger"
+!
+
+mathAmpersandEscapes
+    "answer the table of legacy math entity names (symbols) -> characters, built on first use and cached in MathAmpersandEscapes; the codes appear to be positions in the symbol character set (see inline notes - confirm). These are obsolete now, as HTML4 added the missing stuff in the meantime."
+
+    MathAmpersandEscapes isNil ifTrue:[
+        MathAmpersandEscapes := IdentityDictionary new.
+
+        #(
+"/            #alpha    16r61      "/ greek alpha
+"/            #beta     16r62      "/ greek beta
+"/            #chi      16r63      
+"/            #delta    16r64     
+"/            #epsilon  16r65      "/ symbol characterSet has no epsilon
+            #vepsilon 16r65        
+"/            #phi      16r66      
+"/            #gamma    16r67     
+"/            #eta      16r68      
+"/            #iota     16r69      
+            #varphi   16r6A      
+"/            #kappa    16r6B      
+"/            #lambda   16r6C      
+"/            #mu       16r6D      
+"/            #nu       16r6E      
+"/            #omicron  16r6F      
+"/            #pi       16r70      
+"/            #theta    16r71      
+            #vtheta   16r71      "/ symbol characterSet has no vtheta  
+"/            #rho      16r72      
+            #varrho   16r72      "/ symbol characterSet has no varrho  
+"/            #sigma    16r73      
+            #vsigma   16r56
+"/            #tau      16r74      
+"/            #upsilon  16r75      
+            #varpi    16r76     
+"/            #omega    16r77      
+"/            #xi       16r78      
+"/            #psi      16r79      
+"/            #zeta     16r7A      
+
+
+
+"/            #Alpha    16r41      "/ greek alpha
+"/            #Beta     16r42      "/ greek beta
+"/            #Chi      16r43      
+"/            #Delta    16r44     
+"/            #Epsilon  16r45     
+"/            #Phi      16r46      
+"/            #Gamma    16r47      
+"/            #Eta      16r48      
+"/            #Iota     16r49      
+"/
+"/            #Kappa    16r4B      
+"/            #Lambda   16r4C      
+"/            #Mu       16r4D      
+"/            #Nu       16r4E      
+"/            #Omicron  16r4F      
+"/            #Pi       16r50      
+"/            #Theta    16r51      
+"/            #Rho      16r52      
+"/            #Sigma    16r53      
+"/            #Tau      16r54      
+"/            #Upsilon  16rA1      
+
+"/            #Omega    16r57    
+"/            #Xi       16r58      
+"/            #Psi      16r59      
+"/            #Zeta     16r5A      
+
+
+"/            #forall   16r22
+            #exist    16r24
+            #exists   16r24
+            #aleph    16rC0      "/ no, this is not alf ;-)
+            #Re       16rC2      "/ R fraktur
+            #Im       16rC1      "/ I fraktur
+            #infty    16rA5      
+
+            #leq      16rA3      "/ less-equal
+            #geq      16rB3      "/ greater-equal
+            #equiv    16rBA      "/ equivalent
+            #approx   16rBB      
+            #cong     16r40      
+"/            #neq      16rB9      
+
+"/            #plusmn   16rB1     
+            #times    16rB4   
+"/            #div      16rB8    
+            #oplus    16rC5   
+            #otimes   16rC4   
+            #oslash   16rC5      "/ NOTE(review): same code as #oplus above - confirm this is intended
+
+            #sum      16rE5   
+            #prod     16rD5   
+
+            #uparrow         16rAD   
+            #leftarrow       16rAC   
+            #downarrow       16rAF   
+            #rightarrow      16rAE   
+            #leftrightarrow  16rAB   
+            #Uparrow         16rDD   
+            #Leftarrow       16rDC   
+            #Downarrow       16rDF   
+            #Rightarrow      16rDE   
+            #Leftrightarrow  16rDB   
+
+            #supset          16rC9  
+            #supseteq        16rCA 
+            #subset          16rCC   
+            #subseteq        16rCD   
+
+            #vee             16rDA   
+            #wedge           16rD9   
+            #neg             16rD8   
+
+            #ldots           16rBC   
+
+"/            #lfloor          16rEB
+"/            #rfloor          16rFB
+"/            #lceil           16rE9
+"/            #rceil           16rF9
+
+        ) pairWiseDo:[:key :val |
+            |v|
+
+            v := val.
+            val isInteger ifTrue:[      "/ integer entries are converted to characters
+                v := Character value:v
+            ].
+            MathAmpersandEscapes at:key put:v
+        ].
+    ].
+    ^ MathAmpersandEscapes
+
+    "Created: / 01-04-2019 / 14:40:51 / Claus Gittinger"
+! !
+
 !HTMLUtilities class methodsFor:'helpers'!
 
 characterFromHtmlEntityNamed:anHtmlEntityName
-    anHtmlEntityName = 'lt'     ifTrue:[^ $<].
-    anHtmlEntityName = 'gt'     ifTrue:[^ $>].
-    anHtmlEntityName = 'amp'    ifTrue:[^ $&].
-    anHtmlEntityName = 'apos'   ifTrue:[^ $'].
-    anHtmlEntityName = 'quot'   ifTrue:[^ $"].
+    ^ self ampersandEscapes
+        at:anHtmlEntityName asSymbol
+        ifAbsent:[
+            self halt. 
+            "/ where to get the mapping???
+            "/ Answer: It is a mess. A good start may be
+            "/ https://www.w3.org/TR/html4/sgml/entities.html with 252 named entities.
+            "/ I guess an actual lookup table would be adequate.
+            $~
+        ]
 
-    self halt. "/ where to get the mapping???
-
-    ^ $~
-
-    "Created: / 07-05-2015 / 15:23:40 / sr"
-    "Modified: / 18-05-2015 / 12:15:36 / sr"
+    "Modified: / 01-04-2019 / 14:36:18 / Claus Gittinger"
+    "Modified: / 04-04-2019 / 10:46:22 / Maren"
 !
 
 controlCharacters
@@ -95,6 +551,25 @@
     "Modified (comment): / 06-05-2015 / 16:17:31 / sr"
 !
 
+copyReplaceCharactersWithHtmlEntitiesIn:aString
+    "answer a copy of aString (nil is treated as empty) in which every character for which
+     #htmlEntityForCharacter: answers a name is replaced by '&name;'; all others are copied"
+
+    |stream htmlEntity|
+
+    "/ a CharacterWriteStream widens its buffer on demand, so wide characters can be copied through
+    stream := CharacterWriteStream on:(String new:100).
+    (aString ? '') do:[:eachCharacter |
+        htmlEntity := self htmlEntityForCharacter:eachCharacter.
+        htmlEntity isNil ifTrue:[
+            stream nextPut:eachCharacter.
+        ] ifFalse:[
+            stream nextPut:$&; nextPutAll:htmlEntity; nextPut:$;.
+        ].
+    ].
+    ^ stream contents
+!
+
 escapeCharacterEntities:aString
     "helper to escape invalid/dangerous characters in html strings.
      These are:
@@ -129,27 +604,10 @@
     "/ and were developed independent of each other, but later moved to this common place.
 
 
-    |rs ws c controlString|
-
-    rs := ReadStream on: aString.
-    ws := WriteStream on: ''.
-    [ rs atEnd ] whileFalse: [
-        c := rs next.
-        controlString := controlCharacters notEmptyOrNil ifTrue:[controlCharacters at:c ifAbsent:nil] ifFalse:[nil].
-        controlString notNil ifTrue:[
-            ws nextPutAll:controlString.
-        ] ifFalse:[
-            c codePoint > 16r7F ifTrue:[
-                ws 
-                    nextPutAll:'&#';
-                    nextPutAll:(c codePoint printString);
-                    nextPutAll:';'.
-            ] ifFalse:[
-                ws nextPut:c.
-            ]
+    ^ String 
+        streamContents:[:ws |
+            self escapeCharacterEntities:aString andControlCharacters:controlCharacters on:ws.
         ]
-    ].
-    ^ ws contents
     
     "
      self escapeCharacterEntities:'a<b'     
@@ -157,6 +615,69 @@
     "
 
     "Created: / 06-05-2015 / 16:29:51 / sr"
+    "Modified (format): / 05-02-2017 / 17:59:32 / cg"
+!
+
+escapeCharacterEntities:aString andControlCharacters:controlCharacters on:aWriteStream
+    "helper to escape invalid/dangerous characters in html strings; the result is written to aWriteStream.
+     A character with an entry in controlCharacters is replaced by that entry;
+     any other character with a code point above 16r7F is written as decimal '&#NNN;';
+     everything else is copied unchanged. controlCharacters may be nil or empty.
+    "
+    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
+    "/ we should refactor this into one method only (can we do hex escapes always ?).
+    "/ Notice, that these two methods came into existence due to historic reasons
+    "/ and were developed independently of each other, but later moved to this common place.
+
+
+    |rs c controlString|
+
+    rs := ReadStream on: aString.
+    [ rs atEnd ] whileFalse: [
+        c := rs next.
+        controlString := controlCharacters notEmptyOrNil ifTrue:[controlCharacters at:c ifAbsent:nil] ifFalse:[nil].
+        controlString notNil ifTrue:[
+            aWriteStream nextPutAll:controlString.
+        ] ifFalse:[
+            c codePoint > 16r7F ifTrue:[
+                aWriteStream nextPutAll:'&#'.
+                c codePoint printOn:aWriteStream.
+                aWriteStream nextPut:$;.
+            ] ifFalse:[
+                aWriteStream nextPut:c.
+            ]
+        ]
+    ].
+    
+    "
+     self escapeCharacterEntities:'a<b'     
+     self escapeCharacterEntities:'aöb'     
+    "
+
+    "Created: / 05-02-2017 / 17:58:34 / cg"
+    "Modified: / 17-02-2017 / 10:34:20 / stefan"
+!
+
+escapeCharacterEntities:aString on:aStream
+    "helper to escape invalid/dangerous characters in html strings; the result is written to aStream.
+     Delegates to #escapeCharacterEntities:andControlCharacters:on:,
+     passing the table answered by #controlCharacters as the replacement map
+     (characters above 16r7F without a table entry come out as decimal '&#NNN;').
+    "
+    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
+    "/ we should refactor this into one method only (can we do hex escapes always ?).
+    "/ Notice, that these two methods came into existence due to historic reasons
+    "/ and were developed independently of each other, but later moved to this common place.
+
+
+    ^self escapeCharacterEntities:aString andControlCharacters:self controlCharacters on:aStream
+
+    "
+     self escapeCharacterEntities:'a<b'     
+     self escapeCharacterEntities:'aöb'     
+    "
+
+    "Created: / 05-02-2017 / 18:00:56 / cg"
+!
 
 extractCharSetEncodingFromContentType:contentTypeLine
@@ -208,13 +729,24 @@
     "
 !
 
+htmlEntityForCharacter:aCharacter
+    "answer the entity name (a key of #ampersandEscapes) whose value is aCharacter.
+     Answers nil for a space, for any letter or digit,
+     and for characters which have no entry in the table"
+
+    (aCharacter == Character space or:[aCharacter isLetterOrDigit]) ifTrue:[^ nil].
+    ^ self ampersandEscapes keyAtValue:aCharacter ifAbsent:nil
+
+    "Modified: / 01-04-2019 / 14:36:25 / Claus Gittinger"
+!
+
 unEscape:aString
     "Convert escaped characters in an urls arguments or post fields back to their proper characters.
-     Undoes the effect of urlEncode and urlEncode2.
+     Undoes the effect of #urlEncoded: and #urlEncoded2:.
      These are:
         + -> space
         %XX ascii as hex digits
-        %uXXXX unicode as hex digits
+        %uXXXX unicode as hex digits   NOTE: %u is non-standard but implemented in MS IIS
         %% -> %
     "
 
@@ -276,6 +808,7 @@
 
     "Modified: / 09-01-2011 / 10:44:50 / cg"
     "Modified (comment): / 06-05-2015 / 15:40:04 / sr"
+    "Modified (comment): / 03-02-2017 / 17:06:32 / stefan"
 !
 
 unescapeCharacterEntities:aString
@@ -345,7 +878,7 @@
                     htmlEntityMatchingFailed ifTrue:[
                         ws nextPut:c.
                         ws nextPutAll:entity.
-                        ws nextPutAll:$;.
+                        ws nextPut:$;.
                     ].
                 ] ifFalse:[
                     ws nextPut:c.
@@ -374,9 +907,35 @@
 
     "Created: / 06-05-2015 / 16:56:14 / sr"
     "Modified: / 18-05-2015 / 12:13:35 / sr"
+    "Modified: / 17-02-2017 / 10:18:35 / stefan"
+!
+
+urlDecoded:aString
+    "Convert escaped characters in an urls arguments or post fields back to their proper characters
+     (via #unEscape:) and decode the result from UTF-8. Undoes the effect of #urlEncoded: and #urlEncoded2:.
+     These are:
+        + -> space
+        %XX ascii as hex digits
+        %uXXXX unicode as hex digits   NOTE: %u is non-standard but implemented in MS IIS
+        %% -> %
+    "
+    ^ (self unEscape:aString) utf8Decoded
+
+    "
+     self urlDecoded:'a%20b'   
+     self urlDecoded:'a%%b'
+     self urlDecoded:'a+b' 
+     self urlDecoded:'a%+b' 
+     self urlDecoded:'a%' 
+     self urlDecoded:'a%2' 
+     self urlDecoded:'/Home/a%C3%A4%C3%B6%C3%BCa'
+    "
+
+    "Created: / 26-08-2018 / 12:49:24 / Claus Gittinger"
+!
 
 urlEncode2:aStringOrStream on:ws
+    <resource: #obsolete>
     "helper to escape invalid/dangerous characters in an urls arguments.
      Similar to urlEncode, but treats '*','~' and spaces differently.
      (some clients, such as bitTorrent seem to require this - time will tell...)
@@ -411,45 +970,60 @@
 !
 
 urlEncode:aStringOrStream on:ws
-    "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
-     Similar to urlEncode2, but treats '*','~' and spaces differently.
-     (some clients, such as bitTorrent seem to require urlEncode2 - time will tell...)
-     Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '*', is encoded using 
-     the '%nn' format, where nn is the hexadecimal value of the byte.
+    "helper to escape invalid/dangerous characters in an url's argument or post-fields.
+
+     Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', 
+     is encoded using the '%nn' format, where nn is the hexadecimal value of the byte.
+     Characters outside the ASCII range are encoded into utf8 first.
      Spaces are encoded as '+'.
         see: application/x-www-form-urlencoded  
-        see: RFC1738"
+        see: https://tools.ietf.org/html/rfc3986 (obsoletes RFC1738)"
 
-    |rs c cp space|
+    |rs c|
 
-    space := Character space.
     rs := aStringOrStream readStream.
 
-    [rs atEnd] whileFalse: [
-        c := rs next.
+    [(c := rs nextOrNil) notNil] whileTrue: [
+        |cp|
 
-        (c isLetterOrDigit or:[ '-_.*' includes:c ]) ifTrue:[
+        (c isLetterOrDigit or:['-_.~' includes:c]) ifTrue:[
             ws nextPut:c.
         ] ifFalse:[
-            c == space ifTrue:[
+            c == Character space ifTrue:[
                 ws nextPut:$+.
             ] ifFalse:[
-                ws nextPut: $%.
-                (cp := c codePoint) > 16rFF ifTrue:[
-                    ws nextPut: $u.
-                    cp printOn:ws base:16 size:4 fill:$0.
+                cp := c codePoint.
+                cp > 16r7F ifTrue:[
+                    c utf8Encoded do:[:eachUtf8Char|
+                        ws nextPut: $%.
+                        eachUtf8Char codePoint printOn:ws base:16 size:2 fill:$0.
+                    ].
                 ] ifFalse:[
+                    ws nextPut: $%.
                     cp printOn:ws base:16 size:2 fill:$0.
                 ].
             ].
         ].
     ].
 
+    "
+     self urlEncoded:'hokus pokus fidibus*-/~'
+     self urlEncoded:'Ützel Brötzel*-/~'
+     self urlEncoded:'χαιρε'
+
+     self urlDecoded:(self urlEncoded:'hokus pokus fidibus*-/~')
+     self urlDecoded:(self urlEncoded:'Ützel Brötzel*-/~')
+     self urlDecoded:(self urlEncoded:'χαιρε')
+    "
+
     "Modified: / 09-01-2011 / 10:43:30 / cg"
     "Modified: / 06-05-2015 / 16:06:52 / sr"
+    "Modified (comment): / 07-02-2017 / 14:51:42 / stefan"
+    "Modified (comment): / 26-08-2018 / 12:50:04 / Claus Gittinger"
 !
 
 urlEncoded2: aString
+    <resource: #obsolete>
     "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
      Similar to urlEncoded, but treats '*','~' and spaces differently.
      (some clients, such as bitTorrent seem to require this - time will tell...)
@@ -479,17 +1053,17 @@
 
 urlEncoded: aString
     "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
-     Similar to urlEncoded2, but treats '*','~' and spaces differently.
-     (some clients, such as bitTorrent seem to require urlEncoded2 - time will tell...)
-     Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '*', is encoded using 
+
+     Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', is encoded using 
      the '%nn' format, where nn is the hexadecimal value of the byte.
+     Characters outside the ASCII range are encoded into utf8 first.
      Spaces are encoded as '+'.
         see: application/x-www-form-urlencoded  
-        see: RFC1738"
+        see: https://tools.ietf.org/html/rfc3986 (obsoletes RFC1738)"
 
     |ws|
 
-    ws := String writeStreamWithInitialSize:aString size.
+    ws := WriteStream on:(String new:aString size + 20).
     self urlEncode:aString on:ws.
     ^ ws contents
 
@@ -503,6 +1077,7 @@
     "
 
     "Modified: / 09-01-2011 / 10:43:37 / cg"
+    "Modified: / 07-02-2017 / 14:54:12 / stefan"
 !
 
 withAllSpecialHTMLCharactersEscaped:aStringOrCharacter
@@ -661,7 +1236,6 @@
 
     |parser doc s first|
 
-
     parser := HTMLParser new.
     doc := parser parseText:htmlString.
     s := CharacterWriteStream on:(String new:100).
@@ -687,8 +1261,8 @@
 
     "
      self plainTextOfHTML:'
-bla1 bla2 <br>bla3 <table><tr><td>bla4</td></tr></table> bla5<p>bla6
-'        
+            bla1 bla2 <br>bla3 <table><tr><td>bla4</td></tr></table> bla5<p>bla6'
+     self plainTextOfHTML:'Hello World'        
     "
 
     "Modified: / 06-05-2015 / 17:02:36 / sr"