HTMLUtilities.st
author Claus Gittinger <cg@exept.de>
Sat, 02 May 2020 21:40:13 +0200
changeset 5476 7355a4b11cb6
parent 5430 fa33520af010
permissions -rw-r--r--
#FEATURE by cg class: Socket class added: #newTCPclientToHost:port:domain:domainOrder:withTimeout: changed: #newTCPclientToHost:port:domain:withTimeout:

"{ Encoding: utf8 }"

"
 COPYRIGHT (c) 2007 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic2' }"

"{ NameSpace: Smalltalk }"

Object subclass:#HTMLUtilities
	instanceVariableNames:''
	classVariableNames:'AmpersandEscapes EscapeControlCharacters HtmlEntityToCharacter
		MathAmpersandEscapes'
	poolDictionaries:''
	category:'Net-Communication-Support'
!

!HTMLUtilities class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 2007 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    Collected support functions to deal with HTML.
    Used both by HTML generators (DocGenerator), HTMLParsers and the webServer.
    Therefore, it has been put into libbasic2.
"
! !

!HTMLUtilities class methodsFor:'common actions'!

openLauncherOnDisplay:displayName
    <resource: #obsolete>

    "obsolete - do not use"

    self obsoleteMethodWarning.
    Error handle:[:ex |
        ^ ex description
    ] do:[
        NewLauncher openLauncherOnInitializedDisplayNamed:displayName
    ]

    "Modified: / 01-06-2010 / 11:25:12 / sr"
! !

!HTMLUtilities class methodsFor:'constants'!

ampersandEscapes
    AmpersandEscapes isNil ifTrue:[
        AmpersandEscapes := IdentityDictionary new.

        #(
            #nbsp  160          "/ non-breakable space - do something magic...

            #emspace 160        "/ temporary
            #enspace 160

            #lt    $<
            #gt    $>
            #amp   $&
            #quot  $"
            #apos  $'

            #copy  169          "/ copyright
            #reg   174          "/ registered

            #cent   162
            #pound  163
            #yen    165
            #brvbar $|
            #sect   167
            #laquo  171
            #raquo  187
            #plusmn 177
            #micro  181
            #middot 183
            #frac14 188
            #frac12 189
            #frac34 190
            #iquest 191
            #iexcl  16rA1
            #div    247
            #divide 247
            #not    16rAC
            #shy    16rAD
            #para   16rB6

            #deg   176
            #sup1  185
            #sup2  178
            #sup3  179

            #ordm   16rBA
            #ordf   16rAA
            #macr   16rAF

            #cedil  16rB8
            #uml    16rA8
            #acute  16rB4
            #curren 16rA4

            #Oslash 216
            #oslash 248
            #aring  229
            #Aring  197

            #ccedil 231
            #Ccedil 199

            #thorn  16rFE
            #THORN  16rDE
            #Thorn  15rDE

            #eth  16rF0
            #ETH  16rD0
            #Eth  16rD0

            #atilde 227
            #Atilde 195
            #ntilde 241
            #Ntilde 209
            #otilde 245
            #Otilde 213

            #auml  228
            #Auml  196
            #uuml  252
            #Uuml  220
            #ouml  246
            #Ouml  214
            #euml  235
            #Euml  203
            #iuml  239
            #Iuml  207
            #yuml  255

            #acirc  226
            #Acirc  194
            #icirc  238
            #Icirc  206
            #ecirc  234
            #Ecirc  202
            #ucirc  251
            #Ucirc  219
            #ocirc  244
            #Ocirc  212

            #agrave 224
            #Agrave 192
            #egrave 232
            #Egrave 200
            #igrave 236
            #Igrave 204
            #ograve 242
            #Ograve 210
            #ugrave 249
            #Ugrave 217

            #aacute 225
            #Aacute 193
            #eacute 233
            #Eacute 201
            #iacute 237
            #Iacute 205
            #oacute 243
            #Oacute 211
            #uacute 250
            #Uacute 218
            #yacute 16rFD
            #Yacute 16rDD

            #szlig  223
            #aelig  230
            #AElig  198

            "/ unicode

            #OElig   16r0152         "/ 8859-2 (latin2)
            #oelig   16r0153         "/ 8859-2 (latin2)

            #ljlig   16r01C9         "/ 8859-2 (latin2)
            #LJlig   16r01C7         "/ 8859-2 (latin2)
            #Ljlig   16r01C8         "/ 8859-2 (latin2)

            #Scaron  16r0160         "/ 8859-2 (latin2)
            #scaron  16r0161         "/ 8859-2 (latin2)
            #Yuml    16r0178         "/ 8859-2 (latin2)

            #Alpha    16r0391    "/ greek alpha
            #Beta     16r0392
            #Gamma    16r0393
            #Delta    16r0394
            #Epsilon  16r0395
            #Zeta     16r0396
            #Eta      16r0397
            #Theta    16r0398
            #Iota     16r0399
            #Kappa    16r039A
            #Lambda   16r039B
            #Mu       16r039C
            #Nu       16r039D
            #Xi       16r039E
            #Omicron  16r039F
            #Pi       16r03A0
            #Rho      16r03A1
            #Sigma    16r03A3
            #Tau      16r03A4
            #Upsilon  16r03A5
            #Phi      16r03A6
            #Chi      16r03A7      
            #Psi      16r03A8      
            #Omega    16r03A9

            #alpha    16r03B1    "/ greek alpha
            #beta     16r03B2
            #gamma    16r03B3
            #delta    16r03B4
            #epsilon  16r03B5
            #zeta     16r03B6
            #eta      16r03B7
            #theta    16r03B8
            #iota     16r03B9
            #kappa    16r03BA
            #lambda   16r03BB
            #mu       16r03BC
            #nu       16r03BD
            #xi       16r03BE
            #omicron  16r03BF
            #pi       16r03C0
            #rho      16r03C1
            #sigmaf   16r03C2
            #sigma    16r03C3
            #tau      16r03C4
            #upsilon  16r03C5
            #phi      16r03C6
            #chi      16r03C7
            #psi      16r03C8
            #omega    16r03C9

            #thetasym 16r03D1
            #upsih    16r03D2
            #piv      16r03D6

            #ensp     16r2002
            #emsp     16r2003

            #thinsp   16r2009         "/ thin space         
            #zwnj     16r200C         "/ zero width non-joiner         
            #zwj      16r200D         "/ zero width joiner         
            #lrm      16r200E         "/ left-to-right mark         
            #rlm      16r200F         "/ right-to-left mark         

            #ndash    16r2013
            #mdash    16r2014

            #lsquo    16r2018         "/ left single quot. mark
            #rsquo    16r2019         "/ right single quot. mark
            #sbquo    16r201A         "/ single low-9 quot. mark
            #ldquo    16r201C         "/ left double quot. mark
            #rdquo    16r201D         "/ right double quot. mark
            #bdquo    16r201E         "/ double low-9 quot. mark
            #dagger   16r2020
            #Dagger   16r2021         "/ double dagger

            #bull     16r2022
            #hellip   16r2026
            #prime    16r2032
            #Prime    16r2033
            #oline    16r203E
            #frasl    16r2044

            #euro     16r20AC         "/ 8859-16

            #weierp   16r2118
            #image    16r2111
            #real     16r211C
            #trade    16r2122
            #angst    16r212B      
            #alefsym  16r2135
            #larr     16r2190
            #uarr     16r2191
            #rarr     16r2192
            #darr     16r2193
            #harr     16r2194
            #crarr    16r21B5
            #lArr     16r21D0
            #uArr     16r21D1
            #rArr     16r21D2
            #dArr     16r21D3
            #hArr     16r21D4
            #forall   16r2200
            #part     16r2202
            #exist    16r2203
            #empty    16r2205
            #nabla    16r2207
            #isin     16r2208
            #notin    16r2209
            #ni       16r220B
            #prod     16r220F
            #sum      16r2211
            #minus    16r2212
            #lowast   16r2217
            #radic    16r221A
            #prop     16r221D
            #infin    16r221E
            #ang90    16r221F      
            #ang      16r2220
            #angmsd   16r2221      
            #angsph   16r2222      
            #and      16r2227
            #or       16r2228
            #cap      16r2229
            #cup      16r222A
            #int      16r222B
            #there4   16r2234
            #sim      16r223C
            #cong     16r2245
            #asymp    16r2248
            #ne       16r2260
            #equiv    16r2261
            #le       16r2264
            #ge       16r2265
            #sub      16r2282
            #sup      16r2283
            #nsub     16r2284
            #sube     16r2286
            #supe     16r2287
            #oplus    16r2295
            #otimes   16r2297
            #perp     16r22A5
            #sdot     16r22C5
            #lceil    16r2308
            #rceil    16r2309
            #lfloor   16r230A
            #rfloor   16r230B
            #lang     16r2329
            #rang     16r232A
            #loz      16r25CA
            #spades   16r2660
            #clubs    16r2663
            #hearts   16r2665
            #diams    16r2666

        ) pairWiseDo:[:key :val |
            |v|

            v := val.
            val isInteger ifTrue:[
                v := Character value:v
            ].
            AmpersandEscapes at:key put:v
        ].
    ].
    ^ AmpersandEscapes

    "Created: / 01-04-2019 / 14:34:25 / Claus Gittinger"
!

htmlEntityToCharacter
    ^ self ampersandEscapes

    "Modified: / 01-04-2019 / 14:36:41 / Claus Gittinger"
!

mathAmpersandEscapes
    "these are obsolete now, as HTML4 added the missing stuff in the meantime."

    MathAmpersandEscapes isNil ifTrue:[
        MathAmpersandEscapes := IdentityDictionary new.

        #(
"/            #alpha    16r61      "/ greek alpha
"/            #beta     16r62      "/ greek beta
"/            #chi      16r63      
"/            #delta    16r64     
"/            #epsilon  16r65      "/ symbol characterSet has no epsilon
            #vepsilon 16r65        
"/            #phi      16r66      
"/            #gamma    16r67     
"/            #eta      16r68      
"/            #iota     16r69      
            #varphi   16r6A      
"/            #kappa    16r6B      
"/            #lambda   16r6C      
"/            #mu       16r6D      
"/            #nu       16r6E      
"/            #omicron  16r6F      
"/            #pi       16r70      
"/            #theta    16r71      
            #vtheta   16r71      "/ symbol characterSet has no vtheta  
"/            #rho      16r72      
            #varrho   16r72      "/ symbol characterSet has no varrho  
"/            #sigma    16r73      
            #vsigma   16r56
"/            #tau      16r74      
"/            #upsilon  16r75      
            #varpi    16r76     
"/            #omega    16r77      
"/            #xi       16r78      
"/            #psi      16r79      
"/            #zeta     16r7A      



"/            #Alpha    16r41      "/ greek alpha
"/            #Beta     16r42      "/ greek beta
"/            #Chi      16r43      
"/            #Delta    16r44     
"/            #Epsilon  16r45     
"/            #Phi      16r46      
"/            #Gamma    16r47      
"/            #Eta      16r48      
"/            #Iota     16r49      
"/
"/            #Kappa    16r4B      
"/            #Lambda   16r4C      
"/            #Mu       16r4D      
"/            #Nu       16r4E      
"/            #Omicron  16r4F      
"/            #Pi       16r50      
"/            #Theta    16r51      
"/            #Rho      16r52      
"/            #Sigma    16r53      
"/            #Tau      16r54      
"/            #Upsilon  16rA1      

"/            #Omega    16r57    
"/            #Xi       16r58      
"/            #Psi      16r59      
"/            #Zeta     16r5A      


"/            #forall   16r22
            #exist    16r24
            #exists   16r24
            #aleph    16rC0      "/ no, this is not alf ;-)
            #Re       16rC2      "/ R fraktur
            #Im       16rC1      "/ I fraktur
            #infty    16rA5      

            #leq      16rA3      "/ less-equal
            #geq      16rB3      "/ greater-equal
            #equiv    16rBA      "/ equivalent
            #approx   16rBB      
            #cong     16r40      
"/            #neq      16rB9      

"/            #plusmn   16rB1     
            #times    16rB4   
"/            #div      16rB8    
            #oplus    16rC5   
            #otimes   16rC4   
            #oslash   16rC5   

            #sum      16rE5   
            #prod     16rD5   

            #uparrow         16rAD   
            #leftarrow       16rAC   
            #downarrow       16rAF   
            #rightarrow      16rAE   
            #leftrightarrow  16rAB   
            #Uparrow         16rDD   
            #Leftarrow       16rDC   
            #Downarrow       16rDF   
            #Rightarrow      16rDE   
            #Leftrightarrow  16rDB   

            #supset          16rC9  
            #supseteq        16rCA 
            #subset          16rCC   
            #subseteq        16rCD   

            #vee             16rDA   
            #wedge           16rD9   
            #neg             16rD8   

            #ldots           16rBC   

"/            #lfloor          16rEB
"/            #rfloor          16rFB
"/            #lceil           16rE9
"/            #rceil           16rF9

        ) pairWiseDo:[:key :val |
            |v|

            v := val.
            val isInteger ifTrue:[
                v := Character value:v
            ].
            MathAmpersandEscapes at:key put:v
        ].
    ].
    ^ MathAmpersandEscapes

    "Created: / 01-04-2019 / 14:40:51 / Claus Gittinger"
! !

!HTMLUtilities class methodsFor:'helpers'!

characterFromHtmlEntityNamed:anHtmlEntityName
    ^ self ampersandEscapes
        at:anHtmlEntityName asSymbol
        ifAbsent:[
            self halt. 
            "/ where to get the mapping???
            "/ Answer: It is a mess. A good start may be
            "/ https://www.w3.org/TR/html4/sgml/entities.html with 252 named entities.
            "/ I guess an actual lookup table would be adequate.
            $~
        ]

    "Modified: / 01-04-2019 / 14:36:18 / Claus Gittinger"
    "Modified: / 04-04-2019 / 10:46:22 / Maren"
!

controlCharacters

    EscapeControlCharacters isNil ifTrue:[
        EscapeControlCharacters := Dictionary new.
        EscapeControlCharacters at:$< put:'&lt;'.
        EscapeControlCharacters at:$> put:'&gt;'.
        EscapeControlCharacters at:$& put:'&amp;'.
        EscapeControlCharacters at:$" put:'&quot;'.
        "/ EscapeControlCharacters at:$' put:'&apos;'.
    ].
    ^ EscapeControlCharacters.

    "Modified (comment): / 06-05-2015 / 16:17:31 / sr"
!

copyReplaceCharactersWithHtmlEntitiesIn:aString
    |stream htmlEntity|

    stream := '' writeStream.
    (aString ? '') do:[:eachCharacter |
        htmlEntity := self htmlEntityForCharacter:eachCharacter.
        htmlEntity isNil ifTrue:[
            stream nextPut:eachCharacter.
        ] ifFalse:[
            stream
                nextPut:$&;
                nextPutAll:htmlEntity;
                nextPut:$;.           
        ].
    ].

    ^ stream contents
!

escapeCharacterEntities:aString
    "helper to escape invalid/dangerous characters in html strings.
     These are:
        control characters, '<', '&' and space -> %XX ascii as hex digits
        %     -> %%
    "
    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.


    ^self escapeCharacterEntities:aString andControlCharacters:self controlCharacters

    "
     self escapeCharacterEntities:'a<b'     
     self escapeCharacterEntities:'aöb'     
    "

    "Modified: / 06-05-2015 / 16:30:13 / sr"
    "Modified (comment): / 26-07-2019 / 12:19:18 / Stefan Vogel"
!

escapeCharacterEntities:aString andControlCharacters:controlCharacters
    "helper to escape invalid/dangerous characters in html strings.
     These are:
        control characters, '<', '>', '&' and space -> %XX ascii as hex digits
        %     -> %%
    "
    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.


    ^ String 
        streamContents:[:ws |
            self escapeCharacterEntities:aString andControlCharacters:controlCharacters on:ws.
        ]
    
    "
     self escapeCharacterEntities:'a<b'     
     self escapeCharacterEntities:'aöb'     
    "

    "Created: / 06-05-2015 / 16:29:51 / sr"
    "Modified (format): / 05-02-2017 / 17:59:32 / cg"
!

escapeCharacterEntities:aString andControlCharacters:controlCharacters on:aWriteStream
    "helper to escape invalid/dangerous characters in html strings.
     These are:
        control characters, '<', '>', '&' and space -> %XX ascii as hex digits
        %     -> %%
    "
    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.


    |rs c controlString|

    rs := ReadStream on: aString.
    [ rs atEnd ] whileFalse: [
        c := rs next.
        controlString := controlCharacters notEmptyOrNil ifTrue:[controlCharacters at:c ifAbsent:nil] ifFalse:[nil].
        controlString notNil ifTrue:[
            aWriteStream nextPutAll:controlString.
        ] ifFalse:[
            c codePoint > 16r7F ifTrue:[
                aWriteStream nextPutAll:'&#'.
                c codePoint printOn:aWriteStream.
                aWriteStream nextPut:$;.
            ] ifFalse:[
                aWriteStream nextPut:c.
            ]
        ]
    ].
    
    "
     self escapeCharacterEntities:'a<b'     
     self escapeCharacterEntities:'aöb'     
    "

    "Created: / 05-02-2017 / 17:58:34 / cg"
    "Modified: / 17-02-2017 / 10:34:20 / stefan"
!

escapeCharacterEntities:aString on:aStream
    "helper to escape invalid/dangerous characters in html strings.
     These are:
        control characters, '<', '&' and space -> %XX ascii as hex digits
        %     -> %%
    "
    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.


    ^self escapeCharacterEntities:aString andControlCharacters:self controlCharacters on:aStream

    "
     self escapeCharacterEntities:'a<b'     
     self escapeCharacterEntities:'aöb'     
    "

    "Created: / 05-02-2017 / 18:00:56 / cg"
    "Modified (comment): / 26-07-2019 / 12:19:09 / Stefan Vogel"
!

extractCharSetEncodingFromContentType:contentTypeLine
    |idx rest encoding|

    idx := contentTypeLine findString:'charset='.
    idx == 0 ifTrue:[
	^ nil
    ].
    rest := (contentTypeLine copyFrom:idx+'charset=' size) withoutSeparators.
    idx := (rest indexOfSeparator) min:(rest indexOf:$;).
    idx == 0 ifTrue:[
	encoding := rest
    ] ifFalse:[
	encoding := rest copyTo:idx-1.
    ].
    (encoding startsWith:$") ifTrue:[
	encoding := encoding copyFrom:2 to:(encoding indexOf:$" startingAt:3)-1.
    ].
    ^ encoding.

    "
     self extractCharSetEncodingFromContentType:'text/html; charset=ascii'
     self extractCharSetEncodingFromContentType:'text/html; charset='
     self extractCharSetEncodingFromContentType:'text/html; fooBar=bla'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8; bla=fasel'
    "
!

extractMimeTypeFromContentType:contentTypeLine
    |idx mimeAndEncoding|

    idx := contentTypeLine indexOf:$:.
    mimeAndEncoding := (contentTypeLine copyFrom:idx+1) withoutSeparators.

    (mimeAndEncoding includes:$;) ifFalse:[
	^ mimeAndEncoding
    ].

    idx := mimeAndEncoding indexOf:$;.
    ^ mimeAndEncoding copyTo:idx-1

    "
     self extractMimeTypeFromContentType:'text/html; charset=ascii'
     self extractMimeTypeFromContentType:'text/html; '
     self extractMimeTypeFromContentType:'text/html'
     self extractMimeTypeFromContentType:'text/xml; charset=utf-8'
    "
!

htmlEntityForCharacter:aCharacter
    aCharacter == Character space ifTrue:[^ nil].
    aCharacter isLetterOrDigit ifTrue:[^ nil].
    
    ^ self ampersandEscapes
        keyAtValue:aCharacter
        ifAbsent:nil

    "Modified: / 01-04-2019 / 14:36:25 / Claus Gittinger"
!

unEscape:aString
    "Convert escaped characters in an urls arguments or post fields back to their proper characters.
     Undoes the effect of #urlEncoded: and #urlEncoded2:.
     These are:
        + -> space
        %XX ascii as hex digits
        %uXXXX unicode as hex digits   NOTE: %u is non-standard bit implemented in MS IIS
        %% -> %
    "

    |rs ws c peekC isUnicodeEscaped|

    aString isNil ifTrue:[
        ^ nil.
    ].

    (aString includesAny:'+%') ifFalse:[        
        ^ aString
    ].

    rs := ReadStream on: aString.
    ws := CharacterWriteStream on: ''.
    isUnicodeEscaped := false.

    [rs atEnd] whileFalse:[
        c := rs next.

        isUnicodeEscaped ifTrue:[
            isUnicodeEscaped := false.
            c := (Integer readFrom:(rs nextAvailable:4) radix:16) asCharacter.
        ] ifFalse:[
            c == $+ ifTrue:[ 
                c := Character space.
            ] ifFalse:[
                c == $% ifTrue:[
                    peekC := rs peek.
                    (peekC notNil and:[peekC isHexDigit]) ifTrue:[
                        c := (Integer readFrom:(rs nextAvailable:2) radix:16) asCharacter. 
                    ] ifFalse:[
                        (peekC notNil and:[peekC == $u]) ifTrue:[
                            isUnicodeEscaped := true.
                            c := nil.
                        ] ifFalse:[
                            c := rs next.
                        ].
                    ].
                ].
            ].
        ].

        c notNil ifTrue:[ 
            ws nextPut:c.
        ].
    ].
    ^ ws contents

    "
     self unEscape:'a%20b'   
     self unEscape:'a%%b'
     self unEscape:'a+b' 
     self unEscape:'a%+b' 
     self unEscape:'a%' 
     self unEscape:'a%2' 
     self unEscape:'/Home/a%C3%A4%C3%B6%C3%BCa'
    "

    "Modified: / 09-01-2011 / 10:44:50 / cg"
    "Modified (comment): / 06-05-2015 / 15:40:04 / sr"
    "Modified (comment): / 03-02-2017 / 17:06:32 / stefan"
!

unescapeCharacterEntities:aString
    "helper to unescape character entities in a string.
     Normally, this is done by the HTMLParser when it scans text,
     but seems to be also used in post-data fields which contain non-ascii characters
     (for example: the login postdata of expeccALM).

     Sequences are:
        &<specialName>;
        &#<decimal>;            
        &#x<hex>

     From Reference:
        http://wiki.selfhtml.org/wiki/Referenz:HTML/Zeichenreferenz#HTML-eigene_Zeichen
    "

    |rs ws c 
     entity entityNumberPart
     htmlEntityMatchingFailed characterFromHtmlEntity|

    (aString includes:$&) ifFalse:[        
        ^ aString
    ].

    rs := ReadStream on:aString.
    ws := CharacterWriteStream on:''.

    [rs atEnd] whileFalse:[
        c := rs next.
        c == $& ifTrue:[
            entity := rs upToMatching:[:ch | ch == $;].
            entity notEmpty ifTrue:[
                rs peek == $; ifTrue:[ "/ something between & and ; 
                    rs next. "/ read over semicolon
                    htmlEntityMatchingFailed := false.

                    entity first == $# ifTrue:[ "/ entity is determined as number
                        entityNumberPart := entity copyFrom:2.
                        entityNumberPart notEmpty ifTrue:[
                            entityNumberPart first == $x ifTrue:[
                                entityNumberPart := entityNumberPart copyFrom:2.
                                entityNumberPart notEmpty ifTrue:[
                                    ws nextPut:(Character value:(Integer readFrom:entityNumberPart radix:16)).
                                ] ifFalse:[
                                    htmlEntityMatchingFailed := true. 
                                ].
                            ] ifFalse:[
                                entityNumberPart isNumeric ifTrue:[
                                    ws nextPut:(Character value:(Integer readFrom:entityNumberPart)).
                                ] ifFalse:[
                                    htmlEntityMatchingFailed := true. 
                                ].
                            ].
                        ] ifFalse:[
                            htmlEntityMatchingFailed := true. 
                        ].
                    ] ifFalse:[
                        characterFromHtmlEntity := self characterFromHtmlEntityNamed:entity.
                        characterFromHtmlEntity notNil ifTrue:[
                            ws nextPut:characterFromHtmlEntity.
                        ] ifFalse:[
                            htmlEntityMatchingFailed := true. 
                        ].
                    ].

                    htmlEntityMatchingFailed ifTrue:[
                        ws nextPut:c.
                        ws nextPutAll:entity.
                        ws nextPut:$;.
                    ].
                ] ifFalse:[
                    ws nextPut:c.
                    ws nextPutAll:entity.
                ].
            ] ifFalse:[
                ws nextPut:c.
            ].
        ] ifFalse:[
            ws nextPut:c.
        ].
    ].

    ^ ws contents

    "
     self unescapeCharacterEntities:'&;'            
     self unescapeCharacterEntities:'&16368;'            
     self unescapeCharacterEntities:'&16368;&16368'            
     self unescapeCharacterEntities:'&16368;&lt;'            
     self unescapeCharacterEntities:'&16368;&lt'            
     self unescapeCharacterEntities:'&#xaffe;'    
     self unescapeCharacterEntities:'&quot;&lt;foo'      
     self unescapeCharacterEntities:'&funny;&lt;foo'     
    "

    "Created: / 06-05-2015 / 16:56:14 / sr"
    "Modified: / 18-05-2015 / 12:13:35 / sr"
    "Modified: / 17-02-2017 / 10:18:35 / stefan"
!

urlDecoded:aString
    "Convert escaped characters in an urls arguments or post fields back to their proper characters.
     Undoes the effect of #urlEncoded: and #urlEncoded2:.
     These are:
        + -> space
        %XX ascii as hex digits
        %uXXXX unicode as hex digits   NOTE: %u is non-standard bit implemented in MS IIS
        %% -> %
    "
    ^ (self unEscape:aString) utf8Decoded

    "
     self urlDecoded:'a%20b'   
     self urlDecoded:'a%%b'
     self urlDecoded:'a+b' 
     self urlDecoded:'a%+b' 
     self urlDecoded:'a%' 
     self urlDecoded:'a%2' 
     self urlDecoded:'/Home/a%C3%A4%C3%B6%C3%BCa'
    "

    "Created: / 26-08-2018 / 12:49:24 / Claus Gittinger"
!

urlEncode2:aStringOrStream on:ws
    <resource: #obsolete>
    "helper to escape invalid/dangerous characters in an urls arguments.
     Similar to urlEncode, but treats '*','~' and spaces differently.
     (some clients, such as bitTorrent seem to require this - time will tell...)
     Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_', is encoded using 
     the '%nn' format, where nn is the hexadecimal value of the byte.
        see: RFC1738"

    |rs c space|

    space := Character space.
    rs := aStringOrStream readStream.

    [rs atEnd] whileFalse: [
        c := rs next.

        (c isLetterOrDigit or:[ ('-_.' includes:c) ]) ifTrue:[
            ws nextPut:c.
        ] ifFalse:[
            ws nextPut: $%.
            c codePoint > 16rFF ifTrue:[
                ws nextPut: $u.
                c codePoint printOn:ws base:16 size:4 fill:$0.
            ] ifFalse:[
                c codePoint printOn:ws base:16 size:2 fill:$0.
            ]
        ].
    ].

    "Created: / 09-01-2011 / 10:32:27 / cg"
    "Modified: / 09-01-2011 / 13:11:17 / cg"
    "Modified: / 06-05-2015 / 15:43:39 / sr"
!

urlEncode:aStringOrStream on:ws
    "helper to escape invalid/dangerous characters in an url's argument or post-fields.

     Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', 
     is encoded using the '%nn' format, where nn is the hexadecimal value of the byte.
     Characters outside the ASCII range are encoded into utf8 first.
     Spaces are encoded as '+'.
        see: application/x-www-form-urlencoded  
        see: https://tools.ietf.org/html/rfc3986 (obsoletes RFC1738)"

    |rs c|

    rs := aStringOrStream readStream.

    [(c := rs nextOrNil) notNil] whileTrue: [
        |cp|

        (c isLetterOrDigit or:['-_.~' includes:c]) ifTrue:[
            ws nextPut:c.
        ] ifFalse:[
            c == Character space ifTrue:[
                ws nextPut:$+.
            ] ifFalse:[
                cp := c codePoint.
                cp > 16r7F ifTrue:[
                    c utf8Encoded do:[:eachUtf8Char|
                        ws nextPut: $%.
                        eachUtf8Char codePoint printOn:ws base:16 size:2 fill:$0.
                    ].
                ] ifFalse:[
                    ws nextPut: $%.
                    cp printOn:ws base:16 size:2 fill:$0.
                ].
            ].
        ].
    ].

    "
     self urlEncoded:'hokus pokus fidibus*-/~'
     self urlEncoded:'Ützel Brötzel*-/~'
     self urlEncoded:'χαιρε'

     self urlDecoded:(self urlEncoded:'hokus pokus fidibus*-/~')
     self urlDecoded:(self urlEncoded:'Ützel Brötzel*-/~')
     self urlDecoded:(self urlEncoded:'χαιρε')
    "

    "Modified: / 09-01-2011 / 10:43:30 / cg"
    "Modified: / 06-05-2015 / 16:06:52 / sr"
    "Modified (comment): / 07-02-2017 / 14:51:42 / stefan"
    "Modified (comment): / 26-08-2018 / 12:50:04 / Claus Gittinger"
!

urlEncoded2: aString
    <resource: #obsolete>
    "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
     Similar to urlEncoded, but treats '*','~' and spaces differently.
     (some clients, such as bitTorrent seem to require this - time will tell...)
     Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', is encoded using 
     the '%nn' format, where nn is the hexadecimal value of the byte.
        see: application/x-www-form-urlencoded  
        see: RFC1738"

    |ws|

    ws := String writeStreamWithInitialSize:aString size.
    self urlEncode2:aString on:ws.
    ^ ws contents


    "
      self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
      self urlEncoded2:'_-.*Frankfurt(Main) Hbf'

      self unEscape:(self urlEncoded:'-_.*%exept;')
      self urlEncoded2:'-_.*%exept;'  
      self urlEncoded:'-_.*%exept;'    
    "

    "Created: / 09-01-2011 / 10:34:50 / cg"
!

urlEncoded: aString
    "helper to escape invalid/dangerous characters in an urls arguments or post-fields.

     Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', is encoded using 
     the '%nn' format, where nn is the hexadecimal value of the byte.
     Characters outside the ASCII range are encoded into utf8 first.
     Spaces are encoded as '+'.
        see: application/x-www-form-urlencoded  
        see: https://tools.ietf.org/html/rfc3986 (obsoletes RFC1738)"

    |ws|

    ws := WriteStream on:(String new:aString size + 20).
    self urlEncode:aString on:ws.
    ^ ws contents


    "
      self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
      self urlEncoded:'_-.*Frankfurt(Main) Hbf'

      self unEscape:(self urlEncoded:'-_.*%exept;')
      self urlEncoded:'-_.*%exept;'

      self urlEncoded:'Не только в сервере, но и в ComSpec, чтобы дочерние КОНСОЛЬНЫЕ процессы могли пользоваться редиректами'
    "

    "Modified: / 09-01-2011 / 10:43:37 / cg"
    "Modified: / 07-02-2017 / 14:54:12 / stefan"
!

withAllSpecialHTMLCharactersEscaped:aStringOrCharacter
    "replace ampersand, less, greater and quotes by html-character escapes"

    "/ TODO: this is similar to escapeCharacterEntities.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |resultStream|

"/    orgs  := #( $&      $<     $>     $"   $').
"/    repls := #( '&amp;' '&lt;' '&gt;' &quot; &apos;).

    (aStringOrCharacter isString
    and:[ (aStringOrCharacter includesAny:'&<>''"') not ]) ifTrue:[^ aStringOrCharacter].

    resultStream := CharacterWriteStream on:''.
    aStringOrCharacter asString do:[:eachCharacter |
        "/ huh - a switch. Sorry, but this method is used heavily.
        eachCharacter == $&
            ifTrue:[ resultStream nextPutAll:'&amp;' ]
            ifFalse:[
        eachCharacter == $<
            ifTrue:[ resultStream nextPutAll:'&lt;' ]
            ifFalse:[
        eachCharacter == $>
            ifTrue:[ resultStream nextPutAll:'&gt;' ]
            ifFalse:[
        eachCharacter == $"
            ifTrue:[ resultStream nextPutAll:'&quot;' ]
            ifFalse:[
        eachCharacter == $'
            ifTrue:[ resultStream nextPutAll:'&apos;' ]
            ifFalse:[
                resultStream nextPut:eachCharacter
            ]]]]].
    ].
    ^ resultStream contents

    "
     self withAllSpecialHTMLCharactersEscaped:'<>#&'     
     self withAllSpecialHTMLCharactersEscaped:$<
     self withAllSpecialHTMLCharactersEscaped:$#
    "

    "Modified: / 05-12-2006 / 13:48:59 / cg"
    "Modified: / 06-05-2015 / 15:41:06 / sr"
!

withSpecialHTMLCharactersEscaped:aStringOrCharacter
    "replace ampersand, less and greater by html-character escapes"

    "/ TODO: this is similar to escapeCharacterEntities.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |resultStream|

"/    orgs  := #( $&      $<     $>     ).
"/    repls := #( '&amp;' '&lt;' '&gt;' ).

    (aStringOrCharacter isString
     and:[ (aStringOrCharacter isWideString not)
     and:[ (aStringOrCharacter includesAny:'&<>') not ]]) ifTrue:[^ aStringOrCharacter].

    resultStream := CharacterWriteStream on:''.
    aStringOrCharacter asString do:[:eachCharacter |
        "/ huh - a switch. Sorry, but this method is used heavily.
        eachCharacter == $&
            ifTrue:[ resultStream nextPutAll:'&amp;' ]
            ifFalse:[
        eachCharacter == $<
            ifTrue:[ resultStream nextPutAll:'&lt;' ]
            ifFalse:[
        eachCharacter == $>
            ifTrue:[ resultStream nextPutAll:'&gt;' ]
            ifFalse:[
"/        eachCharacter codePoint > 16r7F
"/            ifTrue:[ 
"/                resultStream
"/                    nextPutAll:'&#';
"/                    nextPutAll:(eachCharacter codePoint printString);
"/                    nextPutAll:';']
"/            ifFalse:[
                resultStream nextPut:eachCharacter
"/            ]
            ]]].
    ].
    ^ resultStream contents

    "
     self withSpecialHTMLCharactersEscaped:'<>#&'
     self withSpecialHTMLCharactersEscaped:$<
     self withSpecialHTMLCharactersEscaped:$#
    "

    "Modified: / 13-04-2011 / 23:13:32 / cg"
    "Modified: / 06-05-2015 / 15:41:16 / sr"
! !

!HTMLUtilities class methodsFor:'queries'!

isUtilityClass
    ^ self == HTMLUtilities
! !

!HTMLUtilities class methodsFor:'serving-helpers'!

escape:aString
    "helper to escape invalid/dangerous characters in an url's arguments or post-fields.
     These are:
        control characters, dQuote, '+', ';', '?', '&' and space -> %XX ascii as hex digits
        %     -> %%
    "

    | rs ws c cp|

    rs := ReadStream on: aString.
    ws := WriteStream on: ''.
    [ rs atEnd ] whileFalse: [
        c := rs next.
        c == $% ifTrue:[
            ws nextPutAll: '%%'.
        ] ifFalse:[
            (((cp := c codePoint) < 16r7F)
             and:[ ('+;?&" ' includes:c) not ]) ifTrue: [ 
                ws nextPut: c.
            ] ifFalse:[
                ws nextPut: $%.
                cp printOn:ws base:16 size:(cp > 16rFF ifTrue:[4] ifFalse:[2]) fill:$0.
            ]
        ]
    ].
    ^ ws contents

    "
     self escape:'a b'      
     self escape:'a%b'    
     self escape:'a b'      
     self escape:'a+b'      
     self escape:'aäüöb'      
    "

    "Modified: / 06-05-2015 / 16:07:18 / sr"
    "Modified: / 25-11-2016 / 16:37:53 / cg"
! !

!HTMLUtilities class methodsFor:'text processing helpers'!

plainTextOfHTML:htmlString
    "given some HTML, extract the raw text. 
     Can be used to search for strings in some html text."

    |parser doc s first|

    parser := HTMLParser new.
    doc := parser parseText:htmlString.
    s := CharacterWriteStream on:(String new:100).
    first := true.
    doc markUpElementsDo:[:el |
        |t|

        el isTextElement ifTrue:[
            t := el text withoutSeparators.
            t notEmpty ifTrue:[
                first ifFalse:[    
                    s space.
                ].
                s nextPutAll:t.
                first := false    
            ].
        ] ifFalse:[
            "/ ignore non-text; however, we could care for text in info-titles
            "/ or scripts as well...
        ].
    ].
    ^ s contents

    "
     self plainTextOfHTML:'
            bla1 bla2 <br>bla3 <table><tr><td>bla4</td></tr></table> bla5<p>bla6'
     self plainTextOfHTML:'Hello World'        
    "

    "Modified: / 06-05-2015 / 17:02:36 / sr"
! !

!HTMLUtilities class methodsFor:'documentation'!

version
    ^ '$Header$'
!

version_CVS
    ^ '$Header$'
! !