HTMLUtilities.st
author Claus Gittinger <cg@exept.de>
Sun, 24 Mar 2019 20:23:56 +0100
changeset 4903 ff54fc968f34
parent 4737 610d483cb00a
child 4924 b171682381a1
permissions -rw-r--r--
typedef not present in linux ?!

"{ Encoding: utf8 }"

"
 COPYRIGHT (c) 2007 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic2' }"

"{ NameSpace: Smalltalk }"

Object subclass:#HTMLUtilities
	instanceVariableNames:''
	classVariableNames:'EscapeControlCharacters HtmlEntityToCharacter'
	poolDictionaries:''
	category:'Net-Communication-Support'
!

!HTMLUtilities class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 2007 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    Collected support functions to deal with HTML.
    Used both by HTML generators (DocGenerator), HTMLParsers and the webServer.
    Therefore, it has been put into libbasic2.
"
! !

!HTMLUtilities class methodsFor:'common actions'!

openLauncherOnDisplay:displayName
    <resource: #obsolete>

    "obsolete - do not use"

    self obsoleteMethodWarning.
    Error handle:[:ex |
        ^ ex description
    ] do:[
        NewLauncher openLauncherOnInitializedDisplayNamed:displayName
    ]

    "Modified: / 01-06-2010 / 11:25:12 / sr"
! !

!HTMLUtilities class methodsFor:'constants'!

htmlEntityToCharacter
    |htmlEntityToCharacter|

    HtmlEntityToCharacter isNil ifTrue:[
        htmlEntityToCharacter := Dictionary new.
        htmlEntityToCharacter
            at:'quot'               put:$";
            at:'amp'                put:$&;
            at:'apos'               put:$';
            at:'lt'                 put:$<;
            at:'gt'                 put:$>;
            at:'Auml'               put:$Ä;
            at:'Ouml'               put:$Ö;
            at:'Uuml'               put:$Ü;
            at:'szlig'              put:$ß;
            at:'auml'               put:$ä;
            at:'ouml'               put:$ö;
            at:'uuml'               put:$ü.
            "/ Answer: It is a mess. A good start may be
            "/ https://www.w3.org/TR/html4/sgml/entities.html with 252 named entities.
            "/ I guess an actual lookup table would be adequate.

        HtmlEntityToCharacter := htmlEntityToCharacter.
    ].

    ^ HtmlEntityToCharacter                                      
! !

!HTMLUtilities class methodsFor:'helpers'!

characterFromHtmlEntityNamed:anHtmlEntityName
    ^ self htmlEntityToCharacter
        at:anHtmlEntityName
        ifAbsent:[
            self halt. "/ where to get the mapping???
            "/ Answer: It is a mess. A good start may be
            "/ https://www.w3.org/TR/html4/sgml/entities.html with 252 named entities.
            "/ I guess an actual lookup table would be adequate.
            $~
        ]
!

controlCharacters

    EscapeControlCharacters isNil ifTrue:[
        EscapeControlCharacters := Dictionary new.
        EscapeControlCharacters at:$< put:'&lt;'.
        EscapeControlCharacters at:$> put:'&gt;'.
        EscapeControlCharacters at:$& put:'&amp;'.
        EscapeControlCharacters at:$" put:'&quot;'.
        "/ EscapeControlCharacters at:$' put:'&apos;'.
    ].
    ^ EscapeControlCharacters.

    "Modified (comment): / 06-05-2015 / 16:17:31 / sr"
!

copyReplaceCharactersWithHtmlEntitiesIn:aString
    |stream htmlEntity|

    stream := '' writeStream.
    (aString ? '') do:[:eachCharacter |
        htmlEntity := self htmlEntityForCharacter:eachCharacter.
        htmlEntity isNil ifTrue:[
            stream nextPut:eachCharacter.
        ] ifFalse:[
            stream
                nextPut:$&;
                nextPutAll:htmlEntity;
                nextPut:$;.           
        ].
    ].

    ^ stream contents
!

escapeCharacterEntities:aString
    "helper to escape invalid/dangerous characters in html strings.
     These are:
        control characters, '<', '>', '&' and space -> %XX ascii as hex digits
        %     -> %%
    "
    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.


    ^self escapeCharacterEntities:aString andControlCharacters:self controlCharacters

    "
     self escapeCharacterEntities:'a<b'     
     self escapeCharacterEntities:'aöb'     
    "

    "Modified: / 06-05-2015 / 16:30:13 / sr"
!

escapeCharacterEntities:aString andControlCharacters:controlCharacters
    "helper to escape invalid/dangerous characters in html strings.
     These are:
        control characters, '<', '>', '&' and space -> %XX ascii as hex digits
        %     -> %%
    "
    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.


    ^ String 
        streamContents:[:ws |
            self escapeCharacterEntities:aString andControlCharacters:controlCharacters on:ws.
        ]
    
    "
     self escapeCharacterEntities:'a<b'     
     self escapeCharacterEntities:'aöb'     
    "

    "Created: / 06-05-2015 / 16:29:51 / sr"
    "Modified (format): / 05-02-2017 / 17:59:32 / cg"
!

escapeCharacterEntities:aString andControlCharacters:controlCharacters on:aWriteStream
    "helper to escape invalid/dangerous characters in html strings.
     These are:
        control characters, '<', '>', '&' and space -> %XX ascii as hex digits
        %     -> %%
    "
    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.


    |rs c controlString|

    rs := ReadStream on: aString.
    [ rs atEnd ] whileFalse: [
        c := rs next.
        controlString := controlCharacters notEmptyOrNil ifTrue:[controlCharacters at:c ifAbsent:nil] ifFalse:[nil].
        controlString notNil ifTrue:[
            aWriteStream nextPutAll:controlString.
        ] ifFalse:[
            c codePoint > 16r7F ifTrue:[
                aWriteStream nextPutAll:'&#'.
                c codePoint printOn:aWriteStream.
                aWriteStream nextPut:$;.
            ] ifFalse:[
                aWriteStream nextPut:c.
            ]
        ]
    ].
    
    "
     self escapeCharacterEntities:'a<b'     
     self escapeCharacterEntities:'aöb'     
    "

    "Created: / 05-02-2017 / 17:58:34 / cg"
    "Modified: / 17-02-2017 / 10:34:20 / stefan"
!

escapeCharacterEntities:aString on:aStream
    "helper to escape invalid/dangerous characters in html strings.
     These are:
        control characters, '<', '>', '&' and space -> %XX ascii as hex digits
        %     -> %%
    "
    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.


    ^self escapeCharacterEntities:aString andControlCharacters:self controlCharacters on:aStream

    "
     self escapeCharacterEntities:'a<b'     
     self escapeCharacterEntities:'aöb'     
    "

    "Created: / 05-02-2017 / 18:00:56 / cg"
!

extractCharSetEncodingFromContentType:contentTypeLine
    |idx rest encoding|

    idx := contentTypeLine findString:'charset='.
    idx == 0 ifTrue:[
	^ nil
    ].
    rest := (contentTypeLine copyFrom:idx+'charset=' size) withoutSeparators.
    idx := (rest indexOfSeparator) min:(rest indexOf:$;).
    idx == 0 ifTrue:[
	encoding := rest
    ] ifFalse:[
	encoding := rest copyTo:idx-1.
    ].
    (encoding startsWith:$") ifTrue:[
	encoding := encoding copyFrom:2 to:(encoding indexOf:$" startingAt:3)-1.
    ].
    ^ encoding.

    "
     self extractCharSetEncodingFromContentType:'text/html; charset=ascii'
     self extractCharSetEncodingFromContentType:'text/html; charset='
     self extractCharSetEncodingFromContentType:'text/html; fooBar=bla'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8; bla=fasel'
    "
!

extractMimeTypeFromContentType:contentTypeLine
    |idx mimeAndEncoding|

    idx := contentTypeLine indexOf:$:.
    mimeAndEncoding := (contentTypeLine copyFrom:idx+1) withoutSeparators.

    (mimeAndEncoding includes:$;) ifFalse:[
	^ mimeAndEncoding
    ].

    idx := mimeAndEncoding indexOf:$;.
    ^ mimeAndEncoding copyTo:idx-1

    "
     self extractMimeTypeFromContentType:'text/html; charset=ascii'
     self extractMimeTypeFromContentType:'text/html; '
     self extractMimeTypeFromContentType:'text/html'
     self extractMimeTypeFromContentType:'text/xml; charset=utf-8'
    "
!

htmlEntityForCharacter:aCharacter
    ^ self htmlEntityToCharacter
        keyAtValue:aCharacter
        ifAbsent:nil
!

unEscape:aString
    "Convert escaped characters in an urls arguments or post fields back to their proper characters.
     Undoes the effect of #urlEncoded: and #urlEncoded2:.
     These are:
        + -> space
        %XX ascii as hex digits
        %uXXXX unicode as hex digits   NOTE: %u is non-standard bit implemented in MS IIS
        %% -> %
    "

    |rs ws c peekC isUnicodeEscaped|

    aString isNil ifTrue:[
        ^ nil.
    ].

    (aString includesAny:'+%') ifFalse:[        
        ^ aString
    ].

    rs := ReadStream on: aString.
    ws := CharacterWriteStream on: ''.
    isUnicodeEscaped := false.

    [rs atEnd] whileFalse:[
        c := rs next.

        isUnicodeEscaped ifTrue:[
            isUnicodeEscaped := false.
            c := (Integer readFrom:(rs nextAvailable:4) radix:16) asCharacter.
        ] ifFalse:[
            c == $+ ifTrue:[ 
                c := Character space.
            ] ifFalse:[
                c == $% ifTrue:[
                    peekC := rs peek.
                    (peekC notNil and:[peekC isHexDigit]) ifTrue:[
                        c := (Integer readFrom:(rs nextAvailable:2) radix:16) asCharacter. 
                    ] ifFalse:[
                        (peekC notNil and:[peekC == $u]) ifTrue:[
                            isUnicodeEscaped := true.
                            c := nil.
                        ] ifFalse:[
                            c := rs next.
                        ].
                    ].
                ].
            ].
        ].

        c notNil ifTrue:[ 
            ws nextPut:c.
        ].
    ].
    ^ ws contents

    "
     self unEscape:'a%20b'   
     self unEscape:'a%%b'
     self unEscape:'a+b' 
     self unEscape:'a%+b' 
     self unEscape:'a%' 
     self unEscape:'a%2' 
     self unEscape:'/Home/a%C3%A4%C3%B6%C3%BCa'
    "

    "Modified: / 09-01-2011 / 10:44:50 / cg"
    "Modified (comment): / 06-05-2015 / 15:40:04 / sr"
    "Modified (comment): / 03-02-2017 / 17:06:32 / stefan"
!

unescapeCharacterEntities:aString
    "helper to unescape character entities in a string.
     Normally, this is done by the HTMLParser when it scans text,
     but seems to be also used in post-data fields which contain non-ascii characters
     (for example: the login postdata of expeccALM).

     Sequences are:
        &<specialName>;
        &#<decimal>;            
        &#x<hex>

     From Reference:
        http://wiki.selfhtml.org/wiki/Referenz:HTML/Zeichenreferenz#HTML-eigene_Zeichen
    "

    |rs ws c 
     entity entityNumberPart
     htmlEntityMatchingFailed characterFromHtmlEntity|

    (aString includes:$&) ifFalse:[        
        ^ aString
    ].

    rs := ReadStream on:aString.
    ws := CharacterWriteStream on:''.

    [rs atEnd] whileFalse:[
        c := rs next.
        c == $& ifTrue:[
            entity := rs upToMatching:[:ch | ch == $;].
            entity notEmpty ifTrue:[
                rs peek == $; ifTrue:[ "/ something between & and ; 
                    rs next. "/ read over semicolon
                    htmlEntityMatchingFailed := false.

                    entity first == $# ifTrue:[ "/ entity is determined as number
                        entityNumberPart := entity copyFrom:2.
                        entityNumberPart notEmpty ifTrue:[
                            entityNumberPart first == $x ifTrue:[
                                entityNumberPart := entityNumberPart copyFrom:2.
                                entityNumberPart notEmpty ifTrue:[
                                    ws nextPut:(Character value:(Integer readFrom:entityNumberPart radix:16)).
                                ] ifFalse:[
                                    htmlEntityMatchingFailed := true. 
                                ].
                            ] ifFalse:[
                                entityNumberPart isNumeric ifTrue:[
                                    ws nextPut:(Character value:(Integer readFrom:entityNumberPart)).
                                ] ifFalse:[
                                    htmlEntityMatchingFailed := true. 
                                ].
                            ].
                        ] ifFalse:[
                            htmlEntityMatchingFailed := true. 
                        ].
                    ] ifFalse:[
                        characterFromHtmlEntity := self characterFromHtmlEntityNamed:entity.
                        characterFromHtmlEntity notNil ifTrue:[
                            ws nextPut:characterFromHtmlEntity.
                        ] ifFalse:[
                            htmlEntityMatchingFailed := true. 
                        ].
                    ].

                    htmlEntityMatchingFailed ifTrue:[
                        ws nextPut:c.
                        ws nextPutAll:entity.
                        ws nextPut:$;.
                    ].
                ] ifFalse:[
                    ws nextPut:c.
                    ws nextPutAll:entity.
                ].
            ] ifFalse:[
                ws nextPut:c.
            ].
        ] ifFalse:[
            ws nextPut:c.
        ].
    ].

    ^ ws contents

    "
     self unescapeCharacterEntities:'&;'            
     self unescapeCharacterEntities:'&16368;'            
     self unescapeCharacterEntities:'&16368;&16368'            
     self unescapeCharacterEntities:'&16368;&lt;'            
     self unescapeCharacterEntities:'&16368;&lt'            
     self unescapeCharacterEntities:'&#xaffe;'    
     self unescapeCharacterEntities:'&quot;&lt;foo'      
     self unescapeCharacterEntities:'&funny;&lt;foo'     
    "

    "Created: / 06-05-2015 / 16:56:14 / sr"
    "Modified: / 18-05-2015 / 12:13:35 / sr"
    "Modified: / 17-02-2017 / 10:18:35 / stefan"
!

urlDecoded:aString
    "Convert escaped characters in an urls arguments or post fields back to their proper characters.
     Undoes the effect of #urlEncoded: and #urlEncoded2:.
     These are:
        + -> space
        %XX ascii as hex digits
        %uXXXX unicode as hex digits   NOTE: %u is non-standard bit implemented in MS IIS
        %% -> %
    "
    ^ (self unEscape:aString) utf8Decoded

    "
     self urlDecoded:'a%20b'   
     self urlDecoded:'a%%b'
     self urlDecoded:'a+b' 
     self urlDecoded:'a%+b' 
     self urlDecoded:'a%' 
     self urlDecoded:'a%2' 
     self urlDecoded:'/Home/a%C3%A4%C3%B6%C3%BCa'
    "

    "Created: / 26-08-2018 / 12:49:24 / Claus Gittinger"
!

urlEncode2:aStringOrStream on:ws
    <resource: #obsolete>
    "helper to escape invalid/dangerous characters in an urls arguments.
     Similar to urlEncode, but treats '*','~' and spaces differently.
     (some clients, such as bitTorrent seem to require this - time will tell...)
     Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_', is encoded using 
     the '%nn' format, where nn is the hexadecimal value of the byte.
        see: RFC1738"

    |rs c space|

    space := Character space.
    rs := aStringOrStream readStream.

    [rs atEnd] whileFalse: [
        c := rs next.

        (c isLetterOrDigit or:[ ('-_.' includes:c) ]) ifTrue:[
            ws nextPut:c.
        ] ifFalse:[
            ws nextPut: $%.
            c codePoint > 16rFF ifTrue:[
                ws nextPut: $u.
                c codePoint printOn:ws base:16 size:4 fill:$0.
            ] ifFalse:[
                c codePoint printOn:ws base:16 size:2 fill:$0.
            ]
        ].
    ].

    "Created: / 09-01-2011 / 10:32:27 / cg"
    "Modified: / 09-01-2011 / 13:11:17 / cg"
    "Modified: / 06-05-2015 / 15:43:39 / sr"
!

urlEncode:aStringOrStream on:ws
    "helper to escape invalid/dangerous characters in an urlÄs argument or post-fields.

     Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', 
     is encoded using the '%nn' format, where nn is the hexadecimal value of the byte.
     Characters outside the ASCII range are encoded into utf8 first.
     Spaces are encoded as '+'.
        see: application/x-www-form-urlencoded  
        see: https://tools.ietf.org/html/rfc3986 (obsoletes RFC1738)"

    |rs c|

    rs := aStringOrStream readStream.

    [(c := rs nextOrNil) notNil] whileTrue: [
        |cp|

        (c isLetterOrDigit or:['-_.~' includes:c]) ifTrue:[
            ws nextPut:c.
        ] ifFalse:[
            c == Character space ifTrue:[
                ws nextPut:$+.
            ] ifFalse:[
                cp := c codePoint.
                cp > 16r7F ifTrue:[
                    c utf8Encoded do:[:eachUtf8Char|
                        ws nextPut: $%.
                        eachUtf8Char codePoint printOn:ws base:16 size:2 fill:$0.
                    ].
                ] ifFalse:[
                    ws nextPut: $%.
                    cp printOn:ws base:16 size:2 fill:$0.
                ].
            ].
        ].
    ].

    "
     self urlEncoded:'hokus pokus fidibus*-/~'
     self urlEncoded:'Ützel Brötzel*-/~'
     self urlEncoded:'χαιρε'

     self urlDecoded:(self urlEncoded:'hokus pokus fidibus*-/~')
     self urlDecoded:(self urlEncoded:'Ützel Brötzel*-/~')
     self urlDecoded:(self urlEncoded:'χαιρε')
    "

    "Modified: / 09-01-2011 / 10:43:30 / cg"
    "Modified: / 06-05-2015 / 16:06:52 / sr"
    "Modified (comment): / 07-02-2017 / 14:51:42 / stefan"
    "Modified (comment): / 26-08-2018 / 12:50:04 / Claus Gittinger"
!

urlEncoded2: aString
    <resource: #obsolete>
    "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
     Similar to urlEncoded, but treats '*','~' and spaces differently.
     (some clients, such as bitTorrent seem to require this - time will tell...)
     Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', is encoded using 
     the '%nn' format, where nn is the hexadecimal value of the byte.
        see: application/x-www-form-urlencoded  
        see: RFC1738"

    |ws|

    ws := String writeStreamWithInitialSize:aString size.
    self urlEncode2:aString on:ws.
    ^ ws contents


    "
      self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
      self urlEncoded2:'_-.*Frankfurt(Main) Hbf'

      self unEscape:(self urlEncoded:'-_.*%exept;')
      self urlEncoded2:'-_.*%exept;'  
      self urlEncoded:'-_.*%exept;'    
    "

    "Created: / 09-01-2011 / 10:34:50 / cg"
!

urlEncoded: aString
    "helper to escape invalid/dangerous characters in an urls arguments or post-fields.

     Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', is encoded using 
     the '%nn' format, where nn is the hexadecimal value of the byte.
     Characters outside the ASCII range are encoded into utf8 first.
     Spaces are encoded as '+'.
        see: application/x-www-form-urlencoded  
        see: https://tools.ietf.org/html/rfc3986 (obsoletes RFC1738)"

    |ws|

    ws := WriteStream on:(String new:aString size + 20).
    self urlEncode:aString on:ws.
    ^ ws contents


    "
      self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
      self urlEncoded:'_-.*Frankfurt(Main) Hbf'

      self unEscape:(self urlEncoded:'-_.*%exept;')
      self urlEncoded:'-_.*%exept;'
    "

    "Modified: / 09-01-2011 / 10:43:37 / cg"
    "Modified: / 07-02-2017 / 14:54:12 / stefan"
!

withAllSpecialHTMLCharactersEscaped:aStringOrCharacter
    "replace ampersand, less, greater and quotes by html-character escapes"

    "/ TODO: this is similar to escapeCharacterEntities.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |resultStream|

"/    orgs  := #( $&      $<     $>     $"   $').
"/    repls := #( '&amp;' '&lt;' '&gt;' &quot; &apos;).

    (aStringOrCharacter isString
    and:[ (aStringOrCharacter includesAny:'&<>''"') not ]) ifTrue:[^ aStringOrCharacter].

    resultStream := CharacterWriteStream on:''.
    aStringOrCharacter asString do:[:eachCharacter |
        "/ huh - a switch. Sorry, but this method is used heavily.
        eachCharacter == $&
            ifTrue:[ resultStream nextPutAll:'&amp;' ]
            ifFalse:[
        eachCharacter == $<
            ifTrue:[ resultStream nextPutAll:'&lt;' ]
            ifFalse:[
        eachCharacter == $>
            ifTrue:[ resultStream nextPutAll:'&gt;' ]
            ifFalse:[
        eachCharacter == $"
            ifTrue:[ resultStream nextPutAll:'&quot;' ]
            ifFalse:[
        eachCharacter == $'
            ifTrue:[ resultStream nextPutAll:'&apos;' ]
            ifFalse:[
                resultStream nextPut:eachCharacter
            ]]]]].
    ].
    ^ resultStream contents

    "
     self withAllSpecialHTMLCharactersEscaped:'<>#&'     
     self withAllSpecialHTMLCharactersEscaped:$<
     self withAllSpecialHTMLCharactersEscaped:$#
    "

    "Modified: / 05-12-2006 / 13:48:59 / cg"
    "Modified: / 06-05-2015 / 15:41:06 / sr"
!

withSpecialHTMLCharactersEscaped:aStringOrCharacter
    "replace ampersand, less and greater by html-character escapes"

    "/ TODO: this is similar to escapeCharacterEntities.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |resultStream|

"/    orgs  := #( $&      $<     $>     ).
"/    repls := #( '&amp;' '&lt;' '&gt;' ).

    (aStringOrCharacter isString
     and:[ (aStringOrCharacter isWideString not)
     and:[ (aStringOrCharacter includesAny:'&<>') not ]]) ifTrue:[^ aStringOrCharacter].

    resultStream := CharacterWriteStream on:''.
    aStringOrCharacter asString do:[:eachCharacter |
        "/ huh - a switch. Sorry, but this method is used heavily.
        eachCharacter == $&
            ifTrue:[ resultStream nextPutAll:'&amp;' ]
            ifFalse:[
        eachCharacter == $<
            ifTrue:[ resultStream nextPutAll:'&lt;' ]
            ifFalse:[
        eachCharacter == $>
            ifTrue:[ resultStream nextPutAll:'&gt;' ]
            ifFalse:[
"/        eachCharacter codePoint > 16r7F
"/            ifTrue:[ 
"/                resultStream
"/                    nextPutAll:'&#';
"/                    nextPutAll:(eachCharacter codePoint printString);
"/                    nextPutAll:';']
"/            ifFalse:[
                resultStream nextPut:eachCharacter
"/            ]
            ]]].
    ].
    ^ resultStream contents

    "
     self withSpecialHTMLCharactersEscaped:'<>#&'
     self withSpecialHTMLCharactersEscaped:$<
     self withSpecialHTMLCharactersEscaped:$#
    "

    "Modified: / 13-04-2011 / 23:13:32 / cg"
    "Modified: / 06-05-2015 / 15:41:16 / sr"
! !

!HTMLUtilities class methodsFor:'queries'!

isUtilityClass
    ^ self == HTMLUtilities
! !

!HTMLUtilities class methodsFor:'serving-helpers'!

escape:aString
    "helper to escape invalid/dangerous characters in an url's arguments or post-fields.
     These are:
        control characters, dQuote, '+', ';', '?', '&' and space -> %XX ascii as hex digits
        %     -> %%
    "

    | rs ws c cp|

    rs := ReadStream on: aString.
    ws := WriteStream on: ''.
    [ rs atEnd ] whileFalse: [
        c := rs next.
        c == $% ifTrue:[
            ws nextPutAll: '%%'.
        ] ifFalse:[
            (((cp := c codePoint) < 16r7F)
             and:[ ('+;?&" ' includes:c) not ]) ifTrue: [ 
                ws nextPut: c.
            ] ifFalse:[
                ws nextPut: $%.
                cp printOn:ws base:16 size:(cp > 16rFF ifTrue:[4] ifFalse:[2]) fill:$0.
            ]
        ]
    ].
    ^ ws contents

    "
     self escape:'a b'      
     self escape:'a%b'    
     self escape:'a b'      
     self escape:'a+b'      
     self escape:'aäüöb'      
    "

    "Modified: / 06-05-2015 / 16:07:18 / sr"
    "Modified: / 25-11-2016 / 16:37:53 / cg"
! !

!HTMLUtilities class methodsFor:'text processing helpers'!

plainTextOfHTML:htmlString
    "given some HTML, extract the raw text. 
     Can be used to search for strings in some html text."

    |parser doc s first|

    parser := HTMLParser new.
    doc := parser parseText:htmlString.
    s := CharacterWriteStream on:(String new:100).
    first := true.
    doc markUpElementsDo:[:el |
        |t|

        el isTextElement ifTrue:[
            t := el text withoutSeparators.
            t notEmpty ifTrue:[
                first ifFalse:[    
                    s space.
                ].
                s nextPutAll:t.
                first := false    
            ].
        ] ifFalse:[
            "/ ignore non-text; however, we could care for text in info-titles
            "/ or scripts as well...
        ].
    ].
    ^ s contents

    "
     self plainTextOfHTML:'
            bla1 bla2 <br>bla3 <table><tr><td>bla4</td></tr></table> bla5<p>bla6'
     self plainTextOfHTML:'Hello World'        
    "

    "Modified: / 06-05-2015 / 17:02:36 / sr"
! !

!HTMLUtilities class methodsFor:'documentation'!

version
    ^ '$Header$'
!

version_CVS
    ^ '$Header$'
! !