HTMLUtilities.st
author Stefan Vogel <sv@exept.de>
Mon, 01 Nov 2010 21:35:57 +0100
changeset 2500 c67839870a9c
parent 2464 ebff59707514
child 2522 a7ff39418d38
permissions -rw-r--r--
added: #urlEncode:on: changed: #urlEncoded:

"
 COPYRIGHT (c) 2007 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
"{ Package: 'stx:libbasic2' }"

Object subclass:#HTMLUtilities
	instanceVariableNames:''
	classVariableNames:'EscapeControlCharacters'
	poolDictionaries:''
	category:'Net-Communication-Support'
!

!HTMLUtilities class methodsFor:'documentation'!

copyright
"
 COPYRIGHT (c) 2007 by eXept Software AG
              All Rights Reserved

 This software is furnished under a license and may be used
 only in accordance with the terms of that license and with the
 inclusion of the above copyright notice.   This software may not
 be provided or otherwise made available to, or used by, any
 other person.  No title to or ownership of the software is
 hereby transferred.
"
!

documentation
"
    Collected support functions to deal with HTML.
    Used both by HTML generators (DocGenerator), HTMLParsers and the webServer.
    Therefore, it has been put into libbasic2.
"
! !

!HTMLUtilities class methodsFor:'common actions'!

openLauncherOnDisplay:displayName
    <resource: #obsolete>

    "obsolete - do not use"

    self obsoleteMethodWarning.
    Error handle:[:ex |
        ^ ex description
    ] do:[
        NewLauncher openLauncherOnInitializedDisplayNamed:displayName
    ]

    "Modified: / 01-06-2010 / 11:25:12 / sr"
! !

!HTMLUtilities class methodsFor:'helpers'!

controlCharacters

    EscapeControlCharacters isNil ifTrue:[
        EscapeControlCharacters := Dictionary new.
"/        EscapeControlCharacters at:Character space put:'&nbsp;'.
        EscapeControlCharacters at:$< put:'&lt;'.
        EscapeControlCharacters at:$> put:'&gt;'.
        EscapeControlCharacters at:$& put:'&amp;'.
        EscapeControlCharacters at:$" put:'&quot;'.
        "/ EscapeControlCharacters at:$' put:'&apos;'.
    ].
    ^ EscapeControlCharacters.
!

escapeCharacterEntities:aString
    "helper to escape invalid/dangerous characters in html strings.
     These are:
        control characters, '<', '>', '&' and space -> %XX ascii as hex digits
        %     -> %%
    "
    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.


    |rs ws c controlCharacters controlString|

    controlCharacters := self controlCharacters.
    rs := ReadStream on: aString.
    ws := WriteStream on: ''.
    [ rs atEnd ] whileFalse: [
        c := rs next.
        controlString := controlCharacters at:c ifAbsent:nil.
        controlString notNil ifTrue:[
            ws nextPutAll:controlString.
        ] ifFalse:[
            c codePoint > 16r7F ifTrue:[
                ws 
                    nextPutAll:'&#';
                    nextPutAll:(c codePoint printString);
                    nextPutAll:';'.
            ] ifFalse:[
                ws nextPut:c.
            ]
        ]
    ].
    ^ ws contents
    
    "
     self escapeCharacterEntities:'a<b'     
     self escapeCharacterEntities:'aöb'     
    "
!

extractCharSetEncodingFromContentType:contentTypeLine
    |idx rest encoding|

    idx := contentTypeLine findString:'charset='.
    idx == 0 ifTrue:[
	^ nil
    ].
    rest := (contentTypeLine copyFrom:idx+'charset=' size) withoutSeparators.
    idx := (rest indexOfSeparator) min:(rest indexOf:$;).
    idx == 0 ifTrue:[
	encoding := rest
    ] ifFalse:[
	encoding := rest copyTo:idx-1.
    ].
    (encoding startsWith:$") ifTrue:[
	encoding := encoding copyFrom:2 to:(encoding indexOf:$" startingAt:3)-1.
    ].
    ^ encoding.

    "
     self extractCharSetEncodingFromContentType:'text/html; charset=ascii'
     self extractCharSetEncodingFromContentType:'text/html; charset='
     self extractCharSetEncodingFromContentType:'text/html; fooBar=bla'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8; bla=fasel'
    "
!

extractMimeTypeFromContentType:contentTypeLine
    |idx mimeAndEncoding|

    idx := contentTypeLine indexOf:$:.
    mimeAndEncoding := (contentTypeLine copyFrom:idx+1) withoutSeparators.

    (mimeAndEncoding includes:$;) ifFalse:[
	^ mimeAndEncoding
    ].

    idx := mimeAndEncoding indexOf:$;.
    ^ mimeAndEncoding copyTo:idx-1

    "
     self extractMimeTypeFromContentType:'text/html; charset=ascii'
     self extractMimeTypeFromContentType:'text/html; '
     self extractMimeTypeFromContentType:'text/html'
     self extractMimeTypeFromContentType:'text/xml; charset=utf-8'
    "
!

unEscape:aString
    "Convert escaped characters in an urls arguments or post fields to their proper characters.
     These are:
        + -> space
        %XX ascii as hex digits
        %% -> %
    "

    |rs ws c peekC|

    (aString indexOfAny:'+%') == 0 ifTrue:[
        ^ aString
    ].

    rs := ReadStream on: aString.
    ws := WriteStream on: ''.
    [rs atEnd] whileFalse:[
        c := rs next.
        c == $+ 
            ifTrue:[ c := Character space ] 
            ifFalse:[
                c == $% 
                    ifTrue: [
                        peekC := rs peek.
                        (peekC notNil and:[peekC isHexDigit]) ifTrue:[
                            c := (Integer readFrom:(rs nextAvailable:2) radix:16) asCharacter 
                        ] ifFalse:[
                            c := rs next.
                        ]
                    ]
            ].
        c notNil ifTrue:[ ws nextPut: c ].
    ].
    ^ ws contents

    "
     self unEscape:'a%20b'   
     self unEscape:'a%%b'
     self unEscape:'a+b' 
     self unEscape:'a%+b' 
     self unEscape:'a%' 
     self unEscape:'a%2' 
    "

    "Modified: / 08-07-2009 / 12:40:56 / sr"
!

urlEncode:aStringOrStream on:ws
    "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
        see: application/x-www-form-urlencoded"

    |rs c space|

    space := Character space.
    rs := aStringOrStream readStream.

    [rs atEnd] whileFalse: [
        c := rs next.

        c isLetterOrDigit ifTrue:[
            ws nextPut:c.
        ] ifFalse:[
            c == space ifTrue:[
                ws nextPut:$+.
            ] ifFalse:[
                ('-_.*' includes:c) ifTrue:[
                    ws nextPut:c.
                ] ifFalse:[
                    ws nextPut: $%.
                    c codePoint printOn:ws base:16 size:2 fill:$0.
                ].
            ].
        ].
    ].
!

urlEncoded: aString
    "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
        see: application/x-www-form-urlencoded "

    |ws|

    ws := WriteStream on: ''.
    self urlEncode:aString on:ws.
    ^ ws contents


    "
      self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
      self urlEncoded:'_-.*Frankfurt(Main) Hbf'

      self unEscape:(self urlEncoded:'-_.*%exept;')
      self urlEncoded:'-_.*%exept;'
    "

    "Modified: / 12-07-2010 / 16:49:44 / cg"
!

withAllSpecialHTMLCharactersEscaped:aStringOrCharacter
    "replace ampersand, less, greater and quotes by html-character escapes"

    "/ TODO: this is similar to escapeCharacterEntities.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |resultStream|

"/    orgs  := #( $&      $<     $>     $"   $').
"/    repls := #( '&amp;' '&lt;' '&gt;' &quot; &apos;).

    (aStringOrCharacter isString
    and:[ (aStringOrCharacter includesAny:'&<>') not ]) ifTrue:[^ aStringOrCharacter].

    resultStream := WriteStream on:''.
    aStringOrCharacter asString do:[:eachCharacter |
        "/ huh - a switch. Sorry, but this method is used heavily.
        eachCharacter == $&
            ifTrue:[ resultStream nextPutAll:'&amp;' ]
            ifFalse:[
        eachCharacter == $<
            ifTrue:[ resultStream nextPutAll:'&lt;' ]
            ifFalse:[
        eachCharacter == $>
            ifTrue:[ resultStream nextPutAll:'&gt;' ]
            ifFalse:[
        eachCharacter == $"
            ifTrue:[ resultStream nextPutAll:'&quot;' ]
            ifFalse:[
        eachCharacter == $'
            ifTrue:[ resultStream nextPutAll:'&apos;' ]
            ifFalse:[
                resultStream nextPut:eachCharacter
            ]]]]].
    ].
    ^ resultStream contents

    "
     self withSpecialHTMLCharactersEscaped:'<>#&'
     self withSpecialHTMLCharactersEscaped:$<
     self withSpecialHTMLCharactersEscaped:$#
    "

    "Modified: / 05-12-2006 / 13:48:59 / cg"
!

withSpecialHTMLCharactersEscaped:aStringOrCharacter
    "replace ampersand, less and greater by html-character escapes"

    "/ TODO: this is similar to escapeCharacterEntities.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existance due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |resultStream orgs repls|

"/    orgs  := #( $&      $<     $>     ).
"/    repls := #( '&amp;' '&lt;' '&gt;' ).

    (aStringOrCharacter isString
    and:[ (aStringOrCharacter includesAny:'&<>') not ]) ifTrue:[^ aStringOrCharacter].

    resultStream := WriteStream on:''.
    aStringOrCharacter asString do:[:eachCharacter |
        "/ huh - a switch. Sorry, but this method is used heavily.
        eachCharacter == $&
            ifTrue:[ resultStream nextPutAll:'&amp;' ]
            ifFalse:[
        eachCharacter == $<
            ifTrue:[ resultStream nextPutAll:'&lt;' ]
            ifFalse:[
        eachCharacter == $>
            ifTrue:[ resultStream nextPutAll:'&gt;' ]
            ifFalse:[
                resultStream nextPut:eachCharacter
            ]]].
    ].
    ^ resultStream contents

    "
     self withSpecialHTMLCharactersEscaped:'<>#&'
     self withSpecialHTMLCharactersEscaped:$<
     self withSpecialHTMLCharactersEscaped:$#
    "

    "Modified: / 05-12-2006 / 13:48:59 / cg"
! !

!HTMLUtilities class methodsFor:'serving-helpers'!

escape:aString
    "helper to escape invalid/dangerous characters in an url's arguments or post-fields.
     These are:
        control characters, '+', ';', '?', '&' and space -> %XX ascii as hex digits
        %     -> %%
    "

    | rs ws c |

    rs := ReadStream on: aString.
    ws := WriteStream on: ''.
    [ rs atEnd ] whileFalse: [
        c := rs next.
        c == $% ifTrue:[
            ws nextPutAll: '%%'.
        ] ifFalse:[
            ((c codePoint < 16r7F)
             and:[ ('+;?& ' includes:c) not ]) ifTrue: [ 
                ws nextPut: c.
            ] ifFalse:[
                ws nextPut: $%.
                c codePoint printOn:ws base:16.
            ]
        ]
    ].
    ^ ws contents

    "
     self escape:'a b'      
     self escape:'a%b'    
     self escape:'a b'      
     self escape:'a+b'      
     self escape:'aäüöb'      
    "
! !

!HTMLUtilities class methodsFor:'text processing helpers'!

plainTextOfHTML:htmlString
    "given some HTML, extract the raw text. 
     Can be used to search for strings in some html text."

    |parser doc s|

    s := CharacterWriteStream on:(String new:100).

    parser := HTMLParser new.
    doc := parser parseText:htmlString.
    doc markUpElementsDo:[:el |
        |t|

        el isTextElement ifTrue:[
            t := el text withoutSeparators.
            t notEmpty ifTrue:[
                s nextPutAll:t.
                s space.
            ].
        ] ifFalse:[
            "/ ignore non-text; however, we could care for text in info-titles
            "/ or scripts as well...
        ].
    ].
    ^ s contents asSingleByteStringIfPossible

    "
     self plainTextOfHTML:'
bla1 bla2 <br>bla3 <table><tr><td>bla4</td></tr></table> bla5<p>bla6
'        
    "
! !

!HTMLUtilities class methodsFor:'documentation'!

version
    ^ '$Header: /cvs/stx/stx/libbasic2/HTMLUtilities.st,v 1.12 2010-11-01 20:35:57 stefan Exp $'
!

version_CVS
    ^ '$Header: /cvs/stx/stx/libbasic2/HTMLUtilities.st,v 1.12 2010-11-01 20:35:57 stefan Exp $'
! !