HTMLUtilities.st
changeset 2058 f407ff58f780
child 2066 0ee2ef2d018c
equal deleted inserted replaced
2057:3780915ebb2a 2058:f407ff58f780
       
     1 "
       
     2  COPYRIGHT (c) 2007 by eXept Software AG
       
     3               All Rights Reserved
       
     4 
       
     5  This software is furnished under a license and may be used
       
     6  only in accordance with the terms of that license and with the
       
     7  inclusion of the above copyright notice.   This software may not
       
     8  be provided or otherwise made available to, or used by, any
       
     9  other person.  No title to or ownership of the software is
       
    10  hereby transferred.
       
    11 "
       
    12 "{ Package: 'stx:libbasic2' }"
       
    13 
       
    14 Object subclass:#HTMLUtilities
       
    15 	instanceVariableNames:''
       
    16 	classVariableNames:'EscapeControlCharacters'
       
    17 	poolDictionaries:''
       
    18 	category:'Net-Communication-Support'
       
    19 !
       
    20 
       
    21 !HTMLUtilities class methodsFor:'documentation'!
       
    22 
       
    23 copyright
       
    24 "
       
    25  COPYRIGHT (c) 2007 by eXept Software AG
       
    26               All Rights Reserved
       
    27 
       
    28  This software is furnished under a license and may be used
       
    29  only in accordance with the terms of that license and with the
       
    30  inclusion of the above copyright notice.   This software may not
       
    31  be provided or otherwise made available to, or used by, any
       
    32  other person.  No title to or ownership of the software is
       
    33  hereby transferred.
       
    34 "
       
    35 !
       
    36 
       
    37 documentation
       
    38 "
       
    39     Collected support functions to deal with HTML.
       
    40     Used both by HTML generators (DocGenerator), HTMLParsers and the webServer.
       
    41     Therefore, it has been put into libbasic2.
       
    42 "
       
    43 ! !
       
    44 
       
    45 !HTMLUtilities class methodsFor:'helpers'!
       
    46 
       
    47 controlCharacters
       
    48 
       
    49     EscapeControlCharacters isNil ifTrue:[
       
    50         EscapeControlCharacters := Dictionary new.
       
    51 "/        EscapeControlCharacters at:Character space put:' '.
       
    52         EscapeControlCharacters at:$< put:'&lt;'.
       
    53         EscapeControlCharacters at:$> put:'&gt;'.
       
    54         EscapeControlCharacters at:$& put:'&amp;'.
       
    55         EscapeControlCharacters at:$" put:'&quot;'.
       
    56     ].
       
    57     ^ EscapeControlCharacters.
       
    58 !
       
    59 
       
    60 escapeCharacterEntities:aString
       
    61     "helper to escape invalid/dangerous characters in html strings.
       
    62      These are:
       
    63         control characters, '<', '>', '&' and space -> %XX ascii as hex digits
       
    64         %     -> %%
       
    65     "
       
    66     |rs ws c controlCharacters controlString|
       
    67 
       
    68     controlCharacters := self controlCharacters.
       
    69     rs := ReadStream on: aString.
       
    70     ws := WriteStream on: ''.
       
    71     [ rs atEnd ] whileFalse: [
       
    72         c := rs next.
       
    73         controlString := controlCharacters at:c ifAbsent:nil.
       
    74         controlString notNil ifTrue:[
       
    75             ws nextPutAll:controlString.
       
    76         ] ifFalse:[
       
    77             c codePoint > 16r7F ifTrue:[
       
    78                 ws 
       
    79                     nextPutAll:'&#';
       
    80                     nextPutAll:(c codePoint printString);
       
    81                     nextPutAll:';'.
       
    82             ] ifFalse:[
       
    83                 ws nextPut:c.
       
    84             ]
       
    85         ]
       
    86     ].
       
    87     ^ ws contents
       
    88     
       
    89 
       
    90     "
       
    91      self escapeCharacterEntities:'a<b'     
       
    92      self escapeCharacterEntities:'aöb'     
       
    93     "
       
    94 !
       
    95 
       
    96 extractCharSetEncodingFromContentType:contentTypeLine
       
    97     |idx rest encoding|
       
    98 
       
    99     idx := contentTypeLine findString:'charset='.
       
   100     idx == 0 ifTrue:[
       
   101 	^ nil
       
   102     ].
       
   103     rest := (contentTypeLine copyFrom:idx+'charset=' size) withoutSeparators.
       
   104     idx := (rest indexOfSeparator) min:(rest indexOf:$;).
       
   105     idx == 0 ifTrue:[
       
   106 	encoding := rest
       
   107     ] ifFalse:[
       
   108 	encoding := rest copyTo:idx-1.
       
   109     ].
       
   110     (encoding startsWith:$") ifTrue:[
       
   111 	encoding := encoding copyFrom:2 to:(encoding indexOf:$" startingAt:3)-1.
       
   112     ].
       
   113     ^ encoding.
       
   114 
       
   115     "
       
   116      self extractCharSetEncodingFromContentType:'text/html; charset=ascii'
       
   117      self extractCharSetEncodingFromContentType:'text/html; charset='
       
   118      self extractCharSetEncodingFromContentType:'text/html; fooBar=bla'
       
   119      self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8'
       
   120      self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8; bla=fasel'
       
   121     "
       
   122 !
       
   123 
       
   124 extractMimeTypeFromContentType:contentTypeLine
       
   125     |idx mimeAndEncoding|
       
   126 
       
   127     idx := contentTypeLine indexOf:$:.
       
   128     mimeAndEncoding := (contentTypeLine copyFrom:idx+1) withoutSeparators.
       
   129 
       
   130     (mimeAndEncoding includes:$;) ifFalse:[
       
   131 	^ mimeAndEncoding
       
   132     ].
       
   133 
       
   134     idx := mimeAndEncoding indexOf:$;.
       
   135     ^ mimeAndEncoding copyTo:idx-1
       
   136 
       
   137     "
       
   138      self extractMimeTypeFromContentType:'text/html; charset=ascii'
       
   139      self extractMimeTypeFromContentType:'text/html; '
       
   140      self extractMimeTypeFromContentType:'text/html'
       
   141      self extractMimeTypeFromContentType:'text/xml; charset=utf-8'
       
   142     "
       
   143 !
       
   144 
       
   145 unEscape:aString
       
   146     "Convert escaped characters in an urls arguments or post fields to their proper characters.
       
   147      These are:
       
   148         + -> space
       
   149         %XX ascii as hex digits
       
   150         %% -> %
       
   151     "
       
   152 
       
   153     |rs ws c peekC|
       
   154 
       
   155     (aString indexOfAny:'+%') == 0 ifTrue:[
       
   156         ^ aString
       
   157     ].
       
   158 
       
   159     rs := ReadStream on: aString.
       
   160     ws := WriteStream on: ''.
       
   161     [rs atEnd] whileFalse:[
       
   162         c := rs next.
       
   163         c == $+ 
       
   164             ifTrue:[ c := Character space ] 
       
   165             ifFalse:[
       
   166                 c == $% 
       
   167                     ifTrue: [
       
   168                         peekC := rs peek.
       
   169                         (peekC notNil and:[peekC isHexDigit]) ifTrue:[
       
   170                             c := (Integer readFrom:(rs next:2) radix:16) asCharacter 
       
   171                         ] ifFalse:[
       
   172                             c := rs next.
       
   173                         ]
       
   174                     ]
       
   175             ].
       
   176         ws nextPut: c.
       
   177     ].
       
   178     ^ ws contents
       
   179 
       
   180     "
       
   181      self new unEscape:'a%20b' 
       
   182      self new unEscape:'a%%b'
       
   183      self new unEscape:'a+b' 
       
   184      self new unEscape:'a%+b' 
       
   185     "
       
   186 !
       
   187 
       
   188 urlEncoded: aString
       
   189     "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
       
   190         see: application/x-www-form-urlencoded
       
   191     "
       
   192 
       
   193 "
       
   194 self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
       
   195 self urlEncoded:'_-.*Frankfurt(Main) Hbf') unescape
       
   196 
       
   197 self unEscape:(self urlEncoded:'-_.*%exept;')
       
   198 self urlEncoded:'-_.*%exept;'
       
   199 "
       
   200 
       
   201     | rs ws c space|
       
   202 
       
   203     space := Character space.
       
   204     rs := ReadStream on: aString.
       
   205     ws := WriteStream on: ''.
       
   206 
       
   207     [ rs atEnd ] whileFalse: [
       
   208         c := rs next.
       
   209 
       
   210         c isLetterOrDigit ifTrue:[
       
   211             ws nextPut:c.
       
   212         ] ifFalse:[
       
   213             c == space ifTrue:[
       
   214                 ws nextPut:$+.
       
   215             ] ifFalse:[
       
   216                 ('-_.*' includes:c) ifTrue:[
       
   217                     ws nextPut:c.
       
   218                 ] ifFalse:[
       
   219                     ws nextPut: $%.
       
   220                     c codePoint printOn:ws base:16.
       
   221                 ].
       
   222             ].
       
   223         ].
       
   224     ].
       
   225     ^ ws contents
       
   226 
       
   227 
       
   228     "
       
   229      self new escape:'a b'      
       
   230      self new escape:'a%b'    
       
   231      self new escape:'a b'      
       
   232      self new escape:'a+b'      
       
   233     "
       
   234 ! !
       
   235 
       
   236 !HTMLUtilities class methodsFor:'serving-helpers'!
       
   237 
       
   238 escape:aString
       
   239     "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
       
   240      These are:
       
   241         control characters, '+', ';', '?', '&' and space -> %XX ascii as hex digits
       
   242         %     -> %%
       
   243     "
       
   244 
       
   245     | rs ws c |
       
   246 
       
   247     rs := ReadStream on: aString.
       
   248     ws := WriteStream on: ''.
       
   249     [ rs atEnd ] whileFalse: [
       
   250         c := rs next.
       
   251         c == $% ifTrue:[
       
   252             ws nextPutAll: '%%'.
       
   253         ] ifFalse:[
       
   254             ((c codePoint < 16r7F)
       
   255              and:[ ('+;?& ' includes:c) not ]) ifTrue: [ 
       
   256                 ws nextPut: c.
       
   257             ] ifFalse:[
       
   258                 ws nextPut: $%.
       
   259                 c codePoint printOn:ws base:16.
       
   260             ]
       
   261         ]
       
   262     ].
       
   263     ^ ws contents
       
   264 
       
   265     "
       
   266      self escape:'a b'      
       
   267      self escape:'a%b'    
       
   268      self escape:'a b'      
       
   269      self escape:'a+b'      
       
   270      self escape:'aäüöb'      
       
   271     "
       
   272 ! !
       
   273 
       
   274 !HTMLUtilities class methodsFor:'documentation'!
       
   275 
       
   276 version
       
   277     ^ '$Header: /cvs/stx/stx/libbasic2/HTMLUtilities.st,v 1.1 2008-12-03 12:23:06 cg Exp $'
       
   278 ! !