HTMLUtilities.st
changeset 4302 f50a1263f3ce
parent 4297 0908351381fd
child 4333 2e428045cb82
equal deleted inserted replaced
4301:0972b064ccee 4302:f50a1263f3ce
       
     1 "{ Encoding: utf8 }"
       
     2 
     1 "
     3 "
     2  COPYRIGHT (c) 2007 by eXept Software AG
     4  COPYRIGHT (c) 2007 by eXept Software AG
     3               All Rights Reserved
     5               All Rights Reserved
     4 
     6 
     5  This software is furnished under a license and may be used
     7  This software is furnished under a license and may be used
   107 
   109 
   108     ^self escapeCharacterEntities:aString andControlCharacters:self controlCharacters
   110     ^self escapeCharacterEntities:aString andControlCharacters:self controlCharacters
   109 
   111 
   110     "
   112     "
   111      self escapeCharacterEntities:'a<b'     
   113      self escapeCharacterEntities:'a<b'     
   112      self escapeCharacterEntities:'aöb'     
   114      self escapeCharacterEntities:'aöb'     
   113     "
   115     "
   114 
   116 
   115     "Modified: / 06-05-2015 / 16:30:13 / sr"
   117     "Modified: / 06-05-2015 / 16:30:13 / sr"
   116 !
   118 !
   117 
   119 
   132             self escapeCharacterEntities:aString andControlCharacters:controlCharacters on:ws.
   134             self escapeCharacterEntities:aString andControlCharacters:controlCharacters on:ws.
   133         ]
   135         ]
   134     
   136     
   135     "
   137     "
   136      self escapeCharacterEntities:'a<b'     
   138      self escapeCharacterEntities:'a<b'     
   137      self escapeCharacterEntities:'aöb'     
   139      self escapeCharacterEntities:'aöb'     
   138     "
   140     "
   139 
   141 
   140     "Created: / 06-05-2015 / 16:29:51 / sr"
   142     "Created: / 06-05-2015 / 16:29:51 / sr"
   141     "Modified (format): / 05-02-2017 / 17:59:32 / cg"
   143     "Modified (format): / 05-02-2017 / 17:59:32 / cg"
   142 !
   144 !
   173         ]
   175         ]
   174     ].
   176     ].
   175     
   177     
   176     "
   178     "
   177      self escapeCharacterEntities:'a<b'     
   179      self escapeCharacterEntities:'a<b'     
   178      self escapeCharacterEntities:'aöb'     
   180      self escapeCharacterEntities:'aöb'     
   179     "
   181     "
   180 
   182 
   181     "Created: / 05-02-2017 / 17:58:34 / cg"
   183     "Created: / 05-02-2017 / 17:58:34 / cg"
   182 !
   184 !
   183 
   185 
   195 
   197 
   196     ^self escapeCharacterEntities:aString andControlCharacters:self controlCharacters on:aStream
   198     ^self escapeCharacterEntities:aString andControlCharacters:self controlCharacters on:aStream
   197 
   199 
   198     "
   200     "
   199      self escapeCharacterEntities:'a<b'     
   201      self escapeCharacterEntities:'a<b'     
   200      self escapeCharacterEntities:'aöb'     
   202      self escapeCharacterEntities:'aöb'     
   201     "
   203     "
   202 
   204 
   203     "Created: / 05-02-2017 / 18:00:56 / cg"
   205     "Created: / 05-02-2017 / 18:00:56 / cg"
   204 !
   206 !
   205 
   207 
   252     "
   254     "
   253 !
   255 !
   254 
   256 
   255 unEscape:aString
   257 unEscape:aString
   256     "Convert escaped characters in an urls arguments or post fields back to their proper characters.
   258     "Convert escaped characters in an urls arguments or post fields back to their proper characters.
   257      Undoes the effect of urlEncode and urlEncode2.
   259      Undoes the effect of #urlEncoded: and #urlEncoded2:.
   258      These are:
   260      These are:
   259         + -> space
   261         + -> space
   260         %XX ascii as hex digits
   262         %XX ascii as hex digits
   261         %uXXXX unicode as hex digits
   263         %uXXXX unicode as hex digits   NOTE: %u is non-standard but implemented in MS IIS
   262         %% -> %
   264         %% -> %
   263     "
   265     "
   264 
   266 
   265     |rs ws c peekC isUnicodeEscaped|
   267     |rs ws c peekC isUnicodeEscaped|
   266 
   268 
   318      self unEscape:'/Home/a%C3%A4%C3%B6%C3%BCa'
   320      self unEscape:'/Home/a%C3%A4%C3%B6%C3%BCa'
   319     "
   321     "
   320 
   322 
   321     "Modified: / 09-01-2011 / 10:44:50 / cg"
   323     "Modified: / 09-01-2011 / 10:44:50 / cg"
   322     "Modified (comment): / 06-05-2015 / 15:40:04 / sr"
   324     "Modified (comment): / 06-05-2015 / 15:40:04 / sr"
       
   325     "Modified (comment): / 03-02-2017 / 17:06:32 / stefan"
   323 !
   326 !
   324 
   327 
   325 unescapeCharacterEntities:aString
   328 unescapeCharacterEntities:aString
   326     "helper to unescape character entities in a string.
   329     "helper to unescape character entities in a string.
   327      Normally, this is done by the HTMLParser when it scans text,
   330      Normally, this is done by the HTMLParser when it scans text,
   419     "Created: / 06-05-2015 / 16:56:14 / sr"
   422     "Created: / 06-05-2015 / 16:56:14 / sr"
   420     "Modified: / 18-05-2015 / 12:13:35 / sr"
   423     "Modified: / 18-05-2015 / 12:13:35 / sr"
   421 !
   424 !
   422 
   425 
   423 urlEncode2:aStringOrStream on:ws
   426 urlEncode2:aStringOrStream on:ws
       
   427     <resource: #obsolete>
   424     "helper to escape invalid/dangerous characters in an urls arguments.
   428     "helper to escape invalid/dangerous characters in an urls arguments.
   425      Similar to urlEncode, but treats '*','~' and spaces differently.
   429      Similar to urlEncode, but treats '*','~' and spaces differently.
   426      (some clients, such as bitTorrent seem to require this - time will tell...)
   430      (some clients, such as bitTorrent seem to require this - time will tell...)
   427      Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_', is encoded using 
   431      Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_', is encoded using 
   428      the '%nn' format, where nn is the hexadecimal value of the byte.
   432      the '%nn' format, where nn is the hexadecimal value of the byte.
   454     "Modified: / 06-05-2015 / 15:43:39 / sr"
   458     "Modified: / 06-05-2015 / 15:43:39 / sr"
   455 !
   459 !
   456 
   460 
   457 urlEncode:aStringOrStream on:ws
   461 urlEncode:aStringOrStream on:ws
   458     "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
   462     "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
   459      Similar to urlEncode2, but treats '*','~' and spaces differently.
   463 
   460      (some clients, such as bitTorrent seem to require urlEncode2 - time will tell...)
   464      Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', is encoded using 
   461      Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '*', is encoded using 
       
   462      the '%nn' format, where nn is the hexadecimal value of the byte.
   465      the '%nn' format, where nn is the hexadecimal value of the byte.
       
   466      Characters outside the ASCII range are encoded into utf8 first.
   463      Spaces are encoded as '+'.
   467      Spaces are encoded as '+'.
   464         see: application/x-www-form-urlencoded  
   468         see: application/x-www-form-urlencoded  
   465         see: RFC1738"
   469         see: https://tools.ietf.org/html/rfc3986 (obsoletes RFC1738)"
   466 
   470 
   467     |rs c cp space|
   471     |rs c|
   468 
   472 
   469     space := Character space.
       
   470     rs := aStringOrStream readStream.
   473     rs := aStringOrStream readStream.
   471 
   474 
   472     [rs atEnd] whileFalse: [
   475     [(c := rs nextOrNil) notNil] whileTrue: [
   473         c := rs next.
   476         |cp|
   474 
   477 
   475         (c isLetterOrDigit or:[ '-_.*' includes:c ]) ifTrue:[
   478         (c isLetterOrDigit or:['-_.~' includes:c]) ifTrue:[
   476             ws nextPut:c.
   479             ws nextPut:c.
   477         ] ifFalse:[
   480         ] ifFalse:[
   478             c == space ifTrue:[
   481             c == Character space ifTrue:[
   479                 ws nextPut:$+.
   482                 ws nextPut:$+.
   480             ] ifFalse:[
   483             ] ifFalse:[
   481                 ws nextPut: $%.
   484                 cp := c codePoint.
   482                 (cp := c codePoint) > 16rFF ifTrue:[
   485                 cp > 16r7F ifTrue:[
   483                     ws nextPut: $u.
   486                     c utf8Encoded do:[:eachUtf8Char|
   484                     cp printOn:ws base:16 size:4 fill:$0.
   487                         ws nextPut: $%.
       
   488                         eachUtf8Char codePoint printOn:ws base:16 size:2 fill:$0.
       
   489                     ].
   485                 ] ifFalse:[
   490                 ] ifFalse:[
       
   491                     ws nextPut: $%.
   486                     cp printOn:ws base:16 size:2 fill:$0.
   492                     cp printOn:ws base:16 size:2 fill:$0.
   487                 ].
   493                 ].
   488             ].
   494             ].
   489         ].
   495         ].
   490     ].
   496     ].
   491 
   497 
       
   498     "
       
   499         self urlEncoded:'hokus pokus fidibus*-/~'
       
   500         self urlEncoded:'Ützel Brötzel*-/~'
       
   501         self urlEncoded:'χαιρε'
       
   502     "
       
   503 
   492     "Modified: / 09-01-2011 / 10:43:30 / cg"
   504     "Modified: / 09-01-2011 / 10:43:30 / cg"
   493     "Modified: / 06-05-2015 / 16:06:52 / sr"
   505     "Modified: / 06-05-2015 / 16:06:52 / sr"
       
   506     "Modified (comment): / 07-02-2017 / 14:51:42 / stefan"
   494 !
   507 !
   495 
   508 
   496 urlEncoded2: aString
   509 urlEncoded2: aString
       
   510     <resource: #obsolete>
   497     "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
   511     "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
   498      Similar to urlEncoded, but treats '*','~' and spaces differently.
   512      Similar to urlEncoded, but treats '*','~' and spaces differently.
   499      (some clients, such as bitTorrent seem to require this - time will tell...)
   513      (some clients, such as bitTorrent seem to require this - time will tell...)
   500      Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', is encoded using 
   514      Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', is encoded using 
   501      the '%nn' format, where nn is the hexadecimal value of the byte.
   515      the '%nn' format, where nn is the hexadecimal value of the byte.
   521     "Created: / 09-01-2011 / 10:34:50 / cg"
   535     "Created: / 09-01-2011 / 10:34:50 / cg"
   522 !
   536 !
   523 
   537 
   524 urlEncoded: aString
   538 urlEncoded: aString
   525     "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
   539     "helper to escape invalid/dangerous characters in an urls arguments or post-fields.
   526      Similar to urlEncoded2, but treats '*','~' and spaces differently.
   540 
   527      (some clients, such as bitTorrent seem to require urlEncoded2 - time will tell...)
   541      Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', is encoded using 
   528      Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '*', is encoded using 
       
   529      the '%nn' format, where nn is the hexadecimal value of the byte.
   542      the '%nn' format, where nn is the hexadecimal value of the byte.
       
   543      Characters outside the ASCII range are encoded into utf8 first.
   530      Spaces are encoded as '+'.
   544      Spaces are encoded as '+'.
   531         see: application/x-www-form-urlencoded  
   545         see: application/x-www-form-urlencoded  
   532         see: RFC1738"
   546         see: https://tools.ietf.org/html/rfc3986 (obsoletes RFC1738)"
   533 
   547 
   534     |ws|
   548     |ws|
   535 
   549 
   536     ws := String writeStreamWithInitialSize:aString size.
   550     ws := WriteStream on:(String new:aString size + 20).
   537     self urlEncode:aString on:ws.
   551     self urlEncode:aString on:ws.
   538     ^ ws contents
   552     ^ ws contents
   539 
   553 
   540 
   554 
   541     "
   555     "
   545       self unEscape:(self urlEncoded:'-_.*%exept;')
   559       self unEscape:(self urlEncoded:'-_.*%exept;')
   546       self urlEncoded:'-_.*%exept;'
   560       self urlEncoded:'-_.*%exept;'
   547     "
   561     "
   548 
   562 
   549     "Modified: / 09-01-2011 / 10:43:37 / cg"
   563     "Modified: / 09-01-2011 / 10:43:37 / cg"
       
   564     "Modified: / 07-02-2017 / 14:54:12 / stefan"
   550 !
   565 !
   551 
   566 
   552 withAllSpecialHTMLCharactersEscaped:aStringOrCharacter
   567 withAllSpecialHTMLCharactersEscaped:aStringOrCharacter
   553     "replace ampersand, less, greater and quotes by html-character escapes"
   568     "replace ampersand, less, greater and quotes by html-character escapes"
   554 
   569 
   688     "
   703     "
   689      self escape:'a b'      
   704      self escape:'a b'      
   690      self escape:'a%b'    
   705      self escape:'a%b'    
   691      self escape:'a b'      
   706      self escape:'a b'      
   692      self escape:'a+b'      
   707      self escape:'a+b'      
   693      self escape:'aäüöb'      
   708      self escape:'aäüöb'      
   694     "
   709     "
   695 
   710 
   696     "Modified: / 06-05-2015 / 16:07:18 / sr"
   711     "Modified: / 06-05-2015 / 16:07:18 / sr"
   697     "Modified: / 25-11-2016 / 16:37:53 / cg"
   712     "Modified: / 25-11-2016 / 16:37:53 / cg"
   698 ! !
   713 ! !