"{ Encoding: utf8 }"
"
COPYRIGHT (c) 2007 by eXept Software AG
All Rights Reserved
This software is furnished under a license and may be used
only in accordance with the terms of that license and with the
inclusion of the above copyright notice. This software may not
be provided or otherwise made available to, or used by, any
other person. No title to or ownership of the software is
hereby transferred.
"
"{ Package: 'stx:libbasic2' }"
"{ NameSpace: Smalltalk }"
Object subclass:#HTMLUtilities
instanceVariableNames:''
classVariableNames:'AmpersandEscapes EscapeControlCharacters HtmlEntityToCharacter
MathAmpersandEscapes'
poolDictionaries:''
category:'Net-Communication-Support'
!
!HTMLUtilities class methodsFor:'documentation'!
copyright
"
COPYRIGHT (c) 2007 by eXept Software AG
All Rights Reserved
This software is furnished under a license and may be used
only in accordance with the terms of that license and with the
inclusion of the above copyright notice. This software may not
be provided or otherwise made available to, or used by, any
other person. No title to or ownership of the software is
hereby transferred.
"
!
documentation
"
Collected support functions to deal with HTML.
Used both by HTML generators (DocGenerator), HTMLParsers and the webServer.
Therefore, it has been put into libbasic2.
"
! !
!HTMLUtilities class methodsFor:'common actions'!
openLauncherOnDisplay:displayName
<resource: #obsolete>
"obsolete - do not use"
self obsoleteMethodWarning.
Error handle:[:ex |
^ ex description
] do:[
NewLauncher openLauncherOnInitializedDisplayNamed:displayName
]
"Modified: / 01-06-2010 / 11:25:12 / sr"
! !
!HTMLUtilities class methodsFor:'constants'!
ampersandEscapes
AmpersandEscapes isNil ifTrue:[
AmpersandEscapes := IdentityDictionary new.
#(
#nbsp 160 "/ non-breakable space - do something magic...
#emspace 160 "/ temporary
#enspace 160
#lt $<
#gt $>
#amp $&
#quot $"
#apos $'
#copy 169 "/ copyright
#reg 174 "/ registered
#cent 162
#pound 163
#yen 165
#brvbar $|
#sect 167
#laquo 171
#raquo 187
#plusmn 177
#micro 181
#middot 183
#frac14 188
#frac12 189
#frac34 190
#iquest 191
#iexcl 16rA1
#div 247
#divide 247
#not 16rAC
#shy 16rAD
#para 16rB6
#deg 176
#sup1 185
#sup2 178
#sup3 179
#ordm 16rBA
#ordf 16rAA
#macr 16rAF
#cedil 16rB8
#uml 16rA8
#acute 16rB4
#curren 16rA4
#Oslash 216
#oslash 248
#aring 229
#Aring 197
#ccedil 231
#Ccedil 199
#thorn 16rFE
#THORN 16rDE
#Thorn 15rDE
#eth 16rF0
#ETH 16rD0
#Eth 16rD0
#atilde 227
#Atilde 195
#ntilde 241
#Ntilde 209
#otilde 245
#Otilde 213
#auml 228
#Auml 196
#uuml 252
#Uuml 220
#ouml 246
#Ouml 214
#euml 235
#Euml 203
#iuml 239
#Iuml 207
#yuml 255
#acirc 226
#Acirc 194
#icirc 238
#Icirc 206
#ecirc 234
#Ecirc 202
#ucirc 251
#Ucirc 219
#ocirc 244
#Ocirc 212
#agrave 224
#Agrave 192
#egrave 232
#Egrave 200
#igrave 236
#Igrave 204
#ograve 242
#Ograve 210
#ugrave 249
#Ugrave 217
#aacute 225
#Aacute 193
#eacute 233
#Eacute 201
#iacute 237
#Iacute 205
#oacute 243
#Oacute 211
#uacute 250
#Uacute 218
#yacute 16rFD
#Yacute 16rDD
#szlig 223
#aelig 230
#AElig 198
"/ unicode
#OElig 16r0152 "/ 8859-2 (latin2)
#oelig 16r0153 "/ 8859-2 (latin2)
#ljlig 16r01C9 "/ 8859-2 (latin2)
#LJlig 16r01C7 "/ 8859-2 (latin2)
#Ljlig 16r01C8 "/ 8859-2 (latin2)
#Scaron 16r0160 "/ 8859-2 (latin2)
#scaron 16r0161 "/ 8859-2 (latin2)
#Yuml 16r0178 "/ 8859-2 (latin2)
#Alpha 16r0391 "/ greek alpha
#Beta 16r0392
#Gamma 16r0393
#Delta 16r0394
#Epsilon 16r0395
#Zeta 16r0396
#Eta 16r0397
#Theta 16r0398
#Iota 16r0399
#Kappa 16r039A
#Lambda 16r039B
#Mu 16r039C
#Nu 16r039D
#Xi 16r039E
#Omicron 16r039F
#Pi 16r03A0
#Rho 16r03A1
#Sigma 16r03A3
#Tau 16r03A4
#Upsilon 16r03A5
#Phi 16r03A6
#Chi 16r03A7
#Psi 16r03A8
#Omega 16r03A9
#alpha 16r03B1 "/ greek alpha
#beta 16r03B2
#gamma 16r03B3
#delta 16r03B4
#epsilon 16r03B5
#zeta 16r03B6
#eta 16r03B7
#theta 16r03B8
#iota 16r03B9
#kappa 16r03BA
#lambda 16r03BB
#mu 16r03BC
#nu 16r03BD
#xi 16r03BE
#omicron 16r03BF
#pi 16r03C0
#rho 16r03C1
#sigmaf 16r03C2
#sigma 16r03C3
#tau 16r03C4
#upsilon 16r03C5
#phi 16r03C6
#chi 16r03C7
#psi 16r03C8
#omega 16r03C9
#thetasym 16r03D1
#upsih 16r03D2
#piv 16r03D6
#ensp 16r2002
#emsp 16r2003
#thinsp 16r2009 "/ thin space
#zwnj 16r200C "/ zero width non-joiner
#zwj 16r200D "/ zero width joiner
#lrm 16r200E "/ left-to-right mark
#rlm 16r200F "/ right-to-left mark
#ndash 16r2013
#mdash 16r2014
#lsquo 16r2018 "/ left single quot. mark
#rsquo 16r2019 "/ right single quot. mark
#sbquo 16r201A "/ single low-9 quot. mark
#ldquo 16r201C "/ left double quot. mark
#rdquo 16r201D "/ right double quot. mark
#bdquo 16r201E "/ double low-9 quot. mark
#dagger 16r2020
#Dagger 16r2021 "/ double dagger
#bull 16r2022
#hellip 16r2026
#prime 16r2032
#Prime 16r2033
#oline 16r203E
#frasl 16r2044
#euro 16r20AC "/ 8859-16
#weierp 16r2118
#image 16r2111
#real 16r211C
#trade 16r2122
#angst 16r212B
#alefsym 16r2135
#larr 16r2190
#uarr 16r2191
#rarr 16r2192
#darr 16r2193
#harr 16r2194
#crarr 16r21B5
#lArr 16r21D0
#uArr 16r21D1
#rArr 16r21D2
#dArr 16r21D3
#hArr 16r21D4
#forall 16r2200
#part 16r2202
#exist 16r2203
#empty 16r2205
#nabla 16r2207
#isin 16r2208
#notin 16r2209
#ni 16r220B
#prod 16r220F
#sum 16r2211
#minus 16r2212
#lowast 16r2217
#radic 16r221A
#prop 16r221D
#infin 16r221E
#ang90 16r221F
#ang 16r2220
#angmsd 16r2221
#angsph 16r2222
#and 16r2227
#or 16r2228
#cap 16r2229
#cup 16r222A
#int 16r222B
#there4 16r2234
#sim 16r223C
#cong 16r2245
#asymp 16r2248
#ne 16r2260
#equiv 16r2261
#le 16r2264
#ge 16r2265
#sub 16r2282
#sup 16r2283
#nsub 16r2284
#sube 16r2286
#supe 16r2287
#oplus 16r2295
#otimes 16r2297
#perp 16r22A5
#sdot 16r22C5
#lceil 16r2308
#rceil 16r2309
#lfloor 16r230A
#rfloor 16r230B
#lang 16r2329
#rang 16r232A
#loz 16r25CA
#spades 16r2660
#clubs 16r2663
#hearts 16r2665
#diams 16r2666
) pairWiseDo:[:key :val |
|v|
v := val.
val isInteger ifTrue:[
v := Character value:v
].
AmpersandEscapes at:key put:v
].
].
^ AmpersandEscapes
"Created: / 01-04-2019 / 14:34:25 / Claus Gittinger"
!
htmlEntityToCharacter
^ self ampersandEscapes
"Modified: / 01-04-2019 / 14:36:41 / Claus Gittinger"
!
mathAmpersandEscapes
"these are obsolete now, as HTML4 added the missing stuff in the meantime."
MathAmpersandEscapes isNil ifTrue:[
MathAmpersandEscapes := IdentityDictionary new.
#(
"/ #alpha 16r61 "/ greek alpha
"/ #beta 16r62 "/ greek beta
"/ #chi 16r63
"/ #delta 16r64
"/ #epsilon 16r65 "/ symbol characterSet has no epsilon
#vepsilon 16r65
"/ #phi 16r66
"/ #gamma 16r67
"/ #eta 16r68
"/ #iota 16r69
#varphi 16r6A
"/ #kappa 16r6B
"/ #lambda 16r6C
"/ #mu 16r6D
"/ #nu 16r6E
"/ #omicron 16r6F
"/ #pi 16r70
"/ #theta 16r71
#vtheta 16r71 "/ symbol characterSet has no vtheta
"/ #rho 16r72
#varrho 16r72 "/ symbol characterSet has no varrho
"/ #sigma 16r73
#vsigma 16r56
"/ #tau 16r74
"/ #upsilon 16r75
#varpi 16r76
"/ #omega 16r77
"/ #xi 16r78
"/ #psi 16r79
"/ #zeta 16r7A
"/ #Alpha 16r41 "/ greek alpha
"/ #Beta 16r42 "/ greek beta
"/ #Chi 16r43
"/ #Delta 16r44
"/ #Epsilon 16r45
"/ #Phi 16r46
"/ #Gamma 16r47
"/ #Eta 16r48
"/ #Iota 16r49
"/
"/ #Kappa 16r4B
"/ #Lambda 16r4C
"/ #Mu 16r4D
"/ #Nu 16r4E
"/ #Omicron 16r4F
"/ #Pi 16r50
"/ #Theta 16r51
"/ #Rho 16r52
"/ #Sigma 16r53
"/ #Tau 16r54
"/ #Upsilon 16rA1
"/ #Omega 16r57
"/ #Xi 16r58
"/ #Psi 16r59
"/ #Zeta 16r5A
"/ #forall 16r22
#exist 16r24
#exists 16r24
#aleph 16rC0 "/ no, this is not alf ;-)
#Re 16rC2 "/ R fraktur
#Im 16rC1 "/ I fraktur
#infty 16rA5
#leq 16rA3 "/ less-equal
#geq 16rB3 "/ greater-equal
#equiv 16rBA "/ equivalent
#approx 16rBB
#cong 16r40
"/ #neq 16rB9
"/ #plusmn 16rB1
#times 16rB4
"/ #div 16rB8
#oplus 16rC5
#otimes 16rC4
#oslash 16rC5
#sum 16rE5
#prod 16rD5
#uparrow 16rAD
#leftarrow 16rAC
#downarrow 16rAF
#rightarrow 16rAE
#leftrightarrow 16rAB
#Uparrow 16rDD
#Leftarrow 16rDC
#Downarrow 16rDF
#Rightarrow 16rDE
#Leftrightarrow 16rDB
#supset 16rC9
#supseteq 16rCA
#subset 16rCC
#subseteq 16rCD
#vee 16rDA
#wedge 16rD9
#neg 16rD8
#ldots 16rBC
"/ #lfloor 16rEB
"/ #rfloor 16rFB
"/ #lceil 16rE9
"/ #rceil 16rF9
) pairWiseDo:[:key :val |
|v|
v := val.
val isInteger ifTrue:[
v := Character value:v
].
MathAmpersandEscapes at:key put:v
].
].
^ MathAmpersandEscapes
"Created: / 01-04-2019 / 14:40:51 / Claus Gittinger"
! !
!HTMLUtilities class methodsFor:'helpers'!
characterFromHtmlEntityNamed:anHtmlEntityName
^ self ampersandEscapes
at:anHtmlEntityName asSymbol
ifAbsent:[
self halt.
"/ where to get the mapping???
"/ Answer: It is a mess. A good start may be
"/ https://www.w3.org/TR/html4/sgml/entities.html with 252 named entities.
"/ I guess an actual lookup table would be adequate.
$~
]
"Modified: / 01-04-2019 / 14:36:18 / Claus Gittinger"
"Modified: / 04-04-2019 / 10:46:22 / Maren"
!
controlCharacters
EscapeControlCharacters isNil ifTrue:[
EscapeControlCharacters := Dictionary new.
EscapeControlCharacters at:$< put:'<'.
EscapeControlCharacters at:$> put:'>'.
EscapeControlCharacters at:$& put:'&'.
EscapeControlCharacters at:$" put:'"'.
"/ EscapeControlCharacters at:$' put:'''.
].
^ EscapeControlCharacters.
"Modified (comment): / 06-05-2015 / 16:17:31 / sr"
!
copyReplaceCharactersWithHtmlEntitiesIn:aString
|stream htmlEntity|
stream := '' writeStream.
(aString ? '') do:[:eachCharacter |
htmlEntity := self htmlEntityForCharacter:eachCharacter.
htmlEntity isNil ifTrue:[
stream nextPut:eachCharacter.
] ifFalse:[
stream
nextPut:$&;
nextPutAll:htmlEntity;
nextPut:$;.
].
].
^ stream contents
!
escapeCharacterEntities:aString
"helper to escape invalid/dangerous characters in html strings.
These are:
control characters, '<', '>', '&' and space -> %XX ascii as hex digits
% -> %%
"
"/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
"/ we should refactor this into one method only (can we do hex escapes always ?).
"/ Notice, that these two methods came into existance due to historic reasons
"/ and were developed independent of each other, but later moved to this common place.
^self escapeCharacterEntities:aString andControlCharacters:self controlCharacters
"
self escapeCharacterEntities:'a<b'
self escapeCharacterEntities:'aöb'
"
"Modified: / 06-05-2015 / 16:30:13 / sr"
!
escapeCharacterEntities:aString andControlCharacters:controlCharacters
"helper to escape invalid/dangerous characters in html strings.
These are:
control characters, '<', '>', '&' and space -> %XX ascii as hex digits
% -> %%
"
"/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
"/ we should refactor this into one method only (can we do hex escapes always ?).
"/ Notice, that these two methods came into existance due to historic reasons
"/ and were developed independent of each other, but later moved to this common place.
^ String
streamContents:[:ws |
self escapeCharacterEntities:aString andControlCharacters:controlCharacters on:ws.
]
"
self escapeCharacterEntities:'a<b'
self escapeCharacterEntities:'aöb'
"
"Created: / 06-05-2015 / 16:29:51 / sr"
"Modified (format): / 05-02-2017 / 17:59:32 / cg"
!
escapeCharacterEntities:aString andControlCharacters:controlCharacters on:aWriteStream
"helper to escape invalid/dangerous characters in html strings.
These are:
control characters, '<', '>', '&' and space -> %XX ascii as hex digits
% -> %%
"
"/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
"/ we should refactor this into one method only (can we do hex escapes always ?).
"/ Notice, that these two methods came into existance due to historic reasons
"/ and were developed independent of each other, but later moved to this common place.
|rs c controlString|
rs := ReadStream on: aString.
[ rs atEnd ] whileFalse: [
c := rs next.
controlString := controlCharacters notEmptyOrNil ifTrue:[controlCharacters at:c ifAbsent:nil] ifFalse:[nil].
controlString notNil ifTrue:[
aWriteStream nextPutAll:controlString.
] ifFalse:[
c codePoint > 16r7F ifTrue:[
aWriteStream nextPutAll:'&#'.
c codePoint printOn:aWriteStream.
aWriteStream nextPut:$;.
] ifFalse:[
aWriteStream nextPut:c.
]
]
].
"
self escapeCharacterEntities:'a<b'
self escapeCharacterEntities:'aöb'
"
"Created: / 05-02-2017 / 17:58:34 / cg"
"Modified: / 17-02-2017 / 10:34:20 / stefan"
!
escapeCharacterEntities:aString on:aStream
"helper to escape invalid/dangerous characters in html strings.
These are:
control characters, '<', '>', '&' and space -> %XX ascii as hex digits
% -> %%
"
"/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
"/ we should refactor this into one method only (can we do hex escapes always ?).
"/ Notice, that these two methods came into existance due to historic reasons
"/ and were developed independent of each other, but later moved to this common place.
^self escapeCharacterEntities:aString andControlCharacters:self controlCharacters on:aStream
"
self escapeCharacterEntities:'a<b'
self escapeCharacterEntities:'aöb'
"
"Created: / 05-02-2017 / 18:00:56 / cg"
!
extractCharSetEncodingFromContentType:contentTypeLine
|idx rest encoding|
idx := contentTypeLine findString:'charset='.
idx == 0 ifTrue:[
^ nil
].
rest := (contentTypeLine copyFrom:idx+'charset=' size) withoutSeparators.
idx := (rest indexOfSeparator) min:(rest indexOf:$;).
idx == 0 ifTrue:[
encoding := rest
] ifFalse:[
encoding := rest copyTo:idx-1.
].
(encoding startsWith:$") ifTrue:[
encoding := encoding copyFrom:2 to:(encoding indexOf:$" startingAt:3)-1.
].
^ encoding.
"
self extractCharSetEncodingFromContentType:'text/html; charset=ascii'
self extractCharSetEncodingFromContentType:'text/html; charset='
self extractCharSetEncodingFromContentType:'text/html; fooBar=bla'
self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8'
self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8; bla=fasel'
"
!
extractMimeTypeFromContentType:contentTypeLine
|idx mimeAndEncoding|
idx := contentTypeLine indexOf:$:.
mimeAndEncoding := (contentTypeLine copyFrom:idx+1) withoutSeparators.
(mimeAndEncoding includes:$;) ifFalse:[
^ mimeAndEncoding
].
idx := mimeAndEncoding indexOf:$;.
^ mimeAndEncoding copyTo:idx-1
"
self extractMimeTypeFromContentType:'text/html; charset=ascii'
self extractMimeTypeFromContentType:'text/html; '
self extractMimeTypeFromContentType:'text/html'
self extractMimeTypeFromContentType:'text/xml; charset=utf-8'
"
!
htmlEntityForCharacter:aCharacter
aCharacter == Character space ifTrue:[^ nil].
aCharacter isLetterOrDigit ifTrue:[^ nil].
^ self ampersandEscapes
keyAtValue:aCharacter
ifAbsent:nil
"Modified: / 01-04-2019 / 14:36:25 / Claus Gittinger"
!
unEscape:aString
"Convert escaped characters in an urls arguments or post fields back to their proper characters.
Undoes the effect of #urlEncoded: and #urlEncoded2:.
These are:
+ -> space
%XX ascii as hex digits
%uXXXX unicode as hex digits NOTE: %u is non-standard bit implemented in MS IIS
%% -> %
"
|rs ws c peekC isUnicodeEscaped|
aString isNil ifTrue:[
^ nil.
].
(aString includesAny:'+%') ifFalse:[
^ aString
].
rs := ReadStream on: aString.
ws := CharacterWriteStream on: ''.
isUnicodeEscaped := false.
[rs atEnd] whileFalse:[
c := rs next.
isUnicodeEscaped ifTrue:[
isUnicodeEscaped := false.
c := (Integer readFrom:(rs nextAvailable:4) radix:16) asCharacter.
] ifFalse:[
c == $+ ifTrue:[
c := Character space.
] ifFalse:[
c == $% ifTrue:[
peekC := rs peek.
(peekC notNil and:[peekC isHexDigit]) ifTrue:[
c := (Integer readFrom:(rs nextAvailable:2) radix:16) asCharacter.
] ifFalse:[
(peekC notNil and:[peekC == $u]) ifTrue:[
isUnicodeEscaped := true.
c := nil.
] ifFalse:[
c := rs next.
].
].
].
].
].
c notNil ifTrue:[
ws nextPut:c.
].
].
^ ws contents
"
self unEscape:'a%20b'
self unEscape:'a%%b'
self unEscape:'a+b'
self unEscape:'a%+b'
self unEscape:'a%'
self unEscape:'a%2'
self unEscape:'/Home/a%C3%A4%C3%B6%C3%BCa'
"
"Modified: / 09-01-2011 / 10:44:50 / cg"
"Modified (comment): / 06-05-2015 / 15:40:04 / sr"
"Modified (comment): / 03-02-2017 / 17:06:32 / stefan"
!
unescapeCharacterEntities:aString
"helper to unescape character entities in a string.
Normally, this is done by the HTMLParser when it scans text,
but seems to be also used in post-data fields which contain non-ascii characters
(for example: the login postdata of expeccALM).
Sequences are:
&<specialName>;
&#<decimal>;
&#x<hex>
From Reference:
http://wiki.selfhtml.org/wiki/Referenz:HTML/Zeichenreferenz#HTML-eigene_Zeichen
"
|rs ws c
entity entityNumberPart
htmlEntityMatchingFailed characterFromHtmlEntity|
(aString includes:$&) ifFalse:[
^ aString
].
rs := ReadStream on:aString.
ws := CharacterWriteStream on:''.
[rs atEnd] whileFalse:[
c := rs next.
c == $& ifTrue:[
entity := rs upToMatching:[:ch | ch == $;].
entity notEmpty ifTrue:[
rs peek == $; ifTrue:[ "/ something between & and ;
rs next. "/ read over semicolon
htmlEntityMatchingFailed := false.
entity first == $# ifTrue:[ "/ entity is determined as number
entityNumberPart := entity copyFrom:2.
entityNumberPart notEmpty ifTrue:[
entityNumberPart first == $x ifTrue:[
entityNumberPart := entityNumberPart copyFrom:2.
entityNumberPart notEmpty ifTrue:[
ws nextPut:(Character value:(Integer readFrom:entityNumberPart radix:16)).
] ifFalse:[
htmlEntityMatchingFailed := true.
].
] ifFalse:[
entityNumberPart isNumeric ifTrue:[
ws nextPut:(Character value:(Integer readFrom:entityNumberPart)).
] ifFalse:[
htmlEntityMatchingFailed := true.
].
].
] ifFalse:[
htmlEntityMatchingFailed := true.
].
] ifFalse:[
characterFromHtmlEntity := self characterFromHtmlEntityNamed:entity.
characterFromHtmlEntity notNil ifTrue:[
ws nextPut:characterFromHtmlEntity.
] ifFalse:[
htmlEntityMatchingFailed := true.
].
].
htmlEntityMatchingFailed ifTrue:[
ws nextPut:c.
ws nextPutAll:entity.
ws nextPut:$;.
].
] ifFalse:[
ws nextPut:c.
ws nextPutAll:entity.
].
] ifFalse:[
ws nextPut:c.
].
] ifFalse:[
ws nextPut:c.
].
].
^ ws contents
"
self unescapeCharacterEntities:'&;'
self unescapeCharacterEntities:'&16368;'
self unescapeCharacterEntities:'&16368;&16368'
self unescapeCharacterEntities:'&16368;<'
self unescapeCharacterEntities:'&16368;<'
self unescapeCharacterEntities:'꿾'
self unescapeCharacterEntities:'"<foo'
self unescapeCharacterEntities:'&funny;<foo'
"
"Created: / 06-05-2015 / 16:56:14 / sr"
"Modified: / 18-05-2015 / 12:13:35 / sr"
"Modified: / 17-02-2017 / 10:18:35 / stefan"
!
urlDecoded:aString
"Convert escaped characters in an urls arguments or post fields back to their proper characters.
Undoes the effect of #urlEncoded: and #urlEncoded2:.
These are:
+ -> space
%XX ascii as hex digits
%uXXXX unicode as hex digits NOTE: %u is non-standard bit implemented in MS IIS
%% -> %
"
^ (self unEscape:aString) utf8Decoded
"
self urlDecoded:'a%20b'
self urlDecoded:'a%%b'
self urlDecoded:'a+b'
self urlDecoded:'a%+b'
self urlDecoded:'a%'
self urlDecoded:'a%2'
self urlDecoded:'/Home/a%C3%A4%C3%B6%C3%BCa'
"
"Created: / 26-08-2018 / 12:49:24 / Claus Gittinger"
!
urlEncode2:aStringOrStream on:ws
<resource: #obsolete>
"helper to escape invalid/dangerous characters in an urls arguments.
Similar to urlEncode, but treats '*','~' and spaces differently.
(some clients, such as bitTorrent seem to require this - time will tell...)
Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_', is encoded using
the '%nn' format, where nn is the hexadecimal value of the byte.
see: RFC1738"
|rs c space|
space := Character space.
rs := aStringOrStream readStream.
[rs atEnd] whileFalse: [
c := rs next.
(c isLetterOrDigit or:[ ('-_.' includes:c) ]) ifTrue:[
ws nextPut:c.
] ifFalse:[
ws nextPut: $%.
c codePoint > 16rFF ifTrue:[
ws nextPut: $u.
c codePoint printOn:ws base:16 size:4 fill:$0.
] ifFalse:[
c codePoint printOn:ws base:16 size:2 fill:$0.
]
].
].
"Created: / 09-01-2011 / 10:32:27 / cg"
"Modified: / 09-01-2011 / 13:11:17 / cg"
"Modified: / 06-05-2015 / 15:43:39 / sr"
!
urlEncode:aStringOrStream on:ws
"helper to escape invalid/dangerous characters in an urlÄs argument or post-fields.
Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~',
is encoded using the '%nn' format, where nn is the hexadecimal value of the byte.
Characters outside the ASCII range are encoded into utf8 first.
Spaces are encoded as '+'.
see: application/x-www-form-urlencoded
see: https://tools.ietf.org/html/rfc3986 (obsoletes RFC1738)"
|rs c|
rs := aStringOrStream readStream.
[(c := rs nextOrNil) notNil] whileTrue: [
|cp|
(c isLetterOrDigit or:['-_.~' includes:c]) ifTrue:[
ws nextPut:c.
] ifFalse:[
c == Character space ifTrue:[
ws nextPut:$+.
] ifFalse:[
cp := c codePoint.
cp > 16r7F ifTrue:[
c utf8Encoded do:[:eachUtf8Char|
ws nextPut: $%.
eachUtf8Char codePoint printOn:ws base:16 size:2 fill:$0.
].
] ifFalse:[
ws nextPut: $%.
cp printOn:ws base:16 size:2 fill:$0.
].
].
].
].
"
self urlEncoded:'hokus pokus fidibus*-/~'
self urlEncoded:'Ützel Brötzel*-/~'
self urlEncoded:'χαιρε'
self urlDecoded:(self urlEncoded:'hokus pokus fidibus*-/~')
self urlDecoded:(self urlEncoded:'Ützel Brötzel*-/~')
self urlDecoded:(self urlEncoded:'χαιρε')
"
"Modified: / 09-01-2011 / 10:43:30 / cg"
"Modified: / 06-05-2015 / 16:06:52 / sr"
"Modified (comment): / 07-02-2017 / 14:51:42 / stefan"
"Modified (comment): / 26-08-2018 / 12:50:04 / Claus Gittinger"
!
urlEncoded2: aString
<resource: #obsolete>
"helper to escape invalid/dangerous characters in an urls arguments or post-fields.
Similar to urlEncoded, but treats '*','~' and spaces differently.
(some clients, such as bitTorrent seem to require this - time will tell...)
Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', is encoded using
the '%nn' format, where nn is the hexadecimal value of the byte.
see: application/x-www-form-urlencoded
see: RFC1738"
|ws|
ws := String writeStreamWithInitialSize:aString size.
self urlEncode2:aString on:ws.
^ ws contents
"
self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
self urlEncoded2:'_-.*Frankfurt(Main) Hbf'
self unEscape:(self urlEncoded:'-_.*%exept;')
self urlEncoded2:'-_.*%exept;'
self urlEncoded:'-_.*%exept;'
"
"Created: / 09-01-2011 / 10:34:50 / cg"
!
urlEncoded: aString
"helper to escape invalid/dangerous characters in an urls arguments or post-fields.
Any byte not in the set 0-9, a-z, A-Z, '.', '-', '_' and '~', is encoded using
the '%nn' format, where nn is the hexadecimal value of the byte.
Characters outside the ASCII range are encoded into utf8 first.
Spaces are encoded as '+'.
see: application/x-www-form-urlencoded
see: https://tools.ietf.org/html/rfc3986 (obsoletes RFC1738)"
|ws|
ws := WriteStream on:(String new:aString size + 20).
self urlEncode:aString on:ws.
^ ws contents
"
self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
self urlEncoded:'_-.*Frankfurt(Main) Hbf'
self unEscape:(self urlEncoded:'-_.*%exept;')
self urlEncoded:'-_.*%exept;'
"
"Modified: / 09-01-2011 / 10:43:37 / cg"
"Modified: / 07-02-2017 / 14:54:12 / stefan"
!
withAllSpecialHTMLCharactersEscaped:aStringOrCharacter
"replace ampersand, less, greater and quotes by html-character escapes"
"/ TODO: this is similar to escapeCharacterEntities.
"/ we should refactor this into one method only (can we do hex escapes always ?).
"/ Notice, that these two methods came into existance due to historic reasons
"/ and were developed independent of each other, but later moved to this common place.
|resultStream|
"/ orgs := #( $& $< $> $" $').
"/ repls := #( '&' '<' '>' " ').
(aStringOrCharacter isString
and:[ (aStringOrCharacter includesAny:'&<>''"') not ]) ifTrue:[^ aStringOrCharacter].
resultStream := CharacterWriteStream on:''.
aStringOrCharacter asString do:[:eachCharacter |
"/ huh - a switch. Sorry, but this method is used heavily.
eachCharacter == $&
ifTrue:[ resultStream nextPutAll:'&' ]
ifFalse:[
eachCharacter == $<
ifTrue:[ resultStream nextPutAll:'<' ]
ifFalse:[
eachCharacter == $>
ifTrue:[ resultStream nextPutAll:'>' ]
ifFalse:[
eachCharacter == $"
ifTrue:[ resultStream nextPutAll:'"' ]
ifFalse:[
eachCharacter == $'
ifTrue:[ resultStream nextPutAll:''' ]
ifFalse:[
resultStream nextPut:eachCharacter
]]]]].
].
^ resultStream contents
"
self withAllSpecialHTMLCharactersEscaped:'<>#&'
self withAllSpecialHTMLCharactersEscaped:$<
self withAllSpecialHTMLCharactersEscaped:$#
"
"Modified: / 05-12-2006 / 13:48:59 / cg"
"Modified: / 06-05-2015 / 15:41:06 / sr"
!
withSpecialHTMLCharactersEscaped:aStringOrCharacter
"replace ampersand, less and greater by html-character escapes"
"/ TODO: this is similar to escapeCharacterEntities.
"/ we should refactor this into one method only (can we do hex escapes always ?).
"/ Notice, that these two methods came into existance due to historic reasons
"/ and were developed independent of each other, but later moved to this common place.
|resultStream|
"/ orgs := #( $& $< $> ).
"/ repls := #( '&' '<' '>' ).
(aStringOrCharacter isString
and:[ (aStringOrCharacter isWideString not)
and:[ (aStringOrCharacter includesAny:'&<>') not ]]) ifTrue:[^ aStringOrCharacter].
resultStream := CharacterWriteStream on:''.
aStringOrCharacter asString do:[:eachCharacter |
"/ huh - a switch. Sorry, but this method is used heavily.
eachCharacter == $&
ifTrue:[ resultStream nextPutAll:'&' ]
ifFalse:[
eachCharacter == $<
ifTrue:[ resultStream nextPutAll:'<' ]
ifFalse:[
eachCharacter == $>
ifTrue:[ resultStream nextPutAll:'>' ]
ifFalse:[
"/ eachCharacter codePoint > 16r7F
"/ ifTrue:[
"/ resultStream
"/ nextPutAll:'&#';
"/ nextPutAll:(eachCharacter codePoint printString);
"/ nextPutAll:';']
"/ ifFalse:[
resultStream nextPut:eachCharacter
"/ ]
]]].
].
^ resultStream contents
"
self withSpecialHTMLCharactersEscaped:'<>#&'
self withSpecialHTMLCharactersEscaped:$<
self withSpecialHTMLCharactersEscaped:$#
"
"Modified: / 13-04-2011 / 23:13:32 / cg"
"Modified: / 06-05-2015 / 15:41:16 / sr"
! !
!HTMLUtilities class methodsFor:'queries'!
isUtilityClass
^ self == HTMLUtilities
! !
!HTMLUtilities class methodsFor:'serving-helpers'!
escape:aString
"helper to escape invalid/dangerous characters in an url's arguments or post-fields.
These are:
control characters, dQuote, '+', ';', '?', '&' and space -> %XX ascii as hex digits
% -> %%
"
| rs ws c cp|
rs := ReadStream on: aString.
ws := WriteStream on: ''.
[ rs atEnd ] whileFalse: [
c := rs next.
c == $% ifTrue:[
ws nextPutAll: '%%'.
] ifFalse:[
(((cp := c codePoint) < 16r7F)
and:[ ('+;?&" ' includes:c) not ]) ifTrue: [
ws nextPut: c.
] ifFalse:[
ws nextPut: $%.
cp printOn:ws base:16 size:(cp > 16rFF ifTrue:[4] ifFalse:[2]) fill:$0.
]
]
].
^ ws contents
"
self escape:'a b'
self escape:'a%b'
self escape:'a b'
self escape:'a+b'
self escape:'aäüöb'
"
"Modified: / 06-05-2015 / 16:07:18 / sr"
"Modified: / 25-11-2016 / 16:37:53 / cg"
! !
!HTMLUtilities class methodsFor:'text processing helpers'!
plainTextOfHTML:htmlString
"given some HTML, extract the raw text.
Can be used to search for strings in some html text."
|parser doc s first|
parser := HTMLParser new.
doc := parser parseText:htmlString.
s := CharacterWriteStream on:(String new:100).
first := true.
doc markUpElementsDo:[:el |
|t|
el isTextElement ifTrue:[
t := el text withoutSeparators.
t notEmpty ifTrue:[
first ifFalse:[
s space.
].
s nextPutAll:t.
first := false
].
] ifFalse:[
"/ ignore non-text; however, we could care for text in info-titles
"/ or scripts as well...
].
].
^ s contents
"
self plainTextOfHTML:'
bla1 bla2 <br>bla3 <table><tr><td>bla4</td></tr></table> bla5<p>bla6'
self plainTextOfHTML:'Hello World'
"
"Modified: / 06-05-2015 / 17:02:36 / sr"
! !
!HTMLUtilities class methodsFor:'documentation'!
version
^ '$Header$'
!
version_CVS
^ '$Header$'
! !