author | Claus Gittinger <cg@exept.de> |
Fri, 25 Nov 2016 16:45:58 +0100 | |
changeset 4217 | 1dac9014b77a |
parent 4204 | 481e0286fce9 |
child 4287 | 7d7b30363fa8 |
permissions | -rw-r--r-- |
2058 | 1 |
" |
2 |
COPYRIGHT (c) 2007 by eXept Software AG |
|
3 |
All Rights Reserved |
|
4 |
||
5 |
This software is furnished under a license and may be used |
|
6 |
only in accordance with the terms of that license and with the |
|
7 |
inclusion of the above copyright notice. This software may not |
|
8 |
be provided or otherwise made available to, or used by, any |
|
9 |
other person. No title to or ownership of the software is |
|
10 |
hereby transferred. |
|
11 |
" |
|
12 |
"{ Package: 'stx:libbasic2' }" |
|
13 |
||
3544 | 14 |
"{ NameSpace: Smalltalk }" |
15 |
||
2058 | 16 |
Object subclass:#HTMLUtilities
    instanceVariableNames:''
    classVariableNames:'EscapeControlCharacters'
    poolDictionaries:''
    category:'Net-Communication-Support'
|
21 |
! |
|
22 |
||
23 |
!HTMLUtilities class methodsFor:'documentation'! |
|
24 |
||
25 |
copyright |
|
26 |
" |
|
27 |
COPYRIGHT (c) 2007 by eXept Software AG |
|
28 |
All Rights Reserved |
|
29 |
||
30 |
This software is furnished under a license and may be used |
|
31 |
only in accordance with the terms of that license and with the |
|
32 |
inclusion of the above copyright notice. This software may not |
|
33 |
be provided or otherwise made available to, or used by, any |
|
34 |
other person. No title to or ownership of the software is |
|
35 |
hereby transferred. |
|
36 |
" |
|
37 |
! |
|
38 |
||
39 |
documentation
"
    A collection of support functions for dealing with HTML.
    They are shared by HTML generators (DocGenerator), HTMLParsers and the webServer;
    for that reason the class lives in libbasic2.
"
|
45 |
! ! |
|
46 |
||
2442 | 47 |
!HTMLUtilities class methodsFor:'common actions'! |
48 |
||
49 |
openLauncherOnDisplay:displayName
    <resource: #obsolete>

    "obsolete - do not use.
     Asks NewLauncher to open on the display named displayName.
     If that raises an Error, the error's description is answered;
     otherwise the receiver is answered."

    self obsoleteMethodWarning.
    [
        NewLauncher openLauncherOnInitializedDisplayNamed:displayName
    ] on:Error do:[:ex |
        ^ ex description
    ]

    "Modified: / 01-06-2010 / 11:25:12 / sr"
2442 | 62 |
! ! |
63 |
||
2058 | 64 |
!HTMLUtilities class methodsFor:'helpers'! |
65 |
||
3557 | 66 |
characterFromHtmlEntityNamed:anHtmlEntityName
    "Answer the character denoted by the named HTML entity anHtmlEntityName
     (the name only, without the leading ampersand and the trailing semicolon).
     Only these five names are known here: lt, gt, amp, apos and quot.
     Answer nil for any other name, so that a caller which checks the result
     (as unescapeCharacterEntities: does) can leave the unknown entity untouched.
     Previously an unknown name hit a 'self halt' and then answered $~."

    anHtmlEntityName = 'lt' ifTrue:[^ $<].
    anHtmlEntityName = 'gt' ifTrue:[^ $>].
    anHtmlEntityName = 'amp' ifTrue:[^ $&].
    anHtmlEntityName = 'apos' ifTrue:[^ $'].
    anHtmlEntityName = 'quot' ifTrue:[^ $"].

    "/ no mapping table for the remaining named entities is available here
    ^ nil

    "
     self characterFromHtmlEntityNamed:'lt'
     self characterFromHtmlEntityNamed:'funny'
    "

    "Created: / 07-05-2015 / 15:23:40 / sr"
    "Modified: / 18-05-2015 / 12:15:36 / sr"
|
79 |
! |
|
80 |
||
2058 | 81 |
controlCharacters
    "Answer the dictionary which maps each character that must be escaped in
     HTML text to its replacement entity string.
     The table is built on first use and cached in the class variable
     EscapeControlCharacters.
     The replacement strings must be the entity texts (not the raw characters);
     otherwise escaping would be a no-op."

    |map|

    EscapeControlCharacters isNil ifTrue:[
        map := Dictionary new.
        map at:$< put:'&lt;'.
        map at:$> put:'&gt;'.
        map at:$& put:'&amp;'.
        map at:$" put:'&quot;'.
        "/ map at:$' put:'&apos;'.
        EscapeControlCharacters := map.
    ].
    ^ EscapeControlCharacters.

    "Modified (comment): / 06-05-2015 / 16:17:31 / sr"
|
2058 | 94 |
! |
95 |
||
96 |
escapeCharacterEntities:aString
    "Answer a copy of aString in which invalid/dangerous characters are replaced
     by HTML character entities, using the default table answered by
     #controlCharacters.
     The actual work is done in escapeCharacterEntities:andControlCharacters:"

    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existence due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |defaultTable|

    defaultTable := self controlCharacters.
    ^ self escapeCharacterEntities:aString andControlCharacters:defaultTable

    "
     self escapeCharacterEntities:'a<b'
     self escapeCharacterEntities:'aöb'
    "

    "Modified: / 06-05-2015 / 16:30:13 / sr"
|
116 |
! |
|
2058 | 117 |
|
3545 | 118 |
escapeCharacterEntities:aString andControlCharacters:controlCharacters
    "Answer a copy of aString with characters replaced by HTML character entities:
        - a character found as key in controlCharacters is replaced by the
          string stored there (controlCharacters may be nil or empty,
          in which case no such replacement takes place);
        - any other character with a codePoint above 16r7F is replaced by
          its decimal numeric entity, i.e. ampersand, hash, codePoint, semicolon;
        - everything else is copied unchanged."

    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existence due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |rs ws c controlString useTable|

    "/ loop invariant - no need to ask again for every character
    useTable := controlCharacters notEmptyOrNil.

    rs := ReadStream on:aString.
    ws := WriteStream on:''.
    [rs atEnd] whileFalse:[
        c := rs next.
        controlString := useTable
                            ifTrue:[controlCharacters at:c ifAbsent:nil]
                            ifFalse:[nil].
        controlString notNil ifTrue:[
            ws nextPutAll:controlString.
        ] ifFalse:[
            c codePoint > 16r7F ifTrue:[
                ws
                    nextPutAll:'&#';
                    nextPutAll:(c codePoint printString);
                    nextPutAll:';'.
            ] ifFalse:[
                ws nextPut:c.
            ]
        ]
    ].
    ^ ws contents

    "
     self escapeCharacterEntities:'a<b' andControlCharacters:self controlCharacters
     self escapeCharacterEntities:'aöb' andControlCharacters:nil
    "

    "Created: / 06-05-2015 / 16:29:51 / sr"
|
2058 | 158 |
! |
159 |
||
160 |
extractCharSetEncodingFromContentType:contentTypeLine
    "Answer the value of the charset parameter found in contentTypeLine,
     or nil if the line contains no 'charset=' at all.
     The value ends at the first whitespace or semicolon, whichever comes first;
     a value enclosed in double quotes is answered without the quotes.
     An empty string is answered if nothing follows 'charset='."

    |key idx rest separatorIdx semicolonIdx encoding closingQuoteIdx|

    key := 'charset='.
    idx := contentTypeLine findString:key.
    idx == 0 ifTrue:[
        ^ nil
    ].
    rest := (contentTypeLine copyFrom:idx + key size) withoutSeparators.

    "/ both searches answer 0 for 'not found', so a plain min: would answer 0
    "/ whenever only one of the two delimiters is present.
    separatorIdx := rest indexOfSeparator.
    semicolonIdx := rest indexOf:$;.
    (separatorIdx == 0 or:[semicolonIdx ~~ 0 and:[semicolonIdx < separatorIdx]]) ifTrue:[
        idx := semicolonIdx
    ] ifFalse:[
        idx := separatorIdx
    ].
    idx == 0 ifTrue:[
        encoding := rest
    ] ifFalse:[
        encoding := rest copyTo:idx - 1.
    ].

    (encoding startsWith:$") ifTrue:[
        "/ search from 2, so that an empty quoted value is handled as well;
        "/ without a closing quote, only the opening quote is dropped.
        closingQuoteIdx := encoding indexOf:$" startingAt:2.
        closingQuoteIdx == 0 ifTrue:[
            encoding := encoding copyFrom:2.
        ] ifFalse:[
            encoding := encoding copyFrom:2 to:closingQuoteIdx - 1.
        ].
    ].
    ^ encoding.

    "
     self extractCharSetEncodingFromContentType:'text/html; charset=ascii'
     self extractCharSetEncodingFromContentType:'text/html; charset='
     self extractCharSetEncodingFromContentType:'text/html; fooBar=bla'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8; bla=fasel'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8;bla=fasel'
    "
|
186 |
! |
|
187 |
||
188 |
extractMimeTypeFromContentType:contentTypeLine
    "Answer the mime type part of contentTypeLine.
     Everything up to and including the first colon (if any) is skipped and
     surrounding whitespace is removed; if the remainder contains a semicolon,
     only the part before it is answered."

    |colonIndex value semicolonIndex|

    colonIndex := contentTypeLine indexOf:$:.
    value := (contentTypeLine copyFrom:colonIndex + 1) withoutSeparators.

    semicolonIndex := value indexOf:$;.
    semicolonIndex == 0 ifTrue:[
        "/ no parameters present
        ^ value
    ].
    ^ value copyTo:semicolonIndex - 1

    "
     self extractMimeTypeFromContentType:'text/html; charset=ascii'
     self extractMimeTypeFromContentType:'text/html; '
     self extractMimeTypeFromContentType:'text/html'
     self extractMimeTypeFromContentType:'text/xml; charset=utf-8'
    "
|
207 |
! |
|
208 |
||
209 |
unEscape:aString
    "Convert escaped characters in an url's arguments or post fields back to
     their proper characters. Undoes the effect of urlEncode and urlEncode2.
     Handled sequences:
        +           -> space
        %XX         -> character with that (hex) code
        %uXXXX      -> unicode character with that (hex) code
        %<other>    -> the character following the percent (so %% -> %)
     A single percent at the very end is dropped.
     Answers nil for nil, and aString itself if it contains neither + nor %."

    |in out ch lookahead|

    aString isNil ifTrue:[
        ^ nil.
    ].
    (aString includesAny:'+%') ifFalse:[
        ^ aString
    ].

    in := ReadStream on:aString.
    out := CharacterWriteStream on:''.

    [in atEnd] whileFalse:[
        ch := in next.
        ch == $+ ifTrue:[
            out nextPut:Character space.
        ] ifFalse:[
            ch ~~ $% ifTrue:[
                out nextPut:ch.
            ] ifFalse:[
                lookahead := in peek.
                (lookahead notNil and:[lookahead isHexDigit]) ifTrue:[
                    "/ %XX (or %X at the end)
                    out nextPut:(Integer readFrom:(in nextAvailable:2) radix:16) asCharacter.
                ] ifFalse:[
                    lookahead == $u ifTrue:[
                        "/ %uXXXX - skip the u, then take up to 4 hex digits
                        in next.
                        out nextPut:(Integer readFrom:(in nextAvailable:4) radix:16) asCharacter.
                    ] ifFalse:[
                        "/ percent followed by anything else: take that character literally;
                        "/ nothing is written if the percent was the last character
                        ch := in next.
                        ch notNil ifTrue:[
                            out nextPut:ch.
                        ].
                    ].
                ].
            ].
        ].
    ].
    ^ out contents

    "
     self unEscape:'a%20b'
     self unEscape:'a%%b'
     self unEscape:'a+b'
     self unEscape:'a%+b'
     self unEscape:'a%'
     self unEscape:'a%2'
    "

    "Modified: / 09-01-2011 / 10:44:50 / cg"
    "Modified (comment): / 06-05-2015 / 15:40:04 / sr"
2522 | 276 |
! |
277 |
||
3545 | 278 |
unescapeCharacterEntities:aString
    "helper to unescape character entities in a string.
     Normally, this is done by the HTMLParser when it scans text,
     but seems to be also used in post-data fields which contain non-ascii characters
     (for example: the login postdata of expeccALM).

     Recognized sequences are:
        &<specialName>;     (looked up via characterFromHtmlEntityNamed:)
        &#<decimal>;
        &#x<hex>;           (also with an uppercase X)

     Anything which looks like an entity but cannot be decoded is copied
     to the result unchanged.
     Answers aString itself, if it contains no ampersand.

     From Reference:
        http://wiki.selfhtml.org/wiki/Referenz:HTML/Zeichenreferenz#HTML-eigene_Zeichen
    "

    |rs ws c entity entityNumberPart terminated decodedCharacter|

    (aString includes:$&) ifFalse:[
        ^ aString
    ].

    rs := ReadStream on:aString.
    ws := CharacterWriteStream on:''.

    [rs atEnd] whileFalse:[
        c := rs next.
        c == $& ifFalse:[
            ws nextPut:c.
        ] ifTrue:[
            entity := rs upToMatching:[:ch | ch == $;].
            decodedCharacter := nil.
            "/ only something non-empty between & and ; is a candidate
            terminated := entity notEmpty and:[rs peek == $;].
            terminated ifTrue:[
                rs next. "/ read over semicolon
                entity first == $# ifTrue:[ "/ entity is determined as number
                    entityNumberPart := entity copyFrom:2.
                    (entityNumberPart notEmpty
                    and:[entityNumberPart first == $x or:[entityNumberPart first == $X]]) ifTrue:[
                        entityNumberPart := entityNumberPart copyFrom:2.
                        "/ validate, instead of letting the number reader fail on garbage
                        (entityNumberPart notEmpty
                        and:[entityNumberPart conform:[:ch | ch isHexDigit]]) ifTrue:[
                            decodedCharacter := Character value:(Integer readFrom:entityNumberPart radix:16).
                        ].
                    ] ifFalse:[
                        (entityNumberPart notEmpty and:[entityNumberPart isNumeric]) ifTrue:[
                            decodedCharacter := Character value:(Integer readFrom:entityNumberPart).
                        ].
                    ].
                ] ifFalse:[
                    decodedCharacter := self characterFromHtmlEntityNamed:entity.
                ].
            ].

            decodedCharacter notNil ifTrue:[
                ws nextPut:decodedCharacter.
            ] ifFalse:[
                "/ not decodable - reproduce exactly what has been consumed
                ws nextPut:c.
                ws nextPutAll:entity.
                terminated ifTrue:[
                    "/ a single character: nextPut:, not nextPutAll:
                    ws nextPut:$;.
                ].
            ].
        ].
    ].

    ^ ws contents

    "
     self unescapeCharacterEntities:'&;'
     self unescapeCharacterEntities:'&16368;'
     self unescapeCharacterEntities:'&16368;&16368'
     self unescapeCharacterEntities:'&16368;&lt;'
     self unescapeCharacterEntities:'&#16368;&lt;'
     self unescapeCharacterEntities:'&#xAFFE;'
     self unescapeCharacterEntities:'&#;'
     self unescapeCharacterEntities:'&quot;&lt;foo'
     self unescapeCharacterEntities:'&funny;&lt;foo'
    "

    "Created: / 06-05-2015 / 16:56:14 / sr"
    "Modified: / 18-05-2015 / 12:13:35 / sr"
3545 | 374 |
! |
375 |
||
2522 | 376 |
urlEncode2:aStringOrStream on:ws
    "helper to escape invalid/dangerous characters in an url's arguments.
     Similar to urlEncode:on:, but '*' and spaces get no special treatment here
     (both are %-encoded).
     (some clients, such as bitTorrent seem to require this - time will tell...)
     Any character not in the set 0-9, a-z, A-Z, '.', '-', '_', is written to ws in
     the '%nn' format, where nn is the hexadecimal value of the character;
     characters above 16rFF are written as '%unnnn'.
     Letters and digits outside the ASCII range are encoded as well.
     see: RFC1738"

    |rs c cp|

    rs := aStringOrStream readStream.

    [rs atEnd] whileFalse:[
        c := rs next.
        cp := c codePoint.

        "/ restrict to ASCII explicitly; isLetterOrDigit alone may accept
        "/ non-ASCII letters - NOTE(review): confirm against Character>>isLetterOrDigit
        ((cp < 16r80 and:[c isLetterOrDigit]) or:[ '-_.' includes:c ]) ifTrue:[
            ws nextPut:c.
        ] ifFalse:[
            ws nextPut:$%.
            cp > 16rFF ifTrue:[
                ws nextPut:$u.
                cp printOn:ws base:16 size:4 fill:$0.
            ] ifFalse:[
                cp printOn:ws base:16 size:2 fill:$0.
            ]
        ].
    ].

    "Created: / 09-01-2011 / 10:32:27 / cg"
    "Modified: / 09-01-2011 / 13:11:17 / cg"
    "Modified: / 06-05-2015 / 15:43:39 / sr"
2058 | 408 |
! |
409 |
||
2500 | 410 |
urlEncode:aStringOrStream on:ws
    "helper to escape invalid/dangerous characters in an url's arguments or post-fields.
     Similar to urlEncode2:on:, but leaves '*' unencoded and writes spaces as '+'.
     (some clients, such as bitTorrent seem to require urlEncode2 - time will tell...)
     Any character not in the set 0-9, a-z, A-Z, '.', '-', '_' and '*', is written to ws in
     the '%nn' format, where nn is the hexadecimal value of the character;
     characters above 16rFF are written as '%unnnn'.
     Letters and digits outside the ASCII range are encoded as well.
     Spaces are encoded as '+'.
     see: application/x-www-form-urlencoded
     see: RFC1738"

    |rs c cp space|

    space := Character space.
    rs := aStringOrStream readStream.

    [rs atEnd] whileFalse:[
        c := rs next.
        cp := c codePoint.

        "/ restrict to ASCII explicitly; isLetterOrDigit alone may accept
        "/ non-ASCII letters - NOTE(review): confirm against Character>>isLetterOrDigit
        ((cp < 16r80 and:[c isLetterOrDigit]) or:[ '-_.*' includes:c ]) ifTrue:[
            ws nextPut:c.
        ] ifFalse:[
            c == space ifTrue:[
                ws nextPut:$+.
            ] ifFalse:[
                ws nextPut:$%.
                cp > 16rFF ifTrue:[
                    ws nextPut:$u.
                    cp printOn:ws base:16 size:4 fill:$0.
                ] ifFalse:[
                    cp printOn:ws base:16 size:2 fill:$0.
                ].
            ].
        ].
    ].

    "Modified: / 09-01-2011 / 10:43:30 / cg"
    "Modified: / 06-05-2015 / 16:06:52 / sr"
2522 | 447 |
! |
448 |
||
449 |
urlEncoded2: aString
    "Answer aString with invalid/dangerous characters escaped, for use in an
     url's arguments or post-fields.
     Convenience wrapper: the encoding itself is done by urlEncode2:on:,
     which - in contrast to urlEncoded: - gives no special treatment
     to '*' and spaces.
     (some clients, such as bitTorrent seem to require this - time will tell...)
     see: application/x-www-form-urlencoded
     see: RFC1738"

    |encodedStream|

    encodedStream := String writeStreamWithInitialSize:aString size.
    self urlEncode2:aString on:encodedStream.
    ^ encodedStream contents

    "
     self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
     self urlEncoded2:'_-.*Frankfurt(Main) Hbf'

     self unEscape:(self urlEncoded:'-_.*%exept;')
     self urlEncoded2:'-_.*%exept;'
     self urlEncoded:'-_.*%exept;'
    "

    "Created: / 09-01-2011 / 10:34:50 / cg"
|
2500 | 475 |
! |
476 |
||
477 |
urlEncoded: aString
    "Answer aString with invalid/dangerous characters escaped, for use in an
     url's arguments or post-fields.
     Convenience wrapper: the encoding itself is done by urlEncode:on:,
     which - in contrast to urlEncoded2: - leaves '*' alone and
     encodes spaces as '+'.
     (some clients, such as bitTorrent seem to require urlEncoded2 - time will tell...)
     see: application/x-www-form-urlencoded
     see: RFC1738"

    |encodedStream|

    encodedStream := String writeStreamWithInitialSize:aString size.
    self urlEncode:aString on:encodedStream.
    ^ encodedStream contents

    "
     self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
     self urlEncoded:'_-.*Frankfurt(Main) Hbf'

     self unEscape:(self urlEncoded:'-_.*%exept;')
     self urlEncoded:'-_.*%exept;'
    "

    "Modified: / 09-01-2011 / 10:43:37 / cg"
2066 | 503 |
! |
504 |
||
2436
a5537ae7be4a
added: #withAllSpecialHTMLCharactersEscaped:
Claus Gittinger <cg@exept.de>
parents:
2434
diff
changeset
|
505 |
withAllSpecialHTMLCharactersEscaped:aStringOrCharacter
    "replace ampersand, less, greater and both kinds of quotes
     by html-character escapes.
     The argument may be a string or a single character; a string is answered.
     A string which contains none of these characters is answered as is."

    "/ TODO: this is similar to escapeCharacterEntities.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existence due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |resultStream|

    "/ replaced:      ampersand  less  greater  double quote  single quote
    "/ replacements:  &amp;      &lt;  &gt;     &quot;        &apos;

    (aStringOrCharacter isString
    and:[ (aStringOrCharacter includesAny:'&<>''"') not ]) ifTrue:[^ aStringOrCharacter].

    resultStream := CharacterWriteStream on:''.
    aStringOrCharacter asString do:[:eachCharacter |
        "/ huh - a switch. Sorry, but this method is used heavily.
        eachCharacter == $&
            ifTrue:[ resultStream nextPutAll:'&amp;' ]
            ifFalse:[
                eachCharacter == $<
                    ifTrue:[ resultStream nextPutAll:'&lt;' ]
                    ifFalse:[
                        eachCharacter == $>
                            ifTrue:[ resultStream nextPutAll:'&gt;' ]
                            ifFalse:[
                                eachCharacter == $"
                                    ifTrue:[ resultStream nextPutAll:'&quot;' ]
                                    ifFalse:[
                                        eachCharacter == $'
                                            ifTrue:[ resultStream nextPutAll:'&apos;' ]
                                            ifFalse:[
                                                resultStream nextPut:eachCharacter
                                            ]]]]].
    ].
    ^ resultStream contents

    "
     self withAllSpecialHTMLCharactersEscaped:'<>#&'
     self withAllSpecialHTMLCharactersEscaped:$<
     self withAllSpecialHTMLCharactersEscaped:$#
    "

    "Modified: / 05-12-2006 / 13:48:59 / cg"
    "Modified: / 06-05-2015 / 15:41:06 / sr"
2436
a5537ae7be4a
added: #withAllSpecialHTMLCharactersEscaped:
Claus Gittinger <cg@exept.de>
parents:
2434
diff
changeset
|
552 |
! |
a5537ae7be4a
added: #withAllSpecialHTMLCharactersEscaped:
Claus Gittinger <cg@exept.de>
parents:
2434
diff
changeset
|
553 |
|
2066 | 554 |
withSpecialHTMLCharactersEscaped:aStringOrCharacter
    "replace ampersand, less and greater by html-character escapes.
     The argument may be a string or a single character; a string is answered.
     A non-wide string which contains none of these characters is answered as is;
     everything else is copied via a CharacterWriteStream."

    "/ TODO: this is similar to escapeCharacterEntities.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existence due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |resultStream|

    "/ replaced:      ampersand  less  greater
    "/ replacements:  &amp;      &lt;  &gt;

    (aStringOrCharacter isString
    and:[ (aStringOrCharacter isWideString not)
    and:[ (aStringOrCharacter includesAny:'&<>') not ]]) ifTrue:[^ aStringOrCharacter].

    resultStream := CharacterWriteStream on:''.
    aStringOrCharacter asString do:[:eachCharacter |
        "/ huh - a switch. Sorry, but this method is used heavily.
        eachCharacter == $&
            ifTrue:[ resultStream nextPutAll:'&amp;' ]
            ifFalse:[
                eachCharacter == $<
                    ifTrue:[ resultStream nextPutAll:'&lt;' ]
                    ifFalse:[
                        eachCharacter == $>
                            ifTrue:[ resultStream nextPutAll:'&gt;' ]
                            ifFalse:[
                                "/ disabled: characters with a codePoint > 16r7F were once
                                "/ written as decimal numeric entity (ampersand, hash, codePoint, semicolon)
                                resultStream nextPut:eachCharacter
                            ]]].
    ].
    ^ resultStream contents

    "
     self withSpecialHTMLCharactersEscaped:'<>#&'
     self withSpecialHTMLCharactersEscaped:$<
     self withSpecialHTMLCharactersEscaped:$#
    "

    "Modified: / 13-04-2011 / 23:13:32 / cg"
    "Modified: / 06-05-2015 / 15:41:16 / sr"
2058 | 604 |
! ! |
605 |
||
3647 | 606 |
!HTMLUtilities class methodsFor:'queries'! |
607 |
||
608 |
isUtilityClass
    "Answer true for HTMLUtilities itself only - not for any subclass."

    ^ HTMLUtilities == self
|
610 |
! ! |
|
611 |
||
2058 | 612 |
!HTMLUtilities class methodsFor:'serving-helpers'! |
613 |
||
614 |
escape:aString
    "helper to escape invalid/dangerous characters in an url's arguments or post-fields.
     These are:
        control characters, space, double quote, '+', ';', '?', '&',
        DEL and everything above     -> %XX   (hex digits of the code)
        characters above 16rFF       -> %uXXXX (the format unEscape: understands
                                        and urlEncode:on: generates)
        %                            -> %%
     All other characters are copied unchanged."

    | rs ws c cp|

    rs := ReadStream on: aString.
    ws := WriteStream on: ''.
    [ rs atEnd ] whileFalse: [
        c := rs next.
        c == $% ifTrue:[
            ws nextPutAll: '%%'.
        ] ifFalse:[
            cp := c codePoint.
            "/ cp > 16r20 excludes the control characters and the space,
            "/ which are documented as being escaped
            ((cp > 16r20)
            and:[ (cp < 16r7F)
            and:[ ('+;?&"' includes:c) not ]]) ifTrue: [
                ws nextPut: c.
            ] ifFalse:[
                ws nextPut: $%.
                cp > 16rFF ifTrue:[
                    "/ without the u, unEscape: would read the first two digits only
                    ws nextPut: $u.
                    cp printOn:ws base:16 size:4 fill:$0.
                ] ifFalse:[
                    cp printOn:ws base:16 size:2 fill:$0.
                ].
            ]
        ]
    ].
    ^ ws contents

    "
     self escape:'a b'
     self escape:'a%b'
     self escape:'a b'
     self escape:'a+b'
     self escape:'aäüöb'
    "

    "Modified: / 06-05-2015 / 16:07:18 / sr"
    "Modified: / 25-11-2016 / 16:37:53 / cg"
2058 | 651 |
! ! |
652 |
||
2144 | 653 |
!HTMLUtilities class methodsFor:'text processing helpers'! |
654 |
||
655 |
plainTextOfHTML:htmlString
    "given some HTML, extract the raw text.
     Can be used to search for strings in some html text.
     The HTML is parsed with an HTMLParser; the whitespace-trimmed texts of
     all non-empty text elements are concatenated, separated by single spaces."

    |document textStream needsSeparator|

    document := HTMLParser new parseText:htmlString.
    textStream := CharacterWriteStream on:(String new:100).
    needsSeparator := false.

    document markUpElementsDo:[:element |
        |text|

        "/ non-text elements are ignored; however, we could care for text in info-titles
        "/ or scripts as well...
        element isTextElement ifTrue:[
            text := element text withoutSeparators.
            text notEmpty ifTrue:[
                needsSeparator ifTrue:[
                    textStream space.
                ].
                textStream nextPutAll:text.
                needsSeparator := true.
            ].
        ].
    ].
    ^ textStream contents

    "
     self plainTextOfHTML:'
bla1 bla2 <br>bla3 <table><tr><td>bla4</td></tr></table> bla5<p>bla6
'
    "

    "Modified: / 06-05-2015 / 17:02:36 / sr"
|
2144 | 692 |
! ! |
693 |
||
2058 | 694 |
!HTMLUtilities class methodsFor:'documentation'! |
695 |
||
696 |
version
    "Answer the version string of this class's source
     (a keyword placeholder, presumably expanded by the source code management system)."

    ^ '$Header$'
2434
5625df4b6119
comment/format in: #escapeCharacterEntities:
Claus Gittinger <cg@exept.de>
parents:
2179
diff
changeset
|
698 |
! |
5625df4b6119
comment/format in: #escapeCharacterEntities:
Claus Gittinger <cg@exept.de>
parents:
2179
diff
changeset
|
699 |
|
5625df4b6119
comment/format in: #escapeCharacterEntities:
Claus Gittinger <cg@exept.de>
parents:
2179
diff
changeset
|
700 |
version_CVS
    "Answer the CVS version string of this class's source
     (a keyword placeholder, presumably expanded by CVS)."

    ^ '$Header$'
2058 | 702 |
! ! |
3098 | 703 |