author | Claus Gittinger <cg@exept.de> |
Wed, 27 Mar 2019 15:22:56 +0100 | |
changeset 4924 | b171682381a1 |
parent 4737 | 610d483cb00a |
child 4929 | 6220f244a435 |
permissions | -rw-r--r-- |
4302 | 1 |
"{ Encoding: utf8 }" |
2 |
||
2058 | 3 |
" |
4 |
COPYRIGHT (c) 2007 by eXept Software AG |
|
5 |
All Rights Reserved |
|
6 |
||
7 |
This software is furnished under a license and may be used |
|
8 |
only in accordance with the terms of that license and with the |
|
9 |
inclusion of the above copyright notice. This software may not |
|
10 |
be provided or otherwise made available to, or used by, any |
|
11 |
other person. No title to or ownership of the software is |
|
12 |
hereby transferred. |
|
13 |
" |
|
14 |
"{ Package: 'stx:libbasic2' }" |
|
15 |
||
3544 | 16 |
"{ NameSpace: Smalltalk }" |
17 |
||
2058 | 18 |
Object subclass:#HTMLUtilities |
19 |
instanceVariableNames:'' |
|
4517 | 20 |
classVariableNames:'EscapeControlCharacters HtmlEntityToCharacter' |
2058 | 21 |
poolDictionaries:'' |
22 |
category:'Net-Communication-Support' |
|
23 |
! |
|
24 |
||
25 |
!HTMLUtilities class methodsFor:'documentation'! |
|
26 |
||
27 |
copyright |
|
28 |
" |
|
29 |
COPYRIGHT (c) 2007 by eXept Software AG |
|
30 |
All Rights Reserved |
|
31 |
||
32 |
This software is furnished under a license and may be used |
|
33 |
only in accordance with the terms of that license and with the |
|
34 |
inclusion of the above copyright notice. This software may not |
|
35 |
be provided or otherwise made available to, or used by, any |
|
36 |
other person. No title to or ownership of the software is |
|
37 |
hereby transferred. |
|
38 |
" |
|
39 |
! |
|
40 |
||
41 |
documentation |
|
42 |
" |
|
43 |
Collected support functions to deal with HTML. |
|
44 |
Used both by HTML generators (DocGenerator), HTMLParsers and the webServer. |
|
45 |
Therefore, it has been put into libbasic2. |
|
46 |
" |
|
47 |
! ! |
|
48 |
||
2442 | 49 |
!HTMLUtilities class methodsFor:'common actions'! |
50 |
||
51 |
openLauncherOnDisplay:displayName
    <resource: #obsolete>

    "Obsolete - do not use.
     Asks NewLauncher to open a launcher via #openLauncherOnInitializedDisplayNamed:.
     If an Error is raised while doing so, the error's description is answered;
     otherwise the receiver is answered."

    self obsoleteMethodWarning.
    Error
        handle:[:ex | ^ ex description ]
        do:[ NewLauncher openLauncherOnInitializedDisplayNamed:displayName ]

    "Modified: / 01-06-2010 / 11:25:12 / sr"
! !
65 |
||
4517 | 66 |
!HTMLUtilities class methodsFor:'constants'! |
67 |
||
68 |
htmlEntityToCharacter
    "Answer the dictionary which maps HTML entity names (given without the
     leading ampersand and the trailing semicolon) to characters.
     The table is built on first use and cached in the class variable
     HtmlEntityToCharacter; it is only stored there once completely filled."

    |map|

    HtmlEntityToCharacter notNil ifTrue:[
        ^ HtmlEntityToCharacter
    ].

    map := Dictionary new.
    map at:'quot'  put:$".
    map at:'amp'   put:$&.
    map at:'apos'  put:$'.
    map at:'lt'    put:$<.
    map at:'gt'    put:$>.
    map at:'Auml'  put:$Ä.
    map at:'Ouml'  put:$Ö.
    map at:'Uuml'  put:$Ü.
    map at:'szlig' put:$ß.
    map at:'auml'  put:$ä.
    map at:'ouml'  put:$ö.
    map at:'uuml'  put:$ü.

    "/ where to get the mapping???
    "/ Answer: It is a mess. A good start may be
    "/ https://www.w3.org/TR/html4/sgml/entities.html with 252 named entities.
    "/ I guess an actual lookup table would be adequate.

    HtmlEntityToCharacter := map.
    ^ HtmlEntityToCharacter

    "Modified (comment): / 27-03-2019 / 10:26:27 / Claus Gittinger"
! !
98 |
||
2058 | 99 |
!HTMLUtilities class methodsFor:'helpers'! |
100 |
||
3557 | 101 |
characterFromHtmlEntityNamed:anHtmlEntityName
    "Answer the character for the HTML entity named anHtmlEntityName
     (the name is given without the leading ampersand and trailing semicolon).
     Names which are not in #htmlEntityToCharacter answer the placeholder $~.

     Unknown names no longer stop in a halt (debugger breakpoint): entity names
     are taken from input text, so an unknown name is an ordinary case here."

    ^ self htmlEntityToCharacter
        at:anHtmlEntityName
        ifAbsent:[
            "/ where to get the mapping???
            "/ Answer: It is a mess. A good start may be
            "/ https://www.w3.org/TR/html4/sgml/entities.html with 252 named entities.
            "/ I guess an actual lookup table would be adequate.
            $~
        ]

    "
     self characterFromHtmlEntityNamed:'amp'
     self characterFromHtmlEntityNamed:'funny'
    "

    "Modified (format): / 27-03-2019 / 10:25:58 / Claus Gittinger"
!
115 |
||
2058 | 116 |
controlCharacters
    "Answer the dictionary which maps the characters with a special meaning
     in HTML markup (less, greater, ampersand and double quote) to the
     character entity reference which replaces them.
     Built on first use and cached in the class variable EscapeControlCharacters.
     The table is filled in a local variable first and only then stored in the
     class variable (same scheme as #htmlEntityToCharacter), so a partially
     filled table is never visible through the class variable."

    |map|

    EscapeControlCharacters isNil ifTrue:[
        map := Dictionary new.
        map at:$< put:'&lt;'.
        map at:$> put:'&gt;'.
        map at:$& put:'&amp;'.
        map at:$" put:'&quot;'.
        "/ map at:$' put:'&apos;'.
        EscapeControlCharacters := map.
    ].
    ^ EscapeControlCharacters.

    "Modified (comment): / 06-05-2015 / 16:17:31 / sr"
!
130 |
||
4517 | 131 |
copyReplaceCharactersWithHtmlEntitiesIn:aString
    "Answer a copy of aString in which every character for which
     #htmlEntityForCharacter: answers an entity name is replaced by the
     corresponding reference (ampersand, name, semicolon).
     All other characters are copied unchanged.
     A nil argument is treated like an empty string.

     The result is collected in a CharacterWriteStream (as the other methods
     of this class do), because characters which are copied unchanged are
     not restricted to single-byte characters."

    |stream htmlEntity|

    stream := CharacterWriteStream on:''.
    (aString ? '') do:[:eachCharacter |
        htmlEntity := self htmlEntityForCharacter:eachCharacter.
        htmlEntity isNil ifTrue:[
            stream nextPut:eachCharacter.
        ] ifFalse:[
            stream
                nextPut:$&;
                nextPutAll:htmlEntity;
                nextPut:$;.
        ].
    ].

    ^ stream contents

    "
     self copyReplaceCharactersWithHtmlEntitiesIn:'a<b'
     self copyReplaceCharactersWithHtmlEntitiesIn:nil
    "
!
|
149 |
||
2058 | 150 |
escapeCharacterEntities:aString
    "Answer a copy of aString which is safe to embed into HTML text:
     every character which is a key in #controlCharacters is replaced by the
     string registered for it there, and every character with a codePoint above
     16r7F is replaced by a decimal numeric character reference (&#NNN;).
     Convenience entry for #escapeCharacterEntities:andControlCharacters:
     using the default table."

    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existence due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |defaultTable|

    defaultTable := self controlCharacters.
    ^ self escapeCharacterEntities:aString andControlCharacters:defaultTable

    "
     self escapeCharacterEntities:'a<b'
     self escapeCharacterEntities:'aöb'
    "

    "Modified: / 06-05-2015 / 16:30:13 / sr"
!
|
2058 | 171 |
|
3545 | 172 |
escapeCharacterEntities:aString andControlCharacters:controlCharacters
    "Answer a new string with the escaped form of aString, as written by
     #escapeCharacterEntities:andControlCharacters:on: :
     characters which are keys in controlCharacters are replaced by the mapped
     string; characters with a codePoint above 16r7F are replaced by a decimal
     numeric character reference (&#NNN;); all others are copied unchanged.
     controlCharacters may be nil or empty."

    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existence due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |escaped|

    escaped := String streamContents:[:ws |
        self escapeCharacterEntities:aString andControlCharacters:controlCharacters on:ws.
    ].
    ^ escaped

    "
     self escapeCharacterEntities:'a<b'
     self escapeCharacterEntities:'aöb'
    "

    "Created: / 06-05-2015 / 16:29:51 / sr"
    "Modified (format): / 05-02-2017 / 17:59:32 / cg"
!
|
197 |
||
198 |
escapeCharacterEntities:aString andControlCharacters:controlCharacters on:aWriteStream
    "Write the escaped form of aString onto aWriteStream.
     For every character of aString:
        - if it is a key in controlCharacters, the mapped string is written;
        - otherwise, if its codePoint is above 16r7F, a decimal numeric
          character reference (&#NNN;) is written;
        - otherwise the character itself is written.
     controlCharacters may be nil or empty, in which case no lookup is made."

    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existence due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |in ch replacement useTable|

    "/ whether there is a table to consult does not change while looping
    useTable := controlCharacters notEmptyOrNil.

    in := ReadStream on:aString.
    [in atEnd] whileFalse:[
        ch := in next.
        replacement := useTable
                            ifTrue:[controlCharacters at:ch ifAbsent:nil]
                            ifFalse:[nil].
        replacement isNil ifTrue:[
            ch codePoint <= 16r7F ifTrue:[
                aWriteStream nextPut:ch.
            ] ifFalse:[
                aWriteStream nextPutAll:'&#'.
                ch codePoint printOn:aWriteStream.
                aWriteStream nextPut:$;.
            ].
        ] ifFalse:[
            aWriteStream nextPutAll:replacement.
        ].
    ].

    "
     self escapeCharacterEntities:'a<b'
     self escapeCharacterEntities:'aöb'
    "

    "Created: / 05-02-2017 / 17:58:34 / cg"
    "Modified: / 17-02-2017 / 10:34:20 / stefan"
!
237 |
||
4297 | 238 |
escapeCharacterEntities:aString on:aStream
    "Write the escaped form of aString onto aStream, using the default table
     from #controlCharacters.
     See #escapeCharacterEntities:andControlCharacters:on: for the rules;
     whatever that method answers is answered here as well."

    "/ TODO: this is similar to withSpecialHTMLCharactersEscaped.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existence due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |defaultTable|

    defaultTable := self controlCharacters.
    ^ self
        escapeCharacterEntities:aString
        andControlCharacters:defaultTable
        on:aStream

    "
     self escapeCharacterEntities:'a<b'
     self escapeCharacterEntities:'aöb'
    "

    "Created: / 05-02-2017 / 18:00:56 / cg"
!
|
259 |
||
2058 | 260 |
extractCharSetEncodingFromContentType:contentTypeLine
    "Answer the value of the charset= parameter found in contentTypeLine,
     or nil if there is no such parameter (or contentTypeLine is nil).
     The value ends at the first whitespace character or semicolon, whichever
     comes first; a value enclosed in double quotes is answered without the quotes.
     An empty value (charset= at the end of the line) answers an empty string."

    |idx rest sepIdx semiIdx encoding closingIdx|

    contentTypeLine isNil ifTrue:[
        ^ nil
    ].
    idx := contentTypeLine findString:'charset='.
    idx == 0 ifTrue:[
        ^ nil
    ].
    rest := (contentTypeLine copyFrom:idx + 'charset=' size) withoutSeparators.

    "/ Both searches answer 0 for 'not found'. Taking the plain minimum would
    "/ therefore answer 0 (no delimiter) as soon as only one of the two is present;
    "/ a 0 must be ignored in favour of the other index.
    sepIdx := rest indexOfSeparator.
    semiIdx := rest indexOf:$;.
    sepIdx == 0 ifTrue:[
        idx := semiIdx
    ] ifFalse:[
        semiIdx == 0 ifTrue:[
            idx := sepIdx
        ] ifFalse:[
            idx := sepIdx min:semiIdx
        ].
    ].

    idx == 0 ifTrue:[
        encoding := rest
    ] ifFalse:[
        encoding := rest copyTo:idx - 1.
    ].

    (encoding startsWith:$") ifTrue:[
        "/ search from the character right after the opening quote, so that
        "/ an empty quoted value is handled too
        closingIdx := encoding indexOf:$" startingAt:2.
        closingIdx == 0 ifTrue:[
            "/ no closing quote: take everything after the opening one
            encoding := encoding copyFrom:2.
        ] ifFalse:[
            encoding := encoding copyFrom:2 to:closingIdx - 1.
        ].
    ].
    ^ encoding.

    "
     self extractCharSetEncodingFromContentType:'text/html; charset=ascii'
     self extractCharSetEncodingFromContentType:'text/html; charset='
     self extractCharSetEncodingFromContentType:'text/html; fooBar=bla'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8; bla=fasel'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8;bla=fasel'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8 bla'
    "
!
|
287 |
||
288 |
extractMimeTypeFromContentType:contentTypeLine
    "Answer the mime type part of contentTypeLine.
     Everything up to and including the first colon (if any) is skipped and
     surrounding whitespace is removed; of the remainder, the part before the
     first semicolon is answered (the whole remainder, if there is no semicolon)."

    |colonIdx semicolonIdx value|

    colonIdx := contentTypeLine indexOf:$:.
    value := (contentTypeLine copyFrom:colonIdx + 1) withoutSeparators.

    semicolonIdx := value indexOf:$;.
    semicolonIdx == 0 ifTrue:[
        "/ no parameters follow the mime type
        ^ value
    ].
    ^ value copyTo:semicolonIdx - 1

    "
     self extractMimeTypeFromContentType:'text/html; charset=ascii'
     self extractMimeTypeFromContentType:'text/html; '
     self extractMimeTypeFromContentType:'text/html'
     self extractMimeTypeFromContentType:'text/xml; charset=utf-8'
    "
!
|
308 |
||
4517 | 309 |
htmlEntityForCharacter:aCharacter
    "Answer the entity name (a key of #htmlEntityToCharacter) whose value is
     aCharacter, or nil if there is none.
     The space character and every character which answers true to
     #isLetterOrDigit answer nil without consulting the table."

    (aCharacter == Character space or:[aCharacter isLetterOrDigit]) ifTrue:[
        ^ nil
    ].

    ^ self htmlEntityToCharacter keyAtValue:aCharacter ifAbsent:nil

    "Modified: / 27-03-2019 / 10:24:29 / Claus Gittinger"
!
319 |
||
2058 | 320 |
unEscape:aString
    "Convert escaped characters in an url's arguments or post fields back to their proper characters.
     Undoes the effect of #urlEncoded: and #urlEncoded2:, except that no utf8
     decoding is done here (see #urlDecoded:, which does that on top of this).
     The sequences are:
        +       -> space
        %XX     character with that code (hex digits)
        %uXXXX  character with that code (hex digits)
                NOTE: %u is non-standard, but implemented in MS IIS
        %c      -> c, for any other character c (so %% -> %)
     A single % at the very end of the string is dropped.
     Answers nil for nil, and aString itself if it contains neither + nor %."

    |in out ch lookahead|

    aString isNil ifTrue:[
        ^ nil.
    ].
    (aString includesAny:'+%') ifFalse:[
        ^ aString
    ].

    in := ReadStream on:aString.
    out := CharacterWriteStream on:''.

    [in atEnd] whileFalse:[
        ch := in next.
        ch == $+ ifTrue:[
            ch := Character space.
        ] ifFalse:[
            ch == $% ifTrue:[
                lookahead := in peek.
                lookahead isNil ifTrue:[
                    "/ dangling % at the end: nothing follows
                    ch := in next.
                ] ifFalse:[
                    lookahead isHexDigit ifTrue:[
                        ch := (Integer readFrom:(in nextAvailable:2) radix:16) asCharacter.
                    ] ifFalse:[
                        lookahead == $u ifTrue:[
                            "/ skip the u, then take up to 4 hex digits
                            in next.
                            ch := (Integer readFrom:(in nextAvailable:4) radix:16) asCharacter.
                        ] ifFalse:[
                            "/ % followed by anything else: that character, literally
                            ch := in next.
                        ].
                    ].
                ].
            ].
        ].

        ch notNil ifTrue:[
            out nextPut:ch.
        ].
    ].
    ^ out contents

    "
     self unEscape:'a%20b'
     self unEscape:'a%%b'
     self unEscape:'a+b'
     self unEscape:'a%+b'
     self unEscape:'a%'
     self unEscape:'a%2'
     self unEscape:'/Home/a%C3%A4%C3%B6%C3%BCa'
    "

    "Modified: / 09-01-2011 / 10:44:50 / cg"
    "Modified (comment): / 06-05-2015 / 15:40:04 / sr"
    "Modified (comment): / 03-02-2017 / 17:06:32 / stefan"
!
390 |
||
3545 | 391 |
unescapeCharacterEntities:aString
    "helper to unescape character entities in a string.
     Normally, this is done by the HTMLParser when it scans text,
     but seems to be also used in post-data fields which contain non-ascii characters
     (for example: the login postdata of expeccALM).

     Sequences are:
        &<specialName>;     a name known to #htmlEntityToCharacter
        &#<decimal>;
        &#x<hex>;

     Everything which is not recognized as one of those is copied unchanged:
     an ampersand without a following semicolon, an empty reference,
     a non-numeric decimal reference and an unknown name.
     (Unknown names are looked up in the table directly, so that they are
      kept verbatim instead of being replaced by a placeholder character.)

     Answers nil for nil, and aString itself if it contains no ampersand.

     From Reference:
        http://wiki.selfhtml.org/wiki/Referenz:HTML/Zeichenreferenz#HTML-eigene_Zeichen
    "

    |rs ws c entity digits decoded|

    aString isNil ifTrue:[
        ^ nil
    ].
    (aString includes:$&) ifFalse:[
        ^ aString
    ].

    rs := ReadStream on:aString.
    ws := CharacterWriteStream on:''.

    [rs atEnd] whileFalse:[
        c := rs next.
        c == $& ifFalse:[
            ws nextPut:c.
        ] ifTrue:[
            entity := rs upToMatching:[:ch | ch == $;].
            entity isEmpty ifTrue:[
                "/ nothing between & and ; (or & is the last character)
                ws nextPut:c.
            ] ifFalse:[
                rs peek == $; ifFalse:[
                    "/ no terminating semicolon - not a reference
                    ws nextPut:c.
                    ws nextPutAll:entity.
                ] ifTrue:[
                    rs next.    "/ read over semicolon

                    "/ decoded remains nil, if the reference is not understood
                    decoded := nil.
                    entity first == $# ifTrue:[
                        "/ numeric character reference
                        digits := entity copyFrom:2.
                        digits notEmpty ifTrue:[
                            digits first == $x ifTrue:[
                                digits := digits copyFrom:2.
                                digits notEmpty ifTrue:[
                                    decoded := Character value:(Integer readFrom:digits radix:16).
                                ].
                            ] ifFalse:[
                                digits isNumeric ifTrue:[
                                    decoded := Character value:(Integer readFrom:digits).
                                ].
                            ].
                        ].
                    ] ifFalse:[
                        "/ named reference
                        decoded := self htmlEntityToCharacter at:entity ifAbsent:[nil].
                    ].

                    decoded notNil ifTrue:[
                        ws nextPut:decoded.
                    ] ifFalse:[
                        "/ put back what was read, unchanged
                        ws nextPut:c.
                        ws nextPutAll:entity.
                        ws nextPut:$;.
                    ].
                ].
            ].
        ].
    ].

    ^ ws contents

    "
     self unescapeCharacterEntities:'&;'
     self unescapeCharacterEntities:'&#16368;'
     self unescapeCharacterEntities:'&#16368;&#16368'
     self unescapeCharacterEntities:'&#16368;&lt;'
     self unescapeCharacterEntities:'&#x3FF0;'
     self unescapeCharacterEntities:'&quot;&lt;foo'
     self unescapeCharacterEntities:'&funny;&lt;foo'
    "

    "Created: / 06-05-2015 / 16:56:14 / sr"
    "Modified: / 18-05-2015 / 12:13:35 / sr"
    "Modified: / 17-02-2017 / 10:18:35 / stefan"
!
489 |
||
4712 | 490 |
urlDecoded:aString
    "Convert escaped characters in an url's arguments or post fields back to their proper characters.
     Undoes the effect of #urlEncoded: and #urlEncoded2:.
     The escape sequences are first resolved by #unEscape:
        +       -> space
        %XX     character code as hex digits
        %uXXXX  character code as hex digits
                NOTE: %u is non-standard, but implemented in MS IIS
        %%      -> %
     and the outcome of that is then sent #utf8Decoded."

    |unescaped|

    unescaped := self unEscape:aString.
    ^ unescaped utf8Decoded

    "
     self urlDecoded:'a%20b'
     self urlDecoded:'a%%b'
     self urlDecoded:'a+b'
     self urlDecoded:'a%+b'
     self urlDecoded:'a%'
     self urlDecoded:'a%2'
     self urlDecoded:'/Home/a%C3%A4%C3%B6%C3%BCa'
    "

    "Created: / 26-08-2018 / 12:49:24 / Claus Gittinger"
!
|
513 |
||
2522 | 514 |
urlEncode2:aStringOrStream on:ws
    <resource: #obsolete>

    "helper to escape invalid/dangerous characters in an url's arguments.
     Similar to urlEncode:on:, but treats '~' and spaces differently:
     both are percent-encoded here.
     (some clients, such as bitTorrent seem to require this - time will tell...)

     Every character which is neither a letter or digit (#isLetterOrDigit)
     nor one of '.', '-', '_' is written as '%nn', where nn is its code as
     two hex digits. Characters with a code above 16rFF are written as '%unnnn'
     (four hex digits) instead.
     The encoded text is appended to the stream ws.
     see: RFC1738"

    |rs c|

    rs := aStringOrStream readStream.

    [rs atEnd] whileFalse:[
        c := rs next.

        (c isLetterOrDigit or:[ ('-_.' includes:c) ]) ifTrue:[
            ws nextPut:c.
        ] ifFalse:[
            ws nextPut:$%.
            c codePoint > 16rFF ifTrue:[
                ws nextPut:$u.
                c codePoint printOn:ws base:16 size:4 fill:$0.
            ] ifFalse:[
                c codePoint printOn:ws base:16 size:2 fill:$0.
            ]
        ].
    ].

    "Created: / 09-01-2011 / 10:32:27 / cg"
    "Modified: / 09-01-2011 / 13:11:17 / cg"
    "Modified: / 06-05-2015 / 15:43:39 / sr"
!
548 |
||
2500 | 549 |
urlEncode:aStringOrStream on:ws
    "helper to escape invalid/dangerous characters in an url's argument or post-fields.
     The encoded text is appended to the stream ws.

     Letters, digits and the characters '.', '-', '_' and '~' are written unchanged.
     Spaces are written as '+'.
     Every other character up to code 16r7F is written as '%nn', where nn is
     its code as two hex digits.
     Characters above 16r7F are utf8-encoded first, and each element of the
     encoding is written as '%nn'.
     see: application/x-www-form-urlencoded
     see: https://tools.ietf.org/html/rfc3986 (obsoletes RFC1738)"

    |in ch writePercentEncoded|

    "/ writes one code as % followed by two hex digits
    writePercentEncoded := [:code |
        ws nextPut:$%.
        code printOn:ws base:16 size:2 fill:$0.
    ].

    in := aStringOrStream readStream.

    [(ch := in nextOrNil) notNil] whileTrue:[
        (ch isLetterOrDigit or:['-_.~' includes:ch]) ifTrue:[
            ws nextPut:ch.
        ] ifFalse:[
            ch == Character space ifTrue:[
                ws nextPut:$+.
            ] ifFalse:[
                ch codePoint > 16r7F ifTrue:[
                    ch utf8Encoded do:[:eachUtf8Char |
                        writePercentEncoded value:eachUtf8Char codePoint.
                    ].
                ] ifFalse:[
                    writePercentEncoded value:ch codePoint.
                ].
            ].
        ].
    ].

    "
     self urlEncoded:'hokus pokus fidibus*-/~'
     self urlEncoded:'Ützel Brötzel*-/~'
     self urlEncoded:'χαιρε'

     self urlDecoded:(self urlEncoded:'hokus pokus fidibus*-/~')
     self urlDecoded:(self urlEncoded:'Ützel Brötzel*-/~')
     self urlDecoded:(self urlEncoded:'χαιρε')
    "

    "Modified: / 09-01-2011 / 10:43:30 / cg"
    "Modified: / 06-05-2015 / 16:06:52 / sr"
    "Modified (comment): / 07-02-2017 / 14:51:42 / stefan"
    "Modified (comment): / 26-08-2018 / 12:50:04 / Claus Gittinger"
!
601 |
||
602 |
urlEncoded2: aString
    <resource: #obsolete>

    "helper to escape invalid/dangerous characters in an url's arguments or post-fields.
     Answers a new string with what #urlEncode2:on: writes for aString.
     Similar to urlEncoded:, but treats '~' and spaces differently
     (both are percent-encoded by #urlEncode2:on:).
     (some clients, such as bitTorrent seem to require this - time will tell...)
     see: application/x-www-form-urlencoded
     see: RFC1738"

    |encodedStream|

    "/ the encoded form is at least as long as the input
    encodedStream := String writeStreamWithInitialSize:aString size.
    self urlEncode2:aString on:encodedStream.
    ^ encodedStream contents

    "
     self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
     self urlEncoded2:'_-.*Frankfurt(Main) Hbf'

     self unEscape:(self urlEncoded:'-_.*%exept;')
     self urlEncoded2:'-_.*%exept;'
     self urlEncoded:'-_.*%exept;'
    "

    "Created: / 09-01-2011 / 10:34:50 / cg"
!
630 |
||
631 |
urlEncoded: aString
    "helper to escape invalid/dangerous characters in an url's arguments or post-fields.
     Answers a new string with what #urlEncode:on: writes for aString:
     letters, digits, '.', '-', '_' and '~' are kept,
     spaces are encoded as '+',
     everything else is encoded in the '%nn' format (nn being hex digits),
     with characters outside the ASCII range being encoded into utf8 first.
     see: application/x-www-form-urlencoded
     see: https://tools.ietf.org/html/rfc3986 (obsoletes RFC1738)"

    |encodedStream initialBuffer|

    "/ leave some room for escapes up front
    initialBuffer := String new:aString size + 20.
    encodedStream := WriteStream on:initialBuffer.
    self urlEncode:aString on:encodedStream.
    ^ encodedStream contents

    "
     self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
     self urlEncoded:'_-.*Frankfurt(Main) Hbf'

     self unEscape:(self urlEncoded:'-_.*%exept;')
     self urlEncoded:'-_.*%exept;'
    "

    "Modified: / 09-01-2011 / 10:43:37 / cg"
    "Modified: / 07-02-2017 / 14:54:12 / stefan"
!
659 |
||
2436
a5537ae7be4a
added: #withAllSpecialHTMLCharactersEscaped:
Claus Gittinger <cg@exept.de>
parents:
2434
diff
changeset
|
660 |
withAllSpecialHTMLCharactersEscaped:aStringOrCharacter
    "replace ampersand, less, greater, double quote and apostrophe
     by html character entity references.
     The argument may be a string or a single character; a string which
     contains none of these characters is answered unchanged (same object),
     otherwise a new string is answered."

    "/ TODO: this is similar to escapeCharacterEntities.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice, that these two methods came into existence due to historic reasons
    "/ and were developed independent of each other, but later moved to this common place.

    |resultStream|

    "/ orgs  := #( $&      $<     $>     $"       $' ).
    "/ repls := #( '&amp;' '&lt;' '&gt;' '&quot;' '&apos;' ).

    (aStringOrCharacter isString
     and:[ (aStringOrCharacter includesAny:'&<>''"') not ]) ifTrue:[^ aStringOrCharacter].

    resultStream := CharacterWriteStream on:''.
    aStringOrCharacter asString do:[:eachCharacter |
        "/ huh - a switch. Sorry, but this method is used heavily.
        eachCharacter == $&
            ifTrue:[ resultStream nextPutAll:'&amp;' ]
            ifFalse:[
        eachCharacter == $<
            ifTrue:[ resultStream nextPutAll:'&lt;' ]
            ifFalse:[
        eachCharacter == $>
            ifTrue:[ resultStream nextPutAll:'&gt;' ]
            ifFalse:[
        eachCharacter == $"
            ifTrue:[ resultStream nextPutAll:'&quot;' ]
            ifFalse:[
        eachCharacter == $'
            ifTrue:[ resultStream nextPutAll:'&apos;' ]
            ifFalse:[
                resultStream nextPut:eachCharacter
            ]]]]].
    ].
    ^ resultStream contents

    "
     self withAllSpecialHTMLCharactersEscaped:'<>#&'
     self withAllSpecialHTMLCharactersEscaped:$<
     self withAllSpecialHTMLCharactersEscaped:$#
    "

    "Modified: / 05-12-2006 / 13:48:59 / cg"
    "Modified: / 06-05-2015 / 15:41:06 / sr"
!
a5537ae7be4a
added: #withAllSpecialHTMLCharactersEscaped:
Claus Gittinger <cg@exept.de>
parents:
2434
diff
changeset
|
708 |
|
2066 | 709 |
withSpecialHTMLCharactersEscaped:aStringOrCharacter
    "Answer the argument with ampersand, less-than and greater-than replaced
     by the HTML character entities &amp;, &lt; and &gt;.
     The argument may be a String or a single Character (it is converted via asString).
     All other characters, including those above 16r7F, are copied unchanged.
     A non-wide String which contains none of the three characters is answered
     as-is, without copying."

    "/ TODO: this is similar to escapeCharacterEntities.
    "/ we should refactor this into one method only (can we do hex escapes always ?).
    "/ Notice that these two methods came into existence for historic reasons:
    "/ they were developed independently of each other and later moved to this common place.

    |resultStream|

    "/ fast path: nothing to escape in a plain (single-byte) string
    (aStringOrCharacter isString
     and:[ (aStringOrCharacter isWideString not)
     and:[ (aStringOrCharacter includesAny:'&<>') not ]]) ifTrue:[^ aStringOrCharacter].

    "/ a CharacterWriteStream is used, so that wide characters can be copied through
    resultStream := CharacterWriteStream on:''.
    aStringOrCharacter asString do:[:eachCharacter |
        "/ an explicit comparison chain instead of a table lookup,
        "/ because this method is used heavily.
        eachCharacter == $&
            ifTrue:[ resultStream nextPutAll:'&amp;' ]
            ifFalse:[
                eachCharacter == $<
                    ifTrue:[ resultStream nextPutAll:'&lt;' ]
                    ifFalse:[
                        eachCharacter == $>
                            ifTrue:[ resultStream nextPutAll:'&gt;' ]
                            ifFalse:[ resultStream nextPut:eachCharacter ]]].
    ].
    ^ resultStream contents

    "
     self withSpecialHTMLCharactersEscaped:'<>#&'
     self withSpecialHTMLCharactersEscaped:$<
     self withSpecialHTMLCharactersEscaped:$#
    "

    "Modified: / 13-04-2011 / 23:13:32 / cg"
    "Modified: / 06-05-2015 / 15:41:16 / sr"
2058 | 759 |
! ! |
760 |
||
3647 | 761 |
!HTMLUtilities class methodsFor:'queries'! |
762 |
||
763 |
isUtilityClass
    "Answer true if the receiver is HTMLUtilities itself;
     false for any other receiver (e.g. a subclass)."

    ^ HTMLUtilities == self
|
765 |
! ! |
|
766 |
||
2058 | 767 |
!HTMLUtilities class methodsFor:'serving-helpers'! |
768 |
||
769 |
escape:aString
    "Helper to escape invalid/dangerous characters in an url's arguments or post-fields.
     Answers a new string, in which:
        control characters (code points below 16r20),
        dQuote, '+', ';', '?', '&', space
        and every character with a code point of 16r7F or above
              -> %XX   (code point as 2 hex digits; 4 hex digits if above 16rFF)
        %     -> %%
     All other characters are copied unchanged."

    | rs ws c cp |

    rs := ReadStream on: aString.
    ws := WriteStream on: ''.
    [ rs atEnd ] whileFalse: [
        c := rs next.
        c == $% ifTrue:[
            ws nextPutAll: '%%'.
        ] ifFalse:[
            cp := c codePoint.
            "/ only printable 7-bit characters which are not in the dangerous set are passed through;
            "/ the lower bound makes sure that control characters (CR, LF, TAB, NUL ...)
            "/ are escaped as well, as promised in the method comment.
            ((cp between:16r20 and:16r7E)
             and:[ ('+;?&" ' includes:c) not ]) ifTrue: [
                ws nextPut: c.
            ] ifFalse:[
                ws nextPut: $%.
                cp printOn:ws base:16 size:(cp > 16rFF ifTrue:[4] ifFalse:[2]) fill:$0.
            ]
        ]
    ].
    ^ ws contents

    "
     self escape:'a b'
     self escape:'a%b'
     self escape:'a+b'
     self escape:'aäüöb'
     self escape:(String with:$a with:(Character value:10) with:$b)
    "

    "Modified: / 06-05-2015 / 16:07:18 / sr"
    "Modified: / 25-11-2016 / 16:37:53 / cg"
2058 | 806 |
! ! |
807 |
||
2144 | 808 |
!HTMLUtilities class methodsFor:'text processing helpers'! |
809 |
||
810 |
plainTextOfHTML:htmlString
    "Parse htmlString with an HTMLParser and answer the raw text it contains.
     The text of each text element is taken withoutSeparators; empty results are skipped,
     and the remaining pieces are concatenated with a single space in between.
     Can be used to search for strings in some html text."

    |document collected needsSeparator|

    document := HTMLParser new parseText:htmlString.
    collected := CharacterWriteStream on:(String new:100).
    needsSeparator := false.
    document markUpElementsDo:[:element |
        |stripped|

        "/ non-text elements are ignored; however, we could care for text
        "/ in info-titles or scripts as well...
        element isTextElement ifTrue:[
            stripped := element text withoutSeparators.
            stripped isEmpty ifFalse:[
                "/ a separating space goes in front of every piece except the first one
                needsSeparator ifTrue:[ collected space ].
                collected nextPutAll:stripped.
                needsSeparator := true
            ].
        ].
    ].
    ^ collected contents

    "
     self plainTextOfHTML:'
        bla1 bla2 <br>bla3 <table><tr><td>bla4</td></tr></table> bla5<p>bla6'
     self plainTextOfHTML:'Hello World'
    "

    "Modified: / 06-05-2015 / 17:02:36 / sr"
|
2144 | 846 |
! ! |
847 |
||
2058 | 848 |
!HTMLUtilities class methodsFor:'documentation'! |
849 |
||
850 |
version
    "Answer the revision-keyword string of this class, as stored in the source
     (here: the '$Header$' keyword placeholder)."

    ^ '$Header$'
2434
5625df4b6119
comment/format in: #escapeCharacterEntities:
Claus Gittinger <cg@exept.de>
parents:
2179
diff
changeset
|
852 |
! |
5625df4b6119
comment/format in: #escapeCharacterEntities:
Claus Gittinger <cg@exept.de>
parents:
2179
diff
changeset
|
853 |
|
5625df4b6119
comment/format in: #escapeCharacterEntities:
Claus Gittinger <cg@exept.de>
parents:
2179
diff
changeset
|
854 |
version_CVS
    "Answer the CVS revision-keyword string of this class, as stored in the source
     (here: the '$Header$' keyword placeholder)."

    ^ '$Header$'
2058 | 856 |
! ! |
3098 | 857 |