author | sr |
Tue, 01 Jul 2014 12:12:38 +0200 | |
changeset 3312 | fe3d83508353 |
parent 3098 | 2ae8f1b57bc1 |
child 3456 | 8a3302fd3cce |
permissions | -rw-r--r-- |
2058 | 1 |
" |
2 |
COPYRIGHT (c) 2007 by eXept Software AG |
|
3 |
All Rights Reserved |
|
4 |
||
5 |
This software is furnished under a license and may be used |
|
6 |
only in accordance with the terms of that license and with the |
|
7 |
inclusion of the above copyright notice. This software may not |
|
8 |
be provided or otherwise made available to, or used by, any |
|
9 |
other person. No title to or ownership of the software is |
|
10 |
hereby transferred. |
|
11 |
" |
|
12 |
"{ Package: 'stx:libbasic2' }" |
|
13 |
||
14 |
"Utility class for HTML and URL text handling; the methods visible in this file are all class-side.
 No instance variables are declared. The class variable EscapeControlCharacters holds the
 character -> replacement-string table which #controlCharacters creates on first use."
Object subclass:#HTMLUtilities |
|
15 |
instanceVariableNames:'' |
|
16 |
classVariableNames:'EscapeControlCharacters' |
|
17 |
poolDictionaries:'' |
|
18 |
category:'Net-Communication-Support' |
|
19 |
! |
|
20 |
||
21 |
!HTMLUtilities class methodsFor:'documentation'! |
|
22 |
||
23 |
copyright |
|
24 |
" |
|
25 |
COPYRIGHT (c) 2007 by eXept Software AG |
|
26 |
All Rights Reserved |
|
27 |
||
28 |
This software is furnished under a license and may be used |
|
29 |
only in accordance with the terms of that license and with the |
|
30 |
inclusion of the above copyright notice. This software may not |
|
31 |
be provided or otherwise made available to, or used by, any |
|
32 |
other person. No title to or ownership of the software is |
|
33 |
hereby transferred. |
|
34 |
" |
|
35 |
! |
|
36 |
||
37 |
documentation |
|
38 |
" |
|
39 |
Collected support functions to deal with HTML. |
|
40 |
Used both by HTML generators (DocGenerator), HTMLParsers and the webServer. |
|
41 |
Therefore, it has been put into libbasic2. |
|
42 |
" |
|
43 |
! ! |
|
44 |
||
2442 | 45 |
!HTMLUtilities class methodsFor:'common actions'! |
46 |
||
47 |
openLauncherOnDisplay:displayName
    <resource: #obsolete>

    "Obsolete - do not use.
     Emits the obsolete-method warning, then asks NewLauncher to open a launcher
     on the initialized display named displayName.
     If an Error is raised while doing so, the description of that error is returned;
     otherwise the receiver is returned."

    self obsoleteMethodWarning.
    [
        NewLauncher openLauncherOnInitializedDisplayNamed:displayName
    ] on:Error do:[:anError |
        ^ anError description
    ]

    "Modified: / 01-06-2010 / 11:25:12 / sr"
2442 | 60 |
! ! |
61 |
||
2058 | 62 |
!HTMLUtilities class methodsFor:'helpers'! |
63 |
||
64 |
controlCharacters
    "Answer the dictionary which maps characters that are special in HTML
     to the entity string that replaces them.
     The dictionary is created on first use and cached in the class variable
     EscapeControlCharacters; later calls answer the same instance.

     Mapped characters: less-than, greater-than, ampersand and double quote.
     Mappings for the space character (non-breaking-space entity) and for the
     single quote are not installed (they were commented out in earlier versions)."

    |table|

    EscapeControlCharacters isNil ifTrue:[
        table := Dictionary new.
        "each replacement must be the entity, not the character itself -
         otherwise the table would escape nothing"
        table at:$< put:'&lt;'.
        table at:$> put:'&gt;'.
        table at:$& put:'&amp;'.
        table at:$" put:'&quot;'.
        EscapeControlCharacters := table.
    ].
    ^ EscapeControlCharacters
|
76 |
! |
|
77 |
||
78 |
escapeCharacterEntities:aString
    "Answer a copy of aString in which characters that are invalid/dangerous
     in HTML text have been replaced:
        - a character found in the table answered by #controlCharacters
          is replaced by the string registered for it there
        - any other character with a codePoint above 16r7F is replaced by
          a decimal numeric character reference (&#NNN;)
     All remaining characters are copied unchanged.
     (No percent-escaping is done here; see #escape: / #urlEncoded: for that.)"

    "TODO: this is similar to withSpecialHTMLCharactersEscaped:.
     We should refactor this into one method only (can we do hex escapes always ?).
     Notice, that these two methods came into existence due to historic reasons
     and were developed independently of each other, but later moved to this common place."

    |in out ch replacements replacement|

    replacements := self controlCharacters.
    in := ReadStream on:aString.
    out := WriteStream on:(String new).
    [in atEnd] whileFalse:[
        ch := in next.
        replacement := replacements at:ch ifAbsent:[nil].
        replacement notNil ifTrue:[
            out nextPutAll:replacement
        ] ifFalse:[
            ch codePoint > 16r7F ifTrue:[
                "print the decimal codePoint directly onto the stream"
                out nextPutAll:'&#'.
                ch codePoint printOn:out.
                out nextPut:$;
            ] ifFalse:[
                out nextPut:ch
            ]
        ]
    ].
    ^ out contents

    "
     self escapeCharacterEntities:'abc'
     self escapeCharacterEntities:'aöb'
    "
|
117 |
! |
|
118 |
||
119 |
extractCharSetEncodingFromContentType:contentTypeLine
    "Answer the value of the charset parameter found in contentTypeLine,
     or nil if there is no charset parameter.
     The parameter name is matched case-insensitively; the value is answered as written.
     An unquoted value ends at the first whitespace or semicolon, whichever comes first.
     A value enclosed in double quotes is answered without the quotes.
     An empty value (nothing after the equals sign) answers an empty string."

    |key idx rest closeIdx sepIdx semiIdx endIdx|

    key := 'charset='.
    "search in a lowercase copy; the indices are the same as in the original line"
    idx := contentTypeLine asLowercase findString:key.
    idx == 0 ifTrue:[
        ^ nil
    ].
    rest := (contentTypeLine copyFrom:idx + key size) withoutSeparators.

    (rest startsWith:$") ifTrue:[
        "start searching right after the opening quote, so that an empty
         quoted value is found too"
        closeIdx := rest indexOf:$" startingAt:2.
        closeIdx ~~ 0 ifTrue:[
            ^ rest copyFrom:2 to:closeIdx - 1
        ].
        "unterminated quote: drop it and treat the remainder as an unquoted value"
        rest := rest copyFrom:2.
    ].

    "0 means: not found. Take the smaller of the indices which were actually found
     (a plain min: would answer 0 as soon as one of the delimiters is missing)"
    sepIdx := rest indexOfSeparator.
    semiIdx := rest indexOf:$;.
    endIdx := sepIdx.
    (semiIdx ~~ 0 and:[ endIdx == 0 or:[ semiIdx < endIdx ] ]) ifTrue:[
        endIdx := semiIdx
    ].
    endIdx == 0 ifTrue:[
        ^ rest
    ].
    ^ rest copyTo:endIdx - 1

    "
     self extractCharSetEncodingFromContentType:'text/html; charset=ascii'
     self extractCharSetEncodingFromContentType:'text/html; charset='
     self extractCharSetEncodingFromContentType:'text/html; fooBar=bla'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8'
     self extractCharSetEncodingFromContentType:'text/xml; charset=utf-8; bla=fasel'
     self extractCharSetEncodingFromContentType:'text/xml;charset=utf-8;bla=fasel'
    "
|
145 |
! |
|
146 |
||
147 |
extractMimeTypeFromContentType:contentTypeLine
    "Answer the mime type part of contentTypeLine.
     Everything up to and including the first colon (if there is one) is skipped,
     and surrounding whitespace is removed from the remainder.
     If the remainder contains a semicolon, only the part before the first
     semicolon is answered; otherwise the whole remainder is answered."

    |colonIdx typeAndParameters semiIdx|

    "indexOf: answers 0 if there is no colon, so the copy then starts at the first character"
    colonIdx := contentTypeLine indexOf:$:.
    typeAndParameters := (contentTypeLine copyFrom:colonIdx + 1) withoutSeparators.

    semiIdx := typeAndParameters indexOf:$;.
    semiIdx == 0 ifTrue:[
        ^ typeAndParameters
    ].
    ^ typeAndParameters copyTo:semiIdx - 1

    "
     self extractMimeTypeFromContentType:'text/html; charset=ascii'
     self extractMimeTypeFromContentType:'text/html; '
     self extractMimeTypeFromContentType:'text/html'
     self extractMimeTypeFromContentType:'text/xml; charset=utf-8'
    "
|
166 |
! |
|
167 |
||
168 |
unEscape:aString
    "Convert escaped characters in an url's arguments or post fields back to their proper characters.
     Undoes the effect of urlEncode and urlEncode2.
     These are:
        +       -> space
        %XX     -> the character with the hex code XX
        %X      -> the character with the hex code X (only one hex digit follows)
        %%      -> %
     A percent sign followed by any other non-hex character yields that character unchanged;
     a percent sign at the very end is dropped.
     If aString contains neither a plus nor a percent sign, aString itself is answered."

    |rs ws c peekC hexDigits|

    (aString includesAny:'+%') ifFalse:[
        ^ aString
    ].

    rs := ReadStream on:aString.
    ws := WriteStream on:(String new).
    [rs atEnd] whileFalse:[
        c := rs next.
        c == $+ ifTrue:[
            c := Character space
        ] ifFalse:[
            c == $% ifTrue:[
                peekC := rs peek.
                (peekC notNil and:[peekC isHexDigit]) ifTrue:[
                    "consume the second character only if it is a hex digit too;
                     otherwise it is ordinary text and must not get lost"
                    hexDigits := String with:rs next.
                    peekC := rs peek.
                    (peekC notNil and:[peekC isHexDigit]) ifTrue:[
                        hexDigits := hexDigits copyWith:rs next
                    ].
                    c := (Integer readFrom:hexDigits radix:16) asCharacter
                ] ifFalse:[
                    "nil here, if the percent sign was the last character"
                    c := rs next
                ]
            ]
        ].
        c notNil ifTrue:[ ws nextPut:c ]
    ].
    ^ ws contents

    "
     self unEscape:'a%20b'
     self unEscape:'a%%b'
     self unEscape:'a+b'
     self unEscape:'a%+b'
     self unEscape:'a%'
     self unEscape:'a%2'
     self unEscape:'a%2Gb'
    "

    "Modified: / 08-07-2009 / 12:40:56 / sr"
    "Modified: / 09-01-2011 / 10:44:50 / cg"
215 |
! |
|
216 |
||
217 |
urlEncode2:aStringOrStream on:ws
    "helper to escape invalid/dangerous characters in an url's arguments.
     Reads all characters from aStringOrStream and writes the encoded form onto ws.
     Similar to urlEncode:on:, but treats '*' and spaces differently
     (both are percent-encoded here).
     (some clients, such as bitTorrent seem to require this - time will tell...)
     Any character not in the set 0-9, a-z, A-Z, '.', '-', '_' is encoded using
     the '%nn' format, where nn is the hexadecimal value of the character's codePoint,
     zero-padded to two digits.
     NOTE(review): characters with a codePoint above 16rFF are not converted to UTF-8 bytes
     here, so they do not fit into two hex digits - callers should pass byte-encoded text.
     see: RFC1738"

    |rs c|

    rs := aStringOrStream readStream.

    [rs atEnd] whileFalse:[
        c := rs next.

        "isLetterOrDigit alone is also true for non-ASCII letters (umlauts etc.),
         which must be encoded - so restrict it to the 7-bit range"
        ((c codePoint < 16r80 and:[ c isLetterOrDigit ])
         or:[ '-_.' includes:c ]) ifTrue:[
            ws nextPut:c
        ] ifFalse:[
            ws nextPut:$%.
            c codePoint printOn:ws base:16 size:2 fill:$0
        ]
    ]

    "Created: / 09-01-2011 / 10:32:27 / cg"
    "Modified: / 09-01-2011 / 13:11:17 / cg"
2058 | 243 |
! |
244 |
||
2500 | 245 |
urlEncode:aStringOrStream on:ws
    "helper to escape invalid/dangerous characters in an url's arguments or post-fields.
     Reads all characters from aStringOrStream and writes the encoded form onto ws.
     Similar to urlEncode2:on:, but treats '*' and spaces differently.
     (some clients, such as bitTorrent seem to require urlEncode2 - time will tell...)
     Any character not in the set 0-9, a-z, A-Z, '.', '-', '_' and '*', is encoded using
     the '%nn' format, where nn is the hexadecimal value of the character's codePoint,
     zero-padded to two digits.
     Spaces are encoded as '+'.
     NOTE(review): characters with a codePoint above 16rFF are not converted to UTF-8 bytes
     here, so they do not fit into two hex digits - callers should pass byte-encoded text.
     see: application/x-www-form-urlencoded
     see: RFC1738"

    |rs c|

    rs := aStringOrStream readStream.

    [rs atEnd] whileFalse:[
        c := rs next.

        "isLetterOrDigit alone is also true for non-ASCII letters (umlauts etc.),
         which must be encoded - so restrict it to the 7-bit range"
        ((c codePoint < 16r80 and:[ c isLetterOrDigit ])
         or:[ '-_.*' includes:c ]) ifTrue:[
            ws nextPut:c
        ] ifFalse:[
            c == Character space ifTrue:[
                ws nextPut:$+
            ] ifFalse:[
                ws nextPut:$%.
                c codePoint printOn:ws base:16 size:2 fill:$0
            ]
        ]
    ]

    "Modified: / 09-01-2011 / 10:43:30 / cg"
|
276 |
! |
|
277 |
||
278 |
urlEncoded2: aString
    "Answer a new string holding aString as encoded by #urlEncode2:on:
     (for use in an url's arguments or post-fields).
     Differs from #urlEncoded: in the handling of '*' and spaces:
     here, letters, digits, '.', '-' and '_' are kept, and every other character -
     including '*', '~' and the space - is written in the '%nn' format,
     where nn is the hexadecimal value of the character.
     (some clients, such as bitTorrent seem to require this - time will tell...)
     see: application/x-www-form-urlencoded
     see: RFC1738"

    |encoded|

    "the input size is only the initial capacity of the stream"
    encoded := String writeStreamWithInitialSize:aString size.
    self urlEncode2:aString on:encoded.
    ^ encoded contents

    "
     self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
     self urlEncoded2:'_-.*Frankfurt(Main) Hbf'

     self unEscape:(self urlEncoded:'-_.*%exept;')
     self urlEncoded2:'-_.*%exept;'
     self urlEncoded:'-_.*%exept;'
    "

    "Created: / 09-01-2011 / 10:34:50 / cg"
|
2500 | 304 |
! |
305 |
||
306 |
urlEncoded: aString
    "Answer a new string holding aString as encoded by #urlEncode:on:
     (for use in an url's arguments or post-fields).
     Differs from #urlEncoded2: in the handling of '*' and spaces:
     here, letters, digits, '.', '-', '_' and '*' are kept, a space is written as '+',
     and every other character is written in the '%nn' format,
     where nn is the hexadecimal value of the character.
     (some clients, such as bitTorrent seem to require urlEncoded2 - time will tell...)
     see: application/x-www-form-urlencoded
     see: RFC1738"

    |encoded|

    "the input size is only the initial capacity of the stream"
    encoded := String writeStreamWithInitialSize:aString size.
    self urlEncode:aString on:encoded.
    ^ encoded contents

    "
     self unEscape:(self urlEncoded:'_-.*Frankfurt(Main) Hbf')
     self urlEncoded:'_-.*Frankfurt(Main) Hbf'

     self unEscape:(self urlEncoded:'-_.*%exept;')
     self urlEncoded:'-_.*%exept;'
    "

    "Modified: / 09-01-2011 / 10:43:37 / cg"
2066 | 332 |
! |
333 |
||
2436
a5537ae7be4a
added: #withAllSpecialHTMLCharactersEscaped:
Claus Gittinger <cg@exept.de>
parents:
2434
diff
changeset
|
334 |
withAllSpecialHTMLCharactersEscaped:aStringOrCharacter
    "Replace ampersand, less, greater, double quote and single quote
     by html-character escapes (&amp; &lt; &gt; &quot; &#39;).
     The argument may be a string or a single character.
     A string containing none of these characters is answered unchanged (the same object);
     in all other cases a new string is answered.
     The single quote is written as the numeric reference &#39;,
     because the named entity for it is not defined in HTML 4."

    "TODO: this is similar to escapeCharacterEntities:.
     We should refactor this into one method only (can we do hex escapes always ?).
     Notice, that these two methods came into existence due to historic reasons
     and were developed independently of each other, but later moved to this common place."

    |resultStream|

    (aStringOrCharacter isString
     and:[ (aStringOrCharacter includesAny:'&<>''"') not ]) ifTrue:[^ aStringOrCharacter].

    resultStream := WriteStream on:''.
    aStringOrCharacter asString do:[:eachCharacter |
        "a hand-written switch on purpose - this method is used heavily"
        eachCharacter == $&
            ifTrue:[ resultStream nextPutAll:'&amp;' ]
            ifFalse:[
                eachCharacter == $<
                    ifTrue:[ resultStream nextPutAll:'&lt;' ]
                    ifFalse:[
                        eachCharacter == $>
                            ifTrue:[ resultStream nextPutAll:'&gt;' ]
                            ifFalse:[
                                eachCharacter == $"
                                    ifTrue:[ resultStream nextPutAll:'&quot;' ]
                                    ifFalse:[
                                        eachCharacter == $'
                                            ifTrue:[ resultStream nextPutAll:'&#39;' ]
                                            ifFalse:[
                                                resultStream nextPut:eachCharacter
                                            ]]]]].
    ].
    ^ resultStream contents

    "
     self withAllSpecialHTMLCharactersEscaped:'<>#&'
     self withAllSpecialHTMLCharactersEscaped:$<
     self withAllSpecialHTMLCharactersEscaped:$#
    "

    "Modified: / 05-12-2006 / 13:48:59 / cg"
    "Modified: / 20-08-2013 / 14:12:20 / sr"
    "Modified (comment): / 01-07-2014 / 12:12:32 / sr"
2436
a5537ae7be4a
added: #withAllSpecialHTMLCharactersEscaped:
Claus Gittinger <cg@exept.de>
parents:
2434
diff
changeset
|
382 |
! |
a5537ae7be4a
added: #withAllSpecialHTMLCharactersEscaped:
Claus Gittinger <cg@exept.de>
parents:
2434
diff
changeset
|
383 |
|
2066 | 384 |
withSpecialHTMLCharactersEscaped:aStringOrCharacter
    "Replace ampersand, less and greater by html-character escapes (&amp; &lt; &gt;).
     The argument may be a string or a single character.
     A non-wide string containing none of these characters is answered unchanged
     (the same object); in all other cases a new string is answered.
     All other characters - including those above 16r7F - are copied as they are
     (an earlier conversion of those into numeric character references has been disabled)."

    "TODO: this is similar to escapeCharacterEntities:.
     We should refactor this into one method only (can we do hex escapes always ?).
     Notice, that these two methods came into existence due to historic reasons
     and were developed independently of each other, but later moved to this common place."

    |resultStream|

    (aStringOrCharacter isString
     and:[ (aStringOrCharacter isWideString not)
     and:[ (aStringOrCharacter includesAny:'&<>') not ]]) ifTrue:[^ aStringOrCharacter].

    resultStream := WriteStream on:''.
    aStringOrCharacter asString do:[:eachCharacter |
        "a hand-written switch on purpose - this method is used heavily"
        eachCharacter == $&
            ifTrue:[ resultStream nextPutAll:'&amp;' ]
            ifFalse:[
                eachCharacter == $<
                    ifTrue:[ resultStream nextPutAll:'&lt;' ]
                    ifFalse:[
                        eachCharacter == $>
                            ifTrue:[ resultStream nextPutAll:'&gt;' ]
                            ifFalse:[
                                resultStream nextPut:eachCharacter
                            ]]].
    ].
    ^ resultStream contents

    "
     self withSpecialHTMLCharactersEscaped:'<>#&'
     self withSpecialHTMLCharactersEscaped:$<
     self withSpecialHTMLCharactersEscaped:$#
    "

    "Modified: / 13-04-2011 / 23:13:32 / cg"
2058 | 433 |
! ! |
434 |
||
435 |
!HTMLUtilities class methodsFor:'serving-helpers'! |
|
436 |
||
437 |
escape:aString
    "helper to escape invalid/dangerous characters in an url's arguments or post-fields.
     Answers a new string, in which:
        %  is written as %%
        control characters, space, '+', ';', '?', '&', DEL and all characters
        above 16r7E are written as %XX, where XX is the hexadecimal codePoint,
        zero-padded to two digits
     All other (printable 7-bit) characters are copied unchanged.
     The result can be converted back with #unEscape:.
     NOTE(review): characters with a codePoint above 16rFF do not fit into two hex digits;
     they are not converted to UTF-8 bytes here."

    |rs ws c|

    rs := ReadStream on:aString.
    ws := WriteStream on:(String new).
    [rs atEnd] whileFalse:[
        c := rs next.
        c == $% ifTrue:[
            ws nextPutAll:'%%'
        ] ifFalse:[
            "16r21..16r7E are the printable 7-bit characters without the space"
            ((c codePoint between:16r21 and:16r7E)
             and:[ ('+;?&' includes:c) not ]) ifTrue:[
                ws nextPut:c
            ] ifFalse:[
                ws nextPut:$%.
                "always two digits - a single digit would be merged with a following
                 hex digit character when decoding"
                c codePoint printOn:ws base:16 size:2 fill:$0
            ]
        ]
    ].
    ^ ws contents

    "
     self escape:'a b'
     self escape:'a%b'
     self escape:'a+b'
     self escape:'aäüöb'
    "
|
471 |
! ! |
|
472 |
||
2144 | 473 |
!HTMLUtilities class methodsFor:'text processing helpers'! |
474 |
||
475 |
plainTextOfHTML:htmlString
    "Given some HTML, answer its raw text.
     The HTML is parsed with an HTMLParser; the text of every text element is
     stripped of surrounding whitespace and - unless empty - appended to the result,
     followed by a single space. Non-text elements are ignored.
     The result is answered as a single byte string if possible.
     Can be used to search for strings in some html text."

    |out document|

    out := CharacterWriteStream on:(String new:100).
    document := HTMLParser new parseText:htmlString.

    document markUpElementsDo:[:element |
        |text|

        "non-text elements are skipped; however, we could care for text
         in info-titles or scripts as well..."
        element isTextElement ifTrue:[
            text := element text withoutSeparators.
            text isEmpty ifFalse:[
                out nextPutAll:text; space
            ]
        ]
    ].
    ^ out contents asSingleByteStringIfPossible

    "
     self plainTextOfHTML:'
     bla1 bla2 <br>bla3 <table><tr><td>bla4</td></tr></table> bla5<p>bla6
     '
    "
|
506 |
! ! |
|
507 |
||
2058 | 508 |
!HTMLUtilities class methodsFor:'documentation'! |
509 |
||
510 |
version |
"Answer the CVS header string (repository path, revision, date and author) of this source file.
 The literal is maintained by the version control system and must not be edited by hand."
|
3312 | 511 |
^ '$Header: /cvs/stx/stx/libbasic2/HTMLUtilities.st,v 1.18 2014-07-01 10:12:38 sr Exp $' |
2434
5625df4b6119
comment/format in: #escapeCharacterEntities:
Claus Gittinger <cg@exept.de>
parents:
2179
diff
changeset
|
512 |
! |
5625df4b6119
comment/format in: #escapeCharacterEntities:
Claus Gittinger <cg@exept.de>
parents:
2179
diff
changeset
|
513 |
|
5625df4b6119
comment/format in: #escapeCharacterEntities:
Claus Gittinger <cg@exept.de>
parents:
2179
diff
changeset
|
514 |
version_CVS |
"Answer the CVS header string (repository path, revision, date and author) of this source file.
 The literal is maintained by the version control system and must not be edited by hand."
3312 | 515 |
^ '$Header: /cvs/stx/stx/libbasic2/HTMLUtilities.st,v 1.18 2014-07-01 10:12:38 sr Exp $' |
2058 | 516 |
! ! |
3098 | 517 |