HTMLUtilities.st
changeset 4737 610d483cb00a
parent 4712 530912590b7f
child 4924 b171682381a1
equal deleted inserted replaced
4736:c5a3e2dae276 4737:610d483cb00a
   800     "given some HTML, extract the raw text. 
   800     "given some HTML, extract the raw text. 
   801      Can be used to search for strings in some html text."
   801      Can be used to search for strings in some html text."
   802 
   802 
   803     |parser doc s first|
   803     |parser doc s first|
   804 
   804 
   805 
       
   806     parser := HTMLParser new.
   805     parser := HTMLParser new.
   807     doc := parser parseText:htmlString.
   806     doc := parser parseText:htmlString.
   808     s := CharacterWriteStream on:(String new:100).
   807     s := CharacterWriteStream on:(String new:100).
   809     first := true.
   808     first := true.
   810     doc markUpElementsDo:[:el |
   809     doc markUpElementsDo:[:el |
   826     ].
   825     ].
   827     ^ s contents
   826     ^ s contents
   828 
   827 
   829     "
   828     "
   830      self plainTextOfHTML:'
   829      self plainTextOfHTML:'
   831 bla1 bla2 <br>bla3 <table><tr><td>bla4</td></tr></table> bla5<p>bla6
   830             bla1 bla2 <br>bla3 <table><tr><td>bla4</td></tr></table> bla5<p>bla6'
   832 '        
   831      self plainTextOfHTML:'Hello World'        
   833     "
   832     "
   834 
   833 
   835     "Modified: / 06-05-2015 / 17:02:36 / sr"
   834     "Modified: / 06-05-2015 / 17:02:36 / sr"
   836 ! !
   835 ! !
   837 
   836