Hi Donald,

attached is the html-to-text engine I used in my %browser.r,
it's a little rough at some edges, but might keep you going ...


kind regards,

Ingo


Once upon a time Donald Dalley spoketh thus:
> 
> Hi, Bo:
> 
> I have a question about this e-mail. As you can see below, the line breaks are
> not right. The lines that have a single quote (") on it probably had a NEWLINE
> of some sort on the previous line. This causes an error when run, so what is
> really supposed to be between the two quote marks?
> 
> I figured out how to get it to run OK, but, for my purposes, the retained text
> needs much better formatting - close to how the author wanted to format it.
> Using an optional arg for the max-linelength would also help. If someone knows
> how and is willing to do this, I would dearly like to see an improved version.
> Anyone who has ever used a good HTML stripper, such as the excellent HTTX
> (Amiga), knows how useful they can be, when called by other programs.
> 
> Thanks for the seed, Bo!
> 



-- Attached file included as plaintext by Listar --

REBOL [
   Title: "HTML to Text Converter"
   File: %html-to-text.r
   Date: 2000-06-10
   Author: "Ingo Hohmann"
   Email:  [EMAIL PROTECTED]
   Site:   http://www.2b1.de/
   Rights: "(c) Ingo Hohmann"
   Purpose: {Create text from html}
   Comments: {
      extracted from my browser.r 
      (which should be updated to current /View, btw.)
   }
]

   html: make object! [
      help: {A html parser}
      evaluate: false
      read-error: none
      skip: false
      spaces: charset " ^-^/"
      non-spaces: complement spaces
      delimiters: charset { ^-^/="}
      non-delimiters: complement delimiters

      html-source: copy ""
      get-html: func [][ return html-source ]

      find-base: func [ url [url! file!] /local u2][
         if #"/" = last url [ return url ]
         if  exists? u2: to-url rejoin [ url "/" ] [
            return u2
         ]
         first split-path url
      ]

      conv-list: [ "&amp;" "&" "&lt;" "<" "&gt;" ">" "&quot;" {"}
         "&auml;" "�" "&Auml;" "�" "&ouml;" "�" "&Ouml;" "�" "&uuml;" "�"
         "&Uuml" "�" "&szlig;" "�" "&nbsp;" " "]

      clean: func [
         {Converts html-entities to special-characters}
         text [string!]
         /local special entity
      ] [
         foreach [special entity] conv-list [
            replace/all text special entity
         ]
         text
      ]

      parse-tag: func [
         {parses a tag, returns block of tag-name arguments}
         tag /local tag-name tag-params] [
         name-rule: [ some non-delimiters ]
         param-rule: [
            any spaces [
               copy param-name name-rule (append tag-params param-name)
               any spaces [
                  "=" any spaces [
                     {"} copy param-val to {"} skip |
                     {'} copy param-val to {'} skip |
                     copy param-val some non-delimiters skip
                  ]
                     (append tag-params param-val) | (append tag-params true)
               ]
            ]
         ]
         tag-params: copy []
         parse/all tag [ copy tag-name name-rule any param-rule ]
         compose [ (tag-name) (tag-params) ]
      ]

      read: func [
         {read url and return the page as ...}
         url [url! file!]
         /html "html source" /text "text" /links "link-list"
         /local data txt lnk return-block
      ] [
         return-block: copy []
         either error? err: try [data: read url] [
            read-error: disarm err
         ] [
            read-error: none
            if html [ append return-block data ]
            if any [ txt links ] [
               set [txt lnk] to-text url data
               if text [ append return-block txt ]
               if links [ append/only return-block lnk ]
            ]
            print dir? url
         ]
      ]

      to-text: func [ 
         {Convert html to text, url is needed for handling of relative urls}
         url [url! file!] html [string!]
         /local elem txt links link lfd link-blk pos end-pos the-script script-funcs
      ] [
         script-funcs: make object! [
            print: func [val][ insert pos load/markup form join val newline ]
            prin: func [val][ insert pos load/markup form val ]
         ]
         url: find-base url
         links: copy []
         link-blk: copy []
         html-source: copy html
         html: load/markup html
         txt: make string! 500
         lfd: 0
         parse html [
            some [
               set elem string! (
                  if not skip [
                     if 0 < length? trim/lines elem [
                        append txt rejoin [ elem " "]
                     ]
                  ]
               ) |
               pos: set elem tag! (
                  elem: parse-tag elem
                  switch first elem [
                     "a" [
                        lfd: lfd + 1
                        append txt rejoin [ "(" lfd ")" ]
                        elem: select elem "href"
                        if elem [
                           if all [ not find elem "://" not find elem "mailto:"] [ ; 
<a name="top"> ???
                              elem: rejoin [ url elem ]
                           ]
                           
                           append links compose [ (lfd) (elem)]
                        ]
                     ]
                     "img" [
                        either elem: select elem "alt" [
                           append txt rejoin [ "[" elem "]" ]
                        ] [
                           append txt "[graphic]"
                        ]
                     ]
                     "p"  [append txt "^/^/"]
                     "br" [append txt newline]
                     "hr" [append txt "^/------------------------------------^/" ]
                     "li" [append txt "^/* "]
                     "ul" [append txt newline]
                     "/ul" [append txt newline]
                     "ol" [append txt newline]
                     "/ol" [append txt newline]
                     "div" [append txt newline]
                     "/div" [append txt newline]
                     "blockquote" [append txt newline]
                     "/blockquote" [append txt newline]
                     "style" [skip: true]
                     "/style" [skip: false]
                     "pre" [
                        end-pos: find pos </pre>
                        append txt rejoin [ newline copy/part next pos end-pos ]
                        pos: end-pos
                     ]
                     "script" [
                        either all [ evaluate "rebol" = select elem "language" ] [
                           end-pos: find pos </script>
                           the-script: copy/part next pos end-pos
                           remove/part pos next end-pos
                           if error? err: try [
                              do bind load rejoin the-script in script-funcs 'print
                           ] [
                              ;inform layout [
                              ;   subtitle red "Error in script !"
                              ;   text mold disarm err
                              ;]
                              print ["Error in Script: ^/" mold disarm err]
                           ]
                           pos: back pos
                        ][
                           skip: true
                        ]
                     ]   ; do bind load rejoin n in t 'print
                     "/script" [skip: false]
                     "/title" [append txt "^/^/"]
                  ]
                  pos: next pos
               )
               :pos
            ]
         ]
         txt: clean txt
         append txt "^/^/^/The links:^/-------^/^/"
         foreach [lfd link] links [ append txt rejoin [ lfd "  " link newline]]
         foreach [lfd link] links [ if not find link "mailto:" [append link-blk rejoin 
[ lfd "  " link]]]
         return compose/deep [ (txt) [(link-blk)]]
      ]
   ]


-- 
To unsubscribe from this list, please send an email to
[EMAIL PROTECTED] with "unsubscribe" in the 
subject, without the quotes.

Reply via email to