Thanks! But that gen'd an empty file for me. But I had already mod'd your
script and got it to work for me. I'll send you (Anthony) the script
separately so everyone doesn't get an attachment.

--M

At 08:23 PM 11/13/98 -0500, you wrote:
>
>Further to my previous message:
>
>> 
>>  The structure of the archive file that you have is different from the
>> structure of the archive that script was written to parse. I expected this
>> to be the case when I threw things together so I wrote a "General concept"
>> section to aid in these cases.
>> 
>>  Please reread this section of the script and then look at the
>> if-then-else structure which is used to tell if you are in the body of the
>> message or not.
>> 
>
>  With the structure of the archive that you have at the moment, the
>attached script file (variant-1.txt) should suit your needs. It should
>serve as an example that the script can be modified to suit different
>archive structures.
>
>See 
>
>   http://www.albany.net/~anthonyw/archivedemo/testarea/
>
>for a preview of the output and a differences file that shows the changes 
>that were made.
>
>Hope that helps. 
>
>> 
>> > thanks!
>> > Michael
>> > 
>> > 
>> 
>> 
>
>Regards, 
> 
>Anthony
>#!/usr/local/bin/perl 
>
>##############################################################################
>#
># h2mbx.pl (variant 1) A script to convert hypermail html archives into
># "mailbox"  format.
># 
># As is, no warranty
># 
># Usage:  
>#     ./scriptname hypermail-html-filenames*.html 
>#     cat hypermail-html-filenames*.html | ./scriptname
>#
>##############################################################################
>#
># General concept:
># 
>#   This is an exercise in parsing a file that has logical sections.
># Find out if one is in a particular logical section and act accordingly.
># 
># This script was written against a hypermail generated html file
># which has the following structure:
>#
>#       <!-- received="date-time-stamp" -->
>#       .... information to extract ...
>#       <!-- body="start" -->
>#       .... information to extract ...
>#       <!-- body="end" -->
>#       .... information to ignore ...
>#
># If your hypermail pages have slightly different structure, modify the script
># according to the structure you have in place.
>#
>
>       
>use strict;
>use warnings;
>
>#
># Main program.
>#
># Reads hypermail-generated HTML from the files named on the command line
># (or from standard input), and appends one "mailbox" format message to
># newmailbox.txt for every message found.
>#
># Each input line is handled according to the section it belongs to:
>#
>#   idle    - nothing of interest seen yet
>#   meta    - after <!-- received="..." -->: collect the sent/name/email/
>#             subject/id/inreplyto comments until the <title> line
>#   headers - after <title>: wait for <!-- body="start" -->, then emit
>#             the mail headers built from the collected fields
>#   body    - copy the message text, with the HTML markup removed
>#   tail    - after <!-- body="end" -->: ignore everything until the next
>#             <!-- received="..." --> line
>#
>
># Decode the character entities handled by this script.
># "&amp;" is decoded last so that an escaped literal such as "&amp;lt;"
># becomes "&lt;" rather than "<".
># NOTE(review): &quot; and &amp; are assumed to be escaped by hypermail
># in the same way as &lt; and &gt; -- confirm against your archive.
>sub decode_entities
>{
>    my ($text) = @_;
>
>    $text =~ s/&lt;/</g;
>    $text =~ s/&gt;/>/g;
>    $text =~ s/&quot;/"/g;
>    $text =~ s/&amp;/&/g;
>
>    return $text;
>}
>
># Return an empty set of header fields, keyed by the name used in the
># hypermail comments.  Used to start every message from a clean slate so
># that, for example, the inreplyto value of one message cannot leak into
># the next message.
>sub blank_fields
>{
>    return map { $_ => '' } qw(sent name email subject id inreplyto);
>}
>
># Build the mailbox separator line and the mail headers for one message
># from its collected fields.  Returns the list of lines, ending with the
># single empty line that separates the headers from the body.
>sub mbox_headers
>{
>    my (%field) = @_;
>
>    my @lines = ("\nFrom $field{email} $field{sent}\n");
>
>    if ($field{id} ne '') {
>        push @lines, "Message-id: <$field{id}>\n";
>    }
>
>    push @lines, "Date: $field{sent}\n";
>
>    # When no personal name is available, or the "name" is really an
>    # address, use the bare address only.
>    if ($field{name} eq '' || $field{name} =~ /\@/) {
>        push @lines, "From: $field{email}\n";
>    } else {
>        push @lines, "From: $field{name} <$field{email}>\n";
>    }
>
>    push @lines, "Subject: $field{subject}\n";
>
>    if ($field{inreplyto} ne '') {
>        push @lines, "In-Reply-to: <$field{inreplyto}>\n";
>    }
>
>    push @lines, "\n";
>
>    return @lines;
>}
>
>my $outfile = 'newmailbox.txt';
>
># Open and append to our output file
>open(my $out, '>>', $outfile)
>    or die "Cannot open $outfile for appending: $!\n";
>
>my $state = 'idle';
>my %field = blank_fields();
>my @body  = ();                     # lines of the message being collected
>
>while (my $line = <>)
>{
>    #
>    # Find out if we are entering a new Start section
>    #
>
>    if ($line =~ /<!-- received="/)
>    {
>       # Print the current message buffer, then start a new message
>       if (@body) {
>           print {$out} @body or die "Cannot write to $outfile: $!\n";
>       }
>
>       @body  = ();
>       %field = blank_fields();
>       $state = 'meta';
>       next;
>    }
>
>    if ($state eq 'meta')
>    {
>       chomp $line;
>
>       # The value runs up to the closing '" -->' (or to the end of the
>       # line if that is missing), so a value containing '="' survives.
>       if ($line =~ /<!-- (sent|name|email|subject|id|inreplyto)="(.*?)(?:" -->.*)?$/)
>       {
>           $field{$1} = decode_entities($2);
>       }
>       elsif ($line =~ /<title>/)
>       {
>           $state = 'headers';
>       }
>       next;
>    }
>
>    if ($state eq 'headers')
>    {
>       # Look for the body start line
>       if ($line =~ /<!-- body="start/)
>       {
>           push @body, mbox_headers(%field);
>           $state = 'body';
>       }
>       next;
>    }
>
>    next unless $state eq 'body';
>
>    if ($line =~ /<!-- body="end" -->/)
>    {
>       $state = 'tail';
>       next;
>    }
>    next if $line =~ /<h1><center>/;
>    next if $line =~ m{</center>};
>
>    # Replace links by their link text.  The non-greedy match keeps two
>    # links on the same line apart.
>    $line =~ s{<a href="[^"]*">(.*?)</a>}{$1}gi;
>
>    $line =~ s{</?pre>}{}gi;         # remove pre
>    $line =~ s{</?i>}{}gi;           # remove italics
>    $line =~ s{</?b>}{}gi;           # remove bolds
>    $line =~ s{<br>}{}gi;            # remove linebreaks
>    $line =~ s{<hr[^>]*>}{}gi;       # hr's
>    $line =~ s{<p>}{}gi;             # a <p> line becomes an empty line
>
>    # Decode only after the markup is gone, so that text such as
>    # "&lt;b&gt;" in a message is kept as "<b>" instead of being removed.
>    $line = decode_entities($line);
>
>    $line =~ s/^From />From /;       # Watch out for forwarded or quoted mail.
>
>    # Collect the current line
>
>    push @body, $line;
>}
>
>if (@body) {
>    print {$out} @body or die "Cannot write to $outfile: $!\n";
>}
>
>close($out) or die "Cannot close $outfile: $!\n";
>
>print "Processing complete\n";
>
>exit;

Reply via email to