Further to my previous message:

> 
>  The structure of the archive file that you have is different from the
> structure of the archive that script was written to parse. I expected this
> to be the case when I threw things together so I wrote a "General concept"
> section to aid in these cases.
> 
>  Please reread this section of the script and then look at the
> if-then-else structure which is used to tell if you are in the body of the
> message or not.
> 

  With the structure of the archive that you have at the moment, the
attached script file (variant-1.txt) should suit your needs. It should
serve as an example that the script can be modified to suit different
archive structures.

See 

   http://www.albany.net/~anthonyw/archivedemo/testarea/

for a preview of the output and a differences file that shows the changes 
that were made.

Hope that helps. 

> 
> > thanks!
> > Michael
> > 
> > 
> 
> 

Regards, 
 
Anthony
#!/usr/local/bin/perl 

##############################################################################
#
# h2mbx.pl (variant 1) A script to convert hypermail html archives into
# "mailbox"  format.
# 
# As is, no warranty
# 
# Usage:  
#     ./scriptname hypermail-html-filenames*.html 
#     cat hypermail-html-filename*.html |./scriptname
#
##############################################################################
#
# General concept:
# 
#   This is an exercise in parsing a file that has logical sections.
# Find out if one is in a particular logical section and act accordingly.
# 
# This script was written against a hypermail generated html file
# which has the following structure:
#
#       <!-- received="date-time-stamp" -->
#       .... information to extract ...
#       <!-- body="start" -->
#       .... information to extract ...
#       <!-- body="end" -->
#       .... information to ignore ...
#
# If your hypermail pages have slightly different structure, modify the script
# according to the structure you have in place.
#

       
use strict;
use warnings;

# Mailbox file that converted messages are appended to.
my $OUTPUT_FILE = 'newmailbox.txt';

# Names of the hypermail "<!-- field="value" -->" comments that carry the
# message metadata, in the order in which a line is tested against them.
my @META_FIELDS = qw(sent name email subject id inreplyto);

# meta_value($line, $field)
#
# Returns the value of a hypermail metadata comment of the form
#     <!-- field="value" -->
# found in $line, or undef when $line holds no such comment.  The value runs
# from the first quote after the field name up to the closing '" -->', so a
# value that itself contains '="' is returned intact.
sub meta_value {
    my ($line, $field) = @_;

    return unless $line =~ /<!-- \Q$field\E="(.*)/;

    my $value = $1;
    $value =~ s/" -->.*//;    # drop the comment terminator and what follows
    return $value;
}

# format_headers(%msg)
#
# Builds the mailbox envelope line and the RFC 822 style header lines for one
# message.  %msg is keyed by the hypermail field names listed in
# @META_FIELDS; missing keys are treated as empty strings.  Returns the list
# of lines, ending with the single blank line that separates the headers
# from the body.
sub format_headers {
    my %msg = ((map { $_ => '' } @META_FIELDS), @_);

    my @headers = ("\nFrom $msg{email} $msg{sent}\n");

    push @headers, "Message-id: <$msg{id}>\n" if $msg{id} ne '';
    push @headers, "Date: $msg{sent}\n";

    # When the "name" field is itself an address, emit the address alone
    # rather than repeating it as a personal name.
    if ($msg{name} =~ /\@/) {
        push @headers, "From: $msg{email}\n";
    }
    else {
        push @headers, "From: $msg{name} <$msg{email}>\n";
    }

    push @headers, "Subject: $msg{subject}\n";
    push @headers, "In-Reply-to: <$msg{inreplyto}>\n" if $msg{inreplyto} ne '';

    push @headers, "\n";    # exactly one blank line ends the header block
    return @headers;
}

# clean_body_line($line)
#
# Returns $line (one line of the hypermail message body) with the HTML
# markup that hypermail adds removed, entities for '<' and '>' decoded, and
# a leading "From " quoted so it cannot be mistaken for a message separator.
sub clean_body_line {
    my ($line) = @_;

    # Replace each link by its text.  The non-greedy match keeps the text
    # between two links on the same line.
    $line =~ s/<a href="[^"]*">(.*?)<\/a>/$1/g;

    $line =~ s/<\/?pre>//g;         # remove pre
    $line =~ s/<\/?i>//g;           # remove italics
    $line =~ s/<br>//g;             # remove linebreaks
    $line =~ s/<\/?b>//g;           # remove bolds
    $line =~ s/<hr\b[^>]*>//g;      # remove hr's, and only the tag itself
    $line =~ s/&gt;/>/g;            # decode >
    $line =~ s/&lt;/</g;            # decode <
    $line =~ s/<p>//g;              # remove paragraph marks

    $line =~ s/^From />From /;      # watch out for forwarded or quoted mail

    return $line;
}

# main()
#
# Reads hypermail HTML pages from the files named on the command line (or
# from standard input) and appends the messages they contain to
# $OUTPUT_FILE in mailbox format.  Dies if the output file cannot be opened
# or closed.
#
# Each input line is handled according to the section it falls in:
#   skip    - before the first <!-- received="..." --> marker
#   meta    - after the received marker: collect the metadata comments
#   headers - after <title>: wait for <!-- body="start" -->
#   body    - collect cleaned-up body lines until <!-- body="end" -->
#   tail    - after the body: ignore everything until the next message
sub main {
    open(my $out, '>>', $OUTPUT_FILE)
        or die "Cannot open $OUTPUT_FILE for appending: $!\n";

    my $state  = 'skip';
    my %msg    = map { $_ => '' } @META_FIELDS;
    my @buffer = ();                # lines of the message being assembled

    LINE:
    while (my $line = <>) {
        $line =~ s/&gt;/>/g;        # decode >
        $line =~ s/&lt;/</g;        # decode <

        # A received marker starts a new message: flush the previous one and
        # forget its fields so that none of them leak into this message.
        if ($line =~ /<!-- received="/) {
            print {$out} @buffer;
            @buffer = ();
            %msg    = map { $_ => '' } @META_FIELDS;
            $state  = 'meta';
            next LINE;
        }

        if ($state eq 'meta') {
            $line =~ s/\r?\n\z//;   # strip the line ending, LF or CRLF

            for my $field (@META_FIELDS) {
                my $value = meta_value($line, $field);
                next unless defined $value;
                $msg{$field} = $value;
                next LINE;
            }

            $state = 'headers' if $line =~ /<title>/;
            next LINE;
        }

        if ($state eq 'headers') {
            if ($line =~ /<!-- body="start/) {
                push @buffer, format_headers(%msg);
                $state = 'body';
            }
            next LINE;
        }

        if ($state eq 'body') {
            if ($line =~ /<!-- body="end" -->/) {
                $state = 'tail';
                next LINE;
            }
            next LINE if $line =~ /<h1><center>/;
            next LINE if $line =~ /<\/center>/;

            push @buffer, clean_body_line($line);
        }
    }

    print {$out} @buffer;           # flush the last message

    close($out)
        or die "Cannot finish writing $OUTPUT_FILE: $!\n";

    print "Processing complete\n";
    return;
}

# Run the conversion only when this file is executed as a script, so that the
# helper subs above can be loaded and exercised on their own.
main() unless caller;

Reply via email to