Thanks! But that gen'd an empty file for me. But I had already mod'd your
script and got it to work for me. I'll send you (Anthony) the script
separately so everyone doesn't get an attachment.
--M
At 08:23 PM 11/13/98 -0500, you wrote:
>
>Further to my previous message:
>
>>
>> The structure of the archive file that you have is different from the
>> structure of the archive that script was written to parse. I expected this
>> to be the case when I threw things together so I wrote a "General concept"
>> section to aid in these cases.
>>
>> Please reread this section of the script and then look at the
>> if-then-else structure which is used to tell if you are in the body of the
>> message or not.
>>
>
> With the structure of the archive that you have at the moment, the
>attached script file (variant-1.txt) should suit your needs. It should
>serve as an example that the script can be modified to suit different
>archive structures.
>
>See
>
> http://www.albany.net/~anthonyw/archivedemo/testarea/
>
>for a preview of the output and a differences file that shows the changes
>that were made.
>
>Hope that helps.
>
>>
>> > thanks!
>> > Michael
>> >
>> >
>>
>>
>
>Regards,
>
>Anthony
>#!/usr/local/bin/perl
>
>##############################################################################
>#
># h2mbx.pl (variant 1) A script to convert hypermail html archives into
># "mailbox" format.
>#
># As is, no warranty
>#
># Usage:
># ./scriptname hypermail-html-filenames*.html
># cat hypermail-html-filenames*.html |./scriptname
>#
>##############################################################################
>#
># General concept:
>#
># This is an exercise in parsing a file that has logical sections.
># Find out if one is in a particular logical section and act accordingly.
>#
># This script was written against a hypermail generated html file
># which has the following structure:
>#
># <!-- received="date-time-stamp" -->
># .... information to extract ...
># <!-- body="start" -->
># .... information to extract ...
># <!-- body="end" -->
># .... information to ignore ...
>#
># If your hypermail pages have slightly different structure, modify the script
># according to the structure you have in place.
>#
>
>
>use strict;
>use warnings;
>
># Convert hypermail HTML pages (named on the command line or piped on
># STDIN) into mailbox format.  Messages are appended to $mailbox, so
># repeated runs accumulate into the same file.
>
>my $mailbox = "newmailbox.txt";
>
># Logical section of the page that the current line belongs to:
>#   ''        - nothing seen yet (before the first received marker)
>#   'meta'    - after <!-- received="...", collecting <!-- key="value" --> lines
>#   'headers' - after <title>, waiting for <!-- body="start"
>#   'body'    - inside the message text
>#   'tail'    - after <!-- body="end" -->, ignored until the next message
>my $section = '';
>
># Per-message values taken from the <!-- key="value" --> comment lines.
>my @fields   = qw(sent name email subject id inreplyto);
>my $field_re = join '|', @fields;
>my %field;
>@field{@fields} = ('') x @fields;
>
>my @body = ();    # Buffered output lines of the message being assembled
>
>open(my $out, '>>', $mailbox)
>    or die "Cannot open $mailbox for appending: $!\n";
>
>while (my $line = <>) {
>    $line =~ s/&gt;/>/g;    # decode &gt;
>    $line =~ s/&lt;/</g;    # decode &lt;
>
>    #
>    # A received marker starts a new message: flush the previous one and
>    # forget its fields so that nothing leaks into the next message.
>    #
>    if ($line =~ /<!-- received="/) {
>        print {$out} @body;
>        @body = ();
>        @field{@fields} = ('') x @fields;
>        $section = 'meta';
>        next;
>    }
>
>    if ($section eq 'meta') {
>        # The value runs from the opening quote up to the first '" -->',
>        # or to the end of the line when the closing marker is missing.
>        # '.' never matches the newline, so no chop/chomp is required.
>        if ($line =~ /<!-- ($field_re)="(.*?)(?:" -->.*)?$/) {
>            $field{$1} = $2;
>        }
>        elsif ($line =~ /<title>/) {
>            $section = 'headers';
>        }
>        next;
>    }
>
>    if ($section eq 'headers') {
>        if ($line =~ /<!-- body="start/) {    # Look for the body start line
>            $section = 'body';
>
>            # mbox separator line, then the RFC 822 style headers.
>            push @body, "\nFrom $field{email} $field{sent}\n";
>
>            if ($field{id} ne '') {
>                push @body, "Message-id: <$field{id}>\n";
>            }
>
>            push @body, "Date: $field{sent}\n";
>
>            # When the personal name is missing or is itself an address,
>            # emit the bare address only.
>            if ($field{name} eq '' || $field{name} =~ /\@/) {
>                push @body, "From: $field{email}\n";
>            }
>            else {
>                push @body, "From: $field{name} <$field{email}>\n";
>            }
>
>            push @body, "Subject: $field{subject}\n";
>
>            if ($field{inreplyto} ne '') {
>                push @body, "In-Reply-to: <$field{inreplyto}>\n";
>            }
>
>            # Exactly one blank line separates the headers from the body.
>            push @body, "\n";
>        }
>        next;
>    }
>
>    # Lines before the first message and lines in the tail are ignored.
>    next unless $section eq 'body';
>
>    if ($line =~ /<!-- body="end" -->/) {
>        $section = 'tail';
>        next;
>    }
>    next if $line =~ /<h1><center>/;
>    next if $line =~ m{</center>};
>
>    # Replace each link with its link text.  The non-greedy groups keep
>    # several links on one line from being merged into a single match.
>    $line =~ s{<a href="[^"]*">(.*?)</a>}{$1}g;
>
>    $line =~ s{</?(?:pre|i|b)>|<br>}{}g;    # remove pre, italics, bolds, linebreaks
>    $line =~ s{<hr[^>]*>}{}g;               # remove hr's (only up to the tag's own '>')
>    $line =~ s/&gt;/>/g;                    # decode &gt;
>    $line =~ s/&lt;/</g;                    # decode &lt;
>    $line =~ s{<p>}{}g;                     # remove <p> (the tag is dropped, not replaced)
>
>    $line =~ s/^From />From /;    # Watch out for forwarded or quoted mail.
>
>    # Collect the current line
>    push @body, $line;
>}
>
># Flush the last message; buffered write errors surface at close.
>print {$out} @body;
>close($out) or die "Cannot close $mailbox: $!\n";
>
>print "Processing complete\n";
>
>exit;