#!/usr/bin/perl -w use strict; my ($From, %headers, $body); my $quiet = 1;
# 1. input: a mail message on STDIN # stashes the three parts of the message into global vars: $From, %headers, $body &get_input; # 2. PROCESS / MUNGE &process_body; # 3. output: a fixed-up mail message to STDOUT # also writes file at 'NOTEDIR/msgid' , this is used by mhonarc to produce META DESCRIPTION print &whole_msg; &generate_note; # # ---------- SUBS ---------------- # sub process_body { # unused at this point - obviated by the later line: # $body =~ s/\n\s\s+(\S)/\n\n$1/mg; # my $SO_MANY_LINES_INDICATES_NO_LINE_BREAKS = 25; # INPUT - global variable $body # OUTPUT - global variable $body # We do 3 things to cleanup the body of the message: # 1. We change patterns in the text on a line by line basis, ensuring logical # chunks get seperated by newlines and translating the machine markup like '=20' and '=' # into newlines or spaces, as appropriate by the context. # 2. We look at chunks (paragraphs), as seperated by blank lines. # Each paragraph is inspected, and it is either 'flow-wrapped' or 'preserve-line-breaks' # 3. We do final cleanup. # 1. LINE BY LINE fixing # some emails have =20 on its own line, this is a seperator, replace with \n\n $body =~ s/\n=20\n/\n\n/mg; # some emails have =20 and then a blank line, this is a sep, replace with \n\n $body =~ s/=20\n\n/\n\n/mg; # some emails have =20 and then a line with text. # replace this with a space to join lines into paragr. $body =~ s/=20\n(\w)/ $1/mg; # some emails have =20 abutting right next to text - just delete the =20 $body =~ s/(\w)=20\n/$1\n/mg; # some emails have '=' at the end of the line, and then a \n. join this to the next line $body =~ s/=\n/ /mg; # ensure lines that are ALL UPPERCASE are seperated from others $body =~ s/(\n[[:upper:]]+\n)/\n$1/mg; # ensure lines that are all '--------' are seperated from others $body =~ s/(\n(-+)\n)/\n$1\n/mg; # replace three or more blank lines with a two blank lines - we don't need more than 2. $body =~ s/\n(\n)+/\n\n/mg; # ensure indented paragraphs get are seperated from the paragraph above them, with a newline $body =~ s/\n\s\s+(\S)/\n\n$1/mg; # 2. PARAGRAPH BY PARAGRAPH FIXING my @paras = split(/(\n\n)/, $body); my $para; $body = ''; my $SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL = 46; PARAGRAPH: foreach $para (@paras) { $para =~ s/^\n//; # $para =~ s/\n$//; my @lines_in_para = split(/\n/, $para); my $num_lines = 1; if ($#lines_in_para) { $num_lines = 1 + $#lines_in_para; } my $firstline = $lines_in_para[0]; $firstline = '' unless $firstline; my $length_of_first_line = length( $firstline ); my $secondline = ''; if ($num_lines > 1) { $secondline = $lines_in_para[1]; } ; my $length_of_second_line = length( $secondline ); d ("---","---"); d ("num_lines", $num_lines); d ("firstline", $firstline); d ("length of first line", $length_of_first_line); d ("secondline", $secondline); d ("length of second line", $length_of_second_line); # CASE 1: paragraph is just a blank line -> preserve break if ($length_of_first_line < 1) { d("CASE", "empty"); d("para", $para); $body .= "\n"; next PARAGRAPH } # CASE 2: first line and second line in paragraph is quite short, # preserve formatting by adding line break after each line elsif ( ($length_of_first_line < $SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL) and ($length_of_second_line < $SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL) ) { d("CASE ", "blockquote para"); for (@lines_in_para) { $body .= '>' . $_ . "\n"; } next PARAGRAPH; } # CASE 3: first line but not second line in paragraph is quite short, # preserve formatting of first line, seperate the rest. elsif ( ($length_of_first_line < $SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL) and ($length_of_second_line >= $SO_FEW_CHARACTERS_IN_LINE_INDICATES_SPECIAL) ) { d("CASE", "title line, then regular para"); my $i = 1; $body .= $firstline . "\n\n"; my ($j) = $num_lines - 1; for ($i .. $j) { $body .= $lines_in_para[$_] . "\n" ; } ; next PARAGRAPH; } else { d("CASE", "exact"); $body .= $para . "\n\n"; next PARAGRAPH; } } # 3. FINAL CLEANUP $body =~ s/^From/ From/mg; $body =~ s/\n(\n)+/\n\n/mg; $body =~ s/\>\n/\n/mg; } sub generate_note { # the only configuration you _need_ is to change $myhome my $myhome = '/home/wfnorg'; my $sep = '/'; my $notedir = $myhome . $sep . 'notes'; -d $notedir or die "please create $notedir"; my ($overwrite); # will overwrite any existing notes if 1 my ($quiet); # will be quiet about warning if 1 $quiet = 0; my (%note_fields); # find the Message-ID and stash in the the %note_fields array while (my ($key,$value) = each %headers) { $note_fields{ message } = $value if $key =~ /^message-id/i; $note_fields{ msg } = $value if $key =~ /^msg-id/i; $note_fields{ content } = $value if $key =~ /^content-id/i; } # -------------- C) msgid cleanup my ($msgid); $msgid = $note_fields{message} || $note_fields{msg} || $note_fields{content}; if (defined($msgid)) { if ($msgid =~ /<([^>]*)>/) { $msgid = $1; } else { $msgid =~ s/^\s+//; $msgid =~ s/\s+$//; } } else { # create bogus ID if none exists eval { # create message-id using md5 digest of header; # can potentially skip over already archived messages w/o id require Digest::MD5; $msgid = join("", Digest::MD5::md5_hex(join '', values %headers), '@NO-ID-FOUND.mhonarc.org'); }; if ($@) { # unable to require, so create arbitary message-id $msgid = join("", $$, '.', time, '.', $_, '@NO-ID-FOUND.mhonarc.org'); } $headers{ 'Message-ID' } = $msgid; } my $note = ''; my @paras = split(/(\n\n)/, $body); foreach my $para (@paras) { $_ = $para; next if ( length($note) gt 300 ); # grab following paragraph if we have one short but good paragraph. next unless ( $note or (! /^[^ ]: / and /(\.|\?)"?\s*$/ )); $note .= $_; $note =~ s/---+[^-]*---+//g; $note =~ s (\<|\>|'|") ()g; } # -------------- E) write $notedir/$msgid my $notefile = $notedir . $sep . msgid_to_filename($msgid); # sanity check if (-e $notefile and ! $overwrite ) { print STDERR "$notefile ... exists\n" unless $quiet; exit; } else { open (NOTE, ">" . $notefile) or die "could not create $notefile"; print NOTE substr ( $note, 0,600); close (NOTE) or die "could not create $notefile"; } } ; sub get_input { my ($inHeader, $cur_header, $cur_header_value); $inHeader = 1; while (<>) { if ($inHeader) { chomp; # Case 1 - the initial line, like # From [EMAIL PROTECTED] Wed Mar 1 12:40:06 PST 2006 if (/^From\s/) { $From = $_; } # Case 2 - a key-value pair of the header, like # From: <[EMAIL PROTECTED]> elsif (/^\S+:/) { &imprint_header($cur_header, $cur_header_value);; ($cur_header, $cur_header_value) = split (/: /, $_, 2); } # Case 3 - a 'flow-over' line in the header, like a long Subject: line. elsif (/\s+\S/) { chomp; s/^\s+/ /g; $cur_header_value .= $_; } # Case 4 - blank line - the end of the header if (/^$/) { &imprint_header($cur_header, $cur_header_value);; $inHeader = 0; } } # inHeader else { $body .= $_; } ; } } # ---------------------------------------------------------------- # UTILITY SUBS # # combines the three parts of the message ($From, %headers, $body) into a string. sub whole_msg { my ($whole_msg); $whole_msg = "$From\n"; while (my ($key,$value) = each %headers) { $whole_msg .= "$key: $value\n"; } $whole_msg .= "\n" . $body; return $whole_msg; } sub imprint_header ($$) { my ($key, $value) = @_; if ($key) { $headers{$key} .= $value; } } sub msgid_to_filename { my $msgid = shift; $msgid =~ s/([EMAIL PROTECTED])/sprintf("=%02X",unpack("C",$1))/geo; $msgid; } sub d{ my ($label, $value) = @_; return if $quiet; print STDERR "$label: $value\n"; } __END__