#!/usr/local/bin/perl.exe -w
use strict;
# Laola2html.pl
# hacked up by Greg Holmes, 3 July 2001
# based on
#   pdf2html.pl
#   Version 1.0	25-May-2001
#   Written by David Adams <d.j.adams@soton.ac.uk>
#
# Uses laola (specifically lhalw and ldat)
# to read an MS-Word file and produce HTML output.
#  
# Can be called directly from htdig as an external converter,
#  or may be called by doc2html.pl converter script. 
#

####--- Configuration ---####
# Full paths of lhalw and ldat
# (get the laola library from http://user.cs.tu-berlin.de/~schwartz/pmh/):

#### YOU MUST SET THESE  ####

my $DOCTOTEXT = "/usr/bin/lhalw";
my $DOCINFO = "/usr/bin/ldat";
#
# De-hyphenation option (only affects end-of-line hyphens):
my $Dehyphenate = 1;
#
# Set title to be used when none is found:
my $Default_title = "Microsoft Word Document";
#
# Null device that ldat's stderr is redirected to; its name differs
# between the win32 platform and unix:
my $null = ($^O eq "MSWin32") ? "nul" : "/dev/null";
####--- End of configuration ---###

# lhalw supplies the document text, so give up at once if it cannot be run.
if (! -x $DOCTOTEXT) { die "Unable to execute lhalw" }

# Command line: filename [mime-type] [URL]
# The file name is tested for presence rather than truth, so that a file
# which really is called "0" is still accepted.
my $Input = $ARGV[0];
if (!defined $Input or !length $Input) {
  die "Usage: laola2html.pl filename [mime-type] [URL]";
}

# The MIME type is optional, but when given it must be MS-Word.
my $MIME_type = $ARGV[1] || '';
if ($MIME_type and ($MIME_type !~ m#^application/msword#i)) {
  die "MIME/type $MIME_type wrong";
}

# Keep only the last path component of the URL; doc_head uses it as a
# fallback title when the document has none.
my $Name = $ARGV[2] || '';
$Name =~ s#^.*/##;

doc_head();
doc_body();
exit;

#------------------------------------------------------------------------------

sub doc_head {
#
#  Contributed to pdf2html by Greg Holmes and Michael Fuller
#   (any errors by David Adams)
#   (any new errors by Greg Holmes)
#
# Prints the opening <HTML> tag and the complete <HEAD> section.
#
# Runs "$DOCINFO -a -d" on $Input and picks the title, subject and
# keywords out of its output.  The subject becomes the DESCRIPTION meta
# tag and the keywords the KEYWORDS meta tag; either is left out when
# empty.  When no title is found, "[$Name]" is used, or $Default_title
# if $Name is empty too.
#
# Takes no arguments; reads the file-level $DOCINFO, $Input, $null,
# $Name and $Default_title.  Only warns if ldat cannot be started.
#
    my %meta = (title => '', subject => '', keywords => '');

    # The file name sits inside single quotes in a shell command, so each
    # embedded single quote has to be closed, escaped and re-opened ('\'')
    # or it would end the quoting early (POSIX shell quoting rules).
    (my $quoted_input = $Input) =~ s/'/'\\''/g;

    if (open(my $info, "$DOCINFO -a -d '$quoted_input' 2>$null |")) {
        while (my $line = <$info>) {
            # Fields are tried in this order and the first hit wins.
            FIELD: for my $field (qw(title subject keywords)) {
                next FIELD unless $line =~ m/\"$field\"/i;

                # Drop everything up to and including the opening quote
                # of the string value.
                $line =~ s/^.*\(string\)\s+\"//i;

                # Drop the closing quote.  Prefer the one at the end of
                # the line, so that a quote followed by a space inside
                # the value is left alone; otherwise fall back to the
                # first quote that is followed by whitespace.
                $line =~ s/\"\s*$// or $line =~ s/\"\s+//;

                $meta{$field} = clean_doc($line);
                last FIELD;
            }
        }
        close $info;
    } else { warn "cannot execute ldat" }

    my $title = $meta{title};
    if (not length $title) {
        $title = $Name ? '[' . $Name . ']' : $Default_title;
    }

    # The values come from the document (or the URL), so &, < and > must
    # be escaped before they are written into the HTML.  clean_doc has
    # already turned double quotes into single ones, which keeps the
    # CONTENT="..." attributes intact.
    $title       = HTML($title);
    my $subject  = HTML($meta{subject});
    my $keywords = HTML($meta{keywords});

    print "<HTML>\n<HEAD>\n";
    print "<TITLE>$title</TITLE>\n";
    if (length $subject) {
        print '<META NAME="DESCRIPTION" CONTENT="' . $subject . "\">\n";
    }
    if (length $keywords) {
        print '<META NAME="KEYWORDS" CONTENT="' . $keywords . "\">\n";
    }
    print "</HEAD>\n";

###print STDERR "\n$Name:\n";
###print STDERR "\tTitle:\t$title\n";
###print STDERR "\tDescription:\t$subject\n";
###print STDERR "\tKeywords:\t$keywords\n";

}

#------------------------------------------------------------------------------

sub doc_body {
# Prints the <BODY> section followed by the closing </HTML> tag.
#
# Runs "$DOCTOTEXT -F -N" on $Input and writes each non-empty line of its
# output as HTML-escaped text.  Consecutive lines are separated by <br>;
# one or more empty lines become a single <p>.  When $Dehyphenate is set,
# a word split by a hyphen at the end of a line is joined with its
# continuation on the following line.
#
# Takes no arguments; reads the file-level $DOCTOTEXT, $Input and
# $Dehyphenate.  Dies if the pipe to $DOCTOTEXT cannot be opened.

  my $bline = '';

  # The file name sits inside single quotes in a shell command, so each
  # embedded single quote has to be closed, escaped and re-opened ('\'')
  # or it would end the quoting early (POSIX shell quoting rules).
  (my $quoted_input = $Input) =~ s/'/'\\''/g;

  open(my $cat, "$DOCTOTEXT -F -N '$quoted_input' |") ||
	  die "$DOCTOTEXT doesn't want to be opened using pipe\n";
  print "<BODY>\n";
  while (my $line = <$cat>) {
    if ($Dehyphenate) {
      # Keep pulling in following lines for as long as the text ends in
      # letter + hyphen.
      while ($line =~ m/[A-Za-z\300-\377]-\s*$/) {
        my $next = <$cat>;
        # End of input: nothing to join with, so keep the hyphen.
        last unless defined $next;
        $line .= $next;
        # Join before testing again, so that a continuation on the very
        # last line of the input is handled as well.
        $line =~ s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s;
      }
    }
    # replace octal 255 (0xAD, the soft hyphen in Latin-1) with a hyphen
    $line =~ s/\255/-/g;
    # replace bell, backspace, tab. etc. with single space:
    $line =~ s/[\000-\040]+/ /g;
    $line = HTML($line);
    if (length $line) {
      print $bline, $line, "\n";
      $bline = "<br>\n";
    } else {
      # Empty line: start a new paragraph before the next printed line.
      $bline = "<p>\n";
    }
  }
  close $cat;

  print "</BODY>\n</HTML>\n";
  return;
}

#------------------------------------------------------------------------------

sub HTML {
# Returns a copy of the given text made safe for inclusion in HTML:
#  - form feeds and every other run of whitespace collapse to one space,
#  - trailing whitespace is removed (leading whitespace is kept as a
#    single space),
#  - "&", "<" and ">" are replaced by their character entities.

  my ($chunk) = @_;
  my %entity = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;');

  for ($chunk) {
    tr/\f/\n/;			# form feed counts as a line break
    s/\s+/ /g;			# squeeze whitespace runs to one space
    s/\s+$//gm;			# no trailing space
    s/([&<>])/$entity{$1}/g;	# escape the HTML special characters
    chomp;
  }

  return $chunk;
}

#------------------------------------------------------------------------------

sub clean_doc {
# Tidies one value taken from ldat output and returns the result:
#  - a trailing newline is removed,
#  - every occurrence of the byte pair \376\377 (0xFE 0xFF) is deleted,
#  - double quotes are turned into single quotes.

  my ($value) = @_;

  chomp $value;
  $value =~ tr/"/'/;
  $value =~ s/\376\377//g;

  return $value;
}
