#!/usr/bin/perl -w
use strict;
#
# Version 1.0	25-May-2001
# Written by David Adams <d.j.adams@soton.ac.uk>
#
# Uses the wvHtml utility (from the wv package)
# to read a Microsoft Word file and produce HTML output.
#  
# Can be called directly from htdig as an external converter,
#  or may be called by doc2html.pl converter script. 
#

####--- Configuration ---####
# Full path of the wvHtml program
# (wvHtml is the converter this script runs on the Word file):

#### YOU MUST SET THESE  ####

my $wv = "/usr/bin/wvHtml";
#
#

#open (TEST, ">/tmp/test.txt");
#print TEST "test";
#close TEST;

my $tmpfile = "wvtext.html";
my $tmpdir = "/tmp";
if (! -x $wv) { die "Unable to execute wv" }

my $Input = $ARGV[0] || die "Usage: word2html.pl filename [mime-type] [URL]";
my $MIME_type = $ARGV[1] || '';
if ($MIME_type and ($MIME_type !~ m#^application/msword#i)) {
  die "MIME/type $MIME_type wrong";
}

my $Name = $ARGV[2] || '';
$Name =~ s#^.*/##;
$Name =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie;

&wv2html;
&output_html;
exit;

sub wv2html {
`$wv --targetdir=$tmpdir $Input $tmpfile >2 /dev/null`;
return;
}

sub output_html {
open (HTML, '<' . $tmpdir. '/'. $tmpfile) or die "could not open file";
print <HTML>;
close HTML;
return;
}


sub clean_pdf {
# removes odd pair of characters that may be in pdfinfo output
# Any double quotes are replaced with single

  my $text = shift;
  chomp $text;
  $text =~  s/\376\377//g;
  $text =~  s/\"/\'/g;
  return $text;
}