Hi, maybe you will find this useful. This Perl script can be used by htdig as an external parser for PDF files. cu, Stefan -- Stefan Nehlsen | ParlaNet Administration | [EMAIL PROTECTED] | +49 431 988-1260
#!/usr/bin/perl --
#
# parse pdf files for htdig
#
# - generate anchor tags
# - do site specific rewriting url to title
#   for missing or bad titles
# - I suppose it is faster than parse_doc.pl
#
# based on:
# - htdig documentation
# - parse_doc.pl
# - pdftodig.py (http://po.gaillard.free.fr/pdftodig.py)
#
# Stefan Nehlsen [EMAIL PROTECTED]
#
# Usage: <script> infile content-type url config-file
#
# Only infile and url are used; content-type and config-file are accepted
# but ignored.
#
# Records written to STDOUT (tab separated):
#   t <title>                  document title (HTML-escaped)
#   h <text>                   whole text, whitespace collapsed (HTML-escaped)
#   a page=<n>                 anchor emitted at every form feed (page break)
#   w <word> <location> 0      one per word of three or more letters;
#                              location is the word position scaled to 0..1000,
#                              the last field is always 0
#                              (NOTE(review): presumably htdig's heading
#                              level -- confirm against the htdig docs)

use strict;
use warnings;

# external tools from the xpdf package
my $parser = "/usr/bin/pdftotext";
my $info   = "/usr/bin/pdfinfo";

my ($infile, $content_type, $url, $config) = @ARGV;
die "usage: $0 infile content-type url config-file\n" unless defined $infile;
$url = '' unless defined $url;

# paranoid
die "pdfinfo \"$info\" not executable!\n"  unless -x $info;
die "parser \"$parser\" not executable!\n" unless -x $parser;
die "\"$infile\" not readable\n"           unless -f $infile;

# Only the leading bytes are needed to recognise the "%PDF-x.y" signature;
# reading a whole "line" of a binary file could pull in far more data.
open my $pdf_fh, '<', $infile or die "opening $infile failed\n";
binmode $pdf_fh;
my $magic = '';
read $pdf_fh, $magic, 16;
close $pdf_fh;
$magic = '' unless defined $magic;
die "\"$infile\" is not a PDF-File!\n" unless $magic =~ /^%PDF-\d\.\d/;

# everything seems to be ok

# use pdfinfo to retrieve meta information
my $title;
if (my $info_fh = open_quiet_pipe($info, $infile)) {
    while (my $line = <$info_fh>) {
        chomp $line;
        next unless $line =~ s/^Title:\s*//;

        $line =~ s/\s+$//;
        $line =~ s/\s+/ /g;
        $line =~ s/[\376\377]//g;    # delete unicode (?) marker

        # if title is a filename we better use the real filename
        my $is_useless = $line =~ /\.pdf$|Microsoft\s+Word\s+-/i
            || (length($line) > 16 && $line =~ /\.\.\.$/);
        $title = $line unless $is_useless;
        last;
    }
    close $info_fh;    # may report SIGPIPE after the early "last"; ignored
}
else {
    warn "$info \"$infile\" failed\n";
}

# At this point I do some site-specific rewriting of the title
# based on structured urls and/or an external database.

# read text from pdftotext
my $text_fh = open_quiet_pipe($parser, '-raw', '-q', $infile, '-')
    or die "error opening pdf \"$infile\"\n";
my $text = do { local $/; <$text_fh> };    # read whole output, $/ restored after
close $text_fh;
$text = '' unless defined $text;

# the point of no return

# Fall back to a title derived from the file name part of the url.
unless (defined $title and length $title) {
    ($title = $url) =~ s#^.*/(.*?\.pdf$)#PDF Dokument $1#i;
}
print "t\t", escape_html($title), "\n";

$text =~ s/^[\s\n\f]*//s;
$text =~ s/[\s\n\f]*$//s;
$text =~ s/-\s*\n+\s*([a-z\340-\377])/$1/gs;    # dehyphen

(my $header = $text) =~ s/[\s\n\f]+/ /gs;
print "h\t", escape_html($header), "\n" if length $header;

# Split into words and page breaks.  A form feed is always its own token,
# even when it is directly followed by text, so that page anchors are emitted
# and no word ever contains a form feed.
my @tokens = grep { $_ eq "\f" or length($_) >= 3 }
    $text =~ /\f|[A-Za-z\300-\377]+/g;

my $n    = 0;
my $page = 2;    # the first form feed starts page 2
my $k    = @tokens ? 1000 / @tokens : 0;    # scale positions to 0..1000
foreach my $token (@tokens) {
    if ($token eq "\f") {
        printf "a\tpage=%d\n", $page++;
    }
    else {
        printf "w\t%s\t%d\t0\n", $token, $n++ * $k;
    }
}

# Replace the HTML special characters &, < and > by their entities.
# Takes one string and returns the escaped copy.
sub escape_html {
    my ($string) = @_;
    $string =~ s/&/&amp;/g;    # must come first so entities are not re-escaped
    $string =~ s/</&lt;/g;
    $string =~ s/>/&gt;/g;
    return $string;
}

# Run a command (program followed by its arguments) without a shell and
# return a filehandle reading its STDOUT; the command's STDERR is discarded.
# Returns nothing if the fork fails.
sub open_quiet_pipe {
    my @command = @_;

    my $pid = open(my $fh, '-|');
    return unless defined $pid;

    if ($pid == 0) {
        # child: silence diagnostics (best effort), then become the command
        open(STDERR, '>', '/dev/null');
        exec { $command[0] } @command;
        exit 127;    # exec failed
    }
    return $fh;
}

