# ParsePDF.pm
#
# Copyright Deri James 2014 GPL 3

use Compress::Zlib;

# Running page counter shared by the page walker: ParsePages resets it to 0
# and NextPage increments it once for every /Page object it visits.
my $pgno=0;

# LoadPDF($pdfnm [,$loadstrm])
#
# Read the PDF file $pdfnm and return a reference to an array indexed by
# object number.  Each defined element is a hash with these keys:
#
#   OBJ        the parsed object (hash, array, OBJREF or plain scalar)
#   OBJNO      the object number
#   STREAMPOS  [file offset, length] of the object's stream, if it has one
#   STREAM     the stream data (only when it has been loaded, see below)
#
# Element 0 holds the trailer dictionary under OBJ.  When the file has no
# 'trailer' keyword the ID/Info/Root/Size entries are copied from any
# /XRef objects instead.
#
# Streams are read into STREAM when $loadstrm is true; /ObjStm streams are
# always read, unpacked into their component objects, and then dropped.
# Streams whose /Filter is exactly '/FlateDecode' are inflated.
#
# Returns undef if the file cannot be opened.  Fatal parse problems are
# reported through Msg(1,...), which terminates the program.
sub LoadPDF
{
	my $pdfnm=shift;
	my $loadstrm=shift || 0;
	my $pdf;
	my $pdftxt='';
	my $strmlen=0;
	my $curobj=-1;
	my $instream=0;
	my $fh;

	# Three argument open on a lexical handle: characters in the file name
	# can never be taken for an open mode or a pipe.
	if (!open($fh,'<',$pdfnm))
	{
		Msg(0,"Failed to open PDF '$pdfnm'");
		return undef;
	}

	# The record separator may be switched to "\r" below.  Localise it so
	# the caller's $/ is restored when this sub returns.
	local $/="\n";

	my $hdr=<$fh>;
	my $dos=0;

	# A first "line" longer than 10 characters is taken to mean that the
	# file's lines end in "\r" (presumably the "\n" terminated read ran on
	# past the header), so read "\r" terminated records from here on.
	if (length($hdr) > 10)
	{
		$/="\r";
		$dos=1;
	}

	# Pass 1: collect the text of every object, skipping over stream data
	# but remembering where each stream lives in the file.

	while (<$fh>)
	{
		chomp;

		s/\n//;

		if (m/endstream(\s+.*)?$/)
		{
			my $rest=$1;

			$instream=0;
			$_="endstream";
			$_.=$rest if defined($rest);
		}

		next if $instream;

		# Only a direct /Length is usable for skipping the stream; an
		# indirect one ("n g R") is not known yet, so use 0.
		if (m'/Length\s+(\d+)(\s+\d+\s+R)?')
		{
			if (!defined($2))
			{
				$strmlen=$1;
			}
			else
			{
				$strmlen=0;
			}
		}

		if (m'^(\d+) \d+ obj')
		{
			$curobj=$1;
			$pdf->[$curobj]->{OBJ}=undef;
			$pdf->[$curobj]->{OBJNO}=$curobj;
		}

		# Make sure 'obj' and every unescaped '(' start a new word.
		s/ obj/ obj /;
		s/(?<!\\)\(/ (/g;

		if (m'stream\s*$' and ! m/^endstream/)
		{
			if ($curobj > -1)
			{
				# $dos adds one byte to the recorded offset when
				# records end in "\r".
				$pdf->[$curobj]->{STREAMPOS}=[tell($fh)+$dos,$strmlen];
				seek($fh,$strmlen,1);
				$instream=1;
			}
			else
			{
				Msg(1,"Parsing PDF '$pdfnm' failed");
				return undef;
			}
		}

		$pdftxt.=$_.' ';
	}

	# Pass 2: split the collected text into words and parse each object.

	my (@pdfwds)=split(' ',$pdftxt);
	push(@pdfwds,'','');	# empty words mark the end of input
	my $wd;

	while ($wd=nextwd(\@pdfwds),length($wd))
	{
		if ($wd=~m/\d+/ and defined($pdfwds[1]) and $pdfwds[1] eq 'obj')
		{
			$curobj=$wd;
			shift(@pdfwds); shift(@pdfwds);
			$pdf->[$curobj]->{OBJ}=ParsePDFObj(\@pdfwds);
		}
		elsif ($wd eq 'trailer' and !exists($pdf->[0]->{OBJ}))
		{
			$pdf->[0]->{OBJ}=ParsePDFObj(\@pdfwds);
		}
	}

	# In PDF 1.5+ there may be no trailer; the trailer entries can be held
	# in XRef objects instead.

	if (!defined($pdf->[0]))
	{
		foreach my $obj (@{$pdf})
		{
			next if !defined($obj);

			# Objects that are not dictionaries (arrays, references,
			# plain values) cannot be XRef objects; looking inside
			# them as a hash would be an error.
			next if ref($obj->{OBJ}) ne 'HASH';

			if (exists($obj->{OBJ}->{Type}) and $obj->{OBJ}->{Type} eq '/XRef')
			{
				foreach my $key ('ID','Info','Root','Size')
				{
					$pdf->[0]->{OBJ}->{$key}=$obj->{OBJ}->{$key} if exists($obj->{OBJ}->{$key});
				}
			}
		}
	}

	# Load the streams

	foreach my $obj (@{$pdf})
	{
		next if !defined($obj);
		next if !exists($obj->{STREAMPOS});

		my $isobjstm=(ref($obj->{OBJ}) eq 'HASH' and exists($obj->{OBJ}->{Type}) and $obj->{OBJ}->{Type} eq '/ObjStm');

		next if !($loadstrm or $isobjstm);

		my $l;

		$l=$obj->{OBJ}->{Length} if exists($obj->{OBJ}->{Length});

		# An indirect /Length: fetch the referenced object's value.
		$l=$pdf->[$$l]->{OBJ} if (defined($l) && ref($l) eq 'OBJREF');

		Msg(1,"Unable to determine length of stream \@$obj->{STREAMPOS}->[0]") if !defined($l);

		sysseek($fh,$obj->{STREAMPOS}->[0],0);
		Msg(0,'Failed to read all the stream') if $l != sysread($fh,$obj->{STREAM},$l);

		if (exists($obj->{OBJ}->{'Filter'}) and $obj->{OBJ}->{'Filter'} eq '/FlateDecode')
		{
			my $s=Compress::Zlib::uncompress($obj->{STREAM});

			# uncompress returns undef on failure; an empty (or "0")
			# result is a valid stream, so test definedness.
			if (defined($s))
			{
				$obj->{STREAM}=$s;
				delete($obj->{OBJ}->{'Filter'});
			}
			else
			{
				Msg(0,"Unable to decompress object $obj->{OBJNO}");
			}
		}

		next if !$isobjstm;

		# An object stream starts with N pairs "objno offset" (the
		# first $First bytes) followed by the objects themselves, each
		# offset being relative to $First.

		my $N=$obj->{OBJ}->{N};
		my $First=$obj->{OBJ}->{First};
		my $stream=$obj->{STREAM};

		my (@os)=split(' ',substr($stream,0,$First));

		for (my $j=0; $j<$#os; $j+=2)
		{
			my $ono=$os[$j];
			my $offs=$os[$j+1];
			my @wds;

			if ($j < $#os-2)
			{
				# Not the last object: it ends where the next starts.
				my $len=$os[$j+3]-$os[$j+1];
				my $s=substr($stream,$First+$offs,$len);
				(@wds)=split(' ',$s);
			}
			else
			{
				my $s=substr($stream,$First+$offs);
				(@wds)=split(' ',$s);
			}

			if (defined($pdf->[$os[$j]]))
			{
				print STDERR "Redefine object $os[$j]\n";
			}

			push(@wds,'','');
			$pdf->[$os[$j]]->{OBJ}=ParsePDFObj(\@wds);
			$pdf->[$os[$j]]->{OBJNO}=$os[$j];
		}

		# The container itself is no longer wanted.
		my $o=$obj->{OBJNO};
		$pdf->[$o]=undef;
	}

	close($fh);

	Msg(1,"Failed to parse file: no trailer") if !defined($pdf->[0]);
	return $pdf;
}


# nextwd(\@words)
#
# Remove and return the next token from the front of @words.  A word that
# contains one of the delimiters '<<', '>>', '[' or ']' is split at the
# first delimiter: the leading piece (or the delimiter itself when nothing
# precedes it) is returned and the remaining pieces are pushed back onto
# the front of @words.  Returns '' when @words is exhausted.
sub nextwd
{
	my $queue=shift;
	my $word=shift(@{$queue});

	return('') unless defined($word);

	# Words without a delimiter are returned untouched.
	return($word) unless $word=~m/^(.*?)(<<|>>|\[|\])(.*)/;

	my ($head,$delim,$tail)=($1,$2,$3);

	# Whatever follows the delimiter always goes back first ...
	unshift(@{$queue},$tail) if defined($tail) and length($tail);

	# ... then either the delimiter is the token itself ...
	return($delim) unless defined($head) and length($head);

	# ... or it is queued behind the text that preceded it.
	unshift(@{$queue},$delim);

	return($head);
}

# ParsePDFObj(\@words)
#
# Parse the body of one object from the word list.  Values are parsed one
# after another with ParsePDFValue until 'endobj', 'startxref' or the end
# of the word list is reached; the 'stream' and 'endstream' keywords are
# skipped.  Returns the last value parsed (undef if there was none).
sub ParsePDFObj
{
	my $pdfwds=shift;
	my $value;

	WORD: while (1)
	{
		my $tok=nextwd($pdfwds);

		last WORD unless length($tok);
		next WORD if $tok eq 'stream' or $tok eq 'endstream';
		last WORD if $tok eq 'endobj' or $tok eq 'startxref';

		# Not a keyword: hand the token back and let the value
		# parser consume it.
		unshift(@{$pdfwds},$tok);
		$value=ParsePDFValue($pdfwds);
	}

	return($value);
}

# ParsePDFHash(\@words)
#
# Parse the contents of a dictionary; the opening '<<' has already been
# consumed.  Reads "/Key value" pairs up to the closing '>>' (or the end of
# the word list) and returns a hash reference keyed by the names without
# their leading '/'.
#
# A key word may have its value glued on, as in '/Type/Page' or
# '/Title(text'; the glued part is pushed back onto the word list so that
# ParsePDFValue sees it.  A word that does not start with '/' is reported
# with Msg(0,...) and skipped.
sub ParsePDFHash
{
	my $pdfwds=shift;
	my $rtn={};
	my $wd;

	while ($wd=nextwd($pdfwds),length($wd))
	{
		last if $wd eq '>>';

		my (@w)=split('/',$wd,3);

		# Anything in front of the first '/' means this is not a name.
		# Test the length, not the truth: a word such as '0' is false
		# but still is not a valid key.
		if (length($w[0]))
		{
			Msg(0,"PDF Dict Key '$wd' does not start with '/'");
			next;
		}

		# '/Key/Value...': hand the glued name back.  Definedness is
		# tested rather than truth so that '/0' and the empty name '/'
		# are not silently dropped.
		unshift(@{$pdfwds},"/$w[2]") if defined($w[2]);

		# '/Key(text': hand the glued string start back.  Again test
		# definedness so that '(0' or a lone '(' (string text starting
		# with white space) is kept.
		$wd=$w[1];
		(@w)=split('\(',$wd,2);
		$wd=$w[0];
		unshift(@{$pdfwds},"($w[1]") if defined($w[1]);

		$rtn->{$wd}=ParsePDFValue($pdfwds);
	}

	return($rtn);
}

# ParsePDFValue(\@words)
#
# Parse a single value from the front of the word list and return it:
#
#   "n g R"   a reference to the object number blessed into 'OBJREF'
#   "<<"      a hash reference (via ParsePDFHash)
#   "["       an array reference (via ParsePDFArray)
#   "(...)"   the string, parentheses included (via ParsePDFString)
#   other     the word itself
#
# Text glued onto the end of the value (a following string or name, or a
# delimiter stuck to the 'R' of a reference) is pushed back onto the word
# list for the caller.
sub ParsePDFValue
{
	my $pdfwds=shift;
	my $tok=nextwd($pdfwds);

	# Indirect reference: a number, then the generation (word 0), then a
	# word starting with 'R' (word 1).
	if ($tok=~m/^\d+$/ and $pdfwds->[1]=~m/^R(\]|\>|\/)?/)
	{
		my $glued=$1;

		shift(@{$pdfwds});			# generation number

		if (defined($glued) and length($glued))
		{
			# Keep whatever was stuck to the 'R', drop the 'R'.
			$pdfwds->[0]=substr($pdfwds->[0],1);
		}
		else
		{
			shift(@{$pdfwds});		# the 'R' word itself
		}

		return(bless(\$tok,'OBJREF'));
	}

	return(ParsePDFHash($pdfwds)) if $tok eq '<<';
	return(ParsePDFArray($pdfwds)) if $tok eq '[';

	# A '(' somewhere in the word: either the word is a string, or a
	# string is glued onto the end of a shorter value.
	if ($tok=~m/(.*?)(\(.*)$/)
	{
		my ($lead,$str)=($1,$2);

		return(ParsePDFString($tok,$pdfwds)) unless defined($lead) and length($lead);

		unshift(@{$pdfwds},$str);
		$tok=$lead;
	}

	# A '/' after the first character: a name is glued onto the value.
	if ($tok=~m/(.+?)(\/.*)$/)
	{
		my ($lead,$name)=($1,$2);

		if (defined($name) and length($name))
		{
			unshift(@{$pdfwds},$name);
			$tok=$lead;
		}
	}

	return($tok);
}

# ParsePDFString($wd,\@words)
#
# Assemble a value that starts with the word $wd.  Unescaped '(' and ')'
# are counted; while more have been opened than closed, further words are
# taken from the word list and appended, separated by single spaces.  Once
# the parentheses balance, any text following the last ')' in the current
# word is pushed back onto the word list.  A word with no parentheses at
# all is returned as it stands.
sub ParsePDFString
{
	my ($piece,$pdfwds)=@_;
	my @pieces;
	my $depth=0;

	while (length($piece))
	{
		# A parenthesis counts unless it is preceded by an odd number
		# of backslashes.
		my $opened=()=$piece=~m/(?<!\\)(?:\\\\)*\(/g;
		my $closed=()=$piece=~m/(?<!\\)(?:\\\\)*\)/g;

		$depth+=$opened-$closed;

		my $balanced=($depth <= 0);

		if ($balanced and $piece=~m/^(.*\))([^)]+)$/)
		{
			my ($str,$extra)=($1,$2);

			unshift(@{$pdfwds},$extra) if defined($extra) and length($extra);
			$piece=$str;
		}

		push(@pieces,$piece);

		last if $balanced;

		$piece=nextwd($pdfwds);
	}

	return(join(' ',@pieces));
}

# ParsePDFArray(\@words)
#
# Parse the contents of an array; the opening '[' has already been
# consumed.  Elements are collected up to the closing ']' (or the end of
# the word list) and returned as an array reference.  "n g R" becomes an
# 'OBJREF' blessed scalar reference, '<<' and '[' introduce nested hashes
# and arrays, and every other word is passed through ParsePDFString.
sub ParsePDFArray
{
	my $pdfwds=shift;
	my @items;

	while (1)
	{
		my $tok=nextwd($pdfwds);

		last unless length($tok);

		# Indirect reference: number, generation, then a word starting
		# with 'R' which may have a closing delimiter stuck to it.
		if ($tok=~m/^\d+$/ and $#{$pdfwds} > 0 and $pdfwds->[1]=~m/^R(\]|\>)?/)
		{
			my $glued=$1;

			shift(@{$pdfwds});		# generation number

			if (defined($glued) and length($glued))
			{
				# Keep the delimiter, drop the 'R'.
				$pdfwds->[0]=substr($pdfwds->[0],1);
			}
			else
			{
				shift(@{$pdfwds});	# the 'R' word itself
			}

			my $objno=$tok;
			push(@items,bless(\$objno,'OBJREF'));
			next;
		}

		last if $tok eq ']';

		if ($tok eq '<<')
		{
			push(@items,ParsePDFHash($pdfwds));
			next;
		}

		if ($tok eq '[')
		{
			push(@items,ParsePDFArray($pdfwds));
			next;
		}

		# Two names run together ('/A/B'): keep the first, hand the
		# rest back.
		if ($tok=~m/^(\/.*?)(\/.*)/)
		{
			my ($first,$others)=($1,$2);

			$tok=$first;
			unshift(@{$pdfwds},$others);
		}

		push(@items,ParsePDFString($tok,$pdfwds));
	}

	return(\@items);
}

# Msg($lev,$msg)
#
# Write $msg, followed by a newline, to STDERR.  A true $lev makes the
# message fatal: the program then exits with status 1.
sub Msg
{
	my $lev=shift;
	my $msg=shift;

	print STDERR $msg."\n";

	exit(1) if $lev;
}

# ParsePages($pdf,$callback)
#
# Walk the page tree of a document loaded by LoadPDF.  The page counter is
# reset, the catalog is found through the trailer's Root reference and the
# tree is entered at the catalog's Pages reference; NextPage does the
# walking and invokes $callback.
sub ParsePages
{
    my ($pdf,$callback)=@_;

    $pgno=0;

    # Both Root and Pages are OBJREFs: dereference to get object numbers.
    my $rootref=$pdf->[0]->{OBJ}->{Root};
    my $catalog=$$rootref;
    my $pagesref=$pdf->[$catalog]->{OBJ}->{Pages};
    my $pages=$$pagesref;

    NextPage($pdf,$pages,$callback);
}

# NextPage($pdf,$pages,$callback)
#
# Visit the page tree node whose object number is $pages.
#
# A /Pages node is descended into through each of its Kids.  For a /Page
# node the page counter is incremented and $callback is invoked once per
# content stream with four arguments:
#
#   an object number, a reference to the stream data (see GetStream),
#   the running page number, the object number of the page
#
# Returns the value returned by the last recursive call, which in the
# code visible here is always undef.
sub NextPage
{
    my $pdf=shift;
    my $pages=shift;
    my $callback=shift;

    # Lexical on purpose: this used to be an undeclared package global, so
    # a value left in $ret by unrelated code could leak out as our result.
    my $ret;

    if ($pdf->[$pages]->{OBJ}->{Type} eq '/Pages')
    {
	foreach my $kid (@{$pdf->[$pages]->{OBJ}->{Kids}})
	{
	    $ret=NextPage($pdf,$$kid,$callback);
	}
    }
    elsif ($pdf->[$pages]->{OBJ}->{Type} eq '/Page')
    {
	$pgno++;
	my $stream=$pdf->[$pages]->{OBJ}->{Contents};

	if (ref($stream) eq 'ARRAY')
	{
	    # Several content streams: the first argument is the object
	    # number of each stream.
	    foreach my $s (@{$stream})
	    {
		$callback->($$s,GetStream($pdf,$s),$pgno,$pages);
	    }
	}
	else
	{
	    # NOTE(review): here the first argument is the page's object
	    # number, not the stream's as in the array case above - confirm
	    # which one callers expect before changing it.
	    $callback->($pages,GetStream($pdf,$stream),$pgno,$pages);
	}
    }

    return($ret);
}

# GetStream($pdf,$obj)
#
# $obj is a reference to an object number (an OBJREF).  Returns a reference
# to the STREAM slot of that object, so the caller can read or replace the
# stream data in place.
sub GetStream
{
    my ($pdf,$obj)=@_;
    my $objno=$$obj;

    return(\$pdf->[$objno]->{STREAM});
}

# SavePDF($pdf [,$flate])
#
# Write the document held in $pdf (as built by LoadPDF) to the currently
# selected output handle as a PDF 1.4 file: header, every defined object
# that has not been written yet (no XREF key), the cross-reference table,
# the trailer dictionary from $pdf->[0] and the startxref pointer.
#
# $flate is passed on to PutObj, which compresses streams that have no
# /Filter when it is true.
#
# All output goes through Put, so $fct always holds the number of bytes
# written so far.
#
# NOTE(review): the trailer dictionary is written as it stands; its /Size
# is not adjusted to the number of entries in the table written here -
# confirm that callers keep it up to date.
sub SavePDF
{
    my $pdf=shift;
    my $flate=shift||0;
    my $objct=$#{$pdf};

    Put("%PDF-1.4\n\x25\xe2\xe3\xcf\xd3\n");

    foreach my $o (1..$objct)
    {
	PutObj($pdf,$flate,$o) if (defined($pdf->[$o]) and !exists($pdf->[$o]->{XREF}));
    }

    # The table starts here; this is the value startxref must carry.
    my $xrefpos=$fct;

    $objct+=1;

    # Chain the free entries: object 0 is the head of the list, each free
    # entry names the next free object and the last one links back to 0.
    my @nextfree;
    my $prevfree=0;

    foreach my $o (1..$objct-1)
    {
	next if defined($pdf->[$o]);
	$nextfree[$prevfree]=$o;
	$prevfree=$o;
    }

    $nextfree[$prevfree]=0;

    my $msg="xref\n0 $objct\n".sprintf("%010d 65535 f \n",$nextfree[0]);

    foreach my $o (1..$objct-1)
    {
	if (defined($pdf->[$o]))
	{
	    $msg.=sprintf("%010d 00000 n \n",$pdf->[$o]->{XREF});
	}
	else
	{
	    $msg.=sprintf("%010d 00001 f \n",$nextfree[$o]);
	}
    }

    $msg.="trailer\n";
    PutField(\$msg,$pdf->[0]->{OBJ});
    $msg.="startxref\n$xrefpos\n\%\%EOF\n";
    Put($msg);
}

# Put($msg)
#
# Print $msg to the currently selected output handle and add its length to
# the running output count $fct, which the writer uses as the file offset
# for cross-reference entries.
sub Put
{
    my ($text)=@_;
    my $bytes=length($text);

    print $text;

    $fct+=$bytes;
}

# PutObj($pdf,$flate,$ono)
#
# Write object number $ono as "$ono 0 obj ... endobj" through Put, after
# recording the current output offset in the object's XREF key.  Nothing
# is done if the object is not defined.
#
# For an object with a STREAM: when $flate is true and the dictionary has
# no Filter, the stream is compressed and Filter is set to
# ['/FlateDecode']; Length is always set to the length of the stream as
# written.
sub PutObj
{
    my ($pdf,$flate,$ono)=@_;

    return if !defined($pdf->[$ono]);

    my $entry=$pdf->[$ono];
    my $msg="$ono 0 obj ";
    my $hasstream=exists($entry->{STREAM});

    $entry->{XREF}=$fct;

    if ($hasstream)
    {
	if ($flate)
	{
	    if (!exists($entry->{OBJ}->{'Filter'}))
	    {
		$entry->{STREAM}=Compress::Zlib::compress($entry->{STREAM});
		$entry->{OBJ}->{'Filter'}=['/FlateDecode'];
	    }
	}

	$entry->{OBJ}->{'Length'}=length($entry->{STREAM});
    }

    PutField(\$msg,$entry->{OBJ});
    PutStream(\$msg,$pdf,$ono) if $hasstream;
    Put($msg."endobj\n");
}

# PutStream(\$msg,$pdf,$ono)
#
# Append the stream of object $ono to the string referenced by the first
# argument, wrapped in the 'stream' and 'endstream' keywords.  The stream
# data is appended exactly as held in STREAM.
sub PutStream
{
    my ($msg,$pdf,$ono)=@_;
    my $data=$pdf->[$ono]->{STREAM};

    ${$msg}.="stream\n".$data."endstream\n";
}

# PutField(\$msg,$fld [,$term])
#
# Append the PDF text for the value $fld to the string referenced by the
# first argument, followed by $term (a newline unless a true $term is
# given):
#
#   plain scalar   written as it is
#   ARRAY ref      '[' elements, each followed by a space, ']'
#   HASH ref       '<< ' then "/Key value" lines in sorted key order, '>>'
#   OBJREF         "n 0 R"
#
# References of any other kind produce no output.
sub PutField
{
    my ($pmsg,$fld,$term)=@_;
    my $kind=ref($fld);

    $term="\n" if !$term;

    if ($kind eq 'HASH')
    {
	my $dict='<< ';

	foreach my $name (sort keys %{$fld})
	{
	    $dict.="/$name ";
	    PutField(\$dict,$fld->{$name});
	}

	$$pmsg.=$dict.">>$term";
    }
    elsif ($kind eq 'ARRAY')
    {
	my $list='[';

	foreach my $item (@{$fld})
	{
	    PutField(\$list,$item,' ');
	}

	$$pmsg.=$list."]$term";
    }
    elsif ($kind eq 'OBJREF')
    {
	$$pmsg.="$$fld 0 R$term";
    }
    elsif ($kind eq '')
    {
	$$pmsg.="$fld$term";
    }
}

1;