Hi guys,
I am trying to extract text from a PDF document. Basic extraction is
not too complicated and works well, but I have not managed to extract the
text of a single section. My program can already print the bookmarks, for
example:
Sample
1. Introduction
1.1 IntroductionPart1
1.2 IntroductionPart2
2.Section 1
2.1 Section1Part1
2.2Section1Part2
What I would like to do is create a .txt file for each section, containing
the corresponding text.
Can somebody help me with my code or give me an example?
This is my code:
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.PDFTextStripperByArea;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
public class PrintBookmarks{
public static void main( String[] args ) throws Exception
{
PDDocument document = PDDocument.load("C:/sample.pdf");
PrintBookmarks meta = new PrintBookmarks();
PDDocumentOutline outline =
document.getDocumentCatalog().getDocumentOutline();
meta.printBookmark( outline, "" );
}
public void printBookmark( PDOutlineNode bookmark, String indentation
) throws IOException
{
PDOutlineItem current = bookmark.getFirstChild();
while( current != null )
{
System.out.println( indentation + current.getTitle() );
printBookmark( current, indentation + " " );
PDDocument doc = PDDocument.load("C:/sample.pdf");
File output = new File("C:/"+current.getTitle()+".txt");
BufferedWriter wr = new BufferedWriter(new
OutputStreamWriter(new FileOutputStream(output)));
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setStartBookmark(current);
stripper.setEndBookmark(current.getNextSibling());
stripper.;
current = current.getNextSibling();
}
}
}