Author: tilman Date: Wed Nov 7 18:30:16 2018 New Revision: 1846064 URL: http://svn.apache.org/viewvc?rev=1846064&view=rev Log: PDFBOX-4367: run stripper by page as preparation to catch the exception in a later commit; improve usage text
Modified: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java Modified: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java?rev=1846064&r1=1846063&r2=1846064&view=diff ============================================================================== --- pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java (original) +++ pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java Wed Nov 7 18:30:16 2018 @@ -224,18 +224,30 @@ public final class ExtractText } stripper.setSortByPosition( sort ); stripper.setShouldSeparateByBeads( separateBeads ); - stripper.setStartPage( startPage ); - stripper.setEndPage( endPage ); startTime = startProcessing("Starting text extraction"); if (debug) { System.err.println("Writing to "+outputFile); } - + endPage = Math.min(endPage, document.getNumberOfPages()); + // Extract text for main document: - stripper.writeText( document, output ); - + for (int p = startPage; p <= endPage; ++p) + { + try + { + stripper.setStartPage(p); + stripper.setEndPage(p); + stripper.writeText(document, output); + } + catch (IOException ex) + { + //TODO alternatively, log and continue + throw ex; + } + } + // ... also for any embedded PDFs: PDDocumentCatalog catalog = document.getDocumentCatalog(); PDDocumentNameDictionary names = catalog.getNames(); @@ -264,7 +276,20 @@ public final class ExtractText try (InputStream fis = file.createInputStream(); PDDocument subDoc = PDDocument.load(fis)) { - stripper.writeText( subDoc, output ); + for (int p = 1; p <= subDoc.getNumberOfPages(); ++p) + { + try + { + stripper.setStartPage(p); + stripper.setEndPage(p); + stripper.writeText(subDoc, output); + } + catch (IOException ex) + { + //TODO alternatively, log and continue + throw ex; + } + } } } } @@ -307,17 +332,17 @@ public final class ExtractText { String message = "Usage: java -jar pdfbox-app-x.y.z.jar ExtractText [options] <inputfile> [output-text-file]\n" + "\nOptions:\n" - + " -password <password> : Password to decrypt document\n" - + " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n" - + " -console : Send text to console instead of file\n" - + " -html : Output in HTML format instead of raw text\n" - + " -sort : Sort the text before writing\n" - + " -ignoreBeads : Disables the separation by beads\n" - + " -debug : Enables debug output about the time consumption of every stage\n" - + " -startPage <number> : The first page to start extraction(1 based)\n" - + " -endPage <number> : The last page to extract(inclusive)\n" - + " <inputfile> : The PDF document to use\n" - + " [output-text-file] : The file to write the text to"; + + " -password <password> : Password to decrypt document\n" + + " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE, UTF-16LE, etc.\n" + + " -console : Send text to console instead of file\n" + + " -html : Output in HTML format instead of raw text\n" + + " -sort : Sort the text before writing\n" + + " -ignoreBeads : Disables the separation by beads\n" + + " -debug : Enables debug output about the time consumption of every stage\n" + + " -startPage <number> : The first page to start extraction (1 based)\n" + + " -endPage <number> : The last page to extract (1 based and inclusive)\n" + + " <inputfile> : The PDF document to use\n" + + " [output-text-file] : The file to write the text to"; System.err.println(message); System.exit( 1 );