ExtractText.java

lehmi Sun, 27 Mar 2011 05:32:32 -0700

Author: lehmi
Date: Sun Mar 27 12:32:06 2011
New Revision: 1085923

URL: http://svn.apache.org/viewvc?rev=1085923&view=rev
Log:
PDFBOX-990: added debug output reporting the time consumed by each stage of 
text extraction


Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java?rev=1085923&r1=1085922&r2=1085923&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java Sun Mar 27 12:32:06 2011
@@ -46,9 +46,15 @@ public class ExtractText
     private static final String END_PAGE = "-endPage";
     private static final String SORT = "-sort";
     private static final String IGNORE_BEADS = "-ignoreBeads";
+    private static final String DEBUG = "-debug";
     private static final String HTML = "-html";  // jjb - added simple HTML output
     private static final String FORCE = "-force"; //enables pdfbox to skip corrupt objects
 
+    /*
+     * debug flag
+     */
+    private boolean debug = false;
+
     /**
      * private constructor.
     */
@@ -66,6 +72,12 @@ public class ExtractText
      */
     public static void main( String[] args ) throws Exception
     {
+        ExtractText extractor = new ExtractText();
+        extractor.startExtraction(args);
+    }
+
+    public void startExtraction( String[] args ) throws Exception
+    {
         boolean toConsole = false;
         boolean toHTML = false;
         boolean force = false;
@@ -121,6 +133,10 @@ public class ExtractText
             {
                 separateBeads = false;
             }
+            else if( args[i].equals( DEBUG ) )
+            {
+                debug = true;
+            }
             else if( args[i].equals( END_PAGE ) )
             {
                 i++;
@@ -162,6 +178,7 @@ public class ExtractText
             PDDocument document = null;
             try
             {
+                long startTime = startProcessing("Loading PDF "+pdfFile);
                 try
                 {
                     //basically try to load it from a url first and if the URL
@@ -182,8 +199,8 @@ public class ExtractText
                         outputFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ext;
                     }
                 }
+                stopProcessing("Time for loading: ", startTime);
 
-                //document.print();
                 if( document.isEncrypted() )
                 {
                     StandardDecryptionMaterial sdm = new StandardDecryptionMaterial( password );
@@ -234,7 +251,10 @@ public class ExtractText
                 stripper.setShouldSeparateByBeads( separateBeads );
                 stripper.setStartPage( startPage );
                 stripper.setEndPage( endPage );
+
+                startTime = startProcessing("Starting text extraction");
                 stripper.writeText( document, output );
+                stopProcessing("Time for extraction: ", startTime);
             }
             finally
             {
@@ -250,6 +270,23 @@ public class ExtractText
         }
     }
 
+    private long startProcessing(String message) {
+        if (debug) 
+        {
+            System.err.println(message);
+        }
+        return System.currentTimeMillis();
+    }
+    
+    private void stopProcessing(String message, long startTime) {
+        if (debug)
+        {
+            long stopTime = System.currentTimeMillis();
+            float elapsedTime = ((float)(stopTime - startTime))/1000;
+            System.err.println(message + elapsedTime + " seconds");
+        }
+    }
+
     /**
      * This will print the usage requirements and exit.
      */
@@ -263,6 +300,7 @@ public class ExtractText
             "  -sort                        Sort the text before writing\n" +
             "  -ignoreBeads                 Disables the separation by beads\n" +
             "  -force                       Enables pdfbox to ignore corrupt objects\n" +
+            "  -debug                       Enables debug output about the time consumption of every stage\n" +
             "  -startPage <number>          The first page to start extraction(1 based)\n" +
             "  -endPage <number>            The last page to extract(inclusive)\n" +
             "  <PDF file>                   The PDF document to use\n" +

svn commit: r1085923 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java

Reply via email to