a...

tallison Fri, 29 May 2015 07:37:22 -0700

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 Fri May 29 14:36:21 2015
@@ -16,16 +16,15 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
@@ -63,6 +62,10 @@ import org.xml.sax.SAXException;
 import org.xml.sax.XMLReader;
 
 public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
+    /**
+     * Allows access to headers/footers from raw xml strings
+     */
+    private static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
     private final XSSFEventBasedExcelExtractor extractor;
     private final DataFormatter formatter;
     private final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
@@ -75,11 +78,11 @@ public class XSSFExcelExtractorDecorator
         this.extractor = extractor;
         extractor.setFormulasNotResults(false);
         extractor.setLocale(locale);
-        
-        if(locale == null) {
-           formatter = new DataFormatter();
-        } else  {
-           formatter = new DataFormatter(locale);
+
+        if (locale == null) {
+            formatter = new DataFormatter();
+        } else {
+            formatter = new DataFormatter(locale);
         }
     }
 
@@ -88,10 +91,10 @@ public class XSSFExcelExtractorDecorator
             ContentHandler handler, Metadata metadata, ParseContext context)
             throws SAXException, XmlException, IOException, TikaException {
 
-       this.metadata = metadata;
-       metadata.set(TikaMetadataKeys.PROTECTED, "false");
+        this.metadata = metadata;
+        metadata.set(TikaMetadataKeys.PROTECTED, "false");
 
-       super.getXHTML(handler, metadata, context);
+        super.getXHTML(handler, metadata, context);
     }
 
     /**
@@ -100,277 +103,293 @@ public class XSSFExcelExtractorDecorator
     @Override
     protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
             XmlException, IOException {
-       OPCPackage container = extractor.getPackage();
-       
-       ReadOnlySharedStringsTable strings;
-       XSSFReader.SheetIterator iter;
-       XSSFReader xssfReader;
-       StylesTable styles;
-       try {
-          xssfReader = new XSSFReader(container);
-          styles = xssfReader.getStylesTable();
-          iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
-          strings = new ReadOnlySharedStringsTable(container);
-       } catch(InvalidFormatException e) {
-          throw new XmlException(e);
-       } catch (OpenXML4JException oe) {
-          throw new XmlException(oe);
-       }
-
-       while (iter.hasNext()) {
-           InputStream stream = iter.next();
-           sheetParts.add(iter.getSheetPart());
-           
-           SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml);
-           CommentsTable comments = iter.getSheetComments();
-
-           // Start, and output the sheet name
-           xhtml.startElement("div");
-           xhtml.element("h1", iter.getSheetName());
-           
-           // Extract the main sheet contents
-           xhtml.startElement("table");
-           xhtml.startElement("tbody");
-           
-           processSheet(sheetExtractor, comments, styles, strings, stream);
-
-           xhtml.endElement("tbody");
-           xhtml.endElement("table");
-           
-           // Output any headers and footers
-           // (Need to process the sheet to get them, so we can't
-           //  do the headers before the contents)
-           for(String header : sheetExtractor.headers) {
-              extractHeaderFooter(header, xhtml);
-           }
-           for(String footer : sheetExtractor.footers) {
-              extractHeaderFooter(footer, xhtml);
-           }
-           processShapes(iter.getShapes(), xhtml);
-           // All done with this sheet
-           xhtml.endElement("div");
-       }
+        OPCPackage container = extractor.getPackage();
+
+        ReadOnlySharedStringsTable strings;
+        XSSFReader.SheetIterator iter;
+        XSSFReader xssfReader;
+        StylesTable styles;
+        try {
+            xssfReader = new XSSFReader(container);
+            styles = xssfReader.getStylesTable();
+            iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
+            strings = new ReadOnlySharedStringsTable(container);
+        } catch (InvalidFormatException e) {
+            throw new XmlException(e);
+        } catch (OpenXML4JException oe) {
+            throw new XmlException(oe);
+        }
+
+        while (iter.hasNext()) {
+            InputStream stream = iter.next();
+            sheetParts.add(iter.getSheetPart());
+
+            SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml);
+            CommentsTable comments = iter.getSheetComments();
+
+            // Start, and output the sheet name
+            xhtml.startElement("div");
+            xhtml.element("h1", iter.getSheetName());
+
+            // Extract the main sheet contents
+            xhtml.startElement("table");
+            xhtml.startElement("tbody");
+
+            processSheet(sheetExtractor, comments, styles, strings, stream);
+
+            xhtml.endElement("tbody");
+            xhtml.endElement("table");
+
+            // Output any headers and footers
+            // (Need to process the sheet to get them, so we can't
+            //  do the headers before the contents)
+            for (String header : sheetExtractor.headers) {
+                extractHeaderFooter(header, xhtml);
+            }
+            for (String footer : sheetExtractor.footers) {
+                extractHeaderFooter(footer, xhtml);
+            }
+            processShapes(iter.getShapes(), xhtml);
+            // All done with this sheet
+            xhtml.endElement("div");
+        }
     }
 
     private void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
             throws SAXException {
         String content = ExcelExtractor._extractHeaderFooter(
-              new HeaderFooterFromString(hf));
+                new HeaderFooterFromString(hf));
         if (content.length() > 0) {
             xhtml.element("p", content);
         }
     }
-    
+
     private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
-       if (shapes == null){
-           return;
-       }
-       for (XSSFShape shape : shapes){
-           if (shape instanceof XSSFSimpleShape){
-               String sText = ((XSSFSimpleShape)shape).getText();
-               if (sText != null && sText.length() > 0){
-                   xhtml.element("p", sText);
-               }
-           }
-       }
-   }
-    
+        if (shapes == null) {
+            return;
+        }
+        for (XSSFShape shape : shapes) {
+            if (shape instanceof XSSFSimpleShape) {
+                String sText = ((XSSFSimpleShape) shape).getText();
+                if (sText != null && sText.length() > 0) {
+                    xhtml.element("p", sText);
+                }
+            }
+        }
+    }
+
     public void processSheet(
-          SheetContentsHandler sheetContentsExtractor,
-          CommentsTable comments,
-          StylesTable styles,
-          ReadOnlySharedStringsTable strings,
-          InputStream sheetInputStream)
-          throws IOException, SAXException {
-      InputSource sheetSource = new InputSource(sheetInputStream);
-      SAXParserFactory saxFactory = SAXParserFactory.newInstance();
-      try {
-         SAXParser saxParser = saxFactory.newSAXParser();
-         XMLReader sheetParser = saxParser.getXMLReader();
-         XSSFSheetInterestingPartsCapturer handler =  
-            new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler(
-               styles, comments, strings, sheetContentsExtractor, formatter, false));
-         sheetParser.setContentHandler(handler);
-         sheetParser.parse(sheetSource);
-         sheetInputStream.close();
-         
-         if (handler.hasProtection) {
-            metadata.set(TikaMetadataKeys.PROTECTED, "true");
-        }
-      } catch(ParserConfigurationException e) {
-         throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
-      }
+            SheetContentsHandler sheetContentsExtractor,
+            CommentsTable comments,
+            StylesTable styles,
+            ReadOnlySharedStringsTable strings,
+            InputStream sheetInputStream)
+            throws IOException, SAXException {
+        InputSource sheetSource = new InputSource(sheetInputStream);
+        SAXParserFactory saxFactory = SAXParserFactory.newInstance();
+        try {
+            SAXParser saxParser = saxFactory.newSAXParser();
+            XMLReader sheetParser = saxParser.getXMLReader();
+            XSSFSheetInterestingPartsCapturer handler =
+                    new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler(
+                            styles, comments, strings, sheetContentsExtractor, formatter, false));
+            sheetParser.setContentHandler(handler);
+            sheetParser.parse(sheetSource);
+            sheetInputStream.close();
+
+            if (handler.hasProtection) {
+                metadata.set(TikaMetadataKeys.PROTECTED, "true");
+            }
+        } catch (ParserConfigurationException e) {
+            throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
+        }
     }
-     
+
     /**
-     * Turns formatted sheet events into HTML
+     * In Excel files, sheets have things embedded in them,
+     * and sheet drawings which have the images
      */
-    protected static class SheetTextAsHTML implements SheetContentsHandler {
-       private XHTMLContentHandler xhtml;
-       private List<String> headers;
-       private List<String> footers;
-       
-       protected SheetTextAsHTML(XHTMLContentHandler xhtml) {
-          this.xhtml = xhtml;
-          headers = new ArrayList<String>();
-          footers = new ArrayList<String>();
-       }
-       
-       public void startRow(int rowNum) {
-          try {
-             xhtml.startElement("tr");
-          } catch(SAXException e) {}
-       }
-       
-       public void endRow(int rowNum) {
-          try {
-             xhtml.endElement("tr");
-          } catch(SAXException e) {}
-       }
-
-       public void cell(String cellRef, String formattedValue, XSSFComment comment) {
-          try {
-             xhtml.startElement("td");
-
-             // Main cell contents
-             if (formattedValue != null) {
-                 xhtml.characters(formattedValue);
-             }
-
-             // Comments
-             if(comment != null) {
-                xhtml.startElement("br");
-                xhtml.endElement("br");
-                xhtml.characters(comment.getAuthor());
-                xhtml.characters(": ");
-                xhtml.characters(comment.getString().getString());
-             }
-
-             xhtml.endElement("td");
-          } catch(SAXException e) {}
-       }
-       
-       public void headerFooter(String text, boolean isHeader, String tagName) {
-          if(isHeader) {
-             headers.add(text);
-          } else {
-             footers.add(text);
-          }
-       }
+    @Override
+    protected List<PackagePart> getMainDocumentParts() throws TikaException {
+        List<PackagePart> parts = new ArrayList<PackagePart>();
+        for (PackagePart part : sheetParts) {
+            // Add the sheet
+            parts.add(part);
+
+            // If it has drawings, return those too
+            try {
+                for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
+                    if (rel.getTargetMode() == TargetMode.INTERNAL) {
+                        PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
+                        parts.add(rel.getPackage().getPart(relName));
+                    }
+                }
+                for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) {
+                    if (rel.getTargetMode() == TargetMode.INTERNAL) {
+                        PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
+                        parts.add(rel.getPackage().getPart(relName));
+                    }
+                }
+            } catch (InvalidFormatException e) {
+                throw new TikaException("Broken OOXML file", e);
+            }
+        }
+
+        return parts;
     }
-    
+
     /**
-     * Allows access to headers/footers from raw xml strings
+     * Turns formatted sheet events into HTML
      */
-    private static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
+    protected static class SheetTextAsHTML implements SheetContentsHandler {
+        private XHTMLContentHandler xhtml;
+        private List<String> headers;
+        private List<String> footers;
+
+        protected SheetTextAsHTML(XHTMLContentHandler xhtml) {
+            this.xhtml = xhtml;
+            headers = new ArrayList<String>();
+            footers = new ArrayList<String>();
+        }
+
+        public void startRow(int rowNum) {
+            try {
+                xhtml.startElement("tr");
+            } catch (SAXException e) {
+            }
+        }
+
+        public void endRow(int rowNum) {
+            try {
+                xhtml.endElement("tr");
+            } catch (SAXException e) {
+            }
+        }
+
+        public void cell(String cellRef, String formattedValue, XSSFComment comment) {
+            try {
+                xhtml.startElement("td");
+
+                // Main cell contents
+                if (formattedValue != null) {
+                    xhtml.characters(formattedValue);
+                }
+
+                // Comments
+                if (comment != null) {
+                    xhtml.startElement("br");
+                    xhtml.endElement("br");
+                    xhtml.characters(comment.getAuthor());
+                    xhtml.characters(": ");
+                    xhtml.characters(comment.getString().getString());
+                }
+
+                xhtml.endElement("td");
+            } catch (SAXException e) {
+            }
+        }
+
+        public void headerFooter(String text, boolean isHeader, String tagName) {
+            if (isHeader) {
+                headers.add(text);
+            } else {
+                footers.add(text);
+            }
+        }
+    }
+
     protected static class HeaderFooterFromString implements HeaderFooter {
-      private String text;
-      protected HeaderFooterFromString(String text) {
-         this.text = text;
-      }
-
-      public String getCenter() {
-         return hfHelper.getCenterSection(text);
-      }
-      public String getLeft() {
-         return hfHelper.getLeftSection(text);
-      }
-      public String getRight() {
-         return hfHelper.getRightSection(text);
-      }
-
-      public void setCenter(String paramString) {}
-      public void setLeft(String paramString) {}
-      public void setRight(String paramString) {}
+        private String text;
+
+        protected HeaderFooterFromString(String text) {
+            this.text = text;
+        }
+
+        public String getCenter() {
+            return hfHelper.getCenterSection(text);
+        }
+
+        public void setCenter(String paramString) {
+        }
+
+        public String getLeft() {
+            return hfHelper.getLeftSection(text);
+        }
+
+        public void setLeft(String paramString) {
+        }
+
+        public String getRight() {
+            return hfHelper.getRightSection(text);
+        }
+
+        public void setRight(String paramString) {
+        }
     }
-    
+
     /**
      * Captures information on interesting tags, whilst
-     *  delegating the main work to the formatting handler
+     * delegating the main work to the formatting handler
      */
     protected static class XSSFSheetInterestingPartsCapturer implements ContentHandler {
-      private ContentHandler delegate;
-      private boolean hasProtection = false;
-      
-      protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
-         this.delegate = delegate;
-      }
-      
-      public void startElement(String uri, String localName, String qName,
-            Attributes atts) throws SAXException {
-         if("sheetProtection".equals(qName)) {
-            hasProtection = true;
-         }
-         delegate.startElement(uri, localName, qName, atts);
-      }
+        private ContentHandler delegate;
+        private boolean hasProtection = false;
 
-      public void characters(char[] ch, int start, int length)
-            throws SAXException {
-         delegate.characters(ch, start, length);
-      }
-      public void endDocument() throws SAXException {
-         delegate.endDocument();
-      }
-      public void endElement(String uri, String localName, String qName)
-            throws SAXException {
-         delegate.endElement(uri, localName, qName);
-      }
-      public void endPrefixMapping(String prefix) throws SAXException {
-         delegate.endPrefixMapping(prefix);
-      }
-      public void ignorableWhitespace(char[] ch, int start, int length)
-            throws SAXException {
-         delegate.ignorableWhitespace(ch, start, length);
-      }
-      public void processingInstruction(String target, String data)
-            throws SAXException {
-         delegate.processingInstruction(target, data);
-      }
-      public void setDocumentLocator(Locator locator) {
-         delegate.setDocumentLocator(locator);
-      }
-      public void skippedEntity(String name) throws SAXException {
-         delegate.skippedEntity(name);
-      }
-      public void startDocument() throws SAXException {
-         delegate.startDocument();
-      }
-      public void startPrefixMapping(String prefix, String uri)
-            throws SAXException {
-         delegate.startPrefixMapping(prefix, uri);
-      }
-    }
-    
-    /**
-     * In Excel files, sheets have things embedded in them,
-     *  and sheet drawings which have the images
-     */
-    @Override
-    protected List<PackagePart> getMainDocumentParts() throws TikaException {
-       List<PackagePart> parts = new ArrayList<PackagePart>();
-       for(PackagePart part : sheetParts) {
-          // Add the sheet
-          parts.add(part);
-          
-          // If it has drawings, return those too
-          try {
-             for(PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
-                if(rel.getTargetMode() == TargetMode.INTERNAL) {
-                   PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
-                   parts.add( rel.getPackage().getPart(relName) );
-                }
-             }
-             for(PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) {
-                if(rel.getTargetMode() == TargetMode.INTERNAL) {
-                   PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
-                   parts.add( rel.getPackage().getPart(relName) );
-                }
-             }
-          } catch(InvalidFormatException e) {
-             throw new TikaException("Broken OOXML file", e);
-          }
-       }
+        protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
+            this.delegate = delegate;
+        }
 
-       return parts;
+        public void startElement(String uri, String localName, String qName,
+                                 Attributes atts) throws SAXException {
+            if ("sheetProtection".equals(qName)) {
+                hasProtection = true;
+            }
+            delegate.startElement(uri, localName, qName, atts);
+        }
+
+        public void characters(char[] ch, int start, int length)
+                throws SAXException {
+            delegate.characters(ch, start, length);
+        }
+
+        public void endDocument() throws SAXException {
+            delegate.endDocument();
+        }
+
+        public void endElement(String uri, String localName, String qName)
+                throws SAXException {
+            delegate.endElement(uri, localName, qName);
+        }
+
+        public void endPrefixMapping(String prefix) throws SAXException {
+            delegate.endPrefixMapping(prefix);
+        }
+
+        public void ignorableWhitespace(char[] ch, int start, int length)
+                throws SAXException {
+            delegate.ignorableWhitespace(ch, start, length);
+        }
+
+        public void processingInstruction(String target, String data)
+                throws SAXException {
+            delegate.processingInstruction(target, data);
+        }
+
+        public void setDocumentLocator(Locator locator) {
+            delegate.setDocumentLocator(locator);
+        }
+
+        public void skippedEntity(String name) throws SAXException {
+            delegate.skippedEntity(name);
+        }
+
+        public void startDocument() throws SAXException {
+            delegate.startDocument();
+        }
+
+        public void startPrefixMapping(String prefix, String uri)
+                throws SAXException {
+            delegate.startPrefixMapping(prefix, uri);
+        }
     }
 }


Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
 Fri May 29 14:36:21 2015
@@ -30,13 +30,8 @@ import org.openxmlformats.schemas.wordpr
 
 public class XWPFListManager extends AbstractListManager {
     private final static boolean OVERRIDE_AVAILABLE;
-    private final static String SKIP_FORMAT = Character.toString((char)61623);//if this shows up as the lvlText, don't show a number
+    private final static String SKIP_FORMAT = Character.toString((char) 61623);//if this shows up as the lvlText, don't show a number
 
-    private final XWPFNumbering numbering;
-    //map of numId (which paragraph series is this a member of?), levelcounts
-    public XWPFListManager(XWPFDocument document) {
-        numbering = document.getNumbering();
-    }
     static {
         boolean b = false;
         try {
@@ -47,6 +42,14 @@ public class XWPFListManager extends Abs
         b = OVERRIDE_AVAILABLE = false;
 
     }
+
+    private final XWPFNumbering numbering;
+
+    //map of numId (which paragraph series is this a member of?), levelcounts
+    public XWPFListManager(XWPFDocument document) {
+        numbering = document.getNumbering();
+    }
+
     public String getFormattedNumber(final XWPFParagraph paragraph) {
         int currNumId = paragraph.getNumID().intValue();
         CTNum ctNum = numbering.getNum(paragraph.getNumID()).getCTNum();
@@ -74,7 +77,8 @@ public class XWPFListManager extends Abs
      * WARNING: currently always returns null.
      * TODO: Once CTNumLvl is available to Tika,
      * we can turn this back on.
-     * @param ctNum number on which to build the overrides
+     *
+     * @param ctNum  number on which to build the overrides
      * @param length length of intended array
      * @return null or an array of override tuples of length {@param length}
      */
@@ -121,7 +125,7 @@ public class XWPFListManager extends Abs
         boolean isLegal = false;
         int start = 1;
         int restart = -1;
-        String lvlText = "%"+level+".";
+        String lvlText = "%" + level + ".";
         String numFmt = "decimal";
 
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
 Fri May 29 14:36:21 2015
@@ -63,7 +63,7 @@ import org.xml.sax.helpers.AttributesImp
 public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
 
     // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3
-    private static final String LIST_DELIMITER = " "; 
+    private static final String LIST_DELIMITER = " ";
 
 
     private XWPFDocument document;
@@ -71,7 +71,7 @@ public class XWPFWordExtractorDecorator
 
     public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) {
         super(context, extractor);
-        
+
         document = (XWPFDocument) extractor.getDocument();
         styles = document.getStyles();
     }
@@ -85,7 +85,7 @@ public class XWPFWordExtractorDecorator
         XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
         XWPFListManager listManager = new XWPFListManager(document);
         // headers
-        if (hfPolicy!=null) {
+        if (hfPolicy != null) {
             extractHeaders(xhtml, hfPolicy, listManager);
         }
 
@@ -93,164 +93,164 @@ public class XWPFWordExtractorDecorator
         extractIBodyText(document, listManager, xhtml);
 
         // then all document tables
-        if (hfPolicy!=null) {
+        if (hfPolicy != null) {
             extractFooters(xhtml, hfPolicy, listManager);
         }
     }
 
     private void extractIBodyText(IBody bodyElement, XWPFListManager 
listManager,
-            XHTMLContentHandler xhtml)
+                                  XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
-       for(IBodyElement element : bodyElement.getBodyElements()) {
-          if(element instanceof XWPFParagraph) {
-             XWPFParagraph paragraph = (XWPFParagraph)element;
-             extractParagraph(paragraph, listManager, xhtml);
-          }
-          if(element instanceof XWPFTable) {
-             XWPFTable table = (XWPFTable)element;
-             extractTable(table, listManager, xhtml);
-          }
-          if (element instanceof XWPFSDT){
-             extractSDT((XWPFSDT) element, xhtml);
-          }
-
-      }
-    }
-    
-    private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml) throws 
SAXException, 
-    XmlException, IOException {
-       ISDTContent content = element.getContent();
-       String tag = "p";
-       xhtml.startElement(tag);
-       xhtml.characters(content.getText());
-       xhtml.endElement(tag);
+        for (IBodyElement element : bodyElement.getBodyElements()) {
+            if (element instanceof XWPFParagraph) {
+                XWPFParagraph paragraph = (XWPFParagraph) element;
+                extractParagraph(paragraph, listManager, xhtml);
+            }
+            if (element instanceof XWPFTable) {
+                XWPFTable table = (XWPFTable) element;
+                extractTable(table, listManager, xhtml);
+            }
+            if (element instanceof XWPFSDT) {
+                extractSDT((XWPFSDT) element, xhtml);
+            }
+
+        }
     }
-    
+
+    private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml) throws 
SAXException,
+            XmlException, IOException {
+        ISDTContent content = element.getContent();
+        String tag = "p";
+        xhtml.startElement(tag);
+        xhtml.characters(content.getText());
+        xhtml.endElement(tag);
+    }
+
     private void extractParagraph(XWPFParagraph paragraph, XWPFListManager 
listManager,
-            XHTMLContentHandler xhtml)
+                                  XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
-       // If this paragraph is actually a whole new section, then
-       //  it could have its own headers and footers
-       // Check and handle if so
-       XWPFHeaderFooterPolicy headerFooterPolicy = null;
-       if (paragraph.getCTP().getPPr() != null) {
-           CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
-           if(ctSectPr != null) {
-              headerFooterPolicy =
-                  new XWPFHeaderFooterPolicy(document, ctSectPr);
-              extractHeaders(xhtml, headerFooterPolicy, listManager);
-           }
-       }
-       
-       // Is this a paragraph, or a heading?
-       String tag = "p";
-       String styleClass = null;
-       if(paragraph.getStyleID() != null) {
-          XWPFStyle style = styles.getStyle(
-                paragraph.getStyleID()
-          );
-
-          if (style != null && style.getName() != null) {
-             TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
-                   style.getName(), paragraph.getPartType() == 
BodyType.TABLECELL
-             );
-             tag = tas.getTag();
-             styleClass = tas.getStyleClass();
-          }
-       }
-       
-       if(styleClass == null) {
-          xhtml.startElement(tag);
-       } else {
-          xhtml.startElement(tag, "class", styleClass);
-       }
+        // If this paragraph is actually a whole new section, then
+        //  it could have its own headers and footers
+        // Check and handle if so
+        XWPFHeaderFooterPolicy headerFooterPolicy = null;
+        if (paragraph.getCTP().getPPr() != null) {
+            CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
+            if (ctSectPr != null) {
+                headerFooterPolicy =
+                        new XWPFHeaderFooterPolicy(document, ctSectPr);
+                extractHeaders(xhtml, headerFooterPolicy, listManager);
+            }
+        }
+
+        // Is this a paragraph, or a heading?
+        String tag = "p";
+        String styleClass = null;
+        if (paragraph.getStyleID() != null) {
+            XWPFStyle style = styles.getStyle(
+                    paragraph.getStyleID()
+            );
+
+            if (style != null && style.getName() != null) {
+                TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
+                        style.getName(), paragraph.getPartType() == 
BodyType.TABLECELL
+                );
+                tag = tas.getTag();
+                styleClass = tas.getStyleClass();
+            }
+        }
+
+        if (styleClass == null) {
+            xhtml.startElement(tag);
+        } else {
+            xhtml.startElement(tag, "class", styleClass);
+        }
 
         writeParagraphNumber(paragraph, listManager, xhtml);
-       // Output placeholder for any embedded docs:
+        // Output placeholder for any embedded docs:
 
-       // TODO: replace w/ XPath/XQuery:
-       for(XWPFRun run : paragraph.getRuns()) {
-          XmlCursor c = run.getCTR().newCursor();
-          c.selectPath("./*");
-          while (c.toNextSelection()) {
-             XmlObject o = c.getObject();
-             if (o instanceof CTObject) {
-                XmlCursor c2 = o.newCursor();
-                c2.selectPath("./*");
-                while (c2.toNextSelection()) {
-                   XmlObject o2 = c2.getObject();
-
-                   XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
-                   if (embedAtt != null && 
embedAtt.getDomNode().getNodeValue().equals("Embed")) {
-                      // Type is "Embed"
-                      XmlObject relIDAtt = o2.selectAttribute(new 
QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", 
"id"));
-                      if (relIDAtt != null) {
-                         String relID = relIDAtt.getDomNode().getNodeValue();
-                         AttributesImpl attributes = new AttributesImpl();
-                         attributes.addAttribute("", "class", "class", 
"CDATA", "embedded");
-                         attributes.addAttribute("", "id", "id", "CDATA", 
relID);
-                         xhtml.startElement("div", attributes);
-                         xhtml.endElement("div");
-                      }
-                   }
+        // TODO: replace w/ XPath/XQuery:
+        for (XWPFRun run : paragraph.getRuns()) {
+            XmlCursor c = run.getCTR().newCursor();
+            c.selectPath("./*");
+            while (c.toNextSelection()) {
+                XmlObject o = c.getObject();
+                if (o instanceof CTObject) {
+                    XmlCursor c2 = o.newCursor();
+                    c2.selectPath("./*");
+                    while (c2.toNextSelection()) {
+                        XmlObject o2 = c2.getObject();
+
+                        XmlObject embedAtt = o2.selectAttribute(new 
QName("Type"));
+                        if (embedAtt != null && 
embedAtt.getDomNode().getNodeValue().equals("Embed")) {
+                            // Type is "Embed"
+                            XmlObject relIDAtt = o2.selectAttribute(new 
QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", 
"id"));
+                            if (relIDAtt != null) {
+                                String relID = 
relIDAtt.getDomNode().getNodeValue();
+                                AttributesImpl attributes = new 
AttributesImpl();
+                                attributes.addAttribute("", "class", "class", 
"CDATA", "embedded");
+                                attributes.addAttribute("", "id", "id", 
"CDATA", relID);
+                                xhtml.startElement("div", attributes);
+                                xhtml.endElement("div");
+                            }
+                        }
+                    }
+                    c2.dispose();
                 }
-                c2.dispose();
-             }
-          }
-
-          c.dispose();
-       }
-       
-       // Attach bookmarks for the paragraph
-       // (In future, we might put them in the right place, for now
-       //  we just put them in the correct paragraph)
-       for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) 
{
-          CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i);
-          xhtml.startElement("a", "name", bookmark.getName());
-          xhtml.endElement("a");
-       }
-       
-       TmpFormatting fmtg = new TmpFormatting(false, false);
-       
-       // Do the iruns
-       for(IRunElement run : paragraph.getIRuns()) {
-          if (run instanceof XWPFSDT){
-             fmtg = closeStyleTags(xhtml, fmtg);
-             processSDTRun((XWPFSDT)run, xhtml);
-             //for now, we're ignoring formatting in sdt
-             //if you hit an sdt reset to false
-             fmtg.setBold(false);
-             fmtg.setItalic(false);
-          } else {
-             fmtg = processRun((XWPFRun)run, paragraph, xhtml, fmtg);
-          }
-       }
-       closeStyleTags(xhtml, fmtg);
-       
-       
-       // Now do any comments for the paragraph
-       XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, 
null);
-       String commentText = comments.getCommentText();
-       if(commentText != null && commentText.length() > 0) {
-          xhtml.characters(commentText);
-       }
-
-       String footnameText = paragraph.getFootnoteText();
-       if(footnameText != null && footnameText.length() > 0) {
-          xhtml.characters(footnameText + "\n");
-       }
-
-       // Also extract any paragraphs embedded in text boxes:
-       for (XmlObject embeddedParagraph : 
paragraph.getCTP().selectPath("declare namespace 
w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare 
namespace 
wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' 
.//*/wps:txbx/w:txbxContent/w:p")) {
-           extractParagraph(new 
XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), 
paragraph.getBody()), listManager, xhtml);
-       }
-
-       // Finish this paragraph
-       xhtml.endElement(tag);
-
-       if (headerFooterPolicy != null) {
-           extractFooters(xhtml, headerFooterPolicy, listManager);
-       }
+            }
+
+            c.dispose();
+        }
+
+        // Attach bookmarks for the paragraph
+        // (In future, we might put them in the right place, for now
+        //  we just put them in the correct paragraph)
+        for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); 
i++) {
+            CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i);
+            xhtml.startElement("a", "name", bookmark.getName());
+            xhtml.endElement("a");
+        }
+
+        TmpFormatting fmtg = new TmpFormatting(false, false);
+
+        // Do the iruns
+        for (IRunElement run : paragraph.getIRuns()) {
+            if (run instanceof XWPFSDT) {
+                fmtg = closeStyleTags(xhtml, fmtg);
+                processSDTRun((XWPFSDT) run, xhtml);
+                //for now, we're ignoring formatting in sdt
+                //if you hit an sdt reset to false
+                fmtg.setBold(false);
+                fmtg.setItalic(false);
+            } else {
+                fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg);
+            }
+        }
+        closeStyleTags(xhtml, fmtg);
+
+
+        // Now do any comments for the paragraph
+        XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, 
null);
+        String commentText = comments.getCommentText();
+        if (commentText != null && commentText.length() > 0) {
+            xhtml.characters(commentText);
+        }
+
+        String footnameText = paragraph.getFootnoteText();
+        if (footnameText != null && footnameText.length() > 0) {
+            xhtml.characters(footnameText + "\n");
+        }
+
+        // Also extract any paragraphs embedded in text boxes:
+        for (XmlObject embeddedParagraph : 
paragraph.getCTP().selectPath("declare namespace 
w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare 
namespace 
wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' 
.//*/wps:txbx/w:txbxContent/w:p")) {
+            extractParagraph(new 
XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), 
paragraph.getBody()), listManager, xhtml);
+        }
+
+        // Finish this paragraph
+        xhtml.endElement(tag);
+
+        if (headerFooterPolicy != null) {
+            extractFooters(xhtml, headerFooterPolicy, listManager);
+        }
     }
 
     private void writeParagraphNumber(XWPFParagraph paragraph,
@@ -267,110 +267,110 @@ public class XWPFWordExtractorDecorator
     }
 
     private TmpFormatting closeStyleTags(XHTMLContentHandler xhtml,
-          TmpFormatting fmtg) throws SAXException {
-       // Close any still open style tags
-       if (fmtg.isItalic()) {
-          xhtml.endElement("i");
-          fmtg.setItalic(false);
-       }
-       if (fmtg.isBold()) {
-          xhtml.endElement("b");
-          fmtg.setBold(false);
-       }
-       return fmtg;
-    }
-
-    private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph, 
-          XHTMLContentHandler xhtml, TmpFormatting tfmtg) 
-          throws SAXException, XmlException, IOException{
-       // True if we are currently in the named style tag:
-       if (run.isBold() != tfmtg.isBold()) {
-          if (tfmtg.isItalic()) {
-             xhtml.endElement("i");
-             tfmtg.setItalic(false);
-          }
-          if (run.isBold()) {
-             xhtml.startElement("b");
-          } else {
-             xhtml.endElement("b");
-          }
-          tfmtg.setBold(run.isBold());
-       }
-
-       if (run.isItalic() != tfmtg.isItalic()) {
-          if (run.isItalic()) {
-             xhtml.startElement("i");
-          } else {
-             xhtml.endElement("i");
-          }
-          tfmtg.setItalic(run.isItalic());
-       }
-
-       boolean addedHREF = false;
-       if(run instanceof XWPFHyperlinkRun) {
-          XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun)run;
-          XWPFHyperlink link = linkRun.getHyperlink(document);
-          if(link != null && link.getURL() != null) {
-             xhtml.startElement("a", "href", link.getURL());
-             addedHREF = true;
-          } else if(linkRun.getAnchor() != null && 
linkRun.getAnchor().length() > 0) {
-             xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
-             addedHREF = true;
-          }
-       }
-
-       xhtml.characters(run.toString());
-
-       // If we have any pictures, output them
-       for(XWPFPicture picture : run.getEmbeddedPictures()) {
-          if(paragraph.getDocument() != null) {
-             XWPFPictureData data = picture.getPictureData();
-             if(data != null) {
-                AttributesImpl attr = new AttributesImpl();
-
-                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + 
data.getFileName());
-                attr.addAttribute("", "alt", "alt", "CDATA", 
picture.getDescription());
-
-                xhtml.startElement("img", attr);
-                xhtml.endElement("img");
-             }
-          }
-       }
-
-       if (addedHREF) {
-          xhtml.endElement("a");
-       }
+                                         TmpFormatting fmtg) throws 
SAXException {
+        // Close any still open style tags
+        if (fmtg.isItalic()) {
+            xhtml.endElement("i");
+            fmtg.setItalic(false);
+        }
+        if (fmtg.isBold()) {
+            xhtml.endElement("b");
+            fmtg.setBold(false);
+        }
+        return fmtg;
+    }
 
-       return tfmtg;
+    private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph,
+                                     XHTMLContentHandler xhtml, TmpFormatting 
tfmtg)
+            throws SAXException, XmlException, IOException {
+        // True if we are currently in the named style tag:
+        if (run.isBold() != tfmtg.isBold()) {
+            if (tfmtg.isItalic()) {
+                xhtml.endElement("i");
+                tfmtg.setItalic(false);
+            }
+            if (run.isBold()) {
+                xhtml.startElement("b");
+            } else {
+                xhtml.endElement("b");
+            }
+            tfmtg.setBold(run.isBold());
+        }
+
+        if (run.isItalic() != tfmtg.isItalic()) {
+            if (run.isItalic()) {
+                xhtml.startElement("i");
+            } else {
+                xhtml.endElement("i");
+            }
+            tfmtg.setItalic(run.isItalic());
+        }
+
+        boolean addedHREF = false;
+        if (run instanceof XWPFHyperlinkRun) {
+            XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun) run;
+            XWPFHyperlink link = linkRun.getHyperlink(document);
+            if (link != null && link.getURL() != null) {
+                xhtml.startElement("a", "href", link.getURL());
+                addedHREF = true;
+            } else if (linkRun.getAnchor() != null && 
linkRun.getAnchor().length() > 0) {
+                xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
+                addedHREF = true;
+            }
+        }
+
+        xhtml.characters(run.toString());
+
+        // If we have any pictures, output them
+        for (XWPFPicture picture : run.getEmbeddedPictures()) {
+            if (paragraph.getDocument() != null) {
+                XWPFPictureData data = picture.getPictureData();
+                if (data != null) {
+                    AttributesImpl attr = new AttributesImpl();
+
+                    attr.addAttribute("", "src", "src", "CDATA", "embedded:" + 
data.getFileName());
+                    attr.addAttribute("", "alt", "alt", "CDATA", 
picture.getDescription());
+
+                    xhtml.startElement("img", attr);
+                    xhtml.endElement("img");
+                }
+            }
+        }
+
+        if (addedHREF) {
+            xhtml.endElement("a");
+        }
+
+        return tfmtg;
     }
 
     private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml)
-          throws SAXException, XmlException, IOException{
-       xhtml.characters(run.getContent().getText());
+            throws SAXException, XmlException, IOException {
+        xhtml.characters(run.getContent().getText());
     }
 
-    private void extractTable(XWPFTable table, XWPFListManager listManager, 
-            XHTMLContentHandler xhtml)
+    private void extractTable(XWPFTable table, XWPFListManager listManager,
+                              XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
-       xhtml.startElement("table");
-       xhtml.startElement("tbody");
-       for(XWPFTableRow row : table.getRows()) {
-          xhtml.startElement("tr");
-          for(ICell cell : row.getTableICells()){
-              xhtml.startElement("td");
-              if (cell instanceof XWPFTableCell) {
-                  extractIBodyText((XWPFTableCell)cell, listManager, xhtml);
-              } else if (cell instanceof XWPFSDTCell) {
-                  xhtml.characters(((XWPFSDTCell)cell).getContent().getText());
-              }
-              xhtml.endElement("td");
-          }
-          xhtml.endElement("tr");
-       }
-       xhtml.endElement("tbody");
-       xhtml.endElement("table");
+        xhtml.startElement("table");
+        xhtml.startElement("tbody");
+        for (XWPFTableRow row : table.getRows()) {
+            xhtml.startElement("tr");
+            for (ICell cell : row.getTableICells()) {
+                xhtml.startElement("td");
+                if (cell instanceof XWPFTableCell) {
+                    extractIBodyText((XWPFTableCell) cell, listManager, xhtml);
+                } else if (cell instanceof XWPFSDTCell) {
+                    xhtml.characters(((XWPFSDTCell) 
cell).getContent().getText());
+                }
+                xhtml.endElement("td");
+            }
+            xhtml.endElement("tr");
+        }
+        xhtml.endElement("tbody");
+        xhtml.endElement("table");
     }
-    
+
     private void extractFooters(
             XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy,
             XWPFListManager listManager)
@@ -391,7 +391,7 @@ public class XWPFWordExtractorDecorator
             XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy, 
XWPFListManager listManager)
             throws SAXException, XmlException, IOException {
         if (hfPolicy == null) return;
-       
+
         if (hfPolicy.getFirstPageHeader() != null) {
             extractHeaderText(xhtml, hfPolicy.getFirstPageHeader(), 
listManager);
         }
@@ -407,48 +407,53 @@ public class XWPFWordExtractorDecorator
 
     private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter 
header, XWPFListManager listManager) throws SAXException, XmlException, 
IOException {
 
-        for (IBodyElement e : header.getBodyElements()){
-           if (e instanceof XWPFParagraph){
-              extractParagraph((XWPFParagraph)e, listManager, xhtml);
-           } else if (e instanceof XWPFTable){
-              extractTable((XWPFTable)e, listManager, xhtml);
-           } else if (e instanceof XWPFSDT){
-              extractSDT((XWPFSDT)e, xhtml);
-           }
+        for (IBodyElement e : header.getBodyElements()) {
+            if (e instanceof XWPFParagraph) {
+                extractParagraph((XWPFParagraph) e, listManager, xhtml);
+            } else if (e instanceof XWPFTable) {
+                extractTable((XWPFTable) e, listManager, xhtml);
+            } else if (e instanceof XWPFSDT) {
+                extractSDT((XWPFSDT) e, xhtml);
+            }
         }
     }
 
     /**
      * Word documents are simple, they only have the one
-     *  main part
+     * main part
      */
     @Override
     protected List<PackagePart> getMainDocumentParts() {
-       List<PackagePart> parts = new ArrayList<PackagePart>();
-       parts.add( document.getPackagePart() );
-       return parts;
-    }
-    
-    private class TmpFormatting{
-       private boolean bold = false;
-       private boolean italic = false;
-       private TmpFormatting(boolean bold, boolean italic){
-          this.bold = bold;
-          this.italic = italic;
-       }
-       public boolean isBold() {
-          return bold;
-       }
-       public void setBold(boolean bold) {
-          this.bold = bold;
-       }
-       public boolean isItalic() {
-          return italic;
-       }
-       public void setItalic(boolean italic) {
-          this.italic = italic;
-       }
-       
+        List<PackagePart> parts = new ArrayList<PackagePart>();
+        parts.add(document.getPackagePart());
+        return parts;
+    }
+
+    private class TmpFormatting {
+        private boolean bold = false;
+        private boolean italic = false;
+
+        private TmpFormatting(boolean bold, boolean italic) {
+            this.bold = bold;
+            this.italic = italic;
+        }
+
+        public boolean isBold() {
+            return bold;
+        }
+
+        public void setBold(boolean bold) {
+            this.bold = bold;
+        }
+
+        public boolean isItalic() {
+            return italic;
+        }
+
+        public void setItalic(boolean italic) {
+            this.italic = italic;
+        }
+
     }
 
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
 Fri May 29 14:36:21 2015
@@ -38,13 +38,14 @@ public class AccessChecker implements Se
      * This constructs an {@link AccessChecker} that
      * will not perform any checking and will always return without
      * throwing an exception.
-     * <p>
+     * <p/>
      * This constructor is available to allow for Tika's legacy ( <= v1.7) 
behavior.
      */
     public AccessChecker() {
         needToCheck = false;
         allowAccessibility = true;
     }
+
     /**
      * This constructs an {@link AccessChecker} that will check
      * for whether or not content should be extracted from a document.
@@ -69,7 +70,7 @@ public class AccessChecker implements Se
         }
         if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) {
             if (allowAccessibility) {
-                
if("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) {
+                if 
("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) {
                     return;
                 }
                 throw new AccessPermissionException("Content extraction for 
accessibility is not allowed.");

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java 
Fri May 29 14:36:21 2015
@@ -81,19 +81,21 @@ import org.xml.sax.helpers.AttributesImp
  * stream.
  */
 class PDF2XHTML extends PDFTextStripper {
-    
+
+    /**
+     * Maximum recursive depth during AcroForm processing.
+     * Prevents theoretical AcroForm recursion bomb.
+     */
+    private final static int MAX_ACROFORM_RECURSIONS = 10;
     /**
      * Format used for signature dates
      * TODO Make this thread-safe
      */
     private final SimpleDateFormat dateFormat = new 
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
- 
-    /**
-     * Maximum recursive depth during AcroForm processing.
-     * Prevents theoretical AcroForm recursion bomb. 
-     */
-    private final static int MAX_ACROFORM_RECURSIONS = 10;
-
+    private final ContentHandler originalHandler;
+    private final ParseContext context;
+    private final XHTMLContentHandler handler;
+    private final PDFParserConfig config;
     /**
      * This keeps track of the pdf object ids for inline
      * images that have been processed.  If {@link 
PDFParserConfig#getExtractUniqueInlineImagesOnly()
@@ -102,17 +104,26 @@ class PDF2XHTML extends PDFTextStripper
      * This integer is used to identify images in the markup.
      */
     private Map<String, Integer> processedInlineImages = new HashMap<String, 
Integer>();
-
     private int inlineImageCounter = 0;
+    private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata 
metadata,
+                      PDFParserConfig config)
+            throws IOException {
+        //source of config (derives from context or PDFParser?) is
+        //already determined in PDFParser.  No need to check context here.
+        this.config = config;
+        this.originalHandler = handler;
+        this.context = context;
+        this.handler = new XHTMLContentHandler(handler, metadata);
+    }
 
     /**
      * Converts the given PDF document (and related metadata) to a stream
      * of XHTML SAX events sent to the given content handler.
      *
      * @param document PDF document
-     * @param handler SAX content handler
+     * @param handler  SAX content handler
      * @param metadata PDF metadata
-     * @throws SAXException if the content handler fails to process SAX events
+     * @throws SAXException  if the content handler fails to process SAX events
      * @throws TikaException if the PDF document can not be processed
      */
     public static void process(
@@ -124,16 +135,18 @@ class PDF2XHTML extends PDFTextStripper
             // key methods to output to the given content
             // handler.
             PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata, 
config);
-            
+
             config.configure(pdf2XHTML);
 
             pdf2XHTML.writeText(document, new Writer() {
                 @Override
                 public void write(char[] cbuf, int off, int len) {
                 }
+
                 @Override
                 public void flush() {
                 }
+
                 @Override
                 public void close() {
                 }
@@ -147,22 +160,6 @@ class PDF2XHTML extends PDFTextStripper
             }
         }
     }
-    
-    private final ContentHandler originalHandler;
-    private final ParseContext context;
-    private final XHTMLContentHandler handler;
-    private final PDFParserConfig config;
-    
-    private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata 
metadata, 
-            PDFParserConfig config)
-            throws IOException {
-        //source of config (derives from context or PDFParser?) is
-        //already determined in PDFParser.  No need to check context here.
-        this.config = config;
-        this.originalHandler = handler;
-        this.context = context;
-        this.handler = new XHTMLContentHandler(handler, metadata);
-    }
 
     void extractBookmarkText() throws SAXException {
         PDDocumentOutline outline = 
document.getDocumentCatalog().getDocumentOutline();
@@ -202,14 +199,14 @@ class PDF2XHTML extends PDFTextStripper
             // Extract text for any bookmarks:
             extractBookmarkText();
             extractEmbeddedDocuments(pdf, originalHandler);
-            
+
             //extract acroform data at end of doc
             if (config.getExtractAcroFormContent() == true) {
                 extractAcroForm(pdf, handler);
-             }
+            }
             handler.endDocument();
         } catch (TikaException e) {
-           throw new IOExceptionWithCause("Unable to end a document", e);
+            throw new IOExceptionWithCause("Unable to end a document", e);
         } catch (SAXException e) {
             throw new IOExceptionWithCause("Unable to end a document", e);
         }
@@ -235,7 +232,7 @@ class PDF2XHTML extends PDFTextStripper
             EmbeddedDocumentExtractor extractor = 
getEmbeddedDocumentExtractor();
             for (PDAnnotation annotation : page.getAnnotations()) {
 
-                if (annotation instanceof PDAnnotationFileAttachment){
+                if (annotation instanceof PDAnnotationFileAttachment) {
                     PDAnnotationFileAttachment fann = 
(PDAnnotationFileAttachment) annotation;
                     PDComplexFileSpecification fileSpec = 
(PDComplexFileSpecification) fann.getFile();
                     try {
@@ -316,7 +313,7 @@ class PDF2XHTML extends PDFTextStripper
         }
 
         for (Map.Entry<String, PDXObject> entry : xObjects.entrySet()) {
-                        
+
             PDXObject object = entry.getValue();
             if (object instanceof PDXObjectForm) {
                 extractImages(((PDXObjectForm) object).getResources());
@@ -341,7 +338,7 @@ class PDF2XHTML extends PDFTextStripper
                 if (imageNumber == null) {
                     imageNumber = inlineImageCounter++;
                 }
-                String fileName = "image"+imageNumber+extension;
+                String fileName = "image" + imageNumber + extension;
                 metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
 
                 // Output the img tag
@@ -355,7 +352,7 @@ class PDF2XHTML extends PDFTextStripper
                 //If so, have we already processed this one?
                 if (config.getExtractUniqueInlineImagesOnly() == true) {
                     String cosObjectId = entry.getKey();
-                    if (processedInlineImages.containsKey(cosObjectId)){
+                    if (processedInlineImages.containsKey(cosObjectId)) {
                         continue;
                     }
                     processedInlineImages.put(cosObjectId, imageNumber);
@@ -452,7 +449,7 @@ class PDF2XHTML extends PDFTextStripper
                     "Unable to write a newline character", e);
         }
     }
-    
+
     private void extractEmbeddedDocuments(PDDocument document, ContentHandler 
handler)
             throws IOException, SAXException, TikaException {
         PDDocumentCatalog catalog = document.getDocumentCatalog();
@@ -495,14 +492,14 @@ class PDF2XHTML extends PDFTextStripper
         }
 
         EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
-        for (Map.Entry<String,COSObjectable> ent : 
embeddedFileNames.entrySet()) {
+        for (Map.Entry<String, COSObjectable> ent : 
embeddedFileNames.entrySet()) {
             PDComplexFileSpecification spec = (PDComplexFileSpecification) 
ent.getValue();
             extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
         }
     }
 
     private void extractMultiOSPDEmbeddedFiles(String defaultName,
-        PDComplexFileSpecification spec, EmbeddedDocumentExtractor extractor) 
throws IOException,
+                                               PDComplexFileSpecification 
spec, EmbeddedDocumentExtractor extractor) throws IOException,
             SAXException, TikaException {
 
         if (spec == null) {
@@ -516,8 +513,8 @@ class PDF2XHTML extends PDFTextStripper
     }
 
     private void extractPDEmbeddedFile(String defaultName, String fileName, 
PDEmbeddedFile file,
-                              EmbeddedDocumentExtractor extractor)
-            throws SAXException, IOException, TikaException{
+                                       EmbeddedDocumentExtractor extractor)
+            throws SAXException, IOException, TikaException {
 
         if (file == null) {
             //skip silently
@@ -536,7 +533,7 @@ class PDF2XHTML extends PDFTextStripper
 
         if (extractor.shouldParseEmbedded(metadata)) {
             TikaInputStream stream = null;
-            try{
+            try {
                 stream = TikaInputStream.get(file.createInputStream());
                 extractor.parseEmbedded(
                         stream,
@@ -554,8 +551,8 @@ class PDF2XHTML extends PDFTextStripper
         }
     }
 
-    private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) 
throws IOException, 
-    SAXException {
+    private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) 
throws IOException,
+            SAXException {
         //Thank you, Ben Litchfield, for 
org.apache.pdfbox.examples.fdf.PrintFields
         //this code derives from Ben's code
         PDDocumentCatalog catalog = pdf.getDocumentCatalog();
@@ -574,7 +571,7 @@ class PDF2XHTML extends PDFTextStripper
             return;
 
         @SuppressWarnings("rawtypes")
-        ListIterator itr  = fields.listIterator();
+        ListIterator itr = fields.listIterator();
 
         if (itr == null)
             return;
@@ -585,7 +582,7 @@ class PDF2XHTML extends PDFTextStripper
         while (itr.hasNext()) {
             Object obj = itr.next();
             if (obj != null && obj instanceof PDField) {
-                processAcroField((PDField)obj, handler, 0);
+                processAcroField((PDField) obj, handler, 0);
             }
         }
         handler.endElement("ol");
@@ -593,7 +590,7 @@ class PDF2XHTML extends PDFTextStripper
     }
 
     private void processAcroField(PDField field, XHTMLContentHandler handler, 
final int currentRecursiveDepth)
-            throws SAXException, IOException { 
+            throws SAXException, IOException {
 
         if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
             return;
@@ -602,14 +599,14 @@ class PDF2XHTML extends PDFTextStripper
         addFieldString(field, handler);
 
         List<COSObjectable> kids = field.getKids();
-        if(kids != null) {
+        if (kids != null) {
 
-            int r = currentRecursiveDepth+1;
+            int r = currentRecursiveDepth + 1;
             handler.startElement("ol");
             //TODO: can generate <ol/>. Rework to avoid that.
-            for(COSObjectable pdfObj : kids) {
-                if(pdfObj != null && pdfObj instanceof PDField) {
-                    PDField kid = (PDField)pdfObj;
+            for (COSObjectable pdfObj : kids) {
+                if (pdfObj != null && pdfObj instanceof PDField) {
+                    PDField kid = (PDField) pdfObj;
                     //recurse
                     processAcroField(kid, handler, r);
                 }
@@ -635,13 +632,13 @@ class PDF2XHTML extends PDFTextStripper
         }
         //return early if PDSignature field
         if (field instanceof PDSignatureField) {
-            handleSignature(attrs, (PDSignatureField)field, handler);
+            handleSignature(attrs, (PDSignatureField) field, handler);
             return;
         }
         try {
             //getValue can throw an IOException if there is no value
             String value = field.getValue();
-            if (value != null && ! value.equals("null")) {
+            if (value != null && !value.equals("null")) {
                 sb.append(value);
             }
         } catch (IOException e) {
@@ -656,14 +653,14 @@ class PDF2XHTML extends PDFTextStripper
     }
 
     private void handleSignature(AttributesImpl parentAttributes, 
PDSignatureField sigField,
-            XHTMLContentHandler handler) throws SAXException {
+                                 XHTMLContentHandler handler) throws 
SAXException {
 
 
         PDSignature sig = sigField.getSignature();
         if (sig == null) {
             return;
         }
-        Map<String, String> vals= new TreeMap<String, String>();
+        Map<String, String> vals = new TreeMap<String, String>();
         vals.put("name", sig.getName());
         vals.put("contactInfo", sig.getContactInfo());
         vals.put("location", sig.getLocation());
@@ -677,7 +674,7 @@ class PDF2XHTML extends PDFTextStripper
         //see if there is any data
         int nonNull = 0;
         for (String val : vals.keySet()) {
-            if (val != null && ! val.equals("")) {
+            if (val != null && !val.equals("")) {
                 nonNull++;
             }
         }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java 
Fri May 29 14:36:21 2015
@@ -61,7 +61,7 @@ import org.xml.sax.SAXException;
 
 /**
  * PDF parser.
- * <p>
+ * <p/>
  * This parser can process also encrypted PDF documents if the required
  * password is given as a part of the input metadata associated with a
  * document. If no password is given, then this parser will try decrypting
@@ -69,7 +69,7 @@ import org.xml.sax.SAXException;
  * the PDF contains any embedded documents (for example as part of a PDF
  * package) then this parser will use the {@link EmbeddedDocumentExtractor}
  * to handle them.
- * <p>
+ * <p/>
  * As of Tika 1.6, it is possible to extract inline images with
  * the {@link EmbeddedDocumentExtractor} as if they were regular
  * attachments.  By default, this feature is turned off because of
@@ -80,12 +80,6 @@ import org.xml.sax.SAXException;
 public class PDFParser extends AbstractParser {
 
 
-    private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
-
-    /** Serial version UID */
-    private static final long serialVersionUID = -752276948656079347L;
-
-    private PDFParserConfig defaultConfig = new PDFParserConfig();
     /**
      * Metadata key for giving the document password to the parser.
      *
@@ -93,9 +87,14 @@ public class PDFParser extends AbstractP
      * @deprecated Supply a {@link PasswordProvider} on the {@link 
ParseContext} instead
      */
     public static final String PASSWORD = 
"org.apache.tika.parser.pdf.password";
-
+    private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -752276948656079347L;
     private static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.singleton(MEDIA_TYPE);
+            Collections.singleton(MEDIA_TYPE);
+    private PDFParserConfig defaultConfig = new PDFParserConfig();
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
@@ -105,7 +104,7 @@ public class PDFParser extends AbstractP
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-       
+
         PDDocument pdfDocument = null;
         TemporaryResources tmp = new TemporaryResources();
         //config from context, or default if not set via context
@@ -136,7 +135,7 @@ public class PDFParser extends AbstractP
             metadata.set("pdf:encrypted", 
Boolean.toString(pdfDocument.isEncrypted()));
 
             //if using the classic parser and the doc is encrypted, we must 
manually decrypt
-            if (! localConfig.getUseNonSequentialParser() && 
pdfDocument.isEncrypted()) {
+            if (!localConfig.getUseNonSequentialParser() && 
pdfDocument.isEncrypted()) {
                 pdfDocument.decrypt(password);
             }
 
@@ -148,14 +147,14 @@ public class PDFParser extends AbstractP
             if (handler != null) {
                 PDF2XHTML.process(pdfDocument, handler, context, metadata, 
localConfig);
             }
-            
+
         } catch (CryptographyException e) {
             //seq parser throws CryptographyException for bad password
             throw new EncryptedDocumentException(e);
         } catch (IOException e) {
             //nonseq parser throws IOException for bad password
             //At the Tika level, we want the same exception to be thrown
-            if (e.getMessage() != null && 
+            if (e.getMessage() != null &&
                     e.getMessage().contains("Error (CryptographyException)")) {
                 metadata.set("pdf:encrypted", Boolean.toString(true));
                 throw new EncryptedDocumentException(e);
@@ -164,7 +163,7 @@ public class PDFParser extends AbstractP
             throw e;
         } finally {
             if (pdfDocument != null) {
-               pdfDocument.close();
+                pdfDocument.close();
             }
             tmp.dispose();
             //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200)
@@ -217,11 +216,10 @@ public class PDFParser extends AbstractP
                 Boolean.toString(ap.canPrintDegraded()));
 
 
-
         //now go for the XMP stuff
         org.apache.jempbox.xmp.XMPMetadata xmp = null;
         XMPSchemaDublinCore dcSchema = null;
-        try{
+        try {
             if (document.getDocumentCatalog().getMetadata() != null) {
                 xmp = 
document.getDocumentCatalog().getMetadata().exportXMPMetadata();
             }
@@ -258,15 +256,15 @@ public class PDFParser extends AbstractP
         } catch (IOException e) {
             // Invalid date format, just ignore
         }
-        
+
         // All remaining metadata is custom
         // Copy this over as-is
         List<String> handledMetadata = Arrays.asList("Author", "Creator", 
"CreationDate", "ModDate",
                 "Keywords", "Producer", "Subject", "Title", "Trapped");
-        for(COSName key : info.getDictionary().keySet()) {
+        for (COSName key : info.getDictionary().keySet()) {
             String name = key.getName();
-            if(! handledMetadata.contains(name)) {
-               addMetadata(metadata, name, 
info.getDictionary().getDictionaryObject(key));
+            if (!handledMetadata.contains(name)) {
+                addMetadata(metadata, name, 
info.getDictionary().getDictionaryObject(key));
             }
         }
 
@@ -276,50 +274,50 @@ public class PDFParser extends AbstractP
         //    TikaCoreProperties.FORMAT can be multivalued
         //    There are also three potential pdf specific version keys: 
pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion        
         metadata.set("pdf:PDFVersion", 
Float.toString(document.getDocument().getVersion()));
-        metadata.add(TikaCoreProperties.FORMAT.getName(), 
-            MEDIA_TYPE.toString()+"; version="+
-            Float.toString(document.getDocument().getVersion()));
+        metadata.add(TikaCoreProperties.FORMAT.getName(),
+                MEDIA_TYPE.toString() + "; version=" +
+                        Float.toString(document.getDocument().getVersion()));
 
-        try {           
-            if( xmp != null ) {
+        try {
+            if (xmp != null) {
                 xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, 
XMPSchemaPDFAId.class);
                 XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) 
xmp.getSchemaByClass(XMPSchemaPDFAId.class);
-                if( pdfaxmp != null ) {
+                if (pdfaxmp != null) {
                     if (pdfaxmp.getPart() != null) {
                         metadata.set("pdfaid:part", 
Integer.toString(pdfaxmp.getPart()));
                     }
                     if (pdfaxmp.getConformance() != null) {
                         metadata.set("pdfaid:conformance", 
pdfaxmp.getConformance());
-                        String version = 
"A-"+pdfaxmp.getPart()+pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
-                        metadata.set("pdfa:PDFVersion", version );
-                        metadata.add(TikaCoreProperties.FORMAT.getName(), 
-                            MEDIA_TYPE.toString()+"; version=\""+version+"\"" 
);
+                        String version = "A-" + pdfaxmp.getPart() + 
pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
+                        metadata.set("pdfa:PDFVersion", version);
+                        metadata.add(TikaCoreProperties.FORMAT.getName(),
+                                MEDIA_TYPE.toString() + "; version=\"" + 
version + "\"");
                     }
-                } 
+                }
                 // TODO WARN if this XMP version is inconsistent with document 
header version?          
             }
         } catch (IOException e) {
-            
metadata.set(TikaCoreProperties.TIKA_META_PREFIX+"pdf:metadata-xmp-parse-failed",
 ""+e);
+            metadata.set(TikaCoreProperties.TIKA_META_PREFIX + 
"pdf:metadata-xmp-parse-failed", "" + e);
         }
         //TODO: Let's try to move this into PDFBox.
         //Attempt to determine Adobe extension level, if present:
         COSDictionary root = document.getDocumentCatalog().getCOSDictionary();
-        COSDictionary extensions = (COSDictionary) 
root.getDictionaryObject(COSName.getPDFName("Extensions") );
-        if( extensions != null ) {
-            for( COSName extName : extensions.keySet() ) {
+        COSDictionary extensions = (COSDictionary) 
root.getDictionaryObject(COSName.getPDFName("Extensions"));
+        if (extensions != null) {
+            for (COSName extName : extensions.keySet()) {
                 // If it's an Adobe one, interpret it to determine the 
extension level:
-                if( extName.equals( COSName.getPDFName("ADBE") )) {
+                if (extName.equals(COSName.getPDFName("ADBE"))) {
                     COSDictionary adobeExt = (COSDictionary) 
extensions.getDictionaryObject(extName);
-                    if( adobeExt != null ) {
+                    if (adobeExt != null) {
                         String baseVersion = 
adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
                         int el = 
adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
                         //-1 is sentinel value that something went wrong in 
getInt
                         if (el != -1) {
-                            metadata.set("pdf:PDFExtensionVersion", 
baseVersion+" Adobe Extension Level "+el );
-                            metadata.add(TikaCoreProperties.FORMAT.getName(), 
-                                MEDIA_TYPE.toString()+"; 
version=\""+baseVersion+" Adobe Extension Level "+el+"\"");
+                            metadata.set("pdf:PDFExtensionVersion", 
baseVersion + " Adobe Extension Level " + el);
+                            metadata.add(TikaCoreProperties.FORMAT.getName(),
+                                    MEDIA_TYPE.toString() + "; version=\"" + 
baseVersion + " Adobe Extension Level " + el + "\"");
                         }
-                    }                   
+                    }
                 } else {
                     // WARN that there is an Extension, but it's not Adobe's, 
and so is a 'new' format'.
                     metadata.set("pdf:foundNonAdobeExtensionName", 
extName.getName());
@@ -328,19 +326,20 @@ public class PDFParser extends AbstractP
         }
     }
 
-   /**
+    /**
      * Try to extract all multilingual items from the XMPSchema
-     * <p>
+     * <p/>
      * This relies on the property having a valid xmp getName()
-     * <p>
+     * <p/>
      * For now, this only extracts the first language if the property does not 
allow multiple values (see TIKA-1295)
+     *
      * @param metadata
      * @param property
      * @param pdfBoxBaseline
      * @param schema
      */
     private void extractMultilingualItems(Metadata metadata, Property property,
-            String pdfBoxBaseline, XMPSchema schema) {
+                                          String pdfBoxBaseline, XMPSchema 
schema) {
         //if schema is null, just go with pdfBoxBaseline
         if (schema == null) {
             if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
@@ -354,11 +353,11 @@ public class PDFParser extends AbstractP
 
             if (value != null && value.length() > 0) {
                 //if you're going to add it below in the baseline addition, 
don't add it now
-                if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)){
+                if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) {
                     continue;
                 }
-                metadata.add(property, value); 
-                if (! property.isMultiValuePermitted()){
+                metadata.add(property, value);
+                if (!property.isMultiValuePermitted()) {
                     return;
                 }
             }
@@ -367,12 +366,12 @@ public class PDFParser extends AbstractP
         if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
             //if we've already added something above and multivalue is not 
permitted
             //return.
-            if (! property.isMultiValuePermitted()){
-                if (metadata.get(property) != null){
+            if (!property.isMultiValuePermitted()) {
+                if (metadata.get(property) != null) {
                     return;
                 }
             }
-            metadata.add(property,  pdfBoxBaseline);
+            metadata.add(property, pdfBoxBaseline);
         }
     }
 
@@ -380,24 +379,24 @@ public class PDFParser extends AbstractP
     /**
      * This tries to read a list from a particular property in
      * XMPSchemaDublinCore.
-     * If it can't find the information, it falls back to the 
+     * If it can't find the information, it falls back to the
      * pdfboxBaseline.  The pdfboxBaseline should be the value
      * that pdfbox returns from its PDDocumentInformation object
      * (e.g. getAuthor()) This method is designed include the pdfboxBaseline,
      * and it should not duplicate the pdfboxBaseline.
-     * <p>
+     * <p/>
      * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this
      * on dates!
-     * <p>
+     * <p/>
      * This relies on the property having a DublinCore compliant getName()
-     * 
+     *
      * @param property
      * @param pdfBoxBaseline
      * @param dc
      * @param metadata
      */
-    private void extractDublinCoreListItems(Metadata metadata, Property 
property, 
-            String pdfBoxBaseline, XMPSchemaDublinCore dc) {
+    private void extractDublinCoreListItems(Metadata metadata, Property 
property,
+                                            String pdfBoxBaseline, 
XMPSchemaDublinCore dc) {
         //if no dc, add baseline and return
         if (dc == null) {
             if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
@@ -413,22 +412,22 @@ public class PDFParser extends AbstractP
             return;
         }
         for (String item : items) {
-            if (pdfBoxBaseline != null && ! item.equals(pdfBoxBaseline)) {
+            if (pdfBoxBaseline != null && !item.equals(pdfBoxBaseline)) {
                 addMetadata(metadata, property, item);
             }
         }
         //finally, add the baseline
         if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
             addMetadata(metadata, property, pdfBoxBaseline);
-        }    
+        }
     }
 
     /**
      * As of this writing, XMPSchema can contain bags or sequence lists
-     * for some attributes...despite standards documentation.  
+     * for some attributes...despite standards documentation.
      * JempBox expects one or the other for specific attributes.
      * Until more flexibility is added to JempBox, Tika will have to handle 
both.
-     * 
+     *
      * @param schema
      * @param name
      * @return list of values or null
@@ -446,7 +445,7 @@ public class PDFParser extends AbstractP
             metadata.add(property, value);
         }
     }
-    
+
     private void addMetadata(Metadata metadata, String name, String value) {
         if (value != null) {
             metadata.add(name, value);
@@ -467,15 +466,15 @@ public class PDFParser extends AbstractP
 
     /**
      * Used when processing custom metadata entries, as PDFBox won't do
-     *  the conversion for us in the way it does for the standard ones
+     * the conversion for us in the way it does for the standard ones
      */
     private void addMetadata(Metadata metadata, String name, COSBase value) {
-        if(value instanceof COSArray) {
-            for(Object v : ((COSArray)value).toList()) {
+        if (value instanceof COSArray) {
+            for (Object v : ((COSArray) value).toList()) {
                 addMetadata(metadata, name, ((COSBase) v));
             }
-        } else if(value instanceof COSString) {
-            addMetadata(metadata, name, ((COSString)value).getString());
+        } else if (value instanceof COSString) {
+            addMetadata(metadata, name, ((COSString) value).getString());
         }
         // Avoid calling COSDictionary#toString, since it can lead to infinite
         // recursion. See TIKA-1038 and PDFBOX-1835.
@@ -484,56 +483,66 @@ public class PDFParser extends AbstractP
         }
     }
 
+    public PDFParserConfig getPDFParserConfig() {
+        return defaultConfig;
+    }
+
     public void setPDFParserConfig(PDFParserConfig config) {
         this.defaultConfig = config;
     }
-    
-    public PDFParserConfig getPDFParserConfig() {
-        return defaultConfig;
+
+    /**
+     * @see #setUseNonSequentialParser(boolean)
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
+    public boolean getUseNonSequentialParser() {
+        return defaultConfig.getUseNonSequentialParser();
     }
-    
+
     /**
      * If true, the parser will use the NonSequentialParser.  This may
      * be faster than the full doc parser.
      * If false (default), this will use the full doc parser.
-     * 
+     *
      * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setUseNonSequentialParser(boolean v) {
         defaultConfig.setUseNonSequentialParser(v);
     }
-    
-    /** 
-     * @see #setUseNonSequentialParser(boolean) 
+
+    /**
+     * @see #setEnableAutoSpace(boolean)
      * @deprecated use {@link #getPDFParserConfig()}
      */
-    public boolean getUseNonSequentialParser() {
-        return defaultConfig.getUseNonSequentialParser();
+    public boolean getEnableAutoSpace() {
+        return defaultConfig.getEnableAutoSpace();
     }
-    
+
     /**
-     *  If true (the default), the parser should estimate
-     *  where spaces should be inserted between words.  For
-     *  many PDFs this is necessary as they do not include
-     *  explicit whitespace characters.
+     * If true (the default), the parser should estimate
+     * where spaces should be inserted between words.  For
+     * many PDFs this is necessary as they do not include
+     * explicit whitespace characters.
      *
-     *  @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setEnableAutoSpace(boolean v) {
         defaultConfig.setEnableAutoSpace(v);
     }
 
-    /** 
-     * @see #setEnableAutoSpace(boolean) 
+    /**
+     * If true, text in annotations will be extracted.
+     *
      * @deprecated use {@link #getPDFParserConfig()}
      */
-    public boolean getEnableAutoSpace() {
-        return defaultConfig.getEnableAutoSpace();
+    public boolean getExtractAnnotationText() {
+        return defaultConfig.getExtractAnnotationText();
     }
 
     /**
      * If true (the default), text in annotations will be
      * extracted.
+     *
      * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setExtractAnnotationText(boolean v) {
@@ -541,59 +550,48 @@ public class PDFParser extends AbstractP
     }
 
     /**
-     * If true, text in annotations will be extracted.
-     * 
+     * @see #setSuppressDuplicateOverlappingText(boolean)
      * @deprecated use {@link #getPDFParserConfig()}
      */
-    public boolean getExtractAnnotationText() {
-        return defaultConfig.getExtractAnnotationText();
+    public boolean getSuppressDuplicateOverlappingText() {
+        return defaultConfig.getSuppressDuplicateOverlappingText();
     }
 
     /**
-     *  If true, the parser should try to remove duplicated
-     *  text over the same region.  This is needed for some
-     *  PDFs that achieve bolding by re-writing the same
-     *  text in the same area.  Note that this can
-     *  slow down extraction substantially (PDFBOX-956) and
-     *  sometimes remove characters that were not in fact
-     *  duplicated (PDFBOX-1155).  By default this is disabled.
-     *  
-     *  @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
+     * If true, the parser should try to remove duplicated
+     * text over the same region.  This is needed for some
+     * PDFs that achieve bolding by re-writing the same
+     * text in the same area.  Note that this can
+     * slow down extraction substantially (PDFBOX-956) and
+     * sometimes remove characters that were not in fact
+     * duplicated (PDFBOX-1155).  By default this is disabled.
+     *
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setSuppressDuplicateOverlappingText(boolean v) {
         defaultConfig.setSuppressDuplicateOverlappingText(v);
     }
 
-    /** 
-     * @see #setSuppressDuplicateOverlappingText(boolean) 
-     * 
+    /**
+     * @see #setSortByPosition(boolean)
      * @deprecated use {@link #getPDFParserConfig()}
      */
-    public boolean getSuppressDuplicateOverlappingText() {
-        return defaultConfig.getSuppressDuplicateOverlappingText();
+    public boolean getSortByPosition() {
+        return defaultConfig.getSortByPosition();
     }
 
     /**
-     *  If true, sort text tokens by their x/y position
-     *  before extracting text.  This may be necessary for
-     *  some PDFs (if the text tokens are not rendered "in
-     *  order"), while for other PDFs it can produce the
-     *  wrong result (for example if there are 2 columns,
-     *  the text will be interleaved).  Default is false.
-     *  
-     *  @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
+     * If true, sort text tokens by their x/y position
+     * before extracting text.  This may be necessary for
+     * some PDFs (if the text tokens are not rendered "in
+     * order"), while for other PDFs it can produce the
+     * wrong result (for example if there are 2 columns,
+     * the text will be interleaved).  Default is false.
+     *
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
      */
     public void setSortByPosition(boolean v) {
         defaultConfig.setSortByPosition(v);
     }
 
-    /** 
-     * @see #setSortByPosition(boolean) 
-     * 
-     * @deprecated use {@link #getPDFParserConfig()}
-     */
-    public boolean getSortByPosition() {
-        return defaultConfig.getSortByPosition();
-    }
-
 }

svn commit: r1682489 [5/14] - in /tika/trunk: tika-parsers/src/main/java/org/apache/tika/parser/html/ tika-parsers/src/main/java/org/apache/tika/parser/image/ tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/ tika-parsers/src/main/java/org/a...

Reply via email to