Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Fri May 29 14:36:21 2015 @@ -16,16 +16,15 @@ */ package org.apache.tika.parser.microsoft.ooxml; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.Locale; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; - import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; @@ -63,6 +62,10 @@ import org.xml.sax.SAXException; import org.xml.sax.XMLReader; public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { + /** + * Allows access to headers/footers from raw xml strings + */ + private static HeaderFooterHelper hfHelper = new HeaderFooterHelper(); private final XSSFEventBasedExcelExtractor extractor; private final DataFormatter formatter; private final List<PackagePart> sheetParts = new ArrayList<PackagePart>(); @@ -75,11 +78,11 @@ public class XSSFExcelExtractorDecorator this.extractor = extractor; extractor.setFormulasNotResults(false); extractor.setLocale(locale); - - if(locale == null) { - formatter = new DataFormatter(); - } else { - formatter = new DataFormatter(locale); + + if (locale == null) { + formatter = new DataFormatter(); + } else { + formatter = new DataFormatter(locale); } } @@ -88,10 +91,10 @@ public class XSSFExcelExtractorDecorator ContentHandler handler, Metadata metadata, ParseContext context) throws SAXException, XmlException, IOException, TikaException { - this.metadata = metadata; - metadata.set(TikaMetadataKeys.PROTECTED, "false"); + this.metadata = metadata; + metadata.set(TikaMetadataKeys.PROTECTED, "false"); - super.getXHTML(handler, metadata, context); + super.getXHTML(handler, metadata, context); } /** @@ -100,277 +103,293 @@ public class XSSFExcelExtractorDecorator @Override protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException { - OPCPackage container = extractor.getPackage(); - - ReadOnlySharedStringsTable strings; - XSSFReader.SheetIterator iter; - XSSFReader xssfReader; - StylesTable styles; - try { - xssfReader = new XSSFReader(container); - styles = xssfReader.getStylesTable(); - iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData(); - strings = new ReadOnlySharedStringsTable(container); - } catch(InvalidFormatException e) { - throw new XmlException(e); - } catch (OpenXML4JException oe) { - throw new XmlException(oe); - } - - while (iter.hasNext()) { - InputStream stream = iter.next(); - sheetParts.add(iter.getSheetPart()); - - SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml); - CommentsTable comments = iter.getSheetComments(); - - // Start, and output the sheet name - xhtml.startElement("div"); - xhtml.element("h1", iter.getSheetName()); - - // Extract the main sheet contents - xhtml.startElement("table"); - xhtml.startElement("tbody"); - - processSheet(sheetExtractor, comments, styles, strings, stream); - - xhtml.endElement("tbody"); - xhtml.endElement("table"); - - // Output any headers and footers - // (Need to process the sheet to get them, so we can't - // do the headers before the contents) - for(String header : sheetExtractor.headers) { - extractHeaderFooter(header, xhtml); - } - for(String footer : sheetExtractor.footers) { - extractHeaderFooter(footer, xhtml); - } - processShapes(iter.getShapes(), xhtml); - // All done with this sheet - xhtml.endElement("div"); - } + OPCPackage container = extractor.getPackage(); + + ReadOnlySharedStringsTable strings; + XSSFReader.SheetIterator iter; + XSSFReader xssfReader; + StylesTable styles; + try { + xssfReader = new XSSFReader(container); + styles = xssfReader.getStylesTable(); + iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData(); + strings = new ReadOnlySharedStringsTable(container); + } catch (InvalidFormatException e) { + throw new XmlException(e); + } catch (OpenXML4JException oe) { + throw new XmlException(oe); + } + + while (iter.hasNext()) { + InputStream stream = iter.next(); + sheetParts.add(iter.getSheetPart()); + + SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml); + CommentsTable comments = iter.getSheetComments(); + + // Start, and output the sheet name + xhtml.startElement("div"); + xhtml.element("h1", iter.getSheetName()); + + // Extract the main sheet contents + xhtml.startElement("table"); + xhtml.startElement("tbody"); + + processSheet(sheetExtractor, comments, styles, strings, stream); + + xhtml.endElement("tbody"); + xhtml.endElement("table"); + + // Output any headers and footers + // (Need to process the sheet to get them, so we can't + // do the headers before the contents) + for (String header : sheetExtractor.headers) { + extractHeaderFooter(header, xhtml); + } + for (String footer : sheetExtractor.footers) { + extractHeaderFooter(footer, xhtml); + } + processShapes(iter.getShapes(), xhtml); + // All done with this sheet + xhtml.endElement("div"); + } } private void extractHeaderFooter(String hf, XHTMLContentHandler xhtml) throws SAXException { String content = ExcelExtractor._extractHeaderFooter( - new HeaderFooterFromString(hf)); + new HeaderFooterFromString(hf)); if (content.length() > 0) { xhtml.element("p", content); } } - + private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException { - if (shapes == null){ - return; - } - for (XSSFShape shape : shapes){ - if (shape instanceof XSSFSimpleShape){ - String sText = ((XSSFSimpleShape)shape).getText(); - if (sText != null && sText.length() > 0){ - xhtml.element("p", sText); - } - } - } - } - + if (shapes == null) { + return; + } + for (XSSFShape shape : shapes) { + if (shape instanceof XSSFSimpleShape) { + String sText = ((XSSFSimpleShape) shape).getText(); + if (sText != null && sText.length() > 0) { + xhtml.element("p", sText); + } + } + } + } + public void processSheet( - SheetContentsHandler sheetContentsExtractor, - CommentsTable comments, - StylesTable styles, - ReadOnlySharedStringsTable strings, - InputStream sheetInputStream) - throws IOException, SAXException { - InputSource sheetSource = new InputSource(sheetInputStream); - SAXParserFactory saxFactory = SAXParserFactory.newInstance(); - try { - SAXParser saxParser = saxFactory.newSAXParser(); - XMLReader sheetParser = saxParser.getXMLReader(); - XSSFSheetInterestingPartsCapturer handler = - new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler( - styles, comments, strings, sheetContentsExtractor, formatter, false)); - sheetParser.setContentHandler(handler); - sheetParser.parse(sheetSource); - sheetInputStream.close(); - - if (handler.hasProtection) { - metadata.set(TikaMetadataKeys.PROTECTED, "true"); - } - } catch(ParserConfigurationException e) { - throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage()); - } + SheetContentsHandler sheetContentsExtractor, + CommentsTable comments, + StylesTable styles, + ReadOnlySharedStringsTable strings, + InputStream sheetInputStream) + throws IOException, SAXException { + InputSource sheetSource = new InputSource(sheetInputStream); + SAXParserFactory saxFactory = SAXParserFactory.newInstance(); + try { + SAXParser saxParser = saxFactory.newSAXParser(); + XMLReader sheetParser = saxParser.getXMLReader(); + XSSFSheetInterestingPartsCapturer handler = + new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler( + styles, comments, strings, sheetContentsExtractor, formatter, false)); + sheetParser.setContentHandler(handler); + sheetParser.parse(sheetSource); + sheetInputStream.close(); + + if (handler.hasProtection) { + metadata.set(TikaMetadataKeys.PROTECTED, "true"); + } + } catch (ParserConfigurationException e) { + throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage()); + } } - + /** - * Turns formatted sheet events into HTML + * In Excel files, sheets have things embedded in them, + * and sheet drawings which have the images */ - protected static class SheetTextAsHTML implements SheetContentsHandler { - private XHTMLContentHandler xhtml; - private List<String> headers; - private List<String> footers; - - protected SheetTextAsHTML(XHTMLContentHandler xhtml) { - this.xhtml = xhtml; - headers = new ArrayList<String>(); - footers = new ArrayList<String>(); - } - - public void startRow(int rowNum) { - try { - xhtml.startElement("tr"); - } catch(SAXException e) {} - } - - public void endRow(int rowNum) { - try { - xhtml.endElement("tr"); - } catch(SAXException e) {} - } - - public void cell(String cellRef, String formattedValue, XSSFComment comment) { - try { - xhtml.startElement("td"); - - // Main cell contents - if (formattedValue != null) { - xhtml.characters(formattedValue); - } - - // Comments - if(comment != null) { - xhtml.startElement("br"); - xhtml.endElement("br"); - xhtml.characters(comment.getAuthor()); - xhtml.characters(": "); - xhtml.characters(comment.getString().getString()); - } - - xhtml.endElement("td"); - } catch(SAXException e) {} - } - - public void headerFooter(String text, boolean isHeader, String tagName) { - if(isHeader) { - headers.add(text); - } else { - footers.add(text); - } - } + @Override + protected List<PackagePart> getMainDocumentParts() throws TikaException { + List<PackagePart> parts = new ArrayList<PackagePart>(); + for (PackagePart part : sheetParts) { + // Add the sheet + parts.add(part); + + // If it has drawings, return those too + try { + for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) { + if (rel.getTargetMode() == TargetMode.INTERNAL) { + PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); + parts.add(rel.getPackage().getPart(relName)); + } + } + for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) { + if (rel.getTargetMode() == TargetMode.INTERNAL) { + PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); + parts.add(rel.getPackage().getPart(relName)); + } + } + } catch (InvalidFormatException e) { + throw new TikaException("Broken OOXML file", e); + } + } + + return parts; } - + /** - * Allows access to headers/footers from raw xml strings + * Turns formatted sheet events into HTML */ - private static HeaderFooterHelper hfHelper = new HeaderFooterHelper(); + protected static class SheetTextAsHTML implements SheetContentsHandler { + private XHTMLContentHandler xhtml; + private List<String> headers; + private List<String> footers; + + protected SheetTextAsHTML(XHTMLContentHandler xhtml) { + this.xhtml = xhtml; + headers = new ArrayList<String>(); + footers = new ArrayList<String>(); + } + + public void startRow(int rowNum) { + try { + xhtml.startElement("tr"); + } catch (SAXException e) { + } + } + + public void endRow(int rowNum) { + try { + xhtml.endElement("tr"); + } catch (SAXException e) { + } + } + + public void cell(String cellRef, String formattedValue, XSSFComment comment) { + try { + xhtml.startElement("td"); + + // Main cell contents + if (formattedValue != null) { + xhtml.characters(formattedValue); + } + + // Comments + if (comment != null) { + xhtml.startElement("br"); + xhtml.endElement("br"); + xhtml.characters(comment.getAuthor()); + xhtml.characters(": "); + xhtml.characters(comment.getString().getString()); + } + + xhtml.endElement("td"); + } catch (SAXException e) { + } + } + + public void headerFooter(String text, boolean isHeader, String tagName) { + if (isHeader) { + headers.add(text); + } else { + footers.add(text); + } + } + } + protected static class HeaderFooterFromString implements HeaderFooter { - private String text; - protected HeaderFooterFromString(String text) { - this.text = text; - } - - public String getCenter() { - return hfHelper.getCenterSection(text); - } - public String getLeft() { - return hfHelper.getLeftSection(text); - } - public String getRight() { - return hfHelper.getRightSection(text); - } - - public void setCenter(String paramString) {} - public void setLeft(String paramString) {} - public void setRight(String paramString) {} + private String text; + + protected HeaderFooterFromString(String text) { + this.text = text; + } + + public String getCenter() { + return hfHelper.getCenterSection(text); + } + + public void setCenter(String paramString) { + } + + public String getLeft() { + return hfHelper.getLeftSection(text); + } + + public void setLeft(String paramString) { + } + + public String getRight() { + return hfHelper.getRightSection(text); + } + + public void setRight(String paramString) { + } } - + /** * Captures information on interesting tags, whilst - * delegating the main work to the formatting handler + * delegating the main work to the formatting handler */ protected static class XSSFSheetInterestingPartsCapturer implements ContentHandler { - private ContentHandler delegate; - private boolean hasProtection = false; - - protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) { - this.delegate = delegate; - } - - public void startElement(String uri, String localName, String qName, - Attributes atts) throws SAXException { - if("sheetProtection".equals(qName)) { - hasProtection = true; - } - delegate.startElement(uri, localName, qName, atts); - } + private ContentHandler delegate; + private boolean hasProtection = false; - public void characters(char[] ch, int start, int length) - throws SAXException { - delegate.characters(ch, start, length); - } - public void endDocument() throws SAXException { - delegate.endDocument(); - } - public void endElement(String uri, String localName, String qName) - throws SAXException { - delegate.endElement(uri, localName, qName); - } - public void endPrefixMapping(String prefix) throws SAXException { - delegate.endPrefixMapping(prefix); - } - public void ignorableWhitespace(char[] ch, int start, int length) - throws SAXException { - delegate.ignorableWhitespace(ch, start, length); - } - public void processingInstruction(String target, String data) - throws SAXException { - delegate.processingInstruction(target, data); - } - public void setDocumentLocator(Locator locator) { - delegate.setDocumentLocator(locator); - } - public void skippedEntity(String name) throws SAXException { - delegate.skippedEntity(name); - } - public void startDocument() throws SAXException { - delegate.startDocument(); - } - public void startPrefixMapping(String prefix, String uri) - throws SAXException { - delegate.startPrefixMapping(prefix, uri); - } - } - - /** - * In Excel files, sheets have things embedded in them, - * and sheet drawings which have the images - */ - @Override - protected List<PackagePart> getMainDocumentParts() throws TikaException { - List<PackagePart> parts = new ArrayList<PackagePart>(); - for(PackagePart part : sheetParts) { - // Add the sheet - parts.add(part); - - // If it has drawings, return those too - try { - for(PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) { - if(rel.getTargetMode() == TargetMode.INTERNAL) { - PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); - parts.add( rel.getPackage().getPart(relName) ); - } - } - for(PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) { - if(rel.getTargetMode() == TargetMode.INTERNAL) { - PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); - parts.add( rel.getPackage().getPart(relName) ); - } - } - } catch(InvalidFormatException e) { - throw new TikaException("Broken OOXML file", e); - } - } + protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) { + this.delegate = delegate; + } - return parts; + public void startElement(String uri, String localName, String qName, + Attributes atts) throws SAXException { + if ("sheetProtection".equals(qName)) { + hasProtection = true; + } + delegate.startElement(uri, localName, qName, atts); + } + + public void characters(char[] ch, int start, int length) + throws SAXException { + delegate.characters(ch, start, length); + } + + public void endDocument() throws SAXException { + delegate.endDocument(); + } + + public void endElement(String uri, String localName, String qName) + throws SAXException { + delegate.endElement(uri, localName, qName); + } + + public void endPrefixMapping(String prefix) throws SAXException { + delegate.endPrefixMapping(prefix); + } + + public void ignorableWhitespace(char[] ch, int start, int length) + throws SAXException { + delegate.ignorableWhitespace(ch, start, length); + } + + public void processingInstruction(String target, String data) + throws SAXException { + delegate.processingInstruction(target, data); + } + + public void setDocumentLocator(Locator locator) { + delegate.setDocumentLocator(locator); + } + + public void skippedEntity(String name) throws SAXException { + delegate.skippedEntity(name); + } + + public void startDocument() throws SAXException { + delegate.startDocument(); + } + + public void startPrefixMapping(String prefix, String uri) + throws SAXException { + delegate.startPrefixMapping(prefix, uri); + } } }
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java Fri May 29 14:36:21 2015 @@ -30,13 +30,8 @@ import org.openxmlformats.schemas.wordpr public class XWPFListManager extends AbstractListManager { private final static boolean OVERRIDE_AVAILABLE; - private final static String SKIP_FORMAT = Character.toString((char)61623);//if this shows up as the lvlText, don't show a number + private final static String SKIP_FORMAT = Character.toString((char) 61623);//if this shows up as the lvlText, don't show a number - private final XWPFNumbering numbering; - //map of numId (which paragraph series is this a member of?), levelcounts - public XWPFListManager(XWPFDocument document) { - numbering = document.getNumbering(); - } static { boolean b = false; try { @@ -47,6 +42,14 @@ public class XWPFListManager extends Abs b = OVERRIDE_AVAILABLE = false; } + + private final XWPFNumbering numbering; + + //map of numId (which paragraph series is this a member of?), levelcounts + public XWPFListManager(XWPFDocument document) { + numbering = document.getNumbering(); + } + public String getFormattedNumber(final XWPFParagraph paragraph) { int currNumId = paragraph.getNumID().intValue(); CTNum ctNum = numbering.getNum(paragraph.getNumID()).getCTNum(); @@ -74,7 +77,8 @@ public class XWPFListManager extends Abs * WARNING: currently always returns null. * TODO: Once CTNumLvl is available to Tika, * we can turn this back on. - * @param ctNum number on which to build the overrides + * + * @param ctNum number on which to build the overrides * @param length length of intended array * @return null or an array of override tuples of length {@param length} */ @@ -121,7 +125,7 @@ public class XWPFListManager extends Abs boolean isLegal = false; int start = 1; int restart = -1; - String lvlText = "%"+level+"."; + String lvlText = "%" + level + "."; String numFmt = "decimal"; Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Fri May 29 14:36:21 2015 @@ -63,7 +63,7 @@ import org.xml.sax.helpers.AttributesImp public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3 - private static final String LIST_DELIMITER = " "; + private static final String LIST_DELIMITER = " "; private XWPFDocument document; @@ -71,7 +71,7 @@ public class XWPFWordExtractorDecorator public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) { super(context, extractor); - + document = (XWPFDocument) extractor.getDocument(); styles = document.getStyles(); } @@ -85,7 +85,7 @@ public class XWPFWordExtractorDecorator XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); XWPFListManager listManager = new XWPFListManager(document); // headers - if (hfPolicy!=null) { + if (hfPolicy != null) { extractHeaders(xhtml, hfPolicy, listManager); } @@ -93,164 +93,164 @@ public class XWPFWordExtractorDecorator extractIBodyText(document, listManager, xhtml); // then all document tables - if (hfPolicy!=null) { + if (hfPolicy != null) { extractFooters(xhtml, hfPolicy, listManager); } } private void extractIBodyText(IBody bodyElement, XWPFListManager listManager, - XHTMLContentHandler xhtml) + XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException { - for(IBodyElement element : bodyElement.getBodyElements()) { - if(element instanceof XWPFParagraph) { - XWPFParagraph paragraph = (XWPFParagraph)element; - extractParagraph(paragraph, listManager, xhtml); - } - if(element instanceof XWPFTable) { - XWPFTable table = (XWPFTable)element; - extractTable(table, listManager, xhtml); - } - if (element instanceof XWPFSDT){ - extractSDT((XWPFSDT) element, xhtml); - } - - } - } - - private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml) throws SAXException, - XmlException, IOException { - ISDTContent content = element.getContent(); - String tag = "p"; - xhtml.startElement(tag); - xhtml.characters(content.getText()); - xhtml.endElement(tag); + for (IBodyElement element : bodyElement.getBodyElements()) { + if (element instanceof XWPFParagraph) { + XWPFParagraph paragraph = (XWPFParagraph) element; + extractParagraph(paragraph, listManager, xhtml); + } + if (element instanceof XWPFTable) { + XWPFTable table = (XWPFTable) element; + extractTable(table, listManager, xhtml); + } + if (element instanceof XWPFSDT) { + extractSDT((XWPFSDT) element, xhtml); + } + + } } - + + private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml) throws SAXException, + XmlException, IOException { + ISDTContent content = element.getContent(); + String tag = "p"; + xhtml.startElement(tag); + xhtml.characters(content.getText()); + xhtml.endElement(tag); + } + private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager, - XHTMLContentHandler xhtml) + XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException { - // If this paragraph is actually a whole new section, then - // it could have its own headers and footers - // Check and handle if so - XWPFHeaderFooterPolicy headerFooterPolicy = null; - if (paragraph.getCTP().getPPr() != null) { - CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr(); - if(ctSectPr != null) { - headerFooterPolicy = - new XWPFHeaderFooterPolicy(document, ctSectPr); - extractHeaders(xhtml, headerFooterPolicy, listManager); - } - } - - // Is this a paragraph, or a heading? - String tag = "p"; - String styleClass = null; - if(paragraph.getStyleID() != null) { - XWPFStyle style = styles.getStyle( - paragraph.getStyleID() - ); - - if (style != null && style.getName() != null) { - TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle( - style.getName(), paragraph.getPartType() == BodyType.TABLECELL - ); - tag = tas.getTag(); - styleClass = tas.getStyleClass(); - } - } - - if(styleClass == null) { - xhtml.startElement(tag); - } else { - xhtml.startElement(tag, "class", styleClass); - } + // If this paragraph is actually a whole new section, then + // it could have its own headers and footers + // Check and handle if so + XWPFHeaderFooterPolicy headerFooterPolicy = null; + if (paragraph.getCTP().getPPr() != null) { + CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr(); + if (ctSectPr != null) { + headerFooterPolicy = + new XWPFHeaderFooterPolicy(document, ctSectPr); + extractHeaders(xhtml, headerFooterPolicy, listManager); + } + } + + // Is this a paragraph, or a heading? + String tag = "p"; + String styleClass = null; + if (paragraph.getStyleID() != null) { + XWPFStyle style = styles.getStyle( + paragraph.getStyleID() + ); + + if (style != null && style.getName() != null) { + TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle( + style.getName(), paragraph.getPartType() == BodyType.TABLECELL + ); + tag = tas.getTag(); + styleClass = tas.getStyleClass(); + } + } + + if (styleClass == null) { + xhtml.startElement(tag); + } else { + xhtml.startElement(tag, "class", styleClass); + } writeParagraphNumber(paragraph, listManager, xhtml); - // Output placeholder for any embedded docs: + // Output placeholder for any embedded docs: - // TODO: replace w/ XPath/XQuery: - for(XWPFRun run : paragraph.getRuns()) { - XmlCursor c = run.getCTR().newCursor(); - c.selectPath("./*"); - while (c.toNextSelection()) { - XmlObject o = c.getObject(); - if (o instanceof CTObject) { - XmlCursor c2 = o.newCursor(); - c2.selectPath("./*"); - while (c2.toNextSelection()) { - XmlObject o2 = c2.getObject(); - - XmlObject embedAtt = o2.selectAttribute(new QName("Type")); - if (embedAtt != null && embedAtt.getDomNode().getNodeValue().equals("Embed")) { - // Type is "Embed" - XmlObject relIDAtt = o2.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id")); - if (relIDAtt != null) { - String relID = relIDAtt.getDomNode().getNodeValue(); - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", relID); - xhtml.startElement("div", attributes); - xhtml.endElement("div"); - } - } + // TODO: replace w/ XPath/XQuery: + for (XWPFRun run : paragraph.getRuns()) { + XmlCursor c = run.getCTR().newCursor(); + c.selectPath("./*"); + while (c.toNextSelection()) { + XmlObject o = c.getObject(); + if (o instanceof CTObject) { + XmlCursor c2 = o.newCursor(); + c2.selectPath("./*"); + while (c2.toNextSelection()) { + XmlObject o2 = c2.getObject(); + + XmlObject embedAtt = o2.selectAttribute(new QName("Type")); + if (embedAtt != null && embedAtt.getDomNode().getNodeValue().equals("Embed")) { + // Type is "Embed" + XmlObject relIDAtt = o2.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id")); + if (relIDAtt != null) { + String relID = relIDAtt.getDomNode().getNodeValue(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", relID); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } + } + } + c2.dispose(); } - c2.dispose(); - } - } - - c.dispose(); - } - - // Attach bookmarks for the paragraph - // (In future, we might put them in the right place, for now - // we just put them in the correct paragraph) - for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) { - CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i); - xhtml.startElement("a", "name", bookmark.getName()); - xhtml.endElement("a"); - } - - TmpFormatting fmtg = new TmpFormatting(false, false); - - // Do the iruns - for(IRunElement run : paragraph.getIRuns()) { - if (run instanceof XWPFSDT){ - fmtg = closeStyleTags(xhtml, fmtg); - processSDTRun((XWPFSDT)run, xhtml); - //for now, we're ignoring formatting in sdt - //if you hit an sdt reset to false - fmtg.setBold(false); - fmtg.setItalic(false); - } else { - fmtg = processRun((XWPFRun)run, paragraph, xhtml, fmtg); - } - } - closeStyleTags(xhtml, fmtg); - - - // Now do any comments for the paragraph - XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null); - String commentText = comments.getCommentText(); - if(commentText != null && commentText.length() > 0) { - xhtml.characters(commentText); - } - - String footnameText = paragraph.getFootnoteText(); - if(footnameText != null && footnameText.length() > 0) { - xhtml.characters(footnameText + "\n"); - } - - // Also extract any paragraphs embedded in text boxes: - for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) { - extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml); - } - - // Finish this paragraph - xhtml.endElement(tag); - - if (headerFooterPolicy != null) { - extractFooters(xhtml, headerFooterPolicy, listManager); - } + } + + c.dispose(); + } + + // Attach bookmarks for the paragraph + // (In future, we might put them in the right place, for now + // we just put them in the correct paragraph) + for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) { + CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i); + xhtml.startElement("a", "name", bookmark.getName()); + xhtml.endElement("a"); + } + + TmpFormatting fmtg = new TmpFormatting(false, false); + + // Do the iruns + for (IRunElement run : paragraph.getIRuns()) { + if (run instanceof XWPFSDT) { + fmtg = closeStyleTags(xhtml, fmtg); + processSDTRun((XWPFSDT) run, xhtml); + //for now, we're ignoring formatting in sdt + //if you hit an sdt reset to false + fmtg.setBold(false); + fmtg.setItalic(false); + } else { + fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg); + } + } + closeStyleTags(xhtml, fmtg); + + + // Now do any comments for the paragraph + XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null); + String commentText = comments.getCommentText(); + if (commentText != null && commentText.length() > 0) { + xhtml.characters(commentText); + } + + String footnameText = paragraph.getFootnoteText(); + if (footnameText != null && footnameText.length() > 0) { + xhtml.characters(footnameText + "\n"); + } + + // Also extract any paragraphs embedded in text boxes: + for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) { + extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml); + } + + // Finish this paragraph + xhtml.endElement(tag); + + if (headerFooterPolicy != null) { + extractFooters(xhtml, headerFooterPolicy, listManager); + } } private void writeParagraphNumber(XWPFParagraph paragraph, @@ -267,110 +267,110 @@ public class XWPFWordExtractorDecorator } private TmpFormatting closeStyleTags(XHTMLContentHandler xhtml, - TmpFormatting fmtg) throws SAXException { - // Close any still open style tags - if (fmtg.isItalic()) { - xhtml.endElement("i"); - fmtg.setItalic(false); - } - if (fmtg.isBold()) { - xhtml.endElement("b"); - fmtg.setBold(false); - } - return fmtg; - } - - private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph, - XHTMLContentHandler xhtml, TmpFormatting tfmtg) - throws SAXException, XmlException, IOException{ - // True if we are currently in the named style tag: - if (run.isBold() != tfmtg.isBold()) { - if (tfmtg.isItalic()) { - xhtml.endElement("i"); - tfmtg.setItalic(false); - } - if (run.isBold()) { - xhtml.startElement("b"); - } else { - xhtml.endElement("b"); - } - tfmtg.setBold(run.isBold()); - } - - if (run.isItalic() != tfmtg.isItalic()) { - if (run.isItalic()) { - xhtml.startElement("i"); - } else { - xhtml.endElement("i"); - } - tfmtg.setItalic(run.isItalic()); - } - - boolean addedHREF = false; - if(run instanceof XWPFHyperlinkRun) { - XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun)run; - XWPFHyperlink link = linkRun.getHyperlink(document); - if(link != null && link.getURL() != null) { - xhtml.startElement("a", "href", link.getURL()); - addedHREF = true; - } else if(linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) { - xhtml.startElement("a", "href", "#" + linkRun.getAnchor()); - addedHREF = true; - } - } - - xhtml.characters(run.toString()); - - // If we have any pictures, output them - for(XWPFPicture picture : run.getEmbeddedPictures()) { - if(paragraph.getDocument() != null) { - XWPFPictureData data = picture.getPictureData(); - if(data != null) { - AttributesImpl attr = new AttributesImpl(); - - attr.addAttribute("", "src", "src", "CDATA", "embedded:" + data.getFileName()); - attr.addAttribute("", "alt", "alt", "CDATA", picture.getDescription()); - - xhtml.startElement("img", attr); - xhtml.endElement("img"); - } - } - } - - if (addedHREF) { - xhtml.endElement("a"); - } + TmpFormatting fmtg) throws SAXException { + // Close any still open style tags + if (fmtg.isItalic()) { + xhtml.endElement("i"); + fmtg.setItalic(false); + } + if (fmtg.isBold()) { + xhtml.endElement("b"); + fmtg.setBold(false); + } + return fmtg; + } - return tfmtg; + private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph, + XHTMLContentHandler xhtml, TmpFormatting tfmtg) + throws SAXException, XmlException, IOException { + // True if we are currently in the named style tag: + if (run.isBold() != tfmtg.isBold()) { + if (tfmtg.isItalic()) { + xhtml.endElement("i"); + tfmtg.setItalic(false); + } + if (run.isBold()) { + xhtml.startElement("b"); + } else { + xhtml.endElement("b"); + } + tfmtg.setBold(run.isBold()); + } + + if (run.isItalic() != tfmtg.isItalic()) { + if (run.isItalic()) { + xhtml.startElement("i"); + } else { + xhtml.endElement("i"); + } + tfmtg.setItalic(run.isItalic()); + } + + boolean addedHREF = false; + if (run instanceof XWPFHyperlinkRun) { + XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun) run; + XWPFHyperlink link = linkRun.getHyperlink(document); + if (link != null && link.getURL() != null) { + xhtml.startElement("a", "href", link.getURL()); + addedHREF = true; + } else if (linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) { + xhtml.startElement("a", "href", "#" + linkRun.getAnchor()); + addedHREF = true; + } + } + + xhtml.characters(run.toString()); + + // If we have any pictures, output them + for (XWPFPicture picture : run.getEmbeddedPictures()) { + if (paragraph.getDocument() != null) { + XWPFPictureData data = picture.getPictureData(); + if (data != null) { + AttributesImpl attr = new AttributesImpl(); + + attr.addAttribute("", "src", "src", "CDATA", "embedded:" + data.getFileName()); + attr.addAttribute("", "alt", "alt", "CDATA", picture.getDescription()); + + xhtml.startElement("img", attr); + xhtml.endElement("img"); + } + } + } + + if (addedHREF) { + xhtml.endElement("a"); + } + + return tfmtg; } private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml) - throws SAXException, XmlException, IOException{ - xhtml.characters(run.getContent().getText()); + throws SAXException, XmlException, IOException { + xhtml.characters(run.getContent().getText()); } - private void extractTable(XWPFTable table, XWPFListManager listManager, - XHTMLContentHandler xhtml) + private void extractTable(XWPFTable table, XWPFListManager listManager, + XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException { - xhtml.startElement("table"); - xhtml.startElement("tbody"); - for(XWPFTableRow row : table.getRows()) { - xhtml.startElement("tr"); - for(ICell cell : row.getTableICells()){ - xhtml.startElement("td"); - if (cell instanceof XWPFTableCell) { - extractIBodyText((XWPFTableCell)cell, listManager, xhtml); - } else if (cell instanceof XWPFSDTCell) { - xhtml.characters(((XWPFSDTCell)cell).getContent().getText()); - } - xhtml.endElement("td"); - } - xhtml.endElement("tr"); - } - xhtml.endElement("tbody"); - xhtml.endElement("table"); + xhtml.startElement("table"); + xhtml.startElement("tbody"); + for (XWPFTableRow row : table.getRows()) { + xhtml.startElement("tr"); + for (ICell cell : row.getTableICells()) { + xhtml.startElement("td"); + if (cell instanceof XWPFTableCell) { + extractIBodyText((XWPFTableCell) cell, listManager, xhtml); + } else if (cell instanceof XWPFSDTCell) { + xhtml.characters(((XWPFSDTCell) cell).getContent().getText()); + } + xhtml.endElement("td"); + } + xhtml.endElement("tr"); + } + xhtml.endElement("tbody"); + xhtml.endElement("table"); } - + private void extractFooters( XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy, XWPFListManager listManager) @@ -391,7 +391,7 @@ public class XWPFWordExtractorDecorator XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy, XWPFListManager listManager) throws SAXException, XmlException, IOException { if (hfPolicy == null) return; - + if (hfPolicy.getFirstPageHeader() != null) { extractHeaderText(xhtml, hfPolicy.getFirstPageHeader(), listManager); } @@ -407,48 +407,53 @@ public class XWPFWordExtractorDecorator private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header, XWPFListManager listManager) throws SAXException, XmlException, IOException { - for (IBodyElement e : header.getBodyElements()){ - if (e instanceof XWPFParagraph){ - extractParagraph((XWPFParagraph)e, listManager, xhtml); - } else if (e instanceof XWPFTable){ - extractTable((XWPFTable)e, listManager, xhtml); - } else if (e instanceof XWPFSDT){ - extractSDT((XWPFSDT)e, xhtml); - } + for (IBodyElement e : header.getBodyElements()) { + if (e instanceof XWPFParagraph) { + extractParagraph((XWPFParagraph) e, listManager, xhtml); + } else if (e instanceof XWPFTable) { + extractTable((XWPFTable) e, listManager, xhtml); + } else if (e instanceof XWPFSDT) { + extractSDT((XWPFSDT) e, xhtml); + } } } /** * Word documents are simple, they only have the one - * main part + * main part */ @Override protected List<PackagePart> getMainDocumentParts() { - List<PackagePart> parts = new ArrayList<PackagePart>(); - parts.add( document.getPackagePart() ); - return parts; - } - - private class TmpFormatting{ - private boolean bold = false; - private boolean italic = false; - private TmpFormatting(boolean bold, boolean italic){ - this.bold = bold; - this.italic = italic; - } - public boolean isBold() { - return bold; - } - public void setBold(boolean bold) { - this.bold = bold; - } - public boolean isItalic() { - return italic; - } - public void setItalic(boolean italic) { - this.italic = italic; - } - + List<PackagePart> parts = new ArrayList<PackagePart>(); + parts.add(document.getPackagePart()); + return parts; + } + + private class TmpFormatting { + private boolean bold = false; + private boolean italic = false; + + private TmpFormatting(boolean bold, boolean italic) { + this.bold = bold; + this.italic = italic; + } + + public boolean isBold() { + return bold; + } + + public void setBold(boolean bold) { + this.bold = bold; + } + + public boolean isItalic() { + return italic; + } + + public void setItalic(boolean italic) { + this.italic = italic; + } + } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java Fri May 29 14:36:21 2015 @@ -38,13 +38,14 @@ public class AccessChecker implements Se * This constructs an {@link AccessChecker} that * will not perform any checking and will always return without * throwing an exception. - * <p> + * <p/> * This constructor is available to allow for Tika's legacy ( <= v1.7) behavior. */ public AccessChecker() { needToCheck = false; allowAccessibility = true; } + /** * This constructs an {@link AccessChecker} that will check * for whether or not content should be extracted from a document. @@ -69,7 +70,7 @@ public class AccessChecker implements Se } if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) { if (allowAccessibility) { - if("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) { + if ("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) { return; } throw new AccessPermissionException("Content extraction for accessibility is not allowed."); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Fri May 29 14:36:21 2015 @@ -81,19 +81,21 @@ import org.xml.sax.helpers.AttributesImp * stream. */ class PDF2XHTML extends PDFTextStripper { - + + /** + * Maximum recursive depth during AcroForm processing. + * Prevents theoretical AcroForm recursion bomb. + */ + private final static int MAX_ACROFORM_RECURSIONS = 10; /** * Format used for signature dates * TODO Make this thread-safe */ private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT); - - /** - * Maximum recursive depth during AcroForm processing. - * Prevents theoretical AcroForm recursion bomb. - */ - private final static int MAX_ACROFORM_RECURSIONS = 10; - + private final ContentHandler originalHandler; + private final ParseContext context; + private final XHTMLContentHandler handler; + private final PDFParserConfig config; /** * This keeps track of the pdf object ids for inline * images that have been processed. If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly() @@ -102,17 +104,26 @@ class PDF2XHTML extends PDFTextStripper * This integer is used to identify images in the markup. */ private Map<String, Integer> processedInlineImages = new HashMap<String, Integer>(); - private int inlineImageCounter = 0; + private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata, + PDFParserConfig config) + throws IOException { + //source of config (derives from context or PDFParser?) is + //already determined in PDFParser. No need to check context here. + this.config = config; + this.originalHandler = handler; + this.context = context; + this.handler = new XHTMLContentHandler(handler, metadata); + } /** * Converts the given PDF document (and related metadata) to a stream * of XHTML SAX events sent to the given content handler. * * @param document PDF document - * @param handler SAX content handler + * @param handler SAX content handler * @param metadata PDF metadata - * @throws SAXException if the content handler fails to process SAX events + * @throws SAXException if the content handler fails to process SAX events * @throws TikaException if the PDF document can not be processed */ public static void process( @@ -124,16 +135,18 @@ class PDF2XHTML extends PDFTextStripper // key methods to output to the given content // handler. PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata, config); - + config.configure(pdf2XHTML); pdf2XHTML.writeText(document, new Writer() { @Override public void write(char[] cbuf, int off, int len) { } + @Override public void flush() { } + @Override public void close() { } @@ -147,22 +160,6 @@ class PDF2XHTML extends PDFTextStripper } } } - - private final ContentHandler originalHandler; - private final ParseContext context; - private final XHTMLContentHandler handler; - private final PDFParserConfig config; - - private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata, - PDFParserConfig config) - throws IOException { - //source of config (derives from context or PDFParser?) is - //already determined in PDFParser. No need to check context here. - this.config = config; - this.originalHandler = handler; - this.context = context; - this.handler = new XHTMLContentHandler(handler, metadata); - } void extractBookmarkText() throws SAXException { PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline(); @@ -202,14 +199,14 @@ class PDF2XHTML extends PDFTextStripper // Extract text for any bookmarks: extractBookmarkText(); extractEmbeddedDocuments(pdf, originalHandler); - + //extract acroform data at end of doc if (config.getExtractAcroFormContent() == true) { extractAcroForm(pdf, handler); - } + } handler.endDocument(); } catch (TikaException e) { - throw new IOExceptionWithCause("Unable to end a document", e); + throw new IOExceptionWithCause("Unable to end a document", e); } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a document", e); } @@ -235,7 +232,7 @@ class PDF2XHTML extends PDFTextStripper EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); for (PDAnnotation annotation : page.getAnnotations()) { - if (annotation instanceof PDAnnotationFileAttachment){ + if (annotation instanceof PDAnnotationFileAttachment) { PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation; PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile(); try { @@ -316,7 +313,7 @@ class PDF2XHTML extends PDFTextStripper } for (Map.Entry<String, PDXObject> entry : xObjects.entrySet()) { - + PDXObject object = entry.getValue(); if (object instanceof PDXObjectForm) { extractImages(((PDXObjectForm) object).getResources()); @@ -341,7 +338,7 @@ class PDF2XHTML extends PDFTextStripper if (imageNumber == null) { imageNumber = inlineImageCounter++; } - String fileName = "image"+imageNumber+extension; + String fileName = "image" + imageNumber + extension; metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); // Output the img tag @@ -355,7 +352,7 @@ class PDF2XHTML extends PDFTextStripper //If so, have we already processed this one? if (config.getExtractUniqueInlineImagesOnly() == true) { String cosObjectId = entry.getKey(); - if (processedInlineImages.containsKey(cosObjectId)){ + if (processedInlineImages.containsKey(cosObjectId)) { continue; } processedInlineImages.put(cosObjectId, imageNumber); @@ -452,7 +449,7 @@ class PDF2XHTML extends PDFTextStripper "Unable to write a newline character", e); } } - + private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler) throws IOException, SAXException, TikaException { PDDocumentCatalog catalog = document.getDocumentCatalog(); @@ -495,14 +492,14 @@ class PDF2XHTML extends PDFTextStripper } EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); - for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) { + for (Map.Entry<String, COSObjectable> ent : embeddedFileNames.entrySet()) { PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue(); extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor); } } private void extractMultiOSPDEmbeddedFiles(String defaultName, - PDComplexFileSpecification spec, EmbeddedDocumentExtractor extractor) throws IOException, + PDComplexFileSpecification spec, EmbeddedDocumentExtractor extractor) throws IOException, SAXException, TikaException { if (spec == null) { @@ -516,8 +513,8 @@ class PDF2XHTML extends PDFTextStripper } private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file, - EmbeddedDocumentExtractor extractor) - throws SAXException, IOException, TikaException{ + EmbeddedDocumentExtractor extractor) + throws SAXException, IOException, TikaException { if (file == null) { //skip silently @@ -536,7 +533,7 @@ class PDF2XHTML extends PDFTextStripper if (extractor.shouldParseEmbedded(metadata)) { TikaInputStream stream = null; - try{ + try { stream = TikaInputStream.get(file.createInputStream()); extractor.parseEmbedded( stream, @@ -554,8 +551,8 @@ class PDF2XHTML extends PDFTextStripper } } - private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException, - SAXException { + private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException, + SAXException { //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields //this code derives from Ben's code PDDocumentCatalog catalog = pdf.getDocumentCatalog(); @@ -574,7 +571,7 @@ class PDF2XHTML extends PDFTextStripper return; @SuppressWarnings("rawtypes") - ListIterator itr = fields.listIterator(); + ListIterator itr = fields.listIterator(); if (itr == null) return; @@ -585,7 +582,7 @@ class PDF2XHTML extends PDFTextStripper while (itr.hasNext()) { Object obj = itr.next(); if (obj != null && obj instanceof PDField) { - processAcroField((PDField)obj, handler, 0); + processAcroField((PDField) obj, handler, 0); } } handler.endElement("ol"); @@ -593,7 +590,7 @@ class PDF2XHTML extends PDFTextStripper } private void processAcroField(PDField field, XHTMLContentHandler handler, final int currentRecursiveDepth) - throws SAXException, IOException { + throws SAXException, IOException { if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) { return; @@ -602,14 +599,14 @@ class PDF2XHTML extends PDFTextStripper addFieldString(field, handler); List<COSObjectable> kids = field.getKids(); - if(kids != null) { + if (kids != null) { - int r = currentRecursiveDepth+1; + int r = currentRecursiveDepth + 1; handler.startElement("ol"); //TODO: can generate <ol/>. Rework to avoid that. - for(COSObjectable pdfObj : kids) { - if(pdfObj != null && pdfObj instanceof PDField) { - PDField kid = (PDField)pdfObj; + for (COSObjectable pdfObj : kids) { + if (pdfObj != null && pdfObj instanceof PDField) { + PDField kid = (PDField) pdfObj; //recurse processAcroField(kid, handler, r); } @@ -635,13 +632,13 @@ class PDF2XHTML extends PDFTextStripper } //return early if PDSignature field if (field instanceof PDSignatureField) { - handleSignature(attrs, (PDSignatureField)field, handler); + handleSignature(attrs, (PDSignatureField) field, handler); return; } try { //getValue can throw an IOException if there is no value String value = field.getValue(); - if (value != null && ! value.equals("null")) { + if (value != null && !value.equals("null")) { sb.append(value); } } catch (IOException e) { @@ -656,14 +653,14 @@ class PDF2XHTML extends PDFTextStripper } private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField, - XHTMLContentHandler handler) throws SAXException { + XHTMLContentHandler handler) throws SAXException { PDSignature sig = sigField.getSignature(); if (sig == null) { return; } - Map<String, String> vals= new TreeMap<String, String>(); + Map<String, String> vals = new TreeMap<String, String>(); vals.put("name", sig.getName()); vals.put("contactInfo", sig.getContactInfo()); vals.put("location", sig.getLocation()); @@ -677,7 +674,7 @@ class PDF2XHTML extends PDFTextStripper //see if there is any data int nonNull = 0; for (String val : vals.keySet()) { - if (val != null && ! val.equals("")) { + if (val != null && !val.equals("")) { nonNull++; } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Fri May 29 14:36:21 2015 @@ -61,7 +61,7 @@ import org.xml.sax.SAXException; /** * PDF parser. - * <p> + * <p/> * This parser can process also encrypted PDF documents if the required * password is given as a part of the input metadata associated with a * document. If no password is given, then this parser will try decrypting @@ -69,7 +69,7 @@ import org.xml.sax.SAXException; * the PDF contains any embedded documents (for example as part of a PDF * package) then this parser will use the {@link EmbeddedDocumentExtractor} * to handle them. - * <p> + * <p/> * As of Tika 1.6, it is possible to extract inline images with * the {@link EmbeddedDocumentExtractor} as if they were regular * attachments. By default, this feature is turned off because of @@ -80,12 +80,6 @@ import org.xml.sax.SAXException; public class PDFParser extends AbstractParser { - private static final MediaType MEDIA_TYPE = MediaType.application("pdf"); - - /** Serial version UID */ - private static final long serialVersionUID = -752276948656079347L; - - private PDFParserConfig defaultConfig = new PDFParserConfig(); /** * Metadata key for giving the document password to the parser. * @@ -93,9 +87,14 @@ public class PDFParser extends AbstractP * @deprecated Supply a {@link PasswordProvider} on the {@link ParseContext} instead */ public static final String PASSWORD = "org.apache.tika.parser.pdf.password"; - + private static final MediaType MEDIA_TYPE = MediaType.application("pdf"); + /** + * Serial version UID + */ + private static final long serialVersionUID = -752276948656079347L; private static final Set<MediaType> SUPPORTED_TYPES = - Collections.singleton(MEDIA_TYPE); + Collections.singleton(MEDIA_TYPE); + private PDFParserConfig defaultConfig = new PDFParserConfig(); public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; @@ -105,7 +104,7 @@ public class PDFParser extends AbstractP InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - + PDDocument pdfDocument = null; TemporaryResources tmp = new TemporaryResources(); //config from context, or default if not set via context @@ -136,7 +135,7 @@ public class PDFParser extends AbstractP metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted())); //if using the classic parser and the doc is encrypted, we must manually decrypt - if (! localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) { + if (!localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) { pdfDocument.decrypt(password); } @@ -148,14 +147,14 @@ public class PDFParser extends AbstractP if (handler != null) { PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } - + } catch (CryptographyException e) { //seq parser throws CryptographyException for bad password throw new EncryptedDocumentException(e); } catch (IOException e) { //nonseq parser throws IOException for bad password //At the Tika level, we want the same exception to be thrown - if (e.getMessage() != null && + if (e.getMessage() != null && e.getMessage().contains("Error (CryptographyException)")) { metadata.set("pdf:encrypted", Boolean.toString(true)); throw new EncryptedDocumentException(e); @@ -164,7 +163,7 @@ public class PDFParser extends AbstractP throw e; } finally { if (pdfDocument != null) { - pdfDocument.close(); + pdfDocument.close(); } tmp.dispose(); //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200) @@ -217,11 +216,10 @@ public class PDFParser extends AbstractP Boolean.toString(ap.canPrintDegraded())); - //now go for the XMP stuff org.apache.jempbox.xmp.XMPMetadata xmp = null; XMPSchemaDublinCore dcSchema = null; - try{ + try { if (document.getDocumentCatalog().getMetadata() != null) { xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); } @@ -258,15 +256,15 @@ public class PDFParser extends AbstractP } catch (IOException e) { // Invalid date format, just ignore } - + // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); - for(COSName key : info.getDictionary().keySet()) { + for (COSName key : info.getDictionary().keySet()) { String name = key.getName(); - if(! handledMetadata.contains(name)) { - addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); + if (!handledMetadata.contains(name)) { + addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); } } @@ -276,50 +274,50 @@ public class PDFParser extends AbstractP // TikaCoreProperties.FORMAT can be multivalued // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); - metadata.add(TikaCoreProperties.FORMAT.getName(), - MEDIA_TYPE.toString()+"; version="+ - Float.toString(document.getDocument().getVersion())); + metadata.add(TikaCoreProperties.FORMAT.getName(), + MEDIA_TYPE.toString() + "; version=" + + Float.toString(document.getDocument().getVersion())); - try { - if( xmp != null ) { + try { + if (xmp != null) { xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); - if( pdfaxmp != null ) { + if (pdfaxmp != null) { if (pdfaxmp.getPart() != null) { metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); } if (pdfaxmp.getConformance() != null) { metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); - String version = "A-"+pdfaxmp.getPart()+pdfaxmp.getConformance().toLowerCase(Locale.ROOT); - metadata.set("pdfa:PDFVersion", version ); - metadata.add(TikaCoreProperties.FORMAT.getName(), - MEDIA_TYPE.toString()+"; version=\""+version+"\"" ); + String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); + metadata.set("pdfa:PDFVersion", version); + metadata.add(TikaCoreProperties.FORMAT.getName(), + MEDIA_TYPE.toString() + "; version=\"" + version + "\""); } - } + } // TODO WARN if this XMP version is inconsistent with document header version? } } catch (IOException e) { - metadata.set(TikaCoreProperties.TIKA_META_PREFIX+"pdf:metadata-xmp-parse-failed", ""+e); + metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); } //TODO: Let's try to move this into PDFBox. //Attempt to determine Adobe extension level, if present: COSDictionary root = document.getDocumentCatalog().getCOSDictionary(); - COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions") ); - if( extensions != null ) { - for( COSName extName : extensions.keySet() ) { + COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); + if (extensions != null) { + for (COSName extName : extensions.keySet()) { // If it's an Adobe one, interpret it to determine the extension level: - if( extName.equals( COSName.getPDFName("ADBE") )) { + if (extName.equals(COSName.getPDFName("ADBE"))) { COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); - if( adobeExt != null ) { + if (adobeExt != null) { String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); //-1 is sentinel value that something went wrong in getInt if (el != -1) { - metadata.set("pdf:PDFExtensionVersion", baseVersion+" Adobe Extension Level "+el ); - metadata.add(TikaCoreProperties.FORMAT.getName(), - MEDIA_TYPE.toString()+"; version=\""+baseVersion+" Adobe Extension Level "+el+"\""); + metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); + metadata.add(TikaCoreProperties.FORMAT.getName(), + MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); } - } + } } else { // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); @@ -328,19 +326,20 @@ public class PDFParser extends AbstractP } } - /** + /** * Try to extract all multilingual items from the XMPSchema - * <p> + * <p/> * This relies on the property having a valid xmp getName() - * <p> + * <p/> * For now, this only extracts the first language if the property does not allow multiple values (see TIKA-1295) + * * @param metadata * @param property * @param pdfBoxBaseline * @param schema */ private void extractMultilingualItems(Metadata metadata, Property property, - String pdfBoxBaseline, XMPSchema schema) { + String pdfBoxBaseline, XMPSchema schema) { //if schema is null, just go with pdfBoxBaseline if (schema == null) { if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { @@ -354,11 +353,11 @@ public class PDFParser extends AbstractP if (value != null && value.length() > 0) { //if you're going to add it below in the baseline addition, don't add it now - if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)){ + if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) { continue; } - metadata.add(property, value); - if (! property.isMultiValuePermitted()){ + metadata.add(property, value); + if (!property.isMultiValuePermitted()) { return; } } @@ -367,12 +366,12 @@ public class PDFParser extends AbstractP if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { //if we've already added something above and multivalue is not permitted //return. - if (! property.isMultiValuePermitted()){ - if (metadata.get(property) != null){ + if (!property.isMultiValuePermitted()) { + if (metadata.get(property) != null) { return; } } - metadata.add(property, pdfBoxBaseline); + metadata.add(property, pdfBoxBaseline); } } @@ -380,24 +379,24 @@ public class PDFParser extends AbstractP /** * This tries to read a list from a particular property in * XMPSchemaDublinCore. - * If it can't find the information, it falls back to the + * If it can't find the information, it falls back to the * pdfboxBaseline. The pdfboxBaseline should be the value * that pdfbox returns from its PDDocumentInformation object * (e.g. getAuthor()) This method is designed include the pdfboxBaseline, * and it should not duplicate the pdfboxBaseline. - * <p> + * <p/> * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this * on dates! - * <p> + * <p/> * This relies on the property having a DublinCore compliant getName() - * + * * @param property * @param pdfBoxBaseline * @param dc * @param metadata */ - private void extractDublinCoreListItems(Metadata metadata, Property property, - String pdfBoxBaseline, XMPSchemaDublinCore dc) { + private void extractDublinCoreListItems(Metadata metadata, Property property, + String pdfBoxBaseline, XMPSchemaDublinCore dc) { //if no dc, add baseline and return if (dc == null) { if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { @@ -413,22 +412,22 @@ public class PDFParser extends AbstractP return; } for (String item : items) { - if (pdfBoxBaseline != null && ! item.equals(pdfBoxBaseline)) { + if (pdfBoxBaseline != null && !item.equals(pdfBoxBaseline)) { addMetadata(metadata, property, item); } } //finally, add the baseline if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { addMetadata(metadata, property, pdfBoxBaseline); - } + } } /** * As of this writing, XMPSchema can contain bags or sequence lists - * for some attributes...despite standards documentation. + * for some attributes...despite standards documentation. * JempBox expects one or the other for specific attributes. * Until more flexibility is added to JempBox, Tika will have to handle both. - * + * * @param schema * @param name * @return list of values or null @@ -446,7 +445,7 @@ public class PDFParser extends AbstractP metadata.add(property, value); } } - + private void addMetadata(Metadata metadata, String name, String value) { if (value != null) { metadata.add(name, value); @@ -467,15 +466,15 @@ public class PDFParser extends AbstractP /** * Used when processing custom metadata entries, as PDFBox won't do - * the conversion for us in the way it does for the standard ones + * the conversion for us in the way it does for the standard ones */ private void addMetadata(Metadata metadata, String name, COSBase value) { - if(value instanceof COSArray) { - for(Object v : ((COSArray)value).toList()) { + if (value instanceof COSArray) { + for (Object v : ((COSArray) value).toList()) { addMetadata(metadata, name, ((COSBase) v)); } - } else if(value instanceof COSString) { - addMetadata(metadata, name, ((COSString)value).getString()); + } else if (value instanceof COSString) { + addMetadata(metadata, name, ((COSString) value).getString()); } // Avoid calling COSDictionary#toString, since it can lead to infinite // recursion. See TIKA-1038 and PDFBOX-1835. @@ -484,56 +483,66 @@ public class PDFParser extends AbstractP } } + public PDFParserConfig getPDFParserConfig() { + return defaultConfig; + } + public void setPDFParserConfig(PDFParserConfig config) { this.defaultConfig = config; } - - public PDFParserConfig getPDFParserConfig() { - return defaultConfig; + + /** + * @see #setUseNonSequentialParser(boolean) + * @deprecated use {@link #getPDFParserConfig()} + */ + public boolean getUseNonSequentialParser() { + return defaultConfig.getUseNonSequentialParser(); } - + /** * If true, the parser will use the NonSequentialParser. This may * be faster than the full doc parser. * If false (default), this will use the full doc parser. - * + * * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} */ public void setUseNonSequentialParser(boolean v) { defaultConfig.setUseNonSequentialParser(v); } - - /** - * @see #setUseNonSequentialParser(boolean) + + /** + * @see #setEnableAutoSpace(boolean) * @deprecated use {@link #getPDFParserConfig()} */ - public boolean getUseNonSequentialParser() { - return defaultConfig.getUseNonSequentialParser(); + public boolean getEnableAutoSpace() { + return defaultConfig.getEnableAutoSpace(); } - + /** - * If true (the default), the parser should estimate - * where spaces should be inserted between words. For - * many PDFs this is necessary as they do not include - * explicit whitespace characters. + * If true (the default), the parser should estimate + * where spaces should be inserted between words. For + * many PDFs this is necessary as they do not include + * explicit whitespace characters. * - * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} + * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} */ public void setEnableAutoSpace(boolean v) { defaultConfig.setEnableAutoSpace(v); } - /** - * @see #setEnableAutoSpace(boolean) + /** + * If true, text in annotations will be extracted. + * * @deprecated use {@link #getPDFParserConfig()} */ - public boolean getEnableAutoSpace() { - return defaultConfig.getEnableAutoSpace(); + public boolean getExtractAnnotationText() { + return defaultConfig.getExtractAnnotationText(); } /** * If true (the default), text in annotations will be * extracted. + * * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} */ public void setExtractAnnotationText(boolean v) { @@ -541,59 +550,48 @@ public class PDFParser extends AbstractP } /** - * If true, text in annotations will be extracted. - * + * @see #setSuppressDuplicateOverlappingText(boolean) * @deprecated use {@link #getPDFParserConfig()} */ - public boolean getExtractAnnotationText() { - return defaultConfig.getExtractAnnotationText(); + public boolean getSuppressDuplicateOverlappingText() { + return defaultConfig.getSuppressDuplicateOverlappingText(); } /** - * If true, the parser should try to remove duplicated - * text over the same region. This is needed for some - * PDFs that achieve bolding by re-writing the same - * text in the same area. Note that this can - * slow down extraction substantially (PDFBOX-956) and - * sometimes remove characters that were not in fact - * duplicated (PDFBOX-1155). By default this is disabled. - * - * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} + * If true, the parser should try to remove duplicated + * text over the same region. This is needed for some + * PDFs that achieve bolding by re-writing the same + * text in the same area. Note that this can + * slow down extraction substantially (PDFBOX-956) and + * sometimes remove characters that were not in fact + * duplicated (PDFBOX-1155). By default this is disabled. + * + * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} */ public void setSuppressDuplicateOverlappingText(boolean v) { defaultConfig.setSuppressDuplicateOverlappingText(v); } - /** - * @see #setSuppressDuplicateOverlappingText(boolean) - * + /** + * @see #setSortByPosition(boolean) * @deprecated use {@link #getPDFParserConfig()} */ - public boolean getSuppressDuplicateOverlappingText() { - return defaultConfig.getSuppressDuplicateOverlappingText(); + public boolean getSortByPosition() { + return defaultConfig.getSortByPosition(); } /** - * If true, sort text tokens by their x/y position - * before extracting text. This may be necessary for - * some PDFs (if the text tokens are not rendered "in - * order"), while for other PDFs it can produce the - * wrong result (for example if there are 2 columns, - * the text will be interleaved). Default is false. - * - * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} + * If true, sort text tokens by their x/y position + * before extracting text. This may be necessary for + * some PDFs (if the text tokens are not rendered "in + * order"), while for other PDFs it can produce the + * wrong result (for example if there are 2 columns, + * the text will be interleaved). Default is false. + * + * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} */ public void setSortByPosition(boolean v) { defaultConfig.setSortByPosition(v); } - /** - * @see #setSortByPosition(boolean) - * - * @deprecated use {@link #getPDFParserConfig()} - */ - public boolean getSortByPosition() { - return defaultConfig.getSortByPosition(); - } - }
