This is an automated email from the ASF dual-hosted git repository. nick pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit aa4954fb44f707779693faea785acc219739ccd5 Author: Nick Burch <n...@gagravarr.org> AuthorDate: Thu Apr 27 17:58:35 2017 +0100 TIKA-2346 Add OfficeParserConfig support to control extraction from shapes from non-shape-based formats --- .../tika/parser/microsoft/AbstractOfficeParser.java | 5 +++++ .../tika/parser/microsoft/OfficeParserConfig.java | 19 +++++++++++++++++++ .../microsoft/ooxml/AbstractOOXMLExtractor.java | 5 +++++ .../parser/microsoft/ooxml/OOXMLExtractorFactory.java | 5 ++--- .../microsoft/ooxml/POIXMLTextExtractorDecorator.java | 5 +++++ .../microsoft/ooxml/SXWPFWordExtractorDecorator.java | 3 +-- .../microsoft/ooxml/XSSFExcelExtractorDecorator.java | 9 +++++++-- 7 files changed, 44 insertions(+), 7 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java index 48a756e..489a16d 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java @@ -67,6 +67,11 @@ public abstract class AbstractOfficeParser extends AbstractParser { public void setIncludeMoveFromContent(boolean includeMoveFromContent) { defaultOfficeParserConfig.setIncludeMoveFromContent(includeMoveFromContent); } + + @Field + public void setIncludeShapeBasedContent(boolean includeShapeBasedContent) { + defaultOfficeParserConfig.setIncludeShapeBasedContent(includeShapeBasedContent); + } @Field public void setUseSAXDocxExtractor(boolean useSAXDocxExtractor) { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java index e1947a5..8f0f975 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java @@ -25,6 +25,7 @@ public class OfficeParserConfig implements Serializable { private boolean includeDeletedContent = false; private boolean includeMoveFromContent = false; + private boolean includeShapeBasedContent = true; private boolean useSAXDocxExtractor = false; private boolean useSAXPptxExtractor = false; @@ -82,6 +83,24 @@ public class OfficeParserConfig implements Serializable { return includeMoveFromContent; } + /** + * In Excel and Word, there can be text stored within drawing shapes. + * (In PowerPoint everything is in a Shape) + * <p/> + * If you'd like to skip processing these to look for text, set this to + * <code>false</code> + * <p/> + * Default: <code>true</code> + * @param includeShapeBasedContent + */ + public void setIncludeShapeBasedContent(boolean includeShapeBasedContent) { + this.includeShapeBasedContent = includeShapeBasedContent; + } + + public boolean getIncludeShapeBasedContent() { + return includeShapeBasedContent; + } + public boolean getUseSAXDocxExtractor() { return useSAXDocxExtractor; } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index 26711b2..ff586ba 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -93,12 +93,17 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { private final EmbeddedDocumentExtractor embeddedExtractor; private final ParseContext context; + protected OfficeParserConfig config; protected POIXMLTextExtractor extractor; public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) { this.context = context; this.extractor = extractor; embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + + // This has already been set by OOXMLParser's call to configure() + // We can rely on this being non-null. + this.config = context.get(OfficeParserConfig.class); } /** diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 92963a8..f4366cc 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -91,8 +91,8 @@ public class OOXMLExtractorFactory { // Have the appropriate OOXML text extractor picked POIXMLTextExtractor poiExtractor = null; - //This has already been set by OOXMLParser's call to configure() - //We can rely on this being non-null. + // This has already been set by OOXMLParser's call to configure() + // We can rely on this being non-null. OfficeParserConfig config = context.get(OfficeParserConfig.class); if (config.getUseSAXDocxExtractor()) { poiExtractor = trySXWPF(pkg); @@ -107,7 +107,6 @@ public class OOXMLExtractorFactory { POIXMLDocument document = poiExtractor.getDocument(); if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) { extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale); - } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) { extractor = new XSSFExcelExtractorDecorator( context, poiExtractor, locale); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java index ff44176..f6ec3bf 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java @@ -21,6 +21,7 @@ import java.util.List; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; @@ -29,6 +30,10 @@ public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor { public POIXMLTextExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) { super(context, extractor); + + if (extractor instanceof XSSFExcelExtractor) { + ((XSSFExcelExtractor)extractor).setIncludeTextBoxes(config.getIncludeShapeBasedContent()); + } } @Override diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java index 89ad4e5..d923a2c 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java @@ -36,7 +36,6 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim; @@ -184,7 +183,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor { new OfflineContentHandler(new EmbeddedContentHandler( new OOXMLWordAndPowerPointTextHandler( new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, - context.get(OfficeParserConfig.class)), linkedRelationships)))); + config), linkedRelationships)))); } catch (TikaException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index dbf21d1..11277d5 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -93,6 +93,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) { + ((XSSFEventBasedExcelExtractor)extractor).setIncludeTextBoxes(config.getIncludeShapeBasedContent()); ((XSSFEventBasedExcelExtractor)extractor).setFormulasNotResults(false); ((XSSFEventBasedExcelExtractor)extractor).setLocale(locale); } @@ -163,8 +164,12 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { for (String footer : sheetExtractor.footers) { extractHeaderFooter(footer, xhtml); } - List<XSSFShape> shapes = iter.getShapes(); - processShapes(shapes, xhtml); + + // Do text held in shapes, if required + if (config.getIncludeShapeBasedContent()) { + List<XSSFShape> shapes = iter.getShapes(); + processShapes(shapes, xhtml); + } //for now dump sheet hyperlinks at bottom of page //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes -- To stop receiving notification emails like this one, please contact "commits@tika.apache.org" <commits@tika.apache.org>.