This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 5cbaed8 TIKA-2362 -- Allow users to turn off extraction of headers
and footers from .doc, .docx, .xls, .xlsx, .xlsb
5cbaed8 is described below
commit 5cbaed87235c2cee49c9d4fa15d84158d000e986
Author: tballison <[email protected]>
AuthorDate: Thu Jun 8 21:44:51 2017 -0400
TIKA-2362 -- Allow users to turn off extraction of headers and footers from
.doc, .docx, .xls, .xlsx, .xlsb
---
CHANGES.txt | 3 ++
.../tika/parser/microsoft/ExcelExtractor.java | 18 ++++++---
.../tika/parser/microsoft/OfficeParserConfig.java | 17 +++++++++
.../tika/parser/microsoft/WordExtractor.java | 24 ++++++------
.../ooxml/SXWPFWordExtractorDecorator.java | 31 ++++++++++------
.../ooxml/XSSFBExcelExtractorDecorator.java | 2 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 9 ++++-
.../ooxml/XWPFWordExtractorDecorator.java | 8 ++--
.../tika/parser/microsoft/ExcelParserTest.java | 32 ++++++++++++++++
.../tika/parser/microsoft/WordParserTest.java | 11 ++++++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 43 ++++++++++++++++++++++
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 12 ++++++
12 files changed, 174 insertions(+), 36 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 7c2eac2..6859eb7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.15.1 - ??/??/????
+ * Allow users to turn off extraction of headers and footers
+ from .doc, .docx, .xls, .xlsx, .xlsb (TIKA-2362).
+
* Extract text from charts in .docx, .pptx, .xlsx and .xlsb
(TIKA-2254).
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 40f0b52..ac6aae4 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -301,8 +301,10 @@ public class ExcelExtractor extends AbstractPOIFSExtractor
{
hssfRequest.addListener(formatListener, FormatRecord.sid);
hssfRequest.addListener(formatListener,
ExtendedFormatRecord.sid);
hssfRequest.addListener(formatListener,
DrawingGroupRecord.sid);
- hssfRequest.addListener(formatListener, HeaderRecord.sid);
- hssfRequest.addListener(formatListener, FooterRecord.sid);
+ if
(extractor.officeParserConfig.getIncludeHeadersAndFooters()) {
+ hssfRequest.addListener(formatListener, HeaderRecord.sid);
+ hssfRequest.addListener(formatListener, FooterRecord.sid);
+ }
}
// Create event factory and process Workbook (fire events)
@@ -473,13 +475,17 @@ public class ExcelExtractor extends
AbstractPOIFSExtractor {
break;
case HeaderRecord.sid:
- HeaderRecord headerRecord = (HeaderRecord) record;
- addTextCell(record, headerRecord.getText());
+ if
(extractor.officeParserConfig.getIncludeHeadersAndFooters()) {
+ HeaderRecord headerRecord = (HeaderRecord) record;
+ addTextCell(record, headerRecord.getText());
+ }
break;
case FooterRecord.sid:
- FooterRecord footerRecord = (FooterRecord) record;
- addTextCell(record, footerRecord.getText());
+ if
(extractor.officeParserConfig.getIncludeHeadersAndFooters()) {
+ FooterRecord footerRecord = (FooterRecord) record;
+ addTextCell(record, footerRecord.getText());
+ }
break;
}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 8f0f975..e660cb8 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -26,6 +26,7 @@ public class OfficeParserConfig implements Serializable {
private boolean includeDeletedContent = false;
private boolean includeMoveFromContent = false;
private boolean includeShapeBasedContent = true;
+ private boolean includeHeadersAndFooters = true;
private boolean useSAXDocxExtractor = false;
private boolean useSAXPptxExtractor = false;
@@ -101,6 +102,22 @@ public class OfficeParserConfig implements Serializable {
return includeShapeBasedContent;
}
+ /**
+ * Whether or not to include headers and footers.
+ * <p>
+ * This only operates on headers and footers in Word and Excel,
+ * not master slide content in PowerPoint.
+ * <p>
+ * Default: <code>true</code>
+ * @param includeHeadersAndFooters <code>true</code> to extract headers and footers, <code>false</code> to skip them
+ */
+ public void setIncludeHeadersAndFooters(boolean includeHeadersAndFooters) {
+ this.includeHeadersAndFooters = includeHeadersAndFooters;
+ }
+
+ public boolean getIncludeHeadersAndFooters() {
+ return includeHeadersAndFooters;
+ }
public boolean getUseSAXDocxExtractor() {
return useSAXDocxExtractor;
}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 8cb6106..0622ddc 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -166,19 +166,20 @@ public class WordExtractor extends AbstractPOIFSExtractor
{
org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
new org.apache.poi.hwpf.extractor.WordExtractor(document);
- HeaderStories headerFooter = new HeaderStories(document);
// Grab the list of pictures. As far as we can tell,
// the pictures should be in order, and may be directly
// placed or referenced from an anchor
PicturesTable pictureTable = document.getPicturesTable();
PicturesSource pictures = new PicturesSource(document);
-
+ HeaderStories headerFooter = null;
// Do any headers, if present
- Range[] headers = new Range[]{headerFooter.getFirstHeaderSubrange(),
- headerFooter.getEvenHeaderSubrange(),
headerFooter.getOddHeaderSubrange()};
- handleHeaderFooter(headers, "header", document, pictures,
pictureTable, xhtml);
-
+ if (officeParserConfig.getIncludeHeadersAndFooters()) {
+ headerFooter = new HeaderStories(document);
+ Range[] headers = new
Range[]{headerFooter.getFirstHeaderSubrange(),
+ headerFooter.getEvenHeaderSubrange(),
headerFooter.getOddHeaderSubrange()};
+ handleHeaderFooter(headers, "header", document, pictures,
pictureTable, xhtml);
+ }
// Do the main paragraph text
Range r = document.getRange();
ListManager listManager = new ListManager(document);
@@ -206,11 +207,12 @@ public class WordExtractor extends AbstractPOIFSExtractor
{
xhtml.element("p", paragraph);
}
- // Do any footers, if present
- Range[] footers = new Range[]{headerFooter.getFirstFooterSubrange(),
- headerFooter.getEvenFooterSubrange(),
headerFooter.getOddFooterSubrange()};
- handleHeaderFooter(footers, "footer", document, pictures,
pictureTable, xhtml);
-
+ if (officeParserConfig.getIncludeHeadersAndFooters()) {
+ // Do any footers, if present
+ Range[] footers = new
Range[]{headerFooter.getFirstFooterSubrange(),
+ headerFooter.getEvenFooterSubrange(),
headerFooter.getOddFooterSubrange()};
+ handleHeaderFooter(footers, "footer", document, pictures,
pictureTable, xhtml);
+ }
// Handle any pictures that we haven't output yet
for (Picture p = pictures.nextUnclaimed(); p != null; ) {
handlePictureCharacterRun(
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 1f0eb77..17b4e33 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -130,18 +130,20 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
ExceptionUtils.getStackTrace(e));
}
- //headers
- try {
- PackageRelationshipCollection headersPRC =
documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation());
- if (headersPRC != null) {
- for (int i = 0; i < headersPRC.size(); i++) {
- PackagePart header =
documentPart.getRelatedPart(headersPRC.getRelationship(i));
- handlePart(header, styles, listManager, xhtml);
+ if (config.getIncludeHeadersAndFooters()) {
+ //headers
+ try {
+ PackageRelationshipCollection headersPRC =
documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation());
+ if (headersPRC != null) {
+ for (int i = 0; i < headersPRC.size(); i++) {
+ PackagePart header =
documentPart.getRelatedPart(headersPRC.getRelationship(i));
+ handlePart(header, styles, listManager, xhtml);
+ }
}
+ } catch (InvalidFormatException | ZipException e) {
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
}
- } catch (InvalidFormatException|ZipException e) {
- metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
- ExceptionUtils.getStackTrace(e));
}
//main document
@@ -153,13 +155,18 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
}
//for now, just dump other components at end
for (String rel : new String[]{
+ AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
+ AbstractOOXMLExtractor.RELATION_CHART,
XWPFRelation.FOOTNOTE.getRelation(),
XWPFRelation.COMMENT.getRelation(),
XWPFRelation.FOOTER.getRelation(),
XWPFRelation.ENDNOTE.getRelation(),
- AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
- AbstractOOXMLExtractor.RELATION_CHART
}) {
+ //skip footers if we shouldn't extract them
+ if (! config.getIncludeHeadersAndFooters() &&
+ rel.equals(XWPFRelation.FOOTER.getRelation())) {
+ continue;
+ }
try {
PackageRelationshipCollection prc =
documentPart.getRelationshipsByType(rel);
if (prc != null) {
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
index 14744d1..d6beed2 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
@@ -112,7 +112,7 @@ public class XSSFBExcelExtractorDecorator extends
XSSFExcelExtractorDecorator {
addDrawingHyperLinks(sheetPart);
sheetParts.add(sheetPart);
- SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml);
+ SheetTextAsHTML sheetExtractor = new
SheetTextAsHTML(config.getIncludeHeadersAndFooters(), xhtml);
XSSFBCommentsTable comments = iter.getXSSFBSheetComments();
// Start, and output the sheet name
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index b554354..55df80c 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -143,7 +143,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
while (iter.hasNext()) {
- SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml);
+ SheetTextAsHTML sheetExtractor = new
SheetTextAsHTML(config.getIncludeHeadersAndFooters(), xhtml);
PackagePart sheetPart = null;
try (InputStream stream = iter.next()) {
sheetPart = iter.getSheetPart();
@@ -390,10 +390,12 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
*/
protected static class SheetTextAsHTML implements SheetContentsHandler {
private XHTMLContentHandler xhtml;
+ private final boolean includeHeadersFooters;
protected List<String> headers;
protected List<String> footers;
- protected SheetTextAsHTML(XHTMLContentHandler xhtml) {
+ protected SheetTextAsHTML(boolean includeHeaderFooters,
XHTMLContentHandler xhtml) {
+ this.includeHeadersFooters = includeHeaderFooters;
this.xhtml = xhtml;
headers = new ArrayList<String>();
footers = new ArrayList<String>();
@@ -437,6 +439,9 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
}
public void headerFooter(String text, boolean isHeader, String
tagName) {
+ if (! includeHeadersFooters) {
+ return;
+ }
if (isHeader) {
headers.add(text);
} else {
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index e7893c8..181f777 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -112,7 +112,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
XWPFListManager listManager = new
XWPFListManager(document.getNumbering());
// headers
- if (hfPolicy != null) {
+ if (hfPolicy != null && config.getIncludeHeadersAndFooters()) {
extractHeaders(xhtml, hfPolicy, listManager);
}
@@ -143,7 +143,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
);
// then all document footers
- if (hfPolicy != null) {
+ if (hfPolicy != null && config.getIncludeHeadersAndFooters()) {
extractFooters(xhtml, hfPolicy, listManager);
}
}
@@ -185,7 +185,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
XWPFHeaderFooterPolicy headerFooterPolicy = null;
if (paragraph.getCTP().getPPr() != null) {
CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
- if (ctSectPr != null) {
+ if (ctSectPr != null && config.getIncludeHeadersAndFooters()) {
headerFooterPolicy =
new XWPFHeaderFooterPolicy(document, ctSectPr);
extractHeaders(xhtml, headerFooterPolicy, listManager);
@@ -336,7 +336,7 @@ public class XWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
// Finish this paragraph
xhtml.endElement(tag);
- if (headerFooterPolicy != null) {
+ if (headerFooterPolicy != null &&
config.getIncludeHeadersAndFooters()) {
extractFooters(xhtml, headerFooterPolicy, listManager);
}
}
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index d9abdf8..496080c 100644
---
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -408,6 +408,38 @@ public class ExcelParserTest extends TikaTest {
}
@Test
+ public void testHeaderAndFooterNotExtraction() throws Exception {
+ try (InputStream input = ExcelParserTest.class.getResourceAsStream(
+ "/test-documents/testEXCEL_headers_footers.xls")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.UK);
+
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeHeadersAndFooters(false);
+ context.set(OfficeParserConfig.class, officeParserConfig);
+ new OfficeParser().parse(input, handler, metadata, context);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = handler.toString();
+ assertContains("John Smith1", content);
+ assertContains("John Smith50", content);
+ assertContains("1 Corporate HQ", content);
+ assertNotContained("Header - Corporate Spreadsheet", content);
+ assertNotContained("Header - For Internal Use Only", content);
+ assertNotContained("Header - Author: John Smith", content);
+ assertNotContained("Footer - Corporate Spreadsheet", content);
+ assertNotContained("Footer - For Internal Use Only", content);
+ assertNotContained("Footer - Author: John Smith", content);
+ }
+ }
+
+
+ @Test
public void testHyperlinksInXLS() throws Exception {
String xml = getXML("testEXCEL_hyperlinks.xls").xml;
//external url
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index f7036a8..fb53c95 100644
---
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -276,6 +276,17 @@ public class WordParserTest extends TikaTest {
assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A",
content);
}
+ @Test
+ public void testHeaderFooterNotExtraction() throws Exception {
+ ParseContext parseContext = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeHeadersAndFooters(false);
+ parseContext.set(OfficeParserConfig.class, officeParserConfig);
+ String xml = getXML("testWORD_various.doc", parseContext).xml;
+ assertNotContained("This is the header text.", xml);
+ assertNotContained("This is the footer text.", xml);
+ }
+
/**
* TIKA-1044 - Handle documents where parts of the
* text have no formatting or styles applied to them
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index bb18e46..018935f 100644
---
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -654,6 +654,17 @@ public class OOXMLParserTest extends TikaTest {
}
@Test
+ public void testDOCXHeaderFooterNotExtraction() throws Exception {
+ ParseContext parseContext = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeHeadersAndFooters(false);
+ parseContext.set(OfficeParserConfig.class, officeParserConfig);
+ String xml = getXML("testWORD_various.docx", parseContext).xml;
+ assertNotContained("This is the header text.", xml);
+ assertNotContained("This is the footer text.", xml);
+ }
+
+ @Test
public void testVariousPPTX() throws Exception {
Metadata metadata = new Metadata();
String xml = getXML("testPPT_various.pptx", metadata).xml;
@@ -1245,6 +1256,23 @@ public class OOXMLParserTest extends TikaTest {
}
@Test
+ public void testExcelHeaderAndFooterNotExtraction() throws Exception {
+ ParseContext parseContext = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeHeadersAndFooters(false);
+ parseContext.set(OfficeParserConfig.class, officeParserConfig);
+
+ String content = getXML("testEXCEL_headers_footers.xlsx",
parseContext).xml;
+ assertNotContained("Header - Corporate Spreadsheet", content);
+ assertNotContained("Header - For Internal Use Only", content);
+ assertNotContained("Header - Author: John Smith", content);
+ assertNotContained("Footer - Corporate Spreadsheet", content);
+ assertNotContained("Footer - For Internal Use Only", content);
+ assertNotContained("Footer - Author: John Smith", content);
+ }
+
+
+ @Test
public void testMultiAuthorsManagers() throws Exception {
XMLResult r = getXML("testWORD_multi_authors.docx");
String[] authors = r.metadata.getValues(TikaCoreProperties.CREATOR);
@@ -1538,7 +1566,22 @@ public class OOXMLParserTest extends TikaTest {
assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml);
assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml);
assertContains("FirstPageLeftFooter FirstPageCenterFooter
FirstPageRightFooter", xml);
+ }
+ @Test
+ public void testXLSBNoHeaderFooters() throws Exception {
+ ParseContext parseContext = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeHeadersAndFooters(false);
+ parseContext.set(OfficeParserConfig.class, officeParserConfig);
+ String xml = getXML("testEXCEL_various.xlsb", parseContext).xml;
+ assertNotContained("OddLeftHeader OddCenterHeader OddRightHeader",
xml);
+ assertNotContained("EvenLeftHeader EvenCenterHeader EvenRightHeader",
xml);
+
+ assertNotContained("FirstPageLeftHeader FirstPageCenterHeader
FirstPageRightHeader", xml);
+ assertNotContained("OddLeftFooter OddCenterFooter OddRightFooter",
xml);
+ assertNotContained("EvenLeftFooter EvenCenterFooter EvenRightFooter",
xml);
+ assertNotContained("FirstPageLeftFooter FirstPageCenterFooter
FirstPageRightFooter", xml);
}
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 6994a3a..bc131f0 100644
---
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -810,4 +810,16 @@ public class SXWPFExtractorTest extends TikaTest {
assertNotContained("chartSpace", xml);
}
+ @Test
+ public void testHeaderFooterNotExtraction() throws Exception {
+ ParseContext parseContext = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setIncludeHeadersAndFooters(false);
+ officeParserConfig.setUseSAXDocxExtractor(true);
+ parseContext.set(OfficeParserConfig.class, officeParserConfig);
+ String xml = getXML("testWORD_various.docx", parseContext).xml;
+ assertNotContained("This is the header text.", xml);
+ assertNotContained("This is the footer text.", xml);
+ }
+
}
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].