Author: jukka
Date: Fri Oct 2 11:05:21 2009
New Revision: 820961
URL: http://svn.apache.org/viewvc?rev=820961&view=rev
Log:
TIKA-279: XWPFWordExtractorDecorator does not extract some headers/footers
Patch by Maxim Valyanskiy
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=820961&r1=820960&r2=820961&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
Fri Oct 2 11:05:21 2009
@@ -30,6 +30,7 @@
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
import org.xml.sax.SAXException;
@@ -44,33 +45,49 @@
* @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
*/
@Override
- protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
- XmlException, IOException {
+ protected void buildXHTML(XHTMLContentHandler xhtml)
+ throws SAXException, XmlException, IOException {
XWPFDocument document = (XWPFDocument) extractor.getDocument();
XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
// headers
- if (hfPolicy.getFirstPageHeader() != null) {
- xhtml.element("p", hfPolicy.getFirstPageHeader().getText());
- }
- if (hfPolicy.getEvenPageHeader() != null) {
- xhtml.element("p", hfPolicy.getEvenPageHeader().getText());
- }
- if (hfPolicy.getDefaultHeader() != null) {
- xhtml.element("p", hfPolicy.getDefaultHeader().getText());
- }
+ extractHeaders(xhtml, hfPolicy);
// first all paragraphs
Iterator<XWPFParagraph> i = document.getParagraphsIterator();
while (i.hasNext()) {
+ XWPFParagraph paragraph = i.next();
+
+ CTSectPr ctSectPr = null;
+ if (paragraph.getCTP().getPPr() != null) {
+ ctSectPr = paragraph.getCTP().getPPr().getSectPr();
+ }
+
+ XWPFHeaderFooterPolicy headerFooterPolicy = null;
+
+ if (ctSectPr != null) {
+ headerFooterPolicy =
+ new XWPFHeaderFooterPolicy(document, ctSectPr);
+ extractHeaders(xhtml, headerFooterPolicy);
+ }
+
XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
- new XWPFHyperlinkDecorator(i.next(), null, true));
+ new XWPFHyperlinkDecorator(paragraph, null, true));
xhtml.element("p", decorator.getText());
+
+ if (ctSectPr != null) {
+ extractFooters(xhtml, headerFooterPolicy);
+ }
}
// then all document tables
extractTableContent(document, xhtml);
+ extractFooters(xhtml, hfPolicy);
+ }
+ private void extractFooters(
+ XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
+ throws SAXException {
// footers
if (hfPolicy.getFirstPageFooter() != null) {
xhtml.element("p", hfPolicy.getFirstPageFooter().getText());
@@ -83,6 +100,20 @@
}
}
+ private void extractHeaders(
+ XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
+ throws SAXException {
+ if (hfPolicy.getFirstPageHeader() != null) {
+ xhtml.element("p", hfPolicy.getFirstPageHeader().getText());
+ }
+ if (hfPolicy.getEvenPageHeader() != null) {
+ xhtml.element("p", hfPolicy.getEvenPageHeader().getText());
+ }
+ if (hfPolicy.getDefaultHeader() != null) {
+ xhtml.element("p", hfPolicy.getDefaultHeader().getText());
+ }
+ }
+
/**
* Low level structured parsing of document tables.
*/