Author: jukka
Date: Fri Jun 26 05:55:45 2009
New Revision: 788595
URL: http://svn.apache.org/viewvc?rev=788595&view=rev
Log:
TIKA-244: Missing Header/Footer text for Word'97 documents
Patch contributed by Maxim Valyanskiy.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=788595&r1=788594&r2=788595&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Fri Jun 26 05:55:45 2009
@@ -76,9 +76,11 @@
} else if ("WordDocument".equals(name)) {
setType(metadata, "application/msword");
WordExtractor extractor = new WordExtractor(filesystem);
+ addTextIfAny(xhtml, "header", extractor.getHeaderText());
for (String paragraph : extractor.getParagraphText()) {
xhtml.element("p", paragraph);
}
+ addTextIfAny(xhtml, "footer", extractor.getFooterText());
} else if ("PowerPoint Document".equals(name)) {
setType(metadata, "application/vnd.ms-powerpoint");
PowerPointExtractor extractor =
@@ -185,4 +187,22 @@
}
}
+ /**
+ * Outputs the text as a {@code <p>} inside a {@code <div>}, unless it is null or empty.
+ *
+ * @param xhtml XHTML content handler
+ * @param section value of the {@code class} attribute of the emitted {@code <div>}
+ * @param text text to be emitted; nothing is output when it is null or empty
+ * @throws SAXException if an error occurs while emitting the elements
+ */
+ private void addTextIfAny(
+ XHTMLContentHandler xhtml, String section, String text)
+ throws SAXException {
+ if (text != null && text.length() > 0) {
+ xhtml.startElement("div", "class", section);
+ xhtml.element("p", text);
+ xhtml.endElement("div");
+ }
+ }
+
}