Author: jukka
Date: Fri Jun 26 05:55:45 2009
New Revision: 788595

URL: http://svn.apache.org/viewvc?rev=788595&view=rev
Log:
TIKA-244: Missing Header/Footer text for Word'97 documents

Patch contributed by Maxim Valyanskiy.

Modified:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=788595&r1=788594&r2=788595&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 Fri Jun 26 05:55:45 2009
@@ -76,9 +76,11 @@
             } else if ("WordDocument".equals(name)) {
                 setType(metadata, "application/msword");
                 WordExtractor extractor = new WordExtractor(filesystem);
+                addTextIfAny(xhtml, "header", extractor.getHeaderText());
                 for (String paragraph : extractor.getParagraphText()) {
                     xhtml.element("p", paragraph);
                 }
+                addTextIfAny(xhtml, "footer", extractor.getFooterText());
             } else if ("PowerPoint Document".equals(name)) {
                 setType(metadata, "application/vnd.ms-powerpoint");
                 PowerPointExtractor extractor =
@@ -185,4 +187,22 @@
         }
     }
 
+    /**
+     * Outputs the text in a {@code <div>} section, unless it is null or empty.
+     *
+     * @param xhtml XHTML content handler that receives the section
+     * @param section the class of the {@code <div>} section emitted
+     * @param text text to be emitted in a {@code <p>}; may be {@code null}
+     * @throws SAXException if writing to the content handler fails
+     */
+    private void addTextIfAny(
+            XHTMLContentHandler xhtml, String section, String text)
+            throws SAXException {
+        if (text != null && text.length() > 0) {
+            xhtml.startElement("div", "class", section);
+            xhtml.element("p", text);
+            xhtml.endElement("div");
+        }
+    }
+
 }


Reply via email to