Author: jukka
Date: Fri Oct  2 10:57:44 2009
New Revision: 820960

URL: http://svn.apache.org/viewvc?rev=820960&view=rev
Log:
TIKA-256: MSWord parser does not extract footnotes and comments

Patch by Maxim Valyanskiy

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=820960&r1=820959&r2=820960&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Fri Oct  2 10:57:44 2009
@@ -85,10 +85,25 @@
             } else if ("WordDocument".equals(name)) {
                 setType(metadata, "application/msword");
                 WordExtractor extractor = new WordExtractor(filesystem);
+
                 addTextIfAny(xhtml, "header", extractor.getHeaderText());
+
                 for (String paragraph : extractor.getParagraphText()) {
                     xhtml.element("p", paragraph);
                 }
+
+                for (String paragraph : extractor.getFootnoteText()) {
+                    xhtml.element("p", paragraph);
+                }
+
+                for (String paragraph : extractor.getCommentsText()) {
+                    xhtml.element("p", paragraph);
+                }
+
+                for (String paragraph : extractor.getEndnoteText()) {
+                    xhtml.element("p", paragraph);
+                }
+
                 addTextIfAny(xhtml, "footer", extractor.getFooterText());
             } else if ("PowerPoint Document".equals(name)) {
                 setType(metadata, "application/vnd.ms-powerpoint");


Reply via email to