Author: jukka
Date: Sun Aug 21 12:56:21 2011
New Revision: 1159979
URL: http://svn.apache.org/viewvc?rev=1159979&view=rev
Log:
TIKA-692: TikaCLI -x or -h on a Word doc sometimes adds newline after </b> tag
Automatically pretty-print the <head> section generated by the
XHTMLContentHandler
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=1159979&r1=1159978&r2=1159979&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java Sun Aug 21 12:56:21 2011
@@ -128,7 +128,9 @@ public class XHTMLContentHandler extends
// Call directly, so we don't go through our startElement(), which will
// ignore these elements.
super.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES);
+ newline();
super.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES);
+ newline();
}
}
@@ -165,6 +167,7 @@ public class XHTMLContentHandler extends
attributes.addAttribute("", "content", "content",
"CDATA", value);
super.startElement(XHTML, "meta", "meta", attributes);
super.endElement(XHTML, "meta", "meta");
+ newline();
}
}
}
@@ -175,10 +178,11 @@ public class XHTMLContentHandler extends
char[] titleChars = title.toCharArray();
super.characters(titleChars, 0, titleChars.length);
}
-
super.endElement(XHTML, "title", "title");
+ newline();
super.endElement(XHTML, "head", "head");
+ newline();
if (useFrameset) {
super.startElement(XHTML, "frameset", "frameset",
EMPTY_ATTRIBUTES);