Author: jukka
Date: Fri Dec 27 03:31:22 2013
New Revision: 1553619
URL: http://svn.apache.org/r1553619
Log:
TIKA-1110: Incorrect declared SUPPORTED_TYPES in ChmParser
Fix the XHTML output of ChmParser and add a test case for proper integration
with AutoDetectParser
Removed:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java?rev=1553619&r1=1553618&r2=1553619&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java Fri Dec 27 03:31:22 2013
@@ -28,6 +28,7 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -50,13 +51,16 @@ public class ChmParser extends AbstractP
Metadata metadata, ParseContext context) throws IOException,
SAXException, TikaException {
CHMDocumentInformation chmInfo = CHMDocumentInformation.load(stream);
- metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
- extractMetadata(chmInfo, metadata);
- CHM2XHTML.process(chmInfo, handler);
- }
- private void extractMetadata(CHMDocumentInformation chmInfo,
- Metadata metadata) throws TikaException, IOException {
+ // metadata
+ metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
chmInfo.getCHMDocInformation(metadata);
+
+ // content
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.characters(chmInfo.getText());
+ xhtml.endDocument();
}
+
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=1553619&r1=1553618&r2=1553619&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Fri Dec 27 03:31:22 2013
@@ -49,6 +49,7 @@ public class AutoDetectParserTest {
private static final String KEYNOTE = "application/vnd.apple.keynote";
private static final String PAGES = "application/vnd.apple.pages";
private static final String NUMBERS = "application/vnd.apple.numbers";
+ private static final String CHM = "application/vnd.ms-htmlhelp";
private static final String RTF = "application/rtf";
private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1";
private static final String UTF8TEXT = "text/plain; charset=UTF-8";
@@ -165,6 +166,11 @@ public class AutoDetectParserTest {
}
@Test
+ public void testChm() throws Exception {
+ assertAutoDetect("testChm.chm", CHM, "If you do not specify a window type or a window name, the main window is used.");
+ }
+
+ @Test
public void testEpub() throws Exception {
assertAutoDetect(
"testEPUB.epub", "application/epub+zip",