Author: jukka
Date: Fri Dec 27 15:53:57 2013
New Revision: 1553685
URL: http://svn.apache.org/r1553685
Log:
TIKA-672: Proper error handling in the CHM parser
Inline CHMDocumentInformation into the main ChmParser class for simplicity
Removed:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java?rev=1553685&r1=1553684&r2=1553685&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java Fri Dec 27 15:53:57 2013
@@ -16,11 +16,13 @@
*/
package org.apache.tika.parser.chm;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.Set;
import org.apache.tika.exception.TikaException;
@@ -28,6 +30,10 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -50,7 +56,7 @@ public class ChmParser extends AbstractP
public void parse(InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context) throws IOException,
SAXException, TikaException {
- CHMDocumentInformation chmInfo = new CHMDocumentInformation(stream);
+ ChmExtractor chmExtractor = new ChmExtractor(stream);
// metadata
metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
@@ -58,8 +64,41 @@ public class ChmParser extends AbstractP
// content
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- xhtml.characters(chmInfo.getText());
+
+ Iterator<DirectoryListingEntry> it =
+ chmExtractor.getChmDirList().getDirectoryListingEntryList().iterator();
+ while (it.hasNext()) {
+ DirectoryListingEntry entry = it.next();
+ if (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm")) {
+ xhtml.characters(extract(chmExtractor.extractChmEntry(entry)));
+ }
+ }
+
xhtml.endDocument();
}
+ /**
+ * Extracts data from byte[]
+ */
+ private String extract(byte[] byteObject) throws TikaException {// throws IOException
+ StringBuilder wBuf = new StringBuilder();
+ InputStream stream = null;
+ Metadata metadata = new Metadata();
+ HtmlParser htmlParser = new HtmlParser();
+ BodyContentHandler handler = new BodyContentHandler(-1);// -1
+ ParseContext parser = new ParseContext();
+ try {
+ stream = new ByteArrayInputStream(byteObject);
+ htmlParser.parse(stream, handler, metadata, parser);
+ wBuf.append(handler.toString()
+ + System.getProperty("line.separator"));
+ } catch (SAXException e) {
+ throw new RuntimeException(e);
+ } catch (IOException e) {
+ // Pushback overflow from tagsoup
+ }
+ return wBuf.toString();
+ }
+
+
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java?rev=1553685&r1=1553684&r2=1553685&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java Fri Dec 27 15:53:57 2013
@@ -18,46 +18,67 @@ package org.apache.tika.parser.chm;
import static org.junit.Assert.assertTrue;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
-import java.util.concurrent.locks.Lock;
-import java.util.concurrent.locks.ReentrantLock;
-import org.junit.Before;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
public class TestChmExtraction {
- private List<String> files = new ArrayList<String>();
+ private final Parser parser = new ChmParser();
- @Before
- public void setUp() {
- files.add("/test-documents/testChm.chm");
- files.add("/test-documents/testChm3.chm");
+ private final List<String> files = Arrays.asList(
+ "/test-documents/testChm.chm",
+ "/test-documents/testChm3.chm");
+
+ @Test
+ public void testGetText() throws Exception {
+ BodyContentHandler handler = new BodyContentHandler();
+ new ChmParser().parse(
+ new ByteArrayInputStream(TestParameters.chmData),
+ handler, new Metadata(), new ParseContext());
+ assertTrue(handler.toString().contains(
+ "The TCard method accepts only numeric arguments"));
+ }
+
+ @Test
+ public void testChmParser() throws Exception{
+ for (String fileName : files) {
+ InputStream stream =
+ TestChmExtraction.class.getResourceAsStream(fileName);
+ try {
+ BodyContentHandler handler = new BodyContentHandler(-1);
+ parser.parse(stream, handler, new Metadata(), new ParseContext());
+ assertTrue(!handler.toString().isEmpty());
+ } finally {
+ stream.close();
+ }
+ }
}
+
@Test
public void testMultiThreadedChmExtraction() throws InterruptedException {
- ExecutorService executor = Executors
- .newFixedThreadPool(TestParameters.NTHREADS);
+ ExecutorService executor = Executors.newFixedThreadPool(TestParameters.NTHREADS);
for (int i = 0; i < TestParameters.NTHREADS; i++) {
executor.execute(new Runnable() {
public void run() {
- Lock mutex = new ReentrantLock();
for (String fileName : files) {
InputStream stream = null;
try {
stream =
TestChmExtraction.class.getResourceAsStream(fileName);
-
- CHMDocumentInformation chmDocInfo =
- new CHMDocumentInformation(stream);
- mutex.lock();
- String text = chmDocInfo.getText();
- assertTrue(text.length() > 0);
+ BodyContentHandler handler = new BodyContentHandler(-1);
+ parser.parse(stream, handler, new Metadata(), new ParseContext());
+ assertTrue(!handler.toString().isEmpty());
} catch (Exception e) {
e.printStackTrace();
} finally {
@@ -66,7 +87,6 @@ public class TestChmExtraction {
} catch (IOException e) {
e.printStackTrace();
}
- mutex.unlock();
}
}
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java?rev=1553685&r1=1553684&r2=1553685&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java Fri Dec 27 15:53:57 2013
@@ -18,11 +18,8 @@ package org.apache.tika.parser.chm;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
import java.io.ByteArrayInputStream;
-import java.io.InputStream;
-import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@@ -66,23 +63,4 @@ public class TestChmExtractor {
assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, count);
}
- @Test
- public void testChmParser() throws Exception{
- List<String> files = new ArrayList<String>();
- files.add("/test-documents/testChm.chm");
- files.add("/test-documents/testChm3.chm");
-
- for (String fileName : files) {
- InputStream stream =
- TestChmBlockInfo.class.getResourceAsStream(fileName);
- try {
- CHMDocumentInformation chmDocInfo = new CHMDocumentInformation(stream);
- String text = chmDocInfo.getText();
- assertTrue(text.length() > 0);
- } finally {
- stream.close();
- }
- }
- }
-
}