Author: jukka
Date: Fri Dec 27 15:53:57 2013
New Revision: 1553685

URL: http://svn.apache.org/r1553685
Log:
TIKA-672: Proper error handling in the CHM parser

Inline CHMDocumentInformation into the main ChmParser class for simplicity

Removed:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java?rev=1553685&r1=1553684&r2=1553685&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java 
Fri Dec 27 15:53:57 2013
@@ -16,11 +16,13 @@
  */
 package org.apache.tika.parser.chm;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Set;
 
 import org.apache.tika.exception.TikaException;
@@ -28,6 +30,10 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.core.ChmExtractor;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -50,7 +56,7 @@ public class ChmParser extends AbstractP
     public void parse(InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context) throws IOException,
             SAXException, TikaException {
-        CHMDocumentInformation chmInfo = new CHMDocumentInformation(stream);
+        ChmExtractor chmExtractor = new ChmExtractor(stream);
 
         // metadata
         metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
@@ -58,8 +64,41 @@ public class ChmParser extends AbstractP
         // content
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
-        xhtml.characters(chmInfo.getText());
+
+        Iterator<DirectoryListingEntry> it =
+                chmExtractor.getChmDirList().getDirectoryListingEntryList().iterator();
+        while (it.hasNext()) {
+            DirectoryListingEntry entry = it.next();
+            if (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm")) {
+                xhtml.characters(extract(chmExtractor.extractChmEntry(entry)));
+            }
+        }
+
         xhtml.endDocument();
     }
 
+    /**
+     * Extracts data from byte[]
+     */
+    private String extract(byte[] byteObject) throws TikaException {// throws IOException
+        StringBuilder wBuf = new StringBuilder();
+        InputStream stream = null;
+        Metadata metadata = new Metadata();
+        HtmlParser htmlParser = new HtmlParser();
+        BodyContentHandler handler = new BodyContentHandler(-1);// -1
+        ParseContext parser = new ParseContext();
+        try {
+            stream = new ByteArrayInputStream(byteObject);
+            htmlParser.parse(stream, handler, metadata, parser);
+            wBuf.append(handler.toString()
+                    + System.getProperty("line.separator"));
+        } catch (SAXException e) {
+            throw new RuntimeException(e);
+        } catch (IOException e) {
+            // Pushback overflow from tagsoup
+        }
+        return wBuf.toString();
+    }
+
+
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java?rev=1553685&r1=1553684&r2=1553685&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
 Fri Dec 27 15:53:57 2013
@@ -18,46 +18,67 @@ package org.apache.tika.parser.chm;
 
 import static org.junit.Assert.assertTrue;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
-import java.util.concurrent.locks.Lock;
-import java.util.concurrent.locks.ReentrantLock;
 
-import org.junit.Before;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
 
 public class TestChmExtraction {
 
-    private List<String> files = new ArrayList<String>();
+    private final Parser parser = new ChmParser();
 
-    @Before
-    public void setUp() {
-        files.add("/test-documents/testChm.chm");
-        files.add("/test-documents/testChm3.chm");
+    private final List<String> files = Arrays.asList(
+            "/test-documents/testChm.chm",
+            "/test-documents/testChm3.chm");
+
+    @Test
+    public void testGetText() throws Exception {
+        BodyContentHandler handler = new BodyContentHandler();
+        new ChmParser().parse(
+                new ByteArrayInputStream(TestParameters.chmData),
+                handler, new Metadata(), new ParseContext());
+        assertTrue(handler.toString().contains(
+                "The TCard method accepts only numeric arguments"));
+    }
+
+    @Test
+    public void testChmParser() throws Exception{
+        for (String fileName : files) {
+            InputStream stream =
+                    TestChmExtraction.class.getResourceAsStream(fileName);
+            try {
+                BodyContentHandler handler = new BodyContentHandler(-1);
+                parser.parse(stream, handler, new Metadata(), new ParseContext());
+                assertTrue(!handler.toString().isEmpty());
+            } finally {
+                stream.close();
+            }
+        }
     }
 
+
     @Test
     public void testMultiThreadedChmExtraction() throws InterruptedException {
-        ExecutorService executor = Executors
-                .newFixedThreadPool(TestParameters.NTHREADS);
+        ExecutorService executor = Executors.newFixedThreadPool(TestParameters.NTHREADS);
         for (int i = 0; i < TestParameters.NTHREADS; i++) {
             executor.execute(new Runnable() {
                 public void run() {
-                    Lock mutex = new ReentrantLock();
                     for (String fileName : files) {
                         InputStream stream = null;
                         try {
                             stream = TestChmExtraction.class.getResourceAsStream(fileName);
-
-                            CHMDocumentInformation chmDocInfo =
-                                    new CHMDocumentInformation(stream);
-                            mutex.lock();
-                            String text = chmDocInfo.getText();
-                            assertTrue(text.length() > 0);
+                            BodyContentHandler handler = new BodyContentHandler(-1);
+                            parser.parse(stream, handler, new Metadata(), new ParseContext());
+                            assertTrue(!handler.toString().isEmpty());
                         } catch (Exception e) {
                             e.printStackTrace();
                         } finally {
@@ -66,7 +87,6 @@ public class TestChmExtraction {
                             } catch (IOException e) {
                                 e.printStackTrace();
                             }
-                            mutex.unlock();
                         }
                     }
                 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java?rev=1553685&r1=1553684&r2=1553685&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
 Fri Dec 27 15:53:57 2013
@@ -18,11 +18,8 @@ package org.apache.tika.parser.chm;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
 
 import java.io.ByteArrayInputStream;
-import java.io.InputStream;
-import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 
@@ -66,23 +63,4 @@ public class TestChmExtractor {
         assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, count);
     }
 
-    @Test
-    public void testChmParser() throws Exception{
-        List<String> files = new ArrayList<String>();
-        files.add("/test-documents/testChm.chm");
-        files.add("/test-documents/testChm3.chm");
-
-        for (String fileName : files) {
-            InputStream stream =
-                    TestChmBlockInfo.class.getResourceAsStream(fileName);
-            try {
-                CHMDocumentInformation chmDocInfo = new CHMDocumentInformation(stream);
-                String text = chmDocInfo.getText();
-                assertTrue(text.length() > 0);
-            } finally {
-                stream.close();
-            }
-        }
-    }
-
 }


Reply via email to