Author: jukka
Date: Fri Dec 27 15:42:57 2013
New Revision: 1553684
URL: http://svn.apache.org/r1553684
Log:
TIKA-672: Proper error handling in the CHM parser
Further CHM parser streamlining to avoid unnecessary catching of exceptions
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java?rev=1553684&r1=1553683&r2=1553684&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java Fri Dec 27 15:42:57 2013
@@ -48,18 +48,6 @@ class CHMDocumentInformation {
}
/**
- * Checks if an entry is a html or not.
- *
- * @param entry
- * chm directory listing entry
- *
- * @return boolean
- */
- private boolean isRightEntry(DirectoryListingEntry entry) {
- return (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm"));
- }
-
- /**
* Returns extracted text from chm file
*
* @return text
@@ -68,36 +56,23 @@ class CHMDocumentInformation {
*/
public String getText() throws TikaException {
StringBuilder sb = new StringBuilder();
- DirectoryListingEntry entry;
-
- for (Iterator<DirectoryListingEntry> it = chmExtractor
- .getChmDirList().getDirectoryListingEntryList().iterator(); it.hasNext();)
- {
- try {
- entry = it.next();
- if (isRightEntry(entry)) {
- byte[][] tmp = chmExtractor.extractChmEntry(entry);
- if (tmp != null) {
- sb.append(extract(tmp));
- }
- }
- } catch (TikaException e) {
- //ignore
- } // catch (IOException e) {//Pushback exception from tagsoup
- // System.err.println(e.getMessage());
+
+ Iterator<DirectoryListingEntry> it =
+ chmExtractor.getChmDirList().getDirectoryListingEntryList().iterator();
+ while (it.hasNext()) {
+ DirectoryListingEntry entry = it.next();
+ if (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm")) {
+ byte[] tmp = chmExtractor.extractChmEntry(entry);
+ sb.append(extract(tmp));
+ }
}
return sb.toString();
}
/**
- * Extracts data from byte[][]
- *
- * @param byteObject
- * @return
- * @throws IOException
- * @throws SAXException
+ * Extracts data from byte[]
*/
- private String extract(byte[][] byteObject) {// throws IOException
+ private String extract(byte[] byteObject) throws TikaException {// throws IOException
StringBuilder wBuf = new StringBuilder();
InputStream stream = null;
Metadata metadata = new Metadata();
@@ -105,25 +80,14 @@ class CHMDocumentInformation {
BodyContentHandler handler = new BodyContentHandler(-1);// -1
ParseContext parser = new ParseContext();
try {
- for (int i = 0; i < byteObject.length; i++) {
- stream = new ByteArrayInputStream(byteObject[i]);
- try {
- htmlParser.parse(stream, handler, metadata, parser);
- } catch (TikaException e) {
- wBuf.append(new String(byteObject[i]));
-// System.err.println("\n"
-// + CHMDocumentInformation.class.getName()
-// + " extract " + e.getMessage());
- } finally {
- wBuf.append(handler.toString()
- + System.getProperty("line.separator"));
- stream.close();
- }
- }
+ stream = new ByteArrayInputStream(byteObject);
+ htmlParser.parse(stream, handler, metadata, parser);
+ wBuf.append(handler.toString()
+ + System.getProperty("line.separator"));
} catch (SAXException e) {
throw new RuntimeException(e);
- } catch (IOException e) {//
- // Pushback overflow from tagsoup
+ } catch (IOException e) {
+ // Pushback overflow from tagsoup
}
return wBuf.toString();
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java?rev=1553684&r1=1553683&r2=1553684&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java Fri Dec 27 15:42:57 2013
@@ -16,11 +16,13 @@
*/
package org.apache.tika.parser.chm.core;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
@@ -238,9 +240,8 @@ public class ChmExtractor {
* @return decompressed data
* @throws TikaException
*/
- public byte[][] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
- byte[][] tmp = null;
- byte[] dataSegment = null;
+ public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
ChmLzxBlock lzxBlock = null;
try {
/* UNCOMPRESSED type is easiest one */
@@ -251,15 +252,15 @@ public class ChmExtractor {
.getOffset());
// dataSegment = Arrays.copyOfRange(getData(), dataOffset,
// dataOffset + directoryListingEntry.getLength());
- dataSegment = ChmCommons.copyOfRange(getData(), dataOffset,
- dataOffset + directoryListingEntry.getLength());
+ buffer.write(ChmCommons.copyOfRange(
+ getData(), dataOffset,
+ dataOffset + directoryListingEntry.getLength()));
} else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED
&& !ChmCommons.hasSkip(directoryListingEntry)) {
/* Gets a chm block info */
ChmBlockInfo bb = ChmBlockInfo.getChmBlockInfoInstance(
directoryListingEntry, (int) getChmLzxcResetTable()
.getBlockLen(), getChmLzxcControlData());
- tmp = new byte[bb.getEndBlock() - bb.getStartBlock() + 1][];
int i = 0, start = 0, block = 0;
@@ -286,7 +287,8 @@ public class ChmExtractor {
if (i == getLzxBlocksCache().size() && i == 0) {
start = bb.getIniBlock();
- dataSegment = ChmCommons.getChmBlockSegment(getData(),
+ byte[] dataSegment = ChmCommons.getChmBlockSegment(
+ getData(),
getChmLzxcResetTable(), start,
(int) getLzxBlockOffset(),
(int) getLzxBlockLength());
@@ -301,27 +303,23 @@ public class ChmExtractor {
for (i = start; i <= bb.getEndBlock();) {
if (i == bb.getStartBlock() && i == bb.getEndBlock()) {
- dataSegment = lzxBlock.getContent(
- bb.getStartOffset(), bb.getEndOffset());
- tmp[0] = dataSegment;
+ buffer.write(lzxBlock.getContent(
+ bb.getStartOffset(), bb.getEndOffset()));
break;
}
if (i == bb.getStartBlock()) {
- dataSegment = lzxBlock.getContent(bb
- .getStartOffset());
- tmp[0] = dataSegment;
+ buffer.write(lzxBlock.getContent(
+ bb.getStartOffset()));
}
if (i > bb.getStartBlock() && i < bb.getEndBlock()) {
- dataSegment = lzxBlock.getContent();
- tmp[i - bb.getStartBlock()] = dataSegment;
+ buffer.write(lzxBlock.getContent());
}
if (i == bb.getEndBlock()) {
- dataSegment = lzxBlock.getContent(0,
- bb.getEndOffset());
- tmp[i - bb.getStartBlock()] = dataSegment;
+ buffer.write(lzxBlock.getContent(
+ 0, bb.getEndOffset()));
break;
}
@@ -356,7 +354,8 @@ public class ChmExtractor {
} catch (Exception e) {
throw new TikaException(e.getMessage());
}
- return (tmp != null) ? tmp : (new byte[1][]);
+
+ return buffer.toByteArray();
}
private void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java?rev=1553684&r1=1553683&r2=1553684&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java Fri Dec 27 15:42:57 2013
@@ -57,14 +57,11 @@ public class TestChmExtractor {
@Test
public void testExtractChmEntry() throws TikaException{
ChmDirectoryListingSet entries = chmExtractor.getChmDirList();
- byte[][] localFile;
int count = 0;
for (Iterator<DirectoryListingEntry> it = entries
.getDirectoryListingEntryList().iterator(); it.hasNext();) {
- localFile = chmExtractor.extractChmEntry(it.next());
- if (localFile != null) {
- ++count;
- }
+ chmExtractor.extractChmEntry(it.next());
+ ++count;
}
assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, count);
}