Author: shinichiro
Date: Thu Sep 3 01:41:21 2015
New Revision: 1700924
URL: http://svn.apache.org/r1700924
Log:
Fix for CONNECTORS-1230
Added:
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaParser.java
manifoldcf/trunk/connectors/tika/connector/src/test/
manifoldcf/trunk/connectors/tika/connector/src/test/java/
manifoldcf/trunk/connectors/tika/connector/src/test/java/org/
manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/
manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/
manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/
manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/
manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/
manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/tests/
manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/tests/TikaParserTest.java
manifoldcf/trunk/connectors/tika/connector/src/test/resources/
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testEXCEL.xlsx
(with props)
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testHTML.html
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testPDF.pdf
(with props)
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Thu Sep 3 01:41:21 2015
@@ -3,6 +3,8 @@ $Id$
======================= 2.3-dev =====================
+CONNECTORS-1230: Add writeLimit option on Tika extractor.
+(Shinichiro Abe)
======================= Release 2.2 =====================
Modified:
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
(original)
+++
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
Thu Sep 3 01:41:21 2015
@@ -30,6 +30,8 @@ public class TikaConfig {
public static final String NODE_FIELDMAP = "fieldmap";
public static final String NODE_KEEPMETADATA = "keepAllMetadata";
public static final String NODE_LOWERNAMES = "lowerNames";
+ public static final String NODE_WRITELIMIT = "writeLimit";
+ public static final int WRITELIMIT_DEFAULT = -1;
public static final String NODE_IGNORETIKAEXCEPTION = "ignoreException";
public static final String NODE_BOILERPLATEPROCESSOR =
"boilerplateprocessor";
public static final String ATTRIBUTE_SOURCE = "source";
Modified:
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
(original)
+++
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
Thu Sep 3 01:41:21 2015
@@ -27,10 +27,6 @@ import java.util.*;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
@@ -222,14 +218,12 @@ public class TikaExtractor extends org.a
try
{
// Use tika to parse stuff
- Parser parser = new AutoDetectParser();
- ContentHandler handler = new BodyContentHandler(w);
+ ContentHandler handler =
TikaParser.newWriteOutBodyContentHandler(w, sp.writeLimit());
if (extractorClassInstance != null)
handler = new BoilerpipeContentHandler(handler,
extractorClassInstance);
- ParseContext pc = new ParseContext();
try
{
- parser.parse(document.getBinaryStream(), handler, metadata, pc);
+ TikaParser.parse(document.getBinaryStream(), metadata, handler);
}
catch (TikaException e)
{
@@ -458,7 +452,8 @@ public class TikaExtractor extends org.a
SpecificationNode node = os.getChild(i);
if (node.getType().equals(TikaConfig.NODE_FIELDMAP)
|| node.getType().equals(TikaConfig.NODE_KEEPMETADATA)
- || node.getType().equals(TikaConfig.NODE_LOWERNAMES))
+ || node.getType().equals(TikaConfig.NODE_LOWERNAMES)
+ || node.getType().equals(TikaConfig.NODE_WRITELIMIT))
os.removeChild(i);
else
i++;
@@ -523,6 +518,18 @@ public class TikaExtractor extends org.a
node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
}
os.addChild(os.getChildCount(), node2);
+
+ SpecificationNode node3 = new
SpecificationNode(TikaConfig.NODE_WRITELIMIT);
+ String writeLimit = variableContext.getParameter(seqPrefix+"writelimit");
+ if (writeLimit != null)
+ {
+ node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, writeLimit);
+ }
+ else
+ {
+ node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
+ }
+ os.addChild(os.getChildCount(), node3);
}
if (variableContext.getParameter(seqPrefix+"ignoretikaexceptions_present")
!= null)
@@ -602,6 +609,7 @@ public class TikaExtractor extends org.a
List<Map<String,String>> fieldMappings = new
ArrayList<Map<String,String>>();
String keepAllMetadataValue = "true";
String lowernamesValue = "false";
+ String writeLimitValue = "";
for (int i = 0; i < os.getChildCount(); i++)
{
SpecificationNode sn = os.getChild(i);
@@ -630,10 +638,15 @@ public class TikaExtractor extends org.a
{
lowernamesValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
}
+ else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT))
+ {
+ writeLimitValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ }
}
paramMap.put("FIELDMAPPINGS",fieldMappings);
paramMap.put("KEEPALLMETADATA",keepAllMetadataValue);
paramMap.put("LOWERNAMES",lowernamesValue);
+ paramMap.put("WRITELIMIT",writeLimitValue);
}
protected static void fillInExceptionsSpecificationMap(Map<String,Object>
paramMap, Specification os)
@@ -832,12 +845,14 @@ public class TikaExtractor extends org.a
private final Map<String,String> sourceTargets = new
HashMap<String,String>();
private final boolean keepAllMetadata;
private final boolean lowerNames;
+ private final int writeLimit;
private final boolean ignoreTikaException;
private final String extractorClassName;
public SpecPacker(Specification os) {
boolean keepAllMetadata = true;
boolean lowerNames = false;
+ int writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
boolean ignoreTikaException = true;
String extractorClassName = null;
for (int i = 0; i < os.getChildCount(); i++) {
@@ -849,6 +864,13 @@ public class TikaExtractor extends org.a
} else if(sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
lowerNames = Boolean.parseBoolean(value);
+ } else if(sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
+ String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+ if (value.length() == 0) {
+ writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
+ } else {
+ writeLimit = Integer.parseInt(value);
+ }
} else if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
@@ -866,6 +888,7 @@ public class TikaExtractor extends org.a
}
this.keepAllMetadata = keepAllMetadata;
this.lowerNames = lowerNames;
+ this.writeLimit = writeLimit;
this.ignoreTikaException = ignoreTikaException;
this.extractorClassName = extractorClassName;
}
@@ -903,6 +926,13 @@ public class TikaExtractor extends org.a
sb.append('+');
else
sb.append('-');
+
+ if (writeLimit != TikaConfig.WRITELIMIT_DEFAULT)
+ {
+ sb.append('+');
+ sb.append(writeLimit);
+ }
+
if (ignoreTikaException)
sb.append('+');
else
@@ -931,6 +961,10 @@ public class TikaExtractor extends org.a
return lowerNames;
}
+ public int writeLimit() {
+ return writeLimit;
+ }
+
public boolean ignoreTikaException() {
return ignoreTikaException;
}
Added:
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaParser.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaParser.java?rev=1700924&view=auto
==============================================================================
---
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaParser.java
(added)
+++
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaParser.java
Thu Sep 3 01:41:21 2015
@@ -0,0 +1,51 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.agents.transformation.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Writer;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Static entry point for running Apache Tika from the Tika extractor.
+ * <p>
+ * Holds one shared {@link AutoDetectParser} and offers a factory for a body
+ * content handler whose output is capped by a write limit. The class only
+ * exposes static methods and cannot be instantiated or subclassed.
+ */
+public final class TikaParser {
+
+  /** Shared parser instance; created once and never reassigned. */
+  private static final Parser parser = new AutoDetectParser();
+
+  /** Not instantiable: all members are static. */
+  private TikaParser() { }
+
+  /**
+   * Builds a handler that forwards the document body text to {@code w},
+   * bounded by {@code writeLimit} characters.
+   *
+   * @param w the writer that receives the extracted body text
+   * @param writeLimit maximum number of characters to write; callers in this
+   *        change pass {@code -1} (see {@code TikaConfig.WRITELIMIT_DEFAULT})
+   *        to mean "no limit", per the WriteOutContentHandler contract
+   * @return a {@link BodyContentHandler} wrapping a
+   *         {@link WriteOutContentHandler} on {@code w}
+   */
+  public static ContentHandler newWriteOutBodyContentHandler(Writer w, int writeLimit) {
+    ContentHandler writeOutContentHandler = new WriteOutContentHandler(w, writeLimit);
+    return new BodyContentHandler(writeOutContentHandler);
+  }
+
+  /**
+   * Parses {@code stream} with the shared parser, sending SAX events to
+   * {@code handler} and collecting document properties in {@code metadata}.
+   * The stream is not closed here; that remains the caller's job.
+   *
+   * @param stream the raw document content
+   * @param metadata input hints (for example the resource name) and output
+   *        metadata discovered while parsing
+   * @param handler the receiver of the parsed content
+   * @throws IOException if the stream cannot be read
+   * @throws SAXException if the handler rejects an event; the accompanying
+   *         test expects this when the write limit is exceeded
+   * @throws TikaException if the document cannot be parsed
+   */
+  public static void parse(InputStream stream, Metadata metadata, ContentHandler handler)
+      throws IOException, SAXException, TikaException {
+    ParseContext context = new ParseContext();
+    // Register the parser in the context so that component parsers looking
+    // up a Parser there (presumably for embedded documents) find this one.
+    context.set(Parser.class, parser);
+    parser.parse(stream, handler, metadata, context);
+  }
+
+}
Modified:
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
(original)
+++
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
Thu Sep 3 01:41:21 2015
@@ -30,6 +30,7 @@ TikaExtractor.FinalFieldName=Final field
TikaExtractor.NoFieldMappingSpecified=No field mapping specified
TikaExtractor.KeepAllMetadata=Keep all metadata:
TikaExtractor.LowerNames=Lower names:
+TikaExtractor.WriteLimit=Write limit:
TikaExtractor.Add=Add
TikaExtractor.AddFieldMapping=Add field mapping
TikaExtractor.Delete=Delete
Modified:
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
(original)
+++
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
Thu Sep 3 01:41:21 2015
@@ -30,6 +30,7 @@ TikaExtractor.FinalFieldName=æå¾�
TikaExtractor.NoFieldMappingSpecified=フィールドマッピングを指定してください
TikaExtractor.KeepAllMetadata=全メタデータを保存:
TikaExtractor.LowerNames=å°æåå:
+TikaExtractor.WriteLimit=最大文字長:
TikaExtractor.Add=追加
TikaExtractor.AddFieldMapping=フィールドマッピングを追加
TikaExtractor.Delete=削除
Modified:
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
(original)
+++
manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
Thu Sep 3 01:41:21 2015
@@ -30,6 +30,7 @@ TikaExtractor.FinalFieldName=æç»�
TikaExtractor.NoFieldMappingSpecified=未指定字段映射
TikaExtractor.KeepAllMetadata=保存所有元数据:
TikaExtractor.LowerNames=小写:
+TikaExtractor.WriteLimit=最大字符长度:
TikaExtractor.Add=添加
TikaExtractor.AddFieldMapping=添加字段映射
TikaExtractor.Delete=删除
Modified:
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
(original)
+++
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
Thu Sep 3 01:41:21 2015
@@ -102,6 +102,13 @@
#end
</td>
</tr>
+
+ <tr>
+ <td
class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.WriteLimit'))</nobr></td>
+ <td class="value"><input name="s${SEQNUM}_writelimit" type="text"
+ value="$Encoder.attributeEscape($WRITELIMIT)" size="16" />
+ </td>
+ </tr>
</table>
#else
@@ -115,5 +122,6 @@
<input type="hidden" name="s${SEQNUM}_fieldmapping_count"
value="$fieldcounter"/>
<input type="hidden" name="s${SEQNUM}_keepallmetadata"
value="$Encoder.bodyEscape($KEEPALLMETADATA)"/>
<input type="hidden" name="s${SEQNUM}_lowernames"
value="$Encoder.bodyEscape($LOWERNAMES)"/>
+<input type="hidden" name="s${SEQNUM}_writelimit"
value="$Encoder.attributeEscape($WRITELIMIT)" />
#end
\ No newline at end of file
Modified:
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
(original)
+++
manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
Thu Sep 3 01:41:21 2015
@@ -58,6 +58,11 @@
</tr>
<tr><td class="separator" colspan="2"><hr/></td></tr>
<tr>
+ <td
class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.WriteLimit'))</nobr></td>
+ <td class="value"><nobr>$Encoder.bodyEscape($WRITELIMIT)</nobr></td>
+ </tr>
+ <tr><td class="separator" colspan="2"><hr/></td></tr>
+ <tr>
<td
class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.IgnoreTikaExceptions'))</nobr></td>
<td
class="value"><nobr>$Encoder.bodyEscape($IGNORETIKAEXCEPTIONS)</nobr></td>
</tr>
Added:
manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/tests/TikaParserTest.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/tests/TikaParserTest.java?rev=1700924&view=auto
==============================================================================
---
manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/tests/TikaParserTest.java
(added)
+++
manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/tests/TikaParserTest.java
Thu Sep 3 01:41:21 2015
@@ -0,0 +1,99 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.agents.transformation.tika.tests;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.manifoldcf.agents.transformation.tika.TikaParser;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static org.junit.Assert.*;
+import static org.hamcrest.CoreMatchers.*;
+
+/**
+ * Exercises {@link TikaParser} against the sample documents under
+ * {@code /test-documents}: once without a write limit, once with a generous
+ * limit, and once with a limit too small for any of the documents.
+ */
+public class TikaParserTest {
+
+  /** Fully qualified name of the exception Tika reports when the limit is hit. */
+  private static final String WRITE_LIMIT_REACHED =
+      "org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException";
+
+  /** Classpath locations of the documents every test iterates over. */
+  private static final List<String> docs = new ArrayList<>();
+  static {
+    docs.add("/test-documents/testEXCEL.xlsx");
+    docs.add("/test-documents/testHTML.html");
+    docs.add("/test-documents/testPDF.pdf");
+  }
+
+  /** Parsing with no write limit yields text and the expected metadata. */
+  @Test
+  public void testSimple() throws IOException, SAXException, TikaException {
+    for (String path : docs) {
+      ContentHandler unlimitedHandler
+          = TikaParser.newWriteOutBodyContentHandler(new StringWriter(), -1);
+      Metadata metadata = parse(path, unlimitedHandler);
+      assertExtracted(unlimitedHandler, metadata);
+    }
+  }
+
+  /** Parsing with a limit larger than the documents behaves like no limit. */
+  @Test
+  public void testExtractWithWriteLimit() throws IOException, SAXException, TikaException {
+    for (String path : docs) {
+      ContentHandler limitedHandler
+          = TikaParser.newWriteOutBodyContentHandler(new StringWriter(), 100 * 1000);
+      Metadata metadata = parse(path, limitedHandler);
+      assertExtracted(limitedHandler, metadata);
+    }
+  }
+
+  /** A ten character limit must abort parsing with a write-limit SAXException. */
+  @Test
+  public void testExtractWithTooShortWriteLimit() throws IOException, TikaException {
+    for (String path : docs) {
+      ContentHandler limitedHandler
+          = TikaParser.newWriteOutBodyContentHandler(new StringWriter(), 10);
+      try {
+        parse(path, limitedHandler);
+        fail("Should not get here");
+      } catch (SAXException e) {
+        // Catching the specific type replaces the former `assert e instanceof
+        // SAXException`, which is skipped when the JVM runs without -ea; any
+        // other exception type now propagates and fails the test.
+        assertThat(e.toString().indexOf(WRITE_LIMIT_REACHED), not(-1));
+      }
+    }
+  }
+
+  /**
+   * Opens the classpath resource at {@code path}, parses it into
+   * {@code handler}, and always closes the stream afterwards.
+   *
+   * @return the metadata populated during parsing
+   */
+  private Metadata parse(String path, ContentHandler handler)
+      throws IOException, SAXException, TikaException {
+    try (InputStream stream = getClass().getResourceAsStream(path)) {
+      assertNotNull("Missing test resource " + path, stream);
+      Metadata metadata = new Metadata();
+      metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY,
+          new File(getClass().getResource(path).getFile()).getName());
+      TikaParser.parse(stream, metadata, handler);
+      return metadata;
+    }
+  }
+
+  /** Checks that some text was written and the core metadata keys are set. */
+  private static void assertExtracted(ContentHandler handler, Metadata metadata) {
+    assertThat(handler.toString().length(), not(0));
+    assertThat(metadata.get("Content-Type"), notNullValue());
+    assertThat(metadata.get("resourceName"), notNullValue());
+  }
+
+}
Added:
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testEXCEL.xlsx
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testEXCEL.xlsx?rev=1700924&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testEXCEL.xlsx
------------------------------------------------------------------------------
svn:executable = *
Propchange:
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testEXCEL.xlsx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added:
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testHTML.html
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testHTML.html?rev=1700924&view=auto
==============================================================================
---
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testHTML.html
(added)
+++
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testHTML.html
Thu Sep 3 01:41:21 2015
@@ -0,0 +1,17 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
+<html>
+
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title>Welcome to the Apache ManifoldCF™ project!</title>
+</head>
+
+<body>
+<div id="content">
+<h1>Welcome to the Apache ManifoldCF™ project!</h1>
+
+<p>Please click the appropriate tab above to see this site in the language of
your choice.</p>
+
+</div>
+</body>
+</html>
Added:
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testPDF.pdf
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testPDF.pdf?rev=1700924&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testPDF.pdf
------------------------------------------------------------------------------
svn:executable = *
Propchange:
manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testPDF.pdf
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream