This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 0341f0d NUTCH-1945 Test for XLSX parser - add Tika unit test for XLSX
files - bundle instance variables and utility methods in class TikaParserTest -
clean up javadoc comments
new e61a8a3 Merge pull request #525 from sebastian-nagel/NUTCH-1945
0341f0d is described below
commit 0341f0dfa156d3963e88b2cb9507013b0eef8668
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue May 5 13:25:15 2020 +0200
NUTCH-1945 Test for XLSX parser
- add Tika unit test for XLSX files
- bundle instance variables and utility methods in class TikaParserTest
- clean up javadoc comments
---
src/plugin/parse-tika/build.xml | 1 +
src/plugin/parse-tika/sample/test.xlsx | Bin 0 -> 3950 bytes
.../nutch/parse/tika/TestEmbeddedDocuments.java | 36 ++-----------
.../apache/nutch/parse/tika/TestFeedParser.java | 14 +----
.../apache/nutch/parse/tika/TestHtmlParser.java | 1 -
.../apache/nutch/parse/tika/TestImageMetadata.java | 5 +-
.../apache/nutch/parse/tika/TestMSWordParser.java | 42 ++-------------
.../org/apache/nutch/parse/tika/TestOOParser.java | 33 ++----------
.../org/apache/nutch/parse/tika/TestPdfParser.java | 34 ++----------
.../org/apache/nutch/parse/tika/TestRTFParser.java | 24 ++-------
.../apache/nutch/parse/tika/TestXlsxParser.java | 38 +++++++++++++
...tEmbeddedDocuments.java => TikaParserTest.java} | 60 ++++++++-------------
12 files changed, 84 insertions(+), 204 deletions(-)
diff --git a/src/plugin/parse-tika/build.xml b/src/plugin/parse-tika/build.xml
index b17643d..af3e610 100644
--- a/src/plugin/parse-tika/build.xml
+++ b/src/plugin/parse-tika/build.xml
@@ -36,6 +36,7 @@
<include name="*.doc"/>
<include name="*.gif"/>
<include name="*.docx"/>
+ <include name="*.xlsx"/>
</fileset>
</copy>
diff --git a/src/plugin/parse-tika/sample/test.xlsx
b/src/plugin/parse-tika/sample/test.xlsx
new file mode 100644
index 0000000..de33f28
Binary files /dev/null and b/src/plugin/parse-tika/sample/test.xlsx differ
diff --git
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
index cecf251..79ed286 100644
---
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
+++
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
@@ -16,59 +16,29 @@
*/
package org.apache.nutch.parse.tika;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.ProtocolException;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
-import java.io.File;
-
/**
* Unit tests for MSWordParser.
- *
- * @author John Xing
*/
-public class TestEmbeddedDocuments {
+public class TestEmbeddedDocuments extends TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/parse-tika/build.xml during plugin compilation.
private String[] sampleFiles = { "test_recursive_embedded.docx" };
private String expectedText = "When in the Course of human events";
- private Configuration conf;
-
@Before
public void setUp() {
- conf = NutchConfiguration.create();
- conf.set("file.content.limit", "-1");
+ super.setUp();
conf.setBoolean("tika.parse.embedded", true);
}
- public String getTextContent(String fileName) throws ProtocolException,
- ParseException {
- String urlString = "file:" + sampleDir + fileSeparator + fileName;
- Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
- Content content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
- Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
- .get(content.getUrl());
- return parse.getText();
- }
-
@Test
public void testIt() throws ProtocolException, ParseException {
for (int i = 0; i < sampleFiles.length; i++) {
diff --git
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
index 87b452c..94eec53 100644
---
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
+++
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
@@ -26,7 +26,6 @@ import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.tika.TikaParser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolException;
@@ -34,18 +33,9 @@ import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;
/**
- *
- * @author mattmann / jnioche
- *
- * Test Suite for the RSS feeds with the {@link TikaParser}.
- *
+ * Test Suite for the RSS feeds with the {@link TikaParser}.
*/
-public class TestFeedParser {
-
- private String fileSeparator = System.getProperty("file.separator");
-
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
+public class TestFeedParser extends TikaParserTest {
private String[] sampleFiles = { "rsstest.rss" };
diff --git
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
index 4924511..781e891 100644
---
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
+++
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
@@ -22,7 +22,6 @@ import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.tika.TikaParser;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
diff --git
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
index 779278c..0f1505d 100644
---
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
+++
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
@@ -33,11 +33,8 @@ import org.junit.Test;
/**
* Test extraction of image metadata
*/
-public class TestImageMetadata {
+public class TestImageMetadata extends TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
private String[] sampleFiles = { "nutch_logo_tm.gif", };
diff --git
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
index 37c536c..c5062f6 100644
---
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
+++
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
@@ -16,58 +16,24 @@
*/
package org.apache.nutch.parse.tika;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
+import java.io.File;
+
import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.ProtocolException;
import org.junit.Assert;
-import org.junit.Before;
import org.junit.Test;
-import java.io.File;
-
/**
* Unit tests for MSWordParser.
- *
- * @author John Xing
*/
-public class TestMSWordParser {
+public class TestMSWordParser extends TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/parse-tika/build.xml during plugin compilation.
private String[] sampleFiles = { "word97.doc" };
private String expectedText = "This is a sample doc file prepared for nutch.";
- private Configuration conf;
-
- @Before
- public void setUp() {
- conf = NutchConfiguration.create();
- conf.set("file.content.limit", "-1");
- }
-
- public String getTextContent(String fileName) throws ProtocolException,
- ParseException {
- String urlString = "file:" + sampleDir + fileSeparator + fileName;
- Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
- Content content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
- Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
- .get(content.getUrl());
- return parse.getText();
- }
-
@Test
public void testIt() throws ProtocolException, ParseException {
for (int i = 0; i < sampleFiles.length; i++) {
diff --git
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
index 93c0a2c..41c47e9 100644
---
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
+++
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
@@ -19,27 +19,16 @@ package org.apache.nutch.parse.tika;
import java.io.FileInputStream;
import java.io.InputStreamReader;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.protocol.*;
-import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.protocol.ProtocolException;
import org.junit.Assert;
import org.junit.Test;
/**
* Unit tests for OOParser.
- *
- * @author Andrzej Bialecki
*/
-public class TestOOParser {
+public class TestOOParser extends TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/parse-tika/build.xml during plugin compilation.
private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
@@ -50,28 +39,16 @@ public class TestOOParser {
@Test
public void testIt() throws ProtocolException, ParseException {
- String urlString;
- Content content;
- Parse parse;
- Configuration conf = NutchConfiguration.create();
- Protocol protocol;
- ProtocolFactory factory = new ProtocolFactory(conf);
System.out.println("Expected : " + expectedText);
for (int i = 0; i < sampleFiles.length; i++) {
- urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
+
if (sampleFiles[i].startsWith("ootest") == false)
continue;
- protocol = factory.getProtocol(urlString);
- content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
- .get(content.getUrl());
-
- String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
+ String text = getTextContent(sampleFiles[i]).replaceAll("[ \t\r\n]+", " ")
+ .trim();
// simply test for the presence of a text - the ordering of the elements
// may differ from what was expected
diff --git
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
index fff6e9a..784b55c 100644
---
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
+++
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
@@ -16,30 +16,16 @@
*/
package org.apache.nutch.parse.tika;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.ProtocolException;
import org.junit.Assert;
import org.junit.Test;
/**
* Unit tests for PdfParser.
- *
- * @author John Xing
*/
-public class TestPdfParser {
+public class TestPdfParser extends TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/parse-tika/build.xml during plugin compilation.
private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
@@ -48,22 +34,8 @@ public class TestPdfParser {
@Test
public void testIt() throws ProtocolException, ParseException {
- String urlString;
- Protocol protocol;
- Content content;
- Parse parse;
-
for (int i = 0; i < sampleFiles.length; i++) {
- urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
- Configuration conf = NutchConfiguration.create();
- protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
- .get(content.getUrl());
-
- int index = parse.getText().indexOf(expectedText);
+ int index = getTextContent(sampleFiles[i]).indexOf(expectedText);
Assert.assertTrue(index > 0);
}
}
diff --git
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
index 115220b..4de9d85 100644
---
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
+++
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
@@ -16,41 +16,29 @@
*/
package org.apache.nutch.parse.tika;
-// Nutch imports
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.DublinCore;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
import org.junit.Assert;
-import org.junit.Ignore;
import org.junit.Test;
/**
- * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests).
- *
- * @author Andy Hedges
+ * Unit tests for parsing RTF documents with the Tika parser.
*/
-public class TestRTFParser {
+public class TestRTFParser extends TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/parse-tika/build.xml during plugin compilation.
private String rtfFile = "test.rtf";
- @Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
@@ -59,7 +47,6 @@ public class TestRTFParser {
Content content;
Parse parse;
- Configuration conf = NutchConfiguration.create();
urlString = "file:" + sampleDir + fileSeparator + rtfFile;
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
@@ -67,8 +54,7 @@ public class TestRTFParser {
parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(
content.getUrl());
String text = parse.getText();
- Assert.assertEquals("The quick brown fox jumps over the lazy dog",
- text.trim());
+ Assert.assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
String title = parse.getData().getTitle();
Metadata meta = parse.getData().getParseMeta();
diff --git
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java
new file mode 100644
index 0000000..85427db
--- /dev/null
+++
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.protocol.ProtocolException;
+import org.junit.Test;
+
+/** Unit test for extracting the text of XLSX spreadsheets with the Tika parser. */
+public class TestXlsxParser extends TikaParserTest {
+ @Test
+ public void testIt() throws ProtocolException, ParseException, IOException {
+ String found = getTextContent("test.xlsx");
+ String expected = "test.txt This is a test for spreadsheets xlsx";
+ // text is distributed over columns and rows, need to normalize white space
+ found = found.replaceAll("\\s+", " ").trim();
+ assertEquals(expected, found);
+ }
+
+}
diff --git
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java
similarity index 59%
copy from
src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
copy to
src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java
index cecf251..781debb 100644
---
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
+++
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java
@@ -16,66 +16,50 @@
*/
package org.apache.nutch.parse.tika;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
import org.junit.Before;
-import org.junit.Test;
-
-import java.io.File;
/**
- * Unit tests for MSWordParser.
- *
- * @author John Xing
+ * Base class to extend Tika parser tests from.
*/
-public class TestEmbeddedDocuments {
+public class TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
- // Make sure sample files are copied to "test.data" as specified in
- // ./src/plugin/parse-tika/build.xml during plugin compilation.
- private String[] sampleFiles = { "test_recursive_embedded.docx" };
+ protected String fileSeparator = System.getProperty("file.separator");
- private String expectedText = "When in the Course of human events";
+ /**
+ * Folder with test data, defined in src/plugin/build-plugin.xml. Make sure
+ * that all sample files are copied to "test.data", they must be listed in
+ * src/plugin/parse-tika/build.xml
+ */
+ protected String sampleDir = System.getProperty("test.data", ".");
- private Configuration conf;
+ protected Configuration conf;
@Before
public void setUp() {
conf = NutchConfiguration.create();
conf.set("file.content.limit", "-1");
- conf.setBoolean("tika.parse.embedded", true);
}
- public String getTextContent(String fileName) throws ProtocolException,
- ParseException {
+ public String getTextContent(String fileName)
+ throws ProtocolException, ParseException {
String urlString = "file:" + sampleDir + fileSeparator + fileName;
Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
- Content content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
+ Content content = protocol
+ .getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
.get(content.getUrl());
return parse.getText();
}
- @Test
- public void testIt() throws ProtocolException, ParseException {
- for (int i = 0; i < sampleFiles.length; i++) {
- String found = getTextContent(sampleFiles[i]);
- Assert.assertTrue("text found : '" + found + "'",
- found.contains(expectedText));
- }
- }
-
}