[nutch] branch master updated: NUTCH-1945 Test for XLSX parser - add Tika unit test for XLSX files - bundle instance variables and utility methods in class TikaParserTest - clean up javadoc comments

snagel Tue, 12 May 2020 06:36:02 -0700

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git



The following commit(s) were added to refs/heads/master by this push:
     new 0341f0d  NUTCH-1945 Test for XLSX parser - add Tika unit test for XLSX files - bundle instance variables and utility methods in class TikaParserTest - clean up javadoc comments
     new e61a8a3  Merge pull request #525 from sebastian-nagel/NUTCH-1945
0341f0d is described below

commit 0341f0dfa156d3963e88b2cb9507013b0eef8668
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue May 5 13:25:15 2020 +0200

    NUTCH-1945 Test for XLSX parser
    - add Tika unit test for XLSX files
    - bundle instance variables and utility methods in class TikaParserTest
    - clean up javadoc comments
---
 src/plugin/parse-tika/build.xml                    |   1 +
 src/plugin/parse-tika/sample/test.xlsx             | Bin 0 -> 3950 bytes
 .../nutch/parse/tika/TestEmbeddedDocuments.java    |  36 ++-----------
 .../apache/nutch/parse/tika/TestFeedParser.java    |  14 +----
 .../apache/nutch/parse/tika/TestHtmlParser.java    |   1 -
 .../apache/nutch/parse/tika/TestImageMetadata.java |   5 +-
 .../apache/nutch/parse/tika/TestMSWordParser.java  |  42 ++-------------
 .../org/apache/nutch/parse/tika/TestOOParser.java  |  33 ++----------
 .../org/apache/nutch/parse/tika/TestPdfParser.java |  34 ++----------
 .../org/apache/nutch/parse/tika/TestRTFParser.java |  24 ++-------
 .../apache/nutch/parse/tika/TestXlsxParser.java    |  38 +++++++++++++
 ...tEmbeddedDocuments.java => TikaParserTest.java} |  60 ++++++++-------------
 12 files changed, 84 insertions(+), 204 deletions(-)

diff --git a/src/plugin/parse-tika/build.xml b/src/plugin/parse-tika/build.xml
index b17643d..af3e610 100644
--- a/src/plugin/parse-tika/build.xml
+++ b/src/plugin/parse-tika/build.xml
@@ -36,6 +36,7 @@
       <include name="*.doc"/>
       <include name="*.gif"/>
       <include name="*.docx"/>
+      <include name="*.xlsx"/>
     </fileset>
   </copy>
   
diff --git a/src/plugin/parse-tika/sample/test.xlsx b/src/plugin/parse-tika/sample/test.xlsx
new file mode 100644
index 0000000..de33f28
Binary files /dev/null and b/src/plugin/parse-tika/sample/test.xlsx differ
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
index cecf251..79ed286 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
@@ -16,59 +16,29 @@
  */
 package org.apache.nutch.parse.tika;
 
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.ProtocolException;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.File;
-
 /**
  * Unit tests for MSWordParser.
- * 
- * @author John Xing
  */
-public class TestEmbeddedDocuments {
+public class TestEmbeddedDocuments extends TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "test_recursive_embedded.docx" };
 
   private String expectedText = "When in the Course of human events";
 
-  private Configuration conf;
-
   @Before
   public void setUp() {
-    conf = NutchConfiguration.create();
-    conf.set("file.content.limit", "-1");
+    super.setUp();
     conf.setBoolean("tika.parse.embedded", true);
   }
 
-  public String getTextContent(String fileName) throws ProtocolException,
-      ParseException {
-    String urlString = "file:" + sampleDir + fileSeparator + fileName;
-    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    Content content = protocol.getProtocolOutput(new Text(urlString),
-        new CrawlDatum()).getContent();
-    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-        .get(content.getUrl());
-    return parse.getText();
-  }
-
   @Test
   public void testIt() throws ProtocolException, ParseException {
     for (int i = 0; i < sampleFiles.length; i++) {
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
index 87b452c..94eec53 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
@@ -26,7 +26,6 @@ import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.tika.TikaParser;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolException;
@@ -34,18 +33,9 @@ import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.util.NutchConfiguration;
 
 /**
- * 
- * @author mattmann / jnioche
- * 
- *         Test Suite for the RSS feeds with the {@link TikaParser}.
- * 
+ * Test Suite for the RSS feeds with the {@link TikaParser}.
  */
-public class TestFeedParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
+public class TestFeedParser extends TikaParserTest {
 
   private String[] sampleFiles = { "rsstest.rss" };
 
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
index 4924511..781e891 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
@@ -22,7 +22,6 @@ import java.nio.charset.StandardCharsets;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.tika.TikaParser;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.Parser;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
index 779278c..0f1505d 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
@@ -33,11 +33,8 @@ import org.junit.Test;
 /**
  * Test extraction of image metadata
  */
-public class TestImageMetadata {
+public class TestImageMetadata extends TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   private String[] sampleFiles = { "nutch_logo_tm.gif", };
 
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
index 37c536c..c5062f6 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
@@ -16,58 +16,24 @@
  */
 package org.apache.nutch.parse.tika;
 
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
+import java.io.File;
+
 import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.ProtocolException;
 import org.junit.Assert;
-import org.junit.Before;
 import org.junit.Test;
 
-import java.io.File;
-
 /**
  * Unit tests for MSWordParser.
- * 
- * @author John Xing
  */
-public class TestMSWordParser {
+public class TestMSWordParser extends TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "word97.doc" };
 
   private String expectedText = "This is a sample doc file prepared for nutch.";
 
-  private Configuration conf;
-
-  @Before
-  public void setUp() {
-    conf = NutchConfiguration.create();
-    conf.set("file.content.limit", "-1");
-  }
-
-  public String getTextContent(String fileName) throws ProtocolException,
-      ParseException {
-    String urlString = "file:" + sampleDir + fileSeparator + fileName;
-    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    Content content = protocol.getProtocolOutput(new Text(urlString),
-        new CrawlDatum()).getContent();
-    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-        .get(content.getUrl());
-    return parse.getText();
-  }
-
   @Test
   public void testIt() throws ProtocolException, ParseException {
     for (int i = 0; i < sampleFiles.length; i++) {
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
index 93c0a2c..41c47e9 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
@@ -19,27 +19,16 @@ package org.apache.nutch.parse.tika;
 import java.io.FileInputStream;
 import java.io.InputStreamReader;
 
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.protocol.*;
-import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.protocol.ProtocolException;
 import org.junit.Assert;
 import org.junit.Test;
 
 /**
  * Unit tests for OOParser.
- * 
- * @author Andrzej Bialecki
  */
-public class TestOOParser {
+public class TestOOParser extends TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
@@ -50,28 +39,16 @@ public class TestOOParser {
 
   @Test
   public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Content content;
-    Parse parse;
-    Configuration conf = NutchConfiguration.create();
-    Protocol protocol;
-    ProtocolFactory factory = new ProtocolFactory(conf);
 
     System.out.println("Expected : " + expectedText);
 
     for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
+ 
       if (sampleFiles[i].startsWith("ootest") == false)
         continue;
 
-      protocol = factory.getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
+      String text = getTextContent(sampleFiles[i]).replaceAll("[ \t\r\n]+", " ")
+          .trim();
 
       // simply test for the presence of a text - the ordering of the elements
       // may differ from what was expected
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
index fff6e9a..784b55c 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
@@ -16,30 +16,16 @@
  */
 package org.apache.nutch.parse.tika;
 
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.ProtocolException;
 import org.junit.Assert;
 import org.junit.Test;
 
 /**
  * Unit tests for PdfParser.
- * 
- * @author John Xing
  */
-public class TestPdfParser {
+public class TestPdfParser extends TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
@@ -48,22 +34,8 @@ public class TestPdfParser {
 
   @Test
   public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-
     for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      Configuration conf = NutchConfiguration.create();
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      int index = parse.getText().indexOf(expectedText);
+      int index = getTextContent(sampleFiles[i]).indexOf(expectedText);
       Assert.assertTrue(index > 0);
     }
   }
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
index 115220b..4de9d85 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
@@ -16,41 +16,29 @@
  */
 package org.apache.nutch.parse.tika;
 
-// Nutch imports
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.DublinCore;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
 import org.junit.Assert;
-import org.junit.Ignore;
 import org.junit.Test;
 
 /**
- * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests).
- * 
- * @author Andy Hedges
+ * Unit tests for TestRTFParser.
  */
-public class TestRTFParser {
+public class TestRTFParser extends TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String rtfFile = "test.rtf";
 
-  @Ignore("There seems to be an issue with line 71 e.g. text.trim()")
   @Test
   public void testIt() throws ProtocolException, ParseException {
 
@@ -59,7 +47,6 @@ public class TestRTFParser {
     Content content;
     Parse parse;
 
-    Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + rtfFile;
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
     content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
@@ -67,8 +54,7 @@ public class TestRTFParser {
     parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(
         content.getUrl());
     String text = parse.getText();
-    Assert.assertEquals("The quick brown fox jumps over the lazy dog",
-        text.trim());
+    Assert.assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
 
     String title = parse.getData().getTitle();
     Metadata meta = parse.getData().getParseMeta();
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java
new file mode 100644
index 0000000..85427db
--- /dev/null
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.protocol.ProtocolException;
+import org.junit.Test;
+
+public class TestXlsxParser extends TikaParserTest {
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException, IOException {
+    String found = getTextContent("test.xlsx");
+    String expected = "test.txt This is a test for spreadsheets xlsx";
+    // text is distributed over columns and rows, need to normalize white space
+    found = found.replaceAll("\\s+", " ").trim();
+    assertEquals(found, expected);
+  }
+
+}
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java
similarity index 59%
copy from src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
copy to src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java
index cecf251..781debb 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java
@@ -16,66 +16,50 @@
  */
 package org.apache.nutch.parse.tika;
 
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Before;
-import org.junit.Test;
-
-import java.io.File;
 
 /**
- * Unit tests for MSWordParser.
- * 
- * @author John Xing
+ * Base class to extend Tika parser tests from.
  */
-public class TestEmbeddedDocuments {
+public class TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-tika/build.xml during plugin compilation.
-  private String[] sampleFiles = { "test_recursive_embedded.docx" };
+  protected String fileSeparator = System.getProperty("file.separator");
 
-  private String expectedText = "When in the Course of human events";
+  /**
+   * Folder with test data, defined in src/plugin/build-plugin.xml. Make sure
+   * that all sample files are copied to "test.data", they must be listed in
+   * src/plugin/parse-tika/build.xml
+   */
+  protected String sampleDir = System.getProperty("test.data", ".");
 
-  private Configuration conf;
+  protected Configuration conf;
 
   @Before
   public void setUp() {
     conf = NutchConfiguration.create();
     conf.set("file.content.limit", "-1");
-    conf.setBoolean("tika.parse.embedded", true);
   }
 
-  public String getTextContent(String fileName) throws ProtocolException,
-      ParseException {
+  public String getTextContent(String fileName)
+      throws ProtocolException, ParseException {
     String urlString = "file:" + sampleDir + fileSeparator + fileName;
     Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    Content content = protocol.getProtocolOutput(new Text(urlString),
-        new CrawlDatum()).getContent();
+    Content content = protocol
+        .getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
     Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
         .get(content.getUrl());
     return parse.getText();
   }
 
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    for (int i = 0; i < sampleFiles.length; i++) {
-      String found = getTextContent(sampleFiles[i]);
-      Assert.assertTrue("text found : '" + found + "'",
-          found.contains(expectedText));
-    }
-  }
-
 }

[nutch] branch master updated: NUTCH-1945 Test for XLSX parser - add Tika unit test for XLSX files - bundle instance variables and utility methods in class TikaParserTest - clean up javadoc comments

Reply via email to