This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 37590194f0f604d8a39a2ae814a1874079715822 Author: Sebastian Nagel <[email protected]> AuthorDate: Tue May 5 13:25:15 2020 +0200 NUTCH-1945 Test for XLSX parser - add Tika unit test for XLSX files - bundle instance variables and utility methods in class TikaParserTest - clean up javadoc comments --- src/plugin/parse-tika/build.xml | 1 + src/plugin/parse-tika/sample/test.xlsx | Bin 0 -> 3950 bytes .../nutch/parse/tika/TestEmbeddedDocuments.java | 36 ++----------- .../apache/nutch/parse/tika/TestFeedParser.java | 14 +---- .../apache/nutch/parse/tika/TestHtmlParser.java | 1 - .../apache/nutch/parse/tika/TestImageMetadata.java | 5 +- .../apache/nutch/parse/tika/TestMSWordParser.java | 42 ++------------- .../org/apache/nutch/parse/tika/TestOOParser.java | 33 ++---------- .../org/apache/nutch/parse/tika/TestPdfParser.java | 34 ++---------- .../org/apache/nutch/parse/tika/TestRTFParser.java | 24 ++------- .../apache/nutch/parse/tika/TestXlsxParser.java | 38 +++++++++++++ ...tEmbeddedDocuments.java => TikaParserTest.java} | 60 ++++++++------------- 12 files changed, 84 insertions(+), 204 deletions(-) diff --git a/src/plugin/parse-tika/build.xml b/src/plugin/parse-tika/build.xml index b17643d..af3e610 100644 --- a/src/plugin/parse-tika/build.xml +++ b/src/plugin/parse-tika/build.xml @@ -36,6 +36,7 @@ <include name="*.doc"/> <include name="*.gif"/> <include name="*.docx"/> + <include name="*.xlsx"/> </fileset> </copy> diff --git a/src/plugin/parse-tika/sample/test.xlsx b/src/plugin/parse-tika/sample/test.xlsx new file mode 100644 index 0000000..de33f28 Binary files /dev/null and b/src/plugin/parse-tika/sample/test.xlsx differ diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java index cecf251..79ed286 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java @@ -16,59 +16,29 @@ */ package org.apache.nutch.parse.tika; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.ProtocolException; import org.junit.Assert; import org.junit.Before; import org.junit.Test; -import java.io.File; - /** * Unit tests for MSWordParser. - * - * @author John Xing */ -public class TestEmbeddedDocuments { +public class TestEmbeddedDocuments extends TikaParserTest { - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/parse-tika/build.xml during plugin compilation. private String[] sampleFiles = { "test_recursive_embedded.docx" }; private String expectedText = "When in the Course of human events"; - private Configuration conf; - @Before public void setUp() { - conf = NutchConfiguration.create(); - conf.set("file.content.limit", "-1"); + super.setUp(); conf.setBoolean("tika.parse.embedded", true); } - public String getTextContent(String fileName) throws ProtocolException, - ParseException { - String urlString = "file:" + sampleDir + fileSeparator + fileName; - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - return parse.getText(); - } - @Test public void testIt() throws ProtocolException, ParseException { for (int i = 0; i < sampleFiles.length; i++) { diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java index 87b452c..94eec53 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java @@ -26,7 +26,6 @@ import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseException; import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.tika.TikaParser; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolException; @@ -34,18 +33,9 @@ import org.apache.nutch.protocol.ProtocolFactory; import org.apache.nutch.util.NutchConfiguration; /** - * - * @author mattmann / jnioche - * - * Test Suite for the RSS feeds with the {@link TikaParser}. - * + * Test Suite for the RSS feeds with the {@link TikaParser}. */ -public class TestFeedParser { - - private String fileSeparator = System.getProperty("file.separator"); - - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); +public class TestFeedParser extends TikaParserTest { private String[] sampleFiles = { "rsstest.rss" }; diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java index 4924511..781e891 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java @@ -22,7 +22,6 @@ import java.nio.charset.StandardCharsets; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.tika.TikaParser; import org.apache.nutch.parse.Outlink; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.Parser; diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java index 779278c..0f1505d 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java @@ -33,11 +33,8 @@ import org.junit.Test; /** * Test extraction of image metadata */ -public class TestImageMetadata { +public class TestImageMetadata extends TikaParserTest { - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); // Make sure sample files are copied to "test.data" as specified in private String[] sampleFiles = { "nutch_logo_tm.gif", }; diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java index 37c536c..c5062f6 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java @@ -16,58 +16,24 @@ */ package org.apache.nutch.parse.tika; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; +import java.io.File; + import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.ProtocolException; import org.junit.Assert; -import org.junit.Before; import org.junit.Test; -import java.io.File; - /** * Unit tests for MSWordParser. - * - * @author John Xing */ -public class TestMSWordParser { +public class TestMSWordParser extends TikaParserTest { - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/parse-tika/build.xml during plugin compilation. private String[] sampleFiles = { "word97.doc" }; private String expectedText = "This is a sample doc file prepared for nutch."; - private Configuration conf; - - @Before - public void setUp() { - conf = NutchConfiguration.create(); - conf.set("file.content.limit", "-1"); - } - - public String getTextContent(String fileName) throws ProtocolException, - ParseException { - String urlString = "file:" + sampleDir + fileSeparator + fileName; - Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - return parse.getText(); - } - @Test public void testIt() throws ProtocolException, ParseException { for (int i = 0; i < sampleFiles.length; i++) { diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java index 93c0a2c..41c47e9 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java @@ -19,27 +19,16 @@ package org.apache.nutch.parse.tika; import java.io.FileInputStream; import java.io.InputStreamReader; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.protocol.*; -import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseException; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.protocol.ProtocolException; import org.junit.Assert; import org.junit.Test; /** * Unit tests for OOParser. - * - * @author Andrzej Bialecki */ -public class TestOOParser { +public class TestOOParser extends TikaParserTest { - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/parse-tika/build.xml during plugin compilation. private String[] sampleFiles = { "ootest.odt", "ootest.sxw" }; @@ -50,28 +39,16 @@ public class TestOOParser { @Test public void testIt() throws ProtocolException, ParseException { - String urlString; - Content content; - Parse parse; - Configuration conf = NutchConfiguration.create(); - Protocol protocol; - ProtocolFactory factory = new ProtocolFactory(conf); System.out.println("Expected : " + expectedText); for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - + if (sampleFiles[i].startsWith("ootest") == false) continue; - protocol = factory.getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - - String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); + String text = getTextContent(sampleFiles[i]).replaceAll("[ \t\r\n]+", " ") + .trim(); // simply test for the presence of a text - the ordering of the elements // may differ from what was expected diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java index fff6e9a..784b55c 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java @@ -16,30 +16,16 @@ */ package org.apache.nutch.parse.tika; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.ProtocolException; import org.junit.Assert; import org.junit.Test; /** * Unit tests for PdfParser. - * - * @author John Xing */ -public class TestPdfParser { +public class TestPdfParser extends TikaParserTest { - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/parse-tika/build.xml during plugin compilation. private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" }; @@ -48,22 +34,8 @@ public class TestPdfParser { @Test public void testIt() throws ProtocolException, ParseException { - String urlString; - Protocol protocol; - Content content; - Parse parse; - for (int i = 0; i < sampleFiles.length; i++) { - urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; - - Configuration conf = NutchConfiguration.create(); - protocol = new ProtocolFactory(conf).getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); - parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) - .get(content.getUrl()); - - int index = parse.getText().indexOf(expectedText); + int index = getTextContent(sampleFiles[i]).indexOf(expectedText); Assert.assertTrue(index > 0); } } diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java index 115220b..4de9d85 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java @@ -16,41 +16,29 @@ */ package org.apache.nutch.parse.tika; -// Nutch imports +import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.DublinCore; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseUtil; import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.util.NutchConfiguration; - -// Hadoop imports -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; import org.junit.Assert; -import org.junit.Ignore; import org.junit.Test; /** - * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests). - * - * @author Andy Hedges + * Unit tests for TestRTFParser. */ -public class TestRTFParser { +public class TestRTFParser extends TikaParserTest { - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/parse-tika/build.xml during plugin compilation. private String rtfFile = "test.rtf"; - @Ignore("There seems to be an issue with line 71 e.g. text.trim()") @Test public void testIt() throws ProtocolException, ParseException { @@ -59,7 +47,6 @@ public class TestRTFParser { Content content; Parse parse; - Configuration conf = NutchConfiguration.create(); urlString = "file:" + sampleDir + fileSeparator + rtfFile; protocol = new ProtocolFactory(conf).getProtocol(urlString); content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) @@ -67,8 +54,7 @@ public class TestRTFParser { parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get( content.getUrl()); String text = parse.getText(); - Assert.assertEquals("The quick brown fox jumps over the lazy dog", - text.trim()); + Assert.assertTrue(text.contains("The quick brown fox jumps over the lazy dog")); String title = parse.getData().getTitle(); Metadata meta = parse.getData().getParseMeta(); diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java new file mode 100644 index 0000000..85427db --- /dev/null +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.tika; + +import static org.junit.Assert.*; + +import java.io.IOException; + +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.protocol.ProtocolException; +import org.junit.Test; + +public class TestXlsxParser extends TikaParserTest { + + @Test + public void testIt() throws ProtocolException, ParseException, IOException { + String found = getTextContent("test.xlsx"); + String expected = "test.txt This is a test for spreadsheets xlsx"; + // text is distributed over columns and rows, need to normalize white space + found = found.replaceAll("\\s+", " ").trim(); + assertEquals(found, expected); + } + +} diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java similarity index 59% copy from src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java copy to src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java index cecf251..781debb 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java @@ -16,66 +16,50 @@ */ package org.apache.nutch.parse.tika; -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; -import org.junit.Assert; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; import org.junit.Before; -import org.junit.Test; - -import java.io.File; /** - * Unit tests for MSWordParser. - * - * @author John Xing + * Base class to extend Tika parser tests from. */ -public class TestEmbeddedDocuments { +public class TikaParserTest { - private String fileSeparator = System.getProperty("file.separator"); - // This system property is defined in ./src/plugin/build-plugin.xml - private String sampleDir = System.getProperty("test.data", "."); - // Make sure sample files are copied to "test.data" as specified in - // ./src/plugin/parse-tika/build.xml during plugin compilation. - private String[] sampleFiles = { "test_recursive_embedded.docx" }; + protected String fileSeparator = System.getProperty("file.separator"); - private String expectedText = "When in the Course of human events"; + /** + * Folder with test data, defined in src/plugin/build-plugin.xml. Make sure + * that all sample files are copied to "test.data", they must be listed in + * src/plugin/parse-tika/build.xml + */ + protected String sampleDir = System.getProperty("test.data", "."); - private Configuration conf; + protected Configuration conf; @Before public void setUp() { conf = NutchConfiguration.create(); conf.set("file.content.limit", "-1"); - conf.setBoolean("tika.parse.embedded", true); } - public String getTextContent(String fileName) throws ProtocolException, - ParseException { + public String getTextContent(String fileName) + throws ProtocolException, ParseException { String urlString = "file:" + sampleDir + fileSeparator + fileName; Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); - Content content = protocol.getProtocolOutput(new Text(urlString), - new CrawlDatum()).getContent(); + Content content = protocol + .getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) .get(content.getUrl()); return parse.getText(); } - @Test - public void testIt() throws ProtocolException, ParseException { - for (int i = 0; i < sampleFiles.length; i++) { - String found = getTextContent(sampleFiles[i]); - Assert.assertTrue("text found : '" + found + "'", - found.contains(expectedText)); - } - } - }
