Added: lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java?rev=909268&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java (added) +++ lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java Fri Feb 12 06:50:53 2010 @@ -0,0 +1,92 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; + +import java.io.File; +import java.io.FilenameFilter; + +import junit.framework.TestCase; + +/** + * Unit tests for MSWordParser. + * + * @author John Xing + */ +public class TestMSWordParser extends TestCase { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data","."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-msword/build.xml during plugin compilation. + // Check ./src/plugin/parse-msword/sample/README.txt for what they are. + private String[] sampleFiles = {"word97.doc"}; + + private String expectedText = "This is a sample doc file prepared for nutch."; + + private Configuration conf; + + public TestMSWordParser(String name) { + super(name); + } + + protected void setUp() { + conf = NutchConfiguration.create(); + conf.set("file.content.limit", "-1"); + } + + protected void tearDown() {} + + public String getTextContent(String fileName) throws ProtocolException, ParseException { + String urlString = "file:" + sampleDir + fileSeparator + fileName; + Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); + Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); + Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl()); + return parse.getText(); + } + + public void testIt() throws ProtocolException, ParseException { + for (int i=0; i<sampleFiles.length; i++) { + String found = getTextContent(sampleFiles[i]); + assertTrue("text found : '"+found+"'",found.startsWith(expectedText)); + } + } + + public void testOpeningDocs() throws ProtocolException, ParseException { + String[] filenames = new File(sampleDir).list(); + for (int i = 0; i < filenames.length; i++) { + if (filenames[i].endsWith(".doc")==false) continue; + assertTrue("cann't read content of " + filenames[i], getTextContent(filenames[i]).length() > 0); + } + } +}
Propchange: lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java?rev=909268&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java (added) +++ lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java Fri Feb 12 06:50:53 2010 @@ -0,0 +1,107 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import java.io.FileInputStream; +import java.io.InputStreamReader; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.protocol.*; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +/** + * Unit tests for OOParser. + * + * @author Andrzej Bialecki + */ +public class TestOOParser extends TestCase { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data","."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-oo/build.xml during plugin compilation. + private String[] sampleFiles = {"ootest.odt", "ootest.sxw"}; + + private String sampleText = "ootest.txt"; + + private String expectedText; + + public TestOOParser(String name) { + super(name); + try { + // read the test string + FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + sampleText); + StringBuffer sb = new StringBuffer(); + int len = 0; + InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); + char[] buf = new char[1024]; + while ((len = isr.read(buf)) > 0) { + sb.append(buf, 0, len); + } + isr.close(); + expectedText = sb.toString(); + // normalize space + expectedText = expectedText.replaceAll("[ \t\r\n]+", " "); + } catch (Exception e) { + e.printStackTrace(); + } + } + + protected void setUp() {} + + protected void tearDown() {} + + public void testIt() throws ProtocolException, ParseException { + String urlString; + Content content; + Parse parse; + Configuration conf = NutchConfiguration.create(); + Protocol protocol; + ProtocolFactory factory = new ProtocolFactory(conf); + + System.out.println("Expected : "+expectedText); + + for (int i=0; i<sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + if (sampleFiles[i].startsWith("ootest")==false) continue; + + protocol = factory.getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl()); + + String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim(); + + // simply test for the presence of a text - the ordering of the elements may differ from what was expected + // in the previous tests + assertTrue(text!=null && text.length() > 0); + + System.out.println("Found "+sampleFiles[i]+": "+text); + } + } + +} Propchange: lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java?rev=909268&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java (added) +++ lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java Fri Feb 12 06:50:53 2010 @@ -0,0 +1,83 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.tika; + +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolException; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; + +import junit.framework.TestCase; + +/** + * Unit tests for PdfParser. + * + * @author John Xing + */ +public class TestPdfParser extends TestCase { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data","."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-pdf/build.xml during plugin compilation. + // Check ./src/plugin/parse-pdf/sample/README.txt for what they are. + private String[] sampleFiles = { + "pdftest.pdf", + "encrypted.pdf" + }; + + private String expectedText = "A VERY SMALL PDF FILE"; + + public TestPdfParser(String name) { + super(name); + } + + protected void setUp() {} + + protected void tearDown() {} + + public void testIt() throws ProtocolException, ParseException { + String urlString; + Protocol protocol; + Content content; + Parse parse; + + for (int i = 0; i < sampleFiles.length; i++) { + urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i]; + + Configuration conf = NutchConfiguration.create(); + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl()); + + int index = parse.getText().indexOf(expectedText); + assertTrue(index > 0); + } + } + +} Propchange: lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java?rev=909268&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java (added) +++ lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java Fri Feb 12 06:50:53 2010 @@ -0,0 +1,90 @@ +package org.apache.nutch.tika; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// JUnit imports +import junit.framework.TestCase; + +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.DublinCore; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; + +/** + * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests). + * + * @author Andy Hedges + */ +public class TestRTFParser extends TestCase { + + private String fileSeparator = System.getProperty("file.separator"); + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-rtf/build.xml during plugin compilation. + // Check ./src/plugin/parse-rtf/sample/README.txt for what they are. + private String rtfFile = "test.rtf"; + + public TestRTFParser(String name) { + super(name); + } + + protected void setUp() { + } + + protected void tearDown() { + } + + public void testIt() throws ProtocolException, ParseException { + + String urlString; + Protocol protocol; + Content content; + Parse parse; + + Configuration conf = NutchConfiguration.create(); + urlString = "file:" + sampleDir + fileSeparator + rtfFile; + protocol = new ProtocolFactory(conf).getProtocol(urlString); + content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content) + .get(content.getUrl()); + String text = parse.getText(); + assertEquals("The quick brown fox jumps over the lazy dog", text.trim()); + + String title = parse.getData().getTitle(); + Metadata meta = parse.getData().getParseMeta(); + + // METADATA extraction is not yet supported in Tika + // assertEquals("test rft document", title); + // assertEquals("tests", meta.get(DublinCore.SUBJECT)); + } + +} Propchange: lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=909268&r1=909267&r2=909268&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java Fri Feb 12 06:50:53 2010 @@ -54,7 +54,7 @@ ext = (Extension) parserFactory.getExtensions("text/html; charset=ISO-8859-1").get(0); assertEquals("parse-html", ext.getDescriptor().getPluginId()); ext = (Extension)parserFactory.getExtensions("foo/bar").get(0); - assertEquals("parse-text", ext.getDescriptor().getPluginId()); + assertEquals("parse-tika", ext.getDescriptor().getPluginId()); } /** Unit test to check <code>getParsers</code> method */