ja...

mattmann Thu, 11 Feb 2010 22:51:24 -0800

Added: 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java?rev=909268&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
 Fri Feb 12 06:50:53 2010
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+
+import java.io.File;
+import java.io.FilenameFilter;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for parsing MS Word documents with the "parse-tika" extension.
+ *
+ * <p>Every test fetches a sample document through a <code>file:</code> URL
+ * and hands the fetched content to the "parse-tika" parser.</p>
+ *
+ * @author John Xing
+ */
+public class TestMSWordParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  // (falls back to the current directory when it is not set).
+  private String sampleDir = System.getProperty("test.data",".");
+  // Make sure sample files are copied to "test.data" during plugin compilation.
+  // NOTE(review): the original comments pointed at
+  // ./src/plugin/parse-msword/build.xml and .../parse-msword/sample/README.txt;
+  // this test now lives under parse-tika, so those paths are presumably stale
+  // - confirm.
+  private String[] sampleFiles = {"word97.doc"};
+
+  // Prefix every file in sampleFiles must produce once parsed.
+  private String expectedText = "This is a sample doc file prepared for nutch.";
+
+  private Configuration conf;
+
+  public TestMSWordParser(String name) {
+    super(name);
+  }
+
+  /** Creates a fresh configuration and sets "file.content.limit" to -1. */
+  protected void setUp() {
+    conf = NutchConfiguration.create();
+    conf.set("file.content.limit", "-1");
+  }
+
+  protected void tearDown() {}
+
+  /**
+   * Fetches the named sample file and returns the text extracted by the
+   * "parse-tika" parser.
+   *
+   * @param fileName name of a file inside the sample directory
+   * @return the text of the parse registered under the content's URL
+   * @throws ProtocolException propagated from the protocol lookup
+   * @throws ParseException propagated from the parser
+   */
+  public String getTextContent(String fileName)
+      throws ProtocolException, ParseException {
+    String urlString = "file:" + sampleDir + fileSeparator + fileName;
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol
+        .getProtocolOutput(new Text(urlString), new CrawlDatum())
+        .getContent();
+    Parse parse = new ParseUtil(conf)
+        .parseByExtensionId("parse-tika", content)
+        .get(content.getUrl());
+    return parse.getText();
+  }
+
+  /** Checks that each known sample starts with the expected text. */
+  public void testIt() throws ProtocolException, ParseException {
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String found = getTextContent(sampleFiles[i]);
+      assertTrue("text found : '"+found+"'", found.startsWith(expectedText));
+    }
+  }
+
+  /** Checks that every ".doc" file in the sample directory yields some text. */
+  public void testOpeningDocs() throws ProtocolException, ParseException {
+    String[] filenames = new File(sampleDir).list();
+    // File.list() returns null (not an empty array) when the path is not a
+    // readable directory; report that instead of dying with an NPE below.
+    if (filenames == null) {
+      fail("cannot list sample directory " + sampleDir);
+    }
+    for (int i = 0; i < filenames.length; i++) {
+      if (!filenames[i].endsWith(".doc")) {
+        continue;
+      }
+      assertTrue("cann't read content of " + filenames[i],
+          getTextContent(filenames[i]).length() > 0);
+    }
+  }
+}


Propchange: 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java?rev=909268&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
 Fri Feb 12 06:50:53 2010
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.protocol.*;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for parsing OpenOffice documents with the "parse-tika" extension.
+ *
+ * @author Andrzej Bialecki
+ */
+public class TestOOParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  // (falls back to the current directory when it is not set).
+  private String sampleDir = System.getProperty("test.data",".");
+  // Make sure sample files are copied to "test.data" during plugin compilation.
+  // NOTE(review): the original comment pointed at ./src/plugin/parse-oo/build.xml;
+  // this test now lives under parse-tika, so that path is presumably stale
+  // - confirm.
+  private String[] sampleFiles = {"ootest.odt", "ootest.sxw"};
+
+  // File in the sample directory whose content is loaded into expectedText.
+  private String sampleText = "ootest.txt";
+
+  // Whitespace-normalized content of sampleText. Stays null when the file
+  // cannot be read; it is only printed by testIt(), never asserted on.
+  private String expectedText;
+
+  public TestOOParser(String name) {
+    super(name);
+    try {
+      expectedText = readNormalized(sampleDir + fileSeparator + sampleText);
+    } catch (Exception e) {
+      // Best effort, as before: a missing reference text must not prevent
+      // the parser check in testIt() from running.
+      e.printStackTrace();
+    }
+  }
+
+  /**
+   * Reads a UTF-8 text file and collapses every run of spaces, tabs and line
+   * breaks into a single space.
+   *
+   * @param path location of the file to read
+   * @return the normalized file content
+   * @throws java.io.IOException if the file cannot be opened or read
+   */
+  private static String readNormalized(String path) throws java.io.IOException {
+    FileInputStream fis = new FileInputStream(path);
+    try {
+      InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
+      StringBuffer sb = new StringBuffer();
+      char[] buf = new char[1024];
+      int len;
+      while ((len = isr.read(buf)) > 0) {
+        sb.append(buf, 0, len);
+      }
+      return sb.toString().replaceAll("[ \t\r\n]+", " ");
+    } finally {
+      // Release the file handle even when decoding or reading fails.
+      fis.close();
+    }
+  }
+
+  protected void setUp() {}
+
+  protected void tearDown() {}
+
+  /** Checks that each sample document yields non-empty text. */
+  public void testIt() throws ProtocolException, ParseException {
+    Configuration conf = NutchConfiguration.create();
+    ProtocolFactory factory = new ProtocolFactory(conf);
+
+    System.out.println("Expected : "+expectedText);
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      if (!sampleFiles[i].startsWith("ootest")) {
+        continue;
+      }
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      Protocol protocol = factory.getProtocol(urlString);
+      Content content = protocol
+          .getProtocolOutput(new Text(urlString), new CrawlDatum())
+          .getContent();
+      Parse parse = new ParseUtil(conf)
+          .parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
+
+      // Check for null before normalizing; the previous null check ran only
+      // after the text had already been dereferenced.
+      String rawText = parse.getText();
+      assertNotNull("no text extracted from " + sampleFiles[i], rawText);
+      String text = rawText.replaceAll("[ \t\r\n]+", " ").trim();
+
+      // simply test for the presence of a text - the ordering of the elements
+      // may differ from what was expected in the previous tests
+      assertTrue("empty text extracted from " + sampleFiles[i],
+          text.length() > 0);
+
+      System.out.println("Found "+sampleFiles[i]+": "+text);
+    }
+  }
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java?rev=909268&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
 Fri Feb 12 06:50:53 2010
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+
+import junit.framework.TestCase;
+
+/** 
+ * Unit tests for PdfParser.
+ *
+ * @author John Xing
+ */
+public class TestPdfParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data",".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-pdf/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
+  private String[] sampleFiles = {
+      "pdftest.pdf",
+      "encrypted.pdf"
+  };
+
+  private String expectedText = "A VERY SMALL PDF FILE";
+
+  public TestPdfParser(String name) { 
+    super(name); 
+  }
+
+  protected void setUp() {}
+
+  protected void tearDown() {}
+
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      Configuration conf = NutchConfiguration.create();
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString), new 
CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", 
content).get(content.getUrl());
+
+      int index = parse.getText().indexOf(expectedText);
+      assertTrue(index > 0);
+    }
+  }
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java?rev=909268&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
 Fri Feb 12 06:50:53 2010
@@ -0,0 +1,90 @@
+package org.apache.nutch.tika;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// JUnit imports
+import junit.framework.TestCase;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+/**
+ * Unit tests for parsing RTF documents with the "parse-tika" extension.
+ * (Adapted from John Xing msword unit tests).
+ *
+ * @author Andy Hedges
+ */
+public class TestRTFParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  // (falls back to the current directory when it is not set).
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" during plugin compilation.
+  // NOTE(review): the original comments pointed at
+  // ./src/plugin/parse-rtf/build.xml and .../parse-rtf/sample/README.txt;
+  // this test now lives under parse-tika, so those paths are presumably stale
+  // - confirm.
+  private String rtfFile = "test.rtf";
+
+  public TestRTFParser(String name) {
+    super(name);
+  }
+
+  protected void setUp() {
+  }
+
+  protected void tearDown() {
+  }
+
+  /** Checks that the sample RTF file parses to the expected sentence. */
+  public void testIt() throws ProtocolException, ParseException {
+    Configuration conf = NutchConfiguration.create();
+    String urlString = "file:" + sampleDir + fileSeparator + rtfFile;
+
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol
+        .getProtocolOutput(new Text(urlString), new CrawlDatum())
+        .getContent();
+    Parse parse = new ParseUtil(conf)
+        .parseByExtensionId("parse-tika", content)
+        .get(content.getUrl());
+
+    String text = parse.getText();
+    assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
+
+    // METADATA extraction is not yet supported in Tika, so the title and
+    // subject checks stay disabled; the locals that fed them are disabled
+    // with them instead of being left unused.
+    // String title = parse.getData().getTitle();
+    // Metadata meta = parse.getData().getParseMeta();
+    // assertEquals("test rft document", title);
+    // assertEquals("tests", meta.get(DublinCore.SUBJECT));
+  }
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=909268&r1=909267&r2=909268&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java 
Fri Feb 12 06:50:53 2010
@@ -54,7 +54,7 @@
     ext = (Extension) parserFactory.getExtensions("text/html; 
charset=ISO-8859-1").get(0);
     assertEquals("parse-html", ext.getDescriptor().getPluginId());
     ext = (Extension)parserFactory.getExtensions("foo/bar").get(0);
-    assertEquals("parse-text", ext.getDescriptor().getPluginId());
+    assertEquals("parse-tika", ext.getDescriptor().getPluginId());
   }
   
   /** Unit test to check <code>getParsers</code> method */

svn commit: r909268 [2/2] - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/parse/ src/plugin/ src/plugin/parse-tika/ src/plugin/parse-tika/src/ src/plugin/parse-tika/src/java/ src/plugin/parse-tika/src/java/org/ src/plugin/parse-tika/src/ja...

Reply via email to