Author: lewismc
Date: Tue Jan  8 03:35:21 2013
New Revision: 1430129

URL: http://svn.apache.org/viewvc?rev=1430129&view=rev
Log:
NUTCH-1119 JUnit test for index-static

Added:
    nutch/trunk/src/plugin/index-static/src/test/
    nutch/trunk/src/plugin/index-static/src/test/org/
    nutch/trunk/src/plugin/index-static/src/test/org/apache/
    nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/
    nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/
    
nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/
    
nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/build.xml
    
nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1430129&r1=1430128&r2=1430129&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jan  8 03:35:21 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1119 JUnit test for index-static (Tejas Patil via lewismc)
+
 * NUTCH-1510 Upgrade to Hadoop 1.1.1 (markus)
 
 * NUTCH-1118 JUnit test for index-basic (Tejas Patil via lewismc)

Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1430129&r1=1430128&r2=1430129&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Tue Jan  8 03:35:21 2013
@@ -80,6 +80,7 @@
      <ant dir="index-basic" target="test"/>
      <ant dir="index-anchor" target="test"/>
      <ant dir="index-more" target="test"/>
+     <ant dir="index-static" target="test"/>
      <ant dir="language-identifier" target="test"/>
      <ant dir="lib-http" target="test"/>
      <ant dir="protocol-file" target="test"/>

Modified: nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java?rev=1430129&r1=1430128&r2=1430129&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java (original)
+++ nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java Tue Jan  8 03:35:21 2013
@@ -20,8 +20,6 @@ package org.apache.nutch.indexer.staticf
 import java.util.HashMap;
 import java.util.Map.Entry;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.indexer.IndexingFilter;
@@ -31,52 +29,78 @@ import org.apache.nutch.parse.Parse;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.conf.Configuration;
 
-/** A simple plugin called at indexing that adds fields with static data. 
- *  You can specify a list of fieldname:fieldcontent per nutch job.
- *  It can be useful when collections can't be created by urlpatterns, 
- *  like in subcollection, but on a job-basis. */
+/**
+ * A simple plugin called at indexing that adds fields with static data. You can
+ * specify a list of fieldname:fieldcontent per nutch job. It can be useful when
+ * collections can't be created by urlpatterns, like in subcollection, but on a
+ * job-basis.
+ */
 
 public class StaticFieldIndexer implements IndexingFilter {
-       private Configuration conf;
-       private HashMap<String, String[]> fields;
-       private boolean addStaticFields = false;
-
-       public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
-           throws IndexingException {
-               
-               if(this.addStaticFields == true){
-                       for(Entry<String,String[]> entry: this.fields.entrySet()){
-                               doc.add(entry.getKey(), entry.getValue());
-                       }
-               }       
-               return doc;
-       }
-
-       private HashMap<String, String[]> parseFields(String fieldsString) {
-               HashMap<String, String[]> fields = new HashMap<String, String[]>();
-               
-               /*
-                 The format is very easy, it's a comma-separated list of fields in the form <name>:<value>
-               */
-               for(String field: fieldsString.split(",")){
-                       String[] entry = field.split(":");
-                       if(entry.length == 2)
-                               fields.put(entry[0].trim(), entry[1].trim().split(" "));
-               }
-
-               return fields;
-       }
-
-       public void setConf(Configuration conf) {
-               this.conf = conf;
-               String fieldsString = conf.get("index.static", null);
-               if(fieldsString != null){
-                       this.addStaticFields = true;
-                       this.fields = parseFields(fieldsString);
-               }
-       }
-
-       public Configuration getConf() {
-               return this.conf;
-       }
+  private Configuration conf;
+  private HashMap<String, String[]> fields;
+  private boolean addStaticFields = false;
+
+  /**
+   * The {@link StaticFieldIndexer} filter object which adds fields as per
+   * configuration setting. See {@code index.static} in nutch-default.xml.
+   * 
+   * @param doc The {@link NutchDocument} object
+   * @param parse  The relevant {@link Parse} object passing through the filter
+   * @param url URL to be filtered for anchor text
+   * @param datum The {@link CrawlDatum} entry
+   * @param inlinks The {@link Inlinks} containing anchor text
+   * @return filtered NutchDocument
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    if (this.addStaticFields == true) {
+      for (Entry<String, String[]> entry : this.fields.entrySet()) {
+        doc.add(entry.getKey(), entry.getValue());
+      }
+    }
+    return doc;
+  }
+
+  /**
+   * Populate a HashMap from a list of fieldname:fieldcontent.
+   * See {@code index.static} in nutch-default.xml.
+   * 
+   * @param fieldsString string containing field:value pairs
+   * @return HashMap of fields and their corresponding values
+   */
+  private HashMap<String, String[]> parseFields(String fieldsString) {
+    HashMap<String, String[]> fields = new HashMap<String, String[]>();
+
+    /* The format is very easy, it's a comma-separated list of fields in the
+       form <name>:<value>
+     */
+    for (String field : fieldsString.split(",")) {
+      String[] entry = field.split(":");
+      if (entry.length == 2)
+        fields.put(entry[0].trim(), entry[1].trim().split(" "));
+    }
+
+    return fields;
+  }
+
+  /**
+   * Set the {@link Configuration} object
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    String fieldsString = conf.get("index.static", null);
+    if (fieldsString != null) {
+      this.addStaticFields = true;
+      this.fields = parseFields(fieldsString);
+    }
+  }
+
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
 }

Added: nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java?rev=1430129&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java (added)
+++ nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java Tue Jan  8 03:35:21 2013
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.staticfield;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/**
+ * JUnit test case which tests 
+ * 1. that static data fields are added to a document
+ * 2. that empty {@code index.static} does not add anything to the document
+ * 3. that valid field:value pairs are added to the document
+ * 4. that fields and values added to the document are trimmed 
+ * 
+ * @author tejasp
+ */
+
+public class TestStaticFieldIndexerTest extends TestCase {
+
+  Configuration conf;
+
+  Inlinks inlinks;
+  ParseImpl parse;
+  CrawlDatum crawlDatum;
+  Text url;
+  StaticFieldIndexer filter;
+
+  protected void setUp() throws Exception {
+    conf = NutchConfiguration.create();
+    parse = new ParseImpl();
+    url = new Text("http://nutch.apache.org/index.html");
+    crawlDatum = new CrawlDatum();
+    inlinks = new Inlinks();
+    filter = new StaticFieldIndexer();
+  }
+
+  /**
+   * Test that empty {@code index.static} does not add anything to the document
+   * @throws Exception 
+   */
+  public void testEmptyIndexStatic() throws Exception {
+
+    assertNotNull(filter);
+    filter.setConf(conf);
+
+    NutchDocument doc = new NutchDocument();
+
+    try {
+      filter.filter(doc, parse, url, crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail(e.getMessage());
+    }
+
+    assertNotNull(doc);
+    assertTrue("tests if no field is set for empty index.static", doc.getFieldNames().isEmpty());
+  }
+
+  /**
+   * Test that valid field:value pairs are added to the document
+   * @throws Exception 
+   */
+  public void testNormalScenario() throws Exception {
+
+    conf.set("index.static",
+        "field1:val1, field2    :      val2 val3     , field3, field4 :val4 , ");
+    assertNotNull(filter);
+    filter.setConf(conf);
+
+    NutchDocument doc = new NutchDocument();
+
+    try {
+      filter.filter(doc, parse, url, crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail(e.getMessage());
+    }
+
+    assertNotNull(doc);
+    assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
+    assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
+    assertEquals("test if doc has field1", "val1",
+        ((String[]) doc.getField("field1").getValues().get(0))[0]);
+    assertEquals("test if doc has field2", "val2",
+        ((String[]) doc.getField("field2").getValues().get(0))[0]);
+    assertEquals("test if doc has field4", "val4",
+        ((String[]) doc.getField("field4").getValues().get(0))[0]);
+  }
+}


Reply via email to