Author: lewismc
Date: Sun Oct 27 11:54:36 2013
New Revision: 1536106

URL: http://svn.apache.org/r1536106
Log:
NUTCH-1124 JUnit tests for OPIC Scoring

Added:
    nutch/branches/2.x/src/plugin/scoring-opic/src/test/
    nutch/branches/2.x/src/plugin/scoring-opic/src/test/org/
    nutch/branches/2.x/src/plugin/scoring-opic/src/test/org/apache/
    nutch/branches/2.x/src/plugin/scoring-opic/src/test/org/apache/nutch/
    
nutch/branches/2.x/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/
    
nutch/branches/2.x/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/
    
nutch/branches/2.x/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/build.xml
    
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1536106&r1=1536105&r2=1536106&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sun Oct 27 11:54:36 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1124 JUnit test for scoring-opic (Talat UYARER via lewismc)
+
 * NUTCH-1641 Log timings for main jobs (jnioche)
 
 * NUTCH-1556 enabling updatedb to accept batchId (kaveh minooie,Feng)

Modified: nutch/branches/2.x/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1536106&r1=1536105&r2=1536106&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Sun Oct 27 11:54:36 2013
@@ -91,6 +91,7 @@
      <ant dir="lib-http" target="test"/>
      <ant dir="subcollection" target="test"/>
      <ant dir="microformats-reltag" target="test"/>
+     <ant dir="scoring-opic" target="test"/>
      <!--
      <ant dir="feed" target="test"/>
      <ant dir="parse-ext" target="test"/>

Modified: 
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1536106&r1=1536105&r2=1536106&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
 Sun Oct 27 11:54:36 2013
@@ -23,21 +23,15 @@ import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Test;
 import static org.junit.Assert.*;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 /**
  * JUnit test case which tests
  * 1. that anchor text is obtained
  * 2. that anchor deduplication functionality is working
- * 
- * @author lewismc
  *
  */
 public class TestAnchorIndexingFilter {
   
-  public static final Logger LOG = 
LoggerFactory.getLogger(TestAnchorIndexingFilter.class);
-       
   @Test
   public void testDeduplicateAnchor() throws Exception {
     Configuration conf = NutchConfiguration.create();

Added: 
nutch/branches/2.x/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java?rev=1536106&view=auto
==============================================================================
--- 
nutch/branches/2.x/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java
 (added)
+++ 
nutch/branches/2.x/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java
 Sun Oct 27 11:54:36 2013
@@ -0,0 +1,285 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.nutch.scoring.opic;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.scoring.ScoreDatum;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.TableUtil;

import org.junit.Before;
import org.junit.Test;
+
+/**
+ * JUnit test for <code>OPICScoringFilter</code>. For an example set of URLs, 
we
+ * simulate inlinks and outlinks of the available graph. By manual calculation,
+ * we determined the correct score points of URLs for each depth. For
+ * convenience, a Map (dbWebPages) is used to store the calculated scores
+ * instead of a persistent data store. At the end of the test, calculated 
scores
+ * in the map are compared to our correct scores and a boolean result is
+ * returned.
+ * 
+ */
+public class TestOPICScoringFilter {
+
+  // These lists will be used when simulating the graph
+  private Map<String, String[]> linkList = new LinkedHashMap<String, 
String[]>();
+  private final List<ScoreDatum> outlinkedScoreData = new 
ArrayList<ScoreDatum>();
+  private static final int DEPTH = 3;
+
+  DecimalFormat df = new DecimalFormat("#.###");
+
+  private final String[] seedList = new String[] { "http://a.com";,
+      "http://b.com";, "http://c.com";, };
+
+  // An example graph; shows websites as connected nodes
+  private void fillLinks() {
+    linkList.put("http://a.com";, new String[] { "http://b.com"; });
+    linkList.put("http://b.com";,
+        new String[] { "http://a.com";, "http://c.com"; });
+    linkList.put("http://c.com";, new String[] { "http://a.com";, "http://b.com";,
+        "http://d.com"; });
+    linkList.put("http://d.com";, new String[] {});
+  }
+
+  // Previously calculated values for each three depths. We will compare these
+  // to the results this test generates
+  private static HashMap<Integer, HashMap<String, Float>> acceptedScores = new 
HashMap<Integer, HashMap<String, Float>>() {
+    /**
+        * 
+        */
+    private static final long serialVersionUID = 278328450774664407L;
+
+    {
+      put(1, new HashMap<String, Float>() {
+        /**
+                * 
+                */
+        private static final long serialVersionUID = 6145080304388858096L;
+
+        {
+          put(new String("http://a.com";), new Float(1.833));
+          put(new String("http://b.com";), new Float(2.333));
+          put(new String("http://c.com";), new Float(1.5));
+          put(new String("http://d.com";), new Float(0.333));
+        }
+      });
+      put(2, new HashMap<String, Float>() {
+        /**
+                * 
+                */
+        private static final long serialVersionUID = 8948751511219885073L;
+
+        {
+          put(new String("http://a.com";), new Float(2.666));
+          put(new String("http://b.com";), new Float(3.333));
+          put(new String("http://c.com";), new Float(2.166));
+          put(new String("http://d.com";), new Float(0.278));
+        }
+      });
+      put(3, new HashMap<String, Float>() {
+        /**
+                * 
+                */
+        private static final long serialVersionUID = -7025018421800845103L;
+
+        {
+          put(new String("http://a.com";), new Float(3.388));
+          put(new String("http://b.com";), new Float(4.388));
+          put(new String("http://c.com";), new Float(2.666));
+          put(new String("http://d.com";), new Float(0.5));
+        }
+      });
+    }
+  };
+
+  private HashMap<Integer, HashMap<String, Float>> resultScores = new 
HashMap<Integer, HashMap<String, Float>>();
+
+  private OPICScoringFilter scoringFilter;
+
+  @Before
+  public void setUp() throws Exception {
+
+    Configuration conf = NutchConfiguration.create();
+    // LinkedHashMap dbWebPages is used instead of a persistent
+    // data store for this test class
+    Map<String, Map<WebPage, List<ScoreDatum>>> dbWebPages = new 
LinkedHashMap<String, Map<WebPage, List<ScoreDatum>>>();
+
+    // All WebPages stored in this map with an initial true value.
+    // After processing, it is set to false.
+    Map<String, Boolean> dbWebPagesControl = new LinkedHashMap<String, 
Boolean>();
+
+    TestOPICScoringFilter self = new TestOPICScoringFilter();
+    self.fillLinks();
+
+    float scoreInjected = conf.getFloat("db.score.injected", 1.0f);
+
+    scoringFilter = new OPICScoringFilter();
+    scoringFilter.setConf(conf);
+
+    // injecting seed list, with scored attached to webpages
+    for (String url : self.seedList) {
+      WebPage row = new WebPage();
+      row.setScore(scoreInjected);
+      scoringFilter.injectedScore(url, row);
+
+      List<ScoreDatum> scList = new LinkedList<ScoreDatum>();
+      Map<WebPage, List<ScoreDatum>> webPageMap = new HashMap<WebPage, 
List<ScoreDatum>>();
+      webPageMap.put(row, scList);
+      dbWebPages.put(TableUtil.reverseUrl(url), webPageMap);
+      dbWebPagesControl.put(TableUtil.reverseUrl(url), true);
+    }
+
+    // Depth Loop
+    for (int i = 1; i <= DEPTH; i++) {
+      Iterator<Map.Entry<String, Map<WebPage, List<ScoreDatum>>>> iter = 
dbWebPages
+          .entrySet().iterator();
+
+      // OPIC Score calculated for each website one by one
+      while (iter.hasNext()) {
+        Map.Entry<String, Map<WebPage, List<ScoreDatum>>> entry = iter.next();
+        Map<WebPage, List<ScoreDatum>> webPageMap = entry.getValue();
+
+        WebPage row = null;
+        List<ScoreDatum> scoreList = null;
+        Iterator<Map.Entry<WebPage, List<ScoreDatum>>> iters = webPageMap
+            .entrySet().iterator();
+        if (iters.hasNext()) {
+          Map.Entry<WebPage, List<ScoreDatum>> values = iters.next();
+          row = values.getKey();
+          scoreList = values.getValue();
+        }
+
+        String reverseUrl = entry.getKey();
+        String url = TableUtil.unreverseUrl(reverseUrl);
+        float score = row.getScore();
+
+        if (dbWebPagesControl.get(TableUtil.reverseUrl(url))) {
+          row.setScore(scoringFilter.generatorSortValue(url, row, score));
+          dbWebPagesControl.put(TableUtil.reverseUrl(url), false);
+        }
+
+        // getting outlinks from testdata
+        String[] seedOutlinks = self.linkList.get(url);
+        for (String seedOutlink : seedOutlinks) {
+          row.putToOutlinks(new Utf8(seedOutlink), new Utf8());
+        }
+
+        self.outlinkedScoreData.clear();
+
+        // Existing outlinks are added to outlinkedScoreData
+        Map<Utf8, Utf8> outlinks = row.getOutlinks();
+        if (outlinks != null) {
+          for (Entry<Utf8, Utf8> e : outlinks.entrySet()) {
+            int depth = Integer.MAX_VALUE;
+            self.outlinkedScoreData.add(new ScoreDatum(0.0f, e.getKey()
+                .toString(), e.getValue().toString(), depth));
+          }
+        }
+        scoringFilter.distributeScoreToOutlinks(url, row,
+            self.outlinkedScoreData, (outlinks == null ? 0 : outlinks.size()));
+
+        // DbUpdate Reducer simulation
+        for (ScoreDatum sc : self.outlinkedScoreData) {
+          if (dbWebPages.get(TableUtil.reverseUrl(sc.getUrl())) == null) {
+            // Check each outlink and creates new webpages if it's not
+            // exist in database (dbWebPages)
+            WebPage outlinkRow = new WebPage();
+            scoringFilter.initialScore(sc.getUrl(), outlinkRow);
+            List<ScoreDatum> newScoreList = new LinkedList<ScoreDatum>();
+            newScoreList.add(sc);
+            Map<WebPage, List<ScoreDatum>> values = new HashMap<WebPage, 
List<ScoreDatum>>();
+            values.put(outlinkRow, newScoreList);
+            dbWebPages.put(TableUtil.reverseUrl(sc.getUrl()), values);
+            dbWebPagesControl.put(TableUtil.reverseUrl(sc.getUrl()), true);
+          } else {
+            // Outlinks are added to list for each webpage
+            Map<WebPage, List<ScoreDatum>> values = dbWebPages.get(TableUtil
+                .reverseUrl(sc.getUrl()));
+            Iterator<Map.Entry<WebPage, List<ScoreDatum>>> value = values
+                .entrySet().iterator();
+            if (value.hasNext()) {
+              Map.Entry<WebPage, List<ScoreDatum>> list = value.next();
+              scoreList = list.getValue();
+              scoreList.add(sc);
+            }
+          }
+        }
+      }
+
+      // Simulate Reducing
+      for (Map.Entry<String, Map<WebPage, List<ScoreDatum>>> page : dbWebPages
+          .entrySet()) {
+
+        String reversedUrl = page.getKey();
+        String url = TableUtil.unreverseUrl(reversedUrl);
+
+        Iterator<Map.Entry<WebPage, List<ScoreDatum>>> rr = page.getValue()
+            .entrySet().iterator();
+
+        List<ScoreDatum> inlinkedScoreDataList = null;
+        WebPage row = null;
+        if (rr.hasNext()) {
+          Map.Entry<WebPage, List<ScoreDatum>> aa = rr.next();
+          inlinkedScoreDataList = aa.getValue();
+          row = aa.getKey();
+        }
+        // Scores are updated here
+        scoringFilter.updateScore(url, row, inlinkedScoreDataList);
+        inlinkedScoreDataList.clear();
+        HashMap<String, Float> result = new HashMap<String, Float>();
+        result.put(url, row.getScore());
+
+        resultScores.put(i, result);
+      }
+
+    }
+  }
+
+  /**
+   * Assertion that the accepted and and actual resultant scores are the same.
+   */
+  @Test
+  public void testModeAccept() {
+    for (int i = 1; i <= DEPTH; i++) {
+      for (String resultUrl : resultScores.get(i).keySet()) {
+        String accepted = df.format(acceptedScores.get(i).get(resultUrl));
+        System.out.println("Accepted Score: " + accepted);
+        String result = df.format(resultScores.get(i).get(resultUrl));
+        System.out.println("Resulted Score: " + result);
+        assertTrue(accepted.equals(result));
+      }
+    }
+
+  }
+
+}


Reply via email to