Author: lewismc
Date: Fri Nov  1 18:44:23 2013
New Revision: 1538023

URL: http://svn.apache.org/r1538023
Log:
NUTCH-1125 JUnit test for TLD

Added:
    nutch/branches/2.x/src/plugin/tld/src/test/
    nutch/branches/2.x/src/plugin/tld/src/test/org/
    nutch/branches/2.x/src/plugin/tld/src/test/org/apache/
    nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/
    nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/indexer/
    nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/indexer/tld/
    
nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/build.xml

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1538023&r1=1538022&r2=1538023&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Nov  1 18:44:23 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1125 JUnit test for tld (Sertac TURKEL via lewismc)
+
 * NUTCH-1124 JUnit test for scoring-opic (Talat UYARER via lewismc)
 
 * NUTCH-1641 Log timings for main jobs (jnioche)

Modified: nutch/branches/2.x/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1538023&r1=1538022&r2=1538023&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Fri Nov  1 18:44:23 2013
@@ -92,6 +92,7 @@
      <ant dir="subcollection" target="test"/>
      <ant dir="microformats-reltag" target="test"/>
      <ant dir="scoring-opic" target="test"/>
+     <ant dir="tld" target="test"/>
      <!--
      <ant dir="feed" target="test"/>
      <ant dir="parse-ext" target="test"/>

Added: 
nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java?rev=1538023&view=auto
==============================================================================
--- 
nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java
 (added)
+++ 
nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java
 Fri Nov  1 18:44:23 2013
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.tld;
+
+import static org.junit.Assert.*;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.storage.WebPage;
+
+import org.junit.Test;
+
+/**
+ * JUnit test case which populates a HashMap
+ * with URLs and top-level domain qualifiers
+ * as keys and values respectively.
+ * We assert that each value entry in the HashMap equals
+ * the expected field value for the document after being filtered.
+ * 
+ */
+
+public class TestTLDIndexingFilter {
+
+  @Test
+  public void testBasicFields() throws Exception {
+    Map<String, String> urls = new HashMap<String, String>();
+
+    urls.put("http://www.a.com", "com");
+    urls.put("http://www.b.aero", "aero");
+    urls.put("http://www.c.coop", "coop");
+    urls.put("http://d.biz", "biz");
+    urls.put("http://www.e.cat", "cat");
+    urls.put("http://www.lib.f.edu", "edu");
+    urls.put("http://www.g.gov", "gov");
+    urls.put("http://www.h.int", "int");
+    urls.put("http://www.i.mil", "mil");
+    urls.put("http://j.net", "net");
+    urls.put("http://www.k.org", "org");
+    urls.put("http://www.l.pro", "pro");
+    urls.put("http://www.m.int", "int");
+    urls.put("http://www.n.nato", "nato");
+    urls.put("http://www.o.bitnet", "bitnet");
+    urls.put("http://www.p.ubc.ca", "ca");
+    urls.put("http://www.q.ubc.an", "an");
+    urls.put("http://www.r.ubc.ch", "ch");
+    urls.put("http://www.s.ubc.eu", "eu");
+    urls.put("http://www.t.ubc.kp", "kp");
+    urls.put("http://www.u.ubc.nz", "nz");
+    urls.put("http://www.v.ubc.tr", "tr");
+    urls.put("http://www.w.ubc.yu", "yu");
+    urls.put("http://www.x.id.us", "id.us");
+    urls.put("http://www.w.gu.us", "gu.us");
+    urls.put("http://www.z.vi.us", "vi.us");
+    urls.put("http://www.a.co.uk/", "co.uk");
+    urls.put("http://www.b.gov.uk/", "gov.uk");
+    urls.put("http://www.c.nic.uk/", "nic.uk");
+    urls.put("http://www.d.govt.uk/", "govt.uk");
+    urls.put("http://www.e.orgn.uk/", "orgn.uk");
+    urls.put("http://www.f.com.tr/", "com.tr");
+    urls.put("http://www.g.web.tr/", "web.tr");
+    urls.put("http://www.h.tel.tr/", "tel.tr");
+    urls.put("http://i.nom.ad", "nom.ad");
+    urls.put("http://j.tp", "tp");
+    urls.put("http://k.e164.arpa", "e164.arpa");
+    urls.put("http://l.ip6.arpa", "ip6.arpa");
+    urls.put("http://m.act.edu.au", "act.edu.au");
+    urls.put("http://n.한글.kr", "한글.kr");
+    urls.put("http://p.åfjord.no", "åfjord.no");
+    urls.put("http://q.åmot.no", "åmot.no");
+    urls.put("http://r.lærdal.no", "lærdal.no");
+    urls.put("http://s.組織.tw", "組織.tw");
+    urls.put("http://t.gs.aa.no", "gs.aa.no");
+    urls.put("http://u.gs.oslo.no/", "gs.oslo.no");
+    urls.put("https://v.florø.no/", "florø.no");
+    urls.put("ftp://w.info.nf/", "info.nf");
+    urls.put("file://x.aa.no", "aa.no");
+
+    WebPage page = new WebPage();
+
+    TLDIndexingFilter filter = new TLDIndexingFilter();
+    assertNotNull(filter);
+
+    for (Entry<String, String> entry : urls.entrySet()) {
+      NutchDocument doc = new NutchDocument();
+      assertNotNull(doc);
+      String url = entry.getKey();
+      try {
+        filter.filter(doc, url, page);
+      } catch (Exception e) {
+        e.printStackTrace();
+        fail(e.getMessage());
+      }
+
+      assertEquals(doc.getFieldValue("tld"), entry.getValue());
+    }
+
+  }
+
+}


Reply via email to