Author: ab
Date: Tue Feb  3 15:12:48 2009
New Revision: 740318

URL: http://svn.apache.org/viewvc?rev=740318&view=rev
Log:
NUTCH-279 Additions to urlnormalizer-regex (modified).

Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java   (with props)
Modified:
    lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=740318&r1=740317&r2=740318&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Feb  3 15:12:48 2009
@@ -328,6 +328,9 @@
 
 122. NUTCH-682 - SOLR indexer does not set boost on the document.
                  (julien nioche via dogacan)
+
+123. NUTCH-279 - Additions to urlnormalizer-regex (Stefan Neufeind, ab)
+
      
 Release 0.9 - 2007-04-02
 

Added: lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java?rev=740318&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java Tue Feb  3 15:12:48 2009
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+/**
+ * Command-line tool that checks one given normalizer or all normalizers.
+ * URLs are read from standard input, one per line, and the normalized form
+ * of each is printed to standard output.
+ */
+public class URLNormalizerChecker {
+
+  private final Configuration conf;
+
+  /**
+   * Creates a checker bound to a configuration.
+   *
+   * @param conf configuration handed to PluginRepository and URLNormalizers
+   */
+  public URLNormalizerChecker(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Normalizes every line of standard input with a single normalizer and
+   * prints each result.
+   *
+   * @param normalizerName class name (as returned by Class.getName()) of the
+   *        normalizer to use
+   * @param scope scope passed to each normalize call
+   * @throws RuntimeException if the URLNormalizer extension point or the
+   *         named normalizer is not found
+   * @throws Exception any failure raised by the plugin system, by reading
+   *         standard input, or by the normalizer is propagated
+   */
+  private void checkOne(String normalizerName, String scope) throws Exception {
+    ExtensionPoint point =
+      PluginRepository.get(conf).getExtensionPoint(URLNormalizer.X_POINT_ID);
+
+    if (point == null) {
+      throw new RuntimeException(URLNormalizer.X_POINT_ID+" not found.");
+    }
+
+    // Use the first extension whose implementation class matches the name.
+    URLNormalizer normalizer = null;
+    Extension[] extensions = point.getExtensions();
+    for (int i = 0; i < extensions.length && normalizer == null; i++) {
+      URLNormalizer candidate =
+        (URLNormalizer)extensions[i].getExtensionInstance();
+      if (candidate.getClass().getName().equals(normalizerName)) {
+        normalizer = candidate;
+      }
+    }
+
+    if (normalizer == null) {
+      throw new RuntimeException(
+        "URLNormalizer "+normalizerName+" not found.");
+    }
+
+    System.out.println("Checking URLNormalizer " + normalizerName);
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      System.out.println(normalizer.normalize(line, scope));
+    }
+  }
+
+  /**
+   * Normalizes every line of standard input with the URLNormalizers instance
+   * built for the given scope and prints each result.
+   *
+   * @param scope scope used to construct URLNormalizers and passed to each
+   *        normalize call
+   * @throws Exception any failure raised while reading standard input or
+   *         normalizing is propagated
+   */
+  private void checkAll(String scope) throws Exception {
+    System.out.println("Checking combination of all URLNormalizers available");
+
+    URLNormalizers normalizers = new URLNormalizers(conf, scope);
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      System.out.println(normalizers.normalize(line, scope));
+    }
+  }
+
+  /**
+   * Command-line entry point. Accepts an optional
+   * {@code -normalizer <normalizerName>} and an optional
+   * {@code -scope <scope>}. Any other argument, or an option given without
+   * its value, prints the usage message and exits with status -1. When no
+   * normalizer is named, the combination of all normalizers is checked.
+   *
+   * @param args command-line arguments
+   * @throws Exception any failure raised while checking is propagated
+   */
+  public static void main(String[] args) throws Exception {
+
+    String usage = "Usage: URLNormalizerChecker"
+      + " [-normalizer <normalizerName>] [-scope <scope>]"
+      + "\n\tscope can be one of: default,partition,generate_host_count,"
+      + "fetcher,crawldb,linkdb,inject,outlink";
+
+    String normalizerName = null;
+    String scope = URLNormalizers.SCOPE_DEFAULT;
+    for (int i = 0; i < args.length; i++) {
+      // Both options consume the next argument as their value; an option in
+      // the last position has none and falls through to the usage branch.
+      boolean hasValue = i + 1 < args.length;
+      if (hasValue && args[i].equals("-normalizer")) {
+        normalizerName = args[++i];
+      } else if (hasValue && args[i].equals("-scope")) {
+        scope = args[++i];
+      } else {
+        System.err.println(usage);
+        System.exit(-1);
+      }
+    }
+
+    URLNormalizerChecker checker =
+      new URLNormalizerChecker(NutchConfiguration.create());
+    if (normalizerName != null) {
+      checker.checkOne(normalizerName, scope);
+    } else {
+      checker.checkAll(scope);
+    }
+
+    System.exit(0);
+  }
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to